embulk-executor-mapreduce 0.1.0
- checksums.yaml +7 -0
- data/build.gradle +2 -0
- data/classpath/activation-1.1.jar +0 -0
- data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
- data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
- data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
- data/classpath/api-util-1.0.0-M20.jar +0 -0
- data/classpath/avro-1.7.4.jar +0 -0
- data/classpath/commons-beanutils-1.7.0.jar +0 -0
- data/classpath/commons-cli-1.2.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-collections-3.2.1.jar +0 -0
- data/classpath/commons-compress-1.4.1.jar +0 -0
- data/classpath/commons-configuration-1.6.jar +0 -0
- data/classpath/commons-digester-1.8.jar +0 -0
- data/classpath/commons-httpclient-3.1.jar +0 -0
- data/classpath/commons-io-2.4.jar +0 -0
- data/classpath/commons-lang-2.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/commons-math3-3.1.1.jar +0 -0
- data/classpath/commons-net-3.1.jar +0 -0
- data/classpath/curator-client-2.6.0.jar +0 -0
- data/classpath/curator-framework-2.6.0.jar +0 -0
- data/classpath/curator-recipes-2.6.0.jar +0 -0
- data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
- data/classpath/gson-2.2.4.jar +0 -0
- data/classpath/hadoop-annotations-2.6.0.jar +0 -0
- data/classpath/hadoop-auth-2.6.0.jar +0 -0
- data/classpath/hadoop-client-2.6.0.jar +0 -0
- data/classpath/hadoop-common-2.6.0.jar +0 -0
- data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
- data/classpath/htrace-core-3.0.4.jar +0 -0
- data/classpath/httpclient-4.2.5.jar +0 -0
- data/classpath/httpcore-4.2.4.jar +0 -0
- data/classpath/jackson-core-asl-1.9.13.jar +0 -0
- data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
- data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
- data/classpath/jackson-xc-1.9.13.jar +0 -0
- data/classpath/jaxb-api-2.2.2.jar +0 -0
- data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
- data/classpath/jersey-client-1.9.jar +0 -0
- data/classpath/jersey-core-1.9.jar +0 -0
- data/classpath/jersey-guice-1.9.jar +0 -0
- data/classpath/jersey-json-1.9.jar +0 -0
- data/classpath/jersey-server-1.9.jar +0 -0
- data/classpath/jettison-1.1.jar +0 -0
- data/classpath/jetty-util-6.1.26.jar +0 -0
- data/classpath/jline-0.9.94.jar +0 -0
- data/classpath/jsr305-1.3.9.jar +0 -0
- data/classpath/leveldbjni-all-1.8.jar +0 -0
- data/classpath/netty-3.7.0.Final.jar +0 -0
- data/classpath/paranamer-2.3.jar +0 -0
- data/classpath/protobuf-java-2.5.0.jar +0 -0
- data/classpath/servlet-api-2.5.jar +0 -0
- data/classpath/snappy-java-1.0.4.1.jar +0 -0
- data/classpath/stax-api-1.0-2.jar +0 -0
- data/classpath/xmlenc-0.52.jar +0 -0
- data/classpath/xz-1.0.jar +0 -0
- data/classpath/zookeeper-3.4.6.jar +0 -0
- data/lib/embulk/executor/mapreduce.rb +3 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
- data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
- data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
- data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
- data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
- data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
- metadata +131 -0

data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java
@@ -0,0 +1,391 @@
package org.embulk.executor.mapreduce;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.io.File;
import java.io.IOException;
import java.io.EOFException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.net.MalformedURLException;
import org.slf4j.Logger;
import org.joda.time.format.DateTimeFormat;
import com.google.inject.Inject;
import com.google.common.base.Optional;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import org.jruby.embed.ScriptingContainer;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FsConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskCompletionEvent;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.embulk.exec.ForSystemConfig;
import org.embulk.config.ConfigSource;
import org.embulk.config.ConfigException;
import org.embulk.config.TaskSource;
import org.embulk.config.ModelManager;
import org.embulk.spi.Exec;
import org.embulk.spi.ExecSession;
import org.embulk.spi.ExecutorPlugin;
import org.embulk.spi.ProcessTask;
import org.embulk.spi.ProcessState;
import org.embulk.spi.TaskState;
import org.embulk.spi.Schema;
import org.embulk.spi.time.Timestamp;

public class MapReduceExecutor
        implements ExecutorPlugin
{
    private final Logger log = Exec.getLogger(MapReduceExecutor.class);
    private final ConfigSource systemConfig;
    private final ScriptingContainer jruby;

    @Inject
    public MapReduceExecutor(@ForSystemConfig ConfigSource systemConfig,
            ScriptingContainer jruby)
    {
        this.systemConfig = systemConfig;
        this.jruby = jruby;
    }

    @Override
    public void transaction(ConfigSource config, Schema outputSchema, final int inputTaskCount,
            ExecutorPlugin.Control control)
    {
        final MapReduceExecutorTask task = config.loadConfig(MapReduceExecutorTask.class);
        task.setExecConfig(config);

        final int outputTaskCount;
        final int reduceTaskCount;

        if (task.getPartitioning().isPresent()) {
            reduceTaskCount = task.getReducers().or(inputTaskCount);
            if (reduceTaskCount <= 0) {
                throw new ConfigException("Reducers must be larger than 1 if partition: is set");
            }
            outputTaskCount = reduceTaskCount;
            ConfigSource partitioningConfig = task.getPartitioning().get();
            String partitioningType = partitioningConfig.get(String.class, "type");
            Partitioning partitioning = newPartitioning(partitioningType);
            TaskSource partitioningTask = partitioning.configure(partitioningConfig, outputSchema, reduceTaskCount);
            task.setPartitioningType(Optional.of(partitioningType));
            task.setPartitioningTask(Optional.of(partitioningTask));
        } else {
            reduceTaskCount = 0;
            outputTaskCount = inputTaskCount;
            task.setPartitioningType(Optional.<String>absent());
            task.setPartitioningTask(Optional.<TaskSource>absent());
        }

        control.transaction(outputSchema, outputTaskCount, new ExecutorPlugin.Executor() {
            public void execute(ProcessTask procTask, ProcessState state)
            {
                task.setProcessTask(procTask);

                // hadoop uses ServiceLoader using context classloader to load some implementations
                try (SetContextClassLoader closeLater = new SetContextClassLoader(MapReduceExecutor.class.getClassLoader())) {
                    run(task, inputTaskCount, reduceTaskCount, state);
                }
            }
        });
    }

    static Partitioning newPartitioning(String type)
    {
        switch (type) {
        case "timestamp":
            return new TimestampPartitioning();
        default:
            throw new ConfigException("Unknown partition type '"+type+"'");
        }
    }

    void run(MapReduceExecutorTask task,
            int mapTaskCount, int reduceTaskCount, ProcessState state)
    {
        ModelManager modelManager = task.getModelManager();

        Configuration conf = new Configuration();
        // don't call conf.setQuietMode(false). Configuraiton has invalid resource names by default
        for (String path : task.getConfigFiles()) {
            File file = new File(path);
            if (!file.isFile()) {
                throw new ConfigException(String.format("Config file '%s' does not exist", file));
            }
            try {
                // use URL here. Configuration assumes String is a path of a resource in a ClassLoader
                conf.addResource(file.toURI().toURL());
            } catch (MalformedURLException ex) {
                throw new RuntimeException(ex);
            }
        }

        String uniqueTransactionName = getTransactionUniqueName(Exec.session());
        Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);

        Job job;
        try {
            job = Job.getInstance(conf);
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
        job.setJobName(task.getJobName());

        // create a dedicated classloader for this yarn application.
        // allow task.getConfig to overwrite this parameter
        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes

        // extra config
        for (Map.Entry<String, String> pair : task.getConfig().entrySet()) {
            job.getConfiguration().set(pair.getKey(), pair.getValue());
        }

        // framework config
        EmbulkMapReduce.setSystemConfig(job.getConfiguration(), modelManager, systemConfig);
        EmbulkMapReduce.setExecutorTask(job.getConfiguration(), modelManager, task);
        EmbulkMapReduce.setMapTaskCount(job.getConfiguration(), mapTaskCount); // used by EmbulkInputFormat
        EmbulkMapReduce.setStateDirectoryPath(job.getConfiguration(), stateDir);

        // create state dir
        try {
            stateDir.getFileSystem(job.getConfiguration()).mkdirs(stateDir);
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }

        // archive plugins
        PluginArchive archive = new PluginArchive.Builder()
            .addLoadedRubyGems(jruby)
            .build();
        try {
            EmbulkMapReduce.writePluginArchive(job.getConfiguration(), stateDir, archive, modelManager);
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }

        // jar files
        Iterable<Path> jars = collectJars(task.getLibjars());
        job.getConfiguration().set("tmpjars", StringUtils.join(",", jars));

        job.setInputFormatClass(EmbulkInputFormat.class);

        if (reduceTaskCount > 0) {
            job.setMapperClass(EmbulkPartitioningMapReduce.EmbulkPartitioningMapper.class);
            job.setMapOutputKeyClass(BufferWritable.class);
            job.setMapOutputValueClass(PageWritable.class);

            job.setReducerClass(EmbulkPartitioningMapReduce.EmbulkPartitioningReducer.class);

            job.setNumReduceTasks(reduceTaskCount);

        } else {
            job.setMapperClass(EmbulkMapReduce.EmbulkMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            job.setReducerClass(EmbulkMapReduce.EmbulkReducer.class);

            job.setNumReduceTasks(0);
        }

        job.setOutputFormatClass(NullOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);

        try {
            job.submit();

            int interval = Job.getCompletionPollInterval(job.getConfiguration());
            while (!job.isComplete()) {
                //if (job.getState() == JobStatus.State.PREP) {
                //    continue;
                //}
                log.info(String.format("map %.1f%% reduce %.1f%%",
                        job.mapProgress() * 100, job.reduceProgress() * 100));
                Thread.sleep(interval);

                updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
            }

            log.info(String.format("map %.1f%% reduce %.1f%%",
                    job.mapProgress() * 100, job.reduceProgress() * 100));
            updateProcessState(job, mapTaskCount, stateDir, state, modelManager);

            Counters counters = job.getCounters();
            if (counters != null) {
                log.info(counters.toString());
            }
        } catch (IOException | InterruptedException | ClassNotFoundException e) {
            throw Throwables.propagate(e);
        }
    }

    private static Iterable<Path> collectJars(List<String> extraJars)
    {
        Set<Path> set = new HashSet<Path>();

        collectURLClassLoaderJars(set, Exec.class.getClassLoader());
        collectURLClassLoaderJars(set, MapReduceExecutor.class.getClassLoader());

        for (String extraJar : extraJars) {
            URI uri;
            try {
                uri = new URI(extraJar);
            } catch (URISyntaxException ex) {
                throw new ConfigException(String.format("Invalid jar path '%s'", extraJar), ex);
            }
            if (uri.getScheme() == null) {
                set.add(localFileToLocalPath(new File(extraJar)));
            } else {
                set.add(new Path(uri));
            }
        }

        return set;
    }

    private static void collectURLClassLoaderJars(Set<Path> set, ClassLoader cl)
    {
        if (cl instanceof URLClassLoader) {
            for (URL url : ((URLClassLoader) cl).getURLs()) {
                File file = new File(url.getPath());
                if (file.isFile()) {
                    // TODO log if not found
                    // TODO debug logging
                    set.add(localFileToLocalPath(file));
                }
            }
        }
    }

    private static Path localFileToLocalPath(File file)
    {
        Path cwd = new Path(java.nio.file.Paths.get("").toAbsolutePath().toString()).makeQualified(FsConstants.LOCAL_FS_URI, new Path("/"));
        return new Path(file.toString()).makeQualified(FsConstants.LOCAL_FS_URI, cwd);
    }

    private static String getTransactionUniqueName(ExecSession session)
    {
        // TODO implement Exec.getTransactionUniqueName()
        Timestamp time = session.getTransactionTime();
        return DateTimeFormat.forPattern("yyyyMMdd_HHmmss_").withZoneUTC()
            .print(time.getEpochSecond() * 1000)
            + String.format("%09d", time.getNano());
    }

    private void updateProcessState(Job job, int mapTaskCount, Path stateDir,
            ProcessState state, ModelManager modelManager) throws IOException
    {
        List<AttemptReport> reports = getAttemptReports(job.getConfiguration(), stateDir, modelManager);

        for (AttemptReport report : reports) {
            if (report == null) {
                continue;
            }
            if (!report.isStarted()) {
                continue;
            }
            AttemptState attempt = report.getAttemptState();
            if (attempt.getInputTaskIndex().isPresent()) {
                updateState(state.getInputTaskState(attempt.getInputTaskIndex().get()), attempt, true);
            }
            if (attempt.getOutputTaskIndex().isPresent()) {
                updateState(state.getOutputTaskState(attempt.getOutputTaskIndex().get()), attempt, false);
            }
        }
    }

    private static void updateState(TaskState state, AttemptState attempt, boolean isInput)
    {
        state.start();
        if (attempt.getException().isPresent()) {
            if (!state.isCommitted()) {
                state.setException(new RemoteTaskFailedException(attempt.getException().get()));
            }
        } else if (
                (isInput && attempt.getInputCommitReport().isPresent()) ||
                (!isInput && attempt.getOutputCommitReport().isPresent())) {
            state.resetException();
        }
        if (isInput && attempt.getInputCommitReport().isPresent()) {
            state.setCommitReport(attempt.getInputCommitReport().get());
            state.finish();
        }
        if (!isInput && attempt.getOutputCommitReport().isPresent()) {
            state.setCommitReport(attempt.getOutputCommitReport().get());
            state.finish();
        }
    }

    private static class AttemptReport
    {
        private final TaskAttemptID attemptId;
        private final AttemptState attemptState;

        public AttemptReport(TaskAttemptID attemptId)
        {
            this(attemptId, null);
        }

        public AttemptReport(TaskAttemptID attemptId, AttemptState attemptState)
        {
            this.attemptId = attemptId;
            this.attemptState = attemptState;
        }

        public boolean isStarted()
        {
            return attemptState != null;
        }

        public boolean isInputCommitted()
        {
            return attemptState != null && attemptState.getInputCommitReport().isPresent();
        }

        public boolean isOutputCommitted()
        {
            return attemptState != null && attemptState.getOutputCommitReport().isPresent();
        }

        public AttemptState getAttemptState()
        {
            return attemptState;
        }
    }

    private static final int TASK_EVENT_FETCH_SIZE = 100;

    private static List<AttemptReport> getAttemptReports(Configuration config,
            Path stateDir, ModelManager modelManager) throws IOException
    {
        ImmutableList.Builder<AttemptReport> builder = ImmutableList.builder();
        for (TaskAttemptID aid : EmbulkMapReduce.listAttempts(config, stateDir)) {
            try {
                AttemptState state = EmbulkMapReduce.readAttemptStateFile(config,
                        stateDir, aid, modelManager);
                builder.add(new AttemptReport(aid, state));
            } catch (EOFException ex) {  // plus Not Found exception
                builder.add(new AttemptReport(aid, null));
            }
        }
        return builder.build();
    }
}

data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java
@@ -0,0 +1,60 @@
package org.embulk.executor.mapreduce;

import java.util.List;
import java.util.Map;
import com.google.common.base.Optional;
import org.embulk.config.Config;
import org.embulk.config.ConfigInject;
import org.embulk.config.ConfigDefault;
import org.embulk.config.ConfigSource;
import org.embulk.config.Task;
import org.embulk.config.TaskSource;
import org.embulk.config.ModelManager;
import org.embulk.spi.ProcessTask;

public interface MapReduceExecutorTask
        extends Task
{
    @Config("job_name")
    @ConfigDefault("\"embulk\"")
    public String getJobName();

    @Config("config_files")
    @ConfigDefault("[]")
    public List<String> getConfigFiles();

    @Config("config")
    @ConfigDefault("{}")
    public Map<String, String> getConfig();

    @Config("libjars")
    @ConfigDefault("[]")
    public List<String> getLibjars();

    @Config("state_path")
    @ConfigDefault("\"/tmp/embulk\"")
    public String getStatePath();

    @Config("reducers")
    @ConfigDefault("null")
    public Optional<Integer> getReducers();

    @Config("partitioning")
    @ConfigDefault("null")
    public Optional<ConfigSource> getPartitioning();

    @ConfigInject
    public ModelManager getModelManager();

    public ConfigSource getExecConfig();
    public void setExecConfig(ConfigSource execConfig);

    public ProcessTask getProcessTask();
    public void setProcessTask(ProcessTask task);

    public Optional<String> getPartitioningType();
    public void setPartitioningType(Optional<String> partitioningType);

    public Optional<TaskSource> getPartitioningTask();
    public void setPartitioningTask(Optional<TaskSource> partitioningTask);
}
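
The @Config keys above map directly onto the executor section of an Embulk configuration file. The snippet below is an illustrative sketch, not taken from the gem: it assumes the plugin is registered under the name "mapreduce" (as data/lib/embulk/executor/mapreduce.rb suggests), and the Hadoop file paths, host names, and reducer count are placeholders. Only the "type" key under "partitioning" is defined by the sources shown here; the remaining timestamp-partitioning options live in TimestampPartitioning.java, which is not reproduced in this section.

exec:
  type: mapreduce
  job_name: embulk
  config_files:
    - /etc/hadoop/conf/core-site.xml
    - /etc/hadoop/conf/mapred-site.xml
  config:
    fs.defaultFS: "hdfs://namenode:8020"          # placeholder cluster settings
    yarn.resourcemanager.hostname: "resourcemanager"
  libjars: []                                      # extra jars to ship, local paths or URIs
  state_path: /tmp/embulk                          # attempt-state directory on the cluster FS
  reducers: 4                                      # only meaningful when partitioning is set
  partitioning:
    type: timestamp                                # selected by MapReduceExecutor.newPartitioning()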

data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java
@@ -0,0 +1,66 @@
package org.embulk.executor.mapreduce;

import java.io.IOException;
import java.io.DataOutput;
import java.io.DataInput;
import java.util.List;
import java.util.ArrayList;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.embulk.spi.Buffer;
import org.embulk.spi.Page;
import static java.nio.charset.StandardCharsets.UTF_8;

public class PageWritable
        implements Writable
{
    private Page page;

    public PageWritable() { }

    public void set(Page page)
    {
        this.page = page;
    }

    public Page get()
    {
        return page;
    }

    @Override
    public void write(DataOutput out) throws IOException
    {
        Buffer buffer = page.buffer();
        out.writeInt(buffer.limit());
        out.write(buffer.array(), buffer.offset(), buffer.limit());

        List<String> stringReferences = page.getStringReferences();
        WritableUtils.writeVInt(out, stringReferences.size());
        for (String s : stringReferences) {
            out.writeUTF(s);
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException
    {
        int bufferSize = in.readInt();
        byte[] bytes = new byte[bufferSize];  // TODO usa buffer allocator?
        in.readFully(bytes, 0, bufferSize);
        Buffer buffer = Buffer.wrap(bytes);

        int stringCount = WritableUtils.readVInt(in);
        List<String> strings = new ArrayList<String>(stringCount);
        for (int i=0; i < stringCount; i++) {
            strings.add(in.readUTF());
        }

        Page newPage = Page.wrap(buffer);
        newPage.setStringReferences(strings);
        if (page != null) {
            page.release();
        }
        page = newPage;
    }
}
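
To illustrate the Writable contract implemented above, here is a minimal round-trip sketch; it is not part of the gem. It uses only SPI calls already visible in PageWritable (Buffer.wrap, Page.wrap, setStringReferences, getStringReferences) plus standard java.io streams; the class name PageWritableRoundTrip and the sample payload are made up for the example and assume the Embulk SPI version this release compiles against.

package org.embulk.executor.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import org.embulk.spi.Buffer;
import org.embulk.spi.Page;

public class PageWritableRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        // build a Page from a raw byte payload and a string-reference table
        Page original = Page.wrap(Buffer.wrap(new byte[] { 1, 2, 3, 4 }));
        original.setStringReferences(Arrays.asList("foo", "bar"));

        PageWritable writable = new PageWritable();
        writable.set(original);

        // serialize through the Writable interface, as Hadoop's shuffle would
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        writable.write(new DataOutputStream(sink));

        // deserialize into a fresh instance
        PageWritable restored = new PageWritable();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(sink.toByteArray())));

        System.out.println(restored.get().getStringReferences());  // prints [foo, bar]
    }
}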

data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java
@@ -0,0 +1,12 @@
package org.embulk.executor.mapreduce;

import org.embulk.config.ConfigSource;
import org.embulk.config.TaskSource;
import org.embulk.spi.Schema;

public interface Partitioning
{
    public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount);

    public Partitioner newPartitioner(TaskSource taskSource);
}
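
Of the sources shown here, TimestampPartitioning (listed in the manifest above) is the only Partitioning implementation bundled in this release: MapReduceExecutor.newPartitioning() maps the configured type "timestamp" to it and rejects any other value with a ConfigException.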