embulk-executor-mapreduce 0.1.0
- checksums.yaml +7 -0
- data/build.gradle +2 -0
- data/classpath/activation-1.1.jar +0 -0
- data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
- data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
- data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
- data/classpath/api-util-1.0.0-M20.jar +0 -0
- data/classpath/avro-1.7.4.jar +0 -0
- data/classpath/commons-beanutils-1.7.0.jar +0 -0
- data/classpath/commons-cli-1.2.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-collections-3.2.1.jar +0 -0
- data/classpath/commons-compress-1.4.1.jar +0 -0
- data/classpath/commons-configuration-1.6.jar +0 -0
- data/classpath/commons-digester-1.8.jar +0 -0
- data/classpath/commons-httpclient-3.1.jar +0 -0
- data/classpath/commons-io-2.4.jar +0 -0
- data/classpath/commons-lang-2.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/commons-math3-3.1.1.jar +0 -0
- data/classpath/commons-net-3.1.jar +0 -0
- data/classpath/curator-client-2.6.0.jar +0 -0
- data/classpath/curator-framework-2.6.0.jar +0 -0
- data/classpath/curator-recipes-2.6.0.jar +0 -0
- data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
- data/classpath/gson-2.2.4.jar +0 -0
- data/classpath/hadoop-annotations-2.6.0.jar +0 -0
- data/classpath/hadoop-auth-2.6.0.jar +0 -0
- data/classpath/hadoop-client-2.6.0.jar +0 -0
- data/classpath/hadoop-common-2.6.0.jar +0 -0
- data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
- data/classpath/htrace-core-3.0.4.jar +0 -0
- data/classpath/httpclient-4.2.5.jar +0 -0
- data/classpath/httpcore-4.2.4.jar +0 -0
- data/classpath/jackson-core-asl-1.9.13.jar +0 -0
- data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
- data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
- data/classpath/jackson-xc-1.9.13.jar +0 -0
- data/classpath/jaxb-api-2.2.2.jar +0 -0
- data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
- data/classpath/jersey-client-1.9.jar +0 -0
- data/classpath/jersey-core-1.9.jar +0 -0
- data/classpath/jersey-guice-1.9.jar +0 -0
- data/classpath/jersey-json-1.9.jar +0 -0
- data/classpath/jersey-server-1.9.jar +0 -0
- data/classpath/jettison-1.1.jar +0 -0
- data/classpath/jetty-util-6.1.26.jar +0 -0
- data/classpath/jline-0.9.94.jar +0 -0
- data/classpath/jsr305-1.3.9.jar +0 -0
- data/classpath/leveldbjni-all-1.8.jar +0 -0
- data/classpath/netty-3.7.0.Final.jar +0 -0
- data/classpath/paranamer-2.3.jar +0 -0
- data/classpath/protobuf-java-2.5.0.jar +0 -0
- data/classpath/servlet-api-2.5.jar +0 -0
- data/classpath/snappy-java-1.0.4.1.jar +0 -0
- data/classpath/stax-api-1.0-2.jar +0 -0
- data/classpath/xmlenc-0.52.jar +0 -0
- data/classpath/xz-1.0.jar +0 -0
- data/classpath/zookeeper-3.4.6.jar +0 -0
- data/lib/embulk/executor/mapreduce.rb +3 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
- data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
- data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
- data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
- data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
- data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
- metadata +131 -0
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java
@@ -0,0 +1,391 @@
+package org.embulk.executor.mapreduce;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.HashSet;
+import java.io.File;
+import java.io.IOException;
+import java.io.EOFException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.net.MalformedURLException;
+import org.slf4j.Logger;
+import org.joda.time.format.DateTimeFormat;
+import com.google.inject.Inject;
+import com.google.common.base.Optional;
+import com.google.common.base.Throwables;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterators;
+import org.jruby.embed.ScriptingContainer;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FsConstants;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Cluster;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskCompletionEvent;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+import org.embulk.exec.ForSystemConfig;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.ConfigException;
+import org.embulk.config.TaskSource;
+import org.embulk.config.ModelManager;
+import org.embulk.spi.Exec;
+import org.embulk.spi.ExecSession;
+import org.embulk.spi.ExecutorPlugin;
+import org.embulk.spi.ProcessTask;
+import org.embulk.spi.ProcessState;
+import org.embulk.spi.TaskState;
+import org.embulk.spi.Schema;
+import org.embulk.spi.time.Timestamp;
+
+public class MapReduceExecutor
+        implements ExecutorPlugin
+{
+    private final Logger log = Exec.getLogger(MapReduceExecutor.class);
+    private final ConfigSource systemConfig;
+    private final ScriptingContainer jruby;
+
+    @Inject
+    public MapReduceExecutor(@ForSystemConfig ConfigSource systemConfig,
+            ScriptingContainer jruby)
+    {
+        this.systemConfig = systemConfig;
+        this.jruby = jruby;
+    }
+
+    @Override
+    public void transaction(ConfigSource config, Schema outputSchema, final int inputTaskCount,
+            ExecutorPlugin.Control control)
+    {
+        final MapReduceExecutorTask task = config.loadConfig(MapReduceExecutorTask.class);
+        task.setExecConfig(config);
+
+        final int outputTaskCount;
+        final int reduceTaskCount;
+
+        if (task.getPartitioning().isPresent()) {
+            reduceTaskCount = task.getReducers().or(inputTaskCount);
+            if (reduceTaskCount <= 0) {
+                throw new ConfigException("Reducers must be larger than 0 if partition: is set");
+            }
+            outputTaskCount = reduceTaskCount;
+            ConfigSource partitioningConfig = task.getPartitioning().get();
+            String partitioningType = partitioningConfig.get(String.class, "type");
+            Partitioning partitioning = newPartitioning(partitioningType);
+            TaskSource partitioningTask = partitioning.configure(partitioningConfig, outputSchema, reduceTaskCount);
+            task.setPartitioningType(Optional.of(partitioningType));
+            task.setPartitioningTask(Optional.of(partitioningTask));
+        } else {
+            reduceTaskCount = 0;
+            outputTaskCount = inputTaskCount;
+            task.setPartitioningType(Optional.<String>absent());
+            task.setPartitioningTask(Optional.<TaskSource>absent());
+        }
+
+        control.transaction(outputSchema, outputTaskCount, new ExecutorPlugin.Executor() {
+            public void execute(ProcessTask procTask, ProcessState state)
+            {
+                task.setProcessTask(procTask);
+
+                // Hadoop uses ServiceLoader with the context classloader to load some implementations
+                try (SetContextClassLoader closeLater = new SetContextClassLoader(MapReduceExecutor.class.getClassLoader())) {
+                    run(task, inputTaskCount, reduceTaskCount, state);
+                }
+            }
+        });
+    }
+
+    static Partitioning newPartitioning(String type)
+    {
+        switch (type) {
+        case "timestamp":
+            return new TimestampPartitioning();
+        default:
+            throw new ConfigException("Unknown partition type '" + type + "'");
+        }
+    }
+
+    void run(MapReduceExecutorTask task,
+            int mapTaskCount, int reduceTaskCount, ProcessState state)
+    {
+        ModelManager modelManager = task.getModelManager();
+
+        Configuration conf = new Configuration();
+        // don't call conf.setQuietMode(false). Configuration has invalid resource names by default
+        for (String path : task.getConfigFiles()) {
+            File file = new File(path);
+            if (!file.isFile()) {
+                throw new ConfigException(String.format("Config file '%s' does not exist", file));
+            }
+            try {
+                // use URL here. Configuration assumes String is a path of a resource in a ClassLoader
+                conf.addResource(file.toURI().toURL());
+            } catch (MalformedURLException ex) {
+                throw new RuntimeException(ex);
+            }
+        }
+
+        String uniqueTransactionName = getTransactionUniqueName(Exec.session());
+        Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);
+
+        Job job;
+        try {
+            job = Job.getInstance(conf);
+        } catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+        job.setJobName(task.getJobName());
+
+        // create a dedicated classloader for this yarn application.
+        // allow task.getConfig to overwrite this parameter
+        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
+        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes
+
+        // extra config
+        for (Map.Entry<String, String> pair : task.getConfig().entrySet()) {
+            job.getConfiguration().set(pair.getKey(), pair.getValue());
+        }
+
+        // framework config
+        EmbulkMapReduce.setSystemConfig(job.getConfiguration(), modelManager, systemConfig);
+        EmbulkMapReduce.setExecutorTask(job.getConfiguration(), modelManager, task);
+        EmbulkMapReduce.setMapTaskCount(job.getConfiguration(), mapTaskCount); // used by EmbulkInputFormat
+        EmbulkMapReduce.setStateDirectoryPath(job.getConfiguration(), stateDir);
+
+        // create state dir
+        try {
+            stateDir.getFileSystem(job.getConfiguration()).mkdirs(stateDir);
+        } catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        // archive plugins
+        PluginArchive archive = new PluginArchive.Builder()
+                .addLoadedRubyGems(jruby)
+                .build();
+        try {
+            EmbulkMapReduce.writePluginArchive(job.getConfiguration(), stateDir, archive, modelManager);
+        } catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        // jar files
+        Iterable<Path> jars = collectJars(task.getLibjars());
+        job.getConfiguration().set("tmpjars", StringUtils.join(",", jars));
+
+        job.setInputFormatClass(EmbulkInputFormat.class);
+
+        if (reduceTaskCount > 0) {
+            job.setMapperClass(EmbulkPartitioningMapReduce.EmbulkPartitioningMapper.class);
+            job.setMapOutputKeyClass(BufferWritable.class);
+            job.setMapOutputValueClass(PageWritable.class);
+
+            job.setReducerClass(EmbulkPartitioningMapReduce.EmbulkPartitioningReducer.class);
+
+            job.setNumReduceTasks(reduceTaskCount);
+
+        } else {
+            job.setMapperClass(EmbulkMapReduce.EmbulkMapper.class);
+            job.setMapOutputKeyClass(NullWritable.class);
+            job.setMapOutputValueClass(NullWritable.class);
+
+            job.setReducerClass(EmbulkMapReduce.EmbulkReducer.class);
+
+            job.setNumReduceTasks(0);
+        }
+
+        job.setOutputFormatClass(NullOutputFormat.class);
+        job.setOutputKeyClass(NullWritable.class);
+        job.setOutputValueClass(NullWritable.class);
+
+        try {
+            job.submit();
+
+            int interval = Job.getCompletionPollInterval(job.getConfiguration());
+            while (!job.isComplete()) {
+                //if (job.getState() == JobStatus.State.PREP) {
+                //    continue;
+                //}
+                log.info(String.format("map %.1f%% reduce %.1f%%",
+                        job.mapProgress() * 100, job.reduceProgress() * 100));
+                Thread.sleep(interval);
+
+                updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+            }
+
+            log.info(String.format("map %.1f%% reduce %.1f%%",
+                    job.mapProgress() * 100, job.reduceProgress() * 100));
+            updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+
+            Counters counters = job.getCounters();
+            if (counters != null) {
+                log.info(counters.toString());
+            }
+        } catch (IOException | InterruptedException | ClassNotFoundException e) {
+            throw Throwables.propagate(e);
+        }
+    }
+
+    private static Iterable<Path> collectJars(List<String> extraJars)
+    {
+        Set<Path> set = new HashSet<Path>();
+
+        collectURLClassLoaderJars(set, Exec.class.getClassLoader());
+        collectURLClassLoaderJars(set, MapReduceExecutor.class.getClassLoader());
+
+        for (String extraJar : extraJars) {
+            URI uri;
+            try {
+                uri = new URI(extraJar);
+            } catch (URISyntaxException ex) {
+                throw new ConfigException(String.format("Invalid jar path '%s'", extraJar), ex);
+            }
+            if (uri.getScheme() == null) {
+                set.add(localFileToLocalPath(new File(extraJar)));
+            } else {
+                set.add(new Path(uri));
+            }
+        }
+
+        return set;
+    }
+
+    private static void collectURLClassLoaderJars(Set<Path> set, ClassLoader cl)
+    {
+        if (cl instanceof URLClassLoader) {
+            for (URL url : ((URLClassLoader) cl).getURLs()) {
+                File file = new File(url.getPath());
+                if (file.isFile()) {
+                    // TODO log if not found
+                    // TODO debug logging
+                    set.add(localFileToLocalPath(file));
+                }
+            }
+        }
+    }
+
+    private static Path localFileToLocalPath(File file)
+    {
+        Path cwd = new Path(java.nio.file.Paths.get("").toAbsolutePath().toString()).makeQualified(FsConstants.LOCAL_FS_URI, new Path("/"));
+        return new Path(file.toString()).makeQualified(FsConstants.LOCAL_FS_URI, cwd);
+    }
+
+    private static String getTransactionUniqueName(ExecSession session)
+    {
+        // TODO implement Exec.getTransactionUniqueName()
+        Timestamp time = session.getTransactionTime();
+        return DateTimeFormat.forPattern("yyyyMMdd_HHmmss_").withZoneUTC()
+                .print(time.getEpochSecond() * 1000)
+                + String.format("%09d", time.getNano());
+    }
+
+    private void updateProcessState(Job job, int mapTaskCount, Path stateDir,
+            ProcessState state, ModelManager modelManager) throws IOException
+    {
+        List<AttemptReport> reports = getAttemptReports(job.getConfiguration(), stateDir, modelManager);
+
+        for (AttemptReport report : reports) {
+            if (report == null) {
+                continue;
+            }
+            if (!report.isStarted()) {
+                continue;
+            }
+            AttemptState attempt = report.getAttemptState();
+            if (attempt.getInputTaskIndex().isPresent()) {
+                updateState(state.getInputTaskState(attempt.getInputTaskIndex().get()), attempt, true);
+            }
+            if (attempt.getOutputTaskIndex().isPresent()) {
+                updateState(state.getOutputTaskState(attempt.getOutputTaskIndex().get()), attempt, false);
+            }
+        }
+    }
+
+    private static void updateState(TaskState state, AttemptState attempt, boolean isInput)
+    {
+        state.start();
+        if (attempt.getException().isPresent()) {
+            if (!state.isCommitted()) {
+                state.setException(new RemoteTaskFailedException(attempt.getException().get()));
+            }
+        } else if (
+                (isInput && attempt.getInputCommitReport().isPresent()) ||
+                (!isInput && attempt.getOutputCommitReport().isPresent())) {
+            state.resetException();
+        }
+        if (isInput && attempt.getInputCommitReport().isPresent()) {
+            state.setCommitReport(attempt.getInputCommitReport().get());
+            state.finish();
+        }
+        if (!isInput && attempt.getOutputCommitReport().isPresent()) {
+            state.setCommitReport(attempt.getOutputCommitReport().get());
+            state.finish();
+        }
+    }
+
+    private static class AttemptReport
+    {
+        private final TaskAttemptID attemptId;
+        private final AttemptState attemptState;
+
+        public AttemptReport(TaskAttemptID attemptId)
+        {
+            this(attemptId, null);
+        }
+
+        public AttemptReport(TaskAttemptID attemptId, AttemptState attemptState)
+        {
+            this.attemptId = attemptId;
+            this.attemptState = attemptState;
+        }
+
+        public boolean isStarted()
+        {
+            return attemptState != null;
+        }
+
+        public boolean isInputCommitted()
+        {
+            return attemptState != null && attemptState.getInputCommitReport().isPresent();
+        }
+
+        public boolean isOutputCommitted()
+        {
+            return attemptState != null && attemptState.getOutputCommitReport().isPresent();
+        }
+
+        public AttemptState getAttemptState()
+        {
+            return attemptState;
+        }
+    }
+
+    private static final int TASK_EVENT_FETCH_SIZE = 100;
+
+    private static List<AttemptReport> getAttemptReports(Configuration config,
+            Path stateDir, ModelManager modelManager) throws IOException
+    {
+        ImmutableList.Builder<AttemptReport> builder = ImmutableList.builder();
+        for (TaskAttemptID aid : EmbulkMapReduce.listAttempts(config, stateDir)) {
+            try {
+                AttemptState state = EmbulkMapReduce.readAttemptStateFile(config,
+                        stateDir, aid, modelManager);
+                builder.add(new AttemptReport(aid, state));
+            } catch (EOFException ex) { // TODO: also handle Not Found exceptions
+                builder.add(new AttemptReport(aid, null));
+            }
+        }
+        return builder.build();
+    }
+}
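A minimal, runnable sketch of the state-directory naming that getTransactionUniqueName produces above, assuming only joda-time on the classpath; the epoch-second and nanosecond values are illustrative stand-ins for ExecSession.getTransactionTime():

import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

public class UniqueNameExample
{
    public static void main(String[] args)
    {
        // Illustrative values; the executor reads them from the Embulk session's transaction time.
        long epochSecond = 1420070400L; // 2015-01-01T00:00:00Z
        int nano = 123456789;

        // Same pattern as getTransactionUniqueName: UTC wall clock plus zero-padded nanoseconds.
        DateTimeFormatter format = DateTimeFormat.forPattern("yyyyMMdd_HHmmss_").withZoneUTC();
        String name = format.print(epochSecond * 1000) + String.format("%09d", nano);
        System.out.println(name); // 20150101_000000_123456789
    }
}

Each transaction therefore gets its own directory under state_path, e.g. /tmp/embulk/20150101_000000_123456789 with the default state_path.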
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java
@@ -0,0 +1,60 @@
+package org.embulk.executor.mapreduce;
+
+import java.util.List;
+import java.util.Map;
+import com.google.common.base.Optional;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigInject;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.config.ModelManager;
+import org.embulk.spi.ProcessTask;
+
+public interface MapReduceExecutorTask
+        extends Task
+{
+    @Config("job_name")
+    @ConfigDefault("\"embulk\"")
+    public String getJobName();
+
+    @Config("config_files")
+    @ConfigDefault("[]")
+    public List<String> getConfigFiles();
+
+    @Config("config")
+    @ConfigDefault("{}")
+    public Map<String, String> getConfig();
+
+    @Config("libjars")
+    @ConfigDefault("[]")
+    public List<String> getLibjars();
+
+    @Config("state_path")
+    @ConfigDefault("\"/tmp/embulk\"")
+    public String getStatePath();
+
+    @Config("reducers")
+    @ConfigDefault("null")
+    public Optional<Integer> getReducers();
+
+    @Config("partitioning")
+    @ConfigDefault("null")
+    public Optional<ConfigSource> getPartitioning();
+
+    @ConfigInject
+    public ModelManager getModelManager();
+
+    public ConfigSource getExecConfig();
+    public void setExecConfig(ConfigSource execConfig);
+
+    public ProcessTask getProcessTask();
+    public void setProcessTask(ProcessTask task);
+
+    public Optional<String> getPartitioningType();
+    public void setPartitioningType(Optional<String> partitioningType);
+
+    public Optional<TaskSource> getPartitioningTask();
+    public void setPartitioningTask(Optional<TaskSource> partitioningTask);
+}
data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java
@@ -0,0 +1,66 @@
+package org.embulk.executor.mapreduce;
+
+import java.io.IOException;
+import java.io.DataOutput;
+import java.io.DataInput;
+import java.util.List;
+import java.util.ArrayList;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.embulk.spi.Buffer;
+import org.embulk.spi.Page;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class PageWritable
+        implements Writable
+{
+    private Page page;
+
+    public PageWritable() { }
+
+    public void set(Page page)
+    {
+        this.page = page;
+    }
+
+    public Page get()
+    {
+        return page;
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException
+    {
+        Buffer buffer = page.buffer();
+        out.writeInt(buffer.limit());
+        out.write(buffer.array(), buffer.offset(), buffer.limit());
+
+        List<String> stringReferences = page.getStringReferences();
+        WritableUtils.writeVInt(out, stringReferences.size());
+        for (String s : stringReferences) {
+            out.writeUTF(s);
+        }
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException
+    {
+        int bufferSize = in.readInt();
+        byte[] bytes = new byte[bufferSize]; // TODO: use a buffer allocator?
+        in.readFully(bytes, 0, bufferSize);
+        Buffer buffer = Buffer.wrap(bytes);
+
+        int stringCount = WritableUtils.readVInt(in);
+        List<String> strings = new ArrayList<String>(stringCount);
+        for (int i = 0; i < stringCount; i++) {
+            strings.add(in.readUTF());
+        }
+
+        Page newPage = Page.wrap(buffer);
+        newPage.setStringReferences(strings);
+        if (page != null) {
+            page.release();
+        }
+        page = newPage;
+    }
+}
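PageWritable serializes a Page as its length-prefixed raw buffer followed by its string references, which is what lets pages travel as Hadoop map-output values in the partitioning path above. A minimal round-trip sketch, assuming embulk-core and hadoop-common on the classpath; the byte and string contents are illustrative, not a real Embulk page layout:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import org.embulk.spi.Buffer;
import org.embulk.spi.Page;

public class PageWritableRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        // Build a small page by hand, using the same calls readFields uses.
        Page original = Page.wrap(Buffer.wrap(new byte[] { 1, 2, 3, 4 }));
        original.setStringReferences(Arrays.asList("a", "b"));

        // Serialize through the Writable interface.
        PageWritable out = new PageWritable();
        out.set(original);
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        out.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and inspect the copy.
        PageWritable in = new PageWritable();
        in.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(in.get().getStringReferences()); // [a, b]
    }
}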
data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java
@@ -0,0 +1,12 @@
+package org.embulk.executor.mapreduce;
+
+import org.embulk.config.ConfigSource;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.Schema;
+
+public interface Partitioning
+{
+    public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount);
+
+    public Partitioner newPartitioner(TaskSource taskSource);
+}
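Partitioning is the extension point behind the partitioning: option: configure validates the config and returns a TaskSource during the transaction, and newPartitioner builds the runtime Partitioner on each task. TimestampPartitioning (type: timestamp) is the only implementation shipped in 0.1.0; dispatch happens in MapReduceExecutor.newPartitioning, which a second type would extend as in this sketch (HashPartitioning is invented for illustration and does not exist in this gem):

// Hypothetical variant of MapReduceExecutor.newPartitioning; only "timestamp" ships in 0.1.0.
static Partitioning newPartitioning(String type)
{
    switch (type) {
    case "timestamp":
        return new TimestampPartitioning();
    case "hash": // hypothetical type, for illustration only
        return new HashPartitioning(); // hypothetical Partitioning implementation
    default:
        throw new ConfigException("Unknown partition type '" + type + "'");
    }
}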