embulk-executor-mapreduce 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +7 -0
  2. data/build.gradle +2 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/avro-1.7.4.jar +0 -0
  9. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  10. data/classpath/commons-cli-1.2.jar +0 -0
  11. data/classpath/commons-codec-1.6.jar +0 -0
  12. data/classpath/commons-collections-3.2.1.jar +0 -0
  13. data/classpath/commons-compress-1.4.1.jar +0 -0
  14. data/classpath/commons-configuration-1.6.jar +0 -0
  15. data/classpath/commons-digester-1.8.jar +0 -0
  16. data/classpath/commons-httpclient-3.1.jar +0 -0
  17. data/classpath/commons-io-2.4.jar +0 -0
  18. data/classpath/commons-lang-2.6.jar +0 -0
  19. data/classpath/commons-logging-1.1.3.jar +0 -0
  20. data/classpath/commons-math3-3.1.1.jar +0 -0
  21. data/classpath/commons-net-3.1.jar +0 -0
  22. data/classpath/curator-client-2.6.0.jar +0 -0
  23. data/classpath/curator-framework-2.6.0.jar +0 -0
  24. data/classpath/curator-recipes-2.6.0.jar +0 -0
  25. data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
  26. data/classpath/gson-2.2.4.jar +0 -0
  27. data/classpath/hadoop-annotations-2.6.0.jar +0 -0
  28. data/classpath/hadoop-auth-2.6.0.jar +0 -0
  29. data/classpath/hadoop-client-2.6.0.jar +0 -0
  30. data/classpath/hadoop-common-2.6.0.jar +0 -0
  31. data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
  32. data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
  37. data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
  38. data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
  39. data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
  40. data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
  41. data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
  42. data/classpath/htrace-core-3.0.4.jar +0 -0
  43. data/classpath/httpclient-4.2.5.jar +0 -0
  44. data/classpath/httpcore-4.2.4.jar +0 -0
  45. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  46. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  47. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  48. data/classpath/jackson-xc-1.9.13.jar +0 -0
  49. data/classpath/jaxb-api-2.2.2.jar +0 -0
  50. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  51. data/classpath/jersey-client-1.9.jar +0 -0
  52. data/classpath/jersey-core-1.9.jar +0 -0
  53. data/classpath/jersey-guice-1.9.jar +0 -0
  54. data/classpath/jersey-json-1.9.jar +0 -0
  55. data/classpath/jersey-server-1.9.jar +0 -0
  56. data/classpath/jettison-1.1.jar +0 -0
  57. data/classpath/jetty-util-6.1.26.jar +0 -0
  58. data/classpath/jline-0.9.94.jar +0 -0
  59. data/classpath/jsr305-1.3.9.jar +0 -0
  60. data/classpath/leveldbjni-all-1.8.jar +0 -0
  61. data/classpath/netty-3.7.0.Final.jar +0 -0
  62. data/classpath/paranamer-2.3.jar +0 -0
  63. data/classpath/protobuf-java-2.5.0.jar +0 -0
  64. data/classpath/servlet-api-2.5.jar +0 -0
  65. data/classpath/snappy-java-1.0.4.1.jar +0 -0
  66. data/classpath/stax-api-1.0-2.jar +0 -0
  67. data/classpath/xmlenc-0.52.jar +0 -0
  68. data/classpath/xz-1.0.jar +0 -0
  69. data/classpath/zookeeper-3.4.6.jar +0 -0
  70. data/lib/embulk/executor/mapreduce.rb +3 -0
  71. data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
  72. data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
  73. data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
  74. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
  75. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
  76. data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
  77. data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
  78. data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
  79. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
  80. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
  81. data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
  82. data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
  83. data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
  84. data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
  85. data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
  86. data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
  87. data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
  88. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
  89. metadata +131 -0
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java
@@ -0,0 +1,391 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.List;
+ import java.util.Map;
+ import java.util.Set;
+ import java.util.HashSet;
+ import java.io.File;
+ import java.io.IOException;
+ import java.io.EOFException;
+ import java.net.URI;
+ import java.net.URISyntaxException;
+ import java.net.URL;
+ import java.net.URLClassLoader;
+ import java.net.MalformedURLException;
+ import org.slf4j.Logger;
+ import org.joda.time.format.DateTimeFormat;
+ import com.google.inject.Inject;
+ import com.google.common.base.Optional;
+ import com.google.common.base.Throwables;
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.Iterators;
+ import org.jruby.embed.ScriptingContainer;
+ import org.apache.hadoop.util.StringUtils;
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.NullWritable;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.FsConstants;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.mapreduce.JobContext;
+ import org.apache.hadoop.mapreduce.Cluster;
+ import org.apache.hadoop.mapreduce.Job;
+ import org.apache.hadoop.mapreduce.Counters;
+ import org.apache.hadoop.mapreduce.TaskType;
+ import org.apache.hadoop.mapreduce.TaskAttemptID;
+ import org.apache.hadoop.mapreduce.TaskCompletionEvent;
+ import org.apache.hadoop.mapreduce.MRJobConfig;
+ import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+ import org.embulk.exec.ForSystemConfig;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.ConfigException;
+ import org.embulk.config.TaskSource;
+ import org.embulk.config.ModelManager;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.ExecSession;
+ import org.embulk.spi.ExecutorPlugin;
+ import org.embulk.spi.ProcessTask;
+ import org.embulk.spi.ProcessState;
+ import org.embulk.spi.TaskState;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.time.Timestamp;
+
+ public class MapReduceExecutor
+         implements ExecutorPlugin
+ {
+     private final Logger log = Exec.getLogger(MapReduceExecutor.class);
+     private final ConfigSource systemConfig;
+     private final ScriptingContainer jruby;
+
+     @Inject
+     public MapReduceExecutor(@ForSystemConfig ConfigSource systemConfig,
+             ScriptingContainer jruby)
+     {
+         this.systemConfig = systemConfig;
+         this.jruby = jruby;
+     }
+
+     @Override
+     public void transaction(ConfigSource config, Schema outputSchema, final int inputTaskCount,
+             ExecutorPlugin.Control control)
+     {
+         final MapReduceExecutorTask task = config.loadConfig(MapReduceExecutorTask.class);
+         task.setExecConfig(config);
+
+         final int outputTaskCount;
+         final int reduceTaskCount;
+
+         if (task.getPartitioning().isPresent()) {
+             reduceTaskCount = task.getReducers().or(inputTaskCount);
+             if (reduceTaskCount <= 0) {
+                 throw new ConfigException("Reducers must be at least 1 if partitioning: is set");
+             }
+             outputTaskCount = reduceTaskCount;
+             ConfigSource partitioningConfig = task.getPartitioning().get();
+             String partitioningType = partitioningConfig.get(String.class, "type");
+             Partitioning partitioning = newPartitioning(partitioningType);
+             TaskSource partitioningTask = partitioning.configure(partitioningConfig, outputSchema, reduceTaskCount);
+             task.setPartitioningType(Optional.of(partitioningType));
+             task.setPartitioningTask(Optional.of(partitioningTask));
+         } else {
+             reduceTaskCount = 0;
+             outputTaskCount = inputTaskCount;
+             task.setPartitioningType(Optional.<String>absent());
+             task.setPartitioningTask(Optional.<TaskSource>absent());
+         }
+
+         control.transaction(outputSchema, outputTaskCount, new ExecutorPlugin.Executor() {
+             public void execute(ProcessTask procTask, ProcessState state)
+             {
+                 task.setProcessTask(procTask);
+
+                 // hadoop uses ServiceLoader using context classloader to load some implementations
+                 try (SetContextClassLoader closeLater = new SetContextClassLoader(MapReduceExecutor.class.getClassLoader())) {
+                     run(task, inputTaskCount, reduceTaskCount, state);
+                 }
+             }
+         });
+     }
+
+     static Partitioning newPartitioning(String type)
+     {
+         switch (type) {
+         case "timestamp":
+             return new TimestampPartitioning();
+         default:
+             throw new ConfigException("Unknown partition type '"+type+"'");
+         }
+     }
+
+     void run(MapReduceExecutorTask task,
+             int mapTaskCount, int reduceTaskCount, ProcessState state)
+     {
+         ModelManager modelManager = task.getModelManager();
+
+         Configuration conf = new Configuration();
+         // don't call conf.setQuietMode(false). Configuration has invalid resource names by default
+         for (String path : task.getConfigFiles()) {
+             File file = new File(path);
+             if (!file.isFile()) {
+                 throw new ConfigException(String.format("Config file '%s' does not exist", file));
+             }
+             try {
+                 // use URL here. Configuration assumes String is a path of a resource in a ClassLoader
+                 conf.addResource(file.toURI().toURL());
+             } catch (MalformedURLException ex) {
+                 throw new RuntimeException(ex);
+             }
+         }
+
+         String uniqueTransactionName = getTransactionUniqueName(Exec.session());
+         Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);
+
+         Job job;
+         try {
+             job = Job.getInstance(conf);
+         } catch (IOException e) {
+             throw Throwables.propagate(e);
+         }
+         job.setJobName(task.getJobName());
+
+         // create a dedicated classloader for this yarn application.
+         // allow task.getConfig to overwrite this parameter
+         job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
+         job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes
+
+         // extra config
+         for (Map.Entry<String, String> pair : task.getConfig().entrySet()) {
+             job.getConfiguration().set(pair.getKey(), pair.getValue());
+         }
+
+         // framework config
+         EmbulkMapReduce.setSystemConfig(job.getConfiguration(), modelManager, systemConfig);
+         EmbulkMapReduce.setExecutorTask(job.getConfiguration(), modelManager, task);
+         EmbulkMapReduce.setMapTaskCount(job.getConfiguration(), mapTaskCount); // used by EmbulkInputFormat
+         EmbulkMapReduce.setStateDirectoryPath(job.getConfiguration(), stateDir);
+
+         // create state dir
+         try {
+             stateDir.getFileSystem(job.getConfiguration()).mkdirs(stateDir);
+         } catch (IOException ex) {
+             throw new RuntimeException(ex);
+         }
+
+         // archive plugins
+         PluginArchive archive = new PluginArchive.Builder()
+             .addLoadedRubyGems(jruby)
+             .build();
+         try {
+             EmbulkMapReduce.writePluginArchive(job.getConfiguration(), stateDir, archive, modelManager);
+         } catch (IOException ex) {
+             throw new RuntimeException(ex);
+         }
+
+         // jar files
+         Iterable<Path> jars = collectJars(task.getLibjars());
+         job.getConfiguration().set("tmpjars", StringUtils.join(",", jars));
+
+         job.setInputFormatClass(EmbulkInputFormat.class);
+
+         if (reduceTaskCount > 0) {
+             job.setMapperClass(EmbulkPartitioningMapReduce.EmbulkPartitioningMapper.class);
+             job.setMapOutputKeyClass(BufferWritable.class);
+             job.setMapOutputValueClass(PageWritable.class);
+
+             job.setReducerClass(EmbulkPartitioningMapReduce.EmbulkPartitioningReducer.class);
+
+             job.setNumReduceTasks(reduceTaskCount);
+
+         } else {
+             job.setMapperClass(EmbulkMapReduce.EmbulkMapper.class);
+             job.setMapOutputKeyClass(NullWritable.class);
+             job.setMapOutputValueClass(NullWritable.class);
+
+             job.setReducerClass(EmbulkMapReduce.EmbulkReducer.class);
+
+             job.setNumReduceTasks(0);
+         }
+
+         job.setOutputFormatClass(NullOutputFormat.class);
+         job.setOutputKeyClass(NullWritable.class);
+         job.setOutputValueClass(NullWritable.class);
+
+         try {
+             job.submit();
+
+             int interval = Job.getCompletionPollInterval(job.getConfiguration());
+             while (!job.isComplete()) {
+                 //if (job.getState() == JobStatus.State.PREP) {
+                 //    continue;
+                 //}
+                 log.info(String.format("map %.1f%% reduce %.1f%%",
+                             job.mapProgress() * 100, job.reduceProgress() * 100));
+                 Thread.sleep(interval);
+
+                 updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+             }
+
+             log.info(String.format("map %.1f%% reduce %.1f%%",
+                         job.mapProgress() * 100, job.reduceProgress() * 100));
+             updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+
+             Counters counters = job.getCounters();
+             if (counters != null) {
+                 log.info(counters.toString());
+             }
+         } catch (IOException | InterruptedException | ClassNotFoundException e) {
+             throw Throwables.propagate(e);
+         }
+     }
+
+     private static Iterable<Path> collectJars(List<String> extraJars)
+     {
+         Set<Path> set = new HashSet<Path>();
+
+         collectURLClassLoaderJars(set, Exec.class.getClassLoader());
+         collectURLClassLoaderJars(set, MapReduceExecutor.class.getClassLoader());
+
+         for (String extraJar : extraJars) {
+             URI uri;
+             try {
+                 uri = new URI(extraJar);
+             } catch (URISyntaxException ex) {
+                 throw new ConfigException(String.format("Invalid jar path '%s'", extraJar), ex);
+             }
+             if (uri.getScheme() == null) {
+                 set.add(localFileToLocalPath(new File(extraJar)));
+             } else {
+                 set.add(new Path(uri));
+             }
+         }
+
+         return set;
+     }
+
+     private static void collectURLClassLoaderJars(Set<Path> set, ClassLoader cl)
+     {
+         if (cl instanceof URLClassLoader) {
+             for (URL url : ((URLClassLoader) cl).getURLs()) {
+                 File file = new File(url.getPath());
+                 if (file.isFile()) {
+                     // TODO log if not found
+                     // TODO debug logging
+                     set.add(localFileToLocalPath(file));
+                 }
+             }
+         }
+     }
+
+     private static Path localFileToLocalPath(File file)
+     {
+         Path cwd = new Path(java.nio.file.Paths.get("").toAbsolutePath().toString()).makeQualified(FsConstants.LOCAL_FS_URI, new Path("/"));
+         return new Path(file.toString()).makeQualified(FsConstants.LOCAL_FS_URI, cwd);
+     }
+
+     private static String getTransactionUniqueName(ExecSession session)
+     {
+         // TODO implement Exec.getTransactionUniqueName()
+         Timestamp time = session.getTransactionTime();
+         return DateTimeFormat.forPattern("yyyyMMdd_HHmmss_").withZoneUTC()
+             .print(time.getEpochSecond() * 1000)
+             + String.format("%09d", time.getNano());
+     }
+
+     private void updateProcessState(Job job, int mapTaskCount, Path stateDir,
+             ProcessState state, ModelManager modelManager) throws IOException
+     {
+         List<AttemptReport> reports = getAttemptReports(job.getConfiguration(), stateDir, modelManager);
+
+         for (AttemptReport report : reports) {
+             if (report == null) {
+                 continue;
+             }
+             if (!report.isStarted()) {
+                 continue;
+             }
+             AttemptState attempt = report.getAttemptState();
+             if (attempt.getInputTaskIndex().isPresent()) {
+                 updateState(state.getInputTaskState(attempt.getInputTaskIndex().get()), attempt, true);
+             }
+             if (attempt.getOutputTaskIndex().isPresent()) {
+                 updateState(state.getOutputTaskState(attempt.getOutputTaskIndex().get()), attempt, false);
+             }
+         }
+     }
+
+     private static void updateState(TaskState state, AttemptState attempt, boolean isInput)
+     {
+         state.start();
+         if (attempt.getException().isPresent()) {
+             if (!state.isCommitted()) {
+                 state.setException(new RemoteTaskFailedException(attempt.getException().get()));
+             }
+         } else if (
+                 (isInput && attempt.getInputCommitReport().isPresent()) ||
+                 (!isInput && attempt.getOutputCommitReport().isPresent())) {
+             state.resetException();
+         }
+         if (isInput && attempt.getInputCommitReport().isPresent()) {
+             state.setCommitReport(attempt.getInputCommitReport().get());
+             state.finish();
+         }
+         if (!isInput && attempt.getOutputCommitReport().isPresent()) {
+             state.setCommitReport(attempt.getOutputCommitReport().get());
+             state.finish();
+         }
+     }
+
+     private static class AttemptReport
+     {
+         private final TaskAttemptID attemptId;
+         private final AttemptState attemptState;
+
+         public AttemptReport(TaskAttemptID attemptId)
+         {
+             this(attemptId, null);
+         }
+
+         public AttemptReport(TaskAttemptID attemptId, AttemptState attemptState)
+         {
+             this.attemptId = attemptId;
+             this.attemptState = attemptState;
+         }
+
+         public boolean isStarted()
+         {
+             return attemptState != null;
+         }
+
+         public boolean isInputCommitted()
+         {
+             return attemptState != null && attemptState.getInputCommitReport().isPresent();
+         }
+
+         public boolean isOutputCommitted()
+         {
+             return attemptState != null && attemptState.getOutputCommitReport().isPresent();
+         }
+
+         public AttemptState getAttemptState()
+         {
+             return attemptState;
+         }
+     }
+
+     private static final int TASK_EVENT_FETCH_SIZE = 100;
+
+     private static List<AttemptReport> getAttemptReports(Configuration config,
+             Path stateDir, ModelManager modelManager) throws IOException
+     {
+         ImmutableList.Builder<AttemptReport> builder = ImmutableList.builder();
+         for (TaskAttemptID aid : EmbulkMapReduce.listAttempts(config, stateDir)) {
+             try {
+                 AttemptState state = EmbulkMapReduce.readAttemptStateFile(config,
+                         stateDir, aid, modelManager);
+                 builder.add(new AttemptReport(aid, state));
+             } catch (EOFException ex) { // TODO: also handle not-found exceptions here
+                 builder.add(new AttemptReport(aid, null));
+             }
+         }
+         return builder.build();
+     }
+ }
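A note on getTransactionUniqueName above: it formats the transaction time's epoch seconds (multiplied by 1000 for Joda-Time's millisecond-based print) with the UTC pattern yyyyMMdd_HHmmss_, then appends the nanosecond part zero-padded to nine digits. For example, a transaction time of 2015-04-07 12:30:45.000000123 UTC yields the state subdirectory name 20150407_123045_000000123.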
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java
@@ -0,0 +1,60 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.List;
+ import java.util.Map;
+ import com.google.common.base.Optional;
+ import org.embulk.config.Config;
+ import org.embulk.config.ConfigInject;
+ import org.embulk.config.ConfigDefault;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.Task;
+ import org.embulk.config.TaskSource;
+ import org.embulk.config.ModelManager;
+ import org.embulk.spi.ProcessTask;
+
+ public interface MapReduceExecutorTask
+         extends Task
+ {
+     @Config("job_name")
+     @ConfigDefault("\"embulk\"")
+     public String getJobName();
+
+     @Config("config_files")
+     @ConfigDefault("[]")
+     public List<String> getConfigFiles();
+
+     @Config("config")
+     @ConfigDefault("{}")
+     public Map<String, String> getConfig();
+
+     @Config("libjars")
+     @ConfigDefault("[]")
+     public List<String> getLibjars();
+
+     @Config("state_path")
+     @ConfigDefault("\"/tmp/embulk\"")
+     public String getStatePath();
+
+     @Config("reducers")
+     @ConfigDefault("null")
+     public Optional<Integer> getReducers();
+
+     @Config("partitioning")
+     @ConfigDefault("null")
+     public Optional<ConfigSource> getPartitioning();
+
+     @ConfigInject
+     public ModelManager getModelManager();
+
+     public ConfigSource getExecConfig();
+     public void setExecConfig(ConfigSource execConfig);
+
+     public ProcessTask getProcessTask();
+     public void setProcessTask(ProcessTask task);
+
+     public Optional<String> getPartitioningType();
+     public void setPartitioningType(Optional<String> partitioningType);
+
+     public Optional<TaskSource> getPartitioningTask();
+     public void setPartitioningTask(Optional<TaskSource> partitioningTask);
+ }
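These @Config entries map one-to-one onto the exec: section of an Embulk configuration. A minimal config.yml sketch might look like the following, assuming the Ruby shim in data/lib/embulk/executor/mapreduce.rb registers the executor under the name mapreduce; the Hadoop file paths and property values are illustrative assumptions, not defaults of this package:

    exec:
      type: mapreduce
      job_name: embulk                       # @Config("job_name"), default "embulk"
      config_files:                          # Hadoop XML files loaded into the Configuration
        - /etc/hadoop/conf/core-site.xml     # assumed path
        - /etc/hadoop/conf/mapred-site.xml   # assumed path
      config:                                # extra key-value pairs copied into the Job verbatim
        fs.defaultFS: "hdfs://namenode:8020" # assumed value
      state_path: /tmp/embulk                # where per-attempt state files are written
      reducers: 4                            # optional; used only when partitioning: is set
      partitioning:                          # optional; enables the partitioning mapper/reducer
        type: timestamp                      # the only type accepted by newPartitioning()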
data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java
@@ -0,0 +1,66 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.io.IOException;
+ import java.io.DataOutput;
+ import java.io.DataInput;
+ import java.util.List;
+ import java.util.ArrayList;
+ import org.apache.hadoop.io.Writable;
+ import org.apache.hadoop.io.WritableUtils;
+ import org.embulk.spi.Buffer;
+ import org.embulk.spi.Page;
+ import static java.nio.charset.StandardCharsets.UTF_8;
+
+ public class PageWritable
+         implements Writable
+ {
+     private Page page;
+
+     public PageWritable() { }
+
+     public void set(Page page)
+     {
+         this.page = page;
+     }
+
+     public Page get()
+     {
+         return page;
+     }
+
+     @Override
+     public void write(DataOutput out) throws IOException
+     {
+         Buffer buffer = page.buffer();
+         out.writeInt(buffer.limit());
+         out.write(buffer.array(), buffer.offset(), buffer.limit());
+
+         List<String> stringReferences = page.getStringReferences();
+         WritableUtils.writeVInt(out, stringReferences.size());
+         for (String s : stringReferences) {
+             out.writeUTF(s);
+         }
+     }
+
+     @Override
+     public void readFields(DataInput in) throws IOException
+     {
+         int bufferSize = in.readInt();
+         byte[] bytes = new byte[bufferSize]; // TODO use buffer allocator?
+         in.readFully(bytes, 0, bufferSize);
+         Buffer buffer = Buffer.wrap(bytes);
+
+         int stringCount = WritableUtils.readVInt(in);
+         List<String> strings = new ArrayList<String>(stringCount);
+         for (int i = 0; i < stringCount; i++) {
+             strings.add(in.readUTF());
+         }
+
+         Page newPage = Page.wrap(buffer);
+         newPage.setStringReferences(strings);
+         if (page != null) {
+             page.release();
+         }
+         page = newPage;
+     }
+ }
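Because Hadoop reconstructs map output through the Writable contract, write() and readFields() must mirror each other exactly: a 4-byte length, the raw page buffer, then a vint-counted UTF string table. A minimal round-trip sketch, where the page variable stands for any existing org.embulk.spi.Page and the stream plumbing is plain java.io:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import org.embulk.spi.Page;

    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    PageWritable out = new PageWritable();
    out.set(page);                            // page: an existing org.embulk.spi.Page
    out.write(new DataOutputStream(sink));    // length + buffer bytes + string table

    PageWritable in = new PageWritable();
    in.readFields(new DataInputStream(new ByteArrayInputStream(sink.toByteArray())));
    Page copy = in.get();                     // carries the same buffer and string references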
data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java
@@ -0,0 +1,11 @@
+ package org.embulk.executor.mapreduce;
+
+ import org.embulk.spi.Buffer;
+
+ public interface PartitionKey
+         extends Cloneable
+ {
+     public void dump(Buffer buffer);
+
+     public PartitionKey clone();
+ }
data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java
@@ -0,0 +1,11 @@
+ package org.embulk.executor.mapreduce;
+
+ import org.embulk.spi.Buffer;
+ import org.embulk.spi.PageReader;
+
+ public interface Partitioner
+ {
+     public Buffer newKeyBuffer();
+
+     public PartitionKey updateKey(PageReader record);
+ }
data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java
@@ -0,0 +1,12 @@
+ package org.embulk.executor.mapreduce;
+
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.TaskSource;
+ import org.embulk.spi.Schema;
+
+ public interface Partitioning
+ {
+     public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount);
+
+     public Partitioner newPartitioner(TaskSource taskSource);
+ }
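Together these three interfaces define the partitioning SPI that MapReduceExecutor.newPartitioning() dispatches on; TimestampPartitioning is the only implementation shipped in this gem. As an illustration of how they compose, here is a hypothetical round-robin implementation. Everything below (the class names, the Buffer.allocate/limit usage, the plain get/set task pattern borrowed from MapReduceExecutorTask) is a sketch, not part of the package:

    package org.embulk.executor.mapreduce;

    import org.embulk.config.ConfigSource;
    import org.embulk.config.Task;
    import org.embulk.config.TaskSource;
    import org.embulk.spi.Buffer;
    import org.embulk.spi.PageReader;
    import org.embulk.spi.Schema;

    // Hypothetical example: spreads records over the reducers in round-robin order.
    public class RoundRobinPartitioning
            implements Partitioning
    {
        public interface PartitionTask
                extends Task
        {
            // plain get/set pair (no @Config), following the MapReduceExecutorTask pattern
            public int getTaskCount();
            public void setTaskCount(int taskCount);
        }

        @Override
        public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount)
        {
            PartitionTask task = config.loadConfig(PartitionTask.class);
            task.setTaskCount(outputTaskCount);  // remember the reducer count for the mappers
            return task.dump();
        }

        @Override
        public Partitioner newPartitioner(TaskSource taskSource)
        {
            return new RoundRobinPartitioner(taskSource.loadTask(PartitionTask.class).getTaskCount());
        }

        private static class RoundRobinPartitioner
                implements Partitioner
        {
            private final int taskCount;
            private final IntPartitionKey key = new IntPartitionKey();
            private int current = 0;

            public RoundRobinPartitioner(int taskCount)
            {
                this.taskCount = taskCount;
            }

            @Override
            public Buffer newKeyBuffer()
            {
                return Buffer.allocate(4);  // the key serializes to one 32-bit int
            }

            @Override
            public PartitionKey updateKey(PageReader record)
            {
                key.value = current;  // ignore the record; just rotate the bucket
                current = (current + 1) % taskCount;
                return key;
            }
        }

        private static class IntPartitionKey
                implements PartitionKey
        {
            private int value;

            @Override
            public void dump(Buffer buffer)
            {
                // write the key big-endian into the caller-provided key buffer
                byte[] array = buffer.array();
                int off = buffer.offset();
                array[off] = (byte) (value >>> 24);
                array[off + 1] = (byte) (value >>> 16);
                array[off + 2] = (byte) (value >>> 8);
                array[off + 3] = (byte) value;
                buffer.limit(4);
            }

            @Override
            public IntPartitionKey clone()
            {
                IntPartitionKey copy = new IntPartitionKey();
                copy.value = value;
                return copy;
            }
        }
    }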