embulk-executor-mapreduce 0.1.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/build.gradle +2 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/avro-1.7.4.jar +0 -0
  9. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  10. data/classpath/commons-cli-1.2.jar +0 -0
  11. data/classpath/commons-codec-1.6.jar +0 -0
  12. data/classpath/commons-collections-3.2.1.jar +0 -0
  13. data/classpath/commons-compress-1.4.1.jar +0 -0
  14. data/classpath/commons-configuration-1.6.jar +0 -0
  15. data/classpath/commons-digester-1.8.jar +0 -0
  16. data/classpath/commons-httpclient-3.1.jar +0 -0
  17. data/classpath/commons-io-2.4.jar +0 -0
  18. data/classpath/commons-lang-2.6.jar +0 -0
  19. data/classpath/commons-logging-1.1.3.jar +0 -0
  20. data/classpath/commons-math3-3.1.1.jar +0 -0
  21. data/classpath/commons-net-3.1.jar +0 -0
  22. data/classpath/curator-client-2.6.0.jar +0 -0
  23. data/classpath/curator-framework-2.6.0.jar +0 -0
  24. data/classpath/curator-recipes-2.6.0.jar +0 -0
  25. data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
  26. data/classpath/gson-2.2.4.jar +0 -0
  27. data/classpath/hadoop-annotations-2.6.0.jar +0 -0
  28. data/classpath/hadoop-auth-2.6.0.jar +0 -0
  29. data/classpath/hadoop-client-2.6.0.jar +0 -0
  30. data/classpath/hadoop-common-2.6.0.jar +0 -0
  31. data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
  32. data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
  37. data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
  38. data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
  39. data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
  40. data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
  41. data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
  42. data/classpath/htrace-core-3.0.4.jar +0 -0
  43. data/classpath/httpclient-4.2.5.jar +0 -0
  44. data/classpath/httpcore-4.2.4.jar +0 -0
  45. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  46. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  47. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  48. data/classpath/jackson-xc-1.9.13.jar +0 -0
  49. data/classpath/jaxb-api-2.2.2.jar +0 -0
  50. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  51. data/classpath/jersey-client-1.9.jar +0 -0
  52. data/classpath/jersey-core-1.9.jar +0 -0
  53. data/classpath/jersey-guice-1.9.jar +0 -0
  54. data/classpath/jersey-json-1.9.jar +0 -0
  55. data/classpath/jersey-server-1.9.jar +0 -0
  56. data/classpath/jettison-1.1.jar +0 -0
  57. data/classpath/jetty-util-6.1.26.jar +0 -0
  58. data/classpath/jline-0.9.94.jar +0 -0
  59. data/classpath/jsr305-1.3.9.jar +0 -0
  60. data/classpath/leveldbjni-all-1.8.jar +0 -0
  61. data/classpath/netty-3.7.0.Final.jar +0 -0
  62. data/classpath/paranamer-2.3.jar +0 -0
  63. data/classpath/protobuf-java-2.5.0.jar +0 -0
  64. data/classpath/servlet-api-2.5.jar +0 -0
  65. data/classpath/snappy-java-1.0.4.1.jar +0 -0
  66. data/classpath/stax-api-1.0-2.jar +0 -0
  67. data/classpath/xmlenc-0.52.jar +0 -0
  68. data/classpath/xz-1.0.jar +0 -0
  69. data/classpath/zookeeper-3.4.6.jar +0 -0
  70. data/lib/embulk/executor/mapreduce.rb +3 -0
  71. data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
  72. data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
  73. data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
  74. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
  75. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
  76. data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
  77. data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
  78. data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
  79. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
  80. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
  81. data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
  82. data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
  83. data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
  84. data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
  85. data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
  86. data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
  87. data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
  88. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
  89. metadata +131 -0
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java
@@ -0,0 +1,391 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.List;
+ import java.util.Map;
+ import java.util.Set;
+ import java.util.HashSet;
+ import java.io.File;
+ import java.io.IOException;
+ import java.io.EOFException;
+ import java.net.URI;
+ import java.net.URISyntaxException;
+ import java.net.URL;
+ import java.net.URLClassLoader;
+ import java.net.MalformedURLException;
+ import org.slf4j.Logger;
+ import org.joda.time.format.DateTimeFormat;
+ import com.google.inject.Inject;
+ import com.google.common.base.Optional;
+ import com.google.common.base.Throwables;
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.Iterators;
+ import org.jruby.embed.ScriptingContainer;
+ import org.apache.hadoop.util.StringUtils;
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.NullWritable;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.FsConstants;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.mapreduce.JobContext;
+ import org.apache.hadoop.mapreduce.Cluster;
+ import org.apache.hadoop.mapreduce.Job;
+ import org.apache.hadoop.mapreduce.Counters;
+ import org.apache.hadoop.mapreduce.TaskType;
+ import org.apache.hadoop.mapreduce.TaskAttemptID;
+ import org.apache.hadoop.mapreduce.TaskCompletionEvent;
+ import org.apache.hadoop.mapreduce.MRJobConfig;
+ import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+ import org.embulk.exec.ForSystemConfig;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.ConfigException;
+ import org.embulk.config.TaskSource;
+ import org.embulk.config.ModelManager;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.ExecSession;
+ import org.embulk.spi.ExecutorPlugin;
+ import org.embulk.spi.ProcessTask;
+ import org.embulk.spi.ProcessState;
+ import org.embulk.spi.TaskState;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.time.Timestamp;
+
+ public class MapReduceExecutor
+         implements ExecutorPlugin
+ {
+     private final Logger log = Exec.getLogger(MapReduceExecutor.class);
+     private final ConfigSource systemConfig;
+     private final ScriptingContainer jruby;
+
+     @Inject
+     public MapReduceExecutor(@ForSystemConfig ConfigSource systemConfig,
+             ScriptingContainer jruby)
+     {
+         this.systemConfig = systemConfig;
+         this.jruby = jruby;
+     }
+
+     @Override
+     public void transaction(ConfigSource config, Schema outputSchema, final int inputTaskCount,
+             ExecutorPlugin.Control control)
+     {
+         final MapReduceExecutorTask task = config.loadConfig(MapReduceExecutorTask.class);
+         task.setExecConfig(config);
+
+         final int outputTaskCount;
+         final int reduceTaskCount;
+
+         if (task.getPartitioning().isPresent()) {
+             reduceTaskCount = task.getReducers().or(inputTaskCount);
+             if (reduceTaskCount <= 0) {
+                 throw new ConfigException("Reducers must be larger than 0 if partitioning: is set");
+             }
+             outputTaskCount = reduceTaskCount;
+             ConfigSource partitioningConfig = task.getPartitioning().get();
+             String partitioningType = partitioningConfig.get(String.class, "type");
+             Partitioning partitioning = newPartitioning(partitioningType);
+             TaskSource partitioningTask = partitioning.configure(partitioningConfig, outputSchema, reduceTaskCount);
+             task.setPartitioningType(Optional.of(partitioningType));
+             task.setPartitioningTask(Optional.of(partitioningTask));
+         } else {
+             reduceTaskCount = 0;
+             outputTaskCount = inputTaskCount;
+             task.setPartitioningType(Optional.<String>absent());
+             task.setPartitioningTask(Optional.<TaskSource>absent());
+         }
+
+         control.transaction(outputSchema, outputTaskCount, new ExecutorPlugin.Executor() {
+             public void execute(ProcessTask procTask, ProcessState state)
+             {
+                 task.setProcessTask(procTask);
+
+                 // Hadoop uses ServiceLoader with the context classloader to load some implementations
+                 try (SetContextClassLoader closeLater = new SetContextClassLoader(MapReduceExecutor.class.getClassLoader())) {
+                     run(task, inputTaskCount, reduceTaskCount, state);
+                 }
+             }
+         });
+     }
+
+     static Partitioning newPartitioning(String type)
+     {
+         switch (type) {
+         case "timestamp":
+             return new TimestampPartitioning();
+         default:
+             throw new ConfigException("Unknown partitioning type '" + type + "'");
+         }
+     }
+
+     void run(MapReduceExecutorTask task,
+             int mapTaskCount, int reduceTaskCount, ProcessState state)
+     {
+         ModelManager modelManager = task.getModelManager();
+
+         Configuration conf = new Configuration();
+         // don't call conf.setQuietMode(false). Configuration has invalid resource names by default
+         for (String path : task.getConfigFiles()) {
+             File file = new File(path);
+             if (!file.isFile()) {
+                 throw new ConfigException(String.format("Config file '%s' does not exist", file));
+             }
+             try {
+                 // use URL here. Configuration assumes String is a path of a resource in a ClassLoader
+                 conf.addResource(file.toURI().toURL());
+             } catch (MalformedURLException ex) {
+                 throw new RuntimeException(ex);
+             }
+         }
+
+         String uniqueTransactionName = getTransactionUniqueName(Exec.session());
+         Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);
+
+         Job job;
+         try {
+             job = Job.getInstance(conf);
+         } catch (IOException e) {
+             throw Throwables.propagate(e);
+         }
+         job.setJobName(task.getJobName());
+
+         // create a dedicated classloader for this yarn application.
+         // allow task.getConfig to overwrite this parameter
+         job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
+         job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes
+
+         // extra config
+         for (Map.Entry<String, String> pair : task.getConfig().entrySet()) {
+             job.getConfiguration().set(pair.getKey(), pair.getValue());
+         }
+
+         // framework config
+         EmbulkMapReduce.setSystemConfig(job.getConfiguration(), modelManager, systemConfig);
+         EmbulkMapReduce.setExecutorTask(job.getConfiguration(), modelManager, task);
+         EmbulkMapReduce.setMapTaskCount(job.getConfiguration(), mapTaskCount); // used by EmbulkInputFormat
+         EmbulkMapReduce.setStateDirectoryPath(job.getConfiguration(), stateDir);
+
+         // create state dir
+         try {
+             stateDir.getFileSystem(job.getConfiguration()).mkdirs(stateDir);
+         } catch (IOException ex) {
+             throw new RuntimeException(ex);
+         }
+
+         // archive plugins
+         PluginArchive archive = new PluginArchive.Builder()
+                 .addLoadedRubyGems(jruby)
+                 .build();
+         try {
+             EmbulkMapReduce.writePluginArchive(job.getConfiguration(), stateDir, archive, modelManager);
+         } catch (IOException ex) {
+             throw new RuntimeException(ex);
+         }
+
+         // jar files
+         Iterable<Path> jars = collectJars(task.getLibjars());
+         job.getConfiguration().set("tmpjars", StringUtils.join(",", jars));
+
+         job.setInputFormatClass(EmbulkInputFormat.class);
+
+         if (reduceTaskCount > 0) {
+             job.setMapperClass(EmbulkPartitioningMapReduce.EmbulkPartitioningMapper.class);
+             job.setMapOutputKeyClass(BufferWritable.class);
+             job.setMapOutputValueClass(PageWritable.class);
+
+             job.setReducerClass(EmbulkPartitioningMapReduce.EmbulkPartitioningReducer.class);
+
+             job.setNumReduceTasks(reduceTaskCount);
+
+         } else {
+             job.setMapperClass(EmbulkMapReduce.EmbulkMapper.class);
+             job.setMapOutputKeyClass(NullWritable.class);
+             job.setMapOutputValueClass(NullWritable.class);
+
+             job.setReducerClass(EmbulkMapReduce.EmbulkReducer.class);
+
+             job.setNumReduceTasks(0);
+         }
+
+         job.setOutputFormatClass(NullOutputFormat.class);
+         job.setOutputKeyClass(NullWritable.class);
+         job.setOutputValueClass(NullWritable.class);
+
+         try {
+             job.submit();
+
+             int interval = Job.getCompletionPollInterval(job.getConfiguration());
+             while (!job.isComplete()) {
+                 //if (job.getState() == JobStatus.State.PREP) {
+                 //    continue;
+                 //}
+                 log.info(String.format("map %.1f%% reduce %.1f%%",
+                         job.mapProgress() * 100, job.reduceProgress() * 100));
+                 Thread.sleep(interval);
+
+                 updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+             }
+
+             log.info(String.format("map %.1f%% reduce %.1f%%",
+                     job.mapProgress() * 100, job.reduceProgress() * 100));
+             updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+
+             Counters counters = job.getCounters();
+             if (counters != null) {
+                 log.info(counters.toString());
+             }
+         } catch (IOException | InterruptedException | ClassNotFoundException e) {
+             throw Throwables.propagate(e);
+         }
+     }
+
+     private static Iterable<Path> collectJars(List<String> extraJars)
+     {
+         Set<Path> set = new HashSet<Path>();
+
+         collectURLClassLoaderJars(set, Exec.class.getClassLoader());
+         collectURLClassLoaderJars(set, MapReduceExecutor.class.getClassLoader());
+
+         for (String extraJar : extraJars) {
+             URI uri;
+             try {
+                 uri = new URI(extraJar);
+             } catch (URISyntaxException ex) {
+                 throw new ConfigException(String.format("Invalid jar path '%s'", extraJar), ex);
+             }
+             if (uri.getScheme() == null) {
+                 set.add(localFileToLocalPath(new File(extraJar)));
+             } else {
+                 set.add(new Path(uri));
+             }
+         }
+
+         return set;
+     }
+
+     private static void collectURLClassLoaderJars(Set<Path> set, ClassLoader cl)
+     {
+         if (cl instanceof URLClassLoader) {
+             for (URL url : ((URLClassLoader) cl).getURLs()) {
+                 File file = new File(url.getPath());
+                 if (file.isFile()) {
+                     // TODO log if not found
+                     // TODO debug logging
+                     set.add(localFileToLocalPath(file));
+                 }
+             }
+         }
+     }
+
+     private static Path localFileToLocalPath(File file)
+     {
+         Path cwd = new Path(java.nio.file.Paths.get("").toAbsolutePath().toString()).makeQualified(FsConstants.LOCAL_FS_URI, new Path("/"));
+         return new Path(file.toString()).makeQualified(FsConstants.LOCAL_FS_URI, cwd);
+     }
+
+     private static String getTransactionUniqueName(ExecSession session)
+     {
+         // TODO implement Exec.getTransactionUniqueName()
+         Timestamp time = session.getTransactionTime();
+         return DateTimeFormat.forPattern("yyyyMMdd_HHmmss_").withZoneUTC()
+                 .print(time.getEpochSecond() * 1000)
+                 + String.format("%09d", time.getNano());
+     }
+
+     private void updateProcessState(Job job, int mapTaskCount, Path stateDir,
+             ProcessState state, ModelManager modelManager) throws IOException
+     {
+         List<AttemptReport> reports = getAttemptReports(job.getConfiguration(), stateDir, modelManager);
+
+         for (AttemptReport report : reports) {
+             if (report == null) {
+                 continue;
+             }
+             if (!report.isStarted()) {
+                 continue;
+             }
+             AttemptState attempt = report.getAttemptState();
+             if (attempt.getInputTaskIndex().isPresent()) {
+                 updateState(state.getInputTaskState(attempt.getInputTaskIndex().get()), attempt, true);
+             }
+             if (attempt.getOutputTaskIndex().isPresent()) {
+                 updateState(state.getOutputTaskState(attempt.getOutputTaskIndex().get()), attempt, false);
+             }
+         }
+     }
+
+     private static void updateState(TaskState state, AttemptState attempt, boolean isInput)
+     {
+         state.start();
+         if (attempt.getException().isPresent()) {
+             if (!state.isCommitted()) {
+                 state.setException(new RemoteTaskFailedException(attempt.getException().get()));
+             }
+         } else if (
+                 (isInput && attempt.getInputCommitReport().isPresent()) ||
+                 (!isInput && attempt.getOutputCommitReport().isPresent())) {
+             state.resetException();
+         }
+         if (isInput && attempt.getInputCommitReport().isPresent()) {
+             state.setCommitReport(attempt.getInputCommitReport().get());
+             state.finish();
+         }
+         if (!isInput && attempt.getOutputCommitReport().isPresent()) {
+             state.setCommitReport(attempt.getOutputCommitReport().get());
+             state.finish();
+         }
+     }
+
+     private static class AttemptReport
+     {
+         private final TaskAttemptID attemptId;
+         private final AttemptState attemptState;
+
+         public AttemptReport(TaskAttemptID attemptId)
+         {
+             this(attemptId, null);
+         }
+
+         public AttemptReport(TaskAttemptID attemptId, AttemptState attemptState)
+         {
+             this.attemptId = attemptId;
+             this.attemptState = attemptState;
+         }
+
+         public boolean isStarted()
+         {
+             return attemptState != null;
+         }
+
+         public boolean isInputCommitted()
+         {
+             return attemptState != null && attemptState.getInputCommitReport().isPresent();
+         }
+
+         public boolean isOutputCommitted()
+         {
+             return attemptState != null && attemptState.getOutputCommitReport().isPresent();
+         }
+
+         public AttemptState getAttemptState()
+         {
+             return attemptState;
+         }
+     }
+
+     private static final int TASK_EVENT_FETCH_SIZE = 100;
+
+     private static List<AttemptReport> getAttemptReports(Configuration config,
+             Path stateDir, ModelManager modelManager) throws IOException
+     {
+         ImmutableList.Builder<AttemptReport> builder = ImmutableList.builder();
+         for (TaskAttemptID aid : EmbulkMapReduce.listAttempts(config, stateDir)) {
+             try {
+                 AttemptState state = EmbulkMapReduce.readAttemptStateFile(config,
+                         stateDir, aid, modelManager);
+                 builder.add(new AttemptReport(aid, state));
+             } catch (EOFException ex) { // plus Not Found exception
+                 builder.add(new AttemptReport(aid, null));
+             }
+         }
+         return builder.build();
+     }
+ }
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java
@@ -0,0 +1,60 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.List;
+ import java.util.Map;
+ import com.google.common.base.Optional;
+ import org.embulk.config.Config;
+ import org.embulk.config.ConfigInject;
+ import org.embulk.config.ConfigDefault;
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.Task;
+ import org.embulk.config.TaskSource;
+ import org.embulk.config.ModelManager;
+ import org.embulk.spi.ProcessTask;
+
+ public interface MapReduceExecutorTask
+         extends Task
+ {
+     @Config("job_name")
+     @ConfigDefault("\"embulk\"")
+     public String getJobName();
+
+     @Config("config_files")
+     @ConfigDefault("[]")
+     public List<String> getConfigFiles();
+
+     @Config("config")
+     @ConfigDefault("{}")
+     public Map<String, String> getConfig();
+
+     @Config("libjars")
+     @ConfigDefault("[]")
+     public List<String> getLibjars();
+
+     @Config("state_path")
+     @ConfigDefault("\"/tmp/embulk\"")
+     public String getStatePath();
+
+     @Config("reducers")
+     @ConfigDefault("null")
+     public Optional<Integer> getReducers();
+
+     @Config("partitioning")
+     @ConfigDefault("null")
+     public Optional<ConfigSource> getPartitioning();
+
+     @ConfigInject
+     public ModelManager getModelManager();
+
+     public ConfigSource getExecConfig();
+     public void setExecConfig(ConfigSource execConfig);
+
+     public ProcessTask getProcessTask();
+     public void setProcessTask(ProcessTask task);
+
+     public Optional<String> getPartitioningType();
+     public void setPartitioningType(Optional<String> partitioningType);
+
+     public Optional<TaskSource> getPartitioningTask();
+     public void setPartitioningTask(Optional<TaskSource> partitioningTask);
+ }
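
The @Config keys above map one-to-one onto the executor section of an Embulk run configuration. A minimal sketch of such a config, assuming the executor is selected with the conventional exec: type: mapreduce stanza; the values are illustrative, and any options under partitioning: beyond type belong to TimestampPartitioning.java, which is not excerpted here:

    exec:
      type: mapreduce
      job_name: embulk                  # @Config("job_name"), default "embulk"
      config_files:                     # Hadoop XML files added to the Configuration
        - /etc/hadoop/conf/core-site.xml
        - /etc/hadoop/conf/mapred-site.xml
      config:                           # extra key/value pairs set on the Job configuration
        mapreduce.job.queuename: default
      libjars: []                       # extra jars distributed via tmpjars
      state_path: /tmp/embulk           # attempt-state directory, default "/tmp/embulk"
      reducers: 4                       # must be larger than 0 when partitioning: is set
      partitioning:
        type: timestamp                 # the only type newPartitioning() accepts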
data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java
@@ -0,0 +1,66 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.io.IOException;
+ import java.io.DataOutput;
+ import java.io.DataInput;
+ import java.util.List;
+ import java.util.ArrayList;
+ import org.apache.hadoop.io.Writable;
+ import org.apache.hadoop.io.WritableUtils;
+ import org.embulk.spi.Buffer;
+ import org.embulk.spi.Page;
+ import static java.nio.charset.StandardCharsets.UTF_8;
+
+ public class PageWritable
+         implements Writable
+ {
+     private Page page;
+
+     public PageWritable() { }
+
+     public void set(Page page)
+     {
+         this.page = page;
+     }
+
+     public Page get()
+     {
+         return page;
+     }
+
+     @Override
+     public void write(DataOutput out) throws IOException
+     {
+         Buffer buffer = page.buffer();
+         out.writeInt(buffer.limit());
+         out.write(buffer.array(), buffer.offset(), buffer.limit());
+
+         List<String> stringReferences = page.getStringReferences();
+         WritableUtils.writeVInt(out, stringReferences.size());
+         for (String s : stringReferences) {
+             out.writeUTF(s);
+         }
+     }
+
+     @Override
+     public void readFields(DataInput in) throws IOException
+     {
+         int bufferSize = in.readInt();
+         byte[] bytes = new byte[bufferSize]; // TODO use a buffer allocator?
+         in.readFully(bytes, 0, bufferSize);
+         Buffer buffer = Buffer.wrap(bytes);
+
+         int stringCount = WritableUtils.readVInt(in);
+         List<String> strings = new ArrayList<String>(stringCount);
+         for (int i = 0; i < stringCount; i++) {
+             strings.add(in.readUTF());
+         }
+
+         Page newPage = Page.wrap(buffer);
+         newPage.setStringReferences(strings);
+         if (page != null) {
+             page.release();
+         }
+         page = newPage;
+     }
+ }
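
Because write() emits the page buffer and its string references symmetrically with readFields(), a PageWritable round-trips through any DataOutput/DataInput pair, which is what the shuffle does between the partitioning mapper and reducer. A minimal sketch; the helper class and the origin of `page` are hypothetical, not part of this gem:

    package org.embulk.executor.mapreduce;

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import org.embulk.spi.Page;

    public class PageWritableRoundTrip
    {
        // Serializes a Page through PageWritable.write() and rebuilds a copy
        // through readFields(), as the MapReduce shuffle would.
        static Page roundTrip(Page page) throws IOException
        {
            PageWritable out = new PageWritable();
            out.set(page);
            ByteArrayOutputStream sink = new ByteArrayOutputStream();
            out.write(new DataOutputStream(sink)); // buffer length + bytes + string references

            PageWritable in = new PageWritable();
            in.readFields(new DataInputStream(new ByteArrayInputStream(sink.toByteArray())));
            return in.get(); // a new Page wrapping a copied buffer
        }
    }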
data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java
@@ -0,0 +1,11 @@
+ package org.embulk.executor.mapreduce;
+
+ import org.embulk.spi.Buffer;
+
+ public interface PartitionKey
+         extends Cloneable
+ {
+     public void dump(Buffer buffer);
+
+     public PartitionKey clone();
+ }
data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java
@@ -0,0 +1,11 @@
+ package org.embulk.executor.mapreduce;
+
+ import org.embulk.spi.Buffer;
+ import org.embulk.spi.PageReader;
+
+ public interface Partitioner
+ {
+     public Buffer newKeyBuffer();
+
+     public PartitionKey updateKey(PageReader record);
+ }
data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java
@@ -0,0 +1,12 @@
+ package org.embulk.executor.mapreduce;
+
+ import org.embulk.config.ConfigSource;
+ import org.embulk.config.TaskSource;
+ import org.embulk.spi.Schema;
+
+ public interface Partitioning
+ {
+     public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount);
+
+     public Partitioner newPartitioner(TaskSource taskSource);
+ }
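
Together, Partitioning, Partitioner, and PartitionKey form the pluggable partitioning contract that TimestampPartitioning implements and that MapReduceExecutor.newPartitioning() dispatches on. A minimal sketch of an alternative implementation keying records by a single long column; the class names, the 8-byte key layout, and the Buffer.limit(8) call are illustrative assumptions, and nothing registers this type in newPartitioning():

    package org.embulk.executor.mapreduce;

    // Illustrative only: a hypothetical Partitioning that keys each record by
    // one long column. How the dumped key bytes map to a reducer is left to
    // the framework (BufferedPagePartitioner/BufferWritable, not excerpted).
    import org.embulk.config.Config;
    import org.embulk.config.ConfigSource;
    import org.embulk.config.Task;
    import org.embulk.config.TaskSource;
    import org.embulk.spi.Buffer;
    import org.embulk.spi.PageReader;
    import org.embulk.spi.Schema;

    public class LongColumnPartitioning
            implements Partitioning
    {
        public interface ColumnTask
                extends Task
        {
            @Config("column")
            public String getColumn();
        }

        @Override
        public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount)
        {
            ColumnTask task = config.loadConfig(ColumnTask.class);
            schema.lookupColumn(task.getColumn()); // fail fast if the column doesn't exist
            return task.dump();
        }

        @Override
        public Partitioner newPartitioner(TaskSource taskSource)
        {
            final ColumnTask task = taskSource.loadTask(ColumnTask.class);
            return new Partitioner() {
                // one mutable key reused across records; the framework can
                // clone() it when a key must outlive the next updateKey() call
                private final LongPartitionKey key = new LongPartitionKey();

                @Override
                public Buffer newKeyBuffer()
                {
                    return Buffer.allocate(8); // fixed-width key: a single big-endian long
                }

                @Override
                public PartitionKey updateKey(PageReader record)
                {
                    int index = record.getSchema().lookupColumn(task.getColumn()).getIndex();
                    key.value = record.getLong(index);
                    return key;
                }
            };
        }

        private static class LongPartitionKey
                implements PartitionKey
        {
            long value;

            @Override
            public void dump(Buffer buffer)
            {
                byte[] b = buffer.array();
                int off = buffer.offset();
                for (int i = 0; i < 8; i++) {
                    b[off + i] = (byte) (value >>> (56 - 8 * i)); // big-endian
                }
                buffer.limit(8); // mark the 8 key bytes as written (assumed Buffer semantics)
            }

            @Override
            public LongPartitionKey clone()
            {
                LongPartitionKey copy = new LongPartitionKey();
                copy.value = value;
                return copy;
            }
        }
    }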