embulk-executor-mapreduce 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/build.gradle +2 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/avro-1.7.4.jar +0 -0
  9. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  10. data/classpath/commons-cli-1.2.jar +0 -0
  11. data/classpath/commons-codec-1.6.jar +0 -0
  12. data/classpath/commons-collections-3.2.1.jar +0 -0
  13. data/classpath/commons-compress-1.4.1.jar +0 -0
  14. data/classpath/commons-configuration-1.6.jar +0 -0
  15. data/classpath/commons-digester-1.8.jar +0 -0
  16. data/classpath/commons-httpclient-3.1.jar +0 -0
  17. data/classpath/commons-io-2.4.jar +0 -0
  18. data/classpath/commons-lang-2.6.jar +0 -0
  19. data/classpath/commons-logging-1.1.3.jar +0 -0
  20. data/classpath/commons-math3-3.1.1.jar +0 -0
  21. data/classpath/commons-net-3.1.jar +0 -0
  22. data/classpath/curator-client-2.6.0.jar +0 -0
  23. data/classpath/curator-framework-2.6.0.jar +0 -0
  24. data/classpath/curator-recipes-2.6.0.jar +0 -0
  25. data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
  26. data/classpath/gson-2.2.4.jar +0 -0
  27. data/classpath/hadoop-annotations-2.6.0.jar +0 -0
  28. data/classpath/hadoop-auth-2.6.0.jar +0 -0
  29. data/classpath/hadoop-client-2.6.0.jar +0 -0
  30. data/classpath/hadoop-common-2.6.0.jar +0 -0
  31. data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
  32. data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
  37. data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
  38. data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
  39. data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
  40. data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
  41. data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
  42. data/classpath/htrace-core-3.0.4.jar +0 -0
  43. data/classpath/httpclient-4.2.5.jar +0 -0
  44. data/classpath/httpcore-4.2.4.jar +0 -0
  45. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  46. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  47. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  48. data/classpath/jackson-xc-1.9.13.jar +0 -0
  49. data/classpath/jaxb-api-2.2.2.jar +0 -0
  50. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  51. data/classpath/jersey-client-1.9.jar +0 -0
  52. data/classpath/jersey-core-1.9.jar +0 -0
  53. data/classpath/jersey-guice-1.9.jar +0 -0
  54. data/classpath/jersey-json-1.9.jar +0 -0
  55. data/classpath/jersey-server-1.9.jar +0 -0
  56. data/classpath/jettison-1.1.jar +0 -0
  57. data/classpath/jetty-util-6.1.26.jar +0 -0
  58. data/classpath/jline-0.9.94.jar +0 -0
  59. data/classpath/jsr305-1.3.9.jar +0 -0
  60. data/classpath/leveldbjni-all-1.8.jar +0 -0
  61. data/classpath/netty-3.7.0.Final.jar +0 -0
  62. data/classpath/paranamer-2.3.jar +0 -0
  63. data/classpath/protobuf-java-2.5.0.jar +0 -0
  64. data/classpath/servlet-api-2.5.jar +0 -0
  65. data/classpath/snappy-java-1.0.4.1.jar +0 -0
  66. data/classpath/stax-api-1.0-2.jar +0 -0
  67. data/classpath/xmlenc-0.52.jar +0 -0
  68. data/classpath/xz-1.0.jar +0 -0
  69. data/classpath/zookeeper-3.4.6.jar +0 -0
  70. data/lib/embulk/executor/mapreduce.rb +3 -0
  71. data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
  72. data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
  73. data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
  74. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
  75. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
  76. data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
  77. data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
  78. data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
  79. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
  80. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
  81. data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
  82. data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
  83. data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
  84. data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
  85. data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
  86. data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
  87. data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
  88. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
  89. metadata +131 -0
@@ -0,0 +1,303 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import java.util.List;
4
+ import java.util.Iterator;
5
+ import java.io.IOException;
6
+ import com.google.common.base.Optional;
7
+ import com.google.common.collect.ImmutableList;
8
+ import org.apache.hadoop.fs.Path;
9
+ import org.apache.hadoop.io.NullWritable;
10
+ import org.apache.hadoop.io.IntWritable;
11
+ import org.apache.hadoop.conf.Configuration;
12
+ import org.apache.hadoop.mapreduce.Mapper;
13
+ import org.apache.hadoop.mapreduce.Reducer;
14
+ import org.embulk.config.ModelManager;
15
+ import org.embulk.config.CommitReport;
16
+ import org.embulk.config.ConfigDiff;
17
+ import org.embulk.config.TaskSource;
18
+ import org.embulk.config.ConfigSource;
19
+ import org.embulk.spi.Exec;
20
+ import org.embulk.spi.ExecAction;
21
+ import org.embulk.spi.ExecSession;
22
+ import org.embulk.spi.Schema;
23
+ import org.embulk.spi.Page;
24
+ import org.embulk.spi.PageReader;
25
+ import org.embulk.spi.PageOutput;
26
+ import org.embulk.spi.BufferAllocator;
27
+ import org.embulk.spi.InputPlugin;
28
+ import org.embulk.spi.OutputPlugin;
29
+ import org.embulk.spi.FilterPlugin;
30
+ import org.embulk.spi.ProcessTask;
31
+ import org.embulk.spi.TransactionalPageOutput;
32
+ import org.embulk.spi.util.Filters;
33
+ import org.embulk.spi.util.Executors;
34
+ import org.embulk.executor.mapreduce.EmbulkMapReduce.SessionRunner;
35
+ import org.embulk.executor.mapreduce.BufferedPagePartitioner.PartitionedPageOutput;
36
+ import org.embulk.executor.mapreduce.EmbulkMapReduce.AttemptStateUpdateHandler;
37
+ import static org.embulk.executor.mapreduce.MapReduceExecutor.newPartitioning;
38
+
39
/**
 * MapReduce job implementation used when a partitioning key is configured:
 * mappers run the Embulk input and filter plugins for one Embulk task each
 * and emit (partition key, page) pairs into the shuffle, and reducers open
 * the Embulk output plugin and write the pages grouped by partition key.
 */
public class EmbulkPartitioningMapReduce
{
    /**
     * Mapper side: the IntWritable input key carries the Embulk task index;
     * the mapper runs input -> filters for that task and emits the produced
     * pages keyed by the serialized partition key.
     */
    public static class EmbulkPartitioningMapper
            extends Mapper<IntWritable, NullWritable, BufferWritable, PageWritable>
    {
        private Context context;
        private SessionRunner runner;

        @Override
        public void setup(Context context) throws IOException
        {
            this.context = context;
            this.runner = new SessionRunner(context);
            // make the plugin files shipped with the job available to the
            // embedded scripting container before any plugin is loaded
            runner.readPluginArchive().restoreLoadPathsTo(runner.getScriptingContainer());
        }

        @Override
        public void map(IntWritable key, NullWritable value, final Context context) throws IOException, InterruptedException
        {
            // the record's key is the Embulk task index assigned to this mapper call
            final int taskIndex = key.get();

            // run the processing inside an Embulk session so plugin code can
            // use the session-scoped Exec facilities
            runner.execSession(new ExecAction<Void>() {
                public Void run() throws Exception
                {
                    process(context, taskIndex);
                    return null;
                }
            });
        }

        // Runs input -> filters -> partitioning output for one task, reporting
        // lifecycle and failures through an AttemptStateUpdateHandler.
        private void process(final Context context, int taskIndex) throws IOException, InterruptedException
        {
            ProcessTask task = runner.getMapReduceExecutorTask().getProcessTask();
            ExecSession exec = runner.getExecSession();

            // input and filters run at mapper
            InputPlugin inputPlugin = exec.newPlugin(InputPlugin.class, task.getInputPluginType());
            List<FilterPlugin> filterPlugins = Filters.newFilterPlugins(exec, task.getFilterPluginTypes());

            // output writes pages with partitioning key to the Context
            Partitioning partitioning = newPartitioning(runner.getMapReduceExecutorTask().getPartitioningType().get());
            final Partitioner partitioner = partitioning.newPartitioner(runner.getMapReduceExecutorTask().getPartitioningTask().get());
            OutputPlugin outputPlugin = new MapperOutputPlugin(
                    runner.getBufferAllocator(), partitioner,
                    128,  // TODO configurable
                    new PartitionedPageOutput() {
                        // writables are reused across add() calls; only their
                        // contents change per record
                        private final BufferWritable keyWritable = new BufferWritable();
                        private final PageWritable valueWritable = new PageWritable();

                        {
                            keyWritable.set(partitioner.newKeyBuffer());
                        }

                        @Override
                        public void add(PartitionKey key, Page value)
                        {
                            try {
                                // serialize the partition key into the reused
                                // buffer and hand the pair to the shuffle
                                key.dump(keyWritable.get());
                                valueWritable.set(value);
                                context.write(keyWritable, valueWritable);
                            } catch (IOException | InterruptedException ex) {
                                // PartitionedPageOutput.add declares no checked
                                // exceptions, so wrap and rethrow
                                throw new RuntimeException(ex);
                            } finally {
                                // NOTE(review): releases the page after write —
                                // assumes context.write() serialized it and no
                                // later reader holds this reference; confirm
                                value.release();
                            }
                        }

                        @Override
                        public void finish()
                        { }

                        @Override
                        public void close()
                        { }
                    });

            AttemptStateUpdateHandler handler = new AttemptStateUpdateHandler(runner,
                    new AttemptState(context.getTaskAttemptID(), Optional.of(taskIndex), Optional.<Integer>absent()));

            try {
                Executors.process(exec, taskIndex,
                        inputPlugin, task.getInputSchema(), task.getInputTaskSource(),
                        filterPlugins, task.getFilterSchemas(), task.getFilterTaskSources(),
                        outputPlugin, task.getOutputSchema(), task.getOutputTaskSource(),
                        handler);
            } catch (Throwable ex) {
                // Record the failure via the attempt state. If recording itself
                // fails, propagate that error with the original attached as
                // suppressed. NOTE(review): when setException succeeds, ex is
                // NOT rethrown — the failure is surfaced only through the
                // attempt state (task recovery is disabled; see the
                // commented-out block below) — confirm this is intended.
                try {
                    handler.setException(ex);
                } catch (Throwable e) {
                    e.addSuppressed(ex);
                    throw e;
                }
                //if (task.getTaskRecovery()) {
                //    throw ex;
                //}
            }
        }
    }

    /**
     * Reducer side: opens the Embulk output plugin once in setup, feeds every
     * page received for each partition key to it in reduce, then finishes,
     * commits and closes the output in cleanup (commit is skipped if any
     * reduce call failed).
     */
    public static class EmbulkPartitioningReducer
            extends Reducer<BufferWritable, PageWritable, NullWritable, NullWritable>
    {
        private Context context;
        private SessionRunner runner;
        private AttemptStateUpdateHandler handler;
        private TransactionalPageOutput output;
        // set when a reduce call throws; cleanup then skips finish/commit
        private boolean failed = false;

        @Override
        public void setup(final Context context) throws IOException, InterruptedException
        {
            this.context = context;
            this.runner = new SessionRunner(context);
            runner.readPluginArchive().restoreLoadPathsTo(runner.getScriptingContainer());

            runner.execSession(new ExecAction<Void>() {
                public Void run() throws Exception
                {
                    // the Hadoop reduce task id doubles as the Embulk output
                    // task index
                    int taskIndex = context.getTaskAttemptID().getTaskID().getId();

                    ProcessTask task = runner.getMapReduceExecutorTask().getProcessTask();
                    ExecSession exec = runner.getExecSession();
                    OutputPlugin outputPlugin = exec.newPlugin(OutputPlugin.class, task.getOutputPluginType());

                    handler = new AttemptStateUpdateHandler(runner,
                            new AttemptState(context.getTaskAttemptID(), Optional.<Integer>absent(), Optional.of(taskIndex)));

                    output = outputPlugin.open(task.getOutputTaskSource(), task.getExecutorSchema(), taskIndex);

                    handler.started();

                    return null;
                }
            });
        }

        @Override
        public void reduce(BufferWritable key, final Iterable<PageWritable> values, final Context context)
                throws IOException, InterruptedException
        {
            runner.execSession(new ExecAction<Void>() {
                public Void run() throws Exception
                {
                    process(context, values);
                    return null;
                }
            });
        }

        // Adds all pages of one partition key to the output plugin. On error,
        // marks the attempt failed and records the exception; the error is not
        // rethrown unless recording it fails, so later keys still get reduced
        // but cleanup will skip the commit.
        private void process(final Context context, Iterable<PageWritable> values) throws IOException, InterruptedException
        {
            try {
                for (PageWritable value : values) {
                    output.add(value.get());
                }
            } catch (Throwable ex) {
                failed = true;
                try {
                    handler.setException(ex);
                } catch (Throwable e) {
                    e.addSuppressed(ex);
                    throw e;
                }
            }
        }

        // NOTE(review): missing @Override on this Reducer lifecycle method
        protected void cleanup(Context context) throws IOException, InterruptedException
        {
            runner.execSession(new ExecAction<Void>() {
                public Void run() throws Exception
                {
                    try {
                        if (!failed) {
                            output.finish();
                            CommitReport report = output.commit();
                            handler.outputCommitted(report);
                        }
                    } finally {
                        // always close the output, even if finish/commit threw
                        output.close();
                    }
                    return null;
                }
            });
        }
    }

    /**
     * OutputPlugin adapter used on the mapper side: instead of writing to a
     * real output it partitions incoming pages with the configured Partitioner
     * and forwards them to a PartitionedPageOutput (which emits into the
     * shuffle). Only open() is reachable in the mapper; the transaction
     * lifecycle methods are never invoked and therefore throw.
     */
    private static class MapperOutputPlugin
            implements OutputPlugin
    {
        private final BufferAllocator bufferAllocator;
        private final Partitioner partitioner;
        // presumably the max pages buffered per partition key before flushing
        // — see BufferedPagePartitioner; TODO confirm
        private final int maxPageBufferCount;
        private final PartitionedPageOutput output;

        public MapperOutputPlugin(BufferAllocator bufferAllocator,
                Partitioner partitioner, int maxPageBufferCount,
                PartitionedPageOutput output)
        {
            this.bufferAllocator = bufferAllocator;
            this.partitioner = partitioner;
            this.maxPageBufferCount = maxPageBufferCount;
            this.output = output;
        }

        public ConfigDiff transaction(ConfigSource config,
                Schema schema, int taskCount,
                OutputPlugin.Control control)
        {
            // won't be called
            throw new RuntimeException("");
        }

        public ConfigDiff resume(TaskSource taskSource,
                Schema schema, int taskCount,
                OutputPlugin.Control control)
        {
            // won't be called
            throw new RuntimeException("");
        }

        public void cleanup(TaskSource taskSource,
                Schema schema, int taskCount,
                List<CommitReport> successCommitReports)
        {
            // won't be called
            throw new RuntimeException("");
        }

        public TransactionalPageOutput open(TaskSource taskSource, final Schema schema, int taskIndex)
        {
            return new TransactionalPageOutput() {
                private final BufferedPagePartitioner bufferedPartitioner = new BufferedPagePartitioner(
                        bufferAllocator, schema, partitioner, maxPageBufferCount, output);
                private final PageReader reader = new PageReader(schema);

                // re-read every record of the page and route each one to its
                // partition bucket
                public void add(Page page)
                {
                    reader.setPage(page);
                    while (reader.nextRecord()) {
                        bufferedPartitioner.add(reader);
                    }
                }

                public void finish()
                {
                    bufferedPartitioner.finish();
                }

                public void close()
                {
                    reader.close();
                    bufferedPartitioner.close();
                }

                public void abort()
                { }

                // nothing meaningful to report from the mapper side
                public CommitReport commit()
                {
                    return Exec.newCommitReport();
                }
            };
        }
    }
}
@@ -0,0 +1,63 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import org.apache.hadoop.io.IntWritable;
4
+ import org.apache.hadoop.io.NullWritable;
5
+ import org.apache.hadoop.mapreduce.InputSplit;
6
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
7
+ import org.apache.hadoop.mapreduce.RecordReader;
8
+
9
+ public class EmbulkRecordReader
10
+ extends RecordReader<IntWritable, NullWritable>
11
+ {
12
+ private final int[] taskIndexes;
13
+ private int offset;
14
+
15
+ private final IntWritable currentKey = new IntWritable();
16
+
17
+ public EmbulkRecordReader(EmbulkInputSplit split)
18
+ {
19
+ this.taskIndexes = split.getTaskIndexes();
20
+ this.offset = -1;
21
+ }
22
+
23
+ @Override
24
+ public void initialize(InputSplit split, TaskAttemptContext context)
25
+ { }
26
+
27
+ @Override
28
+ public boolean nextKeyValue()
29
+ {
30
+ offset++;
31
+ if (taskIndexes.length <= offset) {
32
+ return false;
33
+ }
34
+ currentKey.set(taskIndexes[offset]);
35
+ return true;
36
+ }
37
+
38
+ @Override
39
+ public float getProgress()
40
+ {
41
+ if (taskIndexes.length == 0) {
42
+ return (float) 1.0;
43
+ }
44
+ return offset / (float) taskIndexes.length;
45
+ }
46
+
47
+ @Override
48
+ public IntWritable getCurrentKey()
49
+ {
50
+ return currentKey;
51
+ }
52
+
53
+ @Override
54
+ public NullWritable getCurrentValue()
55
+ {
56
+ return NullWritable.get();
57
+ }
58
+
59
+ @Override
60
+ public void close()
61
+ {
62
+ }
63
+ }