embulk-executor-mapreduce 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/build.gradle +2 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/avro-1.7.4.jar +0 -0
  9. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  10. data/classpath/commons-cli-1.2.jar +0 -0
  11. data/classpath/commons-codec-1.6.jar +0 -0
  12. data/classpath/commons-collections-3.2.1.jar +0 -0
  13. data/classpath/commons-compress-1.4.1.jar +0 -0
  14. data/classpath/commons-configuration-1.6.jar +0 -0
  15. data/classpath/commons-digester-1.8.jar +0 -0
  16. data/classpath/commons-httpclient-3.1.jar +0 -0
  17. data/classpath/commons-io-2.4.jar +0 -0
  18. data/classpath/commons-lang-2.6.jar +0 -0
  19. data/classpath/commons-logging-1.1.3.jar +0 -0
  20. data/classpath/commons-math3-3.1.1.jar +0 -0
  21. data/classpath/commons-net-3.1.jar +0 -0
  22. data/classpath/curator-client-2.6.0.jar +0 -0
  23. data/classpath/curator-framework-2.6.0.jar +0 -0
  24. data/classpath/curator-recipes-2.6.0.jar +0 -0
  25. data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
  26. data/classpath/gson-2.2.4.jar +0 -0
  27. data/classpath/hadoop-annotations-2.6.0.jar +0 -0
  28. data/classpath/hadoop-auth-2.6.0.jar +0 -0
  29. data/classpath/hadoop-client-2.6.0.jar +0 -0
  30. data/classpath/hadoop-common-2.6.0.jar +0 -0
  31. data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
  32. data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
  37. data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
  38. data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
  39. data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
  40. data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
  41. data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
  42. data/classpath/htrace-core-3.0.4.jar +0 -0
  43. data/classpath/httpclient-4.2.5.jar +0 -0
  44. data/classpath/httpcore-4.2.4.jar +0 -0
  45. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  46. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  47. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  48. data/classpath/jackson-xc-1.9.13.jar +0 -0
  49. data/classpath/jaxb-api-2.2.2.jar +0 -0
  50. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  51. data/classpath/jersey-client-1.9.jar +0 -0
  52. data/classpath/jersey-core-1.9.jar +0 -0
  53. data/classpath/jersey-guice-1.9.jar +0 -0
  54. data/classpath/jersey-json-1.9.jar +0 -0
  55. data/classpath/jersey-server-1.9.jar +0 -0
  56. data/classpath/jettison-1.1.jar +0 -0
  57. data/classpath/jetty-util-6.1.26.jar +0 -0
  58. data/classpath/jline-0.9.94.jar +0 -0
  59. data/classpath/jsr305-1.3.9.jar +0 -0
  60. data/classpath/leveldbjni-all-1.8.jar +0 -0
  61. data/classpath/netty-3.7.0.Final.jar +0 -0
  62. data/classpath/paranamer-2.3.jar +0 -0
  63. data/classpath/protobuf-java-2.5.0.jar +0 -0
  64. data/classpath/servlet-api-2.5.jar +0 -0
  65. data/classpath/snappy-java-1.0.4.1.jar +0 -0
  66. data/classpath/stax-api-1.0-2.jar +0 -0
  67. data/classpath/xmlenc-0.52.jar +0 -0
  68. data/classpath/xz-1.0.jar +0 -0
  69. data/classpath/zookeeper-3.4.6.jar +0 -0
  70. data/lib/embulk/executor/mapreduce.rb +3 -0
  71. data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
  72. data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
  73. data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
  74. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
  75. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
  76. data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
  77. data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
  78. data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
  79. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
  80. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
  81. data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
  82. data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
  83. data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
  84. data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
  85. data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
  86. data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
  87. data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
  88. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
  89. metadata +131 -0
@@ -0,0 +1,61 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import java.io.DataInput;
4
+ import java.io.DataOutput;
5
+ import java.io.IOException;
6
+ import org.apache.hadoop.io.Writable;
7
+ import org.apache.hadoop.mapreduce.InputSplit;
8
+
9
+ public class EmbulkInputSplit
10
+ extends InputSplit
11
+ implements Writable
12
+ {
13
+ private int[] taskIndexes;
14
+
15
+ public EmbulkInputSplit()
16
+ {
17
+ this(new int[0]);
18
+ }
19
+
20
+ public EmbulkInputSplit(int[] taskIndexes)
21
+ {
22
+ this.taskIndexes = taskIndexes;
23
+ }
24
+
25
+ public int[] getTaskIndexes()
26
+ {
27
+ return taskIndexes;
28
+ }
29
+
30
+ @Override
31
+ public long getLength()
32
+ {
33
+ return taskIndexes.length;
34
+ }
35
+
36
+ @Override
37
+ public String[] getLocations()
38
+ {
39
+ return new String[0];
40
+ }
41
+
42
+ @Override
43
+ public void write(DataOutput out) throws IOException
44
+ {
45
+ out.writeInt(taskIndexes.length);
46
+ for (int taskIndex : taskIndexes) {
47
+ out.writeInt(taskIndex);
48
+ }
49
+ }
50
+
51
+ @Override
52
+ public void readFields(DataInput in) throws IOException
53
+ {
54
+ int c = in.readInt();
55
+ int[] taskIndexes = new int[c];
56
+ for (int i=0; i < c; i++) {
57
+ taskIndexes[i] = in.readInt();
58
+ }
59
+ this.taskIndexes = taskIndexes;
60
+ }
61
+ }
@@ -0,0 +1,359 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import java.util.List;
4
+ import java.util.ArrayList;
5
+ import java.util.concurrent.ExecutionException;
6
+ import java.io.File;
7
+ import java.io.IOException;
8
+ import com.google.inject.Injector;
9
+ import com.google.common.base.Optional;
10
+ import com.google.common.base.Throwables;
11
+ import com.google.common.collect.ImmutableList;
12
+ import com.fasterxml.jackson.core.JsonFactory;
13
+ import com.fasterxml.jackson.databind.ObjectMapper;
14
+ import org.jruby.embed.ScriptingContainer;
15
+ import org.apache.hadoop.fs.Path;
16
+ import org.apache.hadoop.fs.FileStatus;
17
+ import org.apache.hadoop.fs.FSDataInputStream;
18
+ import org.apache.hadoop.fs.FSDataOutputStream;
19
+ import org.apache.hadoop.fs.LocalDirAllocator;
20
+ import org.apache.hadoop.io.IntWritable;
21
+ import org.apache.hadoop.io.NullWritable;
22
+ import org.apache.hadoop.conf.Configuration;
23
+ import org.apache.hadoop.mapreduce.Job;
24
+ import org.apache.hadoop.mapreduce.JobContext;
25
+ import org.apache.hadoop.mapreduce.TaskAttemptID;
26
+ import org.apache.hadoop.mapreduce.Mapper;
27
+ import org.apache.hadoop.mapreduce.Reducer;
28
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
29
+ import org.apache.hadoop.mapreduce.MRConfig;
30
+ import org.embulk.config.ModelManager;
31
+ import org.embulk.config.ConfigSource;
32
+ import org.embulk.config.ConfigLoader;
33
+ import org.embulk.config.CommitReport;
34
+ import org.embulk.spi.BufferAllocator;
35
+ import org.embulk.spi.Exec;
36
+ import org.embulk.spi.ExecAction;
37
+ import org.embulk.spi.ExecSession;
38
+ import org.embulk.spi.ProcessTask;
39
+ import org.embulk.spi.util.Executors;
40
+ import org.embulk.EmbulkService;
41
+
42
+ public class EmbulkMapReduce
43
+ {
44
+ private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
45
+ private static final String CK_STATE_DIRECTORY_PATH = "embulk.mapreduce.stateDirectorypath";
46
+ private static final String CK_TASK_COUNT = "embulk.mapreduce.taskCount";
47
+ private static final String CK_TASK = "embulk.mapreduce.task";
48
+ private static final String CK_PLUGIN_ARCHIVE_SPECS = "embulk.mapreduce.pluginArchive.specs";
49
+ private static final String PLUGIN_ARCHIVE_FILE_NAME = "gems.zip";
50
+
51
+ public static void setSystemConfig(Configuration config, ModelManager modelManager, ConfigSource systemConfig)
52
+ {
53
+ config.set(CK_SYSTEM_CONFIG, modelManager.writeObject(systemConfig));
54
+ }
55
+
56
+ public static ConfigSource getSystemConfig(Configuration config)
57
+ {
58
+ try {
59
+ ModelManager bootstrapModelManager = new ModelManager(null, new ObjectMapper());
60
+ return new ConfigLoader(bootstrapModelManager).fromJson(
61
+ new JsonFactory().createParser(config.get(CK_SYSTEM_CONFIG))); // TODO add fromJson(String)
62
+ } catch (IOException e) {
63
+ throw Throwables.propagate(e);
64
+ }
65
+ }
66
+
67
+ public static void setMapTaskCount(Configuration config, int taskCount)
68
+ {
69
+ config.setInt(CK_TASK_COUNT, taskCount);
70
+ }
71
+
72
+ public static int getMapTaskCount(Configuration config)
73
+ {
74
+ return config.getInt(CK_TASK_COUNT, 0);
75
+ }
76
+
77
+ public static void setStateDirectoryPath(Configuration config, Path path)
78
+ {
79
+ config.set(CK_STATE_DIRECTORY_PATH, path.toString());
80
+ }
81
+
82
+ public static Path getStateDirectoryPath(Configuration config)
83
+ {
84
+ return new Path(config.get(CK_STATE_DIRECTORY_PATH));
85
+ }
86
+
87
+ public static void setExecutorTask(Configuration config, ModelManager modelManager, MapReduceExecutorTask task)
88
+ {
89
+ config.set(CK_TASK, modelManager.writeObject(task));
90
+ }
91
+
92
+ public static MapReduceExecutorTask getExecutorTask(Injector injector, Configuration config)
93
+ {
94
+ return injector.getInstance(ModelManager.class).readObject(MapReduceExecutorTask.class,
95
+ config.get(CK_TASK));
96
+ }
97
+
98
+ public static Injector newEmbulkInstance(Configuration config)
99
+ {
100
+ ConfigSource systemConfig = getSystemConfig(config);
101
+ return new EmbulkService(systemConfig).getInjector();
102
+ }
103
+
104
+ public static List<TaskAttemptID> listAttempts(Configuration config,
105
+ Path stateDir) throws IOException
106
+ {
107
+ FileStatus[] stats = stateDir.getFileSystem(config).listStatus(stateDir);
108
+ ImmutableList.Builder<TaskAttemptID> builder = ImmutableList.builder();
109
+ for (FileStatus stat : stats) {
110
+ if (stat.getPath().getName().startsWith("attempt_") && stat.isFile()) {
111
+ String name = stat.getPath().getName();
112
+ try {
113
+ builder.add(TaskAttemptID.forName(name));
114
+ } catch (IllegalArgumentException ex) {
115
+ // ignore
116
+ }
117
+ }
118
+ }
119
+ return builder.build();
120
+ }
121
+
122
+ public static PluginArchive readPluginArchive(File localDirectory, Configuration config,
123
+ Path stateDir, ModelManager modelManager) throws IOException
124
+ {
125
+ List<PluginArchive.GemSpec> specs = modelManager.readObject(
126
+ new ArrayList<PluginArchive.GemSpec>() {}.getClass(),
127
+ config.get(CK_PLUGIN_ARCHIVE_SPECS));
128
+ Path path = new Path(stateDir, PLUGIN_ARCHIVE_FILE_NAME);
129
+ try (FSDataInputStream in = path.getFileSystem(config).open(path)) {
130
+ return PluginArchive.load(localDirectory, specs, in);
131
+ }
132
+ }
133
+
134
+ public static void writePluginArchive(Configuration config, Path stateDir,
135
+ PluginArchive archive, ModelManager modelManager) throws IOException
136
+ {
137
+ Path path = new Path(stateDir, PLUGIN_ARCHIVE_FILE_NAME);
138
+ try (FSDataOutputStream out = path.getFileSystem(config).create(path, true)) {
139
+ List<PluginArchive.GemSpec> specs = archive.dump(out);
140
+ config.set(CK_PLUGIN_ARCHIVE_SPECS, modelManager.writeObject(specs));
141
+ }
142
+ }
143
+
144
+ public static AttemptState readAttemptStateFile(Configuration config,
145
+ Path stateDir, TaskAttemptID id, ModelManager modelManager) throws IOException
146
+ {
147
+ Path path = new Path(stateDir, id.toString());
148
+ try (FSDataInputStream in = path.getFileSystem(config).open(path)) {
149
+ return AttemptState.readFrom(in, modelManager);
150
+ }
151
+ }
152
+
153
+ public static void writeAttemptStateFile(Configuration config,
154
+ Path stateDir, AttemptState state, ModelManager modelManager) throws IOException
155
+ {
156
+ Path path = new Path(stateDir, state.getAttemptId().toString());
157
+ try (FSDataOutputStream out = path.getFileSystem(config).create(path, true)) {
158
+ state.writeTo(out, modelManager);
159
+ }
160
+ }
161
+
162
+ public static class SessionRunner
163
+ {
164
+ private final Configuration config;
165
+ private final Injector injector;
166
+ private final ModelManager modelManager;
167
+ private final MapReduceExecutorTask task;
168
+ private final ExecSession session;
169
+ private final File localGemPath;
170
+
171
+ public SessionRunner(TaskAttemptContext context)
172
+ {
173
+ this.config = context.getConfiguration();
174
+ this.injector = newEmbulkInstance(context.getConfiguration());
175
+ this.modelManager = injector.getInstance(ModelManager.class);
176
+ this.task = getExecutorTask(injector, context.getConfiguration());
177
+ this.session = new ExecSession(injector, task.getExecConfig());
178
+
179
+ try {
180
+ LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
181
+ Path destPath = localDirAllocator.getLocalPathForWrite("gems", config);
182
+ this.localGemPath = new File(destPath.toString());
183
+ } catch (IOException ex) {
184
+ throw new RuntimeException(ex);
185
+ }
186
+ }
187
+
188
+ public PluginArchive readPluginArchive() throws IOException
189
+ {
190
+ localGemPath.mkdirs();
191
+ return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config), modelManager);
192
+ }
193
+
194
+ public Configuration getConfiguration()
195
+ {
196
+ return config;
197
+ }
198
+
199
+ public ModelManager getModelManager()
200
+ {
201
+ return modelManager;
202
+ }
203
+
204
+ public BufferAllocator getBufferAllocator()
205
+ {
206
+ return injector.getInstance(BufferAllocator.class);
207
+ }
208
+
209
+ public ScriptingContainer getScriptingContainer()
210
+ {
211
+ return injector.getInstance(ScriptingContainer.class);
212
+ }
213
+
214
+ public MapReduceExecutorTask getMapReduceExecutorTask()
215
+ {
216
+ return task;
217
+ }
218
+
219
+ public ExecSession getExecSession()
220
+ {
221
+ return session;
222
+ }
223
+
224
+ public <T> T execSession(ExecAction<T> action) throws IOException, InterruptedException
225
+ {
226
+ try {
227
+ return Exec.doWith(session, action);
228
+ } catch (ExecutionException e) {
229
+ Throwables.propagateIfInstanceOf(e.getCause(), IOException.class);
230
+ Throwables.propagateIfInstanceOf(e.getCause(), InterruptedException.class);
231
+ throw Throwables.propagate(e.getCause());
232
+ }
233
+ }
234
+
235
+ public void deleteTempFiles()
236
+ {
237
+ // TODO delete localGemPath
238
+ }
239
+ }
240
+
241
+ public static class AttemptStateUpdateHandler
242
+ implements Executors.ProcessStateCallback
243
+ {
244
+ private final Configuration config;
245
+ private final Path stateDir;
246
+ private final ModelManager modelManager;
247
+ private final AttemptState state;
248
+
249
+ public AttemptStateUpdateHandler(SessionRunner runner, AttemptState state)
250
+ {
251
+ this.config = runner.getConfiguration();
252
+ this.stateDir = getStateDirectoryPath(config);
253
+ this.state = state;
254
+ this.modelManager = runner.getModelManager();
255
+ }
256
+
257
+ @Override
258
+ public void started()
259
+ {
260
+ try {
261
+ writeAttemptStateFile(config, stateDir, state, modelManager);
262
+ } catch (IOException e) {
263
+ throw new RuntimeException(e);
264
+ }
265
+ }
266
+
267
+ @Override
268
+ public void inputCommitted(CommitReport report)
269
+ {
270
+ state.setInputCommitReport(report);
271
+ try {
272
+ writeAttemptStateFile(config, stateDir, state, modelManager);
273
+ } catch (IOException e) {
274
+ throw new RuntimeException(e);
275
+ }
276
+ }
277
+
278
+ @Override
279
+ public void outputCommitted(CommitReport report)
280
+ {
281
+ state.setOutputCommitReport(report);
282
+ try {
283
+ writeAttemptStateFile(config, stateDir, state, modelManager);
284
+ } catch (IOException e) {
285
+ throw new RuntimeException(e);
286
+ }
287
+ }
288
+
289
+ public void setException(Throwable ex) throws IOException
290
+ {
291
+ state.setException(ex);
292
+ writeAttemptStateFile(config, stateDir, state, modelManager);
293
+ }
294
+ }
295
+
296
+ public static class EmbulkMapper
297
+ extends Mapper<IntWritable, NullWritable, NullWritable, NullWritable>
298
+ {
299
+ private Context context;
300
+ private SessionRunner runner;
301
+
302
+ @Override
303
+ public void setup(Context context) throws IOException
304
+ {
305
+ this.context = context;
306
+ this.runner = new SessionRunner(context);
307
+ runner.readPluginArchive().restoreLoadPathsTo(runner.getScriptingContainer());
308
+ }
309
+
310
+ @Override
311
+ public void map(IntWritable key, NullWritable value, final Context context) throws IOException, InterruptedException
312
+ {
313
+ final int taskIndex = key.get();
314
+
315
+ runner.execSession(new ExecAction<Void>() {
316
+ public Void run() throws Exception
317
+ {
318
+ process(context, taskIndex);
319
+ return null;
320
+ }
321
+ });
322
+ }
323
+
324
+ private void process(final Context context, int taskIndex) throws IOException, InterruptedException
325
+ {
326
+ ProcessTask task = runner.getMapReduceExecutorTask().getProcessTask();
327
+
328
+ AttemptStateUpdateHandler handler = new AttemptStateUpdateHandler(runner,
329
+ new AttemptState(context.getTaskAttemptID(), Optional.of(taskIndex), Optional.of(taskIndex)));
330
+
331
+ try {
332
+ Executors.process(runner.getExecSession(), task, taskIndex, handler);
333
+ } catch (Throwable ex) {
334
+ try {
335
+ handler.setException(ex);
336
+ } catch (Throwable e) {
337
+ e.addSuppressed(ex);
338
+ throw e;
339
+ }
340
+ //if (task.getTaskRecovery()) {
341
+ // throw ex;
342
+ //}
343
+ }
344
+ }
345
+ }
346
+
347
+ public static class EmbulkReducer
348
+ extends Reducer<NullWritable, NullWritable, NullWritable, NullWritable>
349
+ {
350
+ private IntWritable result = new IntWritable();
351
+
352
+ @Override
353
+ public void reduce(NullWritable key, Iterable<NullWritable> values, Context context)
354
+ throws IOException, InterruptedException
355
+ {
356
+ // do nothing
357
+ }
358
+ }
359
+ }