embulk-executor-mapreduce 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/build.gradle +2 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/avro-1.7.4.jar +0 -0
  9. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  10. data/classpath/commons-cli-1.2.jar +0 -0
  11. data/classpath/commons-codec-1.6.jar +0 -0
  12. data/classpath/commons-collections-3.2.1.jar +0 -0
  13. data/classpath/commons-compress-1.4.1.jar +0 -0
  14. data/classpath/commons-configuration-1.6.jar +0 -0
  15. data/classpath/commons-digester-1.8.jar +0 -0
  16. data/classpath/commons-httpclient-3.1.jar +0 -0
  17. data/classpath/commons-io-2.4.jar +0 -0
  18. data/classpath/commons-lang-2.6.jar +0 -0
  19. data/classpath/commons-logging-1.1.3.jar +0 -0
  20. data/classpath/commons-math3-3.1.1.jar +0 -0
  21. data/classpath/commons-net-3.1.jar +0 -0
  22. data/classpath/curator-client-2.6.0.jar +0 -0
  23. data/classpath/curator-framework-2.6.0.jar +0 -0
  24. data/classpath/curator-recipes-2.6.0.jar +0 -0
  25. data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
  26. data/classpath/gson-2.2.4.jar +0 -0
  27. data/classpath/hadoop-annotations-2.6.0.jar +0 -0
  28. data/classpath/hadoop-auth-2.6.0.jar +0 -0
  29. data/classpath/hadoop-client-2.6.0.jar +0 -0
  30. data/classpath/hadoop-common-2.6.0.jar +0 -0
  31. data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
  32. data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
  37. data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
  38. data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
  39. data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
  40. data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
  41. data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
  42. data/classpath/htrace-core-3.0.4.jar +0 -0
  43. data/classpath/httpclient-4.2.5.jar +0 -0
  44. data/classpath/httpcore-4.2.4.jar +0 -0
  45. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  46. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  47. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  48. data/classpath/jackson-xc-1.9.13.jar +0 -0
  49. data/classpath/jaxb-api-2.2.2.jar +0 -0
  50. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  51. data/classpath/jersey-client-1.9.jar +0 -0
  52. data/classpath/jersey-core-1.9.jar +0 -0
  53. data/classpath/jersey-guice-1.9.jar +0 -0
  54. data/classpath/jersey-json-1.9.jar +0 -0
  55. data/classpath/jersey-server-1.9.jar +0 -0
  56. data/classpath/jettison-1.1.jar +0 -0
  57. data/classpath/jetty-util-6.1.26.jar +0 -0
  58. data/classpath/jline-0.9.94.jar +0 -0
  59. data/classpath/jsr305-1.3.9.jar +0 -0
  60. data/classpath/leveldbjni-all-1.8.jar +0 -0
  61. data/classpath/netty-3.7.0.Final.jar +0 -0
  62. data/classpath/paranamer-2.3.jar +0 -0
  63. data/classpath/protobuf-java-2.5.0.jar +0 -0
  64. data/classpath/servlet-api-2.5.jar +0 -0
  65. data/classpath/snappy-java-1.0.4.1.jar +0 -0
  66. data/classpath/stax-api-1.0-2.jar +0 -0
  67. data/classpath/xmlenc-0.52.jar +0 -0
  68. data/classpath/xz-1.0.jar +0 -0
  69. data/classpath/zookeeper-3.4.6.jar +0 -0
  70. data/lib/embulk/executor/mapreduce.rb +3 -0
  71. data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
  72. data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
  73. data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
  74. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
  75. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
  76. data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
  77. data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
  78. data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
  79. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
  80. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
  81. data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
  82. data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
  83. data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
  84. data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
  85. data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
  86. data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
  87. data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
  88. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
  89. metadata +131 -0
@@ -0,0 +1,61 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import java.io.DataInput;
4
+ import java.io.DataOutput;
5
+ import java.io.IOException;
6
+ import org.apache.hadoop.io.Writable;
7
+ import org.apache.hadoop.mapreduce.InputSplit;
8
+
9
+ public class EmbulkInputSplit
10
+ extends InputSplit
11
+ implements Writable
12
+ {
13
+ private int[] taskIndexes;
14
+
15
+ public EmbulkInputSplit()
16
+ {
17
+ this(new int[0]);
18
+ }
19
+
20
+ public EmbulkInputSplit(int[] taskIndexes)
21
+ {
22
+ this.taskIndexes = taskIndexes;
23
+ }
24
+
25
+ public int[] getTaskIndexes()
26
+ {
27
+ return taskIndexes;
28
+ }
29
+
30
+ @Override
31
+ public long getLength()
32
+ {
33
+ return taskIndexes.length;
34
+ }
35
+
36
+ @Override
37
+ public String[] getLocations()
38
+ {
39
+ return new String[0];
40
+ }
41
+
42
+ @Override
43
+ public void write(DataOutput out) throws IOException
44
+ {
45
+ out.writeInt(taskIndexes.length);
46
+ for (int taskIndex : taskIndexes) {
47
+ out.writeInt(taskIndex);
48
+ }
49
+ }
50
+
51
+ @Override
52
+ public void readFields(DataInput in) throws IOException
53
+ {
54
+ int c = in.readInt();
55
+ int[] taskIndexes = new int[c];
56
+ for (int i=0; i < c; i++) {
57
+ taskIndexes[i] = in.readInt();
58
+ }
59
+ this.taskIndexes = taskIndexes;
60
+ }
61
+ }
@@ -0,0 +1,359 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import java.util.List;
4
+ import java.util.ArrayList;
5
+ import java.util.concurrent.ExecutionException;
6
+ import java.io.File;
7
+ import java.io.IOException;
8
+ import com.google.inject.Injector;
9
+ import com.google.common.base.Optional;
10
+ import com.google.common.base.Throwables;
11
+ import com.google.common.collect.ImmutableList;
12
+ import com.fasterxml.jackson.core.JsonFactory;
13
+ import com.fasterxml.jackson.databind.ObjectMapper;
14
+ import org.jruby.embed.ScriptingContainer;
15
+ import org.apache.hadoop.fs.Path;
16
+ import org.apache.hadoop.fs.FileStatus;
17
+ import org.apache.hadoop.fs.FSDataInputStream;
18
+ import org.apache.hadoop.fs.FSDataOutputStream;
19
+ import org.apache.hadoop.fs.LocalDirAllocator;
20
+ import org.apache.hadoop.io.IntWritable;
21
+ import org.apache.hadoop.io.NullWritable;
22
+ import org.apache.hadoop.conf.Configuration;
23
+ import org.apache.hadoop.mapreduce.Job;
24
+ import org.apache.hadoop.mapreduce.JobContext;
25
+ import org.apache.hadoop.mapreduce.TaskAttemptID;
26
+ import org.apache.hadoop.mapreduce.Mapper;
27
+ import org.apache.hadoop.mapreduce.Reducer;
28
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
29
+ import org.apache.hadoop.mapreduce.MRConfig;
30
+ import org.embulk.config.ModelManager;
31
+ import org.embulk.config.ConfigSource;
32
+ import org.embulk.config.ConfigLoader;
33
+ import org.embulk.config.CommitReport;
34
+ import org.embulk.spi.BufferAllocator;
35
+ import org.embulk.spi.Exec;
36
+ import org.embulk.spi.ExecAction;
37
+ import org.embulk.spi.ExecSession;
38
+ import org.embulk.spi.ProcessTask;
39
+ import org.embulk.spi.util.Executors;
40
+ import org.embulk.EmbulkService;
41
+
42
+ public class EmbulkMapReduce
43
+ {
44
+ private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
45
+ private static final String CK_STATE_DIRECTORY_PATH = "embulk.mapreduce.stateDirectorypath";
46
+ private static final String CK_TASK_COUNT = "embulk.mapreduce.taskCount";
47
+ private static final String CK_TASK = "embulk.mapreduce.task";
48
+ private static final String CK_PLUGIN_ARCHIVE_SPECS = "embulk.mapreduce.pluginArchive.specs";
49
+ private static final String PLUGIN_ARCHIVE_FILE_NAME = "gems.zip";
50
+
51
+ public static void setSystemConfig(Configuration config, ModelManager modelManager, ConfigSource systemConfig)
52
+ {
53
+ config.set(CK_SYSTEM_CONFIG, modelManager.writeObject(systemConfig));
54
+ }
55
+
56
+ public static ConfigSource getSystemConfig(Configuration config)
57
+ {
58
+ try {
59
+ ModelManager bootstrapModelManager = new ModelManager(null, new ObjectMapper());
60
+ return new ConfigLoader(bootstrapModelManager).fromJson(
61
+ new JsonFactory().createParser(config.get(CK_SYSTEM_CONFIG))); // TODO add fromJson(String)
62
+ } catch (IOException e) {
63
+ throw Throwables.propagate(e);
64
+ }
65
+ }
66
+
67
+ public static void setMapTaskCount(Configuration config, int taskCount)
68
+ {
69
+ config.setInt(CK_TASK_COUNT, taskCount);
70
+ }
71
+
72
+ public static int getMapTaskCount(Configuration config)
73
+ {
74
+ return config.getInt(CK_TASK_COUNT, 0);
75
+ }
76
+
77
+ public static void setStateDirectoryPath(Configuration config, Path path)
78
+ {
79
+ config.set(CK_STATE_DIRECTORY_PATH, path.toString());
80
+ }
81
+
82
+ public static Path getStateDirectoryPath(Configuration config)
83
+ {
84
+ return new Path(config.get(CK_STATE_DIRECTORY_PATH));
85
+ }
86
+
87
+ public static void setExecutorTask(Configuration config, ModelManager modelManager, MapReduceExecutorTask task)
88
+ {
89
+ config.set(CK_TASK, modelManager.writeObject(task));
90
+ }
91
+
92
+ public static MapReduceExecutorTask getExecutorTask(Injector injector, Configuration config)
93
+ {
94
+ return injector.getInstance(ModelManager.class).readObject(MapReduceExecutorTask.class,
95
+ config.get(CK_TASK));
96
+ }
97
+
98
+ public static Injector newEmbulkInstance(Configuration config)
99
+ {
100
+ ConfigSource systemConfig = getSystemConfig(config);
101
+ return new EmbulkService(systemConfig).getInjector();
102
+ }
103
+
104
+ public static List<TaskAttemptID> listAttempts(Configuration config,
105
+ Path stateDir) throws IOException
106
+ {
107
+ FileStatus[] stats = stateDir.getFileSystem(config).listStatus(stateDir);
108
+ ImmutableList.Builder<TaskAttemptID> builder = ImmutableList.builder();
109
+ for (FileStatus stat : stats) {
110
+ if (stat.getPath().getName().startsWith("attempt_") && stat.isFile()) {
111
+ String name = stat.getPath().getName();
112
+ try {
113
+ builder.add(TaskAttemptID.forName(name));
114
+ } catch (IllegalArgumentException ex) {
115
+ // ignore
116
+ }
117
+ }
118
+ }
119
+ return builder.build();
120
+ }
121
+
122
+ public static PluginArchive readPluginArchive(File localDirectory, Configuration config,
123
+ Path stateDir, ModelManager modelManager) throws IOException
124
+ {
125
+ List<PluginArchive.GemSpec> specs = modelManager.readObject(
126
+ new ArrayList<PluginArchive.GemSpec>() {}.getClass(),
127
+ config.get(CK_PLUGIN_ARCHIVE_SPECS));
128
+ Path path = new Path(stateDir, PLUGIN_ARCHIVE_FILE_NAME);
129
+ try (FSDataInputStream in = path.getFileSystem(config).open(path)) {
130
+ return PluginArchive.load(localDirectory, specs, in);
131
+ }
132
+ }
133
+
134
+ public static void writePluginArchive(Configuration config, Path stateDir,
135
+ PluginArchive archive, ModelManager modelManager) throws IOException
136
+ {
137
+ Path path = new Path(stateDir, PLUGIN_ARCHIVE_FILE_NAME);
138
+ try (FSDataOutputStream out = path.getFileSystem(config).create(path, true)) {
139
+ List<PluginArchive.GemSpec> specs = archive.dump(out);
140
+ config.set(CK_PLUGIN_ARCHIVE_SPECS, modelManager.writeObject(specs));
141
+ }
142
+ }
143
+
144
+ public static AttemptState readAttemptStateFile(Configuration config,
145
+ Path stateDir, TaskAttemptID id, ModelManager modelManager) throws IOException
146
+ {
147
+ Path path = new Path(stateDir, id.toString());
148
+ try (FSDataInputStream in = path.getFileSystem(config).open(path)) {
149
+ return AttemptState.readFrom(in, modelManager);
150
+ }
151
+ }
152
+
153
+ public static void writeAttemptStateFile(Configuration config,
154
+ Path stateDir, AttemptState state, ModelManager modelManager) throws IOException
155
+ {
156
+ Path path = new Path(stateDir, state.getAttemptId().toString());
157
+ try (FSDataOutputStream out = path.getFileSystem(config).create(path, true)) {
158
+ state.writeTo(out, modelManager);
159
+ }
160
+ }
161
+
162
+ public static class SessionRunner
163
+ {
164
+ private final Configuration config;
165
+ private final Injector injector;
166
+ private final ModelManager modelManager;
167
+ private final MapReduceExecutorTask task;
168
+ private final ExecSession session;
169
+ private final File localGemPath;
170
+
171
+ public SessionRunner(TaskAttemptContext context)
172
+ {
173
+ this.config = context.getConfiguration();
174
+ this.injector = newEmbulkInstance(context.getConfiguration());
175
+ this.modelManager = injector.getInstance(ModelManager.class);
176
+ this.task = getExecutorTask(injector, context.getConfiguration());
177
+ this.session = new ExecSession(injector, task.getExecConfig());
178
+
179
+ try {
180
+ LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
181
+ Path destPath = localDirAllocator.getLocalPathForWrite("gems", config);
182
+ this.localGemPath = new File(destPath.toString());
183
+ } catch (IOException ex) {
184
+ throw new RuntimeException(ex);
185
+ }
186
+ }
187
+
188
+ public PluginArchive readPluginArchive() throws IOException
189
+ {
190
+ localGemPath.mkdirs();
191
+ return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config), modelManager);
192
+ }
193
+
194
+ public Configuration getConfiguration()
195
+ {
196
+ return config;
197
+ }
198
+
199
+ public ModelManager getModelManager()
200
+ {
201
+ return modelManager;
202
+ }
203
+
204
+ public BufferAllocator getBufferAllocator()
205
+ {
206
+ return injector.getInstance(BufferAllocator.class);
207
+ }
208
+
209
+ public ScriptingContainer getScriptingContainer()
210
+ {
211
+ return injector.getInstance(ScriptingContainer.class);
212
+ }
213
+
214
+ public MapReduceExecutorTask getMapReduceExecutorTask()
215
+ {
216
+ return task;
217
+ }
218
+
219
+ public ExecSession getExecSession()
220
+ {
221
+ return session;
222
+ }
223
+
224
+ public <T> T execSession(ExecAction<T> action) throws IOException, InterruptedException
225
+ {
226
+ try {
227
+ return Exec.doWith(session, action);
228
+ } catch (ExecutionException e) {
229
+ Throwables.propagateIfInstanceOf(e.getCause(), IOException.class);
230
+ Throwables.propagateIfInstanceOf(e.getCause(), InterruptedException.class);
231
+ throw Throwables.propagate(e.getCause());
232
+ }
233
+ }
234
+
235
+ public void deleteTempFiles()
236
+ {
237
+ // TODO delete localGemPath
238
+ }
239
+ }
240
+
241
+ public static class AttemptStateUpdateHandler
242
+ implements Executors.ProcessStateCallback
243
+ {
244
+ private final Configuration config;
245
+ private final Path stateDir;
246
+ private final ModelManager modelManager;
247
+ private final AttemptState state;
248
+
249
+ public AttemptStateUpdateHandler(SessionRunner runner, AttemptState state)
250
+ {
251
+ this.config = runner.getConfiguration();
252
+ this.stateDir = getStateDirectoryPath(config);
253
+ this.state = state;
254
+ this.modelManager = runner.getModelManager();
255
+ }
256
+
257
+ @Override
258
+ public void started()
259
+ {
260
+ try {
261
+ writeAttemptStateFile(config, stateDir, state, modelManager);
262
+ } catch (IOException e) {
263
+ throw new RuntimeException(e);
264
+ }
265
+ }
266
+
267
+ @Override
268
+ public void inputCommitted(CommitReport report)
269
+ {
270
+ state.setInputCommitReport(report);
271
+ try {
272
+ writeAttemptStateFile(config, stateDir, state, modelManager);
273
+ } catch (IOException e) {
274
+ throw new RuntimeException(e);
275
+ }
276
+ }
277
+
278
+ @Override
279
+ public void outputCommitted(CommitReport report)
280
+ {
281
+ state.setOutputCommitReport(report);
282
+ try {
283
+ writeAttemptStateFile(config, stateDir, state, modelManager);
284
+ } catch (IOException e) {
285
+ throw new RuntimeException(e);
286
+ }
287
+ }
288
+
289
+ public void setException(Throwable ex) throws IOException
290
+ {
291
+ state.setException(ex);
292
+ writeAttemptStateFile(config, stateDir, state, modelManager);
293
+ }
294
+ }
295
+
296
+ public static class EmbulkMapper
297
+ extends Mapper<IntWritable, NullWritable, NullWritable, NullWritable>
298
+ {
299
+ private Context context;
300
+ private SessionRunner runner;
301
+
302
+ @Override
303
+ public void setup(Context context) throws IOException
304
+ {
305
+ this.context = context;
306
+ this.runner = new SessionRunner(context);
307
+ runner.readPluginArchive().restoreLoadPathsTo(runner.getScriptingContainer());
308
+ }
309
+
310
+ @Override
311
+ public void map(IntWritable key, NullWritable value, final Context context) throws IOException, InterruptedException
312
+ {
313
+ final int taskIndex = key.get();
314
+
315
+ runner.execSession(new ExecAction<Void>() {
316
+ public Void run() throws Exception
317
+ {
318
+ process(context, taskIndex);
319
+ return null;
320
+ }
321
+ });
322
+ }
323
+
324
+ private void process(final Context context, int taskIndex) throws IOException, InterruptedException
325
+ {
326
+ ProcessTask task = runner.getMapReduceExecutorTask().getProcessTask();
327
+
328
+ AttemptStateUpdateHandler handler = new AttemptStateUpdateHandler(runner,
329
+ new AttemptState(context.getTaskAttemptID(), Optional.of(taskIndex), Optional.of(taskIndex)));
330
+
331
+ try {
332
+ Executors.process(runner.getExecSession(), task, taskIndex, handler);
333
+ } catch (Throwable ex) {
334
+ try {
335
+ handler.setException(ex);
336
+ } catch (Throwable e) {
337
+ e.addSuppressed(ex);
338
+ throw e;
339
+ }
340
+ //if (task.getTaskRecovery()) {
341
+ // throw ex;
342
+ //}
343
+ }
344
+ }
345
+ }
346
+
347
+ public static class EmbulkReducer
348
+ extends Reducer<NullWritable, NullWritable, NullWritable, NullWritable>
349
+ {
350
+ private IntWritable result = new IntWritable();
351
+
352
+ @Override
353
+ public void reduce(NullWritable key, Iterable<NullWritable> values, Context context)
354
+ throws IOException, InterruptedException
355
+ {
356
+ // do nothing
357
+ }
358
+ }
359
+ }