embulk-executor-mapreduce 0.2.2 → 0.2.3

Files changed (27)
  1. checksums.yaml +4 -4
  2. data/classpath/{embulk-executor-mapreduce-0.2.2.jar → embulk-executor-mapreduce-0.2.3.jar} +0 -0
  3. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +48 -24
  4. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +11 -6
  5. data/src/test/java/org/embulk/executor/mapreduce/MapReduceExecutorTestRuntime.java +130 -0
  6. data/src/test/java/org/embulk/executor/mapreduce/TestAttemptState.java +58 -0
  7. data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputFormat.java +54 -0
  8. data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputSplit.java +46 -0
  9. data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkRecordReader.java +25 -0
  10. data/src/test/java/org/embulk/executor/mapreduce/TestMapReduceExecutor.java +251 -0
  11. data/src/test/java/org/embulk/executor/mapreduce/TestPageBufferWritable.java +84 -0
  12. data/src/test/java/org/embulk/executor/mapreduce/TestTimestampPartitioning.java +222 -0
  13. data/src/test/resources/config/core-site.xml +8 -0
  14. data/src/test/resources/config/embulk_mapred_config.yml +38 -0
  15. data/src/test/resources/config/embulk_mapred_invalid_config_files_config.yml +38 -0
  16. data/src/test/resources/config/embulk_mapred_invalid_libjars_config.yml +40 -0
  17. data/src/test/resources/config/embulk_mapred_invalid_partitioning_config.yml +40 -0
  18. data/src/test/resources/config/embulk_mapred_invalid_reducers_config.yml +44 -0
  19. data/src/test/resources/config/embulk_mapred_partitioning_config.yml +43 -0
  20. data/src/test/resources/config/embulk_mapred_stop_on_invalid_record_config.yml +39 -0
  21. data/src/test/resources/config/hdfs-site.xml +18 -0
  22. data/src/test/resources/config/mapred-site.xml +8 -0
  23. data/src/test/resources/fixtures/csv/sample1.csv +3 -0
  24. data/src/test/resources/fixtures/csv/sample2.csv +4 -0
  25. data/src/test/resources/fixtures/invalid_csv/sample1.csv +4 -0
  26. data/src/test/resources/fixtures/invalid_csv/sample2.csv +3 -0
  27. metadata +25 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d683908af5b1f5e06ea035f9ecd4e7a89dbd4eab
-  data.tar.gz: f8a5a7db822f6241a40211dd1782e4fda83df073
+  metadata.gz: 3c12d59b65314dd94cf12e29b7caf8e83fd947bd
+  data.tar.gz: 478ea3a4e77c7a9e5395a4c4942ac9697772a10d
 SHA512:
-  metadata.gz: 48b955f8b5a70864dd745347079cc83545fdb6953091812917beeb28e6aa9f66726c68e8b85b0aa3c8da40f419fc0b1efa78aab90bd899e42bb4c1a0c14edaae
-  data.tar.gz: 99199b98f4f56e2d5d74c7f12be41b19bd31aa3f496144a940b16c6c53abf3980c181ae186f40ba3bf307106abb53dc00683c54157ea000652d24d66cff12cd5
+  metadata.gz: e8c955d1e7a4e0b318bc21a36c51410045b4b4c9e62b954e8848f66693d26fe580a79afb8c8e1c1910f9ecd1731f78772b6b9af7f64e817b9666776da26b126c
+  data.tar.gz: 14fb4a34dbaf1b59b8d37ceac1356d4ab66014a559943f16da087d9f17800e46c80e9b98ae1583be5568ee8909034ba708bfa4ca3dab27ddde90b5c8e2ab8f59
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java CHANGED
@@ -1,5 +1,6 @@
 package org.embulk.executor.mapreduce;

+import java.security.PrivilegedAction;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -16,6 +17,8 @@ import java.net.URISyntaxException;
 import java.net.URL;
 import java.net.URLClassLoader;
 import java.net.MalformedURLException;
+
+import org.apache.hadoop.security.UserGroupInformation;
 import org.slf4j.Logger;
 import org.joda.time.format.DateTimeFormat;
 import com.google.inject.Inject;
@@ -183,12 +186,12 @@ public class MapReduceExecutor
         }
     }

-    void run(MapReduceExecutorTask task,
-            int mapTaskCount, int reduceTaskCount, ProcessState state)
+    void run(final MapReduceExecutorTask task,
+            int mapTaskCount, final int reduceTaskCount, final ProcessState state)
     {
-        ModelManager modelManager = task.getModelManager();
+        final ModelManager modelManager = task.getModelManager();

-        Configuration conf = new Configuration();
+        final Configuration conf = new Configuration();
         // don't call conf.setQuietMode(false). Configuraiton has invalid resource names by default
         for (String path : task.getConfigFiles()) {
             File file = new File(path);
@@ -204,31 +207,56 @@ public class MapReduceExecutor
         }

         String uniqueTransactionName = getTransactionUniqueName(Exec.session());
-        Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);
-
-        Job job;
-        try {
-            job = Job.getInstance(conf);
-        } catch (IOException e) {
-            throw Throwables.propagate(e);
-        }
-        job.setJobName(task.getJobName());
+        final Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);

         // create a dedicated classloader for this yarn application.
         // allow task.getConfig to overwrite this parameter
-        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
-        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes
+        conf.set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
+        conf.set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes

         // extra config
         for (Map.Entry<String, String> pair : task.getConfig().entrySet()) {
-            job.getConfiguration().set(pair.getKey(), pair.getValue());
+            conf.set(pair.getKey(), pair.getValue());
         }

         // framework config
-        EmbulkMapReduce.setSystemConfig(job.getConfiguration(), modelManager, systemConfig);
-        EmbulkMapReduce.setExecutorTask(job.getConfiguration(), modelManager, task);
-        EmbulkMapReduce.setMapTaskCount(job.getConfiguration(), mapTaskCount); // used by EmbulkInputFormat
-        EmbulkMapReduce.setStateDirectoryPath(job.getConfiguration(), stateDir);
+        EmbulkMapReduce.setSystemConfig(conf, modelManager, systemConfig);
+        EmbulkMapReduce.setExecutorTask(conf, modelManager, task);
+        EmbulkMapReduce.setMapTaskCount(conf, mapTaskCount); // used by EmbulkInputFormat
+        EmbulkMapReduce.setStateDirectoryPath(conf, stateDir);
+
+        // jar files
+        List<Path> jars = collectJars(task.getLibjars(), task.getExcludeJars());
+        conf.set("tmpjars", StringUtils.join(",", jars));
+
+        String remoteUser = conf.get(MRJobConfig.USER_NAME); // mapreduce.job.user.name
+        if (remoteUser != null) {
+            UserGroupInformation.createRemoteUser(remoteUser).doAs(
+                    new PrivilegedAction<Void>()
+                    {
+                        @Override
+                        public Void run()
+                        {
+                            runJob(task, modelManager, reduceTaskCount, state, stateDir, conf);
+                            return null;
+                        }
+                    }
+            );
+        } else {
+            runJob(task, modelManager, reduceTaskCount, state, stateDir, conf);
+        }
+    }
+
+    void runJob(MapReduceExecutorTask task, ModelManager modelManager,
+            int reduceTaskCount, ProcessState state, Path stateDir, Configuration conf)
+    {
+        Job job;
+        try {
+            job = Job.getInstance(conf);
+        } catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+        job.setJobName(task.getJobName());

         // archive plugins (also create state dir)
         PluginArchive archive = new PluginArchive.Builder()
@@ -240,10 +268,6 @@ public class MapReduceExecutor
             throw new RuntimeException(ex);
         }

-        // jar files
-        List<Path> jars = collectJars(task.getLibjars(), task.getExcludeJars());
-        job.getConfiguration().set("tmpjars", StringUtils.join(",", jars));
-
         job.setInputFormatClass(EmbulkInputFormat.class);

         if (reduceTaskCount > 0) {
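
The substantive change in this file: job construction and submission moved into a new runJob method, and when the configuration names a remote user via mapreduce.job.user.name (MRJobConfig.USER_NAME), submission now happens inside UserGroupInformation.createRemoteUser(user).doAs(...), so the Hadoop job is submitted under that identity rather than the process owner. A minimal sketch of that impersonation pattern, assuming only hadoop-common on the classpath (the RemoteUserSketch class is illustrative, not part of the plugin):

```java
import java.io.IOException;
import java.security.PrivilegedAction;

import org.apache.hadoop.security.UserGroupInformation;

public class RemoteUserSketch
{
    public static void main(String[] args)
    {
        // createRemoteUser builds a UGI for the given name without any
        // credentials; doAs attaches it to the access-control context, so
        // Hadoop calls made inside run() see that identity.
        UserGroupInformation ugi = UserGroupInformation.createRemoteUser("embulk");
        String seen = ugi.doAs(new PrivilegedAction<String>() {
            @Override
            public String run()
            {
                try {
                    return UserGroupInformation.getCurrentUser().getUserName();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        });
        System.out.println(seen); // prints "embulk"
    }
}
```
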
data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java CHANGED
@@ -1,5 +1,6 @@
 package org.embulk.executor.mapreduce;

+import com.google.common.annotations.VisibleForTesting;
 import org.joda.time.DateTimeZone;
 import com.google.common.base.Optional;
 import org.embulk.config.Config;
@@ -40,7 +41,8 @@ public class TimestampPartitioning
         public void setTargetColumn(Column column);
     }

-    private static enum Unit
+    @VisibleForTesting
+    static enum Unit
     {
         HOUR(60*60),
         DAY(24*60*60);
@@ -70,12 +72,13 @@ public class TimestampPartitioning
             //case "year": return YEAR;
             default:
                 throw new ConfigException(
-                        String.format("Unknown unit '%s'. Supported units are hour and day"));
+                        String.format("Unknown unit '%s'. Supported units are hour and day", s));
             }
         }
     }

-    private static enum UnixTimestampUnit
+    @VisibleForTesting
+    static enum UnixTimestampUnit
     {
         SEC(1),
         MILLI(1000),
@@ -103,7 +106,7 @@ public class TimestampPartitioning
             case "nano": return NANO;
             default:
                 throw new ConfigException(
-                        String.format("Unknown unix_timestamp_unit '%s'. Supported units are sec, milli, micro, and nano"));
+                        String.format("Unknown unix_timestamp_unit '%s'. Supported units are sec, milli, micro, and nano", s));
             }
         }
     }
@@ -253,7 +256,8 @@ public class TimestampPartitioning
         }
     }

-    private static class TimestampPartitioner
+    @VisibleForTesting
+    static class TimestampPartitioner
             extends AbstractTimestampPartitioner
     {
         public TimestampPartitioner(Column column, Unit unit)
@@ -269,7 +273,8 @@ public class TimestampPartitioning
         }
     }

-    private static class LongUnixTimestampPartitioner
+    @VisibleForTesting
+    static class LongUnixTimestampPartitioner
             extends AbstractTimestampPartitioner
     {
         private final UnixTimestampUnit unixTimestampUnit;
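
Beyond relaxing visibility to @VisibleForTesting for the new tests, this diff fixes a real bug: both error paths called String.format with a '%s' placeholder but never passed the offending value, so they threw java.util.MissingFormatArgumentException instead of raising the intended ConfigException. A small before/after demonstration (the FormatArgSketch class is illustrative):

```java
import java.util.MissingFormatArgumentException;

public class FormatArgSketch
{
    public static void main(String[] args)
    {
        // Before the fix: '%s' has no matching argument, so format() itself
        // throws before the intended ConfigException can be constructed.
        try {
            String.format("Unknown unit '%s'. Supported units are hour and day");
        } catch (MissingFormatArgumentException e) {
            System.out.println("old behavior: " + e);
        }

        // After the fix the bad input is interpolated into the message.
        System.out.println(
                String.format("Unknown unit '%s'. Supported units are hour and day", "week"));
    }
}
```
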
data/src/test/java/org/embulk/executor/mapreduce/MapReduceExecutorTestRuntime.java ADDED
@@ -0,0 +1,130 @@
+package org.embulk.executor.mapreduce;
+
+import java.util.Random;
+
+import com.google.inject.util.Modules;
+import org.embulk.GuiceBinder;
+import org.embulk.RandomManager;
+import org.embulk.TestPluginSourceModule;
+import org.embulk.TestUtilityModule;
+import org.junit.runner.Description;
+import org.junit.runners.model.Statement;
+import com.google.inject.Injector;
+import com.google.inject.Binder;
+import com.google.inject.Module;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.DataSourceImpl;
+import org.embulk.config.ModelManager;
+import org.embulk.exec.SystemConfigModule;
+import org.embulk.exec.ExecModule;
+import org.embulk.exec.ExtensionServiceLoaderModule;
+import org.embulk.plugin.BuiltinPluginSourceModule;
+import org.embulk.jruby.JRubyScriptingModule;
+import org.embulk.spi.BufferAllocator;
+import org.embulk.spi.Exec;
+import org.embulk.spi.ExecAction;
+import org.embulk.spi.ExecSession;
+
+// TODO This class should be merged into EmbulkTestRuntime class. Because EmbulkTestRuntime doesn't have module overriding feature.
+public class MapReduceExecutorTestRuntime
+        extends GuiceBinder
+{
+    private static ConfigSource getSystemConfig()
+    {
+        // TODO set some default values
+        return new DataSourceImpl(null);
+    }
+
+    public static class TestRuntimeModule
+            implements Module
+    {
+        @Override
+        public void configure(Binder binder)
+        {
+            ConfigSource systemConfig = getSystemConfig();
+
+            new SystemConfigModule(systemConfig).configure(binder);
+            new ExecModule().configure(binder);
+            new ExtensionServiceLoaderModule(systemConfig).configure(binder);
+            new BuiltinPluginSourceModule().configure(binder);
+            new JRubyScriptingModule(systemConfig).configure(binder);
+            new TestUtilityModule().configure(binder);
+            new TestPluginSourceModule().configure(binder);
+
+        }
+    }
+
+    private ExecSession exec;
+
+    public MapReduceExecutorTestRuntime()
+    {
+        super(Modules.override(new TestRuntimeModule()).with(new Module() {
+            @Override
+            public void configure(Binder binder)
+            {
+                new TestMapReduceExecutor.ExecutorPluginApplyModule().configure(binder);
+                new TestMapReduceExecutor.LoggerOverrideModule().configure(binder);
+            }
+        }));
+        Injector injector = getInjector();
+        ConfigSource execConfig = new DataSourceImpl(injector.getInstance(ModelManager.class));
+        this.exec = ExecSession.builder(injector).fromExecConfig(execConfig).build();
+    }
+
+    public ExecSession getExec()
+    {
+        return exec;
+    }
+
+    public BufferAllocator getBufferAllocator()
+    {
+        return getInstance(BufferAllocator.class);
+    }
+
+    public ModelManager getModelManager()
+    {
+        return getInstance(ModelManager.class);
+    }
+
+    public Random getRandom()
+    {
+        return getInstance(RandomManager.class).getRandom();
+    }
+
+    @Override
+    public Statement apply(Statement base, Description description)
+    {
+        final Statement superStatement = MapReduceExecutorTestRuntime.super.apply(base, description);
+        return new Statement() {
+            public void evaluate() throws Throwable
+            {
+                try {
+                    Exec.doWith(exec, new ExecAction<Void>() {
+                        public Void run()
+                        {
+                            try {
+                                superStatement.evaluate();
+                            } catch (Throwable ex) {
+                                throw new RuntimeExecutionException(ex);
+                            }
+                            return null;
+                        }
+                    });
+                } catch (RuntimeException ex) {
+                    throw ex.getCause();
+                } finally {
+                    exec.cleanup();
+                }
+            }
+        };
+    }
+
+    private static class RuntimeExecutionException
+            extends RuntimeException
+    {
+        public RuntimeExecutionException(Throwable cause)
+        {
+            super(cause);
+        }
+    }
+}
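
This runtime serves as a JUnit 4 @Rule: apply() returns a Statement that wraps each test body, and the override above additionally runs that body inside Exec.doWith(exec, ...) and calls exec.cleanup() afterwards. A stripped-down sketch of the same wrapping pattern using plain JUnit 4, with the Embulk calls replaced by prints (SessionRule is an illustrative name):

```java
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestRule;
import org.junit.runner.Description;
import org.junit.runners.model.Statement;

public class WrappingRuleSketch
{
    // A minimal TestRule: the wrapped Statement (the @Test body) runs
    // between setup and cleanup, just as MapReduceExecutorTestRuntime
    // runs each test inside an Embulk ExecSession.
    public static class SessionRule implements TestRule
    {
        @Override
        public Statement apply(final Statement base, Description description)
        {
            return new Statement() {
                @Override
                public void evaluate() throws Throwable
                {
                    System.out.println("open session"); // e.g. Exec.doWith(exec, ...)
                    try {
                        base.evaluate(); // the actual test body
                    } finally {
                        System.out.println("cleanup session"); // e.g. exec.cleanup()
                    }
                }
            };
        }
    }

    @Rule
    public SessionRule session = new SessionRule();

    @Test
    public void runsInsideSession()
    {
        System.out.println("test body");
    }
}
```
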
data/src/test/java/org/embulk/executor/mapreduce/TestAttemptState.java ADDED
@@ -0,0 +1,58 @@
+package org.embulk.executor.mapreduce;
+
+import com.google.common.base.Optional;
+
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.junit.Rule;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestAttemptState
+{
+    @Rule
+    public MapReduceExecutorTestRuntime runtime = new MapReduceExecutorTestRuntime();
+
+    @Test
+    public void readAndWrite()
+            throws IOException {
+        TaskAttemptID attemptId = TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
+        int inputTaskIndex = 1;
+        int outputTaskIndex = 2;
+        Exception ex = new Exception();
+
+        AttemptState attemptState = new AttemptState(attemptId, Optional.of(inputTaskIndex), Optional.of(outputTaskIndex));
+        attemptState.setException(ex);
+        attemptState.setInputTaskReport(runtime.getExec().newTaskReport());
+        attemptState.setOutputTaskReport(runtime.getExec().newTaskReport());
+
+        try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+            attemptState.writeTo(out, runtime.getModelManager());
+
+            try (ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray())) {
+                assertAttemptStateEquals(attemptState, AttemptState.readFrom(in, runtime.getModelManager()));
+            }
+        }
+    }
+
+    private static void assertAttemptStateEquals(AttemptState s1, AttemptState s2)
+    {
+        assertEquals(s1.getAttemptId(), s2.getAttemptId());
+        assertEquals(s1.getInputTaskIndex(), s2.getInputTaskIndex());
+        assertEquals(s1.getOutputTaskIndex(), s2.getOutputTaskIndex());
+        assertEquals(s1.getException(), s2.getException());
+        assertEquals(s1.getInputTaskReport(), s2.getInputTaskReport());
+        assertEquals(s1.getOutputTaskReport(), s2.getOutputTaskReport());
+    }
+
+    @Test
+    public void throwEOFIfInvalidJsonString()
+            throws IOException {
+        String json = "{\"key\":\"va";
+        // TODO
+    }
+}
data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputFormat.java ADDED
@@ -0,0 +1,54 @@
+package org.embulk.executor.mapreduce;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestEmbulkInputFormat
+{
+    @Rule
+    public MapReduceExecutorTestRuntime runtime = new MapReduceExecutorTestRuntime();
+
+    private Configuration conf;
+    private EmbulkInputFormat format;
+
+    @Before
+    public void createResources()
+    {
+        conf = new Configuration();
+        format = new EmbulkInputFormat();
+    }
+
+    @Test
+    public void getSplits()
+            throws Exception
+    {
+        checkNumOfSplits(0);
+
+        for (int i = 0; i < 10; i++) {
+
+            int split = runtime.getRandom().nextInt(10000);
+            checkNumOfSplits(split);
+        }
+    }
+
+    private void checkNumOfSplits(int split)
+            throws Exception
+    {
+        conf.set("embulk.mapreduce.taskCount", Integer.toString(split));
+        JobContext jobContext = newJobContext(conf);
+        assertEquals(split, format.getSplits(jobContext).size());
+    }
+
+    private JobContext newJobContext(Configuration conf)
+    {
+        JobID jobID = new JobID("test", runtime.getRandom().nextInt());
+        return new JobContextImpl(conf, jobID);
+    }
+}
data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputSplit.java ADDED
@@ -0,0 +1,46 @@
+package org.embulk.executor.mapreduce;
+
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+public class TestEmbulkInputSplit
+{
+    @Test
+    public void readAndWrite()
+            throws IOException
+    {
+        readAndWrite(new EmbulkInputSplit());
+        readAndWrite(new EmbulkInputSplit(new int[] {0, 1, 2, 3}));
+    }
+
+    private void readAndWrite(EmbulkInputSplit is) throws IOException
+    {
+        try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+            try (DataOutputStream dout = new DataOutputStream(out)) {
+                is.write(dout);
+                dout.flush();
+
+                try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()))) {
+                    EmbulkInputSplit newIs = new EmbulkInputSplit();
+                    newIs.readFields(in);
+                    assertEmbulkInputSplitEquals(is, newIs);
+                }
+            }
+        }
+    }
+
+    private static void assertEmbulkInputSplitEquals(EmbulkInputSplit is1, EmbulkInputSplit is2)
+    {
+        assertArrayEquals(is1.getTaskIndexes(), is2.getTaskIndexes());
+        assertEquals(is1.getLength(), is2.getLength());
+        assertArrayEquals(is1.getLocations(), is2.getLocations());
+    }
+}