embulk-executor-mapreduce 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. checksums.yaml +4 -4
  2. data/classpath/{embulk-executor-mapreduce-0.2.2.jar → embulk-executor-mapreduce-0.2.3.jar} +0 -0
  3. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +48 -24
  4. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +11 -6
  5. data/src/test/java/org/embulk/executor/mapreduce/MapReduceExecutorTestRuntime.java +130 -0
  6. data/src/test/java/org/embulk/executor/mapreduce/TestAttemptState.java +58 -0
  7. data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputFormat.java +54 -0
  8. data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputSplit.java +46 -0
  9. data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkRecordReader.java +25 -0
  10. data/src/test/java/org/embulk/executor/mapreduce/TestMapReduceExecutor.java +251 -0
  11. data/src/test/java/org/embulk/executor/mapreduce/TestPageBufferWritable.java +84 -0
  12. data/src/test/java/org/embulk/executor/mapreduce/TestTimestampPartitioning.java +222 -0
  13. data/src/test/resources/config/core-site.xml +8 -0
  14. data/src/test/resources/config/embulk_mapred_config.yml +38 -0
  15. data/src/test/resources/config/embulk_mapred_invalid_config_files_config.yml +38 -0
  16. data/src/test/resources/config/embulk_mapred_invalid_libjars_config.yml +40 -0
  17. data/src/test/resources/config/embulk_mapred_invalid_partitioning_config.yml +40 -0
  18. data/src/test/resources/config/embulk_mapred_invalid_reducers_config.yml +44 -0
  19. data/src/test/resources/config/embulk_mapred_partitioning_config.yml +43 -0
  20. data/src/test/resources/config/embulk_mapred_stop_on_invalid_record_config.yml +39 -0
  21. data/src/test/resources/config/hdfs-site.xml +18 -0
  22. data/src/test/resources/config/mapred-site.xml +8 -0
  23. data/src/test/resources/fixtures/csv/sample1.csv +3 -0
  24. data/src/test/resources/fixtures/csv/sample2.csv +4 -0
  25. data/src/test/resources/fixtures/invalid_csv/sample1.csv +4 -0
  26. data/src/test/resources/fixtures/invalid_csv/sample2.csv +3 -0
  27. metadata +25 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d683908af5b1f5e06ea035f9ecd4e7a89dbd4eab
-  data.tar.gz: f8a5a7db822f6241a40211dd1782e4fda83df073
+  metadata.gz: 3c12d59b65314dd94cf12e29b7caf8e83fd947bd
+  data.tar.gz: 478ea3a4e77c7a9e5395a4c4942ac9697772a10d
 SHA512:
-  metadata.gz: 48b955f8b5a70864dd745347079cc83545fdb6953091812917beeb28e6aa9f66726c68e8b85b0aa3c8da40f419fc0b1efa78aab90bd899e42bb4c1a0c14edaae
-  data.tar.gz: 99199b98f4f56e2d5d74c7f12be41b19bd31aa3f496144a940b16c6c53abf3980c181ae186f40ba3bf307106abb53dc00683c54157ea000652d24d66cff12cd5
+  metadata.gz: e8c955d1e7a4e0b318bc21a36c51410045b4b4c9e62b954e8848f66693d26fe580a79afb8c8e1c1910f9ecd1731f78772b6b9af7f64e817b9666776da26b126c
+  data.tar.gz: 14fb4a34dbaf1b59b8d37ceac1356d4ab66014a559943f16da087d9f17800e46c80e9b98ae1583be5568ee8909034ba708bfa4ca3dab27ddde90b5c8e2ab8f59
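These digests can be verified locally before trusting a downloaded artifact. A minimal JDK-only sketch (the file path is an assumption; point it at wherever you saved the gem's data.tar.gz):

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.security.MessageDigest;

    public class ChecksumCheck
    {
        public static void main(String[] args) throws Exception
        {
            // Hypothetical path to the data.tar.gz extracted from the 0.2.3 gem.
            byte[] data = Files.readAllBytes(Paths.get("data.tar.gz"));
            MessageDigest sha512 = MessageDigest.getInstance("SHA-512");
            StringBuilder hex = new StringBuilder();
            for (byte b : sha512.digest(data)) {
                hex.append(String.format("%02x", b));
            }
            // For 0.2.3 this should print the digest beginning 14fb4a34dbaf...
            System.out.println(hex);
        }
    }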
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java CHANGED
@@ -1,5 +1,6 @@
 package org.embulk.executor.mapreduce;
 
+import java.security.PrivilegedAction;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -16,6 +17,8 @@ import java.net.URISyntaxException;
 import java.net.URL;
 import java.net.URLClassLoader;
 import java.net.MalformedURLException;
+
+import org.apache.hadoop.security.UserGroupInformation;
 import org.slf4j.Logger;
 import org.joda.time.format.DateTimeFormat;
 import com.google.inject.Inject;
@@ -183,12 +186,12 @@ public class MapReduceExecutor
         }
     }
 
-    void run(MapReduceExecutorTask task,
-            int mapTaskCount, int reduceTaskCount, ProcessState state)
+    void run(final MapReduceExecutorTask task,
+            int mapTaskCount, final int reduceTaskCount, final ProcessState state)
     {
-        ModelManager modelManager = task.getModelManager();
+        final ModelManager modelManager = task.getModelManager();
 
-        Configuration conf = new Configuration();
+        final Configuration conf = new Configuration();
         // don't call conf.setQuietMode(false). Configuration has invalid resource names by default
         for (String path : task.getConfigFiles()) {
             File file = new File(path);
@@ -204,31 +207,56 @@ public class MapReduceExecutor
         }
 
         String uniqueTransactionName = getTransactionUniqueName(Exec.session());
-        Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);
-
-        Job job;
-        try {
-            job = Job.getInstance(conf);
-        } catch (IOException e) {
-            throw Throwables.propagate(e);
-        }
-        job.setJobName(task.getJobName());
+        final Path stateDir = new Path(new Path(task.getStatePath()), uniqueTransactionName);
 
         // create a dedicated classloader for this yarn application.
         // allow task.getConfig to overwrite this parameter
-        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
-        job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes
+        conf.set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, "true"); // mapreduce.job.classloader
+        conf.set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, "java.,org.apache.hadoop."); // mapreduce.job.classloader.system.classes
 
         // extra config
         for (Map.Entry<String, String> pair : task.getConfig().entrySet()) {
-            job.getConfiguration().set(pair.getKey(), pair.getValue());
+            conf.set(pair.getKey(), pair.getValue());
         }
 
         // framework config
-        EmbulkMapReduce.setSystemConfig(job.getConfiguration(), modelManager, systemConfig);
-        EmbulkMapReduce.setExecutorTask(job.getConfiguration(), modelManager, task);
-        EmbulkMapReduce.setMapTaskCount(job.getConfiguration(), mapTaskCount); // used by EmbulkInputFormat
-        EmbulkMapReduce.setStateDirectoryPath(job.getConfiguration(), stateDir);
+        EmbulkMapReduce.setSystemConfig(conf, modelManager, systemConfig);
+        EmbulkMapReduce.setExecutorTask(conf, modelManager, task);
+        EmbulkMapReduce.setMapTaskCount(conf, mapTaskCount); // used by EmbulkInputFormat
+        EmbulkMapReduce.setStateDirectoryPath(conf, stateDir);
+
+        // jar files
+        List<Path> jars = collectJars(task.getLibjars(), task.getExcludeJars());
+        conf.set("tmpjars", StringUtils.join(",", jars));
+
+        String remoteUser = conf.get(MRJobConfig.USER_NAME); // mapreduce.job.user.name
+        if (remoteUser != null) {
+            UserGroupInformation.createRemoteUser(remoteUser).doAs(
+                new PrivilegedAction<Void>()
+                {
+                    @Override
+                    public Void run()
+                    {
+                        runJob(task, modelManager, reduceTaskCount, state, stateDir, conf);
+                        return null;
+                    }
+                }
+            );
+        } else {
+            runJob(task, modelManager, reduceTaskCount, state, stateDir, conf);
+        }
+    }
+
+    void runJob(MapReduceExecutorTask task, ModelManager modelManager,
+            int reduceTaskCount, ProcessState state, Path stateDir, Configuration conf)
+    {
+        Job job;
+        try {
+            job = Job.getInstance(conf);
+        } catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+        job.setJobName(task.getJobName());
 
         // archive plugins (also create state dir)
         PluginArchive archive = new PluginArchive.Builder()
@@ -240,10 +268,6 @@ public class MapReduceExecutor
             throw new RuntimeException(ex);
         }
 
-        // jar files
-        List<Path> jars = collectJars(task.getLibjars(), task.getExcludeJars());
-        job.getConfiguration().set("tmpjars", StringUtils.join(",", jars));
-
         job.setInputFormatClass(EmbulkInputFormat.class);
 
         if (reduceTaskCount > 0) {
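The substantive change in this file is twofold: all job settings are now written to a single Configuration built before the Job object, and submission moves into a separate runJob() method so it can be wrapped in a Hadoop remote user whenever mapreduce.job.user.name is set. A self-contained sketch of that impersonation pattern (the user name "etl-batch" and the submit() helper are hypothetical stand-ins for the plugin's runJob()):

    import java.security.PrivilegedAction;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.MRJobConfig;
    import org.apache.hadoop.security.UserGroupInformation;

    public class RemoteUserExample
    {
        public static void main(String[] args)
        {
            final Configuration conf = new Configuration();
            conf.set(MRJobConfig.USER_NAME, "etl-batch"); // hypothetical user

            String remoteUser = conf.get(MRJobConfig.USER_NAME); // mapreduce.job.user.name
            if (remoteUser != null) {
                // createRemoteUser builds a UGI without credentials; doAs makes it
                // the current user for Hadoop calls made inside the action.
                UserGroupInformation.createRemoteUser(remoteUser).doAs(
                    new PrivilegedAction<Void>()
                    {
                        @Override
                        public Void run()
                        {
                            submit(conf);
                            return null;
                        }
                    }
                );
            } else {
                submit(conf); // fall back to the process owner
            }
        }

        private static void submit(Configuration conf)
        {
            // stand-in for MapReduceExecutor.runJob(): build and submit the Job here
            System.out.println("submitting as " + conf.get(MRJobConfig.USER_NAME));
        }
    }

Note that createRemoteUser carries no credentials, so this pattern suits simple-auth clusters; a Kerberos-secured cluster would need createProxyUser plus real login credentials.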
data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java CHANGED
@@ -1,5 +1,6 @@
 package org.embulk.executor.mapreduce;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.joda.time.DateTimeZone;
 import com.google.common.base.Optional;
 import org.embulk.config.Config;
@@ -40,7 +41,8 @@ public class TimestampPartitioning
         public void setTargetColumn(Column column);
     }
 
-    private static enum Unit
+    @VisibleForTesting
+    static enum Unit
     {
         HOUR(60*60),
         DAY(24*60*60);
@@ -70,12 +72,13 @@ public class TimestampPartitioning
             //case "year": return YEAR;
             default:
                 throw new ConfigException(
-                        String.format("Unknown unit '%s'. Supported units are hour and day"));
+                        String.format("Unknown unit '%s'. Supported units are hour and day", s));
             }
         }
     }
 
-    private static enum UnixTimestampUnit
+    @VisibleForTesting
+    static enum UnixTimestampUnit
     {
         SEC(1),
         MILLI(1000),
@@ -103,7 +106,7 @@ public class TimestampPartitioning
             case "nano": return NANO;
             default:
                 throw new ConfigException(
-                        String.format("Unknown unix_timestamp_unit '%s'. Supported units are sec, milli, micro, and nano"));
+                        String.format("Unknown unix_timestamp_unit '%s'. Supported units are sec, milli, micro, and nano", s));
             }
         }
     }
@@ -253,7 +256,8 @@ public class TimestampPartitioning
         }
     }
 
-    private static class TimestampPartitioner
+    @VisibleForTesting
+    static class TimestampPartitioner
             extends AbstractTimestampPartitioner
     {
         public TimestampPartitioner(Column column, Unit unit)
@@ -269,7 +273,8 @@ public class TimestampPartitioning
         }
     }
 
-    private static class LongUnixTimestampPartitioner
+    @VisibleForTesting
+    static class LongUnixTimestampPartitioner
            extends AbstractTimestampPartitioner
    {
        private final UnixTimestampUnit unixTimestampUnit;
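Both String.format fixes above close the same latent bug: 0.2.2 passed a pattern containing %s with no matching argument, so the error path itself threw java.util.MissingFormatArgumentException instead of raising the intended ConfigException message. A plain-JDK demonstration of the failure mode:

    public class FormatBugDemo
    {
        public static void main(String[] args)
        {
            // Fixed form from 0.2.3: the offending unit string fills the %s.
            System.out.println(String.format(
                    "Unknown unit '%s'. Supported units are hour and day", "week"));

            try {
                // Buggy form from 0.2.2: one %s specifier, zero arguments.
                String.format("Unknown unit '%s'. Supported units are hour and day");
            } catch (java.util.MissingFormatArgumentException e) {
                System.out.println("caught: " + e); // thrown before ConfigException could be built
            }
        }
    }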
data/src/test/java/org/embulk/executor/mapreduce/MapReduceExecutorTestRuntime.java ADDED
@@ -0,0 +1,130 @@
+package org.embulk.executor.mapreduce;
+
+import java.util.Random;
+
+import com.google.inject.util.Modules;
+import org.embulk.GuiceBinder;
+import org.embulk.RandomManager;
+import org.embulk.TestPluginSourceModule;
+import org.embulk.TestUtilityModule;
+import org.junit.runner.Description;
+import org.junit.runners.model.Statement;
+import com.google.inject.Injector;
+import com.google.inject.Binder;
+import com.google.inject.Module;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.DataSourceImpl;
+import org.embulk.config.ModelManager;
+import org.embulk.exec.SystemConfigModule;
+import org.embulk.exec.ExecModule;
+import org.embulk.exec.ExtensionServiceLoaderModule;
+import org.embulk.plugin.BuiltinPluginSourceModule;
+import org.embulk.jruby.JRubyScriptingModule;
+import org.embulk.spi.BufferAllocator;
+import org.embulk.spi.Exec;
+import org.embulk.spi.ExecAction;
+import org.embulk.spi.ExecSession;
+
+// TODO Merge this class into EmbulkTestRuntime; it exists separately only because EmbulkTestRuntime doesn't have a module-overriding feature.
+public class MapReduceExecutorTestRuntime
+        extends GuiceBinder
+{
+    private static ConfigSource getSystemConfig()
+    {
+        // TODO set some default values
+        return new DataSourceImpl(null);
+    }
+
+    public static class TestRuntimeModule
+            implements Module
+    {
+        @Override
+        public void configure(Binder binder)
+        {
+            ConfigSource systemConfig = getSystemConfig();
+
+            new SystemConfigModule(systemConfig).configure(binder);
+            new ExecModule().configure(binder);
+            new ExtensionServiceLoaderModule(systemConfig).configure(binder);
+            new BuiltinPluginSourceModule().configure(binder);
+            new JRubyScriptingModule(systemConfig).configure(binder);
+            new TestUtilityModule().configure(binder);
+            new TestPluginSourceModule().configure(binder);
+
+        }
+    }
+
+    private ExecSession exec;
+
+    public MapReduceExecutorTestRuntime()
+    {
+        super(Modules.override(new TestRuntimeModule()).with(new Module() {
+            @Override
+            public void configure(Binder binder)
+            {
+                new TestMapReduceExecutor.ExecutorPluginApplyModule().configure(binder);
+                new TestMapReduceExecutor.LoggerOverrideModule().configure(binder);
+            }
+        }));
+        Injector injector = getInjector();
+        ConfigSource execConfig = new DataSourceImpl(injector.getInstance(ModelManager.class));
+        this.exec = ExecSession.builder(injector).fromExecConfig(execConfig).build();
+    }
+
+    public ExecSession getExec()
+    {
+        return exec;
+    }
+
+    public BufferAllocator getBufferAllocator()
+    {
+        return getInstance(BufferAllocator.class);
+    }
+
+    public ModelManager getModelManager()
+    {
+        return getInstance(ModelManager.class);
+    }
+
+    public Random getRandom()
+    {
+        return getInstance(RandomManager.class).getRandom();
+    }
+
+    @Override
+    public Statement apply(Statement base, Description description)
+    {
+        final Statement superStatement = MapReduceExecutorTestRuntime.super.apply(base, description);
+        return new Statement() {
+            public void evaluate() throws Throwable
+            {
+                try {
+                    Exec.doWith(exec, new ExecAction<Void>() {
+                        public Void run()
+                        {
+                            try {
+                                superStatement.evaluate();
+                            } catch (Throwable ex) {
+                                throw new RuntimeExecutionException(ex);
+                            }
+                            return null;
+                        }
+                    });
+                } catch (RuntimeException ex) {
+                    throw ex.getCause();
+                } finally {
+                    exec.cleanup();
+                }
+            }
+        };
+    }
+
+    private static class RuntimeExecutionException
+            extends RuntimeException
+    {
+        public RuntimeExecutionException(Throwable cause)
+        {
+            super(cause);
+        }
+    }
+}
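The constructor above relies on Guice's Modules.override: bindings from TestRuntimeModule are installed first, and any binding that the second module also defines wins, which is what lets the tests swap in ExecutorPluginApplyModule and LoggerOverrideModule without duplicate-binding errors. A minimal sketch of the mechanism, using a hypothetical String binding rather than Embulk's modules:

    import com.google.inject.AbstractModule;
    import com.google.inject.Guice;
    import com.google.inject.Injector;
    import com.google.inject.util.Modules;

    public class OverrideDemo
    {
        static class ProductionModule extends AbstractModule
        {
            @Override
            protected void configure()
            {
                bind(String.class).toInstance("production binding");
            }
        }

        static class TestOverrides extends AbstractModule
        {
            @Override
            protected void configure()
            {
                bind(String.class).toInstance("test binding");
            }
        }

        public static void main(String[] args)
        {
            // Installing both modules directly would fail with a duplicate-binding
            // error; override() resolves the conflict in favor of the second module.
            Injector injector = Guice.createInjector(
                    Modules.override(new ProductionModule()).with(new TestOverrides()));
            System.out.println(injector.getInstance(String.class)); // "test binding"
        }
    }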
data/src/test/java/org/embulk/executor/mapreduce/TestAttemptState.java ADDED
@@ -0,0 +1,58 @@
+package org.embulk.executor.mapreduce;
+
+import com.google.common.base.Optional;
+
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.junit.Rule;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestAttemptState
+{
+    @Rule
+    public MapReduceExecutorTestRuntime runtime = new MapReduceExecutorTestRuntime();
+
+    @Test
+    public void readAndWrite()
+            throws IOException {
+        TaskAttemptID attemptId = TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
+        int inputTaskIndex = 1;
+        int outputTaskIndex = 2;
+        Exception ex = new Exception();
+
+        AttemptState attemptState = new AttemptState(attemptId, Optional.of(inputTaskIndex), Optional.of(outputTaskIndex));
+        attemptState.setException(ex);
+        attemptState.setInputTaskReport(runtime.getExec().newTaskReport());
+        attemptState.setOutputTaskReport(runtime.getExec().newTaskReport());
+
+        try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+            attemptState.writeTo(out, runtime.getModelManager());
+
+            try (ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray())) {
+                assertAttemptStateEquals(attemptState, AttemptState.readFrom(in, runtime.getModelManager()));
+            }
+        }
+    }
+
+    private static void assertAttemptStateEquals(AttemptState s1, AttemptState s2)
+    {
+        assertEquals(s1.getAttemptId(), s2.getAttemptId());
+        assertEquals(s1.getInputTaskIndex(), s2.getInputTaskIndex());
+        assertEquals(s1.getOutputTaskIndex(), s2.getOutputTaskIndex());
+        assertEquals(s1.getException(), s2.getException());
+        assertEquals(s1.getInputTaskReport(), s2.getInputTaskReport());
+        assertEquals(s1.getOutputTaskReport(), s2.getOutputTaskReport());
+    }
+
+    @Test
+    public void throwEOFIfInvalidJsonString()
+            throws IOException {
+        String json = "{\"key\":\"va";
+        // TODO
+    }
+}
data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputFormat.java ADDED
@@ -0,0 +1,54 @@
+package org.embulk.executor.mapreduce;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.task.JobContextImpl;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestEmbulkInputFormat
+{
+    @Rule
+    public MapReduceExecutorTestRuntime runtime = new MapReduceExecutorTestRuntime();
+
+    private Configuration conf;
+    private EmbulkInputFormat format;
+
+    @Before
+    public void createResources()
+    {
+        conf = new Configuration();
+        format = new EmbulkInputFormat();
+    }
+
+    @Test
+    public void getSplits()
+            throws Exception
+    {
+        checkNumOfSplits(0);
+
+        for (int i = 0; i < 10; i++) {
+
+            int split = runtime.getRandom().nextInt(10000);
+            checkNumOfSplits(split);
+        }
+    }
+
+    private void checkNumOfSplits(int split)
+            throws Exception
+    {
+        conf.set("embulk.mapreduce.taskCount", Integer.toString(split));
+        JobContext jobContext = newJobContext(conf);
+        assertEquals(split, format.getSplits(jobContext).size());
+    }
+
+    private JobContext newJobContext(Configuration conf)
+    {
+        JobID jobID = new JobID("test", runtime.getRandom().nextInt());
+        return new JobContextImpl(conf, jobID);
+    }
+}
data/src/test/java/org/embulk/executor/mapreduce/TestEmbulkInputSplit.java ADDED
@@ -0,0 +1,46 @@
+package org.embulk.executor.mapreduce;
+
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+public class TestEmbulkInputSplit
+{
+    @Test
+    public void readAndWrite()
+            throws IOException
+    {
+        readAndWrite(new EmbulkInputSplit());
+        readAndWrite(new EmbulkInputSplit(new int[] {0, 1, 2, 3}));
+    }
+
+    private void readAndWrite(EmbulkInputSplit is) throws IOException
+    {
+        try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+            try (DataOutputStream dout = new DataOutputStream(out)) {
+                is.write(dout);
+                dout.flush();
+
+                try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()))) {
+                    EmbulkInputSplit newIs = new EmbulkInputSplit();
+                    newIs.readFields(in);
+                    assertEmbulkInputSplitEquals(is, newIs);
+                }
+            }
+        }
+    }
+
+    private static void assertEmbulkInputSplitEquals(EmbulkInputSplit is1, EmbulkInputSplit is2)
+    {
+        assertArrayEquals(is1.getTaskIndexes(), is2.getTaskIndexes());
+        assertEquals(is1.getLength(), is2.getLength());
+        assertArrayEquals(is1.getLocations(), is2.getLocations());
+    }
+}
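TestEmbulkInputSplit exercises the standard Hadoop Writable round-trip: serialize with write(DataOutput), read back into a fresh instance with readFields(DataInput), then compare field by field. The same pattern generalizes to any Writable; a sketch of a reusable helper, assuming the target class has a public no-arg constructor (as EmbulkInputSplit does):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.io.Writable;

    public class WritableRoundTrip
    {
        // Serialize a Writable and deserialize it into a new instance of the same class.
        public static <T extends Writable> T roundTrip(T original)
                throws IOException
        {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            try (DataOutputStream dout = new DataOutputStream(out)) {
                original.write(dout);
            }
            try {
                @SuppressWarnings("unchecked")
                T copy = (T) original.getClass().newInstance();
                try (DataInputStream din = new DataInputStream(
                        new ByteArrayInputStream(out.toByteArray()))) {
                    copy.readFields(din);
                }
                return copy;
            } catch (InstantiationException | IllegalAccessException e) {
                throw new IOException("Writable needs a public no-arg constructor", e);
            }
        }
    }

With such a helper, each test body reduces to assertEmbulkInputSplitEquals(is, WritableRoundTrip.roundTrip(is)).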