embulk-executor-mapreduce 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 995cd8754f48d4d4e0d222bbad7d39dfdad6648f
4
- data.tar.gz: 901b6a9298e76ed45ab7840ca4e4135bb68a9d8d
3
+ metadata.gz: 5c531e6955469a01f0e2ed716a65fdf228ae95ba
4
+ data.tar.gz: d8724a7abcaedd7549a397d2b14df6edb2832d52
5
5
  SHA512:
6
- metadata.gz: 02cbb013f7b83f51c787e649d1efd0e69b726af56636b2487b5d1b277bb285521db44a5b4b1ee45c59d7e5c0a71515bc9b590cdb5a3f90f5fd871581e4d23286
7
- data.tar.gz: f66984fd6801ee826b6c4a46db1c326aa699584497224daed575e11c6f83348f5cbf2ee469c26a856bf4d9479d57e609613abdc7fec69f3f2ccbde8ac612340c
6
+ metadata.gz: 9983809cd453596cf3fc683f75a22b43dbcca0abf202c7ad7bdc9cbb640c673e95be706f6d49ad04bb3881f8668bc9b8e3946115d54e2a14e90dfb163fbd9b0f
7
+ data.tar.gz: 7b272911c78f9bac87fa867f0e3978a96f31c8fb348b150410a6991e1f5f425782450c9422ca4022e2cb15abc61037caaf080fb196f48bd0a441e3e81cc31b15
@@ -16,7 +16,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore;
16
16
  import com.fasterxml.jackson.annotation.JsonValue;
17
17
  import org.apache.hadoop.mapreduce.TaskAttemptID;
18
18
  import org.embulk.config.ModelManager;
19
- import org.embulk.config.CommitReport;
19
+ import org.embulk.config.TaskReport;
20
20
 
21
21
  public class AttemptState
22
22
  {
@@ -24,8 +24,8 @@ public class AttemptState
24
24
  private final Optional<Integer> inputTaskIndex;
25
25
  private final Optional<Integer> outputTaskIndex;
26
26
  private Optional<String> exception;
27
- private Optional<CommitReport> inputCommitReport;
28
- private Optional<CommitReport> outputCommitReport;
27
+ private Optional<TaskReport> inputTaskReport;
28
+ private Optional<TaskReport> outputTaskReport;
29
29
 
30
30
  public AttemptState(TaskAttemptID attemptId, Optional<Integer> inputTaskIndex, Optional<Integer> outputTaskIndex)
31
31
  {
@@ -40,12 +40,12 @@ public class AttemptState
40
40
  @JsonProperty("inputTaskIndex") Optional<Integer> inputTaskIndex,
41
41
  @JsonProperty("outputTaskIndex") Optional<Integer> outputTaskIndex,
42
42
  @JsonProperty("exception") Optional<String> exception,
43
- @JsonProperty("inputCommitReport") Optional<CommitReport> inputCommitReport,
44
- @JsonProperty("outputCommitReport") Optional<CommitReport> outputCommitReport)
43
+ @JsonProperty("inputTaskReport") Optional<TaskReport> inputTaskReport,
44
+ @JsonProperty("outputTaskReport") Optional<TaskReport> outputTaskReport)
45
45
  {
46
46
  this(TaskAttemptID.forName(attemptId),
47
47
  inputTaskIndex, outputTaskIndex, exception,
48
- inputCommitReport, outputCommitReport);
48
+ inputTaskReport, outputTaskReport);
49
49
  }
50
50
 
51
51
  public AttemptState(
@@ -53,15 +53,15 @@ public class AttemptState
53
53
  Optional<Integer> inputTaskIndex,
54
54
  Optional<Integer> outputTaskIndex,
55
55
  Optional<String> exception,
56
- Optional<CommitReport> inputCommitReport,
57
- Optional<CommitReport> outputCommitReport)
56
+ Optional<TaskReport> inputTaskReport,
57
+ Optional<TaskReport> outputTaskReport)
58
58
  {
59
59
  this.attemptId = attemptId;
60
60
  this.inputTaskIndex = inputTaskIndex;
61
61
  this.outputTaskIndex = outputTaskIndex;
62
62
  this.exception = exception;
63
- this.inputCommitReport = inputCommitReport;
64
- this.outputCommitReport = outputCommitReport;
63
+ this.inputTaskReport = inputTaskReport;
64
+ this.outputTaskReport = outputTaskReport;
65
65
  }
66
66
 
67
67
  @JsonIgnore
@@ -112,28 +112,28 @@ public class AttemptState
112
112
  return exception;
113
113
  }
114
114
 
115
- @JsonProperty("inputCommitReport")
116
- public Optional<CommitReport> getInputCommitReport()
115
+ @JsonProperty("inputTaskReport")
116
+ public Optional<TaskReport> getInputTaskReport()
117
117
  {
118
- return inputCommitReport;
118
+ return inputTaskReport;
119
119
  }
120
120
 
121
- @JsonProperty("outputCommitReport")
122
- public Optional<CommitReport> getOutputCommitReport()
121
+ @JsonProperty("outputTaskReport")
122
+ public Optional<TaskReport> getOutputTaskReport()
123
123
  {
124
- return outputCommitReport;
124
+ return outputTaskReport;
125
125
  }
126
126
 
127
127
  @JsonIgnore
128
- public void setInputCommitReport(CommitReport inputCommitReport)
128
+ public void setInputTaskReport(TaskReport inputTaskReport)
129
129
  {
130
- this.inputCommitReport = Optional.of(inputCommitReport);
130
+ this.inputTaskReport = Optional.of(inputTaskReport);
131
131
  }
132
132
 
133
133
  @JsonIgnore
134
- public void setOutputCommitReport(CommitReport outputCommitReport)
134
+ public void setOutputTaskReport(TaskReport outputTaskReport)
135
135
  {
136
- this.outputCommitReport = Optional.of(outputCommitReport);
136
+ this.outputTaskReport = Optional.of(outputTaskReport);
137
137
  }
138
138
 
139
139
  public void writeTo(OutputStream out, ModelManager modelManager) throws IOException
@@ -0,0 +1,13 @@
1
+ package org.embulk.executor.mapreduce;
2
+
3
+ import org.embulk.config.ConfigSource;
4
+ import org.embulk.EmbulkEmbed;
5
+
6
+ public class DefaultEmbulkFactory
7
+ {
8
+ public EmbulkEmbed.Bootstrap bootstrap(ConfigSource systemConfig)
9
+ {
10
+ return new EmbulkEmbed.Bootstrap()
11
+ .setSystemConfig(systemConfig);
12
+ }
13
+ }
@@ -2,6 +2,7 @@ package org.embulk.executor.mapreduce;
2
2
 
3
3
  import java.util.List;
4
4
  import java.util.ArrayList;
5
+ import java.util.Map;
5
6
  import java.util.concurrent.Callable;
6
7
  import java.util.concurrent.ExecutionException;
7
8
  import java.io.File;
@@ -10,6 +11,7 @@ import java.io.ByteArrayInputStream;
10
11
  import java.io.IOException;
11
12
  import java.io.EOFException;
12
13
  import java.io.InterruptedIOException;
14
+ import java.lang.reflect.Method;
13
15
  import java.lang.reflect.InvocationTargetException;
14
16
  import com.google.inject.Injector;
15
17
  import com.google.common.base.Optional;
@@ -35,7 +37,8 @@ import org.apache.hadoop.mapreduce.MRConfig;
35
37
  import org.embulk.config.ModelManager;
36
38
  import org.embulk.config.ConfigSource;
37
39
  import org.embulk.config.ConfigLoader;
38
- import org.embulk.config.CommitReport;
40
+ import org.embulk.config.DataSourceImpl;
41
+ import org.embulk.config.TaskReport;
39
42
  import org.embulk.spi.BufferAllocator;
40
43
  import org.embulk.spi.Exec;
41
44
  import org.embulk.spi.ExecAction;
@@ -44,7 +47,7 @@ import org.embulk.spi.ProcessTask;
44
47
  import org.embulk.spi.util.Executors;
45
48
  import org.embulk.spi.util.RetryExecutor.Retryable;
46
49
  import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
47
- import org.embulk.EmbulkService;
50
+ import org.embulk.EmbulkEmbed;
48
51
  import org.slf4j.Logger;
49
52
 
50
53
  import static java.nio.charset.StandardCharsets.UTF_8;
@@ -52,6 +55,7 @@ import static org.embulk.spi.util.RetryExecutor.retryExecutor;
52
55
 
53
56
  public class EmbulkMapReduce
54
57
  {
58
+ private static final String EMBULK_FACTORY_CLASS = "embulk_factory_class";
55
59
  private static final String SYSTEM_CONFIG_SERVICE_CLASS = "mapreduce_service_class";
56
60
 
57
61
  private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
@@ -111,29 +115,41 @@ public class EmbulkMapReduce
111
115
  config.get(CK_TASK));
112
116
  }
113
117
 
114
- public static Injector newEmbulkInstance(Configuration config)
118
+ public static EmbulkEmbed.Bootstrap newEmbulkBootstrap(Configuration config)
115
119
  {
116
120
  ConfigSource systemConfig = getSystemConfig(config);
117
- String serviceClassName = systemConfig.get(String.class, SYSTEM_CONFIG_SERVICE_CLASS, "org.embulk.EmbulkService");
121
+
122
+ // for warnings of old versions
123
+ if (!systemConfig.get(String.class, SYSTEM_CONFIG_SERVICE_CLASS, "org.embulk.EmbulkService").equals("org.embulk.EmbulkService")) {
124
+ throw new RuntimeException("System config 'mapreduce_service_class' is not supported any more. Please use 'embulk_factory_class' instead");
125
+ }
126
+
127
+ String factoryClassName = systemConfig.get(String.class, EMBULK_FACTORY_CLASS, DefaultEmbulkFactory.class.getName());
118
128
 
119
129
  try {
120
- Object obj;
121
- if (serviceClassName.equals("org.embulk.EmbulkService")) {
122
- obj = new EmbulkService(systemConfig);
123
- } else {
124
- Class<?> serviceClass = Class.forName(serviceClassName);
125
- obj = serviceClass.getConstructor(ConfigSource.class).newInstance(systemConfig);
126
- }
130
+ Class<?> factoryClass = Class.forName(factoryClassName);
131
+ Object factory = factoryClass.newInstance();
127
132
 
128
- if (obj instanceof EmbulkService) {
129
- return ((EmbulkService) obj).getInjector();
130
- } else {
131
- return (Injector) obj.getClass().getMethod("getInjector").invoke(obj);
133
+ Object bootstrap;
134
+ try {
135
+ // factory.bootstrap(ConfigSource masterSystemConfig, ConfigSource executorParams)
136
+ Method method = factoryClass.getMethod("bootstrap", ConfigSource.class, ConfigSource.class);
137
+ Map<String, String> hadoopConfig = config.getValByRegex("");
138
+ ConfigSource executorParams = new DataSourceImpl(new ModelManager(null, new ObjectMapper())).set("hadoopConfig", hadoopConfig).getNested("hadoopConfig"); // TODO add a method to embulk that creates an empty DataSource instance
139
+ bootstrap = method.invoke(factory, systemConfig, executorParams);
132
140
  }
141
+ catch (NoSuchMethodException ex) {
142
+ // factory.bootstrap(ConfigSource masterSystemConfig)
143
+ bootstrap = factoryClass.getMethod("bootstrap", ConfigSource.class).invoke(factory, systemConfig);
144
+ }
145
+
146
+ return (EmbulkEmbed.Bootstrap) bootstrap;
133
147
 
134
- } catch (InvocationTargetException ex) {
148
+ }
149
+ catch (InvocationTargetException ex) {
135
150
  throw Throwables.propagate(ex.getCause());
136
- } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | IllegalAccessException | IllegalArgumentException ex) {
151
+ }
152
+ catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | IllegalAccessException | IllegalArgumentException ex) {
137
153
  throw Throwables.propagate(ex);
138
154
  }
139
155
  }
@@ -377,8 +393,7 @@ public class EmbulkMapReduce
377
393
  public static class SessionRunner
378
394
  {
379
395
  private final Configuration config;
380
- private final Injector injector;
381
- private final ModelManager modelManager;
396
+ private final EmbulkEmbed embed;
382
397
  private final MapReduceExecutorTask task;
383
398
  private final ExecSession session;
384
399
  private final File localGemPath;
@@ -386,10 +401,9 @@ public class EmbulkMapReduce
386
401
  public SessionRunner(TaskAttemptContext context)
387
402
  {
388
403
  this.config = context.getConfiguration();
389
- this.injector = newEmbulkInstance(context.getConfiguration());
390
- this.modelManager = injector.getInstance(ModelManager.class);
391
- this.task = getExecutorTask(injector, context.getConfiguration());
392
- this.session = ExecSession.builder(injector).fromExecConfig(task.getExecConfig()).build();
404
+ this.embed = newEmbulkBootstrap(context.getConfiguration()).initialize(); // TODO use initializeCloseable?
405
+ this.task = getExecutorTask(embed.getInjector(), context.getConfiguration());
406
+ this.session = ExecSession.builder(embed.getInjector()).fromExecConfig(task.getExecConfig()).build();
393
407
 
394
408
  try {
395
409
  LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
@@ -403,7 +417,7 @@ public class EmbulkMapReduce
403
417
  public PluginArchive readPluginArchive() throws IOException
404
418
  {
405
419
  localGemPath.mkdirs();
406
- return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config), modelManager);
420
+ return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config), embed.getModelManager());
407
421
  }
408
422
 
409
423
  public Configuration getConfiguration()
@@ -413,17 +427,17 @@ public class EmbulkMapReduce
413
427
 
414
428
  public ModelManager getModelManager()
415
429
  {
416
- return modelManager;
430
+ return embed.getModelManager();
417
431
  }
418
432
 
419
433
  public BufferAllocator getBufferAllocator()
420
434
  {
421
- return injector.getInstance(BufferAllocator.class);
435
+ return embed.getBufferAllocator();
422
436
  }
423
437
 
424
438
  public ScriptingContainer getScriptingContainer()
425
439
  {
426
- return injector.getInstance(ScriptingContainer.class);
440
+ return embed.getInjector().getInstance(ScriptingContainer.class);
427
441
  }
428
442
 
429
443
  public MapReduceExecutorTask getMapReduceExecutorTask()
@@ -480,9 +494,9 @@ public class EmbulkMapReduce
480
494
  }
481
495
 
482
496
  @Override
483
- public void inputCommitted(CommitReport report)
497
+ public void inputCommitted(TaskReport report)
484
498
  {
485
- state.setInputCommitReport(report);
499
+ state.setInputTaskReport(report);
486
500
  try {
487
501
  writeAttemptStateFile(config, stateDir, state, modelManager);
488
502
  } catch (IOException e) {
@@ -491,9 +505,9 @@ public class EmbulkMapReduce
491
505
  }
492
506
 
493
507
  @Override
494
- public void outputCommitted(CommitReport report)
508
+ public void outputCommitted(TaskReport report)
495
509
  {
496
- state.setOutputCommitReport(report);
510
+ state.setOutputTaskReport(report);
497
511
  try {
498
512
  writeAttemptStateFile(config, stateDir, state, modelManager);
499
513
  } catch (IOException e) {
@@ -12,7 +12,7 @@ import org.apache.hadoop.conf.Configuration;
12
12
  import org.apache.hadoop.mapreduce.Mapper;
13
13
  import org.apache.hadoop.mapreduce.Reducer;
14
14
  import org.embulk.config.ModelManager;
15
- import org.embulk.config.CommitReport;
15
+ import org.embulk.config.TaskReport;
16
16
  import org.embulk.config.ConfigDiff;
17
17
  import org.embulk.config.TaskSource;
18
18
  import org.embulk.config.ConfigSource;
@@ -218,7 +218,7 @@ public class EmbulkPartitioningMapReduce
218
218
  try {
219
219
  if (!failed) {
220
220
  output.finish();
221
- CommitReport report = output.commit();
221
+ TaskReport report = output.commit();
222
222
  handler.outputCommitted(report);
223
223
  }
224
224
  } finally {
@@ -266,7 +266,7 @@ public class EmbulkPartitioningMapReduce
266
266
 
267
267
  public void cleanup(TaskSource taskSource,
268
268
  Schema schema, int taskCount,
269
- List<CommitReport> successCommitReports)
269
+ List<TaskReport> successTaskReports)
270
270
  {
271
271
  // won't be called
272
272
  throw new RuntimeException("");
@@ -301,9 +301,9 @@ public class EmbulkPartitioningMapReduce
301
301
  public void abort()
302
302
  { }
303
303
 
304
- public CommitReport commit()
304
+ public TaskReport commit()
305
305
  {
306
- return Exec.newCommitReport();
306
+ return Exec.newTaskReport();
307
307
  }
308
308
  };
309
309
  }
@@ -40,7 +40,7 @@ import org.apache.hadoop.mapreduce.MRJobConfig;
40
40
  import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
41
41
  import org.embulk.exec.ForSystemConfig;
42
42
  import org.embulk.config.ConfigSource;
43
- import org.embulk.config.CommitReport;
43
+ import org.embulk.config.TaskReport;
44
44
  import org.embulk.config.ConfigException;
45
45
  import org.embulk.config.TaskSource;
46
46
  import org.embulk.config.ModelManager;
@@ -78,7 +78,7 @@ public class MapReduceExecutor
78
78
  final int outputTaskCount;
79
79
  final int reduceTaskCount;
80
80
 
81
- if (task.getPartitioning().isPresent()) {
81
+ if (task.getPartitioning().isPresent() && inputTaskCount > 0) { // here can disable partitioning and force set reduceTaskCount and outputTaskCount to 0 if inputTaskCount is 0
82
82
  reduceTaskCount = task.getReducers().or(inputTaskCount);
83
83
  if (reduceTaskCount <= 0) {
84
84
  throw new ConfigException("Reducers must be larger than 1 if partition: is set");
@@ -381,15 +381,15 @@ public class MapReduceExecutor
381
381
  private static void updateTaskState(TaskState state, AttemptState attempt, boolean isInput)
382
382
  {
383
383
  state.start();
384
- Optional<CommitReport> commitReport = isInput ? attempt.getInputCommitReport() : attempt.getOutputCommitReport();
385
- boolean committed = commitReport.isPresent();
384
+ Optional<TaskReport> taskReport = isInput ? attempt.getInputTaskReport() : attempt.getOutputTaskReport();
385
+ boolean committed = taskReport.isPresent();
386
386
  if (attempt.getException().isPresent()) {
387
387
  if (!state.isCommitted()) {
388
388
  state.setException(new RemoteTaskFailedException(attempt.getException().get()));
389
389
  }
390
390
  }
391
- if (commitReport.isPresent()) {
392
- state.setCommitReport(commitReport.get());
391
+ if (taskReport.isPresent()) {
392
+ state.setTaskReport(taskReport.get());
393
393
  state.finish();
394
394
  }
395
395
  }
@@ -436,12 +436,12 @@ public class MapReduceExecutor
436
436
 
437
437
  public boolean isInputCommitted()
438
438
  {
439
- return attemptState != null && attemptState.getInputCommitReport().isPresent();
439
+ return attemptState != null && attemptState.getInputTaskReport().isPresent();
440
440
  }
441
441
 
442
442
  public boolean isOutputCommitted()
443
443
  {
444
- return attemptState != null && attemptState.getOutputCommitReport().isPresent();
444
+ return attemptState != null && attemptState.getOutputTaskReport().isPresent();
445
445
  }
446
446
 
447
447
  public TaskAttemptID getTaskAttempId()
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-executor-mapreduce
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-28 00:00:00.000000000 Z
11
+ date: 2015-08-19 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Executes tasks on Hadoop.
14
14
  email:
@@ -22,6 +22,7 @@ files:
22
22
  - src/main/java/org/embulk/executor/mapreduce/AttemptState.java
23
23
  - src/main/java/org/embulk/executor/mapreduce/BufferWritable.java
24
24
  - src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
25
+ - src/main/java/org/embulk/executor/mapreduce/DefaultEmbulkFactory.java
25
26
  - src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java
26
27
  - src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java
27
28
  - src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java
@@ -60,7 +61,7 @@ files:
60
61
  - classpath/curator-client-2.6.0.jar
61
62
  - classpath/curator-framework-2.6.0.jar
62
63
  - classpath/curator-recipes-2.6.0.jar
63
- - classpath/embulk-executor-mapreduce-0.1.5.jar
64
+ - classpath/embulk-executor-mapreduce-0.2.0.jar
64
65
  - classpath/gson-2.2.4.jar
65
66
  - classpath/hadoop-annotations-2.6.0.jar
66
67
  - classpath/hadoop-auth-2.6.0.jar