embulk-executor-mapreduce 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/classpath/embulk-executor-mapreduce-0.2.0.jar +0 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +20 -20
- data/src/main/java/org/embulk/executor/mapreduce/DefaultEmbulkFactory.java +13 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +45 -31
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +5 -5
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +8 -8
- metadata +4 -3
- data/classpath/embulk-executor-mapreduce-0.1.5.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5c531e6955469a01f0e2ed716a65fdf228ae95ba
|
4
|
+
data.tar.gz: d8724a7abcaedd7549a397d2b14df6edb2832d52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9983809cd453596cf3fc683f75a22b43dbcca0abf202c7ad7bdc9cbb640c673e95be706f6d49ad04bb3881f8668bc9b8e3946115d54e2a14e90dfb163fbd9b0f
|
7
|
+
data.tar.gz: 7b272911c78f9bac87fa867f0e3978a96f31c8fb348b150410a6991e1f5f425782450c9422ca4022e2cb15abc61037caaf080fb196f48bd0a441e3e81cc31b15
|
data/classpath/embulk-executor-mapreduce-0.2.0.jar
ADDED
Binary file
|
data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java
CHANGED
@@ -16,7 +16,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
16
16
|
import com.fasterxml.jackson.annotation.JsonValue;
|
17
17
|
import org.apache.hadoop.mapreduce.TaskAttemptID;
|
18
18
|
import org.embulk.config.ModelManager;
|
19
|
-
import org.embulk.config.
|
19
|
+
import org.embulk.config.TaskReport;
|
20
20
|
|
21
21
|
public class AttemptState
|
22
22
|
{
|
@@ -24,8 +24,8 @@ public class AttemptState
|
|
24
24
|
private final Optional<Integer> inputTaskIndex;
|
25
25
|
private final Optional<Integer> outputTaskIndex;
|
26
26
|
private Optional<String> exception;
|
27
|
-
private Optional<
|
28
|
-
private Optional<
|
27
|
+
private Optional<TaskReport> inputTaskReport;
|
28
|
+
private Optional<TaskReport> outputTaskReport;
|
29
29
|
|
30
30
|
public AttemptState(TaskAttemptID attemptId, Optional<Integer> inputTaskIndex, Optional<Integer> outputTaskIndex)
|
31
31
|
{
|
@@ -40,12 +40,12 @@ public class AttemptState
|
|
40
40
|
@JsonProperty("inputTaskIndex") Optional<Integer> inputTaskIndex,
|
41
41
|
@JsonProperty("outputTaskIndex") Optional<Integer> outputTaskIndex,
|
42
42
|
@JsonProperty("exception") Optional<String> exception,
|
43
|
-
@JsonProperty("
|
44
|
-
@JsonProperty("
|
43
|
+
@JsonProperty("inputTaskReport") Optional<TaskReport> inputTaskReport,
|
44
|
+
@JsonProperty("outputTaskReport") Optional<TaskReport> outputTaskReport)
|
45
45
|
{
|
46
46
|
this(TaskAttemptID.forName(attemptId),
|
47
47
|
inputTaskIndex, outputTaskIndex, exception,
|
48
|
-
|
48
|
+
inputTaskReport, outputTaskReport);
|
49
49
|
}
|
50
50
|
|
51
51
|
public AttemptState(
|
@@ -53,15 +53,15 @@ public class AttemptState
|
|
53
53
|
Optional<Integer> inputTaskIndex,
|
54
54
|
Optional<Integer> outputTaskIndex,
|
55
55
|
Optional<String> exception,
|
56
|
-
Optional<
|
57
|
-
Optional<
|
56
|
+
Optional<TaskReport> inputTaskReport,
|
57
|
+
Optional<TaskReport> outputTaskReport)
|
58
58
|
{
|
59
59
|
this.attemptId = attemptId;
|
60
60
|
this.inputTaskIndex = inputTaskIndex;
|
61
61
|
this.outputTaskIndex = outputTaskIndex;
|
62
62
|
this.exception = exception;
|
63
|
-
this.
|
64
|
-
this.
|
63
|
+
this.inputTaskReport = inputTaskReport;
|
64
|
+
this.outputTaskReport = outputTaskReport;
|
65
65
|
}
|
66
66
|
|
67
67
|
@JsonIgnore
|
@@ -112,28 +112,28 @@ public class AttemptState
|
|
112
112
|
return exception;
|
113
113
|
}
|
114
114
|
|
115
|
-
@JsonProperty("
|
116
|
-
public Optional<
|
115
|
+
@JsonProperty("inputTaskReport")
|
116
|
+
public Optional<TaskReport> getInputTaskReport()
|
117
117
|
{
|
118
|
-
return
|
118
|
+
return inputTaskReport;
|
119
119
|
}
|
120
120
|
|
121
|
-
@JsonProperty("
|
122
|
-
public Optional<
|
121
|
+
@JsonProperty("outputTaskReport")
|
122
|
+
public Optional<TaskReport> getOutputTaskReport()
|
123
123
|
{
|
124
|
-
return
|
124
|
+
return outputTaskReport;
|
125
125
|
}
|
126
126
|
|
127
127
|
@JsonIgnore
|
128
|
-
public void
|
128
|
+
public void setInputTaskReport(TaskReport inputTaskReport)
|
129
129
|
{
|
130
|
-
this.
|
130
|
+
this.inputTaskReport = Optional.of(inputTaskReport);
|
131
131
|
}
|
132
132
|
|
133
133
|
@JsonIgnore
|
134
|
-
public void
|
134
|
+
public void setOutputTaskReport(TaskReport outputTaskReport)
|
135
135
|
{
|
136
|
-
this.
|
136
|
+
this.outputTaskReport = Optional.of(outputTaskReport);
|
137
137
|
}
|
138
138
|
|
139
139
|
public void writeTo(OutputStream out, ModelManager modelManager) throws IOException
|
data/src/main/java/org/embulk/executor/mapreduce/DefaultEmbulkFactory.java
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
package org.embulk.executor.mapreduce;
|
2
|
+
|
3
|
+
import org.embulk.config.ConfigSource;
|
4
|
+
import org.embulk.EmbulkEmbed;
|
5
|
+
|
6
|
+
public class DefaultEmbulkFactory
|
7
|
+
{
|
8
|
+
public EmbulkEmbed.Bootstrap bootstrap(ConfigSource systemConfig)
|
9
|
+
{
|
10
|
+
return new EmbulkEmbed.Bootstrap()
|
11
|
+
.setSystemConfig(systemConfig);
|
12
|
+
}
|
13
|
+
}
|
data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java
CHANGED
@@ -2,6 +2,7 @@ package org.embulk.executor.mapreduce;
|
|
2
2
|
|
3
3
|
import java.util.List;
|
4
4
|
import java.util.ArrayList;
|
5
|
+
import java.util.Map;
|
5
6
|
import java.util.concurrent.Callable;
|
6
7
|
import java.util.concurrent.ExecutionException;
|
7
8
|
import java.io.File;
|
@@ -10,6 +11,7 @@ import java.io.ByteArrayInputStream;
|
|
10
11
|
import java.io.IOException;
|
11
12
|
import java.io.EOFException;
|
12
13
|
import java.io.InterruptedIOException;
|
14
|
+
import java.lang.reflect.Method;
|
13
15
|
import java.lang.reflect.InvocationTargetException;
|
14
16
|
import com.google.inject.Injector;
|
15
17
|
import com.google.common.base.Optional;
|
@@ -35,7 +37,8 @@ import org.apache.hadoop.mapreduce.MRConfig;
|
|
35
37
|
import org.embulk.config.ModelManager;
|
36
38
|
import org.embulk.config.ConfigSource;
|
37
39
|
import org.embulk.config.ConfigLoader;
|
38
|
-
import org.embulk.config.
|
40
|
+
import org.embulk.config.DataSourceImpl;
|
41
|
+
import org.embulk.config.TaskReport;
|
39
42
|
import org.embulk.spi.BufferAllocator;
|
40
43
|
import org.embulk.spi.Exec;
|
41
44
|
import org.embulk.spi.ExecAction;
|
@@ -44,7 +47,7 @@ import org.embulk.spi.ProcessTask;
|
|
44
47
|
import org.embulk.spi.util.Executors;
|
45
48
|
import org.embulk.spi.util.RetryExecutor.Retryable;
|
46
49
|
import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
|
47
|
-
import org.embulk.
|
50
|
+
import org.embulk.EmbulkEmbed;
|
48
51
|
import org.slf4j.Logger;
|
49
52
|
|
50
53
|
import static java.nio.charset.StandardCharsets.UTF_8;
|
@@ -52,6 +55,7 @@ import static org.embulk.spi.util.RetryExecutor.retryExecutor;
|
|
52
55
|
|
53
56
|
public class EmbulkMapReduce
|
54
57
|
{
|
58
|
+
private static final String EMBULK_FACTORY_CLASS = "embulk_factory_class";
|
55
59
|
private static final String SYSTEM_CONFIG_SERVICE_CLASS = "mapreduce_service_class";
|
56
60
|
|
57
61
|
private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
|
@@ -111,29 +115,41 @@ public class EmbulkMapReduce
|
|
111
115
|
config.get(CK_TASK));
|
112
116
|
}
|
113
117
|
|
114
|
-
public static
|
118
|
+
public static EmbulkEmbed.Bootstrap newEmbulkBootstrap(Configuration config)
|
115
119
|
{
|
116
120
|
ConfigSource systemConfig = getSystemConfig(config);
|
117
|
-
|
121
|
+
|
122
|
+
// for warnings of old versions
|
123
|
+
if (!systemConfig.get(String.class, SYSTEM_CONFIG_SERVICE_CLASS, "org.embulk.EmbulkService").equals("org.embulk.EmbulkService")) {
|
124
|
+
throw new RuntimeException("System config 'mapreduce_service_class' is not supported any more. Please use 'embulk_factory_class' instead");
|
125
|
+
}
|
126
|
+
|
127
|
+
String factoryClassName = systemConfig.get(String.class, EMBULK_FACTORY_CLASS, DefaultEmbulkFactory.class.getName());
|
118
128
|
|
119
129
|
try {
|
120
|
-
|
121
|
-
|
122
|
-
obj = new EmbulkService(systemConfig);
|
123
|
-
} else {
|
124
|
-
Class<?> serviceClass = Class.forName(serviceClassName);
|
125
|
-
obj = serviceClass.getConstructor(ConfigSource.class).newInstance(systemConfig);
|
126
|
-
}
|
130
|
+
Class<?> factoryClass = Class.forName(factoryClassName);
|
131
|
+
Object factory = factoryClass.newInstance();
|
127
132
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
133
|
+
Object bootstrap;
|
134
|
+
try {
|
135
|
+
// factory.bootstrap(ConfigSource masterSystemConfig, ConfigSource executorParams)
|
136
|
+
Method method = factoryClass.getMethod("bootstrap", ConfigSource.class, ConfigSource.class);
|
137
|
+
Map<String, String> hadoopConfig = config.getValByRegex("");
|
138
|
+
ConfigSource executorParams = new DataSourceImpl(new ModelManager(null, new ObjectMapper())).set("hadoopConfig", hadoopConfig).getNested("hadoopConfig"); // TODO add a method to embulk that creates an empty DataSource instance
|
139
|
+
bootstrap = method.invoke(factory, systemConfig, executorParams);
|
132
140
|
}
|
141
|
+
catch (NoSuchMethodException ex) {
|
142
|
+
// factory.bootstrap(ConfigSource masterSystemConfig)
|
143
|
+
bootstrap = factoryClass.getMethod("bootstrap", ConfigSource.class).invoke(factory, systemConfig);
|
144
|
+
}
|
145
|
+
|
146
|
+
return (EmbulkEmbed.Bootstrap) bootstrap;
|
133
147
|
|
134
|
-
}
|
148
|
+
}
|
149
|
+
catch (InvocationTargetException ex) {
|
135
150
|
throw Throwables.propagate(ex.getCause());
|
136
|
-
}
|
151
|
+
}
|
152
|
+
catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | IllegalAccessException | IllegalArgumentException ex) {
|
137
153
|
throw Throwables.propagate(ex);
|
138
154
|
}
|
139
155
|
}
|
@@ -377,8 +393,7 @@ public class EmbulkMapReduce
|
|
377
393
|
public static class SessionRunner
|
378
394
|
{
|
379
395
|
private final Configuration config;
|
380
|
-
private final
|
381
|
-
private final ModelManager modelManager;
|
396
|
+
private final EmbulkEmbed embed;
|
382
397
|
private final MapReduceExecutorTask task;
|
383
398
|
private final ExecSession session;
|
384
399
|
private final File localGemPath;
|
@@ -386,10 +401,9 @@ public class EmbulkMapReduce
|
|
386
401
|
public SessionRunner(TaskAttemptContext context)
|
387
402
|
{
|
388
403
|
this.config = context.getConfiguration();
|
389
|
-
this.
|
390
|
-
this.
|
391
|
-
this.
|
392
|
-
this.session = ExecSession.builder(injector).fromExecConfig(task.getExecConfig()).build();
|
404
|
+
this.embed = newEmbulkBootstrap(context.getConfiguration()).initialize(); // TODO use initializeCloseable?
|
405
|
+
this.task = getExecutorTask(embed.getInjector(), context.getConfiguration());
|
406
|
+
this.session = ExecSession.builder(embed.getInjector()).fromExecConfig(task.getExecConfig()).build();
|
393
407
|
|
394
408
|
try {
|
395
409
|
LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
|
@@ -403,7 +417,7 @@ public class EmbulkMapReduce
|
|
403
417
|
public PluginArchive readPluginArchive() throws IOException
|
404
418
|
{
|
405
419
|
localGemPath.mkdirs();
|
406
|
-
return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config),
|
420
|
+
return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config), embed.getModelManager());
|
407
421
|
}
|
408
422
|
|
409
423
|
public Configuration getConfiguration()
|
@@ -413,17 +427,17 @@ public class EmbulkMapReduce
|
|
413
427
|
|
414
428
|
public ModelManager getModelManager()
|
415
429
|
{
|
416
|
-
return
|
430
|
+
return embed.getModelManager();
|
417
431
|
}
|
418
432
|
|
419
433
|
public BufferAllocator getBufferAllocator()
|
420
434
|
{
|
421
|
-
return
|
435
|
+
return embed.getBufferAllocator();
|
422
436
|
}
|
423
437
|
|
424
438
|
public ScriptingContainer getScriptingContainer()
|
425
439
|
{
|
426
|
-
return
|
440
|
+
return embed.getInjector().getInstance(ScriptingContainer.class);
|
427
441
|
}
|
428
442
|
|
429
443
|
public MapReduceExecutorTask getMapReduceExecutorTask()
|
@@ -480,9 +494,9 @@ public class EmbulkMapReduce
|
|
480
494
|
}
|
481
495
|
|
482
496
|
@Override
|
483
|
-
public void inputCommitted(
|
497
|
+
public void inputCommitted(TaskReport report)
|
484
498
|
{
|
485
|
-
state.
|
499
|
+
state.setInputTaskReport(report);
|
486
500
|
try {
|
487
501
|
writeAttemptStateFile(config, stateDir, state, modelManager);
|
488
502
|
} catch (IOException e) {
|
@@ -491,9 +505,9 @@ public class EmbulkMapReduce
|
|
491
505
|
}
|
492
506
|
|
493
507
|
@Override
|
494
|
-
public void outputCommitted(
|
508
|
+
public void outputCommitted(TaskReport report)
|
495
509
|
{
|
496
|
-
state.
|
510
|
+
state.setOutputTaskReport(report);
|
497
511
|
try {
|
498
512
|
writeAttemptStateFile(config, stateDir, state, modelManager);
|
499
513
|
} catch (IOException e) {
|
data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java
CHANGED
@@ -12,7 +12,7 @@ import org.apache.hadoop.conf.Configuration;
|
|
12
12
|
import org.apache.hadoop.mapreduce.Mapper;
|
13
13
|
import org.apache.hadoop.mapreduce.Reducer;
|
14
14
|
import org.embulk.config.ModelManager;
|
15
|
-
import org.embulk.config.
|
15
|
+
import org.embulk.config.TaskReport;
|
16
16
|
import org.embulk.config.ConfigDiff;
|
17
17
|
import org.embulk.config.TaskSource;
|
18
18
|
import org.embulk.config.ConfigSource;
|
@@ -218,7 +218,7 @@ public class EmbulkPartitioningMapReduce
|
|
218
218
|
try {
|
219
219
|
if (!failed) {
|
220
220
|
output.finish();
|
221
|
-
|
221
|
+
TaskReport report = output.commit();
|
222
222
|
handler.outputCommitted(report);
|
223
223
|
}
|
224
224
|
} finally {
|
@@ -266,7 +266,7 @@ public class EmbulkPartitioningMapReduce
|
|
266
266
|
|
267
267
|
public void cleanup(TaskSource taskSource,
|
268
268
|
Schema schema, int taskCount,
|
269
|
-
List<
|
269
|
+
List<TaskReport> successTaskReports)
|
270
270
|
{
|
271
271
|
// won't be called
|
272
272
|
throw new RuntimeException("");
|
@@ -301,9 +301,9 @@ public class EmbulkPartitioningMapReduce
|
|
301
301
|
public void abort()
|
302
302
|
{ }
|
303
303
|
|
304
|
-
public
|
304
|
+
public TaskReport commit()
|
305
305
|
{
|
306
|
-
return Exec.
|
306
|
+
return Exec.newTaskReport();
|
307
307
|
}
|
308
308
|
};
|
309
309
|
}
|
data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java
CHANGED
@@ -40,7 +40,7 @@ import org.apache.hadoop.mapreduce.MRJobConfig;
|
|
40
40
|
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
|
41
41
|
import org.embulk.exec.ForSystemConfig;
|
42
42
|
import org.embulk.config.ConfigSource;
|
43
|
-
import org.embulk.config.
|
43
|
+
import org.embulk.config.TaskReport;
|
44
44
|
import org.embulk.config.ConfigException;
|
45
45
|
import org.embulk.config.TaskSource;
|
46
46
|
import org.embulk.config.ModelManager;
|
@@ -78,7 +78,7 @@ public class MapReduceExecutor
|
|
78
78
|
final int outputTaskCount;
|
79
79
|
final int reduceTaskCount;
|
80
80
|
|
81
|
-
if (task.getPartitioning().isPresent()) {
|
81
|
+
if (task.getPartitioning().isPresent() && inputTaskCount > 0) { // here can disable partitioning and force set reduceTaskCount and outputTaskCount to 0 if inputTaskCount is 0
|
82
82
|
reduceTaskCount = task.getReducers().or(inputTaskCount);
|
83
83
|
if (reduceTaskCount <= 0) {
|
84
84
|
throw new ConfigException("Reducers must be larger than 1 if partition: is set");
|
@@ -381,15 +381,15 @@ public class MapReduceExecutor
|
|
381
381
|
private static void updateTaskState(TaskState state, AttemptState attempt, boolean isInput)
|
382
382
|
{
|
383
383
|
state.start();
|
384
|
-
Optional<
|
385
|
-
boolean committed =
|
384
|
+
Optional<TaskReport> taskReport = isInput ? attempt.getInputTaskReport() : attempt.getOutputTaskReport();
|
385
|
+
boolean committed = taskReport.isPresent();
|
386
386
|
if (attempt.getException().isPresent()) {
|
387
387
|
if (!state.isCommitted()) {
|
388
388
|
state.setException(new RemoteTaskFailedException(attempt.getException().get()));
|
389
389
|
}
|
390
390
|
}
|
391
|
-
if (
|
392
|
-
state.
|
391
|
+
if (taskReport.isPresent()) {
|
392
|
+
state.setTaskReport(taskReport.get());
|
393
393
|
state.finish();
|
394
394
|
}
|
395
395
|
}
|
@@ -436,12 +436,12 @@ public class MapReduceExecutor
|
|
436
436
|
|
437
437
|
public boolean isInputCommitted()
|
438
438
|
{
|
439
|
-
return attemptState != null && attemptState.
|
439
|
+
return attemptState != null && attemptState.getInputTaskReport().isPresent();
|
440
440
|
}
|
441
441
|
|
442
442
|
public boolean isOutputCommitted()
|
443
443
|
{
|
444
|
-
return attemptState != null && attemptState.
|
444
|
+
return attemptState != null && attemptState.getOutputTaskReport().isPresent();
|
445
445
|
}
|
446
446
|
|
447
447
|
public TaskAttemptID getTaskAttempId()
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-executor-mapreduce
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-19 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Executes tasks on Hadoop.
|
14
14
|
email:
|
@@ -22,6 +22,7 @@ files:
|
|
22
22
|
- src/main/java/org/embulk/executor/mapreduce/AttemptState.java
|
23
23
|
- src/main/java/org/embulk/executor/mapreduce/BufferWritable.java
|
24
24
|
- src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
|
25
|
+
- src/main/java/org/embulk/executor/mapreduce/DefaultEmbulkFactory.java
|
25
26
|
- src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java
|
26
27
|
- src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java
|
27
28
|
- src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java
|
@@ -60,7 +61,7 @@ files:
|
|
60
61
|
- classpath/curator-client-2.6.0.jar
|
61
62
|
- classpath/curator-framework-2.6.0.jar
|
62
63
|
- classpath/curator-recipes-2.6.0.jar
|
63
|
-
- classpath/embulk-executor-mapreduce-0.
|
64
|
+
- classpath/embulk-executor-mapreduce-0.2.0.jar
|
64
65
|
- classpath/gson-2.2.4.jar
|
65
66
|
- classpath/hadoop-annotations-2.6.0.jar
|
66
67
|
- classpath/hadoop-auth-2.6.0.jar
|
data/classpath/embulk-executor-mapreduce-0.1.5.jar
REMOVED
Binary file
|