embulk-executor-mapreduce 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/classpath/embulk-executor-mapreduce-0.2.0.jar +0 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +20 -20
- data/src/main/java/org/embulk/executor/mapreduce/DefaultEmbulkFactory.java +13 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +45 -31
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +5 -5
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +8 -8
- metadata +4 -3
- data/classpath/embulk-executor-mapreduce-0.1.5.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5c531e6955469a01f0e2ed716a65fdf228ae95ba
|
4
|
+
data.tar.gz: d8724a7abcaedd7549a397d2b14df6edb2832d52
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9983809cd453596cf3fc683f75a22b43dbcca0abf202c7ad7bdc9cbb640c673e95be706f6d49ad04bb3881f8668bc9b8e3946115d54e2a14e90dfb163fbd9b0f
|
7
|
+
data.tar.gz: 7b272911c78f9bac87fa867f0e3978a96f31c8fb348b150410a6991e1f5f425782450c9422ca4022e2cb15abc61037caaf080fb196f48bd0a441e3e81cc31b15
|
Binary file
|
@@ -16,7 +16,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
16
16
|
import com.fasterxml.jackson.annotation.JsonValue;
|
17
17
|
import org.apache.hadoop.mapreduce.TaskAttemptID;
|
18
18
|
import org.embulk.config.ModelManager;
|
19
|
-
import org.embulk.config.CommitReport;
|
19
|
+
import org.embulk.config.TaskReport;
|
20
20
|
|
21
21
|
public class AttemptState
|
22
22
|
{
|
@@ -24,8 +24,8 @@ public class AttemptState
|
|
24
24
|
private final Optional<Integer> inputTaskIndex;
|
25
25
|
private final Optional<Integer> outputTaskIndex;
|
26
26
|
private Optional<String> exception;
|
27
|
-
private Optional<
|
28
|
-
private Optional<
|
27
|
+
private Optional<TaskReport> inputTaskReport;
|
28
|
+
private Optional<TaskReport> outputTaskReport;
|
29
29
|
|
30
30
|
public AttemptState(TaskAttemptID attemptId, Optional<Integer> inputTaskIndex, Optional<Integer> outputTaskIndex)
|
31
31
|
{
|
@@ -40,12 +40,12 @@ public class AttemptState
|
|
40
40
|
@JsonProperty("inputTaskIndex") Optional<Integer> inputTaskIndex,
|
41
41
|
@JsonProperty("outputTaskIndex") Optional<Integer> outputTaskIndex,
|
42
42
|
@JsonProperty("exception") Optional<String> exception,
|
43
|
-
@JsonProperty("
|
44
|
-
@JsonProperty("
|
43
|
+
@JsonProperty("inputTaskReport") Optional<TaskReport> inputTaskReport,
|
44
|
+
@JsonProperty("outputTaskReport") Optional<TaskReport> outputTaskReport)
|
45
45
|
{
|
46
46
|
this(TaskAttemptID.forName(attemptId),
|
47
47
|
inputTaskIndex, outputTaskIndex, exception,
|
48
|
-
|
48
|
+
inputTaskReport, outputTaskReport);
|
49
49
|
}
|
50
50
|
|
51
51
|
public AttemptState(
|
@@ -53,15 +53,15 @@ public class AttemptState
|
|
53
53
|
Optional<Integer> inputTaskIndex,
|
54
54
|
Optional<Integer> outputTaskIndex,
|
55
55
|
Optional<String> exception,
|
56
|
-
Optional<
|
57
|
-
Optional<
|
56
|
+
Optional<TaskReport> inputTaskReport,
|
57
|
+
Optional<TaskReport> outputTaskReport)
|
58
58
|
{
|
59
59
|
this.attemptId = attemptId;
|
60
60
|
this.inputTaskIndex = inputTaskIndex;
|
61
61
|
this.outputTaskIndex = outputTaskIndex;
|
62
62
|
this.exception = exception;
|
63
|
-
this.
|
64
|
-
this.
|
63
|
+
this.inputTaskReport = inputTaskReport;
|
64
|
+
this.outputTaskReport = outputTaskReport;
|
65
65
|
}
|
66
66
|
|
67
67
|
@JsonIgnore
|
@@ -112,28 +112,28 @@ public class AttemptState
|
|
112
112
|
return exception;
|
113
113
|
}
|
114
114
|
|
115
|
-
@JsonProperty("
|
116
|
-
public Optional<
|
115
|
+
@JsonProperty("inputTaskReport")
|
116
|
+
public Optional<TaskReport> getInputTaskReport()
|
117
117
|
{
|
118
|
-
return
|
118
|
+
return inputTaskReport;
|
119
119
|
}
|
120
120
|
|
121
|
-
@JsonProperty("
|
122
|
-
public Optional<
|
121
|
+
@JsonProperty("outputTaskReport")
|
122
|
+
public Optional<TaskReport> getOutputTaskReport()
|
123
123
|
{
|
124
|
-
return
|
124
|
+
return outputTaskReport;
|
125
125
|
}
|
126
126
|
|
127
127
|
@JsonIgnore
|
128
|
-
public void
|
128
|
+
public void setInputTaskReport(TaskReport inputTaskReport)
|
129
129
|
{
|
130
|
-
this.
|
130
|
+
this.inputTaskReport = Optional.of(inputTaskReport);
|
131
131
|
}
|
132
132
|
|
133
133
|
@JsonIgnore
|
134
|
-
public void
|
134
|
+
public void setOutputTaskReport(TaskReport outputTaskReport)
|
135
135
|
{
|
136
|
-
this.
|
136
|
+
this.outputTaskReport = Optional.of(outputTaskReport);
|
137
137
|
}
|
138
138
|
|
139
139
|
public void writeTo(OutputStream out, ModelManager modelManager) throws IOException
|
@@ -0,0 +1,13 @@
|
|
1
|
+
package org.embulk.executor.mapreduce;
|
2
|
+
|
3
|
+
import org.embulk.config.ConfigSource;
|
4
|
+
import org.embulk.EmbulkEmbed;
|
5
|
+
|
6
|
+
public class DefaultEmbulkFactory
|
7
|
+
{
|
8
|
+
public EmbulkEmbed.Bootstrap bootstrap(ConfigSource systemConfig)
|
9
|
+
{
|
10
|
+
return new EmbulkEmbed.Bootstrap()
|
11
|
+
.setSystemConfig(systemConfig);
|
12
|
+
}
|
13
|
+
}
|
@@ -2,6 +2,7 @@ package org.embulk.executor.mapreduce;
|
|
2
2
|
|
3
3
|
import java.util.List;
|
4
4
|
import java.util.ArrayList;
|
5
|
+
import java.util.Map;
|
5
6
|
import java.util.concurrent.Callable;
|
6
7
|
import java.util.concurrent.ExecutionException;
|
7
8
|
import java.io.File;
|
@@ -10,6 +11,7 @@ import java.io.ByteArrayInputStream;
|
|
10
11
|
import java.io.IOException;
|
11
12
|
import java.io.EOFException;
|
12
13
|
import java.io.InterruptedIOException;
|
14
|
+
import java.lang.reflect.Method;
|
13
15
|
import java.lang.reflect.InvocationTargetException;
|
14
16
|
import com.google.inject.Injector;
|
15
17
|
import com.google.common.base.Optional;
|
@@ -35,7 +37,8 @@ import org.apache.hadoop.mapreduce.MRConfig;
|
|
35
37
|
import org.embulk.config.ModelManager;
|
36
38
|
import org.embulk.config.ConfigSource;
|
37
39
|
import org.embulk.config.ConfigLoader;
|
38
|
-
import org.embulk.config.CommitReport;
|
40
|
+
import org.embulk.config.DataSourceImpl;
|
41
|
+
import org.embulk.config.TaskReport;
|
39
42
|
import org.embulk.spi.BufferAllocator;
|
40
43
|
import org.embulk.spi.Exec;
|
41
44
|
import org.embulk.spi.ExecAction;
|
@@ -44,7 +47,7 @@ import org.embulk.spi.ProcessTask;
|
|
44
47
|
import org.embulk.spi.util.Executors;
|
45
48
|
import org.embulk.spi.util.RetryExecutor.Retryable;
|
46
49
|
import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
|
47
|
-
import org.embulk.EmbulkService;
|
50
|
+
import org.embulk.EmbulkEmbed;
|
48
51
|
import org.slf4j.Logger;
|
49
52
|
|
50
53
|
import static java.nio.charset.StandardCharsets.UTF_8;
|
@@ -52,6 +55,7 @@ import static org.embulk.spi.util.RetryExecutor.retryExecutor;
|
|
52
55
|
|
53
56
|
public class EmbulkMapReduce
|
54
57
|
{
|
58
|
+
private static final String EMBULK_FACTORY_CLASS = "embulk_factory_class";
|
55
59
|
private static final String SYSTEM_CONFIG_SERVICE_CLASS = "mapreduce_service_class";
|
56
60
|
|
57
61
|
private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
|
@@ -111,29 +115,41 @@ public class EmbulkMapReduce
|
|
111
115
|
config.get(CK_TASK));
|
112
116
|
}
|
113
117
|
|
114
|
-
public static
|
118
|
+
public static EmbulkEmbed.Bootstrap newEmbulkBootstrap(Configuration config)
|
115
119
|
{
|
116
120
|
ConfigSource systemConfig = getSystemConfig(config);
|
117
|
-
|
121
|
+
|
122
|
+
// for warnings of old versions
|
123
|
+
if (!systemConfig.get(String.class, SYSTEM_CONFIG_SERVICE_CLASS, "org.embulk.EmbulkService").equals("org.embulk.EmbulkService")) {
|
124
|
+
throw new RuntimeException("System config 'mapreduce_service_class' is not supported any more. Please use 'embulk_factory_class' instead");
|
125
|
+
}
|
126
|
+
|
127
|
+
String factoryClassName = systemConfig.get(String.class, EMBULK_FACTORY_CLASS, DefaultEmbulkFactory.class.getName());
|
118
128
|
|
119
129
|
try {
|
120
|
-
|
121
|
-
|
122
|
-
obj = new EmbulkService(systemConfig);
|
123
|
-
} else {
|
124
|
-
Class<?> serviceClass = Class.forName(serviceClassName);
|
125
|
-
obj = serviceClass.getConstructor(ConfigSource.class).newInstance(systemConfig);
|
126
|
-
}
|
130
|
+
Class<?> factoryClass = Class.forName(factoryClassName);
|
131
|
+
Object factory = factoryClass.newInstance();
|
127
132
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
133
|
+
Object bootstrap;
|
134
|
+
try {
|
135
|
+
// factory.bootstrap(ConfigSource masterSystemConfig, ConfigSource executorParams)
|
136
|
+
Method method = factoryClass.getMethod("bootstrap", ConfigSource.class, ConfigSource.class);
|
137
|
+
Map<String, String> hadoopConfig = config.getValByRegex("");
|
138
|
+
ConfigSource executorParams = new DataSourceImpl(new ModelManager(null, new ObjectMapper())).set("hadoopConfig", hadoopConfig).getNested("hadoopConfig"); // TODO add a method to embulk that creates an empty DataSource instance
|
139
|
+
bootstrap = method.invoke(factory, systemConfig, executorParams);
|
132
140
|
}
|
141
|
+
catch (NoSuchMethodException ex) {
|
142
|
+
// factory.bootstrap(ConfigSource masterSystemConfig)
|
143
|
+
bootstrap = factoryClass.getMethod("bootstrap", ConfigSource.class).invoke(factory, systemConfig);
|
144
|
+
}
|
145
|
+
|
146
|
+
return (EmbulkEmbed.Bootstrap) bootstrap;
|
133
147
|
|
134
|
-
}
|
148
|
+
}
|
149
|
+
catch (InvocationTargetException ex) {
|
135
150
|
throw Throwables.propagate(ex.getCause());
|
136
|
-
}
|
151
|
+
}
|
152
|
+
catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | IllegalAccessException | IllegalArgumentException ex) {
|
137
153
|
throw Throwables.propagate(ex);
|
138
154
|
}
|
139
155
|
}
|
@@ -377,8 +393,7 @@ public class EmbulkMapReduce
|
|
377
393
|
public static class SessionRunner
|
378
394
|
{
|
379
395
|
private final Configuration config;
|
380
|
-
private final
|
381
|
-
private final ModelManager modelManager;
|
396
|
+
private final EmbulkEmbed embed;
|
382
397
|
private final MapReduceExecutorTask task;
|
383
398
|
private final ExecSession session;
|
384
399
|
private final File localGemPath;
|
@@ -386,10 +401,9 @@ public class EmbulkMapReduce
|
|
386
401
|
public SessionRunner(TaskAttemptContext context)
|
387
402
|
{
|
388
403
|
this.config = context.getConfiguration();
|
389
|
-
this.
|
390
|
-
this.
|
391
|
-
this.
|
392
|
-
this.session = ExecSession.builder(injector).fromExecConfig(task.getExecConfig()).build();
|
404
|
+
this.embed = newEmbulkBootstrap(context.getConfiguration()).initialize(); // TODO use initializeCloseable?
|
405
|
+
this.task = getExecutorTask(embed.getInjector(), context.getConfiguration());
|
406
|
+
this.session = ExecSession.builder(embed.getInjector()).fromExecConfig(task.getExecConfig()).build();
|
393
407
|
|
394
408
|
try {
|
395
409
|
LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
|
@@ -403,7 +417,7 @@ public class EmbulkMapReduce
|
|
403
417
|
public PluginArchive readPluginArchive() throws IOException
|
404
418
|
{
|
405
419
|
localGemPath.mkdirs();
|
406
|
-
return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config),
|
420
|
+
return EmbulkMapReduce.readPluginArchive(localGemPath, config, getStateDirectoryPath(config), embed.getModelManager());
|
407
421
|
}
|
408
422
|
|
409
423
|
public Configuration getConfiguration()
|
@@ -413,17 +427,17 @@ public class EmbulkMapReduce
|
|
413
427
|
|
414
428
|
public ModelManager getModelManager()
|
415
429
|
{
|
416
|
-
return
|
430
|
+
return embed.getModelManager();
|
417
431
|
}
|
418
432
|
|
419
433
|
public BufferAllocator getBufferAllocator()
|
420
434
|
{
|
421
|
-
return
|
435
|
+
return embed.getBufferAllocator();
|
422
436
|
}
|
423
437
|
|
424
438
|
public ScriptingContainer getScriptingContainer()
|
425
439
|
{
|
426
|
-
return
|
440
|
+
return embed.getInjector().getInstance(ScriptingContainer.class);
|
427
441
|
}
|
428
442
|
|
429
443
|
public MapReduceExecutorTask getMapReduceExecutorTask()
|
@@ -480,9 +494,9 @@ public class EmbulkMapReduce
|
|
480
494
|
}
|
481
495
|
|
482
496
|
@Override
|
483
|
-
public void inputCommitted(
|
497
|
+
public void inputCommitted(TaskReport report)
|
484
498
|
{
|
485
|
-
state.
|
499
|
+
state.setInputTaskReport(report);
|
486
500
|
try {
|
487
501
|
writeAttemptStateFile(config, stateDir, state, modelManager);
|
488
502
|
} catch (IOException e) {
|
@@ -491,9 +505,9 @@ public class EmbulkMapReduce
|
|
491
505
|
}
|
492
506
|
|
493
507
|
@Override
|
494
|
-
public void outputCommitted(
|
508
|
+
public void outputCommitted(TaskReport report)
|
495
509
|
{
|
496
|
-
state.
|
510
|
+
state.setOutputTaskReport(report);
|
497
511
|
try {
|
498
512
|
writeAttemptStateFile(config, stateDir, state, modelManager);
|
499
513
|
} catch (IOException e) {
|
@@ -12,7 +12,7 @@ import org.apache.hadoop.conf.Configuration;
|
|
12
12
|
import org.apache.hadoop.mapreduce.Mapper;
|
13
13
|
import org.apache.hadoop.mapreduce.Reducer;
|
14
14
|
import org.embulk.config.ModelManager;
|
15
|
-
import org.embulk.config.CommitReport;
|
15
|
+
import org.embulk.config.TaskReport;
|
16
16
|
import org.embulk.config.ConfigDiff;
|
17
17
|
import org.embulk.config.TaskSource;
|
18
18
|
import org.embulk.config.ConfigSource;
|
@@ -218,7 +218,7 @@ public class EmbulkPartitioningMapReduce
|
|
218
218
|
try {
|
219
219
|
if (!failed) {
|
220
220
|
output.finish();
|
221
|
-
|
221
|
+
TaskReport report = output.commit();
|
222
222
|
handler.outputCommitted(report);
|
223
223
|
}
|
224
224
|
} finally {
|
@@ -266,7 +266,7 @@ public class EmbulkPartitioningMapReduce
|
|
266
266
|
|
267
267
|
public void cleanup(TaskSource taskSource,
|
268
268
|
Schema schema, int taskCount,
|
269
|
-
List<
|
269
|
+
List<TaskReport> successTaskReports)
|
270
270
|
{
|
271
271
|
// won't be called
|
272
272
|
throw new RuntimeException("");
|
@@ -301,9 +301,9 @@ public class EmbulkPartitioningMapReduce
|
|
301
301
|
public void abort()
|
302
302
|
{ }
|
303
303
|
|
304
|
-
public
|
304
|
+
public TaskReport commit()
|
305
305
|
{
|
306
|
-
return Exec.
|
306
|
+
return Exec.newTaskReport();
|
307
307
|
}
|
308
308
|
};
|
309
309
|
}
|
@@ -40,7 +40,7 @@ import org.apache.hadoop.mapreduce.MRJobConfig;
|
|
40
40
|
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
|
41
41
|
import org.embulk.exec.ForSystemConfig;
|
42
42
|
import org.embulk.config.ConfigSource;
|
43
|
-
import org.embulk.config.CommitReport;
|
43
|
+
import org.embulk.config.TaskReport;
|
44
44
|
import org.embulk.config.ConfigException;
|
45
45
|
import org.embulk.config.TaskSource;
|
46
46
|
import org.embulk.config.ModelManager;
|
@@ -78,7 +78,7 @@ public class MapReduceExecutor
|
|
78
78
|
final int outputTaskCount;
|
79
79
|
final int reduceTaskCount;
|
80
80
|
|
81
|
-
if (task.getPartitioning().isPresent()) {
|
81
|
+
if (task.getPartitioning().isPresent() && inputTaskCount > 0) { // here can disable partitioning and force set reduceTaskCount and outputTaskCount to 0 if inputTaskCount is 0
|
82
82
|
reduceTaskCount = task.getReducers().or(inputTaskCount);
|
83
83
|
if (reduceTaskCount <= 0) {
|
84
84
|
throw new ConfigException("Reducers must be larger than 1 if partition: is set");
|
@@ -381,15 +381,15 @@ public class MapReduceExecutor
|
|
381
381
|
private static void updateTaskState(TaskState state, AttemptState attempt, boolean isInput)
|
382
382
|
{
|
383
383
|
state.start();
|
384
|
-
Optional<
|
385
|
-
boolean committed =
|
384
|
+
Optional<TaskReport> taskReport = isInput ? attempt.getInputTaskReport() : attempt.getOutputTaskReport();
|
385
|
+
boolean committed = taskReport.isPresent();
|
386
386
|
if (attempt.getException().isPresent()) {
|
387
387
|
if (!state.isCommitted()) {
|
388
388
|
state.setException(new RemoteTaskFailedException(attempt.getException().get()));
|
389
389
|
}
|
390
390
|
}
|
391
|
-
if (
|
392
|
-
state.
|
391
|
+
if (taskReport.isPresent()) {
|
392
|
+
state.setTaskReport(taskReport.get());
|
393
393
|
state.finish();
|
394
394
|
}
|
395
395
|
}
|
@@ -436,12 +436,12 @@ public class MapReduceExecutor
|
|
436
436
|
|
437
437
|
public boolean isInputCommitted()
|
438
438
|
{
|
439
|
-
return attemptState != null && attemptState.
|
439
|
+
return attemptState != null && attemptState.getInputTaskReport().isPresent();
|
440
440
|
}
|
441
441
|
|
442
442
|
public boolean isOutputCommitted()
|
443
443
|
{
|
444
|
-
return attemptState != null && attemptState.
|
444
|
+
return attemptState != null && attemptState.getOutputTaskReport().isPresent();
|
445
445
|
}
|
446
446
|
|
447
447
|
public TaskAttemptID getTaskAttempId()
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-executor-mapreduce
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-19 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Executes tasks on Hadoop.
|
14
14
|
email:
|
@@ -22,6 +22,7 @@ files:
|
|
22
22
|
- src/main/java/org/embulk/executor/mapreduce/AttemptState.java
|
23
23
|
- src/main/java/org/embulk/executor/mapreduce/BufferWritable.java
|
24
24
|
- src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
|
25
|
+
- src/main/java/org/embulk/executor/mapreduce/DefaultEmbulkFactory.java
|
25
26
|
- src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java
|
26
27
|
- src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java
|
27
28
|
- src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java
|
@@ -60,7 +61,7 @@ files:
|
|
60
61
|
- classpath/curator-client-2.6.0.jar
|
61
62
|
- classpath/curator-framework-2.6.0.jar
|
62
63
|
- classpath/curator-recipes-2.6.0.jar
|
63
|
-
- classpath/embulk-executor-mapreduce-0.1.5.jar
|
64
|
+
- classpath/embulk-executor-mapreduce-0.2.0.jar
|
64
65
|
- classpath/gson-2.2.4.jar
|
65
66
|
- classpath/hadoop-annotations-2.6.0.jar
|
66
67
|
- classpath/hadoop-auth-2.6.0.jar
|
Binary file
|