embulk-executor-mapreduce 0.2.3 → 0.2.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 3c12d59b65314dd94cf12e29b7caf8e83fd947bd
- data.tar.gz: 478ea3a4e77c7a9e5395a4c4942ac9697772a10d
+ metadata.gz: b2abda7db750f6c161ab8867474fdccfa67eb265
+ data.tar.gz: 8cfc89242d0a57368b5803db9b55e6494b6916e6
  SHA512:
- metadata.gz: e8c955d1e7a4e0b318bc21a36c51410045b4b4c9e62b954e8848f66693d26fe580a79afb8c8e1c1910f9ecd1731f78772b6b9af7f64e817b9666776da26b126c
- data.tar.gz: 14fb4a34dbaf1b59b8d37ceac1356d4ab66014a559943f16da087d9f17800e46c80e9b98ae1583be5568ee8909034ba708bfa4ca3dab27ddde90b5c8e2ab8f59
+ metadata.gz: 0e8b2f14207ec85d1ba60b531cc5876c90689a2b259f76a5edb57d8b50d025bd5e7151307be0381d82e813b5f263d5326e39cbccaa56238d8aa3f1171e8a21fd
+ data.tar.gz: 645417db32cc29813fee20f2175a2858485596159fb80c31e77c1f45d1c715f40b239f7417ef0caae927f75428b8f93175eb477281d620fe38edae694e76f66a
@@ -61,6 +61,7 @@ public class EmbulkMapReduce
  private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
  private static final String CK_STATE_DIRECTORY_PATH = "embulk.mapreduce.stateDirectorypath";
  private static final String CK_TASK_COUNT = "embulk.mapreduce.taskCount";
+ private static final String CK_RETRY_TASKS = "embulk.mapreduce.retryTasks";
  private static final String CK_TASK = "embulk.mapreduce.task";
  private static final String CK_PLUGIN_ARCHIVE_SPECS = "embulk.mapreduce.pluginArchive.specs";

@@ -94,6 +95,16 @@ public class EmbulkMapReduce
  return config.getInt(CK_TASK_COUNT, 0);
  }

+ public static void setRetryTasks(Configuration config, boolean enabled)
+ {
+ config.setBoolean(CK_RETRY_TASKS, enabled);
+ }
+
+ public static boolean getRetryTasks(Configuration config)
+ {
+ return config.getBoolean(CK_RETRY_TASKS, false);
+ }
+
  public static void setStateDirectoryPath(Configuration config, Path path)
  {
  config.set(CK_STATE_DIRECTORY_PATH, path.toString());
@@ -406,8 +417,11 @@ public class EmbulkMapReduce
  this.session = ExecSession.builder(embed.getInjector()).fromExecConfig(task.getExecConfig()).build();

  try {
+ // LocalDirAllocator allocates a directory per job. Append the task attempt ID to the path
+ // so that attempts running on the same machine don't conflict with each other.
  LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
- Path destPath = localDirAllocator.getLocalPathForWrite("gems", config);
+ String dirName = context.getTaskAttemptID().toString() + "/embulk_gems";
+ Path destPath = localDirAllocator.getLocalPathForWrite(dirName, config);
  this.localGemPath = new File(destPath.toString());
  } catch (IOException ex) {
  throw new RuntimeException(ex);
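Note: the comment in the hunk above explains why the attempt ID is prepended to the local gem directory. As a rough illustration only (the attempt IDs below are made-up examples in Hadoop's usual format, not taken from this gem), the resulting directory names look like this:

// Standalone sketch, not part of the gem: shows the shape of the per-attempt
// gem directory name built in the hunk above.
public class GemDirNameSketch
{
    public static void main(String[] args)
    {
        // Hypothetical attempt IDs for two attempts of the same map task on one node.
        String firstAttempt = "attempt_1450684800000_0001_m_000001_0";
        String secondAttempt = "attempt_1450684800000_0001_m_000001_1";

        // Same composition as in the diff: attempt ID + "/embulk_gems".
        System.out.println(firstAttempt + "/embulk_gems");
        System.out.println(secondAttempt + "/embulk_gems");
        // The two attempts unpack gems into distinct directories, so they no
        // longer clobber each other under the shared Hadoop local dir.
    }
}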
@@ -527,12 +541,14 @@ public class EmbulkMapReduce
  {
  private Context context;
  private SessionRunner runner;
+ private boolean retryTasks;

  @Override
  public void setup(Context context) throws IOException, InterruptedException
  {
  this.context = context;
  this.runner = new SessionRunner(context);
+ this.retryTasks = EmbulkMapReduce.getRetryTasks(context.getConfiguration());

  runner.execSession(new ExecAction<Void>() { // for Exec.getLogger
  public Void run() throws IOException
@@ -566,16 +582,17 @@ public class EmbulkMapReduce

  try {
  Executors.process(runner.getExecSession(), task, taskIndex, handler);
- } catch (Throwable ex) {
+ }
+ catch (Exception ex) {
  try {
  handler.setException(ex);
  } catch (Throwable e) {
  e.addSuppressed(ex);
  throw e;
  }
- //if (task.getTaskRecovery()) {
- // throw ex;
- //}
+ if (retryTasks) {
+ throw ex;
+ }
  }
  }
  }
@@ -128,16 +128,16 @@ public class EmbulkPartitioningMapReduce
  filterPlugins, task.getFilterSchemas(), task.getFilterTaskSources(),
  outputPlugin, task.getOutputSchema(), task.getOutputTaskSource(),
  handler);
- } catch (Throwable ex) {
+ }
+ catch (Exception ex) {
  try {
  handler.setException(ex);
  } catch (Throwable e) {
  e.addSuppressed(ex);
  throw e;
  }
- //if (task.getTaskRecovery()) {
- // throw ex;
- //}
+ // always rethrow this exception so that reducers don't start when input fails
+ throw ex;
  }
  }
  }
@@ -147,6 +147,7 @@ public class EmbulkPartitioningMapReduce
  {
  private Context context;
  private SessionRunner runner;
+ private boolean retryTasks;
  private AttemptStateUpdateHandler handler;
  private TransactionalPageOutput output;
  private boolean failed = false;
@@ -156,6 +157,7 @@ public class EmbulkPartitioningMapReduce
  {
  this.context = context;
  this.runner = new SessionRunner(context);
+ this.retryTasks = EmbulkMapReduce.getRetryTasks(context.getConfiguration());

  runner.execSession(new ExecAction<Void>() {
  public Void run() throws Exception
@@ -199,7 +201,8 @@ public class EmbulkPartitioningMapReduce
  for (PageWritable value : values) {
  output.add(value.get());
  }
- } catch (Throwable ex) {
+ }
+ catch (Exception ex) {
  failed = true;
  try {
  handler.setException(ex);
@@ -207,6 +210,9 @@ public class EmbulkPartitioningMapReduce
  e.addSuppressed(ex);
  throw e;
  }
+ if (retryTasks) {
+ throw ex;
+ }
  }
  }

@@ -223,6 +223,7 @@ public class MapReduceExecutor
  EmbulkMapReduce.setSystemConfig(conf, modelManager, systemConfig);
  EmbulkMapReduce.setExecutorTask(conf, modelManager, task);
  EmbulkMapReduce.setMapTaskCount(conf, mapTaskCount); // used by EmbulkInputFormat
+ EmbulkMapReduce.setRetryTasks(conf, task.getRetryTasks());
  EmbulkMapReduce.setStateDirectoryPath(conf, stateDir);

  // jar files
@@ -43,6 +43,10 @@ public interface MapReduceExecutorTask
  @ConfigDefault("null")
  public Optional<Integer> getReducers();

+ @Config("retry_tasks")
+ @ConfigDefault("false")
+ public boolean getRetryTasks();
+
  @Config("partitioning")
  @ConfigDefault("null")
  public Optional<ConfigSource> getPartitioning();
@@ -1,5 +1,7 @@
  package org.embulk.executor.mapreduce;

+ import javax.validation.constraints.Min;
+ import javax.validation.constraints.Max;
  import com.google.common.annotations.VisibleForTesting;
  import org.joda.time.DateTimeZone;
  import com.google.common.base.Optional;
@@ -37,6 +39,12 @@ public class TimestampPartitioning
  @ConfigDefault("\"sec\"")
  public String getUnixTimestamp();

+ @Config("map_side_partition_split")
+ @ConfigDefault("1")
+ @Min(1)
+ @Max(65535) // TimestampPartitioning.LongPartitionKey encodes the split number in a 16-bit field
+ public int getMapSidePartitionSplit();
+
  public Column getTargetColumn();
  public void setTargetColumn(Column column);
  }
@@ -159,10 +167,19 @@ public class TimestampPartitioning

  Column column = task.getTargetColumn();
  if (column.getType() instanceof TimestampType) {
- return new TimestampPartitioner(column, Unit.of(task.getUnit()));
- } else if (column.getType() instanceof LongType) {
- return new LongUnixTimestampPartitioner(column, Unit.of(task.getUnit()), UnixTimestampUnit.of(task.getUnixTimestamp()));
- } else {
+ return new TimestampPartitioner(
+ column,
+ Unit.of(task.getUnit()),
+ task.getMapSidePartitionSplit());
+ }
+ else if (column.getType() instanceof LongType) {
+ return new LongUnixTimestampPartitioner(
+ column,
+ Unit.of(task.getUnit()),
+ task.getMapSidePartitionSplit(),
+ UnixTimestampUnit.of(task.getUnixTimestamp()));
+ }
+ else {
  throw new AssertionError();
  }
  }
@@ -234,13 +251,17 @@ public class TimestampPartitioning
  {
  protected final Column column;
  protected final Unit unit;
+ protected final int mapSidePartitionSplit;
  private final LongPartitionKey key;
+ private long roundRobin;

- public AbstractTimestampPartitioner(Column column, Unit unit)
+ public AbstractTimestampPartitioner(Column column, Unit unit, int mapSidePartitionSplit)
  {
  this.column = column;
  this.unit = unit;
+ this.mapSidePartitionSplit = mapSidePartitionSplit;
  this.key = new LongPartitionKey();
+ this.roundRobin = 0;
  }

  @Override
@@ -251,7 +272,19 @@ public class TimestampPartitioning

  protected LongPartitionKey updateKey(long v)
  {
- key.set(v);
+ // ((v << 16) | (roundRobin % mapSidePartitionSplit)) distributes a large partition across
+ // multiple reducers. But this algorithm is not ideal in a scenario like the following:
+ //
+ // * input data spans 2 hours (hour-0 and hour-1), and the partitioning unit is hour.
+ // * there are 4 reducers.
+ // * with mapSidePartitionSplit = 2, hadoop uses only 3 reducers because
+ // hour-0 is partitioned to reducers 0 (v + 0) and 1 (v + 1)
+ // hour-1 is partitioned to reducers 1 (v + 0) and 2 (v + 1)
+ //
+ // So further optimization is needed to balance the load across reducers.
+ //
+ key.set((v << 16) | (roundRobin % mapSidePartitionSplit));
+ roundRobin++;
  return key;
  }
  }
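The comment in the hunk above describes how one time window is spread across several reducer keys. Below is a minimal standalone sketch of that arithmetic; the hour value and split factor are made up and the class is not part of the plugin:

// Standalone sketch, not the plugin's code: shows how the low 16 bits spread
// records of one partition unit across map_side_partition_split keys.
public class PartitionKeyEncodingSketch
{
    public static void main(String[] args)
    {
        int mapSidePartitionSplit = 2;         // hypothetical map_side_partition_split value
        long hourBucket = 1450684800L / 3600;  // hypothetical hour-truncated timestamp
        long roundRobin = 0;

        for (int record = 0; record < 4; record++) {
            long key = (hourBucket << 16) | (roundRobin % mapSidePartitionSplit);
            roundRobin++;
            System.out.println("record " + record + " -> key " + key);
        }
        // Consecutive records from the same hour alternate between two distinct keys,
        // so one large hour can be handled by up to two reducers instead of one.
    }
}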
@@ -260,9 +293,9 @@ public class TimestampPartitioning
  static class TimestampPartitioner
  extends AbstractTimestampPartitioner
  {
- public TimestampPartitioner(Column column, Unit unit)
+ public TimestampPartitioner(Column column, Unit unit, int mapSidePartitionSplit)
  {
- super(column, unit);
+ super(column, unit, mapSidePartitionSplit);
  }

  @Override
@@ -280,9 +313,10 @@ public class TimestampPartitioning
  private final UnixTimestampUnit unixTimestampUnit;

  public LongUnixTimestampPartitioner(Column column, Unit unit,
+ int mapSidePartitionSplit,
  UnixTimestampUnit unixTimestampUnit)
  {
- super(column, unit);
+ super(column, unit, mapSidePartitionSplit);
  this.unixTimestampUnit = unixTimestampUnit;
  }

@@ -1,8 +1,11 @@
  package org.embulk.executor.mapreduce;

  import com.google.common.base.Function;
+ import com.google.common.base.Strings;
+ import com.google.common.base.Throwables;
  import com.google.common.collect.ImmutableList;
  import com.google.common.collect.Iterables;
+ import com.google.common.collect.Lists;
  import com.google.inject.Binder;
  import com.google.inject.Module;
  import com.google.inject.Provider;
@@ -20,12 +23,20 @@ import org.junit.Test;
  import org.slf4j.ILoggerFactory;
  import org.slf4j.impl.Log4jLoggerFactory;

+ import java.io.BufferedInputStream;
+ import java.io.BufferedReader;
  import java.io.FileNotFoundException;
  import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.InputStreamReader;
+ import java.util.ArrayList;
+ import java.util.Collections;
+ import java.util.Comparator;
  import java.util.List;
  import java.util.Random;

  import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
+ import static org.junit.Assert.assertEquals;
  import static org.junit.Assert.assertTrue;
  import static org.junit.Assert.fail;

@@ -59,7 +70,13 @@ public class TestMapReduceExecutor
  {
  ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
  embulk.run(config);
- // TODO compare input and output
+ assertFileContent(
+ Lists.newArrayList(
+ "fixtures/csv/sample1.csv",
+ "fixtures/csv/sample1.csv"),
+ Lists.newArrayList(
+ "fixtures/csv/embulk_mapred_output.000.00.csv",
+ "fixtures/csv/embulk_mapred_output.001.00.csv"));
  }

  @Test
@@ -68,7 +85,13 @@ public class TestMapReduceExecutor
  {
  ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
  embulk.run(config);
- // TODO compare input and output
+ assertFileContent(
+ Lists.newArrayList(
+ "fixtures/csv/sample1.csv",
+ "fixtures/csv/sample1.csv"),
+ Lists.newArrayList(
+ "fixtures/csv/embulk_mapred_partitioning_output.000.00.csv",
+ "fixtures/csv/embulk_mapred_partitioning_output.001.00.csv"));
  }

  @Test
@@ -248,4 +271,64 @@ public class TestMapReduceExecutor
  return bootstrap;
  }
  }
+
+ private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
+ {
+ List<List<String>> inputRecords = getRecords(inputFiles);
+ Collections.sort(inputRecords, new RecordComparator());
+
+ List<List<String>> outputRecords = getRecords(outputFiles);
+ Collections.sort(outputRecords, new RecordComparator());
+
+ assertEquals(inputRecords, outputRecords);
+ }
+
+ private static class RecordComparator
+ implements Comparator<List<String>>
+ {
+ @Override
+ public int compare(List<String> r1, List<String> r2)
+ {
+ return r1.get(0).compareTo(r2.get(0));
+ }
+ }
+
+ private static List<List<String>> getRecords(List<String> files)
+ {
+ List<List<String>> records = new ArrayList<>();
+
+ try {
+ for (String file : files) {
+ try (BufferedReader r = newReader(file)) {
+ r.readLine(); // header
+ records.addAll(getRecords(r)); // contents
+ }
+ }
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+
+ return records;
+ }
+
+ private static List<List<String>> getRecords(BufferedReader reader)
+ throws IOException
+ {
+ List<List<String>> records = new ArrayList<>();
+
+ String line;
+ while (!Strings.isNullOrEmpty(line = reader.readLine())) {
+ String[] record = line.split(",");
+ records.add(Lists.newArrayList(record));
+ }
+
+ return records;
+ }
+
+ private static BufferedReader newReader(String filePath)
+ {
+ InputStream in = new BufferedInputStream(TestMapReduceExecutor.class.getClassLoader().getResourceAsStream(filePath));
+ return new BufferedReader(new InputStreamReader(in));
+ }
  }
@@ -117,8 +117,8 @@ public class TestTimestampPartitioning
  Column c1 = new Column(1, "c1", Types.TIMESTAMP);
  Schema schema = new Schema(Arrays.asList(c0, c1));

- LongUnixTimestampPartitioner lp = new LongUnixTimestampPartitioner(c0, Unit.HOUR, UnixTimestampUnit.SEC);
- TimestampPartitioner tp = new TimestampPartitioner(c1, Unit.HOUR);
+ LongUnixTimestampPartitioner lp = new LongUnixTimestampPartitioner(c0, Unit.HOUR, 1, UnixTimestampUnit.SEC);
+ TimestampPartitioner tp = new TimestampPartitioner(c1, Unit.HOUR, 1);

  long timeWindow = System.currentTimeMillis()/1000/3600*3600;
  PageReader r = new PageReader(schema);
@@ -35,4 +35,12 @@ in:
  - {name: d, type: double}
  - {name: flag, type: boolean}
  out:
- type: stdout
+ type: file
+ path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_output.'
+ file_ext: 'csv'
+ formatter:
+ charset: UTF-8
+ newline: CRLF
+ type: csv
+ column_options:
+ timestamp: {format: '%Y-%m-%d %H:%M:%S'}
@@ -16,6 +16,7 @@ exec:
  job_name: embulk_mapred_partitioning_0001
  exclude_jars:
  - '*log4j-over-slf4j*'
+ map_side_partition_split: 2
  in:
  type: file
  path_prefix: src/test/resources/fixtures/csv/sample
@@ -40,4 +41,12 @@ in:
  - {name: d, type: double}
  - {name: flag, type: boolean}
  out:
- type: stdout
+ type: file
+ path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_partitioning_output.'
+ file_ext: 'csv'
+ formatter:
+ charset: UTF-8
+ newline: CRLF
+ type: csv
+ column_options:
+ timestamp: {format: '%Y-%m-%d %H:%M:%S'}
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-executor-mapreduce
  version: !ruby/object:Gem::Version
- version: 0.2.3
+ version: 0.2.4
  platform: ruby
  authors:
  - Sadayuki Furuhashi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-11-11 00:00:00.000000000 Z
+ date: 2015-12-21 00:00:00.000000000 Z
  dependencies: []
  description: Executes tasks on Hadoop.
  email:
@@ -84,7 +84,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-executor-mapreduce-0.2.3.jar
+ - classpath/embulk-executor-mapreduce-0.2.4.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.0.jar
  - classpath/hadoop-auth-2.6.0.jar