embulk-executor-mapreduce 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3c12d59b65314dd94cf12e29b7caf8e83fd947bd
-  data.tar.gz: 478ea3a4e77c7a9e5395a4c4942ac9697772a10d
+  metadata.gz: b2abda7db750f6c161ab8867474fdccfa67eb265
+  data.tar.gz: 8cfc89242d0a57368b5803db9b55e6494b6916e6
 SHA512:
-  metadata.gz: e8c955d1e7a4e0b318bc21a36c51410045b4b4c9e62b954e8848f66693d26fe580a79afb8c8e1c1910f9ecd1731f78772b6b9af7f64e817b9666776da26b126c
-  data.tar.gz: 14fb4a34dbaf1b59b8d37ceac1356d4ab66014a559943f16da087d9f17800e46c80e9b98ae1583be5568ee8909034ba708bfa4ca3dab27ddde90b5c8e2ab8f59
+  metadata.gz: 0e8b2f14207ec85d1ba60b531cc5876c90689a2b259f76a5edb57d8b50d025bd5e7151307be0381d82e813b5f263d5326e39cbccaa56238d8aa3f1171e8a21fd
+  data.tar.gz: 645417db32cc29813fee20f2175a2858485596159fb80c31e77c1f45d1c715f40b239f7417ef0caae927f75428b8f93175eb477281d620fe38edae694e76f66a
EmbulkMapReduce.java CHANGED
@@ -61,6 +61,7 @@ public class EmbulkMapReduce
     private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
     private static final String CK_STATE_DIRECTORY_PATH = "embulk.mapreduce.stateDirectorypath";
     private static final String CK_TASK_COUNT = "embulk.mapreduce.taskCount";
+    private static final String CK_RETRY_TASKS = "embulk.mapreduce.retryTasks";
     private static final String CK_TASK = "embulk.mapreduce.task";
     private static final String CK_PLUGIN_ARCHIVE_SPECS = "embulk.mapreduce.pluginArchive.specs";
 
@@ -94,6 +95,16 @@ public class EmbulkMapReduce
         return config.getInt(CK_TASK_COUNT, 0);
     }
 
+    public static void setRetryTasks(Configuration config, boolean enabled)
+    {
+        config.setBoolean(CK_RETRY_TASKS, enabled);
+    }
+
+    public static boolean getRetryTasks(Configuration config)
+    {
+        return config.getBoolean(CK_RETRY_TASKS, false);
+    }
+
     public static void setStateDirectoryPath(Configuration config, Path path)
     {
         config.set(CK_STATE_DIRECTORY_PATH, path.toString());
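Note: the sketch below is illustrative and not part of the released diff (the class name RetryFlagRoundTrip is made up). It shows how the new flag rides on the job's Hadoop Configuration under the key "embulk.mapreduce.retryTasks", and that getRetryTasks falls back to false when the key was never set, so jobs written against 0.2.3 keep the old behavior of never rethrowing.

    import org.apache.hadoop.conf.Configuration;
    import org.embulk.executor.mapreduce.EmbulkMapReduce;

    class RetryFlagRoundTrip {
        public static void main(String[] args) {
            Configuration conf = new Configuration();
            // key "embulk.mapreduce.retryTasks" is absent, so this prints false
            System.out.println(EmbulkMapReduce.getRetryTasks(conf));
            EmbulkMapReduce.setRetryTasks(conf, true);
            // now prints true
            System.out.println(EmbulkMapReduce.getRetryTasks(conf));
        }
    }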
@@ -406,8 +417,11 @@ public class EmbulkMapReduce
             this.session = ExecSession.builder(embed.getInjector()).fromExecConfig(task.getExecConfig()).build();
 
             try {
+                // LocalDirAllocator allocates a directory for a job. Here adds attempt id to the path
+                // so that attempts running on the same machine don't conflict each other.
                 LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
-                Path destPath = localDirAllocator.getLocalPathForWrite("gems", config);
+                String dirName = context.getTaskAttemptID().toString() + "/embulk_gems";
+                Path destPath = localDirAllocator.getLocalPathForWrite(dirName, config);
                 this.localGemPath = new File(destPath.toString());
             } catch (IOException ex) {
                 throw new RuntimeException(ex);
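Note: adding the task attempt ID to the directory name means a retried or speculative attempt on the same node no longer shares the old fixed "gems" directory with an earlier attempt. The sketch below is illustrative only (the class name AttemptLocalDirNames is made up); it prints the per-attempt names that would be passed to LocalDirAllocator, which places them under one of the node's mapreduce.cluster.local.dir directories.

    import org.apache.hadoop.mapreduce.TaskAttemptID;
    import org.apache.hadoop.mapreduce.TaskType;

    class AttemptLocalDirNames {
        public static void main(String[] args) {
            // two attempts of the same map task (job 0001, task 3) on one machine
            TaskAttemptID first  = new TaskAttemptID("201512210000", 1, TaskType.MAP, 3, 0);
            TaskAttemptID second = new TaskAttemptID("201512210000", 1, TaskType.MAP, 3, 1);
            // attempt_201512210000_0001_m_000003_0/embulk_gems
            System.out.println(first.toString() + "/embulk_gems");
            // attempt_201512210000_0001_m_000003_1/embulk_gems
            System.out.println(second.toString() + "/embulk_gems");
        }
    }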
@@ -527,12 +541,14 @@ public class EmbulkMapReduce
     {
         private Context context;
         private SessionRunner runner;
+        private boolean retryTasks;
 
         @Override
         public void setup(Context context) throws IOException, InterruptedException
         {
             this.context = context;
             this.runner = new SessionRunner(context);
+            this.retryTasks = EmbulkMapReduce.getRetryTasks(context.getConfiguration());
 
             runner.execSession(new ExecAction<Void>() { // for Exec.getLogger
                 public Void run() throws IOException
@@ -566,16 +582,17 @@ public class EmbulkMapReduce
 
             try {
                 Executors.process(runner.getExecSession(), task, taskIndex, handler);
-            } catch (Throwable ex) {
+            }
+            catch (Exception ex) {
                 try {
                     handler.setException(ex);
                 } catch (Throwable e) {
                     e.addSuppressed(ex);
                     throw e;
                 }
-                //if (task.getTaskRecovery()) {
-                //    throw ex;
-                //}
+                if (retryTasks) {
+                    throw ex;
+                }
             }
         }
     }
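Note: handler.setException(ex) records the failure in the attempt state either way; what the new retry_tasks flag changes is whether the exception is rethrown so that Hadoop marks the map attempt as failed and reschedules it, up to the cluster's mapreduce.map.maxattempts limit (4 by default). With the default retry_tasks: false the attempt still finishes successfully from Hadoop's point of view and only the Embulk task is reported as failed. The sketch below is a simplified, illustrative rendering of that control flow (stubbed types; the addSuppressed fallback from the diff is omitted); it is not the plugin's API.

    class RetryPatternSketch {
        interface AttemptStateHandler { void setException(Throwable ex); }

        static void runTask(Runnable task, AttemptStateHandler handler, boolean retryTasks) {
            try {
                task.run();
            }
            catch (RuntimeException ex) {
                handler.setException(ex);  // record the failure for this Embulk task
                if (retryTasks) {
                    throw ex;              // propagate so Hadoop fails and retries the attempt
                }
                // otherwise swallow: the Hadoop attempt succeeds, only the Embulk task failed
            }
        }

        public static void main(String[] args) {
            runTask(() -> { throw new RuntimeException("input error"); },
                    ex -> System.out.println("recorded: " + ex.getMessage()),
                    false); // flip to true to see the exception propagate
        }
    }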
EmbulkPartitioningMapReduce.java CHANGED
@@ -128,16 +128,16 @@ public class EmbulkPartitioningMapReduce
                     filterPlugins, task.getFilterSchemas(), task.getFilterTaskSources(),
                     outputPlugin, task.getOutputSchema(), task.getOutputTaskSource(),
                     handler);
-            } catch (Throwable ex) {
+            }
+            catch (Exception ex) {
                 try {
                     handler.setException(ex);
                 } catch (Throwable e) {
                     e.addSuppressed(ex);
                     throw e;
                 }
-                //if (task.getTaskRecovery()) {
-                //    throw ex;
-                //}
+                // always throw this exception to not start reducers when input fails
+                throw ex;
             }
         }
     }
@@ -147,6 +147,7 @@ public class EmbulkPartitioningMapReduce
     {
         private Context context;
         private SessionRunner runner;
+        private boolean retryTasks;
         private AttemptStateUpdateHandler handler;
         private TransactionalPageOutput output;
         private boolean failed = false;
@@ -156,6 +157,7 @@ public class EmbulkPartitioningMapReduce
         {
             this.context = context;
             this.runner = new SessionRunner(context);
+            this.retryTasks = EmbulkMapReduce.getRetryTasks(context.getConfiguration());
 
             runner.execSession(new ExecAction<Void>() {
                 public Void run() throws Exception
@@ -199,7 +201,8 @@ public class EmbulkPartitioningMapReduce
                 for (PageWritable value : values) {
                     output.add(value.get());
                 }
-            } catch (Throwable ex) {
+            }
+            catch (Exception ex) {
                 failed = true;
                 try {
                     handler.setException(ex);
@@ -207,6 +210,9 @@ public class EmbulkPartitioningMapReduce
                     e.addSuppressed(ex);
                     throw e;
                 }
+                if (retryTasks) {
+                    throw ex;
+                }
             }
         }
 
MapReduceExecutor.java CHANGED
@@ -223,6 +223,7 @@ public class MapReduceExecutor
         EmbulkMapReduce.setSystemConfig(conf, modelManager, systemConfig);
         EmbulkMapReduce.setExecutorTask(conf, modelManager, task);
         EmbulkMapReduce.setMapTaskCount(conf, mapTaskCount); // used by EmbulkInputFormat
+        EmbulkMapReduce.setRetryTasks(conf, task.getRetryTasks());
         EmbulkMapReduce.setStateDirectoryPath(conf, stateDir);
 
         // jar files
MapReduceExecutorTask.java CHANGED
@@ -43,6 +43,10 @@ public interface MapReduceExecutorTask
     @ConfigDefault("null")
     public Optional<Integer> getReducers();
 
+    @Config("retry_tasks")
+    @ConfigDefault("false")
+    public boolean getRetryTasks();
+
     @Config("partitioning")
     @ConfigDefault("null")
     public Optional<ConfigSource> getPartitioning();
TimestampPartitioning.java CHANGED
@@ -1,5 +1,7 @@
 package org.embulk.executor.mapreduce;
 
+import javax.validation.constraints.Min;
+import javax.validation.constraints.Max;
 import com.google.common.annotations.VisibleForTesting;
 import org.joda.time.DateTimeZone;
 import com.google.common.base.Optional;
@@ -37,6 +39,12 @@ public class TimestampPartitioning
         @ConfigDefault("\"sec\"")
         public String getUnixTimestamp();
 
+        @Config("map_side_partition_split")
+        @ConfigDefault("1")
+        @Min(1)
+        @Max(65535) // TimestampPartitioning.LongPartitionKey encodes split number in 16-bit buffer
+        public int getMapSidePartitionSplit();
+
         public Column getTargetColumn();
         public void setTargetColumn(Column column);
     }
@@ -159,10 +167,19 @@ public class TimestampPartitioning
 
         Column column = task.getTargetColumn();
         if (column.getType() instanceof TimestampType) {
-            return new TimestampPartitioner(column, Unit.of(task.getUnit()));
-        } else if (column.getType() instanceof LongType) {
-            return new LongUnixTimestampPartitioner(column, Unit.of(task.getUnit()), UnixTimestampUnit.of(task.getUnixTimestamp()));
-        } else {
+            return new TimestampPartitioner(
+                    column,
+                    Unit.of(task.getUnit()),
+                    task.getMapSidePartitionSplit());
+        }
+        else if (column.getType() instanceof LongType) {
+            return new LongUnixTimestampPartitioner(
+                    column,
+                    Unit.of(task.getUnit()),
+                    task.getMapSidePartitionSplit(),
+                    UnixTimestampUnit.of(task.getUnixTimestamp()));
+        }
+        else {
             throw new AssertionError();
         }
     }
@@ -234,13 +251,17 @@ public class TimestampPartitioning
     {
         protected final Column column;
         protected final Unit unit;
+        protected final int mapSidePartitionSplit;
         private final LongPartitionKey key;
+        private long roundRobin;
 
-        public AbstractTimestampPartitioner(Column column, Unit unit)
+        public AbstractTimestampPartitioner(Column column, Unit unit, int mapSidePartitionSplit)
         {
             this.column = column;
             this.unit = unit;
+            this.mapSidePartitionSplit = mapSidePartitionSplit;
             this.key = new LongPartitionKey();
+            this.roundRobin = 0;
         }
 
         @Override
@@ -251,7 +272,19 @@ public class TimestampPartitioning
 
         protected LongPartitionKey updateKey(long v)
         {
-            key.set(v);
+            // ((v << 16) | (roundRobin % mapSidePartitionSplit)) is used to distribute a large partition to
+            // multiple reducers. But this algorithm is not ideal under following scenario:
+            //
+            // * input data is in 2 hour (hour-0 and hour-1), and partitioning unit is hour.
+            // * there're 4 reducers.
+            // * with mapSidePartitionSplit = 1, hadoop uses 3 reducers because
+            //   hour-0 is partitoned to reducer 0 (v + 0) and 1 (v + 1)
+            //   hour-1 is partitoned to reducer 1 (v + 0) and 2 (v + 1)
+            //
+            // So, here needs further optimization to distribute load of the reducers.
+            //
+            key.set((v << 16) | (roundRobin % mapSidePartitionSplit));
+            roundRobin++;
             return key;
         }
     }
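Note: a worked illustration of the new key encoding, not part of the diff (the class name PartitionKeyEncoding is made up). The bucket value v goes into the high bits and the round-robin counter into the low 16 bits, which is why map_side_partition_split is capped at 65535; the uneven-reducer scenario in the comment above, where one hour reaches two reducers, only arises once the split is 2 or more. How those keys then map onto reducers is up to the job's partitioner, which is the load-balancing caveat the comment points out.

    class PartitionKeyEncoding {
        public static void main(String[] args) {
            int split = 2;            // map_side_partition_split
            long hourBucket = 402900; // some value of v for one hour-sized bucket
            long roundRobin = 0;
            for (int record = 0; record < 4; record++) {
                long key = (hourBucket << 16) | (roundRobin % split);
                // keys alternate between 26404454400 and 26404454401, so the records
                // of a single large hour can be spread over two reducers
                System.out.println("record " + record + " -> key " + key);
                roundRobin++;
            }
        }
    }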
@@ -260,9 +293,9 @@ public class TimestampPartitioning
     static class TimestampPartitioner
             extends AbstractTimestampPartitioner
     {
-        public TimestampPartitioner(Column column, Unit unit)
+        public TimestampPartitioner(Column column, Unit unit, int mapSidePartitionSplit)
         {
-            super(column, unit);
+            super(column, unit, mapSidePartitionSplit);
         }
 
         @Override
@@ -280,9 +313,10 @@ public class TimestampPartitioning
         private final UnixTimestampUnit unixTimestampUnit;
 
         public LongUnixTimestampPartitioner(Column column, Unit unit,
+                int mapSidePartitionSplit,
                 UnixTimestampUnit unixTimestampUnit)
         {
-            super(column, unit);
+            super(column, unit, mapSidePartitionSplit);
             this.unixTimestampUnit = unixTimestampUnit;
         }
 
TestMapReduceExecutor.java CHANGED
@@ -1,8 +1,11 @@
 package org.embulk.executor.mapreduce;
 
 import com.google.common.base.Function;
+import com.google.common.base.Strings;
+import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
 import com.google.inject.Binder;
 import com.google.inject.Module;
 import com.google.inject.Provider;
@@ -20,12 +23,20 @@ import org.junit.Test;
 import org.slf4j.ILoggerFactory;
 import org.slf4j.impl.Log4jLoggerFactory;
 
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Random;
 
 import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -59,7 +70,13 @@ public class TestMapReduceExecutor
     {
         ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
         embulk.run(config);
-        // TODO compare input and output
+        assertFileContent(
+                Lists.newArrayList(
+                        "fixtures/csv/sample1.csv",
+                        "fixtures/csv/sample1.csv"),
+                Lists.newArrayList(
+                        "fixtures/csv/embulk_mapred_output.000.00.csv",
+                        "fixtures/csv/embulk_mapred_output.001.00.csv"));
     }
 
     @Test
@@ -68,7 +85,13 @@ public class TestMapReduceExecutor
     {
         ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
         embulk.run(config);
-        // TODO compare input and output
+        assertFileContent(
+                Lists.newArrayList(
+                        "fixtures/csv/sample1.csv",
+                        "fixtures/csv/sample1.csv"),
+                Lists.newArrayList(
+                        "fixtures/csv/embulk_mapred_partitioning_output.000.00.csv",
+                        "fixtures/csv/embulk_mapred_partitioning_output.001.00.csv"));
     }
 
     @Test
@@ -248,4 +271,64 @@ public class TestMapReduceExecutor
             return bootstrap;
         }
     }
+
+    private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
+    {
+        List<List<String>> inputRecords = getRecords(inputFiles);
+        Collections.sort(inputRecords, new RecordComparator());
+
+        List<List<String>> outputRecords = getRecords(outputFiles);
+        Collections.sort(outputRecords, new RecordComparator());
+
+        assertEquals(inputRecords, outputRecords);
+    }
+
+    private static class RecordComparator
+            implements Comparator<List<String>>
+    {
+        @Override
+        public int compare(List<String> r1, List<String> r2)
+        {
+            return r1.get(0).compareTo(r2.get(0));
+        }
+    }
+
+    private static List<List<String>> getRecords(List<String> files)
+    {
+        List<List<String>> records = new ArrayList<>();
+
+        try {
+            for (String file : files) {
+                try (BufferedReader r = newReader(file)) {
+                    r.readLine(); // header
+                    records.addAll(getRecords(r)); // contents
+                }
+            }
+        }
+        catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+
+        return records;
+    }
+
+    private static List<List<String>> getRecords(BufferedReader reader)
+            throws IOException
+    {
+        List<List<String>> records = new ArrayList<>();
+
+        String line;
+        while (!Strings.isNullOrEmpty(line = reader.readLine())) {
+            String[] record = line.split(",");
+            records.add(Lists.newArrayList(record));
+        }
+
+        return records;
+    }
+
+    private static BufferedReader newReader(String filePath)
+    {
+        InputStream in = new BufferedInputStream(TestMapReduceExecutor.class.getClassLoader().getResourceAsStream(filePath));
+        return new BufferedReader(new InputStreamReader(in));
+    }
 }
TestTimestampPartitioning.java CHANGED
@@ -117,8 +117,8 @@ public class TestTimestampPartitioning
         Column c1 = new Column(1, "c1", Types.TIMESTAMP);
         Schema schema = new Schema(Arrays.asList(c0, c1));
 
-        LongUnixTimestampPartitioner lp = new LongUnixTimestampPartitioner(c0, Unit.HOUR, UnixTimestampUnit.SEC);
-        TimestampPartitioner tp = new TimestampPartitioner(c1, Unit.HOUR);
+        LongUnixTimestampPartitioner lp = new LongUnixTimestampPartitioner(c0, Unit.HOUR, 1, UnixTimestampUnit.SEC);
+        TimestampPartitioner tp = new TimestampPartitioner(c1, Unit.HOUR, 1);
 
         long timeWindow = System.currentTimeMillis()/1000/3600*3600;
         PageReader r = new PageReader(schema);
config/embulk_mapred_config.yml CHANGED
@@ -35,4 +35,12 @@ in:
     - {name: d, type: double}
     - {name: flag, type: boolean}
 out:
-  type: stdout
+  type: file
+  path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_output.'
+  file_ext: 'csv'
+  formatter:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    column_options:
+      timestamp: {format: '%Y-%m-%d %H:%M:%S'}
config/embulk_mapred_partitioning_config.yml CHANGED
@@ -16,6 +16,7 @@ exec:
   job_name: embulk_mapred_partitioning_0001
   exclude_jars:
   - '*log4j-over-slf4j*'
+  map_side_partition_split: 2
 in:
   type: file
   path_prefix: src/test/resources/fixtures/csv/sample
@@ -40,4 +41,12 @@ in:
     - {name: d, type: double}
     - {name: flag, type: boolean}
 out:
-  type: stdout
+  type: file
+  path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_partitioning_output.'
+  file_ext: 'csv'
+  formatter:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    column_options:
+      timestamp: {format: '%Y-%m-%d %H:%M:%S'}
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-executor-mapreduce
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.2.4
 platform: ruby
 authors:
 - Sadayuki Furuhashi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-11 00:00:00.000000000 Z
+date: 2015-12-21 00:00:00.000000000 Z
 dependencies: []
 description: Executes tasks on Hadoop.
 email:
@@ -84,7 +84,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-executor-mapreduce-0.2.3.jar
+- classpath/embulk-executor-mapreduce-0.2.4.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar