embulk-executor-mapreduce 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 44794b114e21e1c5ced89169aa74b7d86a882d4d
-   data.tar.gz: 44ce570ea8b23c6d0c430598057022a1c36e73b2
+   metadata.gz: e8a1f6914bd6836006726de5d2f8badef02c614b
+   data.tar.gz: 97b14a2720664e78424dd8974865c97bbb4165de
  SHA512:
-   metadata.gz: cc9a2d3da2e8b89c0fb630652209897efcbf1f4ef7aebc71f82c6f3913bb6cb812bd2b583a84d7cac98b6dfd50d8380c92beac0a8406b0d69497400c3b40ee6d
-   data.tar.gz: 1f81b7fabaae57386f33c924bfd797e9fbe5f684e2a14cf87ccf68c2bc231a22b7aa5844f1ffeac9bee287fa082b34aaa45dbe6711445273ddf5635f0013a766
+   metadata.gz: 0b9c87ea48d10b8cab86e60aa251f0ce4d10720350cbe57c8ad229cb5a5e00d98ba05cb8e68e63218da9c619a34c100ef9da3e71bfd895a97b6d09e2ac816db4
+   data.tar.gz: 996dd45f438a2e420649627130021d20a295ce540611974bb50f79dc961b0f341adcce1ca944bd6facfd751bbfc430e944861e03f91fb57f7b785cf03c098735
Binary file
@@ -144,6 +144,7 @@ public class AttemptState
 
      public static AttemptState readFrom(InputStream in, ModelManager modelManager) throws IOException
      {
+         // If InputStream contains partial JSON (like '{"key":"va'), this method throws EOFException
          Scanner s = new Scanner(in, "UTF-8").useDelimiter("\\A"); // TODO
          if (s.hasNext()) {
              return modelManager.readObject(AttemptState.class, s.next());
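
For context on the comment added above: readFrom slurps the whole stream with a Scanner delimited by "\\A", so if the writer has not finished flushing the state file yet, the reader sees only a prefix of the JSON document and deserialization fails, which the new comment notes surfaces as an EOFException. A minimal standalone sketch of that failure mode, using Jackson directly rather than Embulk's ModelManager (the State class and the file contents below are illustrative only):

    import com.fasterxml.jackson.databind.ObjectMapper;

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Scanner;

    public class PartialJsonReadSketch
    {
        // Illustrative stand-in for the serialized attempt state; not the real AttemptState.
        public static class State
        {
            public String key;
        }

        public static void main(String[] args)
        {
            // Simulates opening the state file while the writer is still flushing it:
            // only a prefix of the JSON document is visible to the reader.
            InputStream in = new ByteArrayInputStream("{\"key\":\"va".getBytes(StandardCharsets.UTF_8));

            // Same slurping idiom as AttemptState.readFrom: \A matches the beginning of input,
            // so next() returns the whole remaining stream as a single token.
            Scanner s = new Scanner(in, "UTF-8").useDelimiter("\\A");
            String json = s.hasNext() ? s.next() : "";

            try {
                new ObjectMapper().readValue(json, State.class);
            } catch (IOException ex) {
                // Truncated JSON cannot be deserialized; Jackson reports an unexpected
                // end-of-input error, which callers have to treat as a transient condition.
                System.out.println("transient read failure: " + ex.getMessage());
            }
        }
    }

The retry added to readAttemptStateFile below treats exactly this kind of failure as transient.
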
@@ -1,5 +1,7 @@
  package org.embulk.executor.mapreduce;
 
+ import java.io.EOFException;
+ import java.io.InterruptedIOException;
  import java.util.List;
  import java.util.ArrayList;
  import java.util.concurrent.ExecutionException;
@@ -37,7 +39,12 @@ import org.embulk.spi.ExecAction;
  import org.embulk.spi.ExecSession;
  import org.embulk.spi.ProcessTask;
  import org.embulk.spi.util.Executors;
+ import org.embulk.spi.util.RetryExecutor.Retryable;
+ import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
  import org.embulk.EmbulkService;
+ import org.slf4j.Logger;
+
+ import static org.embulk.spi.util.RetryExecutor.retryExecutor;
 
  public class EmbulkMapReduce
  {
@@ -141,12 +148,52 @@ public class EmbulkMapReduce
          }
      }
 
-     public static AttemptState readAttemptStateFile(Configuration config,
-             Path stateDir, TaskAttemptID id, ModelManager modelManager) throws IOException
+     public static AttemptState readAttemptStateFile(final Configuration config,
+             Path stateDir, TaskAttemptID id, final ModelManager modelManager) throws IOException
      {
-         Path path = new Path(stateDir, id.toString());
-         try (FSDataInputStream in = path.getFileSystem(config).open(path)) {
-             return AttemptState.readFrom(in, modelManager);
+         final Logger log = Exec.getLogger(EmbulkMapReduce.class);
+         final Path path = new Path(stateDir, id.toString());
+         try {
+             return retryExecutor()
+                 .withRetryLimit(5)
+                 .withInitialRetryWait(2 * 1000)
+                 .withMaxRetryWait(20 * 1000)
+                 .runInterruptible(new Retryable<AttemptState>() {
+                     @Override
+                     public AttemptState call() throws IOException {
+                         try (FSDataInputStream in = path.getFileSystem(config).open(path)) {
+                             return AttemptState.readFrom(in, modelManager);
+                         }
+                     }
+
+                     @Override
+                     public boolean isRetryableException(Exception exception) {
+                         // AttemptState.readFrom throws 3 types of exceptions:
+                         // a) EOFException: race between readFrom and writeTo. See comments on AttemptState.readFrom.
+                         // b) IOException "Cannot obtain block length for LocatedBlock": HDFS-1058. See https://github.com/embulk/embulk-executor-mapreduce/pull/3
+                         // c) other IOException: FileSystem is not working
+                         //
+                         // a) and b) are temporary problems and are not critical. c) could be a temporary problem but is critical.
+                         // This method retries regardless of the exception type because we can't distinguish b) from c).
+                         return true;
+                     }
+
+                     @Override
+                     public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
+                             throws RetryGiveupException {
+                         log.warn("Retrying opening state file " + path.getName() + " error: " + exception);
+                     }
+
+                     @Override
+                     public void onGiveup(Exception firstException, Exception lastException)
+                             throws RetryGiveupException {
+                     }
+                 });
+         } catch (RetryGiveupException e) {
+             Throwables.propagateIfInstanceOf(e.getCause(), IOException.class);
+             throw Throwables.propagate(e.getCause());
+         } catch (InterruptedException e) {
+             throw new InterruptedIOException();
          }
      }
 
@@ -154,6 +201,7 @@ public class EmbulkMapReduce
          Path stateDir, AttemptState state, ModelManager modelManager) throws IOException
      {
          Path path = new Path(stateDir, state.getAttemptId().toString());
+         // TODO retry file create and write
          try (FSDataOutputStream out = path.getFileSystem(config).create(path, true)) {
              state.writeTo(out, modelManager);
          }
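
The retry policy above (5 retries, a 2-second initial wait, capped at 20 seconds) comes from Embulk's RetryExecutor; the TODO in writeAttemptStateFile notes that the write path still lacks the same treatment. A rough JDK-only sketch of the same control flow, to make the backoff explicit; readWithRetry and its Callable-based signature are this sketch's own, not the plugin's API, and RetryExecutor's exact backoff curve may differ:

    import java.io.IOException;
    import java.io.InterruptedIOException;
    import java.util.concurrent.Callable;

    public class RetrySketch
    {
        // Roughly mirrors retryExecutor().withRetryLimit(5).withInitialRetryWait(2000).withMaxRetryWait(20000):
        // retry every IOException, doubling the wait between attempts up to a cap.
        public static <T> T readWithRetry(Callable<T> reader, int retryLimit,
                long initialWaitMillis, long maxWaitMillis) throws IOException
        {
            long wait = initialWaitMillis;
            IOException lastException = null;
            for (int attempt = 0; attempt <= retryLimit; attempt++) {
                try {
                    return reader.call();
                } catch (IOException ex) {
                    // Could be a transient read race (EOFException), HDFS-1058, or a broken
                    // FileSystem; they are indistinguishable here, so retry all of them.
                    lastException = ex;
                } catch (Exception ex) {
                    throw new RuntimeException(ex);  // unexpected non-I/O failure
                }
                if (attempt == retryLimit) {
                    break;  // give up and rethrow the last I/O error below
                }
                try {
                    Thread.sleep(wait);
                } catch (InterruptedException ex) {
                    Thread.currentThread().interrupt();
                    throw new InterruptedIOException();
                }
                wait = Math.min(wait * 2, maxWaitMillis);
            }
            throw lastException;
        }
    }

Called with retryLimit=5, initialWaitMillis=2000, maxWaitMillis=20000, this performs one initial attempt plus up to five retries, waiting roughly 2, 4, 8, 16, and 20 seconds between them.
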
@@ -221,12 +221,14 @@ public class MapReduceExecutor
                      job.mapProgress() * 100, job.reduceProgress() * 100));
              Thread.sleep(interval);
 
-             updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+             updateProcessState(job, mapTaskCount, stateDir, state, modelManager, true);
          }
 
+         // Here skipUnavailable=false is passed to updateProcessState because the race
+         // condition between AttemptState.readFrom and .writeTo does not happen here.
          log.info(String.format("map %.1f%% reduce %.1f%%",
                  job.mapProgress() * 100, job.reduceProgress() * 100));
-         updateProcessState(job, mapTaskCount, stateDir, state, modelManager);
+         updateProcessState(job, mapTaskCount, stateDir, state, modelManager, false);
 
          Counters counters = job.getCounters();
          if (counters != null) {
@@ -291,7 +293,7 @@ public class MapReduceExecutor
      }
 
      private void updateProcessState(Job job, int mapTaskCount, Path stateDir,
-             ProcessState state, ModelManager modelManager) throws IOException
+             ProcessState state, ModelManager modelManager, boolean skipUnavailable) throws IOException
      {
          List<AttemptReport> reports = getAttemptReports(job.getConfiguration(), stateDir, modelManager);
 
@@ -299,8 +301,12 @@ public class MapReduceExecutor
              if (report == null) {
                  continue;
              }
-             if (!report.isStarted()) {
-                 continue;
+             if (!report.isAvailable()) {
+                 if (skipUnavailable) {
+                     continue;
+                 } else {
+                     throw report.getUnavailableException();
+                 }
              }
              AttemptState attempt = report.getAttemptState();
              if (attempt.getInputTaskIndex().isPresent()) {
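
The branch above works because AttemptReport (changed further down) now carries either a parsed AttemptState or the IOException that prevented reading it, and skipUnavailable decides whether the caller tolerates that failure (mid-job polling, where the state file may simply not be written yet) or surfaces it (the final update after the job has finished). A generic sketch of that holder shape, under names of this sketch's own choosing:

    import java.io.IOException;

    // Sketch of the "value or deferred I/O error" shape AttemptReport takes in this change:
    // it records either a successfully read value or the IOException that prevented reading it,
    // and defers the skip-or-throw decision to the caller.
    public final class ReportSketch<T>
    {
        private final T value;                 // non-null when the read succeeded
        private final IOException unavailable; // non-null when the read failed

        private ReportSketch(T value, IOException unavailable)
        {
            this.value = value;
            this.unavailable = unavailable;
        }

        public static <T> ReportSketch<T> available(T value)
        {
            return new ReportSketch<T>(value, null);
        }

        public static <T> ReportSketch<T> unavailable(IOException cause)
        {
            return new ReportSketch<T>(null, cause);
        }

        public boolean isAvailable()
        {
            return value != null;
        }

        // skipUnavailable=true: tolerate the failure (the report may not be written yet).
        // skipUnavailable=false: the job is done, so an unreadable report is a real error.
        public T getOrThrow(boolean skipUnavailable) throws IOException
        {
            if (value != null) {
                return value;
            }
            if (skipUnavailable) {
                return null;
            }
            throw unavailable;
        }
    }
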
@@ -338,23 +344,32 @@ public class MapReduceExecutor
      {
          private final TaskAttemptID attemptId;
          private final AttemptState attemptState;
+         private final IOException unavailableException;
 
-         public AttemptReport(TaskAttemptID attemptId)
+         public AttemptReport(TaskAttemptID attemptId, AttemptState attemptState)
          {
-             this(attemptId, null);
+             this.attemptId = attemptId;
+             this.attemptState = attemptState;
+             this.unavailableException = null;
          }
 
-         public AttemptReport(TaskAttemptID attemptId, AttemptState attemptState)
+         public AttemptReport(TaskAttemptID attemptId, IOException unavailableException)
          {
              this.attemptId = attemptId;
-             this.attemptState = attemptState;
+             this.attemptState = null;
+             this.unavailableException = unavailableException;
          }
 
-         public boolean isStarted()
+         public boolean isAvailable()
          {
              return attemptState != null;
          }
 
+         public IOException getUnavailableException()
+         {
+             return unavailableException;
+         }
+
          public boolean isInputCommitted()
          {
              return attemptState != null && attemptState.getInputCommitReport().isPresent();
@@ -382,8 +397,12 @@ public class MapReduceExecutor
                  AttemptState state = EmbulkMapReduce.readAttemptStateFile(config,
                          stateDir, aid, modelManager);
                  builder.add(new AttemptReport(aid, state));
-             } catch (EOFException ex) { // plus Not Found exception
-                 builder.add(new AttemptReport(aid, null));
+             } catch (IOException ex) {
+                 // Either of:
+                 // * race condition between AttemptState.writeTo and .readFrom
+                 // * FileSystem is not working
+                 // See also the comments in isRetryableException of EmbulkMapReduce.readAttemptStateFile.
+                 builder.add(new AttemptReport(aid, ex));
              }
          }
          return builder.build();
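
Tying the two sides together: the collecting code above wraps every IOException into an unavailable report without deciding anything, and only the consumer knows whether skipping is safe. A short usage sketch in terms of the illustrative ReportSketch holder shown earlier (again, illustrative names only):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    public class ReportFlowSketch
    {
        // Collecting side: never decide here, just record success or the failure cause.
        static ReportSketch<String> readOne(boolean simulateFailure)
        {
            try {
                if (simulateFailure) {
                    throw new IOException("Cannot obtain block length for LocatedBlock");
                }
                return ReportSketch.available("state for attempt");
            } catch (IOException ex) {
                return ReportSketch.unavailable(ex);
            }
        }

        // Consuming side: skipUnavailable=true while the job is running, false afterwards.
        static void update(List<ReportSketch<String>> reports, boolean skipUnavailable) throws IOException
        {
            for (ReportSketch<String> report : reports) {
                String state = report.getOrThrow(skipUnavailable);
                if (state == null) {
                    continue;  // unavailable and tolerated
                }
                System.out.println("processing " + state);
            }
        }

        public static void main(String[] args) throws IOException
        {
            List<ReportSketch<String>> reports = new ArrayList<>();
            reports.add(readOne(false));
            reports.add(readOne(true));

            update(reports, true);       // mid-job: the unreadable report is skipped
            try {
                update(reports, false);  // after completion: the recorded IOException is rethrown
            } catch (IOException ex) {
                System.out.println("final update failed: " + ex.getMessage());
            }
        }
    }
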
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-executor-mapreduce
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.1.1
  platform: ruby
  authors:
  - Sadayuki Furuhashi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-04-08 00:00:00.000000000 Z
+ date: 2015-06-22 00:00:00.000000000 Z
  dependencies: []
  description: Executes tasks on Hadoop.
  email:
@@ -42,6 +42,7 @@ files:
  - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
  - classpath/api-asn1-api-1.0.0-M20.jar
  - classpath/api-util-1.0.0-M20.jar
+ - classpath/asm-3.1.jar
  - classpath/avro-1.7.4.jar
  - classpath/commons-beanutils-1.7.0.jar
  - classpath/commons-cli-1.2.jar
@@ -59,7 +60,7 @@ files:
  - classpath/curator-client-2.6.0.jar
  - classpath/curator-framework-2.6.0.jar
  - classpath/curator-recipes-2.6.0.jar
- - classpath/embulk-executor-mapreduce-0.1.0.jar
+ - classpath/embulk-executor-mapreduce-0.1.1.jar
  - classpath/gson-2.2.4.jar
  - classpath/hadoop-annotations-2.6.0.jar
  - classpath/hadoop-auth-2.6.0.jar