embulk 0.6.7 → 0.6.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c812083e751a2593484a749af8ac7f0519cbe5f7
4
- data.tar.gz: d7d74ec957f1cbb9951850199524a3e039be4640
3
+ metadata.gz: a9e8a1f4e6ada76c6388248ae5ac9a2cdc6876ea
4
+ data.tar.gz: b21c680b792319509aa443b4e01809af8d3dcb56
5
5
  SHA512:
6
- metadata.gz: 9f727dfbf18ab2dc4e768b7ea132413c30fc63330a6aa3c0d5ffc4285623c7f03eaa66c177ed9c4c57d1232f59e582583cb2255c73d8f0a4045831f126f040de
7
- data.tar.gz: a0218b10b993ad8cb09afc3fc7f94454da50410979e70a74375d2c3d739077e20ae16b27e0fb30d369137cc8c29b6bddaace9955746f66fcb3ae8c72c7a39e82
6
+ metadata.gz: 7990996b97d6ae238dd20c8f997af9cb68f2d557b94b3d27388b8ede4e675c1c0f044072310d76ce37c4e4625497510ddecc9dee594a783ac91875f398c0137b
7
+ data.tar.gz: c2421ec9edd3f3e404302699526b1e226af19755186962b3b27074e53aebbfdb45dabf9e832c0b5a285bb40aad3e5a7aac7000724d735f43cae009b53150e2c9
data/build.gradle CHANGED
@@ -11,7 +11,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
11
11
 
12
12
  allprojects {
13
13
  group = 'org.embulk'
14
- version = '0.6.7'
14
+ version = '0.6.8'
15
15
 
16
16
  ext {
17
17
  jrubyVersion = '1.7.19'
@@ -1,9 +1,6 @@
1
- // include ruby scripts
2
- sourceSets {
3
- main.resources {
4
- srcDirs "${rootProject.projectDir}/lib"
5
- }
6
- }
1
+ // include ruby scripts to jar. don't use sourceSets.main.resources.srcDirs
2
+ // because IntelliJ causes error if srcDirs includes files out of projectDir.
3
+ processResources.from "${rootProject.projectDir}/lib"
7
4
 
8
5
  configurations {
9
6
  // com.google.inject:guice depends on asm and cglib but version of the libraries conflict
@@ -164,7 +164,7 @@ public class BulkLoader
164
164
  return outputTaskStates.get(outputTaskIndex);
165
165
  }
166
166
 
167
- public boolean isAllCommitted()
167
+ public boolean isAllTasksCommitted()
168
168
  {
169
169
  if (outputTaskStates == null) {
170
170
  // not initialized
@@ -178,6 +178,11 @@ public class BulkLoader
178
178
  return true;
179
179
  }
180
180
 
181
+ public boolean isAllTransactionsCommitted()
182
+ {
183
+ return inputConfigDiff != null && outputConfigDiff != null;
184
+ }
185
+
181
186
  public boolean isAnyStarted()
182
187
  {
183
188
  if (inputTaskStates == null) {
@@ -489,7 +494,7 @@ public class BulkLoader
489
494
 
490
495
  state.initialize(inputTaskCount, outputTaskCount);
491
496
 
492
- if (!state.isAllCommitted()) { // inputTaskCount == 0
497
+ if (!state.isAllTasksCommitted()) { // inputTaskCount == 0
493
498
  execute(task, executor, state);
494
499
  }
495
500
 
@@ -511,7 +516,7 @@ public class BulkLoader
511
516
  return state.buildExecuteResult();
512
517
 
513
518
  } catch (Throwable ex) {
514
- if (state.isAllCommitted()) {
519
+ if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
515
520
  // ignore the exception
516
521
  return state.buildExecuteResultWithWarningException(ex);
517
522
  }
@@ -554,7 +559,7 @@ public class BulkLoader
554
559
  state.setOutputTaskSource(outputTask);
555
560
 
556
561
  restoreResumedCommitReports(resume, state);
557
- if (!state.isAllCommitted()) {
562
+ if (!state.isAllTasksCommitted()) {
558
563
  execute(task, executor, state);
559
564
  }
560
565
 
@@ -576,7 +581,7 @@ public class BulkLoader
576
581
  return state.buildExecuteResult();
577
582
 
578
583
  } catch (Throwable ex) {
579
- if (state.isAllCommitted()) {
584
+ if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
580
585
  // ignore the exception
581
586
  return state.buildExecuteResultWithWarningException(ex);
582
587
  }
@@ -621,7 +626,7 @@ public class BulkLoader
621
626
 
622
627
  executor.execute(procTask, state);
623
628
 
624
- if (!state.isAllCommitted()) {
629
+ if (!state.isAllTasksCommitted()) {
625
630
  throw state.getRepresentativeException();
626
631
  }
627
632
  }
@@ -39,6 +39,11 @@ public class TimestampFormatter
39
39
  this.dateFormat = new RubyDateFormat(format, Locale.ENGLISH, true);
40
40
  }
41
41
 
42
+ public DateTimeZone getTimeZone()
43
+ {
44
+ return timeZone;
45
+ }
46
+
42
47
  public void format(Timestamp value, LineEncoder encoder)
43
48
  {
44
49
  // TODO optimize by directly appending to internal buffer
@@ -30,6 +30,11 @@ public class TimestampParser
30
30
  this(task.getJRuby(), format, task.getDefaultTimeZone());
31
31
  }
32
32
 
33
+ public DateTimeZone getDefaultTimeZone()
34
+ {
35
+ return defaultTimeZone;
36
+ }
37
+
33
38
  // TODO this is still private because this might need current time
34
39
  private TimestampParser(ScriptingContainer jruby, String format, DateTimeZone defaultTimeZone)
35
40
  {
@@ -0,0 +1,128 @@
1
+ package org.embulk.spi.util;
2
+
3
+ import java.io.InputStream;
4
+ import java.io.IOException;
5
+
6
/**
 * An {@link InputStream} that recovers from read failures by asking a
 * {@link Reopener} for a replacement stream positioned at the number of bytes
 * already delivered to the caller.
 *
 * <p>Whenever the wrapped stream throws an {@link IOException} or a
 * {@link RuntimeException} from {@code read} or {@code skip}, the wrapped
 * stream is closed, {@link Reopener#reopen(long, Exception)} is called with
 * the current offset, and the operation is retried on the new stream. There is
 * no retry limit here: the loop only ends when an operation succeeds or the
 * {@code Reopener} itself throws, so the {@code Reopener} decides when to give
 * up.
 *
 * <p>NOTE(review): a mark set with {@link #mark(int)} is only set on the
 * stream that was current at that time; it is not re-established on a stream
 * obtained through a later reopen.
 */
public class ResumableInputStream
        extends InputStream
{
    /**
     * Creates replacement streams for a {@link ResumableInputStream}.
     */
    public interface Reopener
    {
        /**
         * Opens a stream whose first byte is the byte at {@code offset} of the
         * original data.
         *
         * @param offset number of bytes already consumed by the caller
         * @param closedCause the failure that made reopening necessary, or
         *        {@code null} for the very first open
         * @return the stream to continue reading from
         * @throws IOException to give up; the exception is propagated to the reader
         */
        public InputStream reopen(long offset, Exception closedCause) throws IOException;
    }

    private final Reopener reopener;
    protected InputStream in;
    private long offset;
    private long markedOffset;

    /**
     * Wraps an already opened stream, assumed to be positioned at offset 0.
     *
     * @param initialInputStream the stream to read from until the first failure
     * @param reopener used to obtain a replacement stream after a failure
     */
    public ResumableInputStream(InputStream initialInputStream, Reopener reopener)
    {
        this.reopener = reopener;
        this.in = initialInputStream;
        this.offset = 0L;
        this.markedOffset = 0L;
    }

    /**
     * Opens the initial stream through {@code reopener.reopen(0, null)}.
     *
     * @param reopener used for the initial open and for every later reopen
     * @throws IOException if the initial open fails
     */
    public ResumableInputStream(Reopener reopener) throws IOException
    {
        this(reopener.reopen(0, null), reopener);
    }

    /**
     * Discards the current stream and replaces it with one positioned at the
     * current offset.
     */
    private void reopen(Exception closedCause) throws IOException
    {
        if (in != null) {
            InputStream broken = in;
            in = null;
            try {
                broken.close();
            } catch (IOException | RuntimeException closeFailure) {
                // The stream has already failed; a failure while closing it must not
                // prevent the reopen. Keep it attached to the original cause instead.
                if (closedCause != null && closedCause != closeFailure) {
                    closedCause.addSuppressed(closeFailure);
                }
            }
        }
        in = reopener.reopen(offset, closedCause);
    }

    /**
     * Reads one byte, reopening the stream on failure.
     *
     * @return the byte as an int in the range 0-255, or -1 at end of stream
     * @throws IOException if the {@link Reopener} gives up
     */
    @Override
    public int read() throws IOException
    {
        while (true) {
            try {
                int v = in.read();
                if (v >= 0) {
                    // -1 means end of stream: nothing was consumed
                    offset += 1;
                }
                return v;
            } catch (IOException | RuntimeException ex) {
                reopen(ex);
            }
        }
    }

    /**
     * Same as {@code read(b, 0, b.length)}.
     */
    @Override
    public int read(byte[] b) throws IOException
    {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to {@code len} bytes into {@code b}, reopening the stream on failure.
     *
     * @return the number of bytes read, or -1 at end of stream
     * @throws NullPointerException if {@code b} is null
     * @throws IndexOutOfBoundsException if {@code off} and {@code len} do not
     *         describe a range inside {@code b}
     * @throws IOException if the {@link Reopener} gives up
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        // Reject caller mistakes before entering the retry loop; otherwise the
        // resulting RuntimeException would be mistaken for a broken stream and
        // trigger reopen attempts.
        if (b == null) {
            throw new NullPointerException("b");
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " b.length=" + b.length);
        }

        while (true) {
            try {
                int r = in.read(b, off, len);
                if (r > 0) {
                    // -1 means end of stream: nothing was consumed
                    offset += r;
                }
                return r;
            } catch (IOException | RuntimeException ex) {
                reopen(ex);
            }
        }
    }

    /**
     * Skips up to {@code n} bytes, reopening the stream on failure.
     *
     * @return the number of bytes actually skipped
     * @throws IOException if the {@link Reopener} gives up
     */
    @Override
    public long skip(long n) throws IOException
    {
        while (true) {
            try {
                long r = in.skip(n);
                if (r > 0) {
                    offset += r;
                }
                return r;
            } catch (IOException | RuntimeException ex) {
                reopen(ex);
            }
        }
    }

    /**
     * Delegates to the current stream; failures are not retried.
     */
    @Override
    public int available() throws IOException
    {
        return in.available();
    }

    /**
     * Closes the current stream, if there is one.
     */
    @Override
    public void close() throws IOException
    {
        // in is null if a previous reopen attempt failed
        if (in != null) {
            in.close();
        }
    }

    /**
     * Marks the current stream and remembers the offset to restore on {@link #reset()}.
     */
    @Override
    public void mark(int readlimit)
    {
        in.mark(readlimit);
        markedOffset = offset;
    }

    /**
     * Resets the current stream and restores the offset recorded by {@link #mark(int)}.
     */
    @Override
    public void reset() throws IOException
    {
        in.reset();
        offset = markedOffset;
    }

    /**
     * Reports whether the current stream supports mark and reset.
     */
    @Override
    public boolean markSupported()
    {
        return in.markSupported();
    }
}
@@ -0,0 +1,130 @@
1
+ package org.embulk.spi.util;
2
+
3
+ import java.util.concurrent.Callable;
4
+ import java.util.concurrent.ExecutionException;
5
+
6
/**
 * Runs an operation repeatedly until it succeeds, fails with a non-retryable
 * exception, or exhausts the retry limit. The wait between attempts starts at
 * the initial retry wait and doubles after each retry up to the maximum retry
 * wait.
 *
 * <p>Instances are immutable; the {@code with...} methods return modified copies.
 */
public class RetryExecutor
{
    /**
     * Creates an executor with 3 retries, a 500 ms initial wait and a
     * 30 minute maximum wait.
     */
    public static RetryExecutor retryExecutor()
    {
        // TODO default configuration
        return new RetryExecutor(3, 500, 30*60*1000);
    }

    /**
     * Thrown when the executor stops retrying. The cause is always an
     * {@link Exception}.
     */
    public static class RetryGiveupException
            extends ExecutionException
    {
        public RetryGiveupException(String message, Exception cause)
        {
            // pass the message on; super(cause) alone would replace it with cause.toString()
            super(message, cause);
        }

        public RetryGiveupException(Exception cause)
        {
            super(cause);
        }

        @Override
        public Exception getCause()
        {
            // safe: both constructors only accept an Exception as cause
            return (Exception) super.getCause();
        }
    }

    /**
     * An operation that can be retried, together with its retry policy callbacks.
     */
    public static interface Retryable<T>
            extends Callable<T>
    {
        /**
         * Performs one attempt.
         */
        public T call()
            throws Exception;

        /**
         * Decides whether an attempt that failed with {@code exception} may be retried.
         */
        public boolean isRetryableException(Exception exception);

        /**
         * Called before waiting for the next attempt.
         *
         * @param exception the failure of the attempt that just finished
         * @param retryCount number of the upcoming retry, starting at 1
         * @param retryLimit maximum number of retries
         * @param retryWait milliseconds the executor is about to wait
         * @throws RetryGiveupException to stop retrying immediately
         */
        public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
            throws RetryGiveupException;

        /**
         * Called once when the executor gives up, right before it throws
         * {@link RetryGiveupException}.
         *
         * @param firstException the failure of the first attempt
         * @param lastException the failure of the last attempt
         */
        public void onGiveup(Exception firstException, Exception lastException)
            throws RetryGiveupException;
    }

    private final int retryLimit;
    private final int initialRetryWait;
    private final int maxRetryWait;

    private RetryExecutor(int retryLimit, int initialRetryWait, int maxRetryWait)
    {
        this.retryLimit = retryLimit;
        this.initialRetryWait = initialRetryWait;
        this.maxRetryWait = maxRetryWait;
    }

    /**
     * Returns a copy that retries at most {@code count} times.
     */
    public RetryExecutor withRetryLimit(int count)
    {
        return new RetryExecutor(count, initialRetryWait, maxRetryWait);
    }

    /**
     * Returns a copy that waits {@code msec} milliseconds before the first retry.
     */
    public RetryExecutor withInitialRetryWait(int msec)
    {
        return new RetryExecutor(retryLimit, msec, maxRetryWait);
    }

    /**
     * Returns a copy whose wait between retries never grows beyond {@code msec} milliseconds.
     */
    public RetryExecutor withMaxRetryWait(int msec)
    {
        return new RetryExecutor(retryLimit, initialRetryWait, msec);
    }

    /**
     * Runs {@code op} with retries; an interrupt during a wait aborts the run.
     *
     * @throws InterruptedException if the thread is interrupted while waiting
     * @throws RetryGiveupException if the operation finally fails
     */
    public <T> T runInterruptible(Retryable<T> op)
            throws InterruptedException, RetryGiveupException
    {
        return run(op, true);
    }

    /**
     * Runs {@code op} with retries; an interrupt during a wait only cuts that
     * wait short. The thread's interrupt status is restored before returning.
     *
     * @throws RetryGiveupException if the operation finally fails
     */
    public <T> T run(Retryable<T> op)
            throws RetryGiveupException
    {
        try {
            return run(op, false);
        } catch (InterruptedException ex) {
            throw new RetryGiveupException("Unexpected interruption", ex);
        }
    }

    private <T> T run(Retryable<T> op, boolean interruptible)
            throws InterruptedException, RetryGiveupException
    {
        int retryWait = initialRetryWait;
        int retryCount = 0;

        Exception firstException = null;
        boolean interruptSwallowed = false;

        try {
            while (true) {
                try {
                    return op.call();
                } catch (Exception exception) {
                    if (firstException == null) {
                        firstException = exception;
                    }
                    if (!op.isRetryableException(exception) || retryCount >= retryLimit) {
                        op.onGiveup(firstException, exception);
                        throw new RetryGiveupException(firstException);
                    }

                    retryCount++;
                    op.onRetry(exception, retryCount, retryLimit, retryWait);

                    try {
                        Thread.sleep(retryWait);
                    } catch (InterruptedException ex) {
                        if (interruptible) {
                            throw ex;
                        }
                        // Keep retrying, but remember the interrupt. It is restored on exit;
                        // restoring it now would make every following sleep return immediately.
                        interruptSwallowed = true;
                    }

                    // exponential back-off with hard limit; computed in long so that
                    // doubling cannot overflow before it is clamped
                    retryWait = (int) Math.min((long) retryWait * 2L, (long) maxRetryWait);
                }
            }
        } finally {
            if (interruptSwallowed) {
                Thread.currentThread().interrupt();
            }
        }
    }
}
130
+
@@ -145,6 +145,8 @@ Options
145
145
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
146
146
  | allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
147
147
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
148
+ | allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
149
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
148
150
  | max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
149
151
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
150
152
  | default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.6.8
7
8
  release/release-0.6.7
8
9
  release/release-0.6.6
9
10
  release/release-0.6.5
@@ -0,0 +1,24 @@
1
+ Release 0.6.8
2
+ ==================================
3
+
4
+ Plugin API
5
+ ------------------
6
+
7
+ * Added utility class ``spi.util.ResumableInputStream``
8
+ * Added utility class ``spi.util.RetryExecutor``
9
+
10
+ Built-in plugins
11
+ ------------------
12
+
13
+ * ``parser-csv`` rejects rows that include too many columns by default. Setting the ``allow_extra_columns`` option to ``true`` restores the previous behavior.
14
+ * ``guess-csv`` guesses ``columns`` option every time.
15
+
16
+ General Changes
17
+ ------------------
18
+
19
+ * Fixed a problem where IntelliJ IDEA reported errors when importing the embulk source code.
20
+ * Fixed a problem where a transaction silently succeeded when an exception happened after all tasks had succeeded.
21
+
22
+ Release Date
23
+ ------------------
24
+ 2015-05-12
@@ -78,6 +78,10 @@ public class CsvParserPlugin
78
78
  @Config("allow_optional_columns")
79
79
  @ConfigDefault("false")
80
80
  public boolean getAllowOptionalColumns();
81
+
82
+ @Config("allow_extra_columns")
83
+ @ConfigDefault("false")
84
+ public boolean getAllowExtraColumns();
81
85
  }
82
86
 
83
87
  private final Logger log;
@@ -130,6 +134,7 @@ public class CsvParserPlugin
130
134
  final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
131
135
  final String nullStringOrNull = task.getNullString().orNull();
132
136
  final boolean allowOptionalColumns = task.getAllowOptionalColumns();
137
+ final boolean allowExtraColumns = task.getAllowExtraColumns();
133
138
  int skipHeaderLines = task.getSkipHeaderLines();
134
139
 
135
140
  try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
@@ -141,12 +146,15 @@ public class CsvParserPlugin
141
146
  }
142
147
  }
143
148
 
149
+ if (!tokenizer.nextRecord()) {
150
+ // empty file
151
+ continue;
152
+ }
153
+
144
154
  while (true) {
145
- try {
146
- if (!tokenizer.nextRecord()) {
147
- break;
148
- }
155
+ boolean hasNextRecord;
149
156
 
157
+ try {
150
158
  schema.visitColumns(new ColumnVisitor() {
151
159
  public void booleanColumn(Column column)
152
160
  {
@@ -216,6 +224,7 @@ public class CsvParserPlugin
216
224
  private String nextColumn()
217
225
  {
218
226
  if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
227
+ //TODO warning
219
228
  return null;
220
229
  }
221
230
  String v = tokenizer.nextColumn();
@@ -231,13 +240,32 @@ public class CsvParserPlugin
231
240
  }
232
241
  }
233
242
  });
243
+
244
+ try {
245
+ hasNextRecord = tokenizer.nextRecord();
246
+ } catch (CsvTokenizer.TooManyColumnsException ex) {
247
+ if (allowExtraColumns) {
248
+ String tooManyColumnsLine = tokenizer.skipCurrentLine();
249
+ // TODO warning
250
+ hasNextRecord = tokenizer.nextRecord();
251
+ } else {
252
+ // this line will be skipped at the following catch section
253
+ throw ex;
254
+ }
255
+ }
234
256
  pageBuilder.addRecord();
235
257
 
236
258
  } catch (CsvTokenizer.InvalidFormatException | CsvRecordValidateException e) {
237
259
  long lineNumber = tokenizer.getCurrentLineNumber();
238
260
  String skippedLine = tokenizer.skipCurrentLine();
239
- log.warn(String.format("Skipped (line %d): %s", lineNumber, skippedLine), e);
261
+ log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
240
262
  //exec.notice().skippedLine(skippedLine);
263
+
264
+ hasNextRecord = tokenizer.nextRecord();
265
+ }
266
+
267
+ if (!hasNextRecord) {
268
+ break;
241
269
  }
242
270
  }
243
271
  }
@@ -75,24 +75,25 @@ module Embulk
75
75
  parser_guessed["skip_header_lines"] = skip_header_lines
76
76
  end
77
77
 
78
- unless parser_config.has_key?("columns")
79
- if header_line
80
- column_names = sample_records.first
81
- else
82
- column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
83
- end
84
- schema = []
85
- column_names.zip(other_types).each do |name,type|
86
- if name && type
87
- if type.is_a?(SchemaGuess::TimestampTypeMatch)
88
- schema << {"name" => name, "type" => type, "format" => type.format}
89
- else
90
- schema << {"name" => name, "type" => type}
91
- end
78
+ parser_guessed["allow_extra_columns"] = false
79
+ parser_guessed["allow_optional_columns"] = false
80
+
81
+ if header_line
82
+ column_names = sample_records.first
83
+ else
84
+ column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
85
+ end
86
+ schema = []
87
+ column_names.zip(other_types).each do |name,type|
88
+ if name && type
89
+ if type.is_a?(SchemaGuess::TimestampTypeMatch)
90
+ schema << {"name" => name, "type" => type, "format" => type.format}
91
+ else
92
+ schema << {"name" => name, "type" => type}
92
93
  end
93
94
  end
94
- parser_guessed["columns"] = schema
95
95
  end
96
+ parser_guessed["columns"] = schema
96
97
 
97
98
  return {"parser" => parser_guessed}
98
99
  end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.6.7'
2
+ VERSION = '0.6.8'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.7
4
+ version: 0.6.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-07 00:00:00.000000000 Z
11
+ date: 2015-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -228,6 +228,8 @@ files:
228
228
  - embulk-core/src/main/java/org/embulk/spi/util/OutputStreamFileOutput.java
229
229
  - embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java
230
230
  - embulk-core/src/main/java/org/embulk/spi/util/Pages.java
231
+ - embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java
232
+ - embulk-core/src/main/java/org/embulk/spi/util/RetryExecutor.java
231
233
  - embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java
232
234
  - embulk-core/src/test/java/org/embulk/GuiceBinder.java
233
235
  - embulk-core/src/test/java/org/embulk/RandomManager.java
@@ -297,6 +299,7 @@ files:
297
299
  - embulk-docs/src/release/release-0.6.5.rst
298
300
  - embulk-docs/src/release/release-0.6.6.rst
299
301
  - embulk-docs/src/release/release-0.6.7.rst
302
+ - embulk-docs/src/release/release-0.6.8.rst
300
303
  - embulk-standards/build.gradle
301
304
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
302
305
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -403,8 +406,8 @@ files:
403
406
  - classpath/bval-jsr303-0.5.jar
404
407
  - classpath/commons-beanutils-core-1.8.3.jar
405
408
  - classpath/commons-lang3-3.1.jar
406
- - classpath/embulk-core-0.6.7.jar
407
- - classpath/embulk-standards-0.6.7.jar
409
+ - classpath/embulk-core-0.6.8.jar
410
+ - classpath/embulk-standards-0.6.8.jar
408
411
  - classpath/guava-18.0.jar
409
412
  - classpath/guice-4.0.jar
410
413
  - classpath/guice-multibindings-4.0.jar