embulk 0.6.7 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c812083e751a2593484a749af8ac7f0519cbe5f7
4
- data.tar.gz: d7d74ec957f1cbb9951850199524a3e039be4640
3
+ metadata.gz: a9e8a1f4e6ada76c6388248ae5ac9a2cdc6876ea
4
+ data.tar.gz: b21c680b792319509aa443b4e01809af8d3dcb56
5
5
  SHA512:
6
- metadata.gz: 9f727dfbf18ab2dc4e768b7ea132413c30fc63330a6aa3c0d5ffc4285623c7f03eaa66c177ed9c4c57d1232f59e582583cb2255c73d8f0a4045831f126f040de
7
- data.tar.gz: a0218b10b993ad8cb09afc3fc7f94454da50410979e70a74375d2c3d739077e20ae16b27e0fb30d369137cc8c29b6bddaace9955746f66fcb3ae8c72c7a39e82
6
+ metadata.gz: 7990996b97d6ae238dd20c8f997af9cb68f2d557b94b3d27388b8ede4e675c1c0f044072310d76ce37c4e4625497510ddecc9dee594a783ac91875f398c0137b
7
+ data.tar.gz: c2421ec9edd3f3e404302699526b1e226af19755186962b3b27074e53aebbfdb45dabf9e832c0b5a285bb40aad3e5a7aac7000724d735f43cae009b53150e2c9
data/build.gradle CHANGED
@@ -11,7 +11,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
11
11
 
12
12
  allprojects {
13
13
  group = 'org.embulk'
14
- version = '0.6.7'
14
+ version = '0.6.8'
15
15
 
16
16
  ext {
17
17
  jrubyVersion = '1.7.19'
@@ -1,9 +1,6 @@
1
- // include ruby scripts
2
- sourceSets {
3
- main.resources {
4
- srcDirs "${rootProject.projectDir}/lib"
5
- }
6
- }
1
+ // include ruby scripts to jar. don't use sourceSets.main.resources.srcDirs
2
+ // because IntelliJ causes error if srcDirs includes files out of projectDir.
3
+ processResources.from "${rootProject.projectDir}/lib"
7
4
 
8
5
  configurations {
9
6
  // com.google.inject:guice depends on asm and cglib but version of the libraries conflict
@@ -164,7 +164,7 @@ public class BulkLoader
164
164
  return outputTaskStates.get(outputTaskIndex);
165
165
  }
166
166
 
167
- public boolean isAllCommitted()
167
+ public boolean isAllTasksCommitted()
168
168
  {
169
169
  if (outputTaskStates == null) {
170
170
  // not initialized
@@ -178,6 +178,11 @@ public class BulkLoader
178
178
  return true;
179
179
  }
180
180
 
181
+ public boolean isAllTransactionsCommitted()
182
+ {
183
+ return inputConfigDiff != null && outputConfigDiff != null;
184
+ }
185
+
181
186
  public boolean isAnyStarted()
182
187
  {
183
188
  if (inputTaskStates == null) {
@@ -489,7 +494,7 @@ public class BulkLoader
489
494
 
490
495
  state.initialize(inputTaskCount, outputTaskCount);
491
496
 
492
- if (!state.isAllCommitted()) { // inputTaskCount == 0
497
+ if (!state.isAllTasksCommitted()) { // inputTaskCount == 0
493
498
  execute(task, executor, state);
494
499
  }
495
500
 
@@ -511,7 +516,7 @@ public class BulkLoader
511
516
  return state.buildExecuteResult();
512
517
 
513
518
  } catch (Throwable ex) {
514
- if (state.isAllCommitted()) {
519
+ if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
515
520
  // ignore the exception
516
521
  return state.buildExecuteResultWithWarningException(ex);
517
522
  }
@@ -554,7 +559,7 @@ public class BulkLoader
554
559
  state.setOutputTaskSource(outputTask);
555
560
 
556
561
  restoreResumedCommitReports(resume, state);
557
- if (!state.isAllCommitted()) {
562
+ if (!state.isAllTasksCommitted()) {
558
563
  execute(task, executor, state);
559
564
  }
560
565
 
@@ -576,7 +581,7 @@ public class BulkLoader
576
581
  return state.buildExecuteResult();
577
582
 
578
583
  } catch (Throwable ex) {
579
- if (state.isAllCommitted()) {
584
+ if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
580
585
  // ignore the exception
581
586
  return state.buildExecuteResultWithWarningException(ex);
582
587
  }
@@ -621,7 +626,7 @@ public class BulkLoader
621
626
 
622
627
  executor.execute(procTask, state);
623
628
 
624
- if (!state.isAllCommitted()) {
629
+ if (!state.isAllTasksCommitted()) {
625
630
  throw state.getRepresentativeException();
626
631
  }
627
632
  }
@@ -39,6 +39,11 @@ public class TimestampFormatter
39
39
  this.dateFormat = new RubyDateFormat(format, Locale.ENGLISH, true);
40
40
  }
41
41
 
42
+ public DateTimeZone getTimeZone()
43
+ {
44
+ return timeZone;
45
+ }
46
+
42
47
  public void format(Timestamp value, LineEncoder encoder)
43
48
  {
44
49
  // TODO optimize by directly appending to internal buffer
@@ -30,6 +30,11 @@ public class TimestampParser
30
30
  this(task.getJRuby(), format, task.getDefaultTimeZone());
31
31
  }
32
32
 
33
+ public DateTimeZone getDefaultTimeZone()
34
+ {
35
+ return defaultTimeZone;
36
+ }
37
+
33
38
  // TODO this is still private because this might need current time
34
39
  private TimestampParser(ScriptingContainer jruby, String format, DateTimeZone defaultTimeZone)
35
40
  {
@@ -0,0 +1,128 @@
1
+ package org.embulk.spi.util;
2
+
3
+ import java.io.InputStream;
4
+ import java.io.IOException;
5
+
6
+ public class ResumableInputStream
7
+ extends InputStream
8
+ {
9
+ public interface Reopener
10
+ {
11
+ public InputStream reopen(long offset, Exception closedCause) throws IOException;
12
+ }
13
+
14
+ private final Reopener reopener;
15
+ protected InputStream in;
16
+ private long offset;
17
+ private long markedOffset;
18
+
19
+ public ResumableInputStream(InputStream initialInputStream, Reopener reopener)
20
+ {
21
+ this.reopener = reopener;
22
+ this.in = initialInputStream;
23
+ this.offset = 0L;
24
+ this.markedOffset = 0L;
25
+ }
26
+
27
+ public ResumableInputStream(Reopener reopener) throws IOException
28
+ {
29
+ this(reopener.reopen(0, null), reopener);
30
+ }
31
+
32
+ private void reopen(Exception closedCause) throws IOException
33
+ {
34
+ if (in != null) {
35
+ in.close();
36
+ in = null;
37
+ }
38
+ in = reopener.reopen(offset, closedCause);
39
+ }
40
+
41
+ @Override
42
+ public int read() throws IOException
43
+ {
44
+ while (true) {
45
+ try {
46
+ int v = in.read();
47
+ offset += 1;
48
+ return v;
49
+ } catch (IOException | RuntimeException ex) {
50
+ reopen(ex);
51
+ }
52
+ }
53
+ }
54
+
55
+ @Override
56
+ public int read(byte[] b) throws IOException
57
+ {
58
+ while (true) {
59
+ try {
60
+ int r = in.read(b);
61
+ offset += r;
62
+ return r;
63
+ } catch (IOException | RuntimeException ex) {
64
+ reopen(ex);
65
+ }
66
+ }
67
+ }
68
+
69
+ @Override
70
+ public int read(byte[] b, int off, int len) throws IOException
71
+ {
72
+ while (true) {
73
+ try {
74
+ int r = in.read(b, off, len);
75
+ offset += r;
76
+ return r;
77
+ } catch (IOException | RuntimeException ex) {
78
+ reopen(ex);
79
+ }
80
+ }
81
+ }
82
+
83
+ @Override
84
+ public long skip(long n) throws IOException
85
+ {
86
+ while (true) {
87
+ try {
88
+ long r = in.skip(n);
89
+ offset += r;
90
+ return r;
91
+ } catch (IOException | RuntimeException ex) {
92
+ reopen(ex);
93
+ }
94
+ }
95
+ }
96
+
97
+ @Override
98
+ public int available() throws IOException
99
+ {
100
+ return in.available();
101
+ }
102
+
103
+ @Override
104
+ public void close() throws IOException
105
+ {
106
+ in.close();
107
+ }
108
+
109
+ @Override
110
+ public void mark(int readlimit)
111
+ {
112
+ in.mark(readlimit);
113
+ markedOffset = offset;
114
+ }
115
+
116
+ @Override
117
+ public void reset() throws IOException
118
+ {
119
+ in.reset();
120
+ offset = markedOffset;
121
+ }
122
+
123
+ @Override
124
+ public boolean markSupported()
125
+ {
126
+ return in.markSupported();
127
+ }
128
+ }
@@ -0,0 +1,130 @@
1
+ package org.embulk.spi.util;
2
+
3
+ import java.util.concurrent.Callable;
4
+ import java.util.concurrent.ExecutionException;
5
+
6
+ public class RetryExecutor
7
+ {
8
+ public static RetryExecutor retryExecutor()
9
+ {
10
+ // TODO default configuration
11
+ return new RetryExecutor(3, 500, 30*60*1000);
12
+ }
13
+
14
+ public static class RetryGiveupException
15
+ extends ExecutionException
16
+ {
17
+ public RetryGiveupException(String message, Exception cause)
18
+ {
19
+ super(cause);
20
+ }
21
+
22
+ public RetryGiveupException(Exception cause)
23
+ {
24
+ super(cause);
25
+ }
26
+
27
+ public Exception getCause()
28
+ {
29
+ return (Exception) super.getCause();
30
+ }
31
+ }
32
+
33
+ public static interface Retryable<T>
34
+ extends Callable<T>
35
+ {
36
+ public T call()
37
+ throws Exception;
38
+
39
+ public boolean isRetryableException(Exception exception);
40
+
41
+ public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
42
+ throws RetryGiveupException;
43
+
44
+ public void onGiveup(Exception firstException, Exception lastException)
45
+ throws RetryGiveupException;
46
+ }
47
+
48
+ private final int retryLimit;
49
+ private final int initialRetryWait;
50
+ private final int maxRetryWait;
51
+
52
+ private RetryExecutor(int retryLimit, int initialRetryWait, int maxRetryWait)
53
+ {
54
+ this.retryLimit = retryLimit;
55
+ this.initialRetryWait = initialRetryWait;
56
+ this.maxRetryWait = maxRetryWait;
57
+ }
58
+
59
+ public RetryExecutor withRetryLimit(int count)
60
+ {
61
+ return new RetryExecutor(count, initialRetryWait, maxRetryWait);
62
+ }
63
+
64
+ public RetryExecutor withInitialRetryWait(int msec)
65
+ {
66
+ return new RetryExecutor(retryLimit, msec, maxRetryWait);
67
+ }
68
+
69
+ public RetryExecutor withMaxRetryWait(int msec)
70
+ {
71
+ return new RetryExecutor(retryLimit, initialRetryWait, msec);
72
+ }
73
+
74
+ public <T> T runInterruptible(Retryable<T> op)
75
+ throws InterruptedException, RetryGiveupException
76
+ {
77
+ return run(op, true);
78
+ }
79
+
80
+ public <T> T run(Retryable<T> op)
81
+ throws RetryGiveupException
82
+ {
83
+ try {
84
+ return run(op, false);
85
+ } catch (InterruptedException ex) {
86
+ throw new RetryGiveupException("Unexpected interruption", ex);
87
+ }
88
+ }
89
+
90
+ private <T> T run(Retryable<T> op, boolean interruptible)
91
+ throws InterruptedException, RetryGiveupException
92
+ {
93
+ int retryWait = initialRetryWait;
94
+ int retryCount = 0;
95
+
96
+ Exception firstException = null;
97
+
98
+ while(true) {
99
+ try {
100
+ return op.call();
101
+ } catch (Exception exception) {
102
+ if (firstException == null) {
103
+ firstException = exception;
104
+ }
105
+ if (!op.isRetryableException(exception) || retryCount >= retryLimit) {
106
+ op.onGiveup(firstException, exception);
107
+ throw new RetryGiveupException(firstException);
108
+ }
109
+
110
+ retryCount++;
111
+ op.onRetry(exception, retryCount, retryLimit, retryWait);
112
+
113
+ try {
114
+ Thread.sleep(retryWait);
115
+ } catch (InterruptedException ex) {
116
+ if (interruptible) {
117
+ throw ex;
118
+ }
119
+ }
120
+
121
+ // exponential back-off with hard limit
122
+ retryWait *= 2;
123
+ if (retryWait > maxRetryWait) {
124
+ retryWait = maxRetryWait;
125
+ }
126
+ }
127
+ }
128
+ }
129
+ }
130
+
@@ -145,6 +145,8 @@ Options
145
145
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
146
146
  | allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
147
147
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
148
+ | allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
149
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
148
150
  | max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
149
151
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
150
152
  | default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.6.8
7
8
  release/release-0.6.7
8
9
  release/release-0.6.6
9
10
  release/release-0.6.5
@@ -0,0 +1,24 @@
1
+ Release 0.6.8
2
+ ==================================
3
+
4
+ Plugin API
5
+ ------------------
6
+
7
+ * Added utility class ``spi.util.ResumableInputStream``
8
+ * Added utility class ``spi.util.RetryExecutor``
9
+
10
+ Built-in plugins
11
+ ------------------
12
+
13
+ * ``parser-csv`` rejects rows that include too many columns by default. Setting the ``allow_extra_columns`` option to ``true`` restores the previous behavior.
14
+ * ``guess-csv`` guesses ``columns`` option every time.
15
+
16
+ General Changes
17
+ ------------------
18
+
19
+ * Fixed a problem where importing the embulk source code into IntelliJ IDEA caused an error.
20
+ * Fixed a problem where a transaction silently succeeded when an exception happened after all tasks had succeeded.
21
+
22
+ Release Date
23
+ ------------------
24
+ 2015-05-12
@@ -78,6 +78,10 @@ public class CsvParserPlugin
78
78
  @Config("allow_optional_columns")
79
79
  @ConfigDefault("false")
80
80
  public boolean getAllowOptionalColumns();
81
+
82
+ @Config("allow_extra_columns")
83
+ @ConfigDefault("false")
84
+ public boolean getAllowExtraColumns();
81
85
  }
82
86
 
83
87
  private final Logger log;
@@ -130,6 +134,7 @@ public class CsvParserPlugin
130
134
  final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
131
135
  final String nullStringOrNull = task.getNullString().orNull();
132
136
  final boolean allowOptionalColumns = task.getAllowOptionalColumns();
137
+ final boolean allowExtraColumns = task.getAllowExtraColumns();
133
138
  int skipHeaderLines = task.getSkipHeaderLines();
134
139
 
135
140
  try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
@@ -141,12 +146,15 @@ public class CsvParserPlugin
141
146
  }
142
147
  }
143
148
 
149
+ if (!tokenizer.nextRecord()) {
150
+ // empty file
151
+ continue;
152
+ }
153
+
144
154
  while (true) {
145
- try {
146
- if (!tokenizer.nextRecord()) {
147
- break;
148
- }
155
+ boolean hasNextRecord;
149
156
 
157
+ try {
150
158
  schema.visitColumns(new ColumnVisitor() {
151
159
  public void booleanColumn(Column column)
152
160
  {
@@ -216,6 +224,7 @@ public class CsvParserPlugin
216
224
  private String nextColumn()
217
225
  {
218
226
  if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
227
+ //TODO warning
219
228
  return null;
220
229
  }
221
230
  String v = tokenizer.nextColumn();
@@ -231,13 +240,32 @@ public class CsvParserPlugin
231
240
  }
232
241
  }
233
242
  });
243
+
244
+ try {
245
+ hasNextRecord = tokenizer.nextRecord();
246
+ } catch (CsvTokenizer.TooManyColumnsException ex) {
247
+ if (allowExtraColumns) {
248
+ String tooManyColumnsLine = tokenizer.skipCurrentLine();
249
+ // TODO warning
250
+ hasNextRecord = tokenizer.nextRecord();
251
+ } else {
252
+ // this line will be skipped at the following catch section
253
+ throw ex;
254
+ }
255
+ }
234
256
  pageBuilder.addRecord();
235
257
 
236
258
  } catch (CsvTokenizer.InvalidFormatException | CsvRecordValidateException e) {
237
259
  long lineNumber = tokenizer.getCurrentLineNumber();
238
260
  String skippedLine = tokenizer.skipCurrentLine();
239
- log.warn(String.format("Skipped (line %d): %s", lineNumber, skippedLine), e);
261
+ log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
240
262
  //exec.notice().skippedLine(skippedLine);
263
+
264
+ hasNextRecord = tokenizer.nextRecord();
265
+ }
266
+
267
+ if (!hasNextRecord) {
268
+ break;
241
269
  }
242
270
  }
243
271
  }
@@ -75,24 +75,25 @@ module Embulk
75
75
  parser_guessed["skip_header_lines"] = skip_header_lines
76
76
  end
77
77
 
78
- unless parser_config.has_key?("columns")
79
- if header_line
80
- column_names = sample_records.first
81
- else
82
- column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
83
- end
84
- schema = []
85
- column_names.zip(other_types).each do |name,type|
86
- if name && type
87
- if type.is_a?(SchemaGuess::TimestampTypeMatch)
88
- schema << {"name" => name, "type" => type, "format" => type.format}
89
- else
90
- schema << {"name" => name, "type" => type}
91
- end
78
+ parser_guessed["allow_extra_columns"] = false
79
+ parser_guessed["allow_optional_columns"] = false
80
+
81
+ if header_line
82
+ column_names = sample_records.first
83
+ else
84
+ column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
85
+ end
86
+ schema = []
87
+ column_names.zip(other_types).each do |name,type|
88
+ if name && type
89
+ if type.is_a?(SchemaGuess::TimestampTypeMatch)
90
+ schema << {"name" => name, "type" => type, "format" => type.format}
91
+ else
92
+ schema << {"name" => name, "type" => type}
92
93
  end
93
94
  end
94
- parser_guessed["columns"] = schema
95
95
  end
96
+ parser_guessed["columns"] = schema
96
97
 
97
98
  return {"parser" => parser_guessed}
98
99
  end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.6.7'
2
+ VERSION = '0.6.8'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.7
4
+ version: 0.6.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-07 00:00:00.000000000 Z
11
+ date: 2015-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -228,6 +228,8 @@ files:
228
228
  - embulk-core/src/main/java/org/embulk/spi/util/OutputStreamFileOutput.java
229
229
  - embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java
230
230
  - embulk-core/src/main/java/org/embulk/spi/util/Pages.java
231
+ - embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java
232
+ - embulk-core/src/main/java/org/embulk/spi/util/RetryExecutor.java
231
233
  - embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java
232
234
  - embulk-core/src/test/java/org/embulk/GuiceBinder.java
233
235
  - embulk-core/src/test/java/org/embulk/RandomManager.java
@@ -297,6 +299,7 @@ files:
297
299
  - embulk-docs/src/release/release-0.6.5.rst
298
300
  - embulk-docs/src/release/release-0.6.6.rst
299
301
  - embulk-docs/src/release/release-0.6.7.rst
302
+ - embulk-docs/src/release/release-0.6.8.rst
300
303
  - embulk-standards/build.gradle
301
304
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
302
305
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -403,8 +406,8 @@ files:
403
406
  - classpath/bval-jsr303-0.5.jar
404
407
  - classpath/commons-beanutils-core-1.8.3.jar
405
408
  - classpath/commons-lang3-3.1.jar
406
- - classpath/embulk-core-0.6.7.jar
407
- - classpath/embulk-standards-0.6.7.jar
409
+ - classpath/embulk-core-0.6.8.jar
410
+ - classpath/embulk-standards-0.6.8.jar
408
411
  - classpath/guava-18.0.jar
409
412
  - classpath/guice-4.0.jar
410
413
  - classpath/guice-multibindings-4.0.jar