embulk 0.8.35-java → 0.8.36-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +4 -0
  3. data/build.gradle +1 -1
  4. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkExample.java +5 -1
  5. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkRun.java +12 -0
  6. data/embulk-core/src/main/java/org/embulk/EmbulkRunner.java +2 -2
  7. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoader.java +802 -17
  8. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderFactory.java +8 -1
  9. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderModule.java +33 -2
  10. data/embulk-core/src/main/java/org/embulk/plugin/jar/JarPluginLoader.java +32 -5
  11. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +1 -6
  12. data/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java +39 -1
  13. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +21 -0
  14. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +21 -0
  15. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +43 -9
  16. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +46 -8
  17. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +19 -1
  18. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +11 -0
  19. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +12 -1
  20. data/embulk-core/src/main/resources/embulk/parent_first_packages.properties +1 -0
  21. data/embulk-docs/build.gradle +8 -0
  22. data/embulk-docs/src/built-in.rst +47 -35
  23. data/embulk-docs/src/index.rst +9 -1
  24. data/embulk-docs/src/release.rst +1 -0
  25. data/embulk-docs/src/release/release-0.8.36.rst +32 -0
  26. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +22 -0
  27. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +34 -1
  28. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +8 -2
  29. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +76 -0
  30. data/lib/embulk/guess/schema_guess.rb +1 -1
  31. data/lib/embulk/input_plugin.rb +8 -1
  32. data/lib/embulk/page_builder.rb +38 -5
  33. data/lib/embulk/schema.rb +5 -6
  34. data/lib/embulk/version.rb +1 -1
  35. data/test/guess/test_schema_guess.rb +18 -0
  36. metadata +7 -6
@@ -9,6 +9,7 @@ import org.embulk.spi.Column;
9
9
  import org.embulk.spi.PageReader;
10
10
  import org.embulk.spi.ColumnVisitor;
11
11
  import org.embulk.spi.type.TimestampType;
12
+ import org.joda.time.DateTimeZone;
12
13
 
13
14
  public class PagePrinter
14
15
  {
@@ -17,14 +18,29 @@ public class PagePrinter
17
18
  private final ArrayList<String> record;
18
19
 
19
20
  // TODO: Update this constructor because |TimestampFormatter.FormatterTask| has been deprecated since v0.6.14.
21
+ @Deprecated
20
22
  public PagePrinter(Schema schema, TimestampFormatter.FormatterTask task)
23
+ {
24
+ this(schema, task.getTimeZone());
25
+ // NOTE: Its deprecation is not actually from ScriptingContainer, though.
26
+ // TODO: Notify users about deprecated calls through the notification reporter.
27
+ if (!deprecationWarned) {
28
+ System.err.println("[WARN] Plugin uses deprecated constructor of org.embulk.spi.util.PagePrinter.");
29
+ System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/827");
30
+ // The |deprecationWarned| flag is used only for warning messages.
31
+ // Even in case of race conditions, messages are just duplicated -- should be acceptable.
32
+ deprecationWarned = true;
33
+ }
34
+ }
35
+
36
+ public PagePrinter(final Schema schema, final DateTimeZone timezone)
21
37
  {
22
38
  this.schema = schema;
23
39
  this.timestampFormatters = new TimestampFormatter[schema.getColumnCount()];
24
40
  for (int i=0; i < timestampFormatters.length; i++) {
25
41
  if (schema.getColumnType(i) instanceof TimestampType) {
26
42
  TimestampType type = (TimestampType) schema.getColumnType(i);
27
- timestampFormatters[i] = new TimestampFormatter(type.getFormat(), task.getTimeZone());
43
+ timestampFormatters[i] = new TimestampFormatter(type.getFormat(), timezone);
28
44
  }
29
45
  }
30
46
 
@@ -105,4 +121,6 @@ public class PagePrinter
105
121
  string = reader.getJson(column).toString();
106
122
  }
107
123
  }
124
+
125
+ private static boolean deprecationWarned = false;
108
126
  }
@@ -44,8 +44,17 @@ public abstract class AbstractDynamicColumnSetter
44
44
 
45
45
  public abstract void set(Value value);
46
46
 
47
+ @Deprecated
47
48
  public void setRubyObject(IRubyObject rubyObject)
48
49
  {
50
+ if (!deprecationWarned) {
51
+ System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.AbstractDynamicColumnSetter#setRubyObject");
52
+ System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
53
+ // The |deprecationWarned| flag is used only for warning messages.
54
+ // Even in case of race conditions, messages are just duplicated -- should be acceptable.
55
+ deprecationWarned = true;
56
+ }
57
+
49
58
  if (rubyObject == null || rubyObject instanceof RubyNil) {
50
59
  setNull();
51
60
  } else if (rubyObject instanceof RubyBoolean) {
@@ -80,4 +89,6 @@ public abstract class AbstractDynamicColumnSetter
80
89
  set(RubyValueApi.toValue(rubyObject.getRuntime(), rubyObject));
81
90
  }
82
91
  }
92
+
93
+ private static boolean deprecationWarned = false;
83
94
  }
@@ -51,7 +51,18 @@ public class SkipColumnSetter
51
51
  public void set(Value v)
52
52
  { }
53
53
 
54
+ @Deprecated
54
55
  @Override
55
56
  public void setRubyObject(IRubyObject rubyObject)
56
- { }
57
+ {
58
+ if (!deprecationWarned) {
59
+ System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.SkipColumnSetter#setRubyObject");
60
+ System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
61
+ // The |deprecationWarned| flag is used only for warning messages.
62
+ // Even in case of race conditions, messages are just duplicated -- should be acceptable.
63
+ deprecationWarned = true;
64
+ }
65
+ }
66
+
67
+ private static boolean deprecationWarned = false;
57
68
  }
@@ -40,6 +40,7 @@ edu.umd.cs.findbugs.annotations
40
40
  io.airlift.slice
41
41
  io.netty.buffer
42
42
  io.netty.util
43
+ java
43
44
  javax.annotation
44
45
  javax.inject
45
46
  javax.validation
@@ -1,3 +1,11 @@
1
+ // TODO: Remove this block once rubygems.lasagna.io is back, or jruby-gradle-jar-plugin is upgraded to 1.5.0.
2
+ // See also: https://github.com/jruby-gradle/jruby-gradle-plugin/issues/297
3
+ // jruby-gradle-jar-plugin is not upgraded yet because its 1.5.0 depends on Java 8.
4
+ repositories {
5
+ mavenLocal()
6
+ maven { url 'http://rubygems-proxy.torquebox.org/releases' }
7
+ }
8
+
1
9
  apply plugin: 'com.github.jruby-gradle.base'
2
10
 
3
11
  import com.github.jrubygradle.JRubyExec
@@ -191,41 +191,53 @@ The ``csv`` parser plugin parses CSV and TSV files.
191
191
  Options
192
192
  ~~~~~~~~
193
193
 
194
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
195
- | name | type | description | required? |
196
- +============================+==========+================================================================================================================+===========================+
197
- | delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default |
198
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
199
- | quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default |
200
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
201
- | escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
202
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
203
- | skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
204
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
205
- | null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump | |
206
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
207
- | trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
208
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
209
- | comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
210
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
211
- | allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
212
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
213
- | allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
214
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
215
- | max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
216
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
217
- | stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) | ``false`` by default |
218
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
219
- | default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
220
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
221
- | default\_date | string | Set date part if the format doesnt include date part. | ``1970-01-01`` by default |
222
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
223
- | newline | enum | Newline character (CRLF, LF or CR) | ``CRLF`` by default |
224
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
225
- | charset | enum | Character encoding (eg. ISO-8859-1, UTF-8) | ``UTF-8`` by default |
226
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
227
- | columns | hash | Columns (see below) | required |
228
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
194
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
195
+ | name | type | description | required? |
196
+ +============================+==========+================================================================================================================+============================================+
197
+ | delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default |
198
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
199
+ | quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default |
200
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
201
+ | escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
202
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
203
+ | skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
204
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
205
+ | null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump | |
206
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
207
+ | trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
208
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
209
+ | quotes\_in\_quoted\_fields | enum | Specify how to deal with irregular unescaped quote characters in quoted fields | ``ACCEPT_ONLY_RFC4180_ESCAPED`` by default |
210
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
211
+ | comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
212
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
213
+ | allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
214
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
215
+ | allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
216
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
217
+ | max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
218
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
219
+ | stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) | ``false`` by default |
220
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
221
+ | default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
222
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
223
+ | default\_date | string | Set date part if the format doesn’t include date part. | ``1970-01-01`` by default |
224
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
225
+ | newline | enum | Newline character (CRLF, LF or CR) | ``CRLF`` by default |
226
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
227
+ | charset | enum | Character encoding (eg. ISO-8859-1, UTF-8) | ``UTF-8`` by default |
228
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
229
+ | columns | hash | Columns (see below) | required |
230
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
231
+
232
+ The ``quotes_in_quoted_fields`` option specifies how to deal with irregular non-escaped stray quote characters.
233
+
234
+ +------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
235
+ | name | description |
236
+ +======================================================+=====================================================================================================================================================+
237
+ | ACCEPT_ONLY_RFC4180_ESCAPED | Default. Accept only specified and RFC 4180-style escaped quote characters. |
238
+ +------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
239
+ | ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS | Accept stray quotes as-is in the field. Instead, it behaves undefined if delimiters are in fields. ``"a"b"`` goes ``a"b``. ``"a""b"`` goes ``a"b``. |
240
+ +------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
229
241
 
230
242
  The ``columns`` option declares the list of columns. This CSV parser plugin ignores the header line.
231
243
 
@@ -7,9 +7,17 @@ Embulk
7
7
  ==================================
8
8
 
9
9
  .. image:: _static/embulk-logo-v2/embulk-logo-v2-sq-tr-small.png
10
- :width: 512px
10
+ :width: 128px
11
11
  :target: https://github.com/embulk/embulk
12
12
 
13
+ Highlights
14
+ -----------
15
+
16
+ * Embulk's announcement mailing list (read-only) is ready. Please feel free to subscribe! Embulk core members post important updates such as **key releases**, **compatibility information**, and **feedback requests to users**.
17
+
18
+ * `Embulk-announce <https://groups.google.com/forum/#!forum/embulk-announce>`_
19
+
20
+
13
21
  What's Embulk?
14
22
  ------------------
15
23
 
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.8.36
7
8
  release/release-0.8.35
8
9
  release/release-0.8.34
9
10
  release/release-0.8.33
@@ -0,0 +1,32 @@
1
+ Release 0.8.36
2
+ ==================================
3
+
4
+ General Changes
5
+ ----------------
6
+
7
+ * Load dependency JAR files embedded in plugin JAR [#792]
8
+ * Improve timestamp parsing in Ruby plugins [#812] [#814]
9
+ * Notify Embulk-announce mailing list in CLI [#816]
10
+
11
+ Bug Fixes
12
+ ----------
13
+
14
+ * Use single-quotes to quote path strings in YAML for Windows [#805]
15
+ * Truncate output file before overwriting it [#807]
16
+ * Fix typo in FALSE_STRINGS and add their test [#810]
17
+
18
+ Built-in plugins
19
+ -----------------
20
+
21
+ * Add new option ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS in CSV parser [#809]
22
+
23
+ Deprecations
24
+ -------------
25
+
26
+ * Deprecate JRuby-dependent classes and methods [#800] [#803] [#825]
27
+ * Warn explicitly for deprecated methods [#821] [#826]
28
+
29
+
30
+ Release Date
31
+ ------------------
32
+ 2017-10-24
@@ -77,6 +77,10 @@ public class CsvParserPlugin
77
77
  @ConfigDefault("false")
78
78
  boolean getTrimIfNotQuoted();
79
79
 
80
+ @Config("quotes_in_quoted_fields")
81
+ @ConfigDefault("\"ACCEPT_ONLY_RFC4180_ESCAPED\"")
82
+ QuotesInQuotedFields getQuotesInQuotedFields();
83
+
80
84
  @Config("max_quoted_size_limit")
81
85
  @ConfigDefault("131072") //128kB
82
86
  long getMaxQuotedSizeLimit();
@@ -98,6 +102,24 @@ public class CsvParserPlugin
98
102
  boolean getStopOnInvalidRecord();
99
103
  }
100
104
 
105
+ public enum QuotesInQuotedFields
106
+ {
107
+ ACCEPT_ONLY_RFC4180_ESCAPED,
108
+ ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS,
109
+ ;
110
+
111
+ @JsonCreator
112
+ public static QuotesInQuotedFields ofString(final String string)
113
+ {
114
+ for (final QuotesInQuotedFields value : values()) {
115
+ if (string.equals(value.toString())) {
116
+ return value;
117
+ }
118
+ }
119
+ throw new ConfigException("\"quotes_in_quoted_fields\" must be one of [ACCEPT_ONLY_RFC4180_ESCAPED, ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS].");
120
+ }
121
+ }
122
+
101
123
  public static class QuoteCharacter
102
124
  {
103
125
  private final char character;
@@ -8,6 +8,7 @@ import java.util.ArrayDeque;
8
8
  import org.embulk.spi.DataException;
9
9
  import org.embulk.spi.util.LineDecoder;
10
10
  import org.embulk.config.ConfigException;
11
+ import org.embulk.standards.CsvParserPlugin.QuotesInQuotedFields;
11
12
 
12
13
  public class CsvTokenizer
13
14
  {
@@ -31,6 +32,7 @@ public class CsvTokenizer
31
32
  private final char escape;
32
33
  private final String newline;
33
34
  private final boolean trimIfNotQuoted;
35
+ private final QuotesInQuotedFields quotesInQuotedFields;
34
36
  private final long maxQuotedSizeLimit;
35
37
  private final String commentLineMarker;
36
38
  private final LineDecoder input;
@@ -62,6 +64,12 @@ public class CsvTokenizer
62
64
  escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
63
65
  newline = task.getNewline().getString();
64
66
  trimIfNotQuoted = task.getTrimIfNotQuoted();
67
+ quotesInQuotedFields = task.getQuotesInQuotedFields();
68
+ if (trimIfNotQuoted && quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_ONLY_RFC4180_ESCAPED) {
69
+ // The combination makes some syntax very ambiguous such as:
70
+ // val1, \"\"val2\"\" ,val3
71
+ throw new ConfigException("[quotes_in_quoted_fields != ACCEPT_ONLY_RFC4180_ESCAPED] is not allowed to specify with [trim_if_not_quoted = true]");
72
+ }
65
73
  maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
66
74
  commentLineMarker = task.getCommentLineMarker().orNull();
67
75
  nullStringOrNull = task.getNullString().orNull();
@@ -313,9 +321,23 @@ public class CsvTokenizer
313
321
 
314
322
  } else if (isQuote(c)) {
315
323
  char next = peekNextChar();
316
- if (isQuote(next)) { // escaped quote
324
+ final char nextNext = peekNextNextChar();
325
+ if (isQuote(next) &&
326
+ (quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS ||
327
+ (!isDelimiter(nextNext) && !isEndOfLine(nextNext)))) {
328
+ // Escaped by preceding it with another quote.
329
+ // A quote just before a delimiter or an end of line is recognized as a functional quote,
330
+ // not just as a non-escaped stray "quote character" included the field, even if
331
+ // ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
317
332
  quotedValue.append(line.substring(valueStartPos, linePos));
318
333
  valueStartPos = ++linePos;
334
+ } else if (quotesInQuotedFields == QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS &&
335
+ !(isDelimiter(next) || isEndOfLine(next))) {
336
+ // A non-escaped stray "quote character" in the field is processed as a regular character
337
+ // if ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
338
+ if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
339
+ throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
340
+ }
319
341
  } else {
320
342
  quotedValue.append(line.substring(valueStartPos, linePos - 1));
321
343
  columnState = ColumnState.AFTER_QUOTED_VALUE;
@@ -427,6 +449,17 @@ public class CsvTokenizer
427
449
  }
428
450
  }
429
451
 
452
+ private char peekNextNextChar()
453
+ {
454
+ Preconditions.checkState(line != null, "peekNextNextChar is called after end of file");
455
+
456
+ if (linePos + 1 >= line.length()) {
457
+ return END_OF_LINE;
458
+ } else {
459
+ return line.charAt(linePos + 1);
460
+ }
461
+ }
462
+
430
463
  private boolean isSpace(char c)
431
464
  {
432
465
  return c == ' ';
@@ -1,6 +1,8 @@
1
1
  package org.embulk.standards;
2
2
 
3
3
  import java.util.List;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
4
6
  import org.embulk.config.ConfigSource;
5
7
  import org.embulk.config.TaskSource;
6
8
  import org.embulk.config.ConfigDiff;
@@ -14,13 +16,17 @@ import org.embulk.spi.OutputPlugin;
14
16
  import org.embulk.spi.TransactionalPageOutput;
15
17
  import org.embulk.spi.PageReader;
16
18
  import org.embulk.spi.util.PagePrinter;
19
+ import org.joda.time.DateTimeZone;
17
20
 
18
21
  public class StdoutOutputPlugin
19
22
  implements OutputPlugin
20
23
  {
21
24
  public interface PluginTask
22
- extends Task, TimestampFormatter.FormatterTask
25
+ extends Task
23
26
  {
27
+ @Config("timezone")
28
+ @ConfigDefault("\"UTC\"")
29
+ public DateTimeZone getTimeZone();
24
30
  }
25
31
 
26
32
  @Override
@@ -54,7 +60,7 @@ public class StdoutOutputPlugin
54
60
 
55
61
  return new TransactionalPageOutput() {
56
62
  private final PageReader reader = new PageReader(schema);
57
- private final PagePrinter printer = new PagePrinter(schema, task);
63
+ private final PagePrinter printer = new PagePrinter(schema, task.getTimeZone());
58
64
 
59
65
  public void add(Page page)
60
66
  {
@@ -293,6 +293,82 @@ public class TestCsvTokenizer
293
293
  "\"trailing\n3\" ,\"trailing\n4\" "));
294
294
  }
295
295
 
296
+
297
+ @Test
298
+ public void parseWithDefaultQuotesInQuotedFields() throws Exception
299
+ {
300
+ reloadPluginTask();
301
+ assertEquals(expectedRecords(
302
+ 2,
303
+ "foo\"bar", "foofoo\"barbar",
304
+ "baz\"\"qux", "bazbaz\"\"quxqux"),
305
+ parse(
306
+ task,
307
+ "\"foo\"\"bar\",\"foofoo\"\"barbar\"",
308
+ "\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
309
+ }
310
+
311
+ @Test
312
+ public void parseWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
313
+ {
314
+ config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
315
+ reloadPluginTask();
316
+ assertEquals(expectedRecords(
317
+ 2,
318
+ "foo\"bar", "foofoo\"barbar",
319
+ "baz\"\"qux", "bazbaz\"\"quxqux"),
320
+ parse(
321
+ task,
322
+ "\"foo\"\"bar\",\"foofoo\"\"barbar\"",
323
+ "\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
324
+ }
325
+
326
+ @Test
327
+ public void throwWithDefaultQuotesInQuotedFields() throws Exception
328
+ {
329
+ reloadPluginTask();
330
+ try {
331
+ parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
332
+ fail();
333
+ } catch (Exception e) {
334
+ assertTrue(e instanceof CsvTokenizer.InvalidValueException);
335
+ assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
336
+ return;
337
+ }
338
+ }
339
+
340
+ @Test
341
+ public void throwWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
342
+ {
343
+ config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
344
+ reloadPluginTask();
345
+ try {
346
+ parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
347
+ fail();
348
+ } catch (Exception e) {
349
+ assertTrue(e instanceof CsvTokenizer.InvalidValueException);
350
+ assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
351
+ return;
352
+ }
353
+ }
354
+
355
+ @Test
356
+ public void parseWithQuotesInQuotedFields_ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS() throws Exception
357
+ {
358
+ config.set("quotes_in_quoted_fields", "ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS");
359
+ reloadPluginTask();
360
+ assertEquals(expectedRecords(
361
+ 2,
362
+ "foo\"bar", "foofoo\"barbar",
363
+ "baz\"\"qux", "bazbaz\"\"quxqux",
364
+ "\"embulk\"", "\"embul\"\"k\""),
365
+ parse(
366
+ task,
367
+ "\"foo\"bar\",\"foofoo\"\"barbar\"",
368
+ "\"baz\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\"",
369
+ "\"\"\"embulk\"\",\"\"embul\"\"\"k\"\""));
370
+ }
371
+
296
372
  @Test
297
373
  public void throwQuotedSizeLimitExceededException() throws Exception
298
374
  {