embulk 0.8.35-java → 0.8.36-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +4 -0
  3. data/build.gradle +1 -1
  4. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkExample.java +5 -1
  5. data/embulk-cli/src/main/java/org/embulk/cli/EmbulkRun.java +12 -0
  6. data/embulk-core/src/main/java/org/embulk/EmbulkRunner.java +2 -2
  7. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoader.java +802 -17
  8. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderFactory.java +8 -1
  9. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderModule.java +33 -2
  10. data/embulk-core/src/main/java/org/embulk/plugin/jar/JarPluginLoader.java +32 -5
  11. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +1 -6
  12. data/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java +39 -1
  13. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +21 -0
  14. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +21 -0
  15. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +43 -9
  16. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +46 -8
  17. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +19 -1
  18. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +11 -0
  19. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +12 -1
  20. data/embulk-core/src/main/resources/embulk/parent_first_packages.properties +1 -0
  21. data/embulk-docs/build.gradle +8 -0
  22. data/embulk-docs/src/built-in.rst +47 -35
  23. data/embulk-docs/src/index.rst +9 -1
  24. data/embulk-docs/src/release.rst +1 -0
  25. data/embulk-docs/src/release/release-0.8.36.rst +32 -0
  26. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +22 -0
  27. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +34 -1
  28. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +8 -2
  29. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +76 -0
  30. data/lib/embulk/guess/schema_guess.rb +1 -1
  31. data/lib/embulk/input_plugin.rb +8 -1
  32. data/lib/embulk/page_builder.rb +38 -5
  33. data/lib/embulk/schema.rb +5 -6
  34. data/lib/embulk/version.rb +1 -1
  35. data/test/guess/test_schema_guess.rb +18 -0
  36. metadata +7 -6
@@ -9,6 +9,7 @@ import org.embulk.spi.Column;
9
9
  import org.embulk.spi.PageReader;
10
10
  import org.embulk.spi.ColumnVisitor;
11
11
  import org.embulk.spi.type.TimestampType;
12
+ import org.joda.time.DateTimeZone;
12
13
 
13
14
  public class PagePrinter
14
15
  {
@@ -17,14 +18,29 @@ public class PagePrinter
17
18
  private final ArrayList<String> record;
18
19
 
19
20
  // TODO: Update this constructor because |TimestampFormatter.FormatterTask| is deprecated since v0.6.14.
21
+ @Deprecated
20
22
  public PagePrinter(Schema schema, TimestampFormatter.FormatterTask task)
23
+ {
24
+ this(schema, task.getTimeZone());
25
+ // NOTE: Its deprecation is not actually from ScriptingContainer, though.
26
+ // TODO: Notify users about deprecated calls through the notification reporter.
27
+ if (!deprecationWarned) {
28
+ System.err.println("[WARN] Plugin uses deprecated constructor of org.embulk.spi.util.PagePrinter.");
29
+ System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/827");
30
+ // The |deprecationWarned| flag is used only for warning messages.
31
+ // Even in case of race conditions, messages are just duplicated -- should be acceptable.
32
+ deprecationWarned = true;
33
+ }
34
+ }
35
+
36
+ public PagePrinter(final Schema schema, final DateTimeZone timezone)
21
37
  {
22
38
  this.schema = schema;
23
39
  this.timestampFormatters = new TimestampFormatter[schema.getColumnCount()];
24
40
  for (int i=0; i < timestampFormatters.length; i++) {
25
41
  if (schema.getColumnType(i) instanceof TimestampType) {
26
42
  TimestampType type = (TimestampType) schema.getColumnType(i);
27
- timestampFormatters[i] = new TimestampFormatter(type.getFormat(), task.getTimeZone());
43
+ timestampFormatters[i] = new TimestampFormatter(type.getFormat(), timezone);
28
44
  }
29
45
  }
30
46
 
@@ -105,4 +121,6 @@ public class PagePrinter
105
121
  string = reader.getJson(column).toString();
106
122
  }
107
123
  }
124
+
125
+ private static boolean deprecationWarned = false;
108
126
  }
@@ -44,8 +44,17 @@ public abstract class AbstractDynamicColumnSetter
44
44
 
45
45
  public abstract void set(Value value);
46
46
 
47
+ @Deprecated
47
48
  public void setRubyObject(IRubyObject rubyObject)
48
49
  {
50
+ if (!deprecationWarned) {
51
+ System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.AbstractDynamicColumnSetter#setRubyObject");
52
+ System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
53
+ // The |deprecationWarned| flag is used only for warning messages.
54
+ // Even in case of race conditions, messages are just duplicated -- should be acceptable.
55
+ deprecationWarned = true;
56
+ }
57
+
49
58
  if (rubyObject == null || rubyObject instanceof RubyNil) {
50
59
  setNull();
51
60
  } else if (rubyObject instanceof RubyBoolean) {
@@ -80,4 +89,6 @@ public abstract class AbstractDynamicColumnSetter
80
89
  set(RubyValueApi.toValue(rubyObject.getRuntime(), rubyObject));
81
90
  }
82
91
  }
92
+
93
+ private static boolean deprecationWarned = false;
83
94
  }
@@ -51,7 +51,18 @@ public class SkipColumnSetter
51
51
  public void set(Value v)
52
52
  { }
53
53
 
54
+ @Deprecated
54
55
  @Override
55
56
  public void setRubyObject(IRubyObject rubyObject)
56
- { }
57
+ {
58
+ if (!deprecationWarned) {
59
+ System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.SkipColumnSetter#setRubyObject");
60
+ System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
61
+ // The |deprecationWarned| flag is used only for warning messages.
62
+ // Even in case of race conditions, messages are just duplicated -- should be acceptable.
63
+ deprecationWarned = true;
64
+ }
65
+ }
66
+
67
+ private static boolean deprecationWarned = false;
57
68
  }
@@ -40,6 +40,7 @@ edu.umd.cs.findbugs.annotations
40
40
  io.airlift.slice
41
41
  io.netty.buffer
42
42
  io.netty.util
43
+ java
43
44
  javax.annotation
44
45
  javax.inject
45
46
  javax.validation
@@ -1,3 +1,11 @@
1
+ // TODO: Remove this block once rubygems.lasagna.io is back, or jruby-gradle-jar-plugin is upgraded to 1.5.0.
2
+ // See also: https://github.com/jruby-gradle/jruby-gradle-plugin/issues/297
3
+ // jruby-gradle-jar-plugin is not upgraded yet because its 1.5.0 depends on Java 8.
4
+ repositories {
5
+ mavenLocal()
6
+ maven { url 'http://rubygems-proxy.torquebox.org/releases' }
7
+ }
8
+
1
9
  apply plugin: 'com.github.jruby-gradle.base'
2
10
 
3
11
  import com.github.jrubygradle.JRubyExec
@@ -191,41 +191,53 @@ The ``csv`` parser plugin parses CSV and TSV files.
191
191
  Options
192
192
  ~~~~~~~~
193
193
 
194
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
195
- | name | type | description | required? |
196
- +============================+==========+================================================================================================================+===========================+
197
- | delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default |
198
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
199
- | quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default |
200
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
201
- | escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
202
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
203
- | skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
204
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
205
- | null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump | |
206
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
207
- | trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
208
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
209
- | comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
210
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
211
- | allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
212
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
213
- | allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
214
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
215
- | max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
216
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
217
- | stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) | ``false`` by default |
218
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
219
- | default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
220
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
221
- | default\_date | string | Set date part if the format doesnt include date part. | ``1970-01-01`` by default |
222
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
223
- | newline | enum | Newline character (CRLF, LF or CR) | ``CRLF`` by default |
224
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
225
- | charset | enum | Character encoding (eg. ISO-8859-1, UTF-8) | ``UTF-8`` by default |
226
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
227
- | columns | hash | Columns (see below) | required |
228
- +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
194
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
195
+ | name | type | description | required? |
196
+ +============================+==========+================================================================================================================+============================================+
197
+ | delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default |
198
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
199
+ | quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default |
200
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
201
+ | escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
202
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
203
+ | skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
204
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
205
+ | null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump | |
206
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
207
+ | trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
208
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
209
+ | quotes\_in\_quoted\_fields | enum | Specify how to deal with irregular unescaped quote characters in quoted fields | ``ACCEPT_ONLY_RFC4180_ESCAPED`` by default |
210
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
211
+ | comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
212
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
213
+ | allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
214
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
215
+ | allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
216
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
217
+ | max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
218
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
219
+ | stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) | ``false`` by default |
220
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
221
+ | default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
222
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
223
+ | default\_date | string | Set date part if the format doesn’t include date part. | ``1970-01-01`` by default |
224
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
225
+ | newline | enum | Newline character (CRLF, LF or CR) | ``CRLF`` by default |
226
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
227
+ | charset | enum | Character encoding (eg. ISO-8859-1, UTF-8) | ``UTF-8`` by default |
228
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
229
+ | columns | hash | Columns (see below) | required |
230
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
231
+
232
+ The ``quotes_in_quoted_fields`` option specifies how to deal with irregular non-escaped stray quote characters.
233
+
234
+ +------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
235
+ | name | description |
236
+ +======================================================+=====================================================================================================================================================+
237
+ | ACCEPT_ONLY_RFC4180_ESCAPED | Default. Accept only specified and RFC 4180-style escaped quote characters. |
238
+ +------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
239
+ | ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS | Accept stray quotes as-is in the field. In exchange, the behavior is undefined if delimiters appear in fields. ``"a"b"`` becomes ``a"b``. ``"a""b"`` becomes ``a"b``. |
240
+ +------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
229
241
 
230
242
  The ``columns`` option declares the list of columns. This CSV parser plugin ignores the header line.
231
243
 
@@ -7,9 +7,17 @@ Embulk
7
7
  ==================================
8
8
 
9
9
  .. image:: _static/embulk-logo-v2/embulk-logo-v2-sq-tr-small.png
10
- :width: 512px
10
+ :width: 128px
11
11
  :target: https://github.com/embulk/embulk
12
12
 
13
+ Highlights
14
+ -----------
15
+
16
+ * Embulk's announcement mailing list (read-only) is ready. Please feel free to subscribe! Embulk core members post important updates such as **key releases**, **compatibility information**, and **feedback requests to users**.
17
+
18
+ * `Embulk-announce <https://groups.google.com/forum/#!forum/embulk-announce>`_
19
+
20
+
13
21
  What's Embulk?
14
22
  ------------------
15
23
 
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.8.36
7
8
  release/release-0.8.35
8
9
  release/release-0.8.34
9
10
  release/release-0.8.33
@@ -0,0 +1,32 @@
1
+ Release 0.8.36
2
+ ==================================
3
+
4
+ General Changes
5
+ ----------------
6
+
7
+ * Load dependency JAR files embedded in plugin JAR [#792]
8
+ * Improve timestamp parsing in Ruby plugins [#812] [#814]
9
+ * Notify Embulk-announce mailing list in CLI [#816]
10
+
11
+ Bug Fixes
12
+ ----------
13
+
14
+ * Use single-quotes to quote path strings in YAML for Windows [#805]
15
+ * Truncate output file before overwriting it [#807]
16
+ * Fix typo in FALSE_STRINGS and add their test [#810]
17
+
18
+ Built-in plugins
19
+ -----------------
20
+
21
+ * Add new option ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS in CSV parser [#809]
22
+
23
+ Deprecations
24
+ -------------
25
+
26
+ * Deprecate JRuby-dependent classes and methods [#800] [#803] [#825]
27
+ * Warn explicitly for deprecated methods [#821] [#826]
28
+
29
+
30
+ Release Date
31
+ ------------------
32
+ 2017-10-24
@@ -77,6 +77,10 @@ public class CsvParserPlugin
77
77
  @ConfigDefault("false")
78
78
  boolean getTrimIfNotQuoted();
79
79
 
80
+ @Config("quotes_in_quoted_fields")
81
+ @ConfigDefault("\"ACCEPT_ONLY_RFC4180_ESCAPED\"")
82
+ QuotesInQuotedFields getQuotesInQuotedFields();
83
+
80
84
  @Config("max_quoted_size_limit")
81
85
  @ConfigDefault("131072") //128kB
82
86
  long getMaxQuotedSizeLimit();
@@ -98,6 +102,24 @@ public class CsvParserPlugin
98
102
  boolean getStopOnInvalidRecord();
99
103
  }
100
104
 
105
+ public enum QuotesInQuotedFields
106
+ {
107
+ ACCEPT_ONLY_RFC4180_ESCAPED,
108
+ ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS,
109
+ ;
110
+
111
+ @JsonCreator
112
+ public static QuotesInQuotedFields ofString(final String string)
113
+ {
114
+ for (final QuotesInQuotedFields value : values()) {
115
+ if (string.equals(value.toString())) {
116
+ return value;
117
+ }
118
+ }
119
+ throw new ConfigException("\"quotes_in_quoted_fields\" must be one of [ACCEPT_ONLY_RFC4180_ESCAPED, ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS].");
120
+ }
121
+ }
122
+
101
123
  public static class QuoteCharacter
102
124
  {
103
125
  private final char character;
@@ -8,6 +8,7 @@ import java.util.ArrayDeque;
8
8
  import org.embulk.spi.DataException;
9
9
  import org.embulk.spi.util.LineDecoder;
10
10
  import org.embulk.config.ConfigException;
11
+ import org.embulk.standards.CsvParserPlugin.QuotesInQuotedFields;
11
12
 
12
13
  public class CsvTokenizer
13
14
  {
@@ -31,6 +32,7 @@ public class CsvTokenizer
31
32
  private final char escape;
32
33
  private final String newline;
33
34
  private final boolean trimIfNotQuoted;
35
+ private final QuotesInQuotedFields quotesInQuotedFields;
34
36
  private final long maxQuotedSizeLimit;
35
37
  private final String commentLineMarker;
36
38
  private final LineDecoder input;
@@ -62,6 +64,12 @@ public class CsvTokenizer
62
64
  escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
63
65
  newline = task.getNewline().getString();
64
66
  trimIfNotQuoted = task.getTrimIfNotQuoted();
67
+ quotesInQuotedFields = task.getQuotesInQuotedFields();
68
+ if (trimIfNotQuoted && quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_ONLY_RFC4180_ESCAPED) {
69
+ // The combination makes some syntax very ambiguous such as:
70
+ // val1, \"\"val2\"\" ,val3
71
+ throw new ConfigException("[quotes_in_quoted_fields != ACCEPT_ONLY_RFC4180_ESCAPED] is not allowed to specify with [trim_if_not_quoted = true]");
72
+ }
65
73
  maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
66
74
  commentLineMarker = task.getCommentLineMarker().orNull();
67
75
  nullStringOrNull = task.getNullString().orNull();
@@ -313,9 +321,23 @@ public class CsvTokenizer
313
321
 
314
322
  } else if (isQuote(c)) {
315
323
  char next = peekNextChar();
316
- if (isQuote(next)) { // escaped quote
324
+ final char nextNext = peekNextNextChar();
325
+ if (isQuote(next) &&
326
+ (quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS ||
327
+ (!isDelimiter(nextNext) && !isEndOfLine(nextNext)))) {
328
+ // Escaped by preceding it with another quote.
329
+ // A quote just before a delimiter or an end of line is recognized as a functional quote,
330
+ // not just as a non-escaped stray "quote character" included in the field, even if
331
+ // ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
317
332
  quotedValue.append(line.substring(valueStartPos, linePos));
318
333
  valueStartPos = ++linePos;
334
+ } else if (quotesInQuotedFields == QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS &&
335
+ !(isDelimiter(next) || isEndOfLine(next))) {
336
+ // A non-escaped stray "quote character" in the field is processed as a regular character
337
+ // if ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
338
+ if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
339
+ throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
340
+ }
319
341
  } else {
320
342
  quotedValue.append(line.substring(valueStartPos, linePos - 1));
321
343
  columnState = ColumnState.AFTER_QUOTED_VALUE;
@@ -427,6 +449,17 @@ public class CsvTokenizer
427
449
  }
428
450
  }
429
451
 
452
+ private char peekNextNextChar()
453
+ {
454
+ Preconditions.checkState(line != null, "peekNextNextChar is called after end of file");
455
+
456
+ if (linePos + 1 >= line.length()) {
457
+ return END_OF_LINE;
458
+ } else {
459
+ return line.charAt(linePos + 1);
460
+ }
461
+ }
462
+
430
463
  private boolean isSpace(char c)
431
464
  {
432
465
  return c == ' ';
@@ -1,6 +1,8 @@
1
1
  package org.embulk.standards;
2
2
 
3
3
  import java.util.List;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
4
6
  import org.embulk.config.ConfigSource;
5
7
  import org.embulk.config.TaskSource;
6
8
  import org.embulk.config.ConfigDiff;
@@ -14,13 +16,17 @@ import org.embulk.spi.OutputPlugin;
14
16
  import org.embulk.spi.TransactionalPageOutput;
15
17
  import org.embulk.spi.PageReader;
16
18
  import org.embulk.spi.util.PagePrinter;
19
+ import org.joda.time.DateTimeZone;
17
20
 
18
21
  public class StdoutOutputPlugin
19
22
  implements OutputPlugin
20
23
  {
21
24
  public interface PluginTask
22
- extends Task, TimestampFormatter.FormatterTask
25
+ extends Task
23
26
  {
27
+ @Config("timezone")
28
+ @ConfigDefault("\"UTC\"")
29
+ public DateTimeZone getTimeZone();
24
30
  }
25
31
 
26
32
  @Override
@@ -54,7 +60,7 @@ public class StdoutOutputPlugin
54
60
 
55
61
  return new TransactionalPageOutput() {
56
62
  private final PageReader reader = new PageReader(schema);
57
- private final PagePrinter printer = new PagePrinter(schema, task);
63
+ private final PagePrinter printer = new PagePrinter(schema, task.getTimeZone());
58
64
 
59
65
  public void add(Page page)
60
66
  {
@@ -293,6 +293,82 @@ public class TestCsvTokenizer
293
293
  "\"trailing\n3\" ,\"trailing\n4\" "));
294
294
  }
295
295
 
296
+
297
+ @Test
298
+ public void parseWithDefaultQuotesInQuotedFields() throws Exception
299
+ {
300
+ reloadPluginTask();
301
+ assertEquals(expectedRecords(
302
+ 2,
303
+ "foo\"bar", "foofoo\"barbar",
304
+ "baz\"\"qux", "bazbaz\"\"quxqux"),
305
+ parse(
306
+ task,
307
+ "\"foo\"\"bar\",\"foofoo\"\"barbar\"",
308
+ "\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
309
+ }
310
+
311
+ @Test
312
+ public void parseWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
313
+ {
314
+ config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
315
+ reloadPluginTask();
316
+ assertEquals(expectedRecords(
317
+ 2,
318
+ "foo\"bar", "foofoo\"barbar",
319
+ "baz\"\"qux", "bazbaz\"\"quxqux"),
320
+ parse(
321
+ task,
322
+ "\"foo\"\"bar\",\"foofoo\"\"barbar\"",
323
+ "\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
324
+ }
325
+
326
+ @Test
327
+ public void throwWithDefaultQuotesInQuotedFields() throws Exception
328
+ {
329
+ reloadPluginTask();
330
+ try {
331
+ parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
332
+ fail();
333
+ } catch (Exception e) {
334
+ assertTrue(e instanceof CsvTokenizer.InvalidValueException);
335
+ assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
336
+ return;
337
+ }
338
+ }
339
+
340
+ @Test
341
+ public void throwWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
342
+ {
343
+ config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
344
+ reloadPluginTask();
345
+ try {
346
+ parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
347
+ fail();
348
+ } catch (Exception e) {
349
+ assertTrue(e instanceof CsvTokenizer.InvalidValueException);
350
+ assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
351
+ return;
352
+ }
353
+ }
354
+
355
+ @Test
356
+ public void parseWithQuotesInQuotedFields_ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS() throws Exception
357
+ {
358
+ config.set("quotes_in_quoted_fields", "ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS");
359
+ reloadPluginTask();
360
+ assertEquals(expectedRecords(
361
+ 2,
362
+ "foo\"bar", "foofoo\"barbar",
363
+ "baz\"\"qux", "bazbaz\"\"quxqux",
364
+ "\"embulk\"", "\"embul\"\"k\""),
365
+ parse(
366
+ task,
367
+ "\"foo\"bar\",\"foofoo\"\"barbar\"",
368
+ "\"baz\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\"",
369
+ "\"\"\"embulk\"\",\"\"embul\"\"\"k\"\""));
370
+ }
371
+
296
372
  @Test
297
373
  public void throwQuotedSizeLimitExceededException() throws Exception
298
374
  {