embulk 0.8.35-java → 0.8.36-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/build.gradle +1 -1
- data/embulk-cli/src/main/java/org/embulk/cli/EmbulkExample.java +5 -1
- data/embulk-cli/src/main/java/org/embulk/cli/EmbulkRun.java +12 -0
- data/embulk-core/src/main/java/org/embulk/EmbulkRunner.java +2 -2
- data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoader.java +802 -17
- data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderFactory.java +8 -1
- data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderModule.java +33 -2
- data/embulk-core/src/main/java/org/embulk/plugin/jar/JarPluginLoader.java +32 -5
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +1 -6
- data/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java +39 -1
- data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +43 -9
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +46 -8
- data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +19 -1
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +12 -1
- data/embulk-core/src/main/resources/embulk/parent_first_packages.properties +1 -0
- data/embulk-docs/build.gradle +8 -0
- data/embulk-docs/src/built-in.rst +47 -35
- data/embulk-docs/src/index.rst +9 -1
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.36.rst +32 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +22 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +34 -1
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +8 -2
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +76 -0
- data/lib/embulk/guess/schema_guess.rb +1 -1
- data/lib/embulk/input_plugin.rb +8 -1
- data/lib/embulk/page_builder.rb +38 -5
- data/lib/embulk/schema.rb +5 -6
- data/lib/embulk/version.rb +1 -1
- data/test/guess/test_schema_guess.rb +18 -0
- metadata +7 -6
@@ -9,6 +9,7 @@ import org.embulk.spi.Column;
|
|
9
9
|
import org.embulk.spi.PageReader;
|
10
10
|
import org.embulk.spi.ColumnVisitor;
|
11
11
|
import org.embulk.spi.type.TimestampType;
|
12
|
+
import org.joda.time.DateTimeZone;
|
12
13
|
|
13
14
|
public class PagePrinter
|
14
15
|
{
|
@@ -17,14 +18,29 @@ public class PagePrinter
|
|
17
18
|
private final ArrayList<String> record;
|
18
19
|
|
19
20
|
// TODO: Update this constructor because |TimestampFormatter.FormatterTask| is deprecated since v0.6.14.
|
21
|
+
@Deprecated
|
20
22
|
public PagePrinter(Schema schema, TimestampFormatter.FormatterTask task)
|
23
|
+
{
|
24
|
+
this(schema, task.getTimeZone());
|
25
|
+
// NOTE: Its deprecation is not actually from ScriptingContainer, though.
|
26
|
+
// TODO: Notify users about deprecated calls through the notification reporter.
|
27
|
+
if (!deprecationWarned) {
|
28
|
+
System.err.println("[WARN] Plugin uses deprecated constructor of org.embulk.spi.util.PagePrinter.");
|
29
|
+
System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/827");
|
30
|
+
// The |deprecationWarned| flag is used only for warning messages.
|
31
|
+
// Even in case of race conditions, messages are just duplicated -- should be acceptable.
|
32
|
+
deprecationWarned = true;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
public PagePrinter(final Schema schema, final DateTimeZone timezone)
|
21
37
|
{
|
22
38
|
this.schema = schema;
|
23
39
|
this.timestampFormatters = new TimestampFormatter[schema.getColumnCount()];
|
24
40
|
for (int i=0; i < timestampFormatters.length; i++) {
|
25
41
|
if (schema.getColumnType(i) instanceof TimestampType) {
|
26
42
|
TimestampType type = (TimestampType) schema.getColumnType(i);
|
27
|
-
timestampFormatters[i] = new TimestampFormatter(type.getFormat(),
|
43
|
+
timestampFormatters[i] = new TimestampFormatter(type.getFormat(), timezone);
|
28
44
|
}
|
29
45
|
}
|
30
46
|
|
@@ -105,4 +121,6 @@ public class PagePrinter
|
|
105
121
|
string = reader.getJson(column).toString();
|
106
122
|
}
|
107
123
|
}
|
124
|
+
|
125
|
+
private static boolean deprecationWarned = false;
|
108
126
|
}
|
@@ -44,8 +44,17 @@ public abstract class AbstractDynamicColumnSetter
|
|
44
44
|
|
45
45
|
public abstract void set(Value value);
|
46
46
|
|
47
|
+
@Deprecated
|
47
48
|
public void setRubyObject(IRubyObject rubyObject)
|
48
49
|
{
|
50
|
+
if (!deprecationWarned) {
|
51
|
+
System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.AbstractDynamicColumnSetter#setRubyObject");
|
52
|
+
System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
|
53
|
+
// The |deprecationWarned| flag is used only for warning messages.
|
54
|
+
// Even in case of race conditions, messages are just duplicated -- should be acceptable.
|
55
|
+
deprecationWarned = true;
|
56
|
+
}
|
57
|
+
|
49
58
|
if (rubyObject == null || rubyObject instanceof RubyNil) {
|
50
59
|
setNull();
|
51
60
|
} else if (rubyObject instanceof RubyBoolean) {
|
@@ -80,4 +89,6 @@ public abstract class AbstractDynamicColumnSetter
|
|
80
89
|
set(RubyValueApi.toValue(rubyObject.getRuntime(), rubyObject));
|
81
90
|
}
|
82
91
|
}
|
92
|
+
|
93
|
+
private static boolean deprecationWarned = false;
|
83
94
|
}
|
@@ -51,7 +51,18 @@ public class SkipColumnSetter
|
|
51
51
|
public void set(Value v)
|
52
52
|
{ }
|
53
53
|
|
54
|
+
@Deprecated
|
54
55
|
@Override
|
55
56
|
public void setRubyObject(IRubyObject rubyObject)
|
56
|
-
{
|
57
|
+
{
|
58
|
+
if (!deprecationWarned) {
|
59
|
+
System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.SkipColumnSetter#setRubyObject");
|
60
|
+
System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
|
61
|
+
// The |deprecationWarned| flag is used only for warning messages.
|
62
|
+
// Even in case of race conditions, messages are just duplicated -- should be acceptable.
|
63
|
+
deprecationWarned = true;
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
private static boolean deprecationWarned = false;
|
57
68
|
}
|
data/embulk-docs/build.gradle
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
// TODO: Remove this block once rubygems.lasagna.io is back, or jruby-gradle-jar-plugin is upgraded to 1.5.0.
|
2
|
+
// See also: https://github.com/jruby-gradle/jruby-gradle-plugin/issues/297
|
3
|
+
// jruby-gradle-jar-plugin is not upgraded yet because its 1.5.0 depends on Java 8.
|
4
|
+
repositories {
|
5
|
+
mavenLocal()
|
6
|
+
maven { url 'http://rubygems-proxy.torquebox.org/releases' }
|
7
|
+
}
|
8
|
+
|
1
9
|
apply plugin: 'com.github.jruby-gradle.base'
|
2
10
|
|
3
11
|
import com.github.jrubygradle.JRubyExec
|
@@ -191,41 +191,53 @@ The ``csv`` parser plugin parses CSV and TSV files.
|
|
191
191
|
Options
|
192
192
|
~~~~~~~~
|
193
193
|
|
194
|
-
|
195
|
-
| name | type | description |
|
196
|
-
|
197
|
-
| delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default
|
198
|
-
|
199
|
-
| quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default
|
200
|
-
|
201
|
-
| escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default
|
202
|
-
|
203
|
-
| skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default
|
204
|
-
|
205
|
-
| null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump |
|
206
|
-
|
207
|
-
| trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default
|
208
|
-
|
209
|
-
|
|
210
|
-
|
211
|
-
|
|
212
|
-
|
213
|
-
| allow\
|
214
|
-
|
215
|
-
|
|
216
|
-
|
217
|
-
|
|
218
|
-
|
219
|
-
|
|
220
|
-
|
221
|
-
| default\
|
222
|
-
|
223
|
-
|
|
224
|
-
|
225
|
-
|
|
226
|
-
|
227
|
-
|
|
228
|
-
|
194
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
195
|
+
| name | type | description | required? |
|
196
|
+
+============================+==========+================================================================================================================+============================================+
|
197
|
+
| delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default |
|
198
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
199
|
+
| quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default |
|
200
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
201
|
+
| escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
|
202
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
203
|
+
| skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
|
204
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
205
|
+
| null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump | |
|
206
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
207
|
+
| trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
|
208
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
209
|
+
| quotes\_in\_quoted\_fields | enum | Specify how to deal with irregular unescaped quote characters in quoted fields | ``ACCEPT_ONLY_RFC4180_ESCAPED`` by default |
|
210
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
211
|
+
| comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
|
212
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
213
|
+
| allow\_optional\_columns | boolean | If true, fill missing columns with null. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
|
214
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
215
|
+
| allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
|
216
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
217
|
+
| max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
|
218
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
219
|
+
| stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) | ``false`` by default |
|
220
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
221
|
+
| default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
|
222
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
223
|
+
| default\_date | string | Set date part if the format doesn’t include date part. | ``1970-01-01`` by default |
|
224
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
225
|
+
| newline | enum | Newline character (CRLF, LF or CR) | ``CRLF`` by default |
|
226
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
227
|
+
| charset | enum | Character encoding (eg. ISO-8859-1, UTF-8) | ``UTF-8`` by default |
|
228
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
229
|
+
| columns | hash | Columns (see below) | required |
|
230
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
231
|
+
|
232
|
+
The ``quotes_in_quoted_fields`` option specifies how to deal with irregular non-escaped stray quote characters.
|
233
|
+
|
234
|
+
+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|
235
|
+
| name | description |
|
236
|
+
+======================================================+=====================================================================================================================================================+
|
237
|
+
| ACCEPT_ONLY_RFC4180_ESCAPED | Default. Accept only specified and RFC 4180-style escaped quote characters. |
|
238
|
+
+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|
239
|
+
| ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS | Accept stray quotes as-is in the field. But the behavior is undefined if delimiters are in fields. ``"a"b"`` goes ``a"b``. ``"a""b"`` goes ``a"b``. |
|
240
|
+
+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|
229
241
|
|
230
242
|
The ``columns`` option declares the list of columns. This CSV parser plugin ignores the header line.
|
231
243
|
|
data/embulk-docs/src/index.rst
CHANGED
@@ -7,9 +7,17 @@ Embulk
|
|
7
7
|
==================================
|
8
8
|
|
9
9
|
.. image:: _static/embulk-logo-v2/embulk-logo-v2-sq-tr-small.png
|
10
|
-
:width:
|
10
|
+
:width: 128px
|
11
11
|
:target: https://github.com/embulk/embulk
|
12
12
|
|
13
|
+
Highlights
|
14
|
+
-----------
|
15
|
+
|
16
|
+
* Embulk's announcement mailing list (read-only) is ready. Please feel free to subscribe! Embulk core members post important updates such as **key releases**, **compatibility information**, and **feedback requests to users**.
|
17
|
+
|
18
|
+
* `Embulk-announce <https://groups.google.com/forum/#!forum/embulk-announce>`_
|
19
|
+
|
20
|
+
|
13
21
|
What's Embulk?
|
14
22
|
------------------
|
15
23
|
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,32 @@
|
|
1
|
+
Release 0.8.36
|
2
|
+
==================================
|
3
|
+
|
4
|
+
General Changes
|
5
|
+
----------------
|
6
|
+
|
7
|
+
* Load dependency JAR files embedded in plugin JAR [#792]
|
8
|
+
* Improve timestamp parsing in Ruby plugins [#812] [#814]
|
9
|
+
* Notify Embulk-announce mailing list in CLI [#816]
|
10
|
+
|
11
|
+
Bug Fixes
|
12
|
+
----------
|
13
|
+
|
14
|
+
* Use single-quotes to quote path strings in YAML for Windows [#805]
|
15
|
+
* Truncate output file before overwriting it [#807]
|
16
|
+
* Fix typo in FALSE_STRINGS and add their test [#810]
|
17
|
+
|
18
|
+
Built-in plugins
|
19
|
+
-----------------
|
20
|
+
|
21
|
+
* Add new option ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS in CSV parser [#809]
|
22
|
+
|
23
|
+
Deprecations
|
24
|
+
-------------
|
25
|
+
|
26
|
+
* Deprecate JRuby-dependent classes and methods [#800] [#803] [#825]
|
27
|
+
* Warn explicitly for deprecated methods [#821] [#826]
|
28
|
+
|
29
|
+
|
30
|
+
Release Date
|
31
|
+
------------------
|
32
|
+
2017-10-24
|
@@ -77,6 +77,10 @@ public class CsvParserPlugin
|
|
77
77
|
@ConfigDefault("false")
|
78
78
|
boolean getTrimIfNotQuoted();
|
79
79
|
|
80
|
+
@Config("quotes_in_quoted_fields")
|
81
|
+
@ConfigDefault("\"ACCEPT_ONLY_RFC4180_ESCAPED\"")
|
82
|
+
QuotesInQuotedFields getQuotesInQuotedFields();
|
83
|
+
|
80
84
|
@Config("max_quoted_size_limit")
|
81
85
|
@ConfigDefault("131072") //128kB
|
82
86
|
long getMaxQuotedSizeLimit();
|
@@ -98,6 +102,24 @@ public class CsvParserPlugin
|
|
98
102
|
boolean getStopOnInvalidRecord();
|
99
103
|
}
|
100
104
|
|
105
|
+
public enum QuotesInQuotedFields
|
106
|
+
{
|
107
|
+
ACCEPT_ONLY_RFC4180_ESCAPED,
|
108
|
+
ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS,
|
109
|
+
;
|
110
|
+
|
111
|
+
@JsonCreator
|
112
|
+
public static QuotesInQuotedFields ofString(final String string)
|
113
|
+
{
|
114
|
+
for (final QuotesInQuotedFields value : values()) {
|
115
|
+
if (string.equals(value.toString())) {
|
116
|
+
return value;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
throw new ConfigException("\"quotes_in_quoted_fields\" must be one of [ACCEPT_ONLY_RFC4180_ESCAPED, ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS].");
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
101
123
|
public static class QuoteCharacter
|
102
124
|
{
|
103
125
|
private final char character;
|
@@ -8,6 +8,7 @@ import java.util.ArrayDeque;
|
|
8
8
|
import org.embulk.spi.DataException;
|
9
9
|
import org.embulk.spi.util.LineDecoder;
|
10
10
|
import org.embulk.config.ConfigException;
|
11
|
+
import org.embulk.standards.CsvParserPlugin.QuotesInQuotedFields;
|
11
12
|
|
12
13
|
public class CsvTokenizer
|
13
14
|
{
|
@@ -31,6 +32,7 @@ public class CsvTokenizer
|
|
31
32
|
private final char escape;
|
32
33
|
private final String newline;
|
33
34
|
private final boolean trimIfNotQuoted;
|
35
|
+
private final QuotesInQuotedFields quotesInQuotedFields;
|
34
36
|
private final long maxQuotedSizeLimit;
|
35
37
|
private final String commentLineMarker;
|
36
38
|
private final LineDecoder input;
|
@@ -62,6 +64,12 @@ public class CsvTokenizer
|
|
62
64
|
escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
|
63
65
|
newline = task.getNewline().getString();
|
64
66
|
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
67
|
+
quotesInQuotedFields = task.getQuotesInQuotedFields();
|
68
|
+
if (trimIfNotQuoted && quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_ONLY_RFC4180_ESCAPED) {
|
69
|
+
// The combination makes some syntax very ambiguous such as:
|
70
|
+
// val1, \"\"val2\"\" ,val3
|
71
|
+
throw new ConfigException("[quotes_in_quoted_fields != ACCEPT_ONLY_RFC4180_ESCAPED] is not allowed to specify with [trim_if_not_quoted = true]");
|
72
|
+
}
|
65
73
|
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
66
74
|
commentLineMarker = task.getCommentLineMarker().orNull();
|
67
75
|
nullStringOrNull = task.getNullString().orNull();
|
@@ -313,9 +321,23 @@ public class CsvTokenizer
|
|
313
321
|
|
314
322
|
} else if (isQuote(c)) {
|
315
323
|
char next = peekNextChar();
|
316
|
-
|
324
|
+
final char nextNext = peekNextNextChar();
|
325
|
+
if (isQuote(next) &&
|
326
|
+
(quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS ||
|
327
|
+
(!isDelimiter(nextNext) && !isEndOfLine(nextNext)))) {
|
328
|
+
// Escaped by preceding it with another quote.
|
329
|
+
// A quote just before a delimiter or an end of line is recognized as a functional quote,
|
330
|
+
// not just as a non-escaped stray "quote character" included in the field, even if
|
331
|
+
// ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
|
317
332
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
318
333
|
valueStartPos = ++linePos;
|
334
|
+
} else if (quotesInQuotedFields == QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS &&
|
335
|
+
!(isDelimiter(next) || isEndOfLine(next))) {
|
336
|
+
// A non-escaped stray "quote character" in the field is processed as a regular character
|
337
|
+
// if ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified,
|
338
|
+
if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
|
339
|
+
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
|
340
|
+
}
|
319
341
|
} else {
|
320
342
|
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
321
343
|
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
@@ -427,6 +449,17 @@ public class CsvTokenizer
|
|
427
449
|
}
|
428
450
|
}
|
429
451
|
|
452
|
+
private char peekNextNextChar()
|
453
|
+
{
|
454
|
+
Preconditions.checkState(line != null, "peekNextNextChar is called after end of file");
|
455
|
+
|
456
|
+
if (linePos + 1 >= line.length()) {
|
457
|
+
return END_OF_LINE;
|
458
|
+
} else {
|
459
|
+
return line.charAt(linePos + 1);
|
460
|
+
}
|
461
|
+
}
|
462
|
+
|
430
463
|
private boolean isSpace(char c)
|
431
464
|
{
|
432
465
|
return c == ' ';
|
@@ -1,6 +1,8 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
3
|
import java.util.List;
|
4
|
+
import org.embulk.config.Config;
|
5
|
+
import org.embulk.config.ConfigDefault;
|
4
6
|
import org.embulk.config.ConfigSource;
|
5
7
|
import org.embulk.config.TaskSource;
|
6
8
|
import org.embulk.config.ConfigDiff;
|
@@ -14,13 +16,17 @@ import org.embulk.spi.OutputPlugin;
|
|
14
16
|
import org.embulk.spi.TransactionalPageOutput;
|
15
17
|
import org.embulk.spi.PageReader;
|
16
18
|
import org.embulk.spi.util.PagePrinter;
|
19
|
+
import org.joda.time.DateTimeZone;
|
17
20
|
|
18
21
|
public class StdoutOutputPlugin
|
19
22
|
implements OutputPlugin
|
20
23
|
{
|
21
24
|
public interface PluginTask
|
22
|
-
extends Task
|
25
|
+
extends Task
|
23
26
|
{
|
27
|
+
@Config("timezone")
|
28
|
+
@ConfigDefault("\"UTC\"")
|
29
|
+
public DateTimeZone getTimeZone();
|
24
30
|
}
|
25
31
|
|
26
32
|
@Override
|
@@ -54,7 +60,7 @@ public class StdoutOutputPlugin
|
|
54
60
|
|
55
61
|
return new TransactionalPageOutput() {
|
56
62
|
private final PageReader reader = new PageReader(schema);
|
57
|
-
private final PagePrinter printer = new PagePrinter(schema, task);
|
63
|
+
private final PagePrinter printer = new PagePrinter(schema, task.getTimeZone());
|
58
64
|
|
59
65
|
public void add(Page page)
|
60
66
|
{
|
@@ -293,6 +293,82 @@ public class TestCsvTokenizer
|
|
293
293
|
"\"trailing\n3\" ,\"trailing\n4\" "));
|
294
294
|
}
|
295
295
|
|
296
|
+
|
297
|
+
@Test
|
298
|
+
public void parseWithDefaultQuotesInQuotedFields() throws Exception
|
299
|
+
{
|
300
|
+
reloadPluginTask();
|
301
|
+
assertEquals(expectedRecords(
|
302
|
+
2,
|
303
|
+
"foo\"bar", "foofoo\"barbar",
|
304
|
+
"baz\"\"qux", "bazbaz\"\"quxqux"),
|
305
|
+
parse(
|
306
|
+
task,
|
307
|
+
"\"foo\"\"bar\",\"foofoo\"\"barbar\"",
|
308
|
+
"\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
|
309
|
+
}
|
310
|
+
|
311
|
+
@Test
|
312
|
+
public void parseWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
|
313
|
+
{
|
314
|
+
config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
|
315
|
+
reloadPluginTask();
|
316
|
+
assertEquals(expectedRecords(
|
317
|
+
2,
|
318
|
+
"foo\"bar", "foofoo\"barbar",
|
319
|
+
"baz\"\"qux", "bazbaz\"\"quxqux"),
|
320
|
+
parse(
|
321
|
+
task,
|
322
|
+
"\"foo\"\"bar\",\"foofoo\"\"barbar\"",
|
323
|
+
"\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
|
324
|
+
}
|
325
|
+
|
326
|
+
@Test
|
327
|
+
public void throwWithDefaultQuotesInQuotedFields() throws Exception
|
328
|
+
{
|
329
|
+
reloadPluginTask();
|
330
|
+
try {
|
331
|
+
parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
|
332
|
+
fail();
|
333
|
+
} catch (Exception e) {
|
334
|
+
assertTrue(e instanceof CsvTokenizer.InvalidValueException);
|
335
|
+
assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
|
336
|
+
return;
|
337
|
+
}
|
338
|
+
}
|
339
|
+
|
340
|
+
@Test
|
341
|
+
public void throwWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
|
342
|
+
{
|
343
|
+
config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
|
344
|
+
reloadPluginTask();
|
345
|
+
try {
|
346
|
+
parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
|
347
|
+
fail();
|
348
|
+
} catch (Exception e) {
|
349
|
+
assertTrue(e instanceof CsvTokenizer.InvalidValueException);
|
350
|
+
assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
|
351
|
+
return;
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
@Test
|
356
|
+
public void parseWithQuotesInQuotedFields_ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS() throws Exception
|
357
|
+
{
|
358
|
+
config.set("quotes_in_quoted_fields", "ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS");
|
359
|
+
reloadPluginTask();
|
360
|
+
assertEquals(expectedRecords(
|
361
|
+
2,
|
362
|
+
"foo\"bar", "foofoo\"barbar",
|
363
|
+
"baz\"\"qux", "bazbaz\"\"quxqux",
|
364
|
+
"\"embulk\"", "\"embul\"\"k\""),
|
365
|
+
parse(
|
366
|
+
task,
|
367
|
+
"\"foo\"bar\",\"foofoo\"\"barbar\"",
|
368
|
+
"\"baz\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\"",
|
369
|
+
"\"\"\"embulk\"\",\"\"embul\"\"\"k\"\""));
|
370
|
+
}
|
371
|
+
|
296
372
|
@Test
|
297
373
|
public void throwQuotedSizeLimitExceededException() throws Exception
|
298
374
|
{
|