embulk 0.8.35 → 0.8.36
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/build.gradle +1 -1
- data/embulk-cli/src/main/java/org/embulk/cli/EmbulkExample.java +5 -1
- data/embulk-cli/src/main/java/org/embulk/cli/EmbulkRun.java +12 -0
- data/embulk-core/src/main/java/org/embulk/EmbulkRunner.java +2 -2
- data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoader.java +802 -17
- data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderFactory.java +8 -1
- data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderModule.java +33 -2
- data/embulk-core/src/main/java/org/embulk/plugin/jar/JarPluginLoader.java +32 -5
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +1 -6
- data/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java +39 -1
- data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +43 -9
- data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +46 -8
- data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +19 -1
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +12 -1
- data/embulk-core/src/main/resources/embulk/parent_first_packages.properties +1 -0
- data/embulk-docs/build.gradle +8 -0
- data/embulk-docs/src/built-in.rst +47 -35
- data/embulk-docs/src/index.rst +9 -1
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.36.rst +32 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +22 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +34 -1
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +8 -2
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +76 -0
- data/lib/embulk/guess/schema_guess.rb +1 -1
- data/lib/embulk/input_plugin.rb +8 -1
- data/lib/embulk/page_builder.rb +38 -5
- data/lib/embulk/schema.rb +5 -6
- data/lib/embulk/version.rb +1 -1
- data/test/guess/test_schema_guess.rb +18 -0
- metadata +7 -6
@@ -9,6 +9,7 @@ import org.embulk.spi.Column;
|
|
9
9
|
import org.embulk.spi.PageReader;
|
10
10
|
import org.embulk.spi.ColumnVisitor;
|
11
11
|
import org.embulk.spi.type.TimestampType;
|
12
|
+
import org.joda.time.DateTimeZone;
|
12
13
|
|
13
14
|
public class PagePrinter
|
14
15
|
{
|
@@ -17,14 +18,29 @@ public class PagePrinter
|
|
17
18
|
private final ArrayList<String> record;
|
18
19
|
|
19
20
|
// TODO: Update this constructor because |TimestampFormatter.FormatterTask| is deprecated since v0.6.14.
|
21
|
+
@Deprecated
|
20
22
|
public PagePrinter(Schema schema, TimestampFormatter.FormatterTask task)
|
23
|
+
{
|
24
|
+
this(schema, task.getTimeZone());
|
25
|
+
// NOTE: Its deprecation is not actually from ScriptingContainer, though.
|
26
|
+
// TODO: Notify users about deprecated calls through the notification reporter.
|
27
|
+
if (!deprecationWarned) {
|
28
|
+
System.err.println("[WARN] Plugin uses deprecated constructor of org.embulk.spi.util.PagePrinter.");
|
29
|
+
System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/827");
|
30
|
+
// The |deprecationWarned| flag is used only for warning messages.
|
31
|
+
// Even in case of race conditions, messages are just duplicated -- should be acceptable.
|
32
|
+
deprecationWarned = true;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
public PagePrinter(final Schema schema, final DateTimeZone timezone)
|
21
37
|
{
|
22
38
|
this.schema = schema;
|
23
39
|
this.timestampFormatters = new TimestampFormatter[schema.getColumnCount()];
|
24
40
|
for (int i=0; i < timestampFormatters.length; i++) {
|
25
41
|
if (schema.getColumnType(i) instanceof TimestampType) {
|
26
42
|
TimestampType type = (TimestampType) schema.getColumnType(i);
|
27
|
-
timestampFormatters[i] = new TimestampFormatter(type.getFormat(),
|
43
|
+
timestampFormatters[i] = new TimestampFormatter(type.getFormat(), timezone);
|
28
44
|
}
|
29
45
|
}
|
30
46
|
|
@@ -105,4 +121,6 @@ public class PagePrinter
|
|
105
121
|
string = reader.getJson(column).toString();
|
106
122
|
}
|
107
123
|
}
|
124
|
+
|
125
|
+
private static boolean deprecationWarned = false;
|
108
126
|
}
|
@@ -44,8 +44,17 @@ public abstract class AbstractDynamicColumnSetter
|
|
44
44
|
|
45
45
|
public abstract void set(Value value);
|
46
46
|
|
47
|
+
@Deprecated
|
47
48
|
public void setRubyObject(IRubyObject rubyObject)
|
48
49
|
{
|
50
|
+
if (!deprecationWarned) {
|
51
|
+
System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.AbstractDynamicColumnSetter#setRubyObject");
|
52
|
+
System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
|
53
|
+
// The |deprecationWarned| flag is used only for warning messages.
|
54
|
+
// Even in case of race conditions, messages are just duplicated -- should be acceptable.
|
55
|
+
deprecationWarned = true;
|
56
|
+
}
|
57
|
+
|
49
58
|
if (rubyObject == null || rubyObject instanceof RubyNil) {
|
50
59
|
setNull();
|
51
60
|
} else if (rubyObject instanceof RubyBoolean) {
|
@@ -80,4 +89,6 @@ public abstract class AbstractDynamicColumnSetter
|
|
80
89
|
set(RubyValueApi.toValue(rubyObject.getRuntime(), rubyObject));
|
81
90
|
}
|
82
91
|
}
|
92
|
+
|
93
|
+
private static boolean deprecationWarned = false;
|
83
94
|
}
|
@@ -51,7 +51,18 @@ public class SkipColumnSetter
|
|
51
51
|
public void set(Value v)
|
52
52
|
{ }
|
53
53
|
|
54
|
+
@Deprecated
|
54
55
|
@Override
|
55
56
|
public void setRubyObject(IRubyObject rubyObject)
|
56
|
-
{
|
57
|
+
{
|
58
|
+
if (!deprecationWarned) {
|
59
|
+
System.err.println("[WARN] Plugin uses deprecated org.embulk.spi.util.dynamic.SkipColumnSetter#setRubyObject");
|
60
|
+
System.err.println("[WARN] Report plugins in your config at: https://github.com/embulk/embulk/issues/799");
|
61
|
+
// The |deprecationWarned| flag is used only for warning messages.
|
62
|
+
// Even in case of race conditions, messages are just duplicated -- should be acceptable.
|
63
|
+
deprecationWarned = true;
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
private static boolean deprecationWarned = false;
|
57
68
|
}
|
data/embulk-docs/build.gradle
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
// TODO: Remove this block once rubygems.lasagna.io is back, or jruby-gradle-jar-plugin is upgraded to 1.5.0.
|
2
|
+
// See also: https://github.com/jruby-gradle/jruby-gradle-plugin/issues/297
|
3
|
+
// jruby-gradle-jar-plugin is not upgraded yet because its 1.5.0 depends on Java 8.
|
4
|
+
repositories {
|
5
|
+
mavenLocal()
|
6
|
+
maven { url 'http://rubygems-proxy.torquebox.org/releases' }
|
7
|
+
}
|
8
|
+
|
1
9
|
apply plugin: 'com.github.jruby-gradle.base'
|
2
10
|
|
3
11
|
import com.github.jrubygradle.JRubyExec
|
@@ -191,41 +191,53 @@ The ``csv`` parser plugin parses CSV and TSV files.
|
|
191
191
|
Options
|
192
192
|
~~~~~~~~
|
193
193
|
|
194
|
-
|
195
|
-
| name | type | description |
|
196
|
-
|
197
|
-
| delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default
|
198
|
-
|
199
|
-
| quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default
|
200
|
-
|
201
|
-
| escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default
|
202
|
-
|
203
|
-
| skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default
|
204
|
-
|
205
|
-
| null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump |
|
206
|
-
|
207
|
-
| trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default
|
208
|
-
|
209
|
-
|
|
210
|
-
|
211
|
-
|
|
212
|
-
|
213
|
-
| allow\
|
214
|
-
|
215
|
-
|
|
216
|
-
|
217
|
-
|
|
218
|
-
|
219
|
-
|
|
220
|
-
|
221
|
-
| default\
|
222
|
-
|
223
|
-
|
|
224
|
-
|
225
|
-
|
|
226
|
-
|
227
|
-
|
|
228
|
-
|
194
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
195
|
+
| name | type | description | required? |
|
196
|
+
+============================+==========+================================================================================================================+============================================+
|
197
|
+
| delimiter | string | Delimiter character such as ``,`` for CSV, ``"\t"`` for TSV, ``"|"`` | ``,`` by default |
|
198
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
199
|
+
| quote | string | The character surrounding a quoted value. Setting ``null`` disables quoting. | ``"`` by default |
|
200
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
201
|
+
| escape | string | Escape character to escape a special character. Setting ``null`` disables escaping. | ``\\`` by default |
|
202
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
203
|
+
| skip\_header\_lines | integer | Skip this number of lines first. Set 1 if the file has header line. | ``0`` by default |
|
204
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
205
|
+
| null\_string | string | If a value is this string, converts it to NULL. For example, set ``\N`` for CSV files created by mysqldump | |
|
206
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
207
|
+
| trim\_if\_not\_quoted | boolean | If true, remove spaces of a value if the value is not surrounded by the quote character | ``false`` by default |
|
208
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
209
|
+
| quotes\_in\_quoted\_fields | enum | Specify how to deal with irregular unescaped quote characters in quoted fields | ``ACCEPT_ONLY_RFC4180_ESCAPED`` by default |
|
210
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
211
|
+
| comment\_line\_marker | string | Skip a line if the line begins with this string | null by default |
|
212
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
213
|
+
| allow\_optional\_columns | boolean | If true, set null to insufficient columns. Otherwise, skip the row in case of insufficient number of columns | ``false`` by default |
|
214
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
215
|
+
| allow\_extra\_columns | boolean | If true, ignore too many columns. Otherwise, skip the row in case of too many columns | ``false`` by default |
|
216
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
217
|
+
| max\_quoted\_size\_limit | integer | Maximum number of bytes of a quoted value. If a value exceeds the limit, the row will be skipped | ``131072`` by default |
|
218
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
219
|
+
| stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) | ``false`` by default |
|
220
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
221
|
+
| default\_timezone | string | Time zone of timestamp columns if the value itself doesn't include time zone description (eg. Asia/Tokyo) | ``UTC`` by default |
|
222
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
223
|
+
| default\_date | string | Set date part if the format doesn’t include date part. | ``1970-01-01`` by default |
|
224
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
225
|
+
| newline | enum | Newline character (CRLF, LF or CR) | ``CRLF`` by default |
|
226
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
227
|
+
| charset | enum | Character encoding (eg. ISO-8859-1, UTF-8) | ``UTF-8`` by default |
|
228
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
229
|
+
| columns | hash | Columns (see below) | required |
|
230
|
+
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+--------------------------------------------+
|
231
|
+
|
232
|
+
The ``quotes_in_quoted_fields`` option specifies how to deal with irregular non-escaped stray quote characters.
|
233
|
+
|
234
|
+
+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|
235
|
+
| name | description |
|
236
|
+
+======================================================+=====================================================================================================================================================+
|
237
|
+
| ACCEPT_ONLY_RFC4180_ESCAPED | Default. Accept only specified and RFC 4180-style escaped quote characters. |
|
238
|
+
+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|
239
|
+
| ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS | Accept stray quotes as-is in the field. But the behavior is undefined if delimiters are in fields. ``"a"b"`` goes ``a"b``. ``"a""b"`` goes ``a"b``. |
|
240
|
+
+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|
229
241
|
|
230
242
|
The ``columns`` option declares the list of columns. This CSV parser plugin ignores the header line.
|
231
243
|
|
data/embulk-docs/src/index.rst
CHANGED
@@ -7,9 +7,17 @@ Embulk
|
|
7
7
|
==================================
|
8
8
|
|
9
9
|
.. image:: _static/embulk-logo-v2/embulk-logo-v2-sq-tr-small.png
|
10
|
-
:width:
|
10
|
+
:width: 128px
|
11
11
|
:target: https://github.com/embulk/embulk
|
12
12
|
|
13
|
+
Highlights
|
14
|
+
-----------
|
15
|
+
|
16
|
+
* Embulk's announcement mailing list (read-only) is ready. Please feel free to subscribe! Embulk core members post important updates such as **key releases**, **compatibility information**, and **feedback requests to users**.
|
17
|
+
|
18
|
+
* `Embulk-announce <https://groups.google.com/forum/#!forum/embulk-announce>`_
|
19
|
+
|
20
|
+
|
13
21
|
What's Embulk?
|
14
22
|
------------------
|
15
23
|
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,32 @@
|
|
1
|
+
Release 0.8.36
|
2
|
+
==================================
|
3
|
+
|
4
|
+
General Changes
|
5
|
+
----------------
|
6
|
+
|
7
|
+
* Load dependency JAR files embedded in plugin JAR [#792]
|
8
|
+
* Improve timestamp parsing in Ruby plugins [#812] [#814]
|
9
|
+
* Notify Embulk-announce mailing list in CLI [#816]
|
10
|
+
|
11
|
+
Bug Fixes
|
12
|
+
----------
|
13
|
+
|
14
|
+
* Use single-quotes to quote path strings in YAML for Windows [#805]
|
15
|
+
* Truncate output file before overwriting it [#807]
|
16
|
+
* Fix typo in FALSE_STRINGS and add their test [#810]
|
17
|
+
|
18
|
+
Built-in plugins
|
19
|
+
-----------------
|
20
|
+
|
21
|
+
* Add new option ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS in CSV parser [#809]
|
22
|
+
|
23
|
+
Deprecations
|
24
|
+
-------------
|
25
|
+
|
26
|
+
* Deprecate JRuby-dependent classes and methods [#800] [#803] [#825]
|
27
|
+
* Warn explicitly for deprecated methods [#821] [#826]
|
28
|
+
|
29
|
+
|
30
|
+
Release Date
|
31
|
+
------------------
|
32
|
+
2017-10-24
|
@@ -77,6 +77,10 @@ public class CsvParserPlugin
|
|
77
77
|
@ConfigDefault("false")
|
78
78
|
boolean getTrimIfNotQuoted();
|
79
79
|
|
80
|
+
@Config("quotes_in_quoted_fields")
|
81
|
+
@ConfigDefault("\"ACCEPT_ONLY_RFC4180_ESCAPED\"")
|
82
|
+
QuotesInQuotedFields getQuotesInQuotedFields();
|
83
|
+
|
80
84
|
@Config("max_quoted_size_limit")
|
81
85
|
@ConfigDefault("131072") //128kB
|
82
86
|
long getMaxQuotedSizeLimit();
|
@@ -98,6 +102,24 @@ public class CsvParserPlugin
|
|
98
102
|
boolean getStopOnInvalidRecord();
|
99
103
|
}
|
100
104
|
|
105
|
+
public enum QuotesInQuotedFields
|
106
|
+
{
|
107
|
+
ACCEPT_ONLY_RFC4180_ESCAPED,
|
108
|
+
ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS,
|
109
|
+
;
|
110
|
+
|
111
|
+
@JsonCreator
|
112
|
+
public static QuotesInQuotedFields ofString(final String string)
|
113
|
+
{
|
114
|
+
for (final QuotesInQuotedFields value : values()) {
|
115
|
+
if (string.equals(value.toString())) {
|
116
|
+
return value;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
throw new ConfigException("\"quotes_in_quoted_fields\" must be one of [ACCEPT_ONLY_RFC4180_ESCAPED, ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS].");
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
101
123
|
public static class QuoteCharacter
|
102
124
|
{
|
103
125
|
private final char character;
|
@@ -8,6 +8,7 @@ import java.util.ArrayDeque;
|
|
8
8
|
import org.embulk.spi.DataException;
|
9
9
|
import org.embulk.spi.util.LineDecoder;
|
10
10
|
import org.embulk.config.ConfigException;
|
11
|
+
import org.embulk.standards.CsvParserPlugin.QuotesInQuotedFields;
|
11
12
|
|
12
13
|
public class CsvTokenizer
|
13
14
|
{
|
@@ -31,6 +32,7 @@ public class CsvTokenizer
|
|
31
32
|
private final char escape;
|
32
33
|
private final String newline;
|
33
34
|
private final boolean trimIfNotQuoted;
|
35
|
+
private final QuotesInQuotedFields quotesInQuotedFields;
|
34
36
|
private final long maxQuotedSizeLimit;
|
35
37
|
private final String commentLineMarker;
|
36
38
|
private final LineDecoder input;
|
@@ -62,6 +64,12 @@ public class CsvTokenizer
|
|
62
64
|
escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
|
63
65
|
newline = task.getNewline().getString();
|
64
66
|
trimIfNotQuoted = task.getTrimIfNotQuoted();
|
67
|
+
quotesInQuotedFields = task.getQuotesInQuotedFields();
|
68
|
+
if (trimIfNotQuoted && quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_ONLY_RFC4180_ESCAPED) {
|
69
|
+
// The combination makes some syntax very ambiguous such as:
|
70
|
+
// val1, \"\"val2\"\" ,val3
|
71
|
+
throw new ConfigException("[quotes_in_quoted_fields != ACCEPT_ONLY_RFC4180_ESCAPED] is not allowed to specify with [trim_if_not_quoted = true]");
|
72
|
+
}
|
65
73
|
maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
|
66
74
|
commentLineMarker = task.getCommentLineMarker().orNull();
|
67
75
|
nullStringOrNull = task.getNullString().orNull();
|
@@ -313,9 +321,23 @@ public class CsvTokenizer
|
|
313
321
|
|
314
322
|
} else if (isQuote(c)) {
|
315
323
|
char next = peekNextChar();
|
316
|
-
|
324
|
+
final char nextNext = peekNextNextChar();
|
325
|
+
if (isQuote(next) &&
|
326
|
+
(quotesInQuotedFields != QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS ||
|
327
|
+
(!isDelimiter(nextNext) && !isEndOfLine(nextNext)))) {
|
328
|
+
// Escaped by preceding it with another quote.
|
329
|
+
// A quote just before a delimiter or an end of line is recognized as a functional quote,
|
330
|
+
// not just as a non-escaped stray "quote character" included in the field, even if
|
331
|
+
// ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
|
317
332
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
318
333
|
valueStartPos = ++linePos;
|
334
|
+
} else if (quotesInQuotedFields == QuotesInQuotedFields.ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS &&
|
335
|
+
!(isDelimiter(next) || isEndOfLine(next))) {
|
336
|
+
// A non-escaped stray "quote character" in the field is processed as a regular character
|
337
|
+
// if ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS is specified.
|
338
|
+
if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
|
339
|
+
throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size (" + maxQuotedSizeLimit + ")");
|
340
|
+
}
|
319
341
|
} else {
|
320
342
|
quotedValue.append(line.substring(valueStartPos, linePos - 1));
|
321
343
|
columnState = ColumnState.AFTER_QUOTED_VALUE;
|
@@ -427,6 +449,17 @@ public class CsvTokenizer
|
|
427
449
|
}
|
428
450
|
}
|
429
451
|
|
452
|
+
private char peekNextNextChar()
|
453
|
+
{
|
454
|
+
Preconditions.checkState(line != null, "peekNextNextChar is called after end of file");
|
455
|
+
|
456
|
+
if (linePos + 1 >= line.length()) {
|
457
|
+
return END_OF_LINE;
|
458
|
+
} else {
|
459
|
+
return line.charAt(linePos + 1);
|
460
|
+
}
|
461
|
+
}
|
462
|
+
|
430
463
|
private boolean isSpace(char c)
|
431
464
|
{
|
432
465
|
return c == ' ';
|
@@ -1,6 +1,8 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
3
|
import java.util.List;
|
4
|
+
import org.embulk.config.Config;
|
5
|
+
import org.embulk.config.ConfigDefault;
|
4
6
|
import org.embulk.config.ConfigSource;
|
5
7
|
import org.embulk.config.TaskSource;
|
6
8
|
import org.embulk.config.ConfigDiff;
|
@@ -14,13 +16,17 @@ import org.embulk.spi.OutputPlugin;
|
|
14
16
|
import org.embulk.spi.TransactionalPageOutput;
|
15
17
|
import org.embulk.spi.PageReader;
|
16
18
|
import org.embulk.spi.util.PagePrinter;
|
19
|
+
import org.joda.time.DateTimeZone;
|
17
20
|
|
18
21
|
public class StdoutOutputPlugin
|
19
22
|
implements OutputPlugin
|
20
23
|
{
|
21
24
|
public interface PluginTask
|
22
|
-
extends Task
|
25
|
+
extends Task
|
23
26
|
{
|
27
|
+
@Config("timezone")
|
28
|
+
@ConfigDefault("\"UTC\"")
|
29
|
+
public DateTimeZone getTimeZone();
|
24
30
|
}
|
25
31
|
|
26
32
|
@Override
|
@@ -54,7 +60,7 @@ public class StdoutOutputPlugin
|
|
54
60
|
|
55
61
|
return new TransactionalPageOutput() {
|
56
62
|
private final PageReader reader = new PageReader(schema);
|
57
|
-
private final PagePrinter printer = new PagePrinter(schema, task);
|
63
|
+
private final PagePrinter printer = new PagePrinter(schema, task.getTimeZone());
|
58
64
|
|
59
65
|
public void add(Page page)
|
60
66
|
{
|
@@ -293,6 +293,82 @@ public class TestCsvTokenizer
|
|
293
293
|
"\"trailing\n3\" ,\"trailing\n4\" "));
|
294
294
|
}
|
295
295
|
|
296
|
+
|
297
|
+
@Test
|
298
|
+
public void parseWithDefaultQuotesInQuotedFields() throws Exception
|
299
|
+
{
|
300
|
+
reloadPluginTask();
|
301
|
+
assertEquals(expectedRecords(
|
302
|
+
2,
|
303
|
+
"foo\"bar", "foofoo\"barbar",
|
304
|
+
"baz\"\"qux", "bazbaz\"\"quxqux"),
|
305
|
+
parse(
|
306
|
+
task,
|
307
|
+
"\"foo\"\"bar\",\"foofoo\"\"barbar\"",
|
308
|
+
"\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
|
309
|
+
}
|
310
|
+
|
311
|
+
@Test
|
312
|
+
public void parseWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
|
313
|
+
{
|
314
|
+
config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
|
315
|
+
reloadPluginTask();
|
316
|
+
assertEquals(expectedRecords(
|
317
|
+
2,
|
318
|
+
"foo\"bar", "foofoo\"barbar",
|
319
|
+
"baz\"\"qux", "bazbaz\"\"quxqux"),
|
320
|
+
parse(
|
321
|
+
task,
|
322
|
+
"\"foo\"\"bar\",\"foofoo\"\"barbar\"",
|
323
|
+
"\"baz\"\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\""));
|
324
|
+
}
|
325
|
+
|
326
|
+
@Test
|
327
|
+
public void throwWithDefaultQuotesInQuotedFields() throws Exception
|
328
|
+
{
|
329
|
+
reloadPluginTask();
|
330
|
+
try {
|
331
|
+
parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
|
332
|
+
fail();
|
333
|
+
} catch (Exception e) {
|
334
|
+
assertTrue(e instanceof CsvTokenizer.InvalidValueException);
|
335
|
+
assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
|
336
|
+
return;
|
337
|
+
}
|
338
|
+
}
|
339
|
+
|
340
|
+
@Test
|
341
|
+
public void throwWithQuotesInQuotedFields_ACCEPT_ONLY_RFC4180_ESCAPED() throws Exception
|
342
|
+
{
|
343
|
+
config.set("quotes_in_quoted_fields", "ACCEPT_ONLY_RFC4180_ESCAPED");
|
344
|
+
reloadPluginTask();
|
345
|
+
try {
|
346
|
+
parse(task, "\"foo\"bar\",\"hoge\"fuga\"");
|
347
|
+
fail();
|
348
|
+
} catch (Exception e) {
|
349
|
+
assertTrue(e instanceof CsvTokenizer.InvalidValueException);
|
350
|
+
assertEquals("Unexpected extra character 'b' after a value quoted by '\"'", e.getMessage());
|
351
|
+
return;
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
@Test
|
356
|
+
public void parseWithQuotesInQuotedFields_ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS() throws Exception
|
357
|
+
{
|
358
|
+
config.set("quotes_in_quoted_fields", "ACCEPT_STRAY_QUOTES_ASSUMING_NO_DELIMITERS_IN_FIELDS");
|
359
|
+
reloadPluginTask();
|
360
|
+
assertEquals(expectedRecords(
|
361
|
+
2,
|
362
|
+
"foo\"bar", "foofoo\"barbar",
|
363
|
+
"baz\"\"qux", "bazbaz\"\"quxqux",
|
364
|
+
"\"embulk\"", "\"embul\"\"k\""),
|
365
|
+
parse(
|
366
|
+
task,
|
367
|
+
"\"foo\"bar\",\"foofoo\"\"barbar\"",
|
368
|
+
"\"baz\"\"\"qux\",\"bazbaz\"\"\"\"quxqux\"",
|
369
|
+
"\"\"\"embulk\"\",\"\"embul\"\"\"k\"\""));
|
370
|
+
}
|
371
|
+
|
296
372
|
@Test
|
297
373
|
public void throwQuotedSizeLimitExceededException() throws Exception
|
298
374
|
{
|