embulk 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +5 -2
- data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +4 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +1 -0
- data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +1 -1
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.5.4.rst +24 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +15 -10
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +48 -22
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +3 -1
- data/lib/embulk/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0be98e5dbe81e40c6562142d2cdf44cc4f8cf34
|
4
|
+
data.tar.gz: f907d431af0add753761547f09dec47113b3b236
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a1690e94a7622db588cc6511f1dec583320192960ecdf9851c74ac1f5feaf7bda478b8e6dbf16a7b58f870e38dd464ee949cc20c6f5cadb3473b04bf3cf23db
|
7
|
+
data.tar.gz: 0eb2a31661f7772cadee71642781d2ef30cfc5015ce5bf549a3cc36310fbe29283f80d272b91ee788a4513c52ae8c09ecad8e4dce28e459ad11c5370f68a6e22
|
data/README.md
CHANGED
@@ -30,7 +30,7 @@ Embulk is a Java application. Please make sure that you installed [Java](http://
|
|
30
30
|
Following 4 commands install embulk to your home directory:
|
31
31
|
|
32
32
|
```
|
33
|
-
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
33
|
+
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.4.jar
|
34
34
|
chmod +x ~/.embulk/bin/embulk
|
35
35
|
echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
|
36
36
|
source ~/.bashrc
|
@@ -45,7 +45,7 @@ Embulk is a Java application. Please make sure that you installed [Java](http://
|
|
45
45
|
You can assume the jar file is a .bat file.
|
46
46
|
|
47
47
|
```
|
48
|
-
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
48
|
+
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.4.jar -OutFile embulk.bat}"
|
49
49
|
```
|
50
50
|
|
51
51
|
Next step: [Trying examples](#trying-examples)
|
data/build.gradle
CHANGED
@@ -11,6 +11,9 @@ public class PooledBufferAllocator
|
|
11
11
|
{
|
12
12
|
private PooledByteBufAllocator nettyBuffer;
|
13
13
|
|
14
|
+
private int DEFAULT_BUFFER_SIZE = 32*1024;
|
15
|
+
private int MINIMUM_BUFFER_SIZE = 8*1024;
|
16
|
+
|
14
17
|
public PooledBufferAllocator()
|
15
18
|
{
|
16
19
|
// TODO configure parameters
|
@@ -19,12 +22,12 @@ public class PooledBufferAllocator
|
|
19
22
|
|
20
23
|
public Buffer allocate()
|
21
24
|
{
|
22
|
-
return
|
25
|
+
return allocate(DEFAULT_BUFFER_SIZE);
|
23
26
|
}
|
24
27
|
|
25
28
|
public Buffer allocate(int minimumCapacity)
|
26
29
|
{
|
27
|
-
int size =
|
30
|
+
int size = MINIMUM_BUFFER_SIZE;
|
28
31
|
while (size < minimumCapacity) {
|
29
32
|
size *= 2;
|
30
33
|
}
|
@@ -28,6 +28,10 @@ public class PluginManager
|
|
28
28
|
throw new ConfigException("No PluginSource is installed");
|
29
29
|
}
|
30
30
|
|
31
|
+
if (type == null) {
|
32
|
+
throw new ConfigException(String.format("%s type is not set (if you intend to use NullOutputPlugin, you should enclose null in quotes such as {type: \"null\"}.", iface.getSimpleName()));
|
33
|
+
}
|
34
|
+
|
31
35
|
List<Throwable> causes = new ArrayList<Throwable>();
|
32
36
|
for (PluginSource source : sources) {
|
33
37
|
try {
|
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
|
|
56
56
|
|
57
57
|
.. code-block:: console
|
58
58
|
|
59
|
-
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.
|
59
|
+
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.4.jar -O /usr/local/bin/embulk
|
60
60
|
$ sudo chmod +x /usr/local/bin/embulk
|
61
61
|
|
62
62
|
Step 2. Install Elasticsearch plugin
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,24 @@
|
|
1
|
+
Release 0.5.4
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* ``parser-csv`` supports ``allow_optional_columns`` option. With this option set to ``true``, the parser sets null to insufficient columns rather than skipping the entire row (@kamatama41++)
|
8
|
+
|
9
|
+
* Fixed exception handling of ``parser-csv`` so that the transaction properly fails with underlying exceptions such as IOException
|
10
|
+
|
11
|
+
|
12
|
+
General Changes
|
13
|
+
------------------
|
14
|
+
|
15
|
+
* Increased buffer size from 256 bytes to 32 KB. This improves performance significantly. (@hito4t++)
|
16
|
+
|
17
|
+
* If plugin type is null, suggest to use ``{type: "null"}`` (@hito4t++)
|
18
|
+
|
19
|
+
* Embulk logo is available! See the orca: https://github.com/embulk/embulk/issues/12
|
20
|
+
|
21
|
+
|
22
|
+
Release Date
|
23
|
+
------------------
|
24
|
+
2015-03-23
|
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
|
-
import com.google.common.base.Preconditions;
|
4
3
|
import com.google.common.base.Optional;
|
5
4
|
import com.google.common.collect.ImmutableSet;
|
6
5
|
import org.embulk.config.Task;
|
@@ -21,7 +20,6 @@ import org.embulk.spi.ParserPlugin;
|
|
21
20
|
import org.embulk.spi.Exec;
|
22
21
|
import org.embulk.spi.FileInput;
|
23
22
|
import org.embulk.spi.PageOutput;
|
24
|
-
import org.embulk.spi.BufferAllocator;
|
25
23
|
import org.embulk.spi.util.LineDecoder;
|
26
24
|
import org.slf4j.Logger;
|
27
25
|
|
@@ -76,6 +74,10 @@ public class CsvParserPlugin
|
|
76
74
|
@Config("max_quoted_size_limit")
|
77
75
|
@ConfigDefault("131072") //128kB
|
78
76
|
public long getMaxQuotedSizeLimit();
|
77
|
+
|
78
|
+
@Config("allow_optional_columns")
|
79
|
+
@ConfigDefault("false")
|
80
|
+
public boolean getAllowOptionalColumns();
|
79
81
|
}
|
80
82
|
|
81
83
|
private final Logger log;
|
@@ -127,6 +129,7 @@ public class CsvParserPlugin
|
|
127
129
|
LineDecoder lineDecoder = new LineDecoder(input, task);
|
128
130
|
final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
|
129
131
|
final String nullStringOrNull = task.getNullString().orNull();
|
132
|
+
final boolean allowOptionalColumns = task.getAllowOptionalColumns();
|
130
133
|
int skipHeaderLines = task.getSkipHeaderLines();
|
131
134
|
|
132
135
|
try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
@@ -147,7 +150,7 @@ public class CsvParserPlugin
|
|
147
150
|
schema.visitColumns(new ColumnVisitor() {
|
148
151
|
public void booleanColumn(Column column)
|
149
152
|
{
|
150
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
153
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
151
154
|
if (v == null) {
|
152
155
|
pageBuilder.setNull(column);
|
153
156
|
} else {
|
@@ -157,7 +160,7 @@ public class CsvParserPlugin
|
|
157
160
|
|
158
161
|
public void longColumn(Column column)
|
159
162
|
{
|
160
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
163
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
161
164
|
if (v == null) {
|
162
165
|
pageBuilder.setNull(column);
|
163
166
|
} else {
|
@@ -172,7 +175,7 @@ public class CsvParserPlugin
|
|
172
175
|
|
173
176
|
public void doubleColumn(Column column)
|
174
177
|
{
|
175
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
178
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
176
179
|
if (v == null) {
|
177
180
|
pageBuilder.setNull(column);
|
178
181
|
} else {
|
@@ -187,7 +190,7 @@ public class CsvParserPlugin
|
|
187
190
|
|
188
191
|
public void stringColumn(Column column)
|
189
192
|
{
|
190
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
193
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
191
194
|
if (v == null) {
|
192
195
|
pageBuilder.setNull(column);
|
193
196
|
} else {
|
@@ -197,7 +200,7 @@ public class CsvParserPlugin
|
|
197
200
|
|
198
201
|
public void timestampColumn(Column column)
|
199
202
|
{
|
200
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
203
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
201
204
|
if (v == null) {
|
202
205
|
pageBuilder.setNull(column);
|
203
206
|
} else {
|
@@ -212,8 +215,7 @@ public class CsvParserPlugin
|
|
212
215
|
});
|
213
216
|
pageBuilder.addRecord();
|
214
217
|
|
215
|
-
} catch (
|
216
|
-
// TODO logging
|
218
|
+
} catch (CsvTokenizer.InvalidFormatException e) {
|
217
219
|
long lineNumber = tokenizer.getCurrentLineNumber();
|
218
220
|
String skippedLine = tokenizer.skipCurrentLine();
|
219
221
|
log.warn(String.format("Skipped (line %d): %s", lineNumber, skippedLine), e);
|
@@ -226,8 +228,11 @@ public class CsvParserPlugin
|
|
226
228
|
}
|
227
229
|
}
|
228
230
|
|
229
|
-
private static String nextColumn(Schema schema, CsvTokenizer tokenizer, String nullStringOrNull)
|
231
|
+
private static String nextColumn(Schema schema, CsvTokenizer tokenizer, String nullStringOrNull, boolean allowOptionalColumns)
|
230
232
|
{
|
233
|
+
if(allowOptionalColumns && !tokenizer.hasNextColumn()) {
|
234
|
+
return null;
|
235
|
+
}
|
231
236
|
String v = tokenizer.nextColumn();
|
232
237
|
if (!v.isEmpty()) {
|
233
238
|
if (v.equals(nullStringOrNull)) {
|
@@ -20,7 +20,6 @@ public class CsvTokenizer
|
|
20
20
|
}
|
21
21
|
|
22
22
|
private static final char END_OF_LINE = '\0';
|
23
|
-
private static final boolean TRACE = false;
|
24
23
|
|
25
24
|
private final char delimiter;
|
26
25
|
private final char quote;
|
@@ -81,7 +80,10 @@ public class CsvTokenizer
|
|
81
80
|
public boolean nextRecord()
|
82
81
|
{
|
83
82
|
// If at the end of record, read the next line and initialize the state
|
84
|
-
|
83
|
+
if (recordState != RecordState.END) {
|
84
|
+
throw new TooManyColumnsException("Too many columns");
|
85
|
+
}
|
86
|
+
|
85
87
|
boolean hasNext = nextLine(true);
|
86
88
|
if (hasNext) {
|
87
89
|
recordState = RecordState.NOT_END;
|
@@ -105,10 +107,6 @@ public class CsvTokenizer
|
|
105
107
|
linePos = 0;
|
106
108
|
lineNumber++;
|
107
109
|
|
108
|
-
if (TRACE) {
|
109
|
-
System.out.println("#MN line: " + line + " (" + lineNumber + ")");
|
110
|
-
}
|
111
|
-
|
112
110
|
if (!line.isEmpty() || !ignoreEmptyLine) {
|
113
111
|
return true;
|
114
112
|
}
|
@@ -122,7 +120,9 @@ public class CsvTokenizer
|
|
122
120
|
|
123
121
|
public String nextColumn()
|
124
122
|
{
|
125
|
-
|
123
|
+
if (!hasNextColumn()) {
|
124
|
+
throw new TooFewColumnsException("Too few columns");
|
125
|
+
}
|
126
126
|
|
127
127
|
// reset last state
|
128
128
|
wasQuotedColumn = false;
|
@@ -136,10 +136,6 @@ public class CsvTokenizer
|
|
136
136
|
|
137
137
|
while (true) {
|
138
138
|
final char c = nextChar();
|
139
|
-
if (TRACE) {
|
140
|
-
System.out.println("#MN c: " + c + " (" + columnState + "," + recordState + ")");
|
141
|
-
try { Thread.sleep(100); } catch (Exception e) {}
|
142
|
-
}
|
143
139
|
|
144
140
|
switch (columnState) {
|
145
141
|
case BEGIN:
|
@@ -241,15 +237,12 @@ public class CsvTokenizer
|
|
241
237
|
quotedValue.append(newline);
|
242
238
|
quotedValueLines.add(line);
|
243
239
|
if (!nextLine(false)) {
|
244
|
-
throw new
|
240
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
245
241
|
}
|
246
242
|
valueStartPos = 0;
|
247
243
|
|
248
244
|
} else if (isQuote(c)) {
|
249
245
|
char next = peekNextChar();
|
250
|
-
if (TRACE) {
|
251
|
-
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
252
|
-
}
|
253
246
|
if (isQuote(next)) { // escaped quote
|
254
247
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
255
248
|
valueStartPos = ++linePos;
|
@@ -261,15 +254,12 @@ public class CsvTokenizer
|
|
261
254
|
} else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
262
255
|
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
263
256
|
char next = peekNextChar();
|
264
|
-
if (TRACE) {
|
265
|
-
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
266
|
-
}
|
267
257
|
if (isEndOfLine(c)) {
|
268
258
|
// escape end of line. TODO assuming multi-line quoted value without newline?
|
269
259
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
270
260
|
quotedValueLines.add(line);
|
271
261
|
if (!nextLine(false)) {
|
272
|
-
throw new
|
262
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
273
263
|
}
|
274
264
|
valueStartPos = 0;
|
275
265
|
} else if (isQuote(next) || isEscape(next)) { // escaped quote
|
@@ -298,7 +288,7 @@ public class CsvTokenizer
|
|
298
288
|
// column has trailing spaces and quoted. TODO should this be rejected?
|
299
289
|
|
300
290
|
} else {
|
301
|
-
throw new
|
291
|
+
throw new InvalidValueException("Unexpected extra character after quoted value");
|
302
292
|
}
|
303
293
|
break;
|
304
294
|
|
@@ -360,10 +350,46 @@ public class CsvTokenizer
|
|
360
350
|
return c == escape;
|
361
351
|
}
|
362
352
|
|
363
|
-
static class
|
353
|
+
public static class InvalidFormatException
|
364
354
|
extends RuntimeException
|
365
355
|
{
|
366
|
-
|
356
|
+
public InvalidFormatException(String message)
|
357
|
+
{
|
358
|
+
super(message);
|
359
|
+
}
|
360
|
+
}
|
361
|
+
|
362
|
+
public static class InvalidValueException
|
363
|
+
extends RuntimeException
|
364
|
+
{
|
365
|
+
public InvalidValueException(String message)
|
366
|
+
{
|
367
|
+
super(message);
|
368
|
+
}
|
369
|
+
}
|
370
|
+
|
371
|
+
public static class QuotedSizeLimitExceededException
|
372
|
+
extends InvalidValueException
|
373
|
+
{
|
374
|
+
public QuotedSizeLimitExceededException(String message)
|
375
|
+
{
|
376
|
+
super(message);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
|
380
|
+
public class TooManyColumnsException
|
381
|
+
extends InvalidFormatException
|
382
|
+
{
|
383
|
+
public TooManyColumnsException(String message)
|
384
|
+
{
|
385
|
+
super(message);
|
386
|
+
}
|
387
|
+
}
|
388
|
+
|
389
|
+
public class TooFewColumnsException
|
390
|
+
extends InvalidFormatException
|
391
|
+
{
|
392
|
+
public TooFewColumnsException(String message)
|
367
393
|
{
|
368
394
|
super(message);
|
369
395
|
}
|
@@ -1,7 +1,6 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
3
|
import org.junit.Rule;
|
4
|
-
import org.junit.Before;
|
5
4
|
import org.junit.Test;
|
6
5
|
import static org.junit.Assert.assertEquals;
|
7
6
|
import java.nio.charset.Charset;
|
@@ -34,6 +33,7 @@ public class TestCsvParserPlugin
|
|
34
33
|
assertEquals(false, task.getHeaderLine().or(false));
|
35
34
|
assertEquals(',', task.getDelimiterChar());
|
36
35
|
assertEquals('\"', task.getQuoteChar());
|
36
|
+
assertEquals(false, task.getAllowOptionalColumns());
|
37
37
|
}
|
38
38
|
|
39
39
|
@Test(expected = ConfigException.class)
|
@@ -53,6 +53,7 @@ public class TestCsvParserPlugin
|
|
53
53
|
.set("header_line", true)
|
54
54
|
.set("delimiter", "\t")
|
55
55
|
.set("quote", "\\")
|
56
|
+
.set("allow_optional_columns", true)
|
56
57
|
.set("columns", ImmutableList.of(
|
57
58
|
ImmutableMap.of(
|
58
59
|
"name", "date_code",
|
@@ -65,5 +66,6 @@ public class TestCsvParserPlugin
|
|
65
66
|
assertEquals(true, task.getHeaderLine().or(false));
|
66
67
|
assertEquals('\t', task.getDelimiterChar());
|
67
68
|
assertEquals('\\', task.getQuoteChar());
|
69
|
+
assertEquals(true, task.getAllowOptionalColumns());
|
68
70
|
}
|
69
71
|
}
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -275,6 +275,7 @@ files:
|
|
275
275
|
- embulk-docs/src/release/release-0.5.1.rst
|
276
276
|
- embulk-docs/src/release/release-0.5.2.rst
|
277
277
|
- embulk-docs/src/release/release-0.5.3.rst
|
278
|
+
- embulk-docs/src/release/release-0.5.4.rst
|
278
279
|
- embulk-standards/build.gradle
|
279
280
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
280
281
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
@@ -379,8 +380,8 @@ files:
|
|
379
380
|
- classpath/bval-jsr303-0.5.jar
|
380
381
|
- classpath/commons-beanutils-core-1.8.3.jar
|
381
382
|
- classpath/commons-lang3-3.1.jar
|
382
|
-
- classpath/embulk-core-0.5.3.jar
|
383
|
-
- classpath/embulk-standards-0.5.3.jar
|
383
|
+
- classpath/embulk-core-0.5.4.jar
|
384
|
+
- classpath/embulk-standards-0.5.4.jar
|
384
385
|
- classpath/guava-18.0.jar
|
385
386
|
- classpath/guice-3.0.jar
|
386
387
|
- classpath/guice-multibindings-3.0.jar
|