embulk 0.5.3 → 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +5 -2
- data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +4 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +1 -0
- data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +1 -1
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.5.4.rst +24 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +15 -10
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +48 -22
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +3 -1
- data/lib/embulk/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0be98e5dbe81e40c6562142d2cdf44cc4f8cf34
|
4
|
+
data.tar.gz: f907d431af0add753761547f09dec47113b3b236
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a1690e94a7622db588cc6511f1dec583320192960ecdf9851c74ac1f5feaf7bda478b8e6dbf16a7b58f870e38dd464ee949cc20c6f5cadb3473b04bf3cf23db
|
7
|
+
data.tar.gz: 0eb2a31661f7772cadee71642781d2ef30cfc5015ce5bf549a3cc36310fbe29283f80d272b91ee788a4513c52ae8c09ecad8e4dce28e459ad11c5370f68a6e22
|
data/README.md
CHANGED
@@ -30,7 +30,7 @@ Embulk is a Java application. Please make sure that you installed [Java](http://
|
|
30
30
|
Following 4 commands install embulk to your home directory:
|
31
31
|
|
32
32
|
```
|
33
|
-
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.3.jar
|
33
|
+
curl --create-dirs -o ~/.embulk/bin/embulk -L https://bintray.com/artifact/download/embulk/maven/embulk-0.5.4.jar
|
34
34
|
chmod +x ~/.embulk/bin/embulk
|
35
35
|
echo 'export PATH="$HOME/.embulk/bin:$PATH"' >> ~/.bashrc
|
36
36
|
source ~/.bashrc
|
@@ -45,7 +45,7 @@ Embulk is a Java application. Please make sure that you installed [Java](http://
|
|
45
45
|
You can assume the jar file is a .bat file.
|
46
46
|
|
47
47
|
```
|
48
|
-
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.3.jar -OutFile embulk.bat}"
|
48
|
+
PowerShell -Command "& {Invoke-WebRequest https://bintray.com/artifact/download/embulk/maven/embulk-0.5.4.jar -OutFile embulk.bat}"
|
49
49
|
```
|
50
50
|
|
51
51
|
Next step: [Trying examples](#trying-examples)
|
data/build.gradle
CHANGED
@@ -11,6 +11,9 @@ public class PooledBufferAllocator
|
|
11
11
|
{
|
12
12
|
private PooledByteBufAllocator nettyBuffer;
|
13
13
|
|
14
|
+
private int DEFAULT_BUFFER_SIZE = 32*1024;
|
15
|
+
private int MINIMUM_BUFFER_SIZE = 8*1024;
|
16
|
+
|
14
17
|
public PooledBufferAllocator()
|
15
18
|
{
|
16
19
|
// TODO configure parameters
|
@@ -19,12 +22,12 @@ public class PooledBufferAllocator
|
|
19
22
|
|
20
23
|
public Buffer allocate()
|
21
24
|
{
|
22
|
-
return
|
25
|
+
return allocate(DEFAULT_BUFFER_SIZE);
|
23
26
|
}
|
24
27
|
|
25
28
|
public Buffer allocate(int minimumCapacity)
|
26
29
|
{
|
27
|
-
int size =
|
30
|
+
int size = MINIMUM_BUFFER_SIZE;
|
28
31
|
while (size < minimumCapacity) {
|
29
32
|
size *= 2;
|
30
33
|
}
|
@@ -28,6 +28,10 @@ public class PluginManager
|
|
28
28
|
throw new ConfigException("No PluginSource is installed");
|
29
29
|
}
|
30
30
|
|
31
|
+
if (type == null) {
|
32
|
+
throw new ConfigException(String.format("%s type is not set (if you intend to use NullOutputPlugin, you should enclose null in quotes such as {type: \"null\"}.", iface.getSimpleName()));
|
33
|
+
}
|
34
|
+
|
31
35
|
List<Throwable> causes = new ArrayList<Throwable>();
|
32
36
|
for (PluginSource source : sources) {
|
33
37
|
try {
|
@@ -56,7 +56,7 @@ You can find the latest embulk binary from the `releases <https://bintray.com/em
|
|
56
56
|
|
57
57
|
.. code-block:: console
|
58
58
|
|
59
|
-
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.3.jar -O /usr/local/bin/embulk
|
59
|
+
$ sudo wget https://bintray.com/artifact/download/embulk/maven/embulk-0.5.4.jar -O /usr/local/bin/embulk
|
60
60
|
$ sudo chmod +x /usr/local/bin/embulk
|
61
61
|
|
62
62
|
Step 2. Install Elasticsearch plugin
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,24 @@
|
|
1
|
+
Release 0.5.4
|
2
|
+
==================================
|
3
|
+
|
4
|
+
Built-in plugins
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* ``parser-csv`` supports the ``allow_optional_columns`` option. With this option set to ``true``, the parser fills missing columns with null rather than skipping the entire row (@kamatama41++)
|
8
|
+
|
9
|
+
* Fixed exception handling of ``parser-csv`` so that the transaction properly fails with underlying exceptions such as IOException
|
10
|
+
|
11
|
+
|
12
|
+
General Changes
|
13
|
+
------------------
|
14
|
+
|
15
|
+
* Increased buffer size from 256 bytes to 32 KB. This improves performance significantly. (@hito4t++)
|
16
|
+
|
17
|
+
* If plugin type is null, the error message suggests using ``{type: "null"}`` (@hito4t++)
|
18
|
+
|
19
|
+
* Embulk logo is available! See the orca: https://github.com/embulk/embulk/issues/12
|
20
|
+
|
21
|
+
|
22
|
+
Release Date
|
23
|
+
------------------
|
24
|
+
2015-03-23
|
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
|
-
import com.google.common.base.Preconditions;
|
4
3
|
import com.google.common.base.Optional;
|
5
4
|
import com.google.common.collect.ImmutableSet;
|
6
5
|
import org.embulk.config.Task;
|
@@ -21,7 +20,6 @@ import org.embulk.spi.ParserPlugin;
|
|
21
20
|
import org.embulk.spi.Exec;
|
22
21
|
import org.embulk.spi.FileInput;
|
23
22
|
import org.embulk.spi.PageOutput;
|
24
|
-
import org.embulk.spi.BufferAllocator;
|
25
23
|
import org.embulk.spi.util.LineDecoder;
|
26
24
|
import org.slf4j.Logger;
|
27
25
|
|
@@ -76,6 +74,10 @@ public class CsvParserPlugin
|
|
76
74
|
@Config("max_quoted_size_limit")
|
77
75
|
@ConfigDefault("131072") //128kB
|
78
76
|
public long getMaxQuotedSizeLimit();
|
77
|
+
|
78
|
+
@Config("allow_optional_columns")
|
79
|
+
@ConfigDefault("false")
|
80
|
+
public boolean getAllowOptionalColumns();
|
79
81
|
}
|
80
82
|
|
81
83
|
private final Logger log;
|
@@ -127,6 +129,7 @@ public class CsvParserPlugin
|
|
127
129
|
LineDecoder lineDecoder = new LineDecoder(input, task);
|
128
130
|
final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
|
129
131
|
final String nullStringOrNull = task.getNullString().orNull();
|
132
|
+
final boolean allowOptionalColumns = task.getAllowOptionalColumns();
|
130
133
|
int skipHeaderLines = task.getSkipHeaderLines();
|
131
134
|
|
132
135
|
try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
@@ -147,7 +150,7 @@ public class CsvParserPlugin
|
|
147
150
|
schema.visitColumns(new ColumnVisitor() {
|
148
151
|
public void booleanColumn(Column column)
|
149
152
|
{
|
150
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
153
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
151
154
|
if (v == null) {
|
152
155
|
pageBuilder.setNull(column);
|
153
156
|
} else {
|
@@ -157,7 +160,7 @@ public class CsvParserPlugin
|
|
157
160
|
|
158
161
|
public void longColumn(Column column)
|
159
162
|
{
|
160
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
163
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
161
164
|
if (v == null) {
|
162
165
|
pageBuilder.setNull(column);
|
163
166
|
} else {
|
@@ -172,7 +175,7 @@ public class CsvParserPlugin
|
|
172
175
|
|
173
176
|
public void doubleColumn(Column column)
|
174
177
|
{
|
175
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
178
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
176
179
|
if (v == null) {
|
177
180
|
pageBuilder.setNull(column);
|
178
181
|
} else {
|
@@ -187,7 +190,7 @@ public class CsvParserPlugin
|
|
187
190
|
|
188
191
|
public void stringColumn(Column column)
|
189
192
|
{
|
190
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
193
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
191
194
|
if (v == null) {
|
192
195
|
pageBuilder.setNull(column);
|
193
196
|
} else {
|
@@ -197,7 +200,7 @@ public class CsvParserPlugin
|
|
197
200
|
|
198
201
|
public void timestampColumn(Column column)
|
199
202
|
{
|
200
|
-
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
203
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull, allowOptionalColumns);
|
201
204
|
if (v == null) {
|
202
205
|
pageBuilder.setNull(column);
|
203
206
|
} else {
|
@@ -212,8 +215,7 @@ public class CsvParserPlugin
|
|
212
215
|
});
|
213
216
|
pageBuilder.addRecord();
|
214
217
|
|
215
|
-
} catch (
|
216
|
-
// TODO logging
|
218
|
+
} catch (CsvTokenizer.InvalidFormatException e) {
|
217
219
|
long lineNumber = tokenizer.getCurrentLineNumber();
|
218
220
|
String skippedLine = tokenizer.skipCurrentLine();
|
219
221
|
log.warn(String.format("Skipped (line %d): %s", lineNumber, skippedLine), e);
|
@@ -226,8 +228,11 @@ public class CsvParserPlugin
|
|
226
228
|
}
|
227
229
|
}
|
228
230
|
|
229
|
-
private static String nextColumn(Schema schema, CsvTokenizer tokenizer, String nullStringOrNull)
|
231
|
+
private static String nextColumn(Schema schema, CsvTokenizer tokenizer, String nullStringOrNull, boolean allowOptionalColumns)
|
230
232
|
{
|
233
|
+
if(allowOptionalColumns && !tokenizer.hasNextColumn()) {
|
234
|
+
return null;
|
235
|
+
}
|
231
236
|
String v = tokenizer.nextColumn();
|
232
237
|
if (!v.isEmpty()) {
|
233
238
|
if (v.equals(nullStringOrNull)) {
|
@@ -20,7 +20,6 @@ public class CsvTokenizer
|
|
20
20
|
}
|
21
21
|
|
22
22
|
private static final char END_OF_LINE = '\0';
|
23
|
-
private static final boolean TRACE = false;
|
24
23
|
|
25
24
|
private final char delimiter;
|
26
25
|
private final char quote;
|
@@ -81,7 +80,10 @@ public class CsvTokenizer
|
|
81
80
|
public boolean nextRecord()
|
82
81
|
{
|
83
82
|
// If at the end of record, read the next line and initialize the state
|
84
|
-
|
83
|
+
if (recordState != RecordState.END) {
|
84
|
+
throw new TooManyColumnsException("Too many columns");
|
85
|
+
}
|
86
|
+
|
85
87
|
boolean hasNext = nextLine(true);
|
86
88
|
if (hasNext) {
|
87
89
|
recordState = RecordState.NOT_END;
|
@@ -105,10 +107,6 @@ public class CsvTokenizer
|
|
105
107
|
linePos = 0;
|
106
108
|
lineNumber++;
|
107
109
|
|
108
|
-
if (TRACE) {
|
109
|
-
System.out.println("#MN line: " + line + " (" + lineNumber + ")");
|
110
|
-
}
|
111
|
-
|
112
110
|
if (!line.isEmpty() || !ignoreEmptyLine) {
|
113
111
|
return true;
|
114
112
|
}
|
@@ -122,7 +120,9 @@ public class CsvTokenizer
|
|
122
120
|
|
123
121
|
public String nextColumn()
|
124
122
|
{
|
125
|
-
|
123
|
+
if (!hasNextColumn()) {
|
124
|
+
throw new TooFewColumnsException("Too few columns");
|
125
|
+
}
|
126
126
|
|
127
127
|
// reset last state
|
128
128
|
wasQuotedColumn = false;
|
@@ -136,10 +136,6 @@ public class CsvTokenizer
|
|
136
136
|
|
137
137
|
while (true) {
|
138
138
|
final char c = nextChar();
|
139
|
-
if (TRACE) {
|
140
|
-
System.out.println("#MN c: " + c + " (" + columnState + "," + recordState + ")");
|
141
|
-
try { Thread.sleep(100); } catch (Exception e) {}
|
142
|
-
}
|
143
139
|
|
144
140
|
switch (columnState) {
|
145
141
|
case BEGIN:
|
@@ -241,15 +237,12 @@ public class CsvTokenizer
|
|
241
237
|
quotedValue.append(newline);
|
242
238
|
quotedValueLines.add(line);
|
243
239
|
if (!nextLine(false)) {
|
244
|
-
throw new
|
240
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
245
241
|
}
|
246
242
|
valueStartPos = 0;
|
247
243
|
|
248
244
|
} else if (isQuote(c)) {
|
249
245
|
char next = peekNextChar();
|
250
|
-
if (TRACE) {
|
251
|
-
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
252
|
-
}
|
253
246
|
if (isQuote(next)) { // escaped quote
|
254
247
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
255
248
|
valueStartPos = ++linePos;
|
@@ -261,15 +254,12 @@ public class CsvTokenizer
|
|
261
254
|
} else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
|
262
255
|
// In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
|
263
256
|
char next = peekNextChar();
|
264
|
-
if (TRACE) {
|
265
|
-
System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
|
266
|
-
}
|
267
257
|
if (isEndOfLine(c)) {
|
268
258
|
// escape end of line. TODO assuming multi-line quoted value without newline?
|
269
259
|
quotedValue.append(line.substring(valueStartPos, linePos));
|
270
260
|
quotedValueLines.add(line);
|
271
261
|
if (!nextLine(false)) {
|
272
|
-
throw new
|
262
|
+
throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
|
273
263
|
}
|
274
264
|
valueStartPos = 0;
|
275
265
|
} else if (isQuote(next) || isEscape(next)) { // escaped quote
|
@@ -298,7 +288,7 @@ public class CsvTokenizer
|
|
298
288
|
// column has trailing spaces and quoted. TODO should this be rejected?
|
299
289
|
|
300
290
|
} else {
|
301
|
-
throw new
|
291
|
+
throw new InvalidValueException("Unexpected extra character after quoted value");
|
302
292
|
}
|
303
293
|
break;
|
304
294
|
|
@@ -360,10 +350,46 @@ public class CsvTokenizer
|
|
360
350
|
return c == escape;
|
361
351
|
}
|
362
352
|
|
363
|
-
static class
|
353
|
+
public static class InvalidFormatException
|
364
354
|
extends RuntimeException
|
365
355
|
{
|
366
|
-
|
356
|
+
public InvalidFormatException(String message)
|
357
|
+
{
|
358
|
+
super(message);
|
359
|
+
}
|
360
|
+
}
|
361
|
+
|
362
|
+
public static class InvalidValueException
|
363
|
+
extends RuntimeException
|
364
|
+
{
|
365
|
+
public InvalidValueException(String message)
|
366
|
+
{
|
367
|
+
super(message);
|
368
|
+
}
|
369
|
+
}
|
370
|
+
|
371
|
+
public static class QuotedSizeLimitExceededException
|
372
|
+
extends InvalidValueException
|
373
|
+
{
|
374
|
+
public QuotedSizeLimitExceededException(String message)
|
375
|
+
{
|
376
|
+
super(message);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
|
380
|
+
public class TooManyColumnsException
|
381
|
+
extends InvalidFormatException
|
382
|
+
{
|
383
|
+
public TooManyColumnsException(String message)
|
384
|
+
{
|
385
|
+
super(message);
|
386
|
+
}
|
387
|
+
}
|
388
|
+
|
389
|
+
public class TooFewColumnsException
|
390
|
+
extends InvalidFormatException
|
391
|
+
{
|
392
|
+
public TooFewColumnsException(String message)
|
367
393
|
{
|
368
394
|
super(message);
|
369
395
|
}
|
@@ -1,7 +1,6 @@
|
|
1
1
|
package org.embulk.standards;
|
2
2
|
|
3
3
|
import org.junit.Rule;
|
4
|
-
import org.junit.Before;
|
5
4
|
import org.junit.Test;
|
6
5
|
import static org.junit.Assert.assertEquals;
|
7
6
|
import java.nio.charset.Charset;
|
@@ -34,6 +33,7 @@ public class TestCsvParserPlugin
|
|
34
33
|
assertEquals(false, task.getHeaderLine().or(false));
|
35
34
|
assertEquals(',', task.getDelimiterChar());
|
36
35
|
assertEquals('\"', task.getQuoteChar());
|
36
|
+
assertEquals(false, task.getAllowOptionalColumns());
|
37
37
|
}
|
38
38
|
|
39
39
|
@Test(expected = ConfigException.class)
|
@@ -53,6 +53,7 @@ public class TestCsvParserPlugin
|
|
53
53
|
.set("header_line", true)
|
54
54
|
.set("delimiter", "\t")
|
55
55
|
.set("quote", "\\")
|
56
|
+
.set("allow_optional_columns", true)
|
56
57
|
.set("columns", ImmutableList.of(
|
57
58
|
ImmutableMap.of(
|
58
59
|
"name", "date_code",
|
@@ -65,5 +66,6 @@ public class TestCsvParserPlugin
|
|
65
66
|
assertEquals(true, task.getHeaderLine().or(false));
|
66
67
|
assertEquals('\t', task.getDelimiterChar());
|
67
68
|
assertEquals('\\', task.getQuoteChar());
|
69
|
+
assertEquals(true, task.getAllowOptionalColumns());
|
68
70
|
}
|
69
71
|
}
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -275,6 +275,7 @@ files:
|
|
275
275
|
- embulk-docs/src/release/release-0.5.1.rst
|
276
276
|
- embulk-docs/src/release/release-0.5.2.rst
|
277
277
|
- embulk-docs/src/release/release-0.5.3.rst
|
278
|
+
- embulk-docs/src/release/release-0.5.4.rst
|
278
279
|
- embulk-standards/build.gradle
|
279
280
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
280
281
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
@@ -379,8 +380,8 @@ files:
|
|
379
380
|
- classpath/bval-jsr303-0.5.jar
|
380
381
|
- classpath/commons-beanutils-core-1.8.3.jar
|
381
382
|
- classpath/commons-lang3-3.1.jar
|
382
|
-
- classpath/embulk-core-0.5.3.jar
|
383
|
-
- classpath/embulk-standards-0.5.3.jar
|
383
|
+
- classpath/embulk-core-0.5.4.jar
|
384
|
+
- classpath/embulk-standards-0.5.4.jar
|
384
385
|
- classpath/guava-18.0.jar
|
385
386
|
- classpath/guice-3.0.jar
|
386
387
|
- classpath/guice-multibindings-3.0.jar
|