embulk 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/build.gradle +1 -1
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +1 -0
- data/embulk-core/src/main/java/org/embulk/spi/json/JsonParser.java +128 -21
- data/embulk-docs/src/built-in.rst +75 -2
- data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +16 -15
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.4.rst +18 -0
- data/embulk-standards/build.gradle +1 -0
- data/embulk-standards/src/main/java/org/embulk/standards/Bzip2FileDecoderPlugin.java +55 -0
- data/embulk-standards/src/main/java/org/embulk/standards/Bzip2FileEncoderPlugin.java +67 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +3 -0
- data/lib/embulk/guess/bzip2.rb +23 -0
- data/lib/embulk/runner.rb +2 -2
- data/lib/embulk/version.rb +1 -1
- metadata +9 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 671d7e093dbc15c2d87cf48cc69d6c46db01906f
|
4
|
+
data.tar.gz: 6fea3fc181559ca1821ca48f462cd2944994b741
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b4c6d18798c4cdc272348c5febdcc875478a2d21165fe37fd9b5c1f55fdd3c97b57aa8260f840cd7e213c11cb65ec1768c6df9c1413f16ea25435b2a50b2bce6
|
7
|
+
data.tar.gz: cee70f19578b92763838f3f404af6cd562aa566ab31548c0d9197c54e27f74bd11fd7ae49c2ed0ed94dd76e25ae20bb7add76e595f20cea0a76995e77ffe895b
|
data/build.gradle
CHANGED
@@ -4,6 +4,8 @@ import java.util.List;
|
|
4
4
|
import java.util.ArrayList;
|
5
5
|
import java.util.Map;
|
6
6
|
import java.util.HashMap;
|
7
|
+
import java.io.InputStream;
|
8
|
+
import java.io.Closeable;
|
7
9
|
import java.io.IOException;
|
8
10
|
import org.msgpack.value.Value;
|
9
11
|
import org.msgpack.value.ValueFactory;
|
@@ -13,6 +15,14 @@ import com.fasterxml.jackson.core.JsonToken;
|
|
13
15
|
|
14
16
|
public class JsonParser
|
15
17
|
{
|
18
|
+
public interface Stream
|
19
|
+
extends Closeable
|
20
|
+
{
|
21
|
+
Value next() throws IOException;
|
22
|
+
|
23
|
+
void close() throws IOException;
|
24
|
+
}
|
25
|
+
|
16
26
|
private final JsonFactory factory;
|
17
27
|
|
18
28
|
public JsonParser()
|
@@ -21,38 +31,136 @@ public class JsonParser
|
|
21
31
|
factory.enable(Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
|
22
32
|
}
|
23
33
|
|
34
|
+
public Stream open(InputStream in) throws IOException
|
35
|
+
{
|
36
|
+
return new StreamParseContext(factory, in);
|
37
|
+
}
|
38
|
+
|
24
39
|
public Value parse(String json)
|
25
40
|
{
|
26
|
-
return new
|
41
|
+
return new SingleParseContext(factory, json).parse();
|
42
|
+
}
|
43
|
+
|
44
|
+
private static String sampleJsonString(String json)
|
45
|
+
{
|
46
|
+
if (json.length() < 100) {
|
47
|
+
return json;
|
48
|
+
}
|
49
|
+
else {
|
50
|
+
return json.substring(0, 97) + "...";
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
private static class StreamParseContext
|
55
|
+
extends AbstractParseContext
|
56
|
+
implements Stream
|
57
|
+
{
|
58
|
+
public StreamParseContext(JsonFactory factory, InputStream in)
|
59
|
+
throws IOException, JsonParseException
|
60
|
+
{
|
61
|
+
super(createParser(factory, in));
|
62
|
+
}
|
63
|
+
|
64
|
+
private static com.fasterxml.jackson.core.JsonParser createParser(JsonFactory factory, InputStream in)
|
65
|
+
throws IOException
|
66
|
+
{
|
67
|
+
try {
|
68
|
+
return factory.createParser(in);
|
69
|
+
}
|
70
|
+
catch (IOException ex) {
|
71
|
+
throw ex;
|
72
|
+
}
|
73
|
+
catch (Exception ex) {
|
74
|
+
throw new JsonParseException("Failed to parse JSON", ex);
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
@Override
|
79
|
+
public void close() throws IOException
|
80
|
+
{
|
81
|
+
parser.close();
|
82
|
+
}
|
83
|
+
|
84
|
+
@Override
|
85
|
+
protected String sampleJsonString()
|
86
|
+
{
|
87
|
+
return "in";
|
88
|
+
}
|
27
89
|
}
|
28
90
|
|
29
|
-
private class
|
91
|
+
private static class SingleParseContext
|
92
|
+
extends AbstractParseContext
|
30
93
|
{
|
31
94
|
private final String json;
|
32
|
-
private final com.fasterxml.jackson.core.JsonParser parser;
|
33
95
|
|
34
|
-
public
|
96
|
+
public SingleParseContext(JsonFactory factory, String json)
|
35
97
|
{
|
98
|
+
super(createParser(factory, json));
|
36
99
|
this.json = json;
|
100
|
+
}
|
101
|
+
|
102
|
+
private static com.fasterxml.jackson.core.JsonParser createParser(JsonFactory factory, String json)
|
103
|
+
{
|
37
104
|
try {
|
38
|
-
|
105
|
+
return factory.createParser(json);
|
39
106
|
}
|
40
107
|
catch (Exception ex) {
|
41
|
-
throw new JsonParseException("Failed to parse
|
108
|
+
throw new JsonParseException("Failed to parse JSON: "+JsonParser.sampleJsonString(json), ex);
|
42
109
|
}
|
43
110
|
}
|
44
111
|
|
45
112
|
public Value parse()
|
113
|
+
{
|
114
|
+
try {
|
115
|
+
Value v = next();
|
116
|
+
if (v == null) {
|
117
|
+
throw new JsonParseException("Unable to parse empty string");
|
118
|
+
}
|
119
|
+
return v;
|
120
|
+
}
|
121
|
+
catch (IOException ex) {
|
122
|
+
throw new JsonParseException("Failed to parse JSON: "+sampleJsonString(), ex);
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
@Override
|
127
|
+
protected String sampleJsonString()
|
128
|
+
{
|
129
|
+
return JsonParser.sampleJsonString(json);
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
private static abstract class AbstractParseContext
|
134
|
+
{
|
135
|
+
protected final com.fasterxml.jackson.core.JsonParser parser;
|
136
|
+
|
137
|
+
public AbstractParseContext(com.fasterxml.jackson.core.JsonParser parser)
|
138
|
+
{
|
139
|
+
this.parser = parser;
|
140
|
+
}
|
141
|
+
|
142
|
+
protected abstract String sampleJsonString();
|
143
|
+
|
144
|
+
public Value next() throws IOException
|
46
145
|
{
|
47
146
|
try {
|
48
147
|
JsonToken token = parser.nextToken();
|
148
|
+
if (token == null) {
|
149
|
+
return null;
|
150
|
+
}
|
49
151
|
return jsonTokenToValue(token);
|
50
152
|
}
|
153
|
+
catch (com.fasterxml.jackson.core.JsonParseException ex) {
|
154
|
+
throw new JsonParseException("Failed to parse JSON: "+sampleJsonString(), ex);
|
155
|
+
}
|
156
|
+
catch (IOException ex) {
|
157
|
+
throw ex;
|
158
|
+
}
|
51
159
|
catch (JsonParseException ex) {
|
52
160
|
throw ex;
|
53
161
|
}
|
54
|
-
catch (
|
55
|
-
throw new JsonParseException("Failed to parse
|
162
|
+
catch (RuntimeException ex) {
|
163
|
+
throw new JsonParseException("Failed to parse JSON: "+sampleJsonString(), ex);
|
56
164
|
}
|
57
165
|
}
|
58
166
|
|
@@ -81,9 +189,12 @@ public class JsonParser
|
|
81
189
|
List<Value> list = new ArrayList<>();
|
82
190
|
while (true) {
|
83
191
|
token = parser.nextToken();
|
84
|
-
if(token == JsonToken.END_ARRAY) {
|
192
|
+
if (token == JsonToken.END_ARRAY) {
|
85
193
|
return ValueFactory.newArray(list);
|
86
194
|
}
|
195
|
+
else if (token == null) {
|
196
|
+
throw new JsonParseException("Unexpected end of JSON at "+parser.getTokenLocation() + " while expecting an element of an array: " + sampleJsonString());
|
197
|
+
}
|
87
198
|
list.add(jsonTokenToValue(token));
|
88
199
|
}
|
89
200
|
}
|
@@ -94,11 +205,17 @@ public class JsonParser
|
|
94
205
|
if (token == JsonToken.END_OBJECT) {
|
95
206
|
return ValueFactory.newMap(map);
|
96
207
|
}
|
208
|
+
else if (token == null) {
|
209
|
+
throw new JsonParseException("Unexpected end of JSON at "+parser.getTokenLocation() + " while expecting a key of object: " + sampleJsonString());
|
210
|
+
}
|
97
211
|
String key = parser.getCurrentName();
|
98
212
|
if (key == null) {
|
99
|
-
throw new JsonParseException("Unexpected token "+token+" at "+parser.getTokenLocation());
|
213
|
+
throw new JsonParseException("Unexpected token "+token+" at "+parser.getTokenLocation() + ": " + sampleJsonString());
|
100
214
|
}
|
101
215
|
token = parser.nextToken();
|
216
|
+
if (token == null) {
|
217
|
+
throw new JsonParseException("Unexpected end of JSON at "+parser.getTokenLocation() + " while expecting a value of object: " + sampleJsonString());
|
218
|
+
}
|
102
219
|
Value value = jsonTokenToValue(token);
|
103
220
|
map.put(ValueFactory.newString(key), value);
|
104
221
|
}
|
@@ -108,18 +225,8 @@ public class JsonParser
|
|
108
225
|
case END_OBJECT:
|
109
226
|
case NOT_AVAILABLE:
|
110
227
|
default:
|
111
|
-
throw new JsonParseException("Unexpected token "+token+" at "+parser.getTokenLocation());
|
228
|
+
throw new JsonParseException("Unexpected token "+token+" at "+parser.getTokenLocation() + ": " + sampleJsonString());
|
112
229
|
}
|
113
230
|
}
|
114
231
|
}
|
115
|
-
|
116
|
-
private static String sampleJsonString(String json)
|
117
|
-
{
|
118
|
-
if (json.length() < 100) {
|
119
|
-
return json;
|
120
|
-
}
|
121
|
-
else {
|
122
|
-
return json.substring(0, 97) + "...";
|
123
|
-
}
|
124
|
-
}
|
125
232
|
}
|
@@ -44,13 +44,13 @@ A configuration file consists of following sections:
|
|
44
44
|
|
45
45
|
* **parser:** If the input is file-based, parser plugin parses a file format (built-in csv, `json <https://github.com/takumakanari/embulk-parser-json>`_, etc).
|
46
46
|
|
47
|
-
* **decoder:** If the input is file-based, decoder plugin decodes compression or encryption (built-in gzip, `zip <https://github.com/hata/embulk-decoder-commons-compress>`_, `tar.gz <https://github.com/hata/embulk-decoder-commons-compress>`_, etc).
|
47
|
+
* **decoder:** If the input is file-based, decoder plugin decodes compression or encryption (built-in gzip, bzip2, `zip <https://github.com/hata/embulk-decoder-commons-compress>`_, `tar.gz <https://github.com/hata/embulk-decoder-commons-compress>`_, etc).
|
48
48
|
|
49
49
|
* **out:** Output plugin options. An output plugin is either record-based (`Oracle <https://github.com/embulk/embulk-output-jdbc>`_, `Elasticsearch <https://github.com/muga/embulk-output-elasticsearch>`_, etc) or file-based (`Google Cloud Storage <https://github.com/hakobera/embulk-output-gcs>`_, `Command <https://github.com/embulk/embulk-output-command>`_, etc)
|
50
50
|
|
51
51
|
* **formatter:** If the output is file-based, formatter plugin formats a file format (such as built-in csv, `JSON <https://github.com/takei-yuya/embulk-formatter-jsonl>`_)
|
52
52
|
|
53
|
-
* **encoder:** If the output is file-based, encoder plugin encodes compression or encryption (such as built-in gzip)
|
53
|
+
* **encoder:** If the output is file-based, encoder plugin encodes compression or encryption (such as built-in gzip or bzip2)
|
54
54
|
|
55
55
|
* **filters:** Filter plugins options (optional).
|
56
56
|
|
@@ -298,6 +298,27 @@ Example
|
|
298
298
|
- {type: gzip}
|
299
299
|
|
300
300
|
|
301
|
+
BZip2 decoder plugin
|
302
|
+
------------------
|
303
|
+
|
304
|
+
The ``bzip2`` decoder plugin decompresses bzip2 files before input plugins read them.
|
305
|
+
|
306
|
+
Options
|
307
|
+
~~~~~~~~~~~~~~~~~~
|
308
|
+
|
309
|
+
This plugin doesn't have any options.
|
310
|
+
|
311
|
+
Example
|
312
|
+
~~~~~~~~~~~~~~~~~~
|
313
|
+
|
314
|
+
.. code-block:: yaml
|
315
|
+
|
316
|
+
in:
|
317
|
+
...
|
318
|
+
decoders:
|
319
|
+
- {type: bzip2}
|
320
|
+
|
321
|
+
|
301
322
|
File output plugin
|
302
323
|
------------------
|
303
324
|
|
@@ -448,6 +469,58 @@ Example
|
|
448
469
|
- type: gzip
|
449
470
|
level: 1
|
450
471
|
|
472
|
+
|
473
|
+
Gzip encoder plugin
|
474
|
+
------------------
|
475
|
+
|
476
|
+
The ``gzip`` encoder plugin compresses output files using gzip.
|
477
|
+
|
478
|
+
Options
|
479
|
+
~~~~~~~~~~~~~~~~~~
|
480
|
+
|
481
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
482
|
+
| name | type | description | required? |
|
483
|
+
+=========+==========+======================================================================+====================+
|
484
|
+
| level | integer | Compression level. From 0 (no compression) to 9 (best compression). | ``6`` by default |
|
485
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
486
|
+
|
487
|
+
Example
|
488
|
+
~~~~~~~~~~~~~~~~~~
|
489
|
+
|
490
|
+
.. code-block:: yaml
|
491
|
+
|
492
|
+
out:
|
493
|
+
...
|
494
|
+
encoders:
|
495
|
+
- type: gzip
|
496
|
+
level: 1
|
497
|
+
|
498
|
+
BZip2 encoder plugin
|
499
|
+
------------------
|
500
|
+
|
501
|
+
The ``bzip2`` encoder plugin compresses output files using bzip2.
|
502
|
+
|
503
|
+
Options
|
504
|
+
~~~~~~~~~~~~~~~~~~
|
505
|
+
|
506
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
507
|
+
| name | type | description | required? |
|
508
|
+
+=========+==========+======================================================================+====================+
|
509
|
+
| level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
|
510
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
511
|
+
|
512
|
+
Example
|
513
|
+
~~~~~~~~~~~~~~~~~~
|
514
|
+
|
515
|
+
.. code-block:: yaml
|
516
|
+
|
517
|
+
out:
|
518
|
+
...
|
519
|
+
encoders:
|
520
|
+
- type: bzip2
|
521
|
+
level: 6
|
522
|
+
|
523
|
+
|
451
524
|
Rename filter plugin
|
452
525
|
------------------
|
453
526
|
|
@@ -24,9 +24,9 @@ For the smallest setup, you can unzip the package and run `./bin/elasticsearch`
|
|
24
24
|
|
25
25
|
.. code-block:: console
|
26
26
|
|
27
|
-
$ wget https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-
|
28
|
-
$ unzip elasticsearch-
|
29
|
-
$ cd elasticsearch-
|
27
|
+
$ wget https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/zip/elasticsearch/2.2.0/elasticsearch-2.2.0.zip
|
28
|
+
$ unzip elasticsearch-2.2.0.zip
|
29
|
+
$ cd elasticsearch-2.2.0
|
30
30
|
$ ./bin/elasticsearch
|
31
31
|
|
32
32
|
Step 2. Download and unzip Kibana:
|
@@ -36,12 +36,12 @@ You can find releases from the `Kibana website <http://www.elasticsearch.org/ove
|
|
36
36
|
|
37
37
|
.. code-block:: console
|
38
38
|
|
39
|
-
$ wget https://download.
|
40
|
-
$ tar zxvf kibana-4.
|
41
|
-
$ cd kibana-4.
|
39
|
+
$ wget https://download.elastic.co/kibana/kibana/kibana-4.4.0-linux-x64.tar.gz
|
40
|
+
$ tar zxvf kibana-4.4.0-linux-x64.tar.gz
|
41
|
+
$ cd kibana-4.4.0-linux-x64
|
42
42
|
$ ./bin/kibana
|
43
43
|
|
44
|
-
Note: If you're using Mac OS X, https://download.
|
44
|
+
Note: If you're using Mac OS X, https://download.elastic.co/kibana/kibana/kibana-4.4.0-darwin-x64.tar.gz is the URL to download.
|
45
45
|
|
46
46
|
Now Elasticsearch and Kibana have started. Open http://localhost:5601/ using your browser to see Kibana's graphical interface.
|
47
47
|
|
@@ -75,7 +75,7 @@ Loading a CSV file
|
|
75
75
|
|
76
76
|
Assuming you have CSV files in the ``./mydata/csv/`` directory. If you don't have CSV files, you can create some using the ``embulk example ./mydata`` command.
|
77
77
|
|
78
|
-
Create this configuration file and save as ``
|
78
|
+
Create this configuration file and save as ``seed.yml``:
|
79
79
|
|
80
80
|
.. code-block:: yaml
|
81
81
|
|
@@ -93,9 +93,9 @@ In fact, this configuration lacks some important information. However, embulk gu
|
|
93
93
|
|
94
94
|
.. code-block:: console
|
95
95
|
|
96
|
-
$ embulk guess
|
96
|
+
$ embulk guess ./mydata/seed.yml -o config.yml
|
97
97
|
|
98
|
-
The generated config
|
98
|
+
The generated config.yml file should include complete information as follows:
|
99
99
|
|
100
100
|
.. code-block:: yaml
|
101
101
|
|
@@ -137,24 +137,25 @@ Now, you can run the bulk loading:
|
|
137
137
|
|
138
138
|
.. code-block:: console
|
139
139
|
|
140
|
-
$ embulk run config
|
140
|
+
$ embulk run config.yml -c diff.yml
|
141
141
|
|
142
142
|
Scheduling loading by cron
|
143
143
|
------------------
|
144
144
|
|
145
|
-
At the last step, you ran embulk command with ``-
|
145
|
+
At the last step, you ran embulk command with ``-c diff.yml`` file. The ``diff.yml`` file should include a parameter named ``last_path``:
|
146
146
|
|
147
147
|
.. code-block:: yaml
|
148
148
|
|
149
|
-
last_path: mydata/csv/sample_01.csv.gz
|
149
|
+
in: {last_path: mydata/csv/sample_01.csv.gz}
|
150
|
+
out: {}
|
150
151
|
|
151
152
|
With this configuration, embulk loads the files newer than this file in alphabetical order.
|
152
153
|
|
153
|
-
For example, if you create ``./mydata/csv/sample_02.csv.gz`` file, embulk skips ``sample_01.csv.gz`` file and loads ``sample_02.csv.gz`` only next time. And the next
|
154
|
+
For example, if you create ``./mydata/csv/sample_02.csv.gz`` file, embulk skips ``sample_01.csv.gz`` file and loads ``sample_02.csv.gz`` only next time. And the next ``diff.yml`` file has ``last_path: mydata/csv/sample_02.csv.gz`` for the next next execution.
|
154
155
|
|
155
156
|
So, if you want to load newly created files every day, you can set up this cron schedule:
|
156
157
|
|
157
158
|
.. code-block:: cron
|
158
159
|
|
159
|
-
0 * * * * embulk run /path/to/
|
160
|
+
0 * * * * embulk run /path/to/config.yml -c /path/to/diff.yml
|
160
161
|
|
data/embulk-docs/src/release.rst
CHANGED
@@ -0,0 +1,18 @@
|
|
1
|
+
Release 0.8.4
|
2
|
+
==================================
|
3
|
+
|
4
|
+
General Changes
|
5
|
+
------------------
|
6
|
+
|
7
|
+
* Added ``bzip2`` encoder plugin, decoder plugin, and guess plugin.
|
8
|
+
* Fixed PageBuilder to close the underlying output plugin.
|
9
|
+
* Embulk::Runner accepts .yaml as well as .yml
|
10
|
+
|
11
|
+
Java API
|
12
|
+
------------------
|
13
|
+
|
14
|
+
* Added JsonParser.Stream API for parser plugins to parse a stream of json objects.
|
15
|
+
|
16
|
+
Release Date
|
17
|
+
------------------
|
18
|
+
2016-02-16
|
@@ -0,0 +1,55 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import java.io.InputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import org.embulk.config.Task;
|
6
|
+
import org.embulk.config.TaskSource;
|
7
|
+
import org.embulk.config.ConfigSource;
|
8
|
+
import org.embulk.config.ConfigInject;
|
9
|
+
import org.embulk.spi.DecoderPlugin;
|
10
|
+
import org.embulk.spi.BufferAllocator;
|
11
|
+
import org.embulk.spi.FileInput;
|
12
|
+
import org.embulk.spi.util.FileInputInputStream;
|
13
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
14
|
+
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
|
15
|
+
|
16
|
+
public class Bzip2FileDecoderPlugin
|
17
|
+
implements DecoderPlugin
|
18
|
+
{
|
19
|
+
public interface PluginTask
|
20
|
+
extends Task
|
21
|
+
{
|
22
|
+
@ConfigInject
|
23
|
+
BufferAllocator getBufferAllocator();
|
24
|
+
}
|
25
|
+
|
26
|
+
@Override
|
27
|
+
public void transaction(ConfigSource config, DecoderPlugin.Control control)
|
28
|
+
{
|
29
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
30
|
+
control.run(task.dump());
|
31
|
+
}
|
32
|
+
|
33
|
+
@Override
|
34
|
+
public FileInput open(TaskSource taskSource, FileInput fileInput)
|
35
|
+
{
|
36
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
37
|
+
final FileInputInputStream files = new FileInputInputStream(fileInput);
|
38
|
+
return new InputStreamFileInput(
|
39
|
+
task.getBufferAllocator(),
|
40
|
+
new InputStreamFileInput.Provider() {
|
41
|
+
public InputStream openNext() throws IOException
|
42
|
+
{
|
43
|
+
if (!files.nextFile()) {
|
44
|
+
return null;
|
45
|
+
}
|
46
|
+
return new BZip2CompressorInputStream(files, true);
|
47
|
+
}
|
48
|
+
|
49
|
+
public void close() throws IOException
|
50
|
+
{
|
51
|
+
files.close();
|
52
|
+
}
|
53
|
+
});
|
54
|
+
}
|
55
|
+
}
|
@@ -0,0 +1,67 @@
|
|
1
|
+
package org.embulk.standards;
|
2
|
+
|
3
|
+
import java.io.OutputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import javax.validation.constraints.Min;
|
6
|
+
import javax.validation.constraints.Max;
|
7
|
+
import org.embulk.config.Task;
|
8
|
+
import org.embulk.config.Config;
|
9
|
+
import org.embulk.config.ConfigInject;
|
10
|
+
import org.embulk.config.ConfigDefault;
|
11
|
+
import org.embulk.config.TaskSource;
|
12
|
+
import org.embulk.config.ConfigSource;
|
13
|
+
import org.embulk.spi.EncoderPlugin;
|
14
|
+
import org.embulk.spi.FileOutput;
|
15
|
+
import org.embulk.spi.BufferAllocator;
|
16
|
+
import org.embulk.spi.util.FileOutputOutputStream;
|
17
|
+
import org.embulk.spi.util.OutputStreamFileOutput;
|
18
|
+
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
|
19
|
+
|
20
|
+
public class Bzip2FileEncoderPlugin
|
21
|
+
implements EncoderPlugin
|
22
|
+
{
|
23
|
+
public interface PluginTask
|
24
|
+
extends Task
|
25
|
+
{
|
26
|
+
@Config("level")
|
27
|
+
@ConfigDefault("9")
|
28
|
+
@Min(1)
|
29
|
+
@Max(9)
|
30
|
+
int getLevel();
|
31
|
+
|
32
|
+
@ConfigInject
|
33
|
+
BufferAllocator getBufferAllocator();
|
34
|
+
}
|
35
|
+
|
36
|
+
public void transaction(ConfigSource config, EncoderPlugin.Control control)
|
37
|
+
{
|
38
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
39
|
+
control.run(task.dump());
|
40
|
+
}
|
41
|
+
|
42
|
+
@Override
|
43
|
+
public FileOutput open(TaskSource taskSource, final FileOutput fileOutput)
|
44
|
+
{
|
45
|
+
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
46
|
+
|
47
|
+
final FileOutputOutputStream output = new FileOutputOutputStream(fileOutput, task.getBufferAllocator(), FileOutputOutputStream.CloseMode.FLUSH);
|
48
|
+
|
49
|
+
return new OutputStreamFileOutput(new OutputStreamFileOutput.Provider() {
|
50
|
+
public OutputStream openNext() throws IOException
|
51
|
+
{
|
52
|
+
output.nextFile();
|
53
|
+
return new BZip2CompressorOutputStream(output, task.getLevel());
|
54
|
+
}
|
55
|
+
|
56
|
+
public void finish() throws IOException
|
57
|
+
{
|
58
|
+
fileOutput.finish();
|
59
|
+
}
|
60
|
+
|
61
|
+
public void close() throws IOException
|
62
|
+
{
|
63
|
+
fileOutput.close();
|
64
|
+
}
|
65
|
+
});
|
66
|
+
}
|
67
|
+
}
|
@@ -30,6 +30,7 @@ public class StandardPluginModule
|
|
30
30
|
|
31
31
|
// file decoder plugins
|
32
32
|
registerPluginTo(binder, DecoderPlugin.class, "gzip", GzipFileDecoderPlugin.class);
|
33
|
+
registerPluginTo(binder, DecoderPlugin.class, "bzip2", Bzip2FileDecoderPlugin.class);
|
33
34
|
|
34
35
|
// output plugins
|
35
36
|
registerPluginTo(binder, OutputPlugin.class, "file", LocalFileOutputPlugin.class);
|
@@ -41,12 +42,14 @@ public class StandardPluginModule
|
|
41
42
|
|
42
43
|
// file encoder plugins
|
43
44
|
registerPluginTo(binder, EncoderPlugin.class, "gzip", GzipFileEncoderPlugin.class);
|
45
|
+
registerPluginTo(binder, EncoderPlugin.class, "bzip2", Bzip2FileEncoderPlugin.class);
|
44
46
|
|
45
47
|
// filter plugins
|
46
48
|
registerPluginTo(binder, FilterPlugin.class, "rename", RenameFilterPlugin.class);
|
47
49
|
|
48
50
|
// default guess plugins
|
49
51
|
registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
|
52
|
+
registerDefaultGuessPluginTo(binder, new PluginType("bzip2"));
|
50
53
|
registerDefaultGuessPluginTo(binder, new PluginType("csv"));
|
51
54
|
// charset and newline guess plugins are loaded and invoked by CsvGuessPlugin
|
52
55
|
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
class Bzip2GuessPlugin < GuessPlugin
|
5
|
+
Plugin.register_guess('bzip2', self)
|
6
|
+
|
7
|
+
# magic: BZ
|
8
|
+
# version: 'h' = bzip2
|
9
|
+
# blocksize: 1 .. 9
|
10
|
+
# block magic: 0x314159265359 (6 bytes)
|
11
|
+
block_magic = [0x31, 0x41, 0x59, 0x26, 0x53, 0x59].pack('C*')
|
12
|
+
BZIP2_HEADER_PATTERN = /BZh[1-9]#{Regexp.quote(block_magic)}/n
|
13
|
+
|
14
|
+
def guess(config, sample_buffer)
|
15
|
+
if sample_buffer[0,10] =~ BZIP2_HEADER_PATTERN
|
16
|
+
return {"decoders" => [{"type" => "bzip2"}]}
|
17
|
+
end
|
18
|
+
return {}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
23
|
+
end
|
data/lib/embulk/runner.rb
CHANGED
@@ -122,12 +122,12 @@ module Embulk
|
|
122
122
|
case config
|
123
123
|
when String
|
124
124
|
case config
|
125
|
-
when /\.
|
125
|
+
when /\.ya?ml\.liquid$/
|
126
126
|
require 'liquid'
|
127
127
|
template_params = options[:template_params] || {}
|
128
128
|
template_include_path = File.expand_path(options[:template_include_path] || File.dirname(config)) unless options[:template_include_path] == false
|
129
129
|
@embed.newConfigLoader.fromYamlString run_liquid(File.read(config), template_params, template_include_path)
|
130
|
-
when /\.
|
130
|
+
when /\.ya?ml$/
|
131
131
|
@embed.newConfigLoader.fromYamlString File.read(config)
|
132
132
|
else
|
133
133
|
raise ConfigError.new("Unsupported file extension. Supported file extensions are .yml and .yml.liquid: #{config}")
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: jruby-jars
|
@@ -106,9 +106,10 @@ files:
|
|
106
106
|
- classpath/bval-core-0.5.jar
|
107
107
|
- classpath/bval-jsr303-0.5.jar
|
108
108
|
- classpath/commons-beanutils-core-1.8.3.jar
|
109
|
+
- classpath/commons-compress-1.10.jar
|
109
110
|
- classpath/commons-lang3-3.1.jar
|
110
|
-
- classpath/embulk-core-0.8.
|
111
|
-
- classpath/embulk-standards-0.8.
|
111
|
+
- classpath/embulk-core-0.8.4.jar
|
112
|
+
- classpath/embulk-standards-0.8.4.jar
|
112
113
|
- classpath/guava-18.0.jar
|
113
114
|
- classpath/guice-4.0.jar
|
114
115
|
- classpath/guice-bootstrap-0.1.1.jar
|
@@ -422,7 +423,10 @@ files:
|
|
422
423
|
- embulk-docs/src/release/release-0.8.1.rst
|
423
424
|
- embulk-docs/src/release/release-0.8.2.rst
|
424
425
|
- embulk-docs/src/release/release-0.8.3.rst
|
426
|
+
- embulk-docs/src/release/release-0.8.4.rst
|
425
427
|
- embulk-standards/build.gradle
|
428
|
+
- embulk-standards/src/main/java/org/embulk/standards/Bzip2FileDecoderPlugin.java
|
429
|
+
- embulk-standards/src/main/java/org/embulk/standards/Bzip2FileEncoderPlugin.java
|
426
430
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
427
431
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
428
432
|
- embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java
|
@@ -507,6 +511,7 @@ files:
|
|
507
511
|
- lib/embulk/file_output_plugin.rb
|
508
512
|
- lib/embulk/filter_plugin.rb
|
509
513
|
- lib/embulk/formatter_plugin.rb
|
514
|
+
- lib/embulk/guess/bzip2.rb
|
510
515
|
- lib/embulk/guess/charset.rb
|
511
516
|
- lib/embulk/guess/csv.rb
|
512
517
|
- lib/embulk/guess/gzip.rb
|