embulk 0.8.4 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 671d7e093dbc15c2d87cf48cc69d6c46db01906f
4
- data.tar.gz: 6fea3fc181559ca1821ca48f462cd2944994b741
3
+ metadata.gz: 99cb7ac6a07d7e8db18b43a95012b56e20ae35cc
4
+ data.tar.gz: 664f254e002c1d086e73412e67a9efd32c4b4787
5
5
  SHA512:
6
- metadata.gz: b4c6d18798c4cdc272348c5febdcc875478a2d21165fe37fd9b5c1f55fdd3c97b57aa8260f840cd7e213c11cb65ec1768c6df9c1413f16ea25435b2a50b2bce6
7
- data.tar.gz: cee70f19578b92763838f3f404af6cd562aa566ab31548c0d9197c54e27f74bd11fd7ae49c2ed0ed94dd76e25ae20bb7add76e595f20cea0a76995e77ffe895b
6
+ metadata.gz: 0076dcca57267d34e1474373a56addccbaec0a43b30dc8a23bd504ccca3725802f0e41d90caba94387e0db560090e076829f48bb34789cf8cebaeb4fe7bd93f4
7
+ data.tar.gz: c0889fa0e4a95b9adb83e4460f4e4821c2c6948ac6583649286e67af305459bd5855df03fa3784da01a048d9390c7218e739e1247ff5d1b84f353047baddd611
data/build.gradle CHANGED
@@ -16,7 +16,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
16
16
 
17
17
  allprojects {
18
18
  group = 'org.embulk'
19
- version = '0.8.4'
19
+ version = '0.8.5'
20
20
 
21
21
  ext {
22
22
  jrubyVersion = '9.0.4.0'
@@ -277,6 +277,40 @@ Example
277
277
  - {name: purchase, type: timestamp, format: '%Y%m%d'}
278
278
  - {name: comment, type: string}
279
279
 
280
+
281
+ JSON parser plugin
282
+ ------------------
283
+
284
+ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
285
+
286
+ .. code-block:: json
287
+
288
+ {"time":1455829282,"ip":"93.184.216.34","name":"frsyuki"}
289
+ {"time":1455829282,"ip":"172.36.8.109","name":"sadayuki"}
290
+ {"time":1455829284,"ip":"example.com","name":"Treasure Data"}
291
+ {"time":1455829282,"ip":"10.98.43.1","name":"MessagePack"}
292
+
293
+ The ``json`` parser plugin outputs each JSON object as one record with a single column named "record" (type is ``json``).
294
+
295
+ Options
296
+ ~~~~~~~~~~~~~~~~~~
297
+
298
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
299
+ | name | type | description | required? |
300
+ +============================+==========+================================================================================================================+========================+
301
+ | stop\_on\_invalid\_record | boolean | Stop the bulk load transaction if a file includes an invalid record (such as a non-object JSON value) | ``false`` by default |
302
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
303
+
304
+
305
+ Example
306
+ ~~~~~~~~~~~~~~~~~~
307
+
308
+ .. code-block:: yaml
309
+
310
+ in:
311
+ parser:
312
+ type: json
313
+
280
314
  Gzip decoder plugin
281
315
  ------------------
282
316
 
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.8.5
7
8
  release/release-0.8.4
8
9
  release/release-0.8.3
9
10
  release/release-0.8.2
@@ -0,0 +1,11 @@
1
+ Release 0.8.5
2
+ ==================================
3
+
4
+ General Changes
5
+ ------------------
6
+
7
+ * Added the ``json`` parser plugin. It has no options to extract values into columns (unlike the embulk-parser-jsonl plugin, which supports them). Extracting values into columns is intended to be handled by a filter plugin.
8
+
9
+ Release Date
10
+ ------------------
11
+ 2016-02-18
@@ -0,0 +1,116 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.annotations.VisibleForTesting;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.spi.Column;
10
+ import org.embulk.spi.DataException;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.FileInput;
13
+ import org.embulk.spi.PageBuilder;
14
+ import org.embulk.spi.PageOutput;
15
+ import org.embulk.spi.ParserPlugin;
16
+ import org.embulk.spi.Schema;
17
+ import org.embulk.spi.json.JsonParseException;
18
+ import org.embulk.spi.json.JsonParser;
19
+ import org.embulk.spi.type.Types;
20
+ import org.embulk.spi.util.FileInputInputStream;
21
+ import org.msgpack.value.Value;
22
+ import org.slf4j.Logger;
23
+
24
+ import java.io.IOException;
25
+
26
+ public class JsonParserPlugin
27
+ implements ParserPlugin
28
+ {
29
+ public interface PluginTask
30
+ extends Task
31
+ {
32
+ @Config("stop_on_invalid_record")
33
+ @ConfigDefault("false")
34
+ boolean getStopOnInvalidRecord();
35
+ }
36
+
37
+ private final Logger log;
38
+
39
+ public JsonParserPlugin()
40
+ {
41
+ this.log = Exec.getLogger(JsonParserPlugin.class);
42
+ }
43
+
44
+ @Override
45
+ public void transaction(ConfigSource configSource, Control control)
46
+ {
47
+ PluginTask task = configSource.loadConfig(PluginTask.class);
48
+ control.run(task.dump(), newSchema());
49
+ }
50
+
51
+ @VisibleForTesting
52
+ Schema newSchema()
53
+ {
54
+ return Schema.builder().add("record", Types.JSON).build(); // generate a schema
55
+ }
56
+
57
+ @Override
58
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output)
59
+ {
60
+ PluginTask task = taskSource.loadTask(PluginTask.class);
61
+
62
+ final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
63
+ final Column column = schema.getColumn(0); // record column
64
+
65
+ try (PageBuilder pageBuilder = newPageBuilder(schema, output);
66
+ FileInputInputStream in = new FileInputInputStream(input)) {
67
+ while (in.nextFile()) {
68
+ try (JsonParser.Stream stream = newJsonStream(in)) {
69
+ Value value;
70
+ while ((value = stream.next()) != null) {
71
+ try {
72
+ if (!value.isMapValue()) {
73
+ throw new JsonRecordValidateException(
74
+ String.format("A Json record must not represent map value but it's %s", value.getValueType().name()));
75
+ }
76
+
77
+ pageBuilder.setJson(column, value);
78
+ pageBuilder.addRecord();
79
+ }
80
+ catch (JsonRecordValidateException e) {
81
+ if (stopOnInvalidRecord) {
82
+ throw new DataException(String.format("Invalid record: %s", value.toJson()), e);
83
+ }
84
+ log.warn(String.format("Skipped record (%s): %s", e.getMessage(), value.toJson()));
85
+ }
86
+ }
87
+ }
88
+ catch (IOException | JsonParseException e) {
89
+ throw new DataException(e);
90
+ }
91
+ }
92
+
93
+ pageBuilder.finish();
94
+ }
95
+ }
96
+
97
+ private PageBuilder newPageBuilder(Schema schema, PageOutput output)
98
+ {
99
+ return new PageBuilder(Exec.getBufferAllocator(), schema, output);
100
+ }
101
+
102
+ private JsonParser.Stream newJsonStream(FileInputInputStream in)
103
+ throws IOException
104
+ {
105
+ return new JsonParser().open(in);
106
+ }
107
+
108
+ static class JsonRecordValidateException
109
+ extends DataException
110
+ {
111
+ JsonRecordValidateException(String message)
112
+ {
113
+ super(message);
114
+ }
115
+ }
116
+ }
@@ -27,6 +27,7 @@ public class StandardPluginModule
27
27
 
28
28
  // parser plugins
29
29
  registerPluginTo(binder, ParserPlugin.class, "csv", CsvParserPlugin.class);
30
+ registerPluginTo(binder, ParserPlugin.class, "json", JsonParserPlugin.class);
30
31
 
31
32
  // file decoder plugins
32
33
  registerPluginTo(binder, DecoderPlugin.class, "gzip", GzipFileDecoderPlugin.class);
@@ -50,6 +51,7 @@ public class StandardPluginModule
50
51
  // default guess plugins
51
52
  registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
52
53
  registerDefaultGuessPluginTo(binder, new PluginType("bzip2"));
54
+ registerDefaultGuessPluginTo(binder, new PluginType("json")); // should be registered before CsvGuessPlugin
53
55
  registerDefaultGuessPluginTo(binder, new PluginType("csv"));
54
56
  // charset and newline guess plugins are loaded and invoked by CsvGuessPlugin
55
57
  }
@@ -0,0 +1,169 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import org.embulk.EmbulkTestRuntime;
5
+ import org.embulk.config.ConfigSource;
6
+ import org.embulk.config.TaskSource;
7
+ import org.embulk.spi.DataException;
8
+ import org.embulk.spi.FileInput;
9
+ import org.embulk.spi.ParserPlugin;
10
+ import org.embulk.spi.Schema;
11
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
12
+ import org.embulk.spi.util.InputStreamFileInput;
13
+ import org.embulk.spi.util.Pages;
14
+ import org.junit.Before;
15
+ import org.junit.Rule;
16
+ import org.junit.Test;
17
+ import org.msgpack.value.Value;
18
+
19
+ import java.io.ByteArrayInputStream;
20
+ import java.io.IOException;
21
+ import java.io.InputStream;
22
+ import java.util.List;
23
+ import java.util.Map;
24
+
25
+ import static org.junit.Assert.assertEquals;
26
+ import static org.junit.Assert.assertTrue;
27
+ import static org.junit.Assert.fail;
28
+ import static org.msgpack.value.ValueFactory.newArray;
29
+ import static org.msgpack.value.ValueFactory.newBoolean;
30
+ import static org.msgpack.value.ValueFactory.newInteger;
31
+ import static org.msgpack.value.ValueFactory.newMap;
32
+ import static org.msgpack.value.ValueFactory.newString;
33
+
34
+ public class TestJsonParserPlugin
35
+ {
36
+ @Rule
37
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
38
+
39
+ private ConfigSource config;
40
+ private JsonParserPlugin plugin;
41
+ private MockPageOutput output;
42
+
43
+ @Before
44
+ public void createResource()
45
+ {
46
+ config = config();
47
+ plugin = new JsonParserPlugin();
48
+ output = new MockPageOutput();
49
+ }
50
+
51
+ @Test
52
+ public void readNormalJson()
53
+ throws Exception
54
+ {
55
+ transaction(config, fileInput(
56
+ "{\"_c0\":true,\"_c1\":10,\"_c2\":\"embulk\",\"_c3\":{\"k\":\"v\"}}",
57
+ "{}",
58
+ "{\n" +
59
+ "\"_c0\":false,\n" +
60
+ "\"_c1\":-10,\n" +
61
+ "\"_c2\":\"エンバルク\",\n" +
62
+ "\"_c3\":[\"e0\",\"e1\"]\n" +
63
+ "}",
64
+ "[1, 2, 3]", // this line should be skipped.
65
+ "\"embulk\"", // this line should be skipped.
66
+ "10", // this line should be skipped.
67
+ "true", // this line should be skipped.
68
+ "false", // this line should be skipped.
69
+ "null" // this line should be skipped.
70
+ ));
71
+
72
+ List<Object[]> records = Pages.toObjects(plugin.newSchema(), output.pages);
73
+ assertEquals(3, records.size());
74
+
75
+ Object[] record;
76
+ Map<Value, Value> map;
77
+ { // "{\"_c0\":true,\"_c1\":10,\"_c2\":\"embulk\",\"_c3\":{\"k\":\"v\"}}"
78
+ record = records.get(0);
79
+ assertEquals(1, record.length);
80
+ map = ((Value)record[0]).asMapValue().map();
81
+
82
+ assertEquals(newBoolean(true), map.get(newString("_c0")));
83
+ assertEquals(newInteger(10L), map.get(newString("_c1")));
84
+ assertEquals(newString("embulk"), map.get(newString("_c2")));
85
+ assertEquals(newMap(newString("k"), newString("v")), map.get(newString("_c3")));
86
+ }
87
+ { // "{}"
88
+ record = records.get(1);
89
+ assertEquals(1, record.length);
90
+ assertTrue(((Value)record[0]).asMapValue().map().isEmpty());
91
+ }
92
+ {
93
+ record = records.get(2);
94
+ assertEquals(1, record.length);
95
+ map = ((Value)record[0]).asMapValue().map();
96
+
97
+ assertEquals(newBoolean(false), map.get(newString("_c0")));
98
+ assertEquals(newInteger(-10L), map.get(newString("_c1")));
99
+ assertEquals(newString("エンバルク"), map.get(newString("_c2")));
100
+ assertEquals(newArray(newString("e0"), newString("e1")), map.get(newString("_c3")));
101
+ }
102
+ }
103
+
104
+ @Test
105
+ public void useStopOnInvalidRecord()
106
+ throws Exception
107
+ {
108
+ ConfigSource config = this.config.deepCopy().set("stop_on_invalid_record", true);
109
+
110
+ try {
111
+ transaction(config, fileInput(
112
+ "[1, 2, 3]" // throw DataException
113
+ ));
114
+ fail();
115
+ }
116
+ catch (Throwable t) {
117
+ assertTrue(t instanceof DataException);
118
+ }
119
+ }
120
+
121
+ @Test
122
+ public void readBrokenJson()
123
+ {
124
+ try {
125
+ transaction(config, fileInput(
126
+ "{\"_c0\":true,\"_c1\":10," // throw DataException
127
+ ));
128
+ fail();
129
+ }
130
+ catch (Throwable t) {
131
+ assertTrue(t instanceof DataException);
132
+ }
133
+ }
134
+
135
+ private ConfigSource config()
136
+ {
137
+ return runtime.getExec().newConfigSource();
138
+ }
139
+
140
+ private void transaction(ConfigSource config, final FileInput input)
141
+ {
142
+ plugin.transaction(config, new ParserPlugin.Control() {
143
+ @Override
144
+ public void run(TaskSource taskSource, Schema schema)
145
+ {
146
+ plugin.run(taskSource, schema, input, output);
147
+ }
148
+ });
149
+ }
150
+
151
+ private FileInput fileInput(String... lines)
152
+ throws Exception
153
+ {
154
+ StringBuilder sb = new StringBuilder();
155
+ for (String line : lines) {
156
+ sb.append(line).append("\n");
157
+ }
158
+
159
+ ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes());
160
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
161
+ }
162
+
163
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
164
+ throws IOException
165
+ {
166
+ return new InputStreamFileInput.IteratorProvider(
167
+ ImmutableList.copyOf(inputStreams));
168
+ }
169
+ }
@@ -0,0 +1,40 @@
1
+ module Embulk
2
+ module Guess
3
+ class JsonGuessPlugin < GuessPlugin
4
+ Plugin.register_guess('json', self)
5
+
6
+ java_import 'com.google.common.collect.Lists'
7
+ java_import 'java.io.ByteArrayInputStream'
8
+ java_import 'org.embulk.spi.Exec'
9
+ java_import 'org.embulk.spi.json.JsonParser'
10
+ java_import 'org.embulk.spi.json.JsonParseException'
11
+ java_import 'org.embulk.spi.util.FileInputInputStream'
12
+ java_import 'org.embulk.spi.util.InputStreamFileInput'
13
+
14
+ def guess(config, sample_buffer)
15
+ return {} unless config.fetch("parser", {}).fetch("type", "json") == "json"
16
+
17
+ # Use org.embulk.spi.json.JsonParser to respond to multi-line Json
18
+ json_parser = new_json_parser(sample_buffer)
19
+ begin
20
+ while json_parser.next
21
+ end
22
+ rescue JsonParseException
23
+ return {}
24
+ end
25
+
26
+ return {"parser" => {"type" => "json"}}
27
+ end
28
+
29
+ private
30
+
31
+ def new_json_parser(buffer)
32
+ input_streams = Lists::newArrayList(ByteArrayInputStream.new(buffer.to_java_bytes))
33
+ iterator_provider = InputStreamFileInput::IteratorProvider.new(input_streams)
34
+ input = FileInputInputStream.new(InputStreamFileInput.new(Java::SPI::Exec.getBufferAllocator(), iterator_provider))
35
+ input.nextFile
36
+ JsonParser.new.open(input)
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.8.4'
2
+ VERSION = '0.8.5'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.4
4
+ version: 0.8.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-16 00:00:00.000000000 Z
11
+ date: 2016-02-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: jruby-jars
@@ -108,8 +108,8 @@ files:
108
108
  - classpath/commons-beanutils-core-1.8.3.jar
109
109
  - classpath/commons-compress-1.10.jar
110
110
  - classpath/commons-lang3-3.1.jar
111
- - classpath/embulk-core-0.8.4.jar
112
- - classpath/embulk-standards-0.8.4.jar
111
+ - classpath/embulk-core-0.8.5.jar
112
+ - classpath/embulk-standards-0.8.5.jar
113
113
  - classpath/guava-18.0.jar
114
114
  - classpath/guice-4.0.jar
115
115
  - classpath/guice-bootstrap-0.1.1.jar
@@ -424,6 +424,7 @@ files:
424
424
  - embulk-docs/src/release/release-0.8.2.rst
425
425
  - embulk-docs/src/release/release-0.8.3.rst
426
426
  - embulk-docs/src/release/release-0.8.4.rst
427
+ - embulk-docs/src/release/release-0.8.5.rst
427
428
  - embulk-standards/build.gradle
428
429
  - embulk-standards/src/main/java/org/embulk/standards/Bzip2FileDecoderPlugin.java
429
430
  - embulk-standards/src/main/java/org/embulk/standards/Bzip2FileEncoderPlugin.java
@@ -432,6 +433,7 @@ files:
432
433
  - embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java
433
434
  - embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java
434
435
  - embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java
436
+ - embulk-standards/src/main/java/org/embulk/standards/JsonParserPlugin.java
435
437
  - embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java
436
438
  - embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java
437
439
  - embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java
@@ -443,6 +445,7 @@ files:
443
445
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java
444
446
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java
445
447
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java
448
+ - embulk-standards/src/test/java/org/embulk/standards/TestJsonParserPlugin.java
446
449
  - embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java
447
450
  - embulk.gemspec
448
451
  - gradle/wrapper/gradle-wrapper.jar
@@ -515,6 +518,7 @@ files:
515
518
  - lib/embulk/guess/charset.rb
516
519
  - lib/embulk/guess/csv.rb
517
520
  - lib/embulk/guess/gzip.rb
521
+ - lib/embulk/guess/json.rb
518
522
  - lib/embulk/guess/newline.rb
519
523
  - lib/embulk/guess/schema_guess.rb
520
524
  - lib/embulk/guess/time_format_guess.rb