embulk 0.8.4-java → 0.8.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e3dabb8856cd4e9ad6d545a11567e80f4d9554f0
4
- data.tar.gz: 73793784b13f37a9f1f7dd26050e60c7cd1798fe
3
+ metadata.gz: 3c3d7aa4421322670e6cc3add4163fc0a17c676a
4
+ data.tar.gz: acfdf53810139a9883b86b1baab3f278bbf4b295
5
5
  SHA512:
6
- metadata.gz: 8eaa75bfa389c681008b811b0705e78dfe0d25825e09e88c2dd97f8c4924f5ee894f2c3461dff704ec25aa8e93596015edac642554b2b185c1562a43e5606f84
7
- data.tar.gz: 9caa1141f13553a073ac3841a0cd0d463ae4ffc92810c691e5a5990a44d4b0d17b491afe11057369150ba031d8d0d6e4c176e49c2cd4a004b30da284bb38d0e6
6
+ metadata.gz: ebcdf5814a2c29fc29e3816f733d762428a4f229f2064ea750ada8eed8a0ad9d5184de54847dccbb13e5de8d29a6926991445206f9a00b3011f961205210dab1
7
+ data.tar.gz: 8e354cd04b40444ccc79bd75a7804277756ed2458c25872db91970d94436282576fba2847723c6b66894984c7cd012500552fb9f19220a2b9bf5b776b355590c
data/build.gradle CHANGED
@@ -16,7 +16,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
16
16
 
17
17
  allprojects {
18
18
  group = 'org.embulk'
19
- version = '0.8.4'
19
+ version = '0.8.5'
20
20
 
21
21
  ext {
22
22
  jrubyVersion = '9.0.4.0'
@@ -277,6 +277,40 @@ Example
277
277
  - {name: purchase, type: timestamp, format: '%Y%m%d'}
278
278
  - {name: comment, type: string}
279
279
 
280
+
281
+ JSON parser plugin
282
+ ------------------
283
+
284
+ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
285
+
286
+ .. code-block:: json
287
+
288
+ {"time":1455829282,"ip":"93.184.216.34","name":"frsyuki"}
289
+ {"time":1455829282,"ip":"172.36.8.109","name":"sadayuki"}
290
+ {"time":1455829284,"ip":"example.com","name":"Treasure Data"}
291
+ {"time":1455829282,"ip":"10.98.43.1","name":"MessagePack"}
292
+
293
+ The ``json`` parser plugin outputs a single column named "record" (its type is json).
294
+
295
+ Options
296
+ ~~~~~~~~~~~~~~~~~~
297
+
298
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
299
+ | name | type | description | required? |
300
+ +============================+==========+================================================================================================================+========================+
301
+ | stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes an invalid record (a JSON value that is not an object) | ``false`` by default |
302
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
303
+
304
+
305
+ Example
306
+ ~~~~~~~~~~~~~~~~~~
307
+
308
+ .. code-block:: yaml
309
+
310
+ in:
311
+ parser:
312
+ type: json
313
+
280
314
  Gzip decoder plugin
281
315
  ------------------
282
316
 
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.8.5
7
8
  release/release-0.8.4
8
9
  release/release-0.8.3
9
10
  release/release-0.8.2
@@ -0,0 +1,11 @@
1
+ Release 0.8.5
2
+ ==================================
3
+
4
+ General Changes
5
+ ------------------
6
+
7
+ * Added the ``json`` parser plugin. It doesn't have options to extract values into columns (as the embulk-parser-jsonl plugin does). Extracting values into columns is intended to be supported by a filter plugin.
8
+
9
+ Release Date
10
+ ------------------
11
+ 2016-02-18
@@ -0,0 +1,116 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.annotations.VisibleForTesting;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.spi.Column;
10
+ import org.embulk.spi.DataException;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.FileInput;
13
+ import org.embulk.spi.PageBuilder;
14
+ import org.embulk.spi.PageOutput;
15
+ import org.embulk.spi.ParserPlugin;
16
+ import org.embulk.spi.Schema;
17
+ import org.embulk.spi.json.JsonParseException;
18
+ import org.embulk.spi.json.JsonParser;
19
+ import org.embulk.spi.type.Types;
20
+ import org.embulk.spi.util.FileInputInputStream;
21
+ import org.msgpack.value.Value;
22
+ import org.slf4j.Logger;
23
+
24
+ import java.io.IOException;
25
+
26
+ public class JsonParserPlugin
27
+ implements ParserPlugin
28
+ {
29
+ public interface PluginTask
30
+ extends Task
31
+ {
32
+ @Config("stop_on_invalid_record")
33
+ @ConfigDefault("false")
34
+ boolean getStopOnInvalidRecord();
35
+ }
36
+
37
+ private final Logger log;
38
+
39
+ public JsonParserPlugin()
40
+ {
41
+ this.log = Exec.getLogger(JsonParserPlugin.class);
42
+ }
43
+
44
+ @Override
45
+ public void transaction(ConfigSource configSource, Control control)
46
+ {
47
+ PluginTask task = configSource.loadConfig(PluginTask.class);
48
+ control.run(task.dump(), newSchema());
49
+ }
50
+
51
+ @VisibleForTesting
52
+ Schema newSchema()
53
+ {
54
+ return Schema.builder().add("record", Types.JSON).build(); // generate a schema
55
+ }
56
+
57
+ @Override
58
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output)
59
+ {
60
+ PluginTask task = taskSource.loadTask(PluginTask.class);
61
+ // Parses each input file as a stream of JSON values and emits every JSON object as one record.
62
+ final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
63
+ final Column column = schema.getColumn(0); // record column
64
+
65
+ try (PageBuilder pageBuilder = newPageBuilder(schema, output);
66
+ FileInputInputStream in = new FileInputInputStream(input)) {
67
+ while (in.nextFile()) {
68
+ try (JsonParser.Stream stream = newJsonStream(in)) {
69
+ Value value;
70
+ while ((value = stream.next()) != null) {
71
+ try {
72
+ if (!value.isMapValue()) { // only JSON objects (map values) are accepted as records
73
+ throw new JsonRecordValidateException(
74
+ String.format("A Json record must represent map value but it's %s", value.getValueType().name()));
75
+ }
76
+
77
+ pageBuilder.setJson(column, value);
78
+ pageBuilder.addRecord();
79
+ }
80
+ catch (JsonRecordValidateException e) { // non-object value: abort or skip, depending on stop_on_invalid_record
81
+ if (stopOnInvalidRecord) {
82
+ throw new DataException(String.format("Invalid record: %s", value.toJson()), e);
83
+ }
84
+ log.warn(String.format("Skipped record (%s): %s", e.getMessage(), value.toJson()));
85
+ }
86
+ }
87
+ }
88
+ catch (IOException | JsonParseException e) { // read failures and malformed JSON always abort, regardless of the option
89
+ throw new DataException(e);
90
+ }
91
+ }
92
+
93
+ pageBuilder.finish();
94
+ }
95
+ }
96
+
97
+ private PageBuilder newPageBuilder(Schema schema, PageOutput output)
98
+ {
99
+ return new PageBuilder(Exec.getBufferAllocator(), schema, output);
100
+ }
101
+
102
+ private JsonParser.Stream newJsonStream(FileInputInputStream in)
103
+ throws IOException
104
+ {
105
+ return new JsonParser().open(in);
106
+ }
107
+
108
+ static class JsonRecordValidateException
109
+ extends DataException
110
+ {
111
+ JsonRecordValidateException(String message)
112
+ {
113
+ super(message);
114
+ }
115
+ }
116
+ }
@@ -27,6 +27,7 @@ public class StandardPluginModule
27
27
 
28
28
  // parser plugins
29
29
  registerPluginTo(binder, ParserPlugin.class, "csv", CsvParserPlugin.class);
30
+ registerPluginTo(binder, ParserPlugin.class, "json", JsonParserPlugin.class);
30
31
 
31
32
  // file decoder plugins
32
33
  registerPluginTo(binder, DecoderPlugin.class, "gzip", GzipFileDecoderPlugin.class);
@@ -50,6 +51,7 @@ public class StandardPluginModule
50
51
  // default guess plugins
51
52
  registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
52
53
  registerDefaultGuessPluginTo(binder, new PluginType("bzip2"));
54
+ registerDefaultGuessPluginTo(binder, new PluginType("json")); // should be registered before CsvGuessPlugin
53
55
  registerDefaultGuessPluginTo(binder, new PluginType("csv"));
54
56
  // charset and newline guess plugins are loaded and invoked by CsvGuessPlugin
55
57
  }
@@ -0,0 +1,169 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import org.embulk.EmbulkTestRuntime;
5
+ import org.embulk.config.ConfigSource;
6
+ import org.embulk.config.TaskSource;
7
+ import org.embulk.spi.DataException;
8
+ import org.embulk.spi.FileInput;
9
+ import org.embulk.spi.ParserPlugin;
10
+ import org.embulk.spi.Schema;
11
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
12
+ import org.embulk.spi.util.InputStreamFileInput;
13
+ import org.embulk.spi.util.Pages;
14
+ import org.junit.Before;
15
+ import org.junit.Rule;
16
+ import org.junit.Test;
17
+ import org.msgpack.value.Value;
18
+
19
+ import java.io.ByteArrayInputStream;
20
+ import java.io.IOException;
21
+ import java.io.InputStream;
22
+ import java.util.List;
23
+ import java.util.Map;
24
+
25
+ import static org.junit.Assert.assertEquals;
26
+ import static org.junit.Assert.assertTrue;
27
+ import static org.junit.Assert.fail;
28
+ import static org.msgpack.value.ValueFactory.newArray;
29
+ import static org.msgpack.value.ValueFactory.newBoolean;
30
+ import static org.msgpack.value.ValueFactory.newInteger;
31
+ import static org.msgpack.value.ValueFactory.newMap;
32
+ import static org.msgpack.value.ValueFactory.newString;
33
+
34
+ public class TestJsonParserPlugin
35
+ {
36
+ @Rule
37
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
38
+
39
+ private ConfigSource config;
40
+ private JsonParserPlugin plugin;
41
+ private MockPageOutput output;
42
+
43
+ @Before
44
+ public void createResource()
45
+ {
46
+ config = config();
47
+ plugin = new JsonParserPlugin();
48
+ output = new MockPageOutput();
49
+ }
50
+
51
+ @Test
52
+ public void readNormalJson()
53
+ throws Exception
54
+ {
55
+ transaction(config, fileInput(
56
+ "{\"_c0\":true,\"_c1\":10,\"_c2\":\"embulk\",\"_c3\":{\"k\":\"v\"}}",
57
+ "{}",
58
+ "{\n" +
59
+ "\"_c0\":false,\n" +
60
+ "\"_c1\":-10,\n" +
61
+ "\"_c2\":\"エンバルク\",\n" +
62
+ "\"_c3\":[\"e0\",\"e1\"]\n" +
63
+ "}",
64
+ "[1, 2, 3]", // this line should be skipped.
65
+ "\"embulk\"", // this line should be skipped.
66
+ "10", // this line should be skipped.
67
+ "true", // this line should be skipped.
68
+ "false", // this line should be skipped.
69
+ "null" // this line should be skipped.
70
+ ));
71
+
72
+ List<Object[]> records = Pages.toObjects(plugin.newSchema(), output.pages);
73
+ assertEquals(3, records.size());
74
+
75
+ Object[] record;
76
+ Map<Value, Value> map;
77
+ { // "{\"_c0\":true,\"_c1\":10,\"_c2\":\"embulk\",\"_c3\":{\"k\":\"v\"}}"
78
+ record = records.get(0);
79
+ assertEquals(1, record.length);
80
+ map = ((Value)record[0]).asMapValue().map();
81
+
82
+ assertEquals(newBoolean(true), map.get(newString("_c0")));
83
+ assertEquals(newInteger(10L), map.get(newString("_c1")));
84
+ assertEquals(newString("embulk"), map.get(newString("_c2")));
85
+ assertEquals(newMap(newString("k"), newString("v")), map.get(newString("_c3")));
86
+ }
87
+ { // "{}"
88
+ record = records.get(1);
89
+ assertEquals(1, record.length);
90
+ assertTrue(((Value)record[0]).asMapValue().map().isEmpty());
91
+ }
92
+ {
93
+ record = records.get(2);
94
+ assertEquals(1, record.length);
95
+ map = ((Value)record[0]).asMapValue().map();
96
+
97
+ assertEquals(newBoolean(false), map.get(newString("_c0")));
98
+ assertEquals(newInteger(-10L), map.get(newString("_c1")));
99
+ assertEquals(newString("エンバルク"), map.get(newString("_c2")));
100
+ assertEquals(newArray(newString("e0"), newString("e1")), map.get(newString("_c3")));
101
+ }
102
+ }
103
+
104
+ @Test
105
+ public void useStopOnInvalidRecord()
106
+ throws Exception
107
+ {
108
+ ConfigSource config = this.config.deepCopy().set("stop_on_invalid_record", true);
109
+
110
+ try {
111
+ transaction(config, fileInput(
112
+ "[1, 2, 3]" // throw DataException
113
+ ));
114
+ fail();
115
+ }
116
+ catch (Throwable t) {
117
+ assertTrue(t instanceof DataException);
118
+ }
119
+ }
120
+
121
+ @Test
122
+ public void readBrokenJson()
123
+ {
124
+ try {
125
+ transaction(config, fileInput(
126
+ "{\"_c0\":true,\"_c1\":10," // throw DataException
127
+ ));
128
+ fail();
129
+ }
130
+ catch (Throwable t) {
131
+ assertTrue(t instanceof DataException);
132
+ }
133
+ }
134
+
135
+ private ConfigSource config()
136
+ {
137
+ return runtime.getExec().newConfigSource();
138
+ }
139
+
140
+ private void transaction(ConfigSource config, final FileInput input)
141
+ {
142
+ plugin.transaction(config, new ParserPlugin.Control() {
143
+ @Override
144
+ public void run(TaskSource taskSource, Schema schema)
145
+ {
146
+ plugin.run(taskSource, schema, input, output);
147
+ }
148
+ });
149
+ }
150
+
151
+ private FileInput fileInput(String... lines)
152
+ throws Exception
153
+ {
154
+ StringBuilder sb = new StringBuilder();
155
+ for (String line : lines) {
156
+ sb.append(line).append("\n");
157
+ }
158
+ // Encode explicitly as UTF-8 so that non-ASCII fixtures do not depend on the platform default charset.
159
+ ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
160
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
161
+ }
162
+
163
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
164
+ throws IOException
165
+ {
166
+ return new InputStreamFileInput.IteratorProvider(
167
+ ImmutableList.copyOf(inputStreams));
168
+ }
169
+ }
@@ -0,0 +1,40 @@
1
+ module Embulk
2
+ module Guess
3
+ class JsonGuessPlugin < GuessPlugin
4
+ Plugin.register_guess('json', self)
5
+
6
+ java_import 'com.google.common.collect.Lists'
7
+ java_import 'java.io.ByteArrayInputStream'
8
+ java_import 'org.embulk.spi.Exec'
9
+ java_import 'org.embulk.spi.json.JsonParser'
10
+ java_import 'org.embulk.spi.json.JsonParseException'
11
+ java_import 'org.embulk.spi.util.FileInputInputStream'
12
+ java_import 'org.embulk.spi.util.InputStreamFileInput'
13
+
14
+ def guess(config, sample_buffer)
15
+ return {} unless config.fetch("parser", {}).fetch("type", "json") == "json"
16
+
17
+ # Use org.embulk.spi.json.JsonParser to respond to multi-line Json
18
+ json_parser = new_json_parser(sample_buffer)
19
+ begin
20
+ while json_parser.next
21
+ end
22
+ rescue JsonParseException
23
+ return {}
24
+ end
25
+
26
+ return {"parser" => {"type" => "json"}}
27
+ end
28
+
29
+ private
30
+
31
+ def new_json_parser(buffer)
32
+ input_streams = Lists::newArrayList(ByteArrayInputStream.new(buffer.to_java_bytes))
33
+ iterator_provider = InputStreamFileInput::IteratorProvider.new(input_streams)
34
+ input = FileInputInputStream.new(InputStreamFileInput.new(Java::SPI::Exec.getBufferAllocator(), iterator_provider))
35
+ input.nextFile
36
+ JsonParser.new.open(input)
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.8.4'
2
+ VERSION = '0.8.5'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.4
4
+ version: 0.8.5
5
5
  platform: java
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-16 00:00:00.000000000 Z
11
+ date: 2016-02-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -148,8 +148,8 @@ files:
148
148
  - classpath/commons-beanutils-core-1.8.3.jar
149
149
  - classpath/commons-compress-1.10.jar
150
150
  - classpath/commons-lang3-3.1.jar
151
- - classpath/embulk-core-0.8.4.jar
152
- - classpath/embulk-standards-0.8.4.jar
151
+ - classpath/embulk-core-0.8.5.jar
152
+ - classpath/embulk-standards-0.8.5.jar
153
153
  - classpath/guava-18.0.jar
154
154
  - classpath/guice-4.0.jar
155
155
  - classpath/guice-bootstrap-0.1.1.jar
@@ -464,6 +464,7 @@ files:
464
464
  - embulk-docs/src/release/release-0.8.2.rst
465
465
  - embulk-docs/src/release/release-0.8.3.rst
466
466
  - embulk-docs/src/release/release-0.8.4.rst
467
+ - embulk-docs/src/release/release-0.8.5.rst
467
468
  - embulk-standards/build.gradle
468
469
  - embulk-standards/src/main/java/org/embulk/standards/Bzip2FileDecoderPlugin.java
469
470
  - embulk-standards/src/main/java/org/embulk/standards/Bzip2FileEncoderPlugin.java
@@ -472,6 +473,7 @@ files:
472
473
  - embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java
473
474
  - embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java
474
475
  - embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java
476
+ - embulk-standards/src/main/java/org/embulk/standards/JsonParserPlugin.java
475
477
  - embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java
476
478
  - embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java
477
479
  - embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java
@@ -483,6 +485,7 @@ files:
483
485
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java
484
486
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java
485
487
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java
488
+ - embulk-standards/src/test/java/org/embulk/standards/TestJsonParserPlugin.java
486
489
  - embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java
487
490
  - embulk.gemspec
488
491
  - gradle/wrapper/gradle-wrapper.jar
@@ -555,6 +558,7 @@ files:
555
558
  - lib/embulk/guess/charset.rb
556
559
  - lib/embulk/guess/csv.rb
557
560
  - lib/embulk/guess/gzip.rb
561
+ - lib/embulk/guess/json.rb
558
562
  - lib/embulk/guess/newline.rb
559
563
  - lib/embulk/guess/schema_guess.rb
560
564
  - lib/embulk/guess/time_format_guess.rb