embulk 0.8.4-java → 0.8.5-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e3dabb8856cd4e9ad6d545a11567e80f4d9554f0
4
- data.tar.gz: 73793784b13f37a9f1f7dd26050e60c7cd1798fe
3
+ metadata.gz: 3c3d7aa4421322670e6cc3add4163fc0a17c676a
4
+ data.tar.gz: acfdf53810139a9883b86b1baab3f278bbf4b295
5
5
  SHA512:
6
- metadata.gz: 8eaa75bfa389c681008b811b0705e78dfe0d25825e09e88c2dd97f8c4924f5ee894f2c3461dff704ec25aa8e93596015edac642554b2b185c1562a43e5606f84
7
- data.tar.gz: 9caa1141f13553a073ac3841a0cd0d463ae4ffc92810c691e5a5990a44d4b0d17b491afe11057369150ba031d8d0d6e4c176e49c2cd4a004b30da284bb38d0e6
6
+ metadata.gz: ebcdf5814a2c29fc29e3816f733d762428a4f229f2064ea750ada8eed8a0ad9d5184de54847dccbb13e5de8d29a6926991445206f9a00b3011f961205210dab1
7
+ data.tar.gz: 8e354cd04b40444ccc79bd75a7804277756ed2458c25872db91970d94436282576fba2847723c6b66894984c7cd012500552fb9f19220a2b9bf5b776b355590c
data/build.gradle CHANGED
@@ -16,7 +16,7 @@ def release_projects = [project(":embulk-core"), project(":embulk-standards")]
16
16
 
17
17
  allprojects {
18
18
  group = 'org.embulk'
19
- version = '0.8.4'
19
+ version = '0.8.5'
20
20
 
21
21
  ext {
22
22
  jrubyVersion = '9.0.4.0'
@@ -277,6 +277,40 @@ Example
277
277
  - {name: purchase, type: timestamp, format: '%Y%m%d'}
278
278
  - {name: comment, type: string}
279
279
 
280
+
281
+ JSON parser plugin
282
+ ------------------
283
+
284
+ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
285
+
286
+ .. code-block:: json
287
+
288
+ {"time":1455829282,"ip":"93.184.216.34","name":"frsyuki"}
289
+ {"time":1455829282,"ip":"172.36.8.109","name":"sadayuki"}
290
+ {"time":1455829284,"ip":"example.com","name":"Treasure Data"}
291
+ {"time":1455829282,"ip":"10.98.43.1","name":"MessagePack"}
292
+
293
+ The ``json`` parser plugin outputs a single column named "record" (type is json). Each JSON object in the file becomes one record.
294
+
295
+ Options
296
+ ~~~~~~~~~~~~~~~~~~
297
+
298
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
299
+ | name | type | description | required? |
300
+ +============================+==========+================================================================================================================+========================+
301
+ | stop\_on\_invalid\_record | boolean | Stop bulk load transaction if a file includes an invalid record (such as a JSON value that is not an object) | ``false`` by default |
302
+ +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
303
+
304
+
305
+ Example
306
+ ~~~~~~~~~~~~~~~~~~
307
+
308
+ .. code-block:: yaml
309
+
310
+ in:
311
+ parser:
312
+ type: json
313
+
280
314
  Gzip decoder plugin
281
315
  ------------------
282
316
 
@@ -4,6 +4,7 @@ Release Notes
4
4
  .. toctree::
5
5
  :maxdepth: 1
6
6
 
7
+ release/release-0.8.5
7
8
  release/release-0.8.4
8
9
  release/release-0.8.3
9
10
  release/release-0.8.2
@@ -0,0 +1,11 @@
1
+ Release 0.8.5
2
+ ==================================
3
+
4
+ General Changes
5
+ ------------------
6
+
7
+ * Added ``json`` parser plugin. It doesn't have options to extract values into columns (as the embulk-parser-jsonl plugin does). Extracting values is intended to be supported by a filter plugin.
8
+
9
+ Release Date
10
+ ------------------
11
+ 2016-02-18
@@ -0,0 +1,116 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.annotations.VisibleForTesting;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.spi.Column;
10
+ import org.embulk.spi.DataException;
11
+ import org.embulk.spi.Exec;
12
+ import org.embulk.spi.FileInput;
13
+ import org.embulk.spi.PageBuilder;
14
+ import org.embulk.spi.PageOutput;
15
+ import org.embulk.spi.ParserPlugin;
16
+ import org.embulk.spi.Schema;
17
+ import org.embulk.spi.json.JsonParseException;
18
+ import org.embulk.spi.json.JsonParser;
19
+ import org.embulk.spi.type.Types;
20
+ import org.embulk.spi.util.FileInputInputStream;
21
+ import org.msgpack.value.Value;
22
+ import org.slf4j.Logger;
23
+
24
+ import java.io.IOException;
25
+
26
+ public class JsonParserPlugin
27
+ implements ParserPlugin
28
+ {
29
+ public interface PluginTask
30
+ extends Task
31
+ {
32
+ @Config("stop_on_invalid_record")
33
+ @ConfigDefault("false")
34
+ boolean getStopOnInvalidRecord();
35
+ }
36
+
37
+ private final Logger log;
38
+
39
+ public JsonParserPlugin()
40
+ {
41
+ this.log = Exec.getLogger(JsonParserPlugin.class);
42
+ }
43
+
44
+ @Override
45
+ public void transaction(ConfigSource configSource, Control control)
46
+ {
47
+ PluginTask task = configSource.loadConfig(PluginTask.class);
48
+ control.run(task.dump(), newSchema());
49
+ }
50
+
51
+ @VisibleForTesting
52
+ Schema newSchema()
53
+ {
54
+ return Schema.builder().add("record", Types.JSON).build(); // generate a schema
55
+ }
56
+
57
+ @Override
58
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output)
59
+ {
60
+ PluginTask task = taskSource.loadTask(PluginTask.class);
61
+
62
+ final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
63
+ final Column column = schema.getColumn(0); // record column
64
+
65
+ try (PageBuilder pageBuilder = newPageBuilder(schema, output);
66
+ FileInputInputStream in = new FileInputInputStream(input)) {
67
+ while (in.nextFile()) {
68
+ try (JsonParser.Stream stream = newJsonStream(in)) {
69
+ Value value;
70
+ while ((value = stream.next()) != null) {
71
+ try {
72
+ if (!value.isMapValue()) {
73
+ throw new JsonRecordValidateException(
74
+ String.format("A Json record must not represent map value but it's %s", value.getValueType().name()));
75
+ }
76
+
77
+ pageBuilder.setJson(column, value);
78
+ pageBuilder.addRecord();
79
+ }
80
+ catch (JsonRecordValidateException e) {
81
+ if (stopOnInvalidRecord) {
82
+ throw new DataException(String.format("Invalid record: %s", value.toJson()), e);
83
+ }
84
+ log.warn(String.format("Skipped record (%s): %s", e.getMessage(), value.toJson()));
85
+ }
86
+ }
87
+ }
88
+ catch (IOException | JsonParseException e) {
89
+ throw new DataException(e);
90
+ }
91
+ }
92
+
93
+ pageBuilder.finish();
94
+ }
95
+ }
96
+
97
+ private PageBuilder newPageBuilder(Schema schema, PageOutput output)
98
+ {
99
+ return new PageBuilder(Exec.getBufferAllocator(), schema, output);
100
+ }
101
+
102
+ private JsonParser.Stream newJsonStream(FileInputInputStream in)
103
+ throws IOException
104
+ {
105
+ return new JsonParser().open(in);
106
+ }
107
+
108
+ static class JsonRecordValidateException
109
+ extends DataException
110
+ {
111
+ JsonRecordValidateException(String message)
112
+ {
113
+ super(message);
114
+ }
115
+ }
116
+ }
@@ -27,6 +27,7 @@ public class StandardPluginModule
27
27
 
28
28
  // parser plugins
29
29
  registerPluginTo(binder, ParserPlugin.class, "csv", CsvParserPlugin.class);
30
+ registerPluginTo(binder, ParserPlugin.class, "json", JsonParserPlugin.class);
30
31
 
31
32
  // file decoder plugins
32
33
  registerPluginTo(binder, DecoderPlugin.class, "gzip", GzipFileDecoderPlugin.class);
@@ -50,6 +51,7 @@ public class StandardPluginModule
50
51
  // default guess plugins
51
52
  registerDefaultGuessPluginTo(binder, new PluginType("gzip"));
52
53
  registerDefaultGuessPluginTo(binder, new PluginType("bzip2"));
54
+ registerDefaultGuessPluginTo(binder, new PluginType("json")); // should be registered before CsvGuessPlugin
53
55
  registerDefaultGuessPluginTo(binder, new PluginType("csv"));
54
56
  // charset and newline guess plugins are loaded and invoked by CsvGuessPlugin
55
57
  }
@@ -0,0 +1,169 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import org.embulk.EmbulkTestRuntime;
5
+ import org.embulk.config.ConfigSource;
6
+ import org.embulk.config.TaskSource;
7
+ import org.embulk.spi.DataException;
8
+ import org.embulk.spi.FileInput;
9
+ import org.embulk.spi.ParserPlugin;
10
+ import org.embulk.spi.Schema;
11
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
12
+ import org.embulk.spi.util.InputStreamFileInput;
13
+ import org.embulk.spi.util.Pages;
14
+ import org.junit.Before;
15
+ import org.junit.Rule;
16
+ import org.junit.Test;
17
+ import org.msgpack.value.Value;
18
+
19
+ import java.io.ByteArrayInputStream;
20
+ import java.io.IOException;
21
+ import java.io.InputStream;
22
+ import java.util.List;
23
+ import java.util.Map;
24
+
25
+ import static org.junit.Assert.assertEquals;
26
+ import static org.junit.Assert.assertTrue;
27
+ import static org.junit.Assert.fail;
28
+ import static org.msgpack.value.ValueFactory.newArray;
29
+ import static org.msgpack.value.ValueFactory.newBoolean;
30
+ import static org.msgpack.value.ValueFactory.newInteger;
31
+ import static org.msgpack.value.ValueFactory.newMap;
32
+ import static org.msgpack.value.ValueFactory.newString;
33
+
34
+ public class TestJsonParserPlugin
35
+ {
36
+ @Rule
37
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
38
+
39
+ private ConfigSource config;
40
+ private JsonParserPlugin plugin;
41
+ private MockPageOutput output;
42
+
43
+ @Before
44
+ public void createResource()
45
+ {
46
+ config = config();
47
+ plugin = new JsonParserPlugin();
48
+ output = new MockPageOutput();
49
+ }
50
+
51
+ @Test
52
+ public void readNormalJson()
53
+ throws Exception
54
+ {
55
+ transaction(config, fileInput(
56
+ "{\"_c0\":true,\"_c1\":10,\"_c2\":\"embulk\",\"_c3\":{\"k\":\"v\"}}",
57
+ "{}",
58
+ "{\n" +
59
+ "\"_c0\":false,\n" +
60
+ "\"_c1\":-10,\n" +
61
+ "\"_c2\":\"エンバルク\",\n" +
62
+ "\"_c3\":[\"e0\",\"e1\"]\n" +
63
+ "}",
64
+ "[1, 2, 3]", // this line should be skipped.
65
+ "\"embulk\"", // this line should be skipped.
66
+ "10", // this line should be skipped.
67
+ "true", // this line should be skipped.
68
+ "false", // this line should be skipped.
69
+ "null" // this line should be skipped.
70
+ ));
71
+
72
+ List<Object[]> records = Pages.toObjects(plugin.newSchema(), output.pages);
73
+ assertEquals(3, records.size());
74
+
75
+ Object[] record;
76
+ Map<Value, Value> map;
77
+ { // "{\"_c0\":true,\"_c1\":10,\"_c2\":\"embulk\",\"_c3\":{\"k\":\"v\"}}"
78
+ record = records.get(0);
79
+ assertEquals(1, record.length);
80
+ map = ((Value)record[0]).asMapValue().map();
81
+
82
+ assertEquals(newBoolean(true), map.get(newString("_c0")));
83
+ assertEquals(newInteger(10L), map.get(newString("_c1")));
84
+ assertEquals(newString("embulk"), map.get(newString("_c2")));
85
+ assertEquals(newMap(newString("k"), newString("v")), map.get(newString("_c3")));
86
+ }
87
+ { // "{}"
88
+ record = records.get(1);
89
+ assertEquals(1, record.length);
90
+ assertTrue(((Value)record[0]).asMapValue().map().isEmpty());
91
+ }
92
+ {
93
+ record = records.get(2);
94
+ assertEquals(1, record.length);
95
+ map = ((Value)record[0]).asMapValue().map();
96
+
97
+ assertEquals(newBoolean(false), map.get(newString("_c0")));
98
+ assertEquals(newInteger(-10L), map.get(newString("_c1")));
99
+ assertEquals(newString("エンバルク"), map.get(newString("_c2")));
100
+ assertEquals(newArray(newString("e0"), newString("e1")), map.get(newString("_c3")));
101
+ }
102
+ }
103
+
104
+ @Test
105
+ public void useStopOnInvalidRecord()
106
+ throws Exception
107
+ {
108
+ ConfigSource config = this.config.deepCopy().set("stop_on_invalid_record", true);
109
+
110
+ try {
111
+ transaction(config, fileInput(
112
+ "[1, 2, 3]" // throw DataException
113
+ ));
114
+ fail();
115
+ }
116
+ catch (Throwable t) {
117
+ assertTrue(t instanceof DataException);
118
+ }
119
+ }
120
+
121
+ @Test
122
+ public void readBrokenJson()
123
+ {
124
+ try {
125
+ transaction(config, fileInput(
126
+ "{\"_c0\":true,\"_c1\":10," // throw DataException
127
+ ));
128
+ fail();
129
+ }
130
+ catch (Throwable t) {
131
+ assertTrue(t instanceof DataException);
132
+ }
133
+ }
134
+
135
+ private ConfigSource config()
136
+ {
137
+ return runtime.getExec().newConfigSource();
138
+ }
139
+
140
+ private void transaction(ConfigSource config, final FileInput input)
141
+ {
142
+ plugin.transaction(config, new ParserPlugin.Control() {
143
+ @Override
144
+ public void run(TaskSource taskSource, Schema schema)
145
+ {
146
+ plugin.run(taskSource, schema, input, output);
147
+ }
148
+ });
149
+ }
150
+
151
+ private FileInput fileInput(String... lines)
152
+ throws Exception
153
+ {
154
+ StringBuilder sb = new StringBuilder();
155
+ for (String line : lines) {
156
+ sb.append(line).append("\n");
157
+ }
158
+
159
+ ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes());
160
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
161
+ }
162
+
163
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
164
+ throws IOException
165
+ {
166
+ return new InputStreamFileInput.IteratorProvider(
167
+ ImmutableList.copyOf(inputStreams));
168
+ }
169
+ }
@@ -0,0 +1,40 @@
1
module Embulk
  module Guess
    # Guess plugin that proposes the "json" parser when the sample parses as a
    # non-empty sequence of JSON objects.
    class JsonGuessPlugin < GuessPlugin
      Plugin.register_guess('json', self)

      java_import 'com.google.common.collect.Lists'
      java_import 'java.io.ByteArrayInputStream'
      java_import 'org.embulk.spi.Exec'
      java_import 'org.embulk.spi.json.JsonParser'
      java_import 'org.embulk.spi.json.JsonParseException'
      java_import 'org.embulk.spi.util.FileInputInputStream'
      java_import 'org.embulk.spi.util.InputStreamFileInput'

      # Returns {"parser" => {"type" => "json"}} when sample_buffer looks like
      # JSON records, and {} otherwise (including when the config already
      # names a parser type other than "json").
      def guess(config, sample_buffer)
        return {} unless config.fetch("parser", {}).fetch("type", "json") == "json"

        # Use org.embulk.spi.json.JsonParser to respond to multi-line Json
        json_parser = new_json_parser(sample_buffer)
        record_count = 0
        begin
          while (value = json_parser.next)
            # The json parser plugin accepts only JSON objects as records.
            # Scalar-only input (for example a single-column CSV of numbers)
            # can otherwise parse as a sequence of JSON values and be
            # mis-guessed as json before the csv guess gets a chance.
            return {} unless value.isMapValue
            record_count += 1
          end
        rescue JsonParseException
          return {}
        end

        # An empty sample gives no evidence that the input is JSON.
        return {} if record_count == 0

        return {"parser" => {"type" => "json"}}
      end

      private

      # Wraps the sample bytes in a single-file FileInput and opens a JSON
      # value stream on it.
      def new_json_parser(buffer)
        input_streams = Lists::newArrayList(ByteArrayInputStream.new(buffer.to_java_bytes))
        iterator_provider = InputStreamFileInput::IteratorProvider.new(input_streams)
        input = FileInputInputStream.new(InputStreamFileInput.new(Java::SPI::Exec.getBufferAllocator(), iterator_provider))
        input.nextFile
        JsonParser.new.open(input)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = '0.8.4'
2
+ VERSION = '0.8.5'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.4
4
+ version: 0.8.5
5
5
  platform: java
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-16 00:00:00.000000000 Z
11
+ date: 2016-02-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -148,8 +148,8 @@ files:
148
148
  - classpath/commons-beanutils-core-1.8.3.jar
149
149
  - classpath/commons-compress-1.10.jar
150
150
  - classpath/commons-lang3-3.1.jar
151
- - classpath/embulk-core-0.8.4.jar
152
- - classpath/embulk-standards-0.8.4.jar
151
+ - classpath/embulk-core-0.8.5.jar
152
+ - classpath/embulk-standards-0.8.5.jar
153
153
  - classpath/guava-18.0.jar
154
154
  - classpath/guice-4.0.jar
155
155
  - classpath/guice-bootstrap-0.1.1.jar
@@ -464,6 +464,7 @@ files:
464
464
  - embulk-docs/src/release/release-0.8.2.rst
465
465
  - embulk-docs/src/release/release-0.8.3.rst
466
466
  - embulk-docs/src/release/release-0.8.4.rst
467
+ - embulk-docs/src/release/release-0.8.5.rst
467
468
  - embulk-standards/build.gradle
468
469
  - embulk-standards/src/main/java/org/embulk/standards/Bzip2FileDecoderPlugin.java
469
470
  - embulk-standards/src/main/java/org/embulk/standards/Bzip2FileEncoderPlugin.java
@@ -472,6 +473,7 @@ files:
472
473
  - embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java
473
474
  - embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java
474
475
  - embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java
476
+ - embulk-standards/src/main/java/org/embulk/standards/JsonParserPlugin.java
475
477
  - embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java
476
478
  - embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java
477
479
  - embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java
@@ -483,6 +485,7 @@ files:
483
485
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java
484
486
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java
485
487
  - embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java
488
+ - embulk-standards/src/test/java/org/embulk/standards/TestJsonParserPlugin.java
486
489
  - embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java
487
490
  - embulk.gemspec
488
491
  - gradle/wrapper/gradle-wrapper.jar
@@ -555,6 +558,7 @@ files:
555
558
  - lib/embulk/guess/charset.rb
556
559
  - lib/embulk/guess/csv.rb
557
560
  - lib/embulk/guess/gzip.rb
561
+ - lib/embulk/guess/json.rb
558
562
  - lib/embulk/guess/newline.rb
559
563
  - lib/embulk/guess/schema_guess.rb
560
564
  - lib/embulk/guess/time_format_guess.rb