embulk-parser-jsonl 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -1
  3. data/README.md +17 -14
  4. data/bench/gen_dummy.rb +5 -0
  5. data/bench/typecast.yml +17 -0
  6. data/bench/without_typecast.yml +17 -0
  7. data/build.gradle +1 -1
  8. data/embulk-parser-jsonl.gemspec +1 -1
  9. data/example/compat.yml +21 -0
  10. data/example/example.yml +5 -10
  11. data/example/example_without_typecast.yml +18 -0
  12. data/example/sample.json +2 -2
  13. data/src/main/java/org/embulk/parser/jsonl/ColumnCaster.java +97 -0
  14. data/src/main/java/org/embulk/parser/jsonl/ColumnVisitorImpl.java +164 -0
  15. data/src/main/java/org/embulk/parser/jsonl/JsonRecordValidateException.java +11 -3
  16. data/src/main/java/org/embulk/parser/jsonl/JsonlParserPlugin.java +39 -14
  17. data/src/main/java/org/embulk/parser/jsonl/cast/BooleanCast.java +39 -0
  18. data/src/main/java/org/embulk/parser/jsonl/cast/DoubleCast.java +41 -0
  19. data/src/main/java/org/embulk/parser/jsonl/cast/JsonCast.java +40 -0
  20. data/src/main/java/org/embulk/parser/jsonl/cast/LongCast.java +47 -0
  21. data/src/main/java/org/embulk/parser/jsonl/cast/StringCast.java +82 -0
  22. data/src/test/java/org/embulk/parser/jsonl/TestColumnCaster.java +256 -0
  23. data/src/test/java/org/embulk/parser/jsonl/cast/TestBooleanCast.java +56 -0
  24. data/src/test/java/org/embulk/parser/jsonl/cast/TestDoubleCast.java +50 -0
  25. data/src/test/java/org/embulk/parser/jsonl/cast/TestJsonCast.java +80 -0
  26. data/src/test/java/org/embulk/parser/jsonl/cast/TestLongCast.java +42 -0
  27. data/src/test/java/org/embulk/parser/jsonl/cast/TestStringCast.java +103 -0
  28. metadata +21 -7
  29. data/src/main/java/org/embulk/parser/jsonl/JsonlColumnOption.java +0 -16
  30. data/src/main/java/org/embulk/parser/jsonl/getter/ColumnGetterFactory.java +0 -24
  31. data/src/main/java/org/embulk/parser/jsonl/getter/CommonColumnGetter.java +0 -131
  32. data/src/main/java/org/embulk/parser/jsonl/getter/StringColumnGetter.java +0 -68
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7f4944036ca4cacbe9f2b8a0943503f77c92aca1
4
- data.tar.gz: a11616b53812123915ccdd8e88938471f81b2aea
3
+ metadata.gz: e169a403e6e9d48e00b0a5d4b5aed92774e556a1
4
+ data.tar.gz: 0b762ec444862c9fc3f575d1b7ddd87207632d08
5
5
  SHA512:
6
- metadata.gz: bf717db902192c17124a393b063972067e2e9d46b66dfc7247cf427f03bcec551d10c799c2ec6923ca486ba44ec0017d72598280106d356eb3f9464fdbf94d0f
7
- data.tar.gz: be23fbef9cba986f15fecebd91ba15059f1fea6fe13aaebd62373488aa092e6f42b7fb13ab471bf125c38f8ab587a7a0bfe62dc6b465dece554a80e6e0792e25
6
+ metadata.gz: 6a16d1e617e0bcf7ccd1231704374f3ba30c49012c21b7b9b03744b245120e1fe174a2c42d9cc8b481836485c0c9cafa54a8fecfab9417c063fac161b6f650da
7
+ data.tar.gz: 44cdc4bb41d368162b6b58a537c3c1eecf2f0b3685dd87643ecd394e2666cbd58aa55c0db09f82be9c61449b17b47ea5d164284f92e4cc2b3890e1d7b7f852bb
@@ -1,7 +1,11 @@
1
+ ## 0.2.0 - 2016-05-28
2
+
3
+ [new feature] Support typecast option [#9](https://github.com/shun0102/embulk-parser-jsonl/pull/9)
4
+
1
5
  ## 0.1.2 - 2016-03-27
2
6
 
3
7
  [new feature] Support column_options option [#4](https://github.com/shun0102/embulk-parser-jsonl/pull/4)
4
- [maintenance] Upgrade Embulk v0.8.8 [#6](https://github.com/shun0102/embulk-parser-jsonl/pull/6)
8
+ [maintenance] Upgrade Embulk v0.8.8 [#6](https://github.com/shun0102/embulk-parser-jsonl/pull/6)## 0.1.2 - 2016-03-27
5
9
 
6
10
  ## 0.1.1 - 2016-03-17
7
11
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Jsonl parser plugin for Embulk
2
2
 
3
- TODO: Write short description here and embulk-parser-jsonl.gemspec file.
3
+ [JSONL (JSON Lines)](http://jsonlines.org/) parser plugin for Embulk
4
4
 
5
5
  ## Overview
6
6
 
@@ -9,21 +9,24 @@ TODO: Write short description here and embulk-parser-jsonl.gemspec file.
9
9
 
10
10
  ## Configuration
11
11
 
12
- - **type**: specify this parser as jsonl
13
- - **columns**: specify column name and type (array, required)
12
+ - **type**: Specify this parser as jsonl
13
+ - **columns**: Specify column name and type. See below (array, required)
14
+ * **stop_on_invalid_record**: Stop the bulk load transaction if a file includes an invalid record (such as an invalid timestamp) (boolean, default: false)
15
+ * **default_timezone**: Default timezone of the timestamp (string, default: UTC)
16
+ * **default_timestamp_format**: Default format of the timestamp (string, default: `%Y-%m-%d %H:%M:%S.%N %z`)
17
+ * **newline**: Newline character (CRLF, LF or CR) (string, default: CRLF)
18
+ * **charset**: Character encoding (e.g. ISO-8859-1, UTF-8) (string, default: UTF-8)
19
+ * **default_typecast**: Specify whether to cast values automatically to the specified types or not (boolean, default: true)
14
20
 
15
- ## Example
21
+ ### columns
16
22
 
17
- ```yaml
18
- in:
19
- type: any file input plugin type
20
- parser:
21
- type: jsonl
22
- columns:
23
- - {name: first_name, type: string}
24
- - {name: last_name, type: string}
25
- - {name: age, type: long}
26
- ```
23
+ * **name**: Name of the column (string, required)
24
+ * **type**: Type of the column (string, required)
25
+ * **timezone**: Timezone of the timestamp if type is timestamp (string, default: default_timezone)
26
+ * **format**: Format of the timestamp if type is timestamp (string, default: default_timestamp_format)
27
+ * **typecast**: Whether to cast values or not (boolean, default: default_typecast)
28
+
29
+ ## Guess
27
30
 
28
31
  (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
29
32
 
@@ -0,0 +1,5 @@
1
+ File::open('bench/sample.jsonl', 'w') { |f|
2
+ (1..1000000).each {
3
+ f.puts(%Q[{"foo":"foo","bool":true,"bool_str":"true","int":10,"int_str":"20","double":1.5,"double_str":"2.5","array":[1,2,3]}])
4
+ }
5
+ }
@@ -0,0 +1,17 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "bench/sample"
4
+ parser:
5
+ type: jsonl
6
+ default_typecast: true
7
+ columns:
8
+ - {name: "foo", type: "string"}
9
+ - {name: "bool", type: "boolean"}
10
+ - {name: "bool_str", type: "boolean"}
11
+ - {name: "int", type: "long"}
12
+ - {name: "int_str", type: "long"}
13
+ - {name: "double", type: "double"}
14
+ - {name: "double_str", type: "double"}
15
+ - {name: "array", type: "json"}
16
+ out:
17
+ type: "null"
@@ -0,0 +1,17 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "bench/sample"
4
+ parser:
5
+ type: jsonl
6
+ default_typecast: false
7
+ columns:
8
+ - {name: "foo", type: "string"}
9
+ - {name: "bool", type: "boolean"}
10
+ - {name: "bool_str", type: "string"}
11
+ - {name: "int", type: "long"}
12
+ - {name: "int_str", type: "string"}
13
+ - {name: "double", type: "double"}
14
+ - {name: "double_str", type: "string"}
15
+ - {name: "array", type: "json"}
16
+ out:
17
+ type: "null"
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.2.0"
17
17
 
18
18
  compileJava.options.encoding = 'UTF-8' // source encoding
19
19
  sourceCompatibility = 1.7
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-parser-jsonl"
4
- spec.version = "0.1.2"
4
+ spec.version = "0.2.0"
5
5
  spec.authors = ["Shunsuke Mikami"]
6
6
  spec.summary = "Jsonl parser plugin for Embulk"
7
7
  spec.description = "Parses Jsonl files read by other file input plugins."
@@ -0,0 +1,21 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "example/sample"
4
+ parser:
5
+ type: jsonl
6
+ columns:
7
+ - {name: "foo", type: "string"}
8
+ - {name: "bool", type: "boolean"}
9
+ - {name: "bool_str", type: "boolean"}
10
+ - {name: "int", type: "string"}
11
+ - {name: "int_str", type: "string"}
12
+ - {name: "time", type: "timestamp", format: '%Y-%m-%d %H:%M:%S'}
13
+ - {name: "double", type: "double"}
14
+ - {name: "double_str", type: "string"}
15
+ - {name: "array", type: "json"}
16
+ column_options:
17
+ bool_str: {type: "boolean"}
18
+ int_str: {type: "long"}
19
+ double_str: {type: "double"}
20
+ out:
21
+ type: stdout
@@ -1,23 +1,18 @@
1
1
  in:
2
2
  type: file
3
- path_prefix: "sample"
3
+ path_prefix: "example/sample"
4
4
  parser:
5
5
  type: jsonl
6
+ # default_typecast: true # default: true
6
7
  columns:
7
8
  - {name: "foo", type: "string"}
8
9
  - {name: "bool", type: "boolean"}
9
- - {name: "bool_str", type: "string"}
10
+ - {name: "bool_str", type: "boolean"}
10
11
  - {name: "int", type: "long"}
11
- - {name: "int_str", type: "string"}
12
+ - {name: "int_str", type: "long"}
12
13
  - {name: "time", type: "timestamp", format: '%Y-%m-%d %H:%M:%S'}
13
14
  - {name: "double", type: "double"}
14
- - {name: "double_str", type: "string"}
15
+ - {name: "double_str", type: "double"}
15
16
  - {name: "array", type: "json"}
16
- column_options:
17
- bool_str: {type: "boolean"}
18
- int_str: {type: "long"}
19
- double_str: {type: "double"}
20
-
21
-
22
17
  out:
23
18
  type: stdout
@@ -0,0 +1,18 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "example/sample"
4
+ parser:
5
+ type: jsonl
6
+ default_typecast: false
7
+ columns:
8
+ - {name: "foo", type: "string"}
9
+ - {name: "bool", type: "boolean"}
10
+ - {name: "bool_str", type: "string"}
11
+ - {name: "int", type: "long"}
12
+ - {name: "int_str", type: "string"}
13
+ - {name: "time", type: "timestamp", format: '%Y-%m-%d %H:%M:%S'}
14
+ - {name: "double", type: "double"}
15
+ - {name: "double_str", type: "string"}
16
+ - {name: "array", type: "json"}
17
+ out:
18
+ type: stdout
@@ -1,2 +1,2 @@
1
- {"foo": "bar", "bool": true, "bool_str": "true", "int": 1, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 1.2, "double_str": "2.4", "array": [1, 2, 3]}
2
- {"foo": null, "bool": false, "bool_str": "false", "int": 1, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 1.2, "double_str": "2.4", "array": [{"inner": "hoge"}, {"inner": 1.5}]}
1
+ {"foo": "bar", "bool": true, "bool_str": "true", "int": 42, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 2.4, "double_str": "2.4", "array": [1, 2, 3]}
2
+ {"foo": null, "bool": false, "bool_str": "false", "int": 42, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 2.4, "double_str": "2.4", "array": [{"inner": "hoge"}, {"inner": 1.5}]}
@@ -0,0 +1,97 @@
1
+ package org.embulk.parser.jsonl;
2
+
3
+ import org.embulk.parser.jsonl.cast.BooleanCast;
4
+ import org.embulk.parser.jsonl.cast.DoubleCast;
5
+ import org.embulk.parser.jsonl.cast.JsonCast;
6
+ import org.embulk.parser.jsonl.cast.LongCast;
7
+ import org.embulk.parser.jsonl.cast.StringCast;
8
+ import org.embulk.spi.DataException;
9
+ import org.embulk.spi.time.Timestamp;
10
+ import org.embulk.spi.time.TimestampParser;
11
+ import org.msgpack.value.Value;
12
+
13
+ class ColumnCaster
14
+ {
15
+ ColumnCaster() {}
16
+
17
+ public static boolean asBoolean(Value value) throws DataException
18
+ {
19
+ if (value.isBooleanValue()) {
20
+ return value.asBooleanValue().getBoolean();
21
+ }
22
+ else if (value.isIntegerValue()) {
23
+ return LongCast.asBoolean(value.asIntegerValue().asLong());
24
+ }
25
+ else if (value.isFloatValue()) {
26
+ return DoubleCast.asBoolean(value.asFloatValue().toDouble());
27
+ }
28
+ else if (value.isStringValue()) {
29
+ return StringCast.asBoolean(value.asStringValue().asString());
30
+ }
31
+ else {
32
+ return JsonCast.asBoolean(value);
33
+ }
34
+ }
35
+
36
+ public static long asLong(Value value) throws DataException
37
+ {
38
+ if (value.isBooleanValue()) {
39
+ return BooleanCast.asLong(value.asBooleanValue().getBoolean());
40
+ }
41
+ else if (value.isIntegerValue()) {
42
+ return value.asIntegerValue().asLong();
43
+ }
44
+ else if (value.isFloatValue()) {
45
+ return DoubleCast.asLong(value.asFloatValue().toDouble());
46
+ }
47
+ else if (value.isStringValue()) {
48
+ return StringCast.asLong(value.asStringValue().asString());
49
+ }
50
+ else {
51
+ return JsonCast.asLong(value);
52
+ }
53
+ }
54
+
55
+ public static double asDouble(Value value) throws DataException
56
+ {
57
+ if (value.isBooleanValue()) {
58
+ return BooleanCast.asDouble(value.asBooleanValue().getBoolean());
59
+ }
60
+ else if (value.isIntegerValue()) {
61
+ return LongCast.asDouble(value.asIntegerValue().asLong());
62
+ }
63
+ else if (value.isFloatValue()) {
64
+ return value.asFloatValue().toDouble();
65
+ }
66
+ else if (value.isStringValue()) {
67
+ return StringCast.asDouble(value.asStringValue().asString());
68
+ }
69
+ else {
70
+ return JsonCast.asDouble(value);
71
+ }
72
+ }
73
+
74
+ public static String asString(Value value) throws DataException
75
+ {
76
+ return value.toString();
77
+ }
78
+
79
+ public static Timestamp asTimestamp(Value value, TimestampParser parser) throws DataException
80
+ {
81
+ if (value.isBooleanValue()) {
82
+ return BooleanCast.asTimestamp(value.asBooleanValue().getBoolean());
83
+ }
84
+ else if (value.isIntegerValue()) {
85
+ return LongCast.asTimestamp(value.asIntegerValue().asLong());
86
+ }
87
+ else if (value.isFloatValue()) {
88
+ return DoubleCast.asTimestamp(value.asFloatValue().toDouble());
89
+ }
90
+ else if (value.isStringValue()) {
91
+ return StringCast.asTimestamp(value.asStringValue().asString(), parser);
92
+ }
93
+ else {
94
+ return JsonCast.asTimestamp(value);
95
+ }
96
+ }
97
+ }
@@ -0,0 +1,164 @@
1
+ package org.embulk.parser.jsonl;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.parser.jsonl.JsonlParserPlugin.PluginTask;
5
+ import org.embulk.parser.jsonl.JsonlParserPlugin.TypecastColumnOption;
6
+
7
+ import org.embulk.spi.Column;
8
+ import org.embulk.spi.ColumnConfig;
9
+ import org.embulk.spi.ColumnVisitor;
10
+ import org.embulk.spi.PageBuilder;
11
+ import org.embulk.spi.Schema;
12
+ import org.embulk.spi.SchemaConfig;
13
+ import org.embulk.spi.time.Timestamp;
14
+ import org.embulk.spi.time.TimestampParser;
15
+ import org.msgpack.core.MessageTypeException;
16
+ import org.msgpack.value.Value;
17
+
18
+ public class ColumnVisitorImpl implements ColumnVisitor {
19
+ protected final PluginTask task;
20
+ protected final Schema schema;
21
+ protected final PageBuilder pageBuilder;
22
+ protected final TimestampParser[] timestampParsers;
23
+ protected final Boolean autoTypecasts[];
24
+
25
+ protected Value value;
26
+
27
+ public ColumnVisitorImpl(PluginTask task, Schema schema, PageBuilder pageBuilder, TimestampParser[] timestampParsers)
28
+ {
29
+ this.task = task;
30
+ this.schema = schema;
31
+ this.pageBuilder = pageBuilder;
32
+ this.timestampParsers = timestampParsers;
33
+ this.autoTypecasts = new Boolean[schema.size()];
34
+ buildAutoTypecasts();
35
+ }
36
+
37
+ private void buildAutoTypecasts()
38
+ {
39
+ for (Column column : schema.getColumns()) {
40
+ this.autoTypecasts[column.getIndex()] = task.getDefaultTypecast();
41
+ }
42
+
43
+ Optional<SchemaConfig> schemaConfig = task.getSchemaConfig();
44
+ if (schemaConfig.isPresent()) {
45
+ for (ColumnConfig columnConfig : schemaConfig.get().getColumns()) {
46
+ TypecastColumnOption columnOption = columnConfig.getOption().loadConfig(TypecastColumnOption.class);
47
+ Boolean autoTypecast = columnOption.getTypecast().or(task.getDefaultTypecast());
48
+ Column column = schema.lookupColumn(columnConfig.getName());
49
+ this.autoTypecasts[column.getIndex()] = autoTypecast;
50
+ }
51
+ }
52
+ }
53
+
54
+ public void setValue(Value value)
55
+ {
56
+ this.value = value;
57
+ }
58
+
59
+ @Override
60
+ public void booleanColumn(Column column)
61
+ {
62
+ if (isNil(value)) {
63
+ pageBuilder.setNull(column);
64
+ }
65
+ else {
66
+ try {
67
+ boolean booleanValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asBoolean(value) : value.asBooleanValue().getBoolean();
68
+ pageBuilder.setBoolean(column, booleanValue);
69
+ }
70
+ catch (MessageTypeException e) {
71
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Boolean", value), e);
72
+ }
73
+ }
74
+ }
75
+
76
+ @Override
77
+ public void longColumn(Column column)
78
+ {
79
+ if (isNil(value)) {
80
+ pageBuilder.setNull(column);
81
+ }
82
+ else {
83
+ try {
84
+ long longValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asLong(value) : value.asIntegerValue().toLong();
85
+ pageBuilder.setLong(column, longValue);
86
+ }
87
+ catch (MessageTypeException e) {
88
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Long", value), e);
89
+ }
90
+ }
91
+ }
92
+
93
+ @Override
94
+ public void doubleColumn(Column column)
95
+ {
96
+ if (isNil(value)) {
97
+ pageBuilder.setNull(column);
98
+ }
99
+ else {
100
+ try {
101
+ double doubleValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asDouble(value) : value.asFloatValue().toDouble();
102
+ pageBuilder.setDouble(column, doubleValue);
103
+ }
104
+ catch (MessageTypeException e) {
105
+ throw new JsonRecordValidateException(String.format("failed get \"%s\" as Double", value), e);
106
+ }
107
+ }
108
+ }
109
+
110
+ @Override
111
+ public void stringColumn(Column column)
112
+ {
113
+ if (isNil(value)) {
114
+ pageBuilder.setNull(column);
115
+ }
116
+ else {
117
+ try {
118
+ String string = autoTypecasts[column.getIndex()] ? ColumnCaster.asString(value) : value.asStringValue().toString();
119
+ pageBuilder.setString(column, string);
120
+ }
121
+ catch (MessageTypeException e) {
122
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as String", value), e);
123
+ }
124
+ }
125
+ }
126
+
127
+ @Override
128
+ public void timestampColumn(Column column)
129
+ {
130
+ if (isNil(value)) {
131
+ pageBuilder.setNull(column);
132
+ }
133
+ else {
134
+ try {
135
+ Timestamp timestamp = ColumnCaster.asTimestamp(value, timestampParsers[column.getIndex()]);
136
+ pageBuilder.setTimestamp(column, timestamp);
137
+ }
138
+ catch (MessageTypeException e) {
139
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Timestamp", value), e);
140
+ }
141
+ }
142
+ }
143
+
144
+ @Override
145
+ public void jsonColumn(Column column)
146
+ {
147
+ if (isNil(value)) {
148
+ pageBuilder.setNull(column);
149
+ }
150
+ else {
151
+ try {
152
+ pageBuilder.setJson(column, value);
153
+ }
154
+ catch (MessageTypeException e) {
155
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Json", value), e);
156
+ }
157
+ }
158
+ }
159
+
160
+ protected boolean isNil(Value v)
161
+ {
162
+ return v == null || v.isNilValue();
163
+ }
164
+ }