embulk-parser-jsonl 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -1
  3. data/README.md +17 -14
  4. data/bench/gen_dummy.rb +5 -0
  5. data/bench/typecast.yml +17 -0
  6. data/bench/without_typecast.yml +17 -0
  7. data/build.gradle +1 -1
  8. data/embulk-parser-jsonl.gemspec +1 -1
  9. data/example/compat.yml +21 -0
  10. data/example/example.yml +5 -10
  11. data/example/example_without_typecast.yml +18 -0
  12. data/example/sample.json +2 -2
  13. data/src/main/java/org/embulk/parser/jsonl/ColumnCaster.java +97 -0
  14. data/src/main/java/org/embulk/parser/jsonl/ColumnVisitorImpl.java +164 -0
  15. data/src/main/java/org/embulk/parser/jsonl/JsonRecordValidateException.java +11 -3
  16. data/src/main/java/org/embulk/parser/jsonl/JsonlParserPlugin.java +39 -14
  17. data/src/main/java/org/embulk/parser/jsonl/cast/BooleanCast.java +39 -0
  18. data/src/main/java/org/embulk/parser/jsonl/cast/DoubleCast.java +41 -0
  19. data/src/main/java/org/embulk/parser/jsonl/cast/JsonCast.java +40 -0
  20. data/src/main/java/org/embulk/parser/jsonl/cast/LongCast.java +47 -0
  21. data/src/main/java/org/embulk/parser/jsonl/cast/StringCast.java +82 -0
  22. data/src/test/java/org/embulk/parser/jsonl/TestColumnCaster.java +256 -0
  23. data/src/test/java/org/embulk/parser/jsonl/cast/TestBooleanCast.java +56 -0
  24. data/src/test/java/org/embulk/parser/jsonl/cast/TestDoubleCast.java +50 -0
  25. data/src/test/java/org/embulk/parser/jsonl/cast/TestJsonCast.java +80 -0
  26. data/src/test/java/org/embulk/parser/jsonl/cast/TestLongCast.java +42 -0
  27. data/src/test/java/org/embulk/parser/jsonl/cast/TestStringCast.java +103 -0
  28. metadata +21 -7
  29. data/src/main/java/org/embulk/parser/jsonl/JsonlColumnOption.java +0 -16
  30. data/src/main/java/org/embulk/parser/jsonl/getter/ColumnGetterFactory.java +0 -24
  31. data/src/main/java/org/embulk/parser/jsonl/getter/CommonColumnGetter.java +0 -131
  32. data/src/main/java/org/embulk/parser/jsonl/getter/StringColumnGetter.java +0 -68
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7f4944036ca4cacbe9f2b8a0943503f77c92aca1
4
- data.tar.gz: a11616b53812123915ccdd8e88938471f81b2aea
3
+ metadata.gz: e169a403e6e9d48e00b0a5d4b5aed92774e556a1
4
+ data.tar.gz: 0b762ec444862c9fc3f575d1b7ddd87207632d08
5
5
  SHA512:
6
- metadata.gz: bf717db902192c17124a393b063972067e2e9d46b66dfc7247cf427f03bcec551d10c799c2ec6923ca486ba44ec0017d72598280106d356eb3f9464fdbf94d0f
7
- data.tar.gz: be23fbef9cba986f15fecebd91ba15059f1fea6fe13aaebd62373488aa092e6f42b7fb13ab471bf125c38f8ab587a7a0bfe62dc6b465dece554a80e6e0792e25
6
+ metadata.gz: 6a16d1e617e0bcf7ccd1231704374f3ba30c49012c21b7b9b03744b245120e1fe174a2c42d9cc8b481836485c0c9cafa54a8fecfab9417c063fac161b6f650da
7
+ data.tar.gz: 44cdc4bb41d368162b6b58a537c3c1eecf2f0b3685dd87643ecd394e2666cbd58aa55c0db09f82be9c61449b17b47ea5d164284f92e4cc2b3890e1d7b7f852bb
@@ -1,7 +1,11 @@
1
+ ## 0.2.0 - 2016-05-28
2
+
3
+ [new feature] Support typecast option [#9](https://github.com/shun0102/embulk-parser-jsonl/pull/9)
4
+
1
5
  ## 0.1.2 - 2016-03-27
2
6
 
3
7
  [new feature] Support column_options option [#4](https://github.com/shun0102/embulk-parser-jsonl/pull/4)
4
- [maintenance] Upgrade Embulk v0.8.8 [#6](https://github.com/shun0102/embulk-parser-jsonl/pull/6)
8
+ [maintenance] Upgrade Embulk v0.8.8 [#6](https://github.com/shun0102/embulk-parser-jsonl/pull/6)
5
9
 
6
10
  ## 0.1.1 - 2016-03-17
7
11
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Jsonl parser plugin for Embulk
2
2
 
3
- TODO: Write short description here and embulk-parser-jsonl.gemspec file.
3
+ [JSONL (JSON Lines)](http://jsonlines.org/) parser plugin for Embulk
4
4
 
5
5
  ## Overview
6
6
 
@@ -9,21 +9,24 @@ TODO: Write short description here and embulk-parser-jsonl.gemspec file.
9
9
 
10
10
  ## Configuration
11
11
 
12
- - **type**: specify this parser as jsonl
13
- - **columns**: specify column name and type (array, required)
12
+ - **type**: Specify this parser as jsonl
13
+ - **columns**: Specify column name and type. See below (array, required)
14
+ * **stop_on_invalid_record**: Stop bulk load transaction if a file includes invalid record (such as invalid timestamp) (boolean, default: false)
15
+ * **default_timezone**: Default timezone of the timestamp (string, default: UTC)
16
+ * **default_timestamp_format**: Default timestamp format of the timestamp (string, default: `%Y-%m-%d %H:%M:%S.%N %z`)
17
+ * **newline**: Newline character (CRLF, LF or CR) (string, default: CRLF)
18
+ * **charset**: Character encoding (eg. ISO-8859-1, UTF-8) (string, default: UTF-8)
19
+ * **default_typecast**: Specify whether to cast values automatically to the specified types or not (boolean, default: true)
14
20
 
15
- ## Example
21
+ ### columns
16
22
 
17
- ```yaml
18
- in:
19
- type: any file input plugin type
20
- parser:
21
- type: jsonl
22
- columns:
23
- - {name: first_name, type: string}
24
- - {name: last_name, type: string}
25
- - {name: age, type: long}
26
- ```
23
+ * **name**: Name of the column (string, required)
24
+ * **type**: Type of the column (string, required)
25
+ * **timezone**: Timezone of the timestamp if type is timestamp (string, default: default_timezone)
26
+ * **format**: Format of the timestamp if type is timestamp (string, default: default_timestamp_format)
27
+ * **typecast**: Whether to cast values automatically to the column type or not (boolean, default: default_typecast)
28
+
29
+ ## Guess
27
30
 
28
31
  (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
29
32
 
@@ -0,0 +1,5 @@
1
+ File::open('bench/sample.jsonl', 'w') { |f|
2
+ (1..1000000).each {
3
+ f.puts(%Q[{"foo":"foo","bool":true,"bool_str":"true","int":10,"int_str":"20","double":1.5,"double_str":"2.5","array":[1,2,3]}])
4
+ }
5
+ }
@@ -0,0 +1,17 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "bench/sample"
4
+ parser:
5
+ type: jsonl
6
+ default_typecast: true
7
+ columns:
8
+ - {name: "foo", type: "string"}
9
+ - {name: "bool", type: "boolean"}
10
+ - {name: "bool_str", type: "boolean"}
11
+ - {name: "int", type: "long"}
12
+ - {name: "int_str", type: "long"}
13
+ - {name: "double", type: "double"}
14
+ - {name: "double_str", type: "double"}
15
+ - {name: "array", type: "json"}
16
+ out:
17
+ type: "null"
@@ -0,0 +1,17 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "bench/sample"
4
+ parser:
5
+ type: jsonl
6
+ default_typecast: false
7
+ columns:
8
+ - {name: "foo", type: "string"}
9
+ - {name: "bool", type: "boolean"}
10
+ - {name: "bool_str", type: "string"}
11
+ - {name: "int", type: "long"}
12
+ - {name: "int_str", type: "string"}
13
+ - {name: "double", type: "double"}
14
+ - {name: "double_str", type: "string"}
15
+ - {name: "array", type: "json"}
16
+ out:
17
+ type: "null"
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.1.2"
16
+ version = "0.2.0"
17
17
 
18
18
  compileJava.options.encoding = 'UTF-8' // source encoding
19
19
  sourceCompatibility = 1.7
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-parser-jsonl"
4
- spec.version = "0.1.2"
4
+ spec.version = "0.2.0"
5
5
  spec.authors = ["Shunsuke Mikami"]
6
6
  spec.summary = "Jsonl parser plugin for Embulk"
7
7
  spec.description = "Parses Jsonl files read by other file input plugins."
@@ -0,0 +1,21 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "example/sample"
4
+ parser:
5
+ type: jsonl
6
+ columns:
7
+ - {name: "foo", type: "string"}
8
+ - {name: "bool", type: "boolean"}
9
+ - {name: "bool_str", type: "boolean"}
10
+ - {name: "int", type: "string"}
11
+ - {name: "int_str", type: "string"}
12
+ - {name: "time", type: "timestamp", format: '%Y-%m-%d %H:%M:%S'}
13
+ - {name: "double", type: "double"}
14
+ - {name: "double_str", type: "string"}
15
+ - {name: "array", type: "json"}
16
+ column_options:
17
+ bool_str: {type: "boolean"}
18
+ int_str: {type: "long"}
19
+ double_str: {type: "double"}
20
+ out:
21
+ type: stdout
@@ -1,23 +1,18 @@
1
1
  in:
2
2
  type: file
3
- path_prefix: "sample"
3
+ path_prefix: "example/sample"
4
4
  parser:
5
5
  type: jsonl
6
+ # default_typecast: true # default: true
6
7
  columns:
7
8
  - {name: "foo", type: "string"}
8
9
  - {name: "bool", type: "boolean"}
9
- - {name: "bool_str", type: "string"}
10
+ - {name: "bool_str", type: "boolean"}
10
11
  - {name: "int", type: "long"}
11
- - {name: "int_str", type: "string"}
12
+ - {name: "int_str", type: "long"}
12
13
  - {name: "time", type: "timestamp", format: '%Y-%m-%d %H:%M:%S'}
13
14
  - {name: "double", type: "double"}
14
- - {name: "double_str", type: "string"}
15
+ - {name: "double_str", type: "double"}
15
16
  - {name: "array", type: "json"}
16
- column_options:
17
- bool_str: {type: "boolean"}
18
- int_str: {type: "long"}
19
- double_str: {type: "double"}
20
-
21
-
22
17
  out:
23
18
  type: stdout
@@ -0,0 +1,18 @@
1
+ in:
2
+ type: file
3
+ path_prefix: "example/sample"
4
+ parser:
5
+ type: jsonl
6
+ default_typecast: false
7
+ columns:
8
+ - {name: "foo", type: "string"}
9
+ - {name: "bool", type: "boolean"}
10
+ - {name: "bool_str", type: "string"}
11
+ - {name: "int", type: "long"}
12
+ - {name: "int_str", type: "string"}
13
+ - {name: "time", type: "timestamp", format: '%Y-%m-%d %H:%M:%S'}
14
+ - {name: "double", type: "double"}
15
+ - {name: "double_str", type: "string"}
16
+ - {name: "array", type: "json"}
17
+ out:
18
+ type: stdout
@@ -1,2 +1,2 @@
1
- {"foo": "bar", "bool": true, "bool_str": "true", "int": 1, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 1.2, "double_str": "2.4", "array": [1, 2, 3]}
2
- {"foo": null, "bool": false, "bool_str": "false", "int": 1, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 1.2, "double_str": "2.4", "array": [{"inner": "hoge"}, {"inner": 1.5}]}
1
+ {"foo": "bar", "bool": true, "bool_str": "true", "int": 42, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 2.4, "double_str": "2.4", "array": [1, 2, 3]}
2
+ {"foo": null, "bool": false, "bool_str": "false", "int": 42, "int_str": "42", "time": "2016-3-2 00:39:18", "double": 2.4, "double_str": "2.4", "array": [{"inner": "hoge"}, {"inner": 1.5}]}
@@ -0,0 +1,97 @@
1
+ package org.embulk.parser.jsonl;
2
+
3
+ import org.embulk.parser.jsonl.cast.BooleanCast;
4
+ import org.embulk.parser.jsonl.cast.DoubleCast;
5
+ import org.embulk.parser.jsonl.cast.JsonCast;
6
+ import org.embulk.parser.jsonl.cast.LongCast;
7
+ import org.embulk.parser.jsonl.cast.StringCast;
8
+ import org.embulk.spi.DataException;
9
+ import org.embulk.spi.time.Timestamp;
10
+ import org.embulk.spi.time.TimestampParser;
11
+ import org.msgpack.value.Value;
12
+
13
+ class ColumnCaster
14
+ {
15
+ ColumnCaster() {}
16
+
17
+ public static boolean asBoolean(Value value) throws DataException
18
+ {
19
+ if (value.isBooleanValue()) {
20
+ return value.asBooleanValue().getBoolean();
21
+ }
22
+ else if (value.isIntegerValue()) {
23
+ return LongCast.asBoolean(value.asIntegerValue().asLong());
24
+ }
25
+ else if (value.isFloatValue()) {
26
+ return DoubleCast.asBoolean(value.asFloatValue().toDouble());
27
+ }
28
+ else if (value.isStringValue()) {
29
+ return StringCast.asBoolean(value.asStringValue().asString());
30
+ }
31
+ else {
32
+ return JsonCast.asBoolean(value);
33
+ }
34
+ }
35
+
36
+ public static long asLong(Value value) throws DataException
37
+ {
38
+ if (value.isBooleanValue()) {
39
+ return BooleanCast.asLong(value.asBooleanValue().getBoolean());
40
+ }
41
+ else if (value.isIntegerValue()) {
42
+ return value.asIntegerValue().asLong();
43
+ }
44
+ else if (value.isFloatValue()) {
45
+ return DoubleCast.asLong(value.asFloatValue().toDouble());
46
+ }
47
+ else if (value.isStringValue()) {
48
+ return StringCast.asLong(value.asStringValue().asString());
49
+ }
50
+ else {
51
+ return JsonCast.asLong(value);
52
+ }
53
+ }
54
+
55
+ public static double asDouble(Value value) throws DataException
56
+ {
57
+ if (value.isBooleanValue()) {
58
+ return BooleanCast.asDouble(value.asBooleanValue().getBoolean());
59
+ }
60
+ else if (value.isIntegerValue()) {
61
+ return LongCast.asDouble(value.asIntegerValue().asLong());
62
+ }
63
+ else if (value.isFloatValue()) {
64
+ return value.asFloatValue().toDouble();
65
+ }
66
+ else if (value.isStringValue()) {
67
+ return StringCast.asDouble(value.asStringValue().asString());
68
+ }
69
+ else {
70
+ return JsonCast.asDouble(value);
71
+ }
72
+ }
73
+
74
+ public static String asString(Value value) throws DataException
75
+ {
76
+ return value.toString();
77
+ }
78
+
79
+ public static Timestamp asTimestamp(Value value, TimestampParser parser) throws DataException
80
+ {
81
+ if (value.isBooleanValue()) {
82
+ return BooleanCast.asTimestamp(value.asBooleanValue().getBoolean());
83
+ }
84
+ else if (value.isIntegerValue()) {
85
+ return LongCast.asTimestamp(value.asIntegerValue().asLong());
86
+ }
87
+ else if (value.isFloatValue()) {
88
+ return DoubleCast.asTimestamp(value.asFloatValue().toDouble());
89
+ }
90
+ else if (value.isStringValue()) {
91
+ return StringCast.asTimestamp(value.asStringValue().asString(), parser);
92
+ }
93
+ else {
94
+ return JsonCast.asTimestamp(value);
95
+ }
96
+ }
97
+ }
@@ -0,0 +1,164 @@
1
+ package org.embulk.parser.jsonl;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.parser.jsonl.JsonlParserPlugin.PluginTask;
5
+ import org.embulk.parser.jsonl.JsonlParserPlugin.TypecastColumnOption;
6
+
7
+ import org.embulk.spi.Column;
8
+ import org.embulk.spi.ColumnConfig;
9
+ import org.embulk.spi.ColumnVisitor;
10
+ import org.embulk.spi.PageBuilder;
11
+ import org.embulk.spi.Schema;
12
+ import org.embulk.spi.SchemaConfig;
13
+ import org.embulk.spi.time.Timestamp;
14
+ import org.embulk.spi.time.TimestampParser;
15
+ import org.msgpack.core.MessageTypeException;
16
+ import org.msgpack.value.Value;
17
+
18
+ public class ColumnVisitorImpl implements ColumnVisitor {
19
+ protected final PluginTask task;
20
+ protected final Schema schema;
21
+ protected final PageBuilder pageBuilder;
22
+ protected final TimestampParser[] timestampParsers;
23
+ protected final Boolean autoTypecasts[];
24
+
25
+ protected Value value;
26
+
27
+ public ColumnVisitorImpl(PluginTask task, Schema schema, PageBuilder pageBuilder, TimestampParser[] timestampParsers)
28
+ {
29
+ this.task = task;
30
+ this.schema = schema;
31
+ this.pageBuilder = pageBuilder;
32
+ this.timestampParsers = timestampParsers;
33
+ this.autoTypecasts = new Boolean[schema.size()];
34
+ buildAutoTypecasts();
35
+ }
36
+
37
+ private void buildAutoTypecasts()
38
+ {
39
+ for (Column column : schema.getColumns()) {
40
+ this.autoTypecasts[column.getIndex()] = task.getDefaultTypecast();
41
+ }
42
+
43
+ Optional<SchemaConfig> schemaConfig = task.getSchemaConfig();
44
+ if (schemaConfig.isPresent()) {
45
+ for (ColumnConfig columnConfig : schemaConfig.get().getColumns()) {
46
+ TypecastColumnOption columnOption = columnConfig.getOption().loadConfig(TypecastColumnOption.class);
47
+ Boolean autoTypecast = columnOption.getTypecast().or(task.getDefaultTypecast());
48
+ Column column = schema.lookupColumn(columnConfig.getName());
49
+ this.autoTypecasts[column.getIndex()] = autoTypecast;
50
+ }
51
+ }
52
+ }
53
+
54
+ public void setValue(Value value)
55
+ {
56
+ this.value = value;
57
+ }
58
+
59
+ @Override
60
+ public void booleanColumn(Column column)
61
+ {
62
+ if (isNil(value)) {
63
+ pageBuilder.setNull(column);
64
+ }
65
+ else {
66
+ try {
67
+ boolean booleanValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asBoolean(value) : value.asBooleanValue().getBoolean();
68
+ pageBuilder.setBoolean(column, booleanValue);
69
+ }
70
+ catch (MessageTypeException e) {
71
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Boolean", value), e);
72
+ }
73
+ }
74
+ }
75
+
76
+ @Override
77
+ public void longColumn(Column column)
78
+ {
79
+ if (isNil(value)) {
80
+ pageBuilder.setNull(column);
81
+ }
82
+ else {
83
+ try {
84
+ long longValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asLong(value) : value.asIntegerValue().toLong();
85
+ pageBuilder.setLong(column, longValue);
86
+ }
87
+ catch (MessageTypeException e) {
88
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Long", value), e);
89
+ }
90
+ }
91
+ }
92
+
93
+ @Override
94
+ public void doubleColumn(Column column)
95
+ {
96
+ if (isNil(value)) {
97
+ pageBuilder.setNull(column);
98
+ }
99
+ else {
100
+ try {
101
+ double doubleValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asDouble(value) : value.asFloatValue().toDouble();
102
+ pageBuilder.setDouble(column, doubleValue);
103
+ }
104
+ catch (MessageTypeException e) {
105
+ throw new JsonRecordValidateException(String.format("failed get \"%s\" as Double", value), e);
106
+ }
107
+ }
108
+ }
109
+
110
+ @Override
111
+ public void stringColumn(Column column)
112
+ {
113
+ if (isNil(value)) {
114
+ pageBuilder.setNull(column);
115
+ }
116
+ else {
117
+ try {
118
+ String string = autoTypecasts[column.getIndex()] ? ColumnCaster.asString(value) : value.asStringValue().toString();
119
+ pageBuilder.setString(column, string);
120
+ }
121
+ catch (MessageTypeException e) {
122
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as String", value), e);
123
+ }
124
+ }
125
+ }
126
+
127
+ @Override
128
+ public void timestampColumn(Column column)
129
+ {
130
+ if (isNil(value)) {
131
+ pageBuilder.setNull(column);
132
+ }
133
+ else {
134
+ try {
135
+ Timestamp timestamp = ColumnCaster.asTimestamp(value, timestampParsers[column.getIndex()]);
136
+ pageBuilder.setTimestamp(column, timestamp);
137
+ }
138
+ catch (MessageTypeException e) {
139
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Timestamp", value), e);
140
+ }
141
+ }
142
+ }
143
+
144
+ @Override
145
+ public void jsonColumn(Column column)
146
+ {
147
+ if (isNil(value)) {
148
+ pageBuilder.setNull(column);
149
+ }
150
+ else {
151
+ try {
152
+ pageBuilder.setJson(column, value);
153
+ }
154
+ catch (MessageTypeException e) {
155
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Json", value), e);
156
+ }
157
+ }
158
+ }
159
+
160
+ protected boolean isNil(Value v)
161
+ {
162
+ return v == null || v.isNilValue();
163
+ }
164
+ }