embulk-filter-typecast 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +11 -5
- data/build.gradle +2 -1
- data/example/example.yml +41 -0
- data/src/main/java/org/embulk/filter/typecast/ColumnCaster.java +10 -4
- data/src/main/java/org/embulk/filter/typecast/JsonPathUtil.java +78 -0
- data/src/main/java/org/embulk/filter/typecast/JsonVisitor.java +30 -21
- data/src/main/java/org/embulk/filter/typecast/TypecastFilterPlugin.java +6 -5
- metadata +11 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58d54323014090807f457eec3b84aef29060ddd2
|
4
|
+
data.tar.gz: eb7be6477a3a993ce83da5a3e404fa0e7fd3a50d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f19f899b820a7faccca9d261b8cc807ac230321b26b4ccde552e7308c6853bc01a24b7cefde15eef9061f1b06e55f0b1b31e8ff67f29ab83d4f07ac3feb488db
|
7
|
+
data.tar.gz: 479bce7de22d8ecefdd6ddde53cd602f44fefdab0394cb0b83df9eb1f67cf2aeb980aa87b24da62f72a3ce47192f13af5ba6b9c1006116fb1da0f64156313ad2
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -19,17 +19,23 @@ A filter plugin for Embulk to cast column type.
|
|
19
19
|
|
20
20
|
See [example.csv](./example/example.csv) and [example.yml](./example/example.yml).
|
21
21
|
|
22
|
-
## JSONPath
|
22
|
+
## JSONPath
|
23
23
|
|
24
24
|
For `type: json` column, you can specify [JSONPath](http://goessner.net/articles/JsonPath/) for column's name as:
|
25
25
|
|
26
26
|
```
|
27
|
-
$.payload.key1
|
28
|
-
$.payload.array[0]
|
29
|
-
$.payload.array[*]
|
27
|
+
name: $.payload.key1
|
28
|
+
name: "$.payload.array[0]"
|
29
|
+
name: "$.payload.array[*]"
|
30
|
+
name: $['payload']['key1.key2']
|
30
31
|
```
|
31
32
|
|
32
|
-
|
33
|
+
Following operators of JSONPath are not supported:
|
34
|
+
|
35
|
+
* Multiple properties such as `['name','name']`
|
36
|
+
* Multiple array indexes such as `[1,2]`
|
37
|
+
* Array slice such as `[1:2]`
|
38
|
+
* Filter expression such as `[?(<expression>)]`
|
33
39
|
|
34
40
|
## ToDo
|
35
41
|
|
data/build.gradle
CHANGED
@@ -13,13 +13,14 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.4"
|
16
|
+
version = "0.1.5"
|
17
17
|
sourceCompatibility = 1.7
|
18
18
|
targetCompatibility = 1.7
|
19
19
|
|
20
20
|
dependencies {
|
21
21
|
compile "org.embulk:embulk-core:0.8.+"
|
22
22
|
provided "org.embulk:embulk-core:0.8.+"
|
23
|
+
compile "io.github.medjed:JsonPathCompiler:0.1.1"
|
23
24
|
|
24
25
|
testCompile "junit:junit:4.+"
|
25
26
|
testCompile "org.embulk:embulk-core:0.8.+:tests"
|
data/example/example.yml
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: ''
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: timestamp, type: string}
|
13
|
+
- {name: "null", type: string}
|
14
|
+
- {name: long, type: string}
|
15
|
+
- {name: string, type: string}
|
16
|
+
- {name: double, type: string}
|
17
|
+
- {name: json1, type: string}
|
18
|
+
- {name: json2, type: string}
|
19
|
+
- {name: array_str, type: string}
|
20
|
+
- {name: array_int, type: string}
|
21
|
+
- {name: ignore, type: string}
|
22
|
+
- {name: boolean, type: string}
|
23
|
+
filters:
|
24
|
+
- type: typecast
|
25
|
+
columns:
|
26
|
+
- {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
|
27
|
+
- {name: "null", type: long}
|
28
|
+
- {name: long, type: long}
|
29
|
+
- {name: string, type: string}
|
30
|
+
- {name: double, type: double}
|
31
|
+
- {name: json1, type: json}
|
32
|
+
- {name: json2, type: json}
|
33
|
+
- {name: array_str, type: json}
|
34
|
+
- {name: array_int, type: json}
|
35
|
+
- {name: boolean, type: boolean}
|
36
|
+
- {name: "$.json1.string", type: long}
|
37
|
+
- {name: "$.json2.long", type: long}
|
38
|
+
- {name: "$.array_str[0]", type: long}
|
39
|
+
- {name: "$.array_int[*]", type: long}
|
40
|
+
out:
|
41
|
+
type: "null"
|
@@ -1,5 +1,9 @@
|
|
1
1
|
package org.embulk.filter.typecast;
|
2
2
|
|
3
|
+
import io.github.medjed.jsonpathcompiler.expressions.Path;
|
4
|
+
import io.github.medjed.jsonpathcompiler.expressions.Utils;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
6
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PropertyPathToken;
|
3
7
|
import org.embulk.filter.typecast.TypecastFilterPlugin.ColumnConfig;
|
4
8
|
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
5
9
|
|
@@ -61,7 +65,7 @@ class ColumnCaster
|
|
61
65
|
{
|
62
66
|
// columnName => TimestampParser
|
63
67
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
64
|
-
if (columnConfig.getName()
|
68
|
+
if (PathCompiler.isProbablyJsonPath(columnConfig.getName())) {
|
65
69
|
continue; // type: json columns do not support type: timestamp
|
66
70
|
}
|
67
71
|
Column inputColumn = inputSchema.lookupColumn(columnConfig.getName());
|
@@ -76,7 +80,7 @@ class ColumnCaster
|
|
76
80
|
{
|
77
81
|
// columnName => TimestampFormatter
|
78
82
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
79
|
-
if (columnConfig.getName()
|
83
|
+
if (PathCompiler.isProbablyJsonPath(columnConfig.getName())) {
|
80
84
|
continue; // type: json columns do not have type: timestamp
|
81
85
|
}
|
82
86
|
Column inputColumn = inputSchema.lookupColumn(columnConfig.getName());
|
@@ -200,7 +204,8 @@ class ColumnCaster
|
|
200
204
|
}
|
201
205
|
else if (outputType instanceof JsonType) {
|
202
206
|
Value jsonValue = StringCast.asJson(value);
|
203
|
-
String
|
207
|
+
String name = outputColumn.getName();
|
208
|
+
String jsonPath = new StringBuilder("$").append(PropertyPathToken.getPathFragment(name)).toString();
|
204
209
|
Value castedValue = jsonVisitor.visit(jsonPath, jsonValue);
|
205
210
|
pageBuilder.setJson(outputColumn, castedValue);
|
206
211
|
}
|
@@ -238,7 +243,8 @@ class ColumnCaster
|
|
238
243
|
|
239
244
|
public void setFromJson(Column outputColumn, Value value)
|
240
245
|
{
|
241
|
-
String
|
246
|
+
String name = outputColumn.getName();
|
247
|
+
String jsonPath = new StringBuilder("$").append(PropertyPathToken.getPathFragment(name)).toString();
|
242
248
|
Value castedValue = jsonVisitor.visit(jsonPath, value);
|
243
249
|
Type outputType = outputColumn.getType();
|
244
250
|
if (outputType instanceof BooleanType) {
|
@@ -0,0 +1,78 @@
|
|
1
|
+
package org.embulk.filter.typecast;
|
2
|
+
|
3
|
+
import io.github.medjed.jsonpathcompiler.InvalidPathException;
|
4
|
+
import io.github.medjed.jsonpathcompiler.expressions.Path;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ArrayIndexOperation;
|
6
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ArrayPathToken;
|
7
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.FunctionPathToken;
|
8
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
9
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathToken;
|
10
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PredicatePathToken;
|
11
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PropertyPathToken;
|
12
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ScanPathToken;
|
13
|
+
import org.embulk.config.ConfigException;
|
14
|
+
|
15
|
+
public class JsonPathUtil
|
16
|
+
{
|
17
|
+
private JsonPathUtil() {}
|
18
|
+
|
19
|
+
public static String getColumnName(String jsonPath)
|
20
|
+
{
|
21
|
+
Path compiledPath;
|
22
|
+
try {
|
23
|
+
compiledPath = PathCompiler.compile(jsonPath);
|
24
|
+
}
|
25
|
+
catch (InvalidPathException e) {
|
26
|
+
throw new ConfigException(String.format("jsonpath %s, %s", jsonPath, e.getMessage()));
|
27
|
+
}
|
28
|
+
PathToken pathToken = compiledPath.getRoot();
|
29
|
+
pathToken = pathToken.next(); // skip $
|
30
|
+
return ((PropertyPathToken) pathToken).getProperties().get(0);
|
31
|
+
}
|
32
|
+
|
33
|
+
public static void assertJsonPathFormat(String path)
|
34
|
+
{
|
35
|
+
Path compiledPath;
|
36
|
+
try {
|
37
|
+
compiledPath = PathCompiler.compile(path);
|
38
|
+
}
|
39
|
+
catch (InvalidPathException e) {
|
40
|
+
throw new ConfigException(String.format("jsonpath %s, %s", path, e.getMessage()));
|
41
|
+
}
|
42
|
+
PathToken pathToken = compiledPath.getRoot();
|
43
|
+
while (true) {
|
44
|
+
assertSupportedPathToken(pathToken, path);
|
45
|
+
if (pathToken.isLeaf()) {
|
46
|
+
break;
|
47
|
+
}
|
48
|
+
pathToken = pathToken.next();
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
protected static void assertSupportedPathToken(PathToken pathToken, String path)
|
53
|
+
{
|
54
|
+
if (pathToken instanceof ArrayPathToken) {
|
55
|
+
ArrayIndexOperation arrayIndexOperation = ((ArrayPathToken) pathToken).getArrayIndexOperation();
|
56
|
+
assertSupportedArrayPathToken(arrayIndexOperation, path);
|
57
|
+
}
|
58
|
+
else if (pathToken instanceof ScanPathToken) {
|
59
|
+
throw new ConfigException(String.format("scan path token is not supported \"%s\"", path));
|
60
|
+
}
|
61
|
+
else if (pathToken instanceof FunctionPathToken) {
|
62
|
+
throw new ConfigException(String.format("function path token is not supported \"%s\"", path));
|
63
|
+
}
|
64
|
+
else if (pathToken instanceof PredicatePathToken) {
|
65
|
+
throw new ConfigException(String.format("predicate path token is not supported \"%s\"", path));
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
protected static void assertSupportedArrayPathToken(ArrayIndexOperation arrayIndexOperation, String path)
|
70
|
+
{
|
71
|
+
if (arrayIndexOperation == null) {
|
72
|
+
throw new ConfigException(String.format("Array Slice Operation is not supported \"%s\"", path));
|
73
|
+
}
|
74
|
+
else if (!arrayIndexOperation.isSingleIndexOperation()) {
|
75
|
+
throw new ConfigException(String.format("Multi Array Indexes is not supported \"%s\"", path));
|
76
|
+
}
|
77
|
+
}
|
78
|
+
}
|
@@ -1,5 +1,10 @@
|
|
1
1
|
package org.embulk.filter.typecast;
|
2
2
|
|
3
|
+
import io.github.medjed.jsonpathcompiler.expressions.Path;
|
4
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ArrayPathToken;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
6
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathToken;
|
7
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PropertyPathToken;
|
3
8
|
import org.embulk.filter.typecast.TypecastFilterPlugin.ColumnConfig;
|
4
9
|
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
5
10
|
|
@@ -33,20 +38,33 @@ public class JsonVisitor
|
|
33
38
|
this.inputSchema = inputSchema;
|
34
39
|
this.outputSchema = outputSchema;
|
35
40
|
|
41
|
+
assertJsonPathFromat();
|
36
42
|
buildShouldVisitSet();
|
37
43
|
buildJsonPathTypeMap();
|
38
44
|
}
|
39
45
|
|
46
|
+
private void assertJsonPathFromat()
|
47
|
+
{
|
48
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
49
|
+
String name = columnConfig.getName();
|
50
|
+
if (! PathCompiler.isProbablyJsonPath(name)) {
|
51
|
+
continue;
|
52
|
+
}
|
53
|
+
JsonPathUtil.assertJsonPathFormat(name);
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
40
57
|
private void buildJsonPathTypeMap()
|
41
58
|
{
|
42
59
|
// json path => Type
|
43
60
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
44
61
|
String name = columnConfig.getName();
|
45
|
-
if (!
|
62
|
+
if (! PathCompiler.isProbablyJsonPath(name)) {
|
46
63
|
continue;
|
47
64
|
}
|
65
|
+
Path compiledPath = PathCompiler.compile(name);
|
48
66
|
Type type = columnConfig.getType();
|
49
|
-
this.jsonPathTypeMap.put(
|
67
|
+
this.jsonPathTypeMap.put(compiledPath.toString(), type);
|
50
68
|
}
|
51
69
|
}
|
52
70
|
|
@@ -55,26 +73,15 @@ public class JsonVisitor
|
|
55
73
|
// json partial path => Boolean to avoid unnecessary type: json visit
|
56
74
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
57
75
|
String name = columnConfig.getName();
|
58
|
-
if (!
|
76
|
+
if (! PathCompiler.isProbablyJsonPath(name)) {
|
59
77
|
continue;
|
60
78
|
}
|
61
|
-
|
79
|
+
PathToken parts = PathCompiler.compile(name).getRoot();
|
62
80
|
StringBuilder partialPath = new StringBuilder("$");
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
this.shouldVisitSet.add(partialPath.toString());
|
68
|
-
for (int j = 1; j < arrayParts.length; j++) {
|
69
|
-
// Supports both [0] and [*]
|
70
|
-
partialPath.append("[").append(arrayParts[j]);
|
71
|
-
this.shouldVisitSet.add(partialPath.toString());
|
72
|
-
}
|
73
|
-
}
|
74
|
-
else {
|
75
|
-
partialPath.append(".").append(parts[i]);
|
76
|
-
this.shouldVisitSet.add(partialPath.toString());
|
77
|
-
}
|
81
|
+
while (! parts.isLeaf()) {
|
82
|
+
parts = parts.next(); // first next() skips "$"
|
83
|
+
partialPath.append(parts.getPathFragment());
|
84
|
+
this.shouldVisitSet.add(partialPath.toString());
|
78
85
|
}
|
79
86
|
}
|
80
87
|
}
|
@@ -118,7 +125,8 @@ public class JsonVisitor
|
|
118
125
|
int size = arrayValue.size();
|
119
126
|
Value[] newValue = new Value[size];
|
120
127
|
for (int i = 0; i < size; i++) {
|
121
|
-
String
|
128
|
+
String pathFragment = ArrayPathToken.getPathFragment(i);
|
129
|
+
String k = new StringBuilder(rootPath).append(pathFragment).toString();
|
122
130
|
if (!shouldVisit(k)) {
|
123
131
|
k = new StringBuilder(rootPath).append("[*]").toString(); // try [*] too
|
124
132
|
}
|
@@ -135,7 +143,8 @@ public class JsonVisitor
|
|
135
143
|
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
136
144
|
Value k = entry.getKey();
|
137
145
|
Value v = entry.getValue();
|
138
|
-
String
|
146
|
+
String pathFragment = PropertyPathToken.getPathFragment(k.asStringValue().asString());
|
147
|
+
String newPath = new StringBuilder(rootPath).append(pathFragment).toString();
|
139
148
|
Value r = visit(newPath, v);
|
140
149
|
newValue[i++] = k;
|
141
150
|
newValue[i++] = r;
|
@@ -2,6 +2,7 @@ package org.embulk.filter.typecast;
|
|
2
2
|
|
3
3
|
import com.google.common.base.Optional;
|
4
4
|
import com.google.common.collect.ImmutableList;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
5
6
|
import org.embulk.config.Config;
|
6
7
|
import org.embulk.config.ConfigDefault;
|
7
8
|
import org.embulk.config.ConfigException;
|
@@ -91,10 +92,10 @@ public class TypecastFilterPlugin implements FilterPlugin
|
|
91
92
|
// throw if column does not exist
|
92
93
|
for (ColumnConfig columnConfig : columnConfigs) {
|
93
94
|
String name = columnConfig.getName();
|
94
|
-
if (
|
95
|
-
|
96
|
-
String
|
97
|
-
inputSchema.lookupColumn(
|
95
|
+
if (PathCompiler.isProbablyJsonPath(name)) {
|
96
|
+
// check only top level column name
|
97
|
+
String columnName = JsonPathUtil.getColumnName(name);
|
98
|
+
inputSchema.lookupColumn(columnName);
|
98
99
|
}
|
99
100
|
else {
|
100
101
|
inputSchema.lookupColumn(name);
|
@@ -103,7 +104,7 @@ public class TypecastFilterPlugin implements FilterPlugin
|
|
103
104
|
// throw if timestamp is specified in json path
|
104
105
|
for (ColumnConfig columnConfig : columnConfigs) {
|
105
106
|
String name = columnConfig.getName();
|
106
|
-
if (
|
107
|
+
if (PathCompiler.isProbablyJsonPath(name) && columnConfig.getType() instanceof TimestampType) {
|
107
108
|
throw new ConfigException(String.format("embulk-filter-typecast: timestamp type is not supported in json column: \"%s\"", name));
|
108
109
|
}
|
109
110
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-typecast
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- config/checkstyle/checkstyle.xml
|
55
55
|
- example/empty.yml
|
56
56
|
- example/example.csv
|
57
|
+
- example/example.yml
|
57
58
|
- example/from_string.txt
|
58
59
|
- example/from_string.yml
|
59
60
|
- example/jsoncast.json
|
@@ -70,6 +71,7 @@ files:
|
|
70
71
|
- src/main/java/org/embulk/filter/typecast/ColumnCaster.java
|
71
72
|
- src/main/java/org/embulk/filter/typecast/ColumnVisitorImpl.java
|
72
73
|
- src/main/java/org/embulk/filter/typecast/JsonCaster.java
|
74
|
+
- src/main/java/org/embulk/filter/typecast/JsonPathUtil.java
|
73
75
|
- src/main/java/org/embulk/filter/typecast/JsonVisitor.java
|
74
76
|
- src/main/java/org/embulk/filter/typecast/TypecastFilterPlugin.java
|
75
77
|
- src/main/java/org/embulk/filter/typecast/cast/BooleanCast.java
|
@@ -84,7 +86,13 @@ files:
|
|
84
86
|
- src/test/java/org/embulk/filter/typecast/cast/TestLongCast.java
|
85
87
|
- src/test/java/org/embulk/filter/typecast/cast/TestStringCast.java
|
86
88
|
- src/test/java/org/embulk/filter/typecast/cast/TestTimestampCast.java
|
87
|
-
- classpath/
|
89
|
+
- classpath/accessors-smart-1.1.jar
|
90
|
+
- classpath/asm-5.0.3.jar
|
91
|
+
- classpath/commons-lang3-3.4.jar
|
92
|
+
- classpath/embulk-filter-typecast-0.1.5.jar
|
93
|
+
- classpath/json-smart-2.2.1.jar
|
94
|
+
- classpath/JsonPathCompiler-0.1.1.jar
|
95
|
+
- classpath/slf4j-api-1.7.21.jar
|
88
96
|
homepage: https://github.com/sonots/embulk-filter-typecast
|
89
97
|
licenses:
|
90
98
|
- MIT
|