embulk-filter-typecast 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +11 -5
- data/build.gradle +2 -1
- data/example/example.yml +41 -0
- data/src/main/java/org/embulk/filter/typecast/ColumnCaster.java +10 -4
- data/src/main/java/org/embulk/filter/typecast/JsonPathUtil.java +78 -0
- data/src/main/java/org/embulk/filter/typecast/JsonVisitor.java +30 -21
- data/src/main/java/org/embulk/filter/typecast/TypecastFilterPlugin.java +6 -5
- metadata +11 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58d54323014090807f457eec3b84aef29060ddd2
|
4
|
+
data.tar.gz: eb7be6477a3a993ce83da5a3e404fa0e7fd3a50d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f19f899b820a7faccca9d261b8cc807ac230321b26b4ccde552e7308c6853bc01a24b7cefde15eef9061f1b06e55f0b1b31e8ff67f29ab83d4f07ac3feb488db
|
7
|
+
data.tar.gz: 479bce7de22d8ecefdd6ddde53cd602f44fefdab0394cb0b83df9eb1f67cf2aeb980aa87b24da62f72a3ce47192f13af5ba6b9c1006116fb1da0f64156313ad2
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -19,17 +19,23 @@ A filter plugin for Embulk to cast column type.
|
|
19
19
|
|
20
20
|
See [example.csv](./example/example.csv) and [example.yml](./example/example.yml).
|
21
21
|
|
22
|
-
## JSONPath
|
22
|
+
## JSONPath
|
23
23
|
|
24
24
|
For a `type: json` column, you can specify a [JSONPath](http://goessner.net/articles/JsonPath/) as the column's name, for example:
|
25
25
|
|
26
26
|
```
|
27
|
-
$.payload.key1
|
28
|
-
$.payload.array[0]
|
29
|
-
$.payload.array[*]
|
27
|
+
name: $.payload.key1
|
28
|
+
name: "$.payload.array[0]"
|
29
|
+
name: "$.payload.array[*]"
|
30
|
+
name: $['payload']['key1.key2']
|
30
31
|
```
|
31
32
|
|
32
|
-
|
33
|
+
The following JSONPath operators are not supported:
|
34
|
+
|
35
|
+
* Multiple properties such as `['name','name']`
|
36
|
+
* Multiple array indexes such as `[1,2]`
|
37
|
+
* Array slice such as `[1:2]`
|
38
|
+
* Filter expression such as `[?(<expression>)]`
|
33
39
|
|
34
40
|
## ToDo
|
35
41
|
|
data/build.gradle
CHANGED
@@ -13,13 +13,14 @@ configurations {
|
|
13
13
|
provided
|
14
14
|
}
|
15
15
|
|
16
|
-
version = "0.1.4"
|
16
|
+
version = "0.1.5"
|
17
17
|
sourceCompatibility = 1.7
|
18
18
|
targetCompatibility = 1.7
|
19
19
|
|
20
20
|
dependencies {
|
21
21
|
compile "org.embulk:embulk-core:0.8.+"
|
22
22
|
provided "org.embulk:embulk-core:0.8.+"
|
23
|
+
compile "io.github.medjed:JsonPathCompiler:0.1.1"
|
23
24
|
|
24
25
|
testCompile "junit:junit:4.+"
|
25
26
|
testCompile "org.embulk:embulk-core:0.8.+:tests"
|
data/example/example.yml
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: ''
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: timestamp, type: string}
|
13
|
+
- {name: "null", type: string}
|
14
|
+
- {name: long, type: string}
|
15
|
+
- {name: string, type: string}
|
16
|
+
- {name: double, type: string}
|
17
|
+
- {name: json1, type: string}
|
18
|
+
- {name: json2, type: string}
|
19
|
+
- {name: array_str, type: string}
|
20
|
+
- {name: array_int, type: string}
|
21
|
+
- {name: ignore, type: string}
|
22
|
+
- {name: boolean, type: string}
|
23
|
+
filters:
|
24
|
+
- type: typecast
|
25
|
+
columns:
|
26
|
+
- {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
|
27
|
+
- {name: "null", type: long}
|
28
|
+
- {name: long, type: long}
|
29
|
+
- {name: string, type: string}
|
30
|
+
- {name: double, type: double}
|
31
|
+
- {name: json1, type: json}
|
32
|
+
- {name: json2, type: json}
|
33
|
+
- {name: array_str, type: json}
|
34
|
+
- {name: array_int, type: json}
|
35
|
+
- {name: boolean, type: boolean}
|
36
|
+
- {name: "$.json1.string", type: long}
|
37
|
+
- {name: "$.json2.long", type: long}
|
38
|
+
- {name: "$.array_str[0]", type: long}
|
39
|
+
- {name: "$.array_int[*]", type: long}
|
40
|
+
out:
|
41
|
+
type: "null"
|
@@ -1,5 +1,9 @@
|
|
1
1
|
package org.embulk.filter.typecast;
|
2
2
|
|
3
|
+
import io.github.medjed.jsonpathcompiler.expressions.Path;
|
4
|
+
import io.github.medjed.jsonpathcompiler.expressions.Utils;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
6
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PropertyPathToken;
|
3
7
|
import org.embulk.filter.typecast.TypecastFilterPlugin.ColumnConfig;
|
4
8
|
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
5
9
|
|
@@ -61,7 +65,7 @@ class ColumnCaster
|
|
61
65
|
{
|
62
66
|
// columnName => TimestampParser
|
63
67
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
64
|
-
if (columnConfig.getName()
|
68
|
+
if (PathCompiler.isProbablyJsonPath(columnConfig.getName())) {
|
65
69
|
continue; // type: json columns do not support type: timestamp
|
66
70
|
}
|
67
71
|
Column inputColumn = inputSchema.lookupColumn(columnConfig.getName());
|
@@ -76,7 +80,7 @@ class ColumnCaster
|
|
76
80
|
{
|
77
81
|
// columnName => TimestampFormatter
|
78
82
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
79
|
-
if (columnConfig.getName()
|
83
|
+
if (PathCompiler.isProbablyJsonPath(columnConfig.getName())) {
|
80
84
|
continue; // type: json columns do not have type: timestamp
|
81
85
|
}
|
82
86
|
Column inputColumn = inputSchema.lookupColumn(columnConfig.getName());
|
@@ -200,7 +204,8 @@ class ColumnCaster
|
|
200
204
|
}
|
201
205
|
else if (outputType instanceof JsonType) {
|
202
206
|
Value jsonValue = StringCast.asJson(value);
|
203
|
-
String
|
207
|
+
String name = outputColumn.getName();
|
208
|
+
String jsonPath = new StringBuilder("$").append(PropertyPathToken.getPathFragment(name)).toString();
|
204
209
|
Value castedValue = jsonVisitor.visit(jsonPath, jsonValue);
|
205
210
|
pageBuilder.setJson(outputColumn, castedValue);
|
206
211
|
}
|
@@ -238,7 +243,8 @@ class ColumnCaster
|
|
238
243
|
|
239
244
|
public void setFromJson(Column outputColumn, Value value)
|
240
245
|
{
|
241
|
-
String
|
246
|
+
String name = outputColumn.getName();
|
247
|
+
String jsonPath = new StringBuilder("$").append(PropertyPathToken.getPathFragment(name)).toString();
|
242
248
|
Value castedValue = jsonVisitor.visit(jsonPath, value);
|
243
249
|
Type outputType = outputColumn.getType();
|
244
250
|
if (outputType instanceof BooleanType) {
|
@@ -0,0 +1,78 @@
|
|
1
|
+
package org.embulk.filter.typecast;
|
2
|
+
|
3
|
+
import io.github.medjed.jsonpathcompiler.InvalidPathException;
|
4
|
+
import io.github.medjed.jsonpathcompiler.expressions.Path;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ArrayIndexOperation;
|
6
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ArrayPathToken;
|
7
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.FunctionPathToken;
|
8
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
9
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathToken;
|
10
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PredicatePathToken;
|
11
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PropertyPathToken;
|
12
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ScanPathToken;
|
13
|
+
import org.embulk.config.ConfigException;
|
14
|
+
|
15
|
+
public class JsonPathUtil
|
16
|
+
{
|
17
|
+
private JsonPathUtil() {}
|
18
|
+
|
19
|
+
public static String getColumnName(String jsonPath)
|
20
|
+
{
|
21
|
+
Path compiledPath;
|
22
|
+
try {
|
23
|
+
compiledPath = PathCompiler.compile(jsonPath);
|
24
|
+
}
|
25
|
+
catch (InvalidPathException e) {
|
26
|
+
throw new ConfigException(String.format("jsonpath %s, %s", jsonPath, e.getMessage()));
|
27
|
+
}
|
28
|
+
PathToken pathToken = compiledPath.getRoot();
|
29
|
+
pathToken = pathToken.next(); // skip $
|
30
|
+
return ((PropertyPathToken) pathToken).getProperties().get(0);
|
31
|
+
}
|
32
|
+
|
33
|
+
public static void assertJsonPathFormat(String path)
|
34
|
+
{
|
35
|
+
Path compiledPath;
|
36
|
+
try {
|
37
|
+
compiledPath = PathCompiler.compile(path);
|
38
|
+
}
|
39
|
+
catch (InvalidPathException e) {
|
40
|
+
throw new ConfigException(String.format("jsonpath %s, %s", path, e.getMessage()));
|
41
|
+
}
|
42
|
+
PathToken pathToken = compiledPath.getRoot();
|
43
|
+
while (true) {
|
44
|
+
assertSupportedPathToken(pathToken, path);
|
45
|
+
if (pathToken.isLeaf()) {
|
46
|
+
break;
|
47
|
+
}
|
48
|
+
pathToken = pathToken.next();
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
protected static void assertSupportedPathToken(PathToken pathToken, String path)
|
53
|
+
{
|
54
|
+
if (pathToken instanceof ArrayPathToken) {
|
55
|
+
ArrayIndexOperation arrayIndexOperation = ((ArrayPathToken) pathToken).getArrayIndexOperation();
|
56
|
+
assertSupportedArrayPathToken(arrayIndexOperation, path);
|
57
|
+
}
|
58
|
+
else if (pathToken instanceof ScanPathToken) {
|
59
|
+
throw new ConfigException(String.format("scan path token is not supported \"%s\"", path));
|
60
|
+
}
|
61
|
+
else if (pathToken instanceof FunctionPathToken) {
|
62
|
+
throw new ConfigException(String.format("function path token is not supported \"%s\"", path));
|
63
|
+
}
|
64
|
+
else if (pathToken instanceof PredicatePathToken) {
|
65
|
+
throw new ConfigException(String.format("predicate path token is not supported \"%s\"", path));
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
protected static void assertSupportedArrayPathToken(ArrayIndexOperation arrayIndexOperation, String path)
|
70
|
+
{
|
71
|
+
if (arrayIndexOperation == null) {
|
72
|
+
throw new ConfigException(String.format("Array Slice Operation is not supported \"%s\"", path));
|
73
|
+
}
|
74
|
+
else if (!arrayIndexOperation.isSingleIndexOperation()) {
|
75
|
+
throw new ConfigException(String.format("Multi Array Indexes is not supported \"%s\"", path));
|
76
|
+
}
|
77
|
+
}
|
78
|
+
}
|
@@ -1,5 +1,10 @@
|
|
1
1
|
package org.embulk.filter.typecast;
|
2
2
|
|
3
|
+
import io.github.medjed.jsonpathcompiler.expressions.Path;
|
4
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.ArrayPathToken;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
6
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathToken;
|
7
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PropertyPathToken;
|
3
8
|
import org.embulk.filter.typecast.TypecastFilterPlugin.ColumnConfig;
|
4
9
|
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
5
10
|
|
@@ -33,20 +38,33 @@ public class JsonVisitor
|
|
33
38
|
this.inputSchema = inputSchema;
|
34
39
|
this.outputSchema = outputSchema;
|
35
40
|
|
41
|
+
assertJsonPathFromat();
|
36
42
|
buildShouldVisitSet();
|
37
43
|
buildJsonPathTypeMap();
|
38
44
|
}
|
39
45
|
|
46
|
+
private void assertJsonPathFromat()
|
47
|
+
{
|
48
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
49
|
+
String name = columnConfig.getName();
|
50
|
+
if (! PathCompiler.isProbablyJsonPath(name)) {
|
51
|
+
continue;
|
52
|
+
}
|
53
|
+
JsonPathUtil.assertJsonPathFormat(name);
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
40
57
|
private void buildJsonPathTypeMap()
|
41
58
|
{
|
42
59
|
// json path => Type
|
43
60
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
44
61
|
String name = columnConfig.getName();
|
45
|
-
if (!
|
62
|
+
if (! PathCompiler.isProbablyJsonPath(name)) {
|
46
63
|
continue;
|
47
64
|
}
|
65
|
+
Path compiledPath = PathCompiler.compile(name);
|
48
66
|
Type type = columnConfig.getType();
|
49
|
-
this.jsonPathTypeMap.put(
|
67
|
+
this.jsonPathTypeMap.put(compiledPath.toString(), type);
|
50
68
|
}
|
51
69
|
}
|
52
70
|
|
@@ -55,26 +73,15 @@ public class JsonVisitor
|
|
55
73
|
// json partial path => Boolean to avoid unnecessary type: json visit
|
56
74
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
57
75
|
String name = columnConfig.getName();
|
58
|
-
if (!
|
76
|
+
if (! PathCompiler.isProbablyJsonPath(name)) {
|
59
77
|
continue;
|
60
78
|
}
|
61
|
-
|
79
|
+
PathToken parts = PathCompiler.compile(name).getRoot();
|
62
80
|
StringBuilder partialPath = new StringBuilder("$");
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
this.shouldVisitSet.add(partialPath.toString());
|
68
|
-
for (int j = 1; j < arrayParts.length; j++) {
|
69
|
-
// Supports both [0] and [*]
|
70
|
-
partialPath.append("[").append(arrayParts[j]);
|
71
|
-
this.shouldVisitSet.add(partialPath.toString());
|
72
|
-
}
|
73
|
-
}
|
74
|
-
else {
|
75
|
-
partialPath.append(".").append(parts[i]);
|
76
|
-
this.shouldVisitSet.add(partialPath.toString());
|
77
|
-
}
|
81
|
+
while (! parts.isLeaf()) {
|
82
|
+
parts = parts.next(); // first next() skips "$"
|
83
|
+
partialPath.append(parts.getPathFragment());
|
84
|
+
this.shouldVisitSet.add(partialPath.toString());
|
78
85
|
}
|
79
86
|
}
|
80
87
|
}
|
@@ -118,7 +125,8 @@ public class JsonVisitor
|
|
118
125
|
int size = arrayValue.size();
|
119
126
|
Value[] newValue = new Value[size];
|
120
127
|
for (int i = 0; i < size; i++) {
|
121
|
-
String
|
128
|
+
String pathFragment = ArrayPathToken.getPathFragment(i);
|
129
|
+
String k = new StringBuilder(rootPath).append(pathFragment).toString();
|
122
130
|
if (!shouldVisit(k)) {
|
123
131
|
k = new StringBuilder(rootPath).append("[*]").toString(); // try [*] too
|
124
132
|
}
|
@@ -135,7 +143,8 @@ public class JsonVisitor
|
|
135
143
|
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
136
144
|
Value k = entry.getKey();
|
137
145
|
Value v = entry.getValue();
|
138
|
-
String
|
146
|
+
String pathFragment = PropertyPathToken.getPathFragment(k.asStringValue().asString());
|
147
|
+
String newPath = new StringBuilder(rootPath).append(pathFragment).toString();
|
139
148
|
Value r = visit(newPath, v);
|
140
149
|
newValue[i++] = k;
|
141
150
|
newValue[i++] = r;
|
@@ -2,6 +2,7 @@ package org.embulk.filter.typecast;
|
|
2
2
|
|
3
3
|
import com.google.common.base.Optional;
|
4
4
|
import com.google.common.collect.ImmutableList;
|
5
|
+
import io.github.medjed.jsonpathcompiler.expressions.path.PathCompiler;
|
5
6
|
import org.embulk.config.Config;
|
6
7
|
import org.embulk.config.ConfigDefault;
|
7
8
|
import org.embulk.config.ConfigException;
|
@@ -91,10 +92,10 @@ public class TypecastFilterPlugin implements FilterPlugin
|
|
91
92
|
// throw if column does not exist
|
92
93
|
for (ColumnConfig columnConfig : columnConfigs) {
|
93
94
|
String name = columnConfig.getName();
|
94
|
-
if (
|
95
|
-
|
96
|
-
String
|
97
|
-
inputSchema.lookupColumn(
|
95
|
+
if (PathCompiler.isProbablyJsonPath(name)) {
|
96
|
+
// check only top level column name
|
97
|
+
String columnName = JsonPathUtil.getColumnName(name);
|
98
|
+
inputSchema.lookupColumn(columnName);
|
98
99
|
}
|
99
100
|
else {
|
100
101
|
inputSchema.lookupColumn(name);
|
@@ -103,7 +104,7 @@ public class TypecastFilterPlugin implements FilterPlugin
|
|
103
104
|
// throw if timestamp is specified in json path
|
104
105
|
for (ColumnConfig columnConfig : columnConfigs) {
|
105
106
|
String name = columnConfig.getName();
|
106
|
-
if (
|
107
|
+
if (PathCompiler.isProbablyJsonPath(name) && columnConfig.getType() instanceof TimestampType) {
|
107
108
|
throw new ConfigException(String.format("embulk-filter-typecast: timestamp type is not supported in json column: \"%s\"", name));
|
108
109
|
}
|
109
110
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-typecast
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -54,6 +54,7 @@ files:
|
|
54
54
|
- config/checkstyle/checkstyle.xml
|
55
55
|
- example/empty.yml
|
56
56
|
- example/example.csv
|
57
|
+
- example/example.yml
|
57
58
|
- example/from_string.txt
|
58
59
|
- example/from_string.yml
|
59
60
|
- example/jsoncast.json
|
@@ -70,6 +71,7 @@ files:
|
|
70
71
|
- src/main/java/org/embulk/filter/typecast/ColumnCaster.java
|
71
72
|
- src/main/java/org/embulk/filter/typecast/ColumnVisitorImpl.java
|
72
73
|
- src/main/java/org/embulk/filter/typecast/JsonCaster.java
|
74
|
+
- src/main/java/org/embulk/filter/typecast/JsonPathUtil.java
|
73
75
|
- src/main/java/org/embulk/filter/typecast/JsonVisitor.java
|
74
76
|
- src/main/java/org/embulk/filter/typecast/TypecastFilterPlugin.java
|
75
77
|
- src/main/java/org/embulk/filter/typecast/cast/BooleanCast.java
|
@@ -84,7 +86,13 @@ files:
|
|
84
86
|
- src/test/java/org/embulk/filter/typecast/cast/TestLongCast.java
|
85
87
|
- src/test/java/org/embulk/filter/typecast/cast/TestStringCast.java
|
86
88
|
- src/test/java/org/embulk/filter/typecast/cast/TestTimestampCast.java
|
87
|
-
- classpath/embulk-filter-typecast-0.1.4.jar
|
89
|
+
- classpath/accessors-smart-1.1.jar
|
90
|
+
- classpath/asm-5.0.3.jar
|
91
|
+
- classpath/commons-lang3-3.4.jar
|
92
|
+
- classpath/embulk-filter-typecast-0.1.5.jar
|
93
|
+
- classpath/json-smart-2.2.1.jar
|
94
|
+
- classpath/JsonPathCompiler-0.1.1.jar
|
95
|
+
- classpath/slf4j-api-1.7.21.jar
|
88
96
|
homepage: https://github.com/sonots/embulk-filter-typecast
|
89
97
|
licenses:
|
90
98
|
- MIT
|