embulk-filter-typecast 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -1
- data/CHANGELOG.md +6 -0
- data/build.gradle +1 -1
- data/example/example.csv +11 -0
- data/example/example2.yml +6 -2
- data/src/main/java/org/embulk/filter/typecast/{TypecastPageBuilder.java → ColumnCaster.java} +47 -14
- data/src/main/java/org/embulk/filter/typecast/ColumnVisitorImpl.java +36 -133
- data/src/main/java/org/embulk/filter/typecast/{TypecastJsonBuilder.java → JsonCaster.java} +31 -50
- data/src/main/java/org/embulk/filter/typecast/JsonVisitor.java +136 -0
- data/src/main/java/org/embulk/filter/typecast/TypecastFilterPlugin.java +11 -1
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8a2482d0bd6fc6109bb8763f9a1f9db6b0733db
|
4
|
+
data.tar.gz: b80c4d01e823e3a4b59594002277fc78e1ee0ef9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7473b57b158d8936e14015358dcd21d3bfe852f0b7c33de10da4ad38aa4553b3ddf39e5b41c9ed11df32713c42a33506aba22820b2236590ab124035a1056783
|
7
|
+
data.tar.gz: 354bf8ab52c2ee5ba124af046abea9e311213c4ebaa03991f432504cc87094a0e1d4815c3c94434ee843de836f6e2146331540fd23b286071c000011bb64c55a
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
data/build.gradle
CHANGED
data/example/example.csv
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
timestamp,null,long,string,double,json1,json2,boolean
|
2
|
+
2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,{"string":"0"},{"long":0},true
|
3
|
+
2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,{"string":"1"},{"long":1},true
|
4
|
+
2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,{"string":"2"},{"long":2},true
|
5
|
+
2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,{"string":"3"},{"long":3},true
|
6
|
+
2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,{"string":"4"},{"long":4},true
|
7
|
+
2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,{"string":"5"},{"long":5},false
|
8
|
+
2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,{"string":"6"},{"long":6},false
|
9
|
+
2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,{"string":"7"},{"long":7},false
|
10
|
+
2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,{"string":"8"},{"long":8},false
|
11
|
+
2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,{"string":"9"},{"long":9},false
|
data/example/example2.yml
CHANGED
@@ -14,7 +14,8 @@ in:
|
|
14
14
|
- {name: long, type: string}
|
15
15
|
- {name: string, type: string}
|
16
16
|
- {name: double, type: string}
|
17
|
-
- {name:
|
17
|
+
- {name: json1, type: string}
|
18
|
+
- {name: json2, type: string}
|
18
19
|
- {name: boolean, type: boolean}
|
19
20
|
filters:
|
20
21
|
- type: typecast
|
@@ -24,7 +25,10 @@ filters:
|
|
24
25
|
- {name: long, type: long}
|
25
26
|
- {name: string, type: string}
|
26
27
|
- {name: double, type: double}
|
27
|
-
- {name:
|
28
|
+
- {name: json1, type: json}
|
29
|
+
- {name: json2, type: json}
|
28
30
|
- {name: boolean, type: boolean}
|
31
|
+
- {name: "$.json1.string", type: long}
|
32
|
+
- {name: "$.json2.long", type: long}
|
29
33
|
out:
|
30
34
|
type: "null"
|
data/src/main/java/org/embulk/filter/typecast/{TypecastPageBuilder.java → ColumnCaster.java}
RENAMED
@@ -2,17 +2,45 @@ package org.embulk.filter.typecast;
|
|
2
2
|
|
3
3
|
import org.embulk.filter.typecast.cast.*;
|
4
4
|
|
5
|
+
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
6
|
+
|
5
7
|
import org.embulk.spi.Column;
|
6
8
|
import org.embulk.spi.DataException;
|
9
|
+
import org.embulk.spi.Exec;
|
7
10
|
import org.embulk.spi.PageBuilder;
|
11
|
+
import org.embulk.spi.PageReader;
|
12
|
+
import org.embulk.spi.Schema;
|
8
13
|
import org.embulk.spi.time.Timestamp;
|
9
14
|
import org.embulk.spi.time.TimestampFormatter;
|
10
15
|
import org.embulk.spi.time.TimestampParser;
|
11
16
|
import org.embulk.spi.type.*;
|
12
17
|
import org.msgpack.value.Value;
|
13
18
|
|
14
|
-
|
15
|
-
|
19
|
+
import org.slf4j.Logger;
|
20
|
+
|
21
|
+
|
22
|
+
class ColumnCaster
|
23
|
+
{
|
24
|
+
private static final Logger logger = Exec.getLogger(TypecastFilterPlugin.class);
|
25
|
+
private final PluginTask task;
|
26
|
+
private final Schema inputSchema;
|
27
|
+
private final Schema outputSchema;
|
28
|
+
private final PageReader pageReader;
|
29
|
+
private final PageBuilder pageBuilder;
|
30
|
+
private final JsonVisitor jsonVisitor;
|
31
|
+
|
32
|
+
ColumnCaster(TypecastFilterPlugin.PluginTask task, Schema inputSchema, Schema outputSchema,
|
33
|
+
PageReader pageReader, PageBuilder pageBuilder)
|
34
|
+
{
|
35
|
+
this.task = task;
|
36
|
+
this.inputSchema = inputSchema;
|
37
|
+
this.outputSchema = outputSchema;
|
38
|
+
this.pageReader = pageReader;
|
39
|
+
this.pageBuilder = pageBuilder;
|
40
|
+
this.jsonVisitor = new JsonVisitor(task, inputSchema, outputSchema);
|
41
|
+
}
|
42
|
+
|
43
|
+
public void setFromBoolean(Column outputColumn, boolean value) {
|
16
44
|
Type outputType = outputColumn.getType();
|
17
45
|
if (outputType instanceof BooleanType) {
|
18
46
|
pageBuilder.setBoolean(outputColumn, BooleanCast.asBoolean(value));
|
@@ -31,7 +59,7 @@ class TypecastPageBuilder {
|
|
31
59
|
}
|
32
60
|
}
|
33
61
|
|
34
|
-
|
62
|
+
public void setFromLong(Column outputColumn, long value)
|
35
63
|
{
|
36
64
|
Type outputType = outputColumn.getType();
|
37
65
|
if (outputType instanceof BooleanType) {
|
@@ -51,7 +79,7 @@ class TypecastPageBuilder {
|
|
51
79
|
}
|
52
80
|
}
|
53
81
|
|
54
|
-
|
82
|
+
public void setFromDouble(Column outputColumn, double value)
|
55
83
|
{
|
56
84
|
try {
|
57
85
|
Type outputType = outputColumn.getType();
|
@@ -76,7 +104,7 @@ class TypecastPageBuilder {
|
|
76
104
|
}
|
77
105
|
}
|
78
106
|
|
79
|
-
|
107
|
+
public void setFromString(Column outputColumn, String value, TimestampParser timestampParser)
|
80
108
|
{
|
81
109
|
Type outputType = outputColumn.getType();
|
82
110
|
if (outputType instanceof BooleanType) {
|
@@ -90,13 +118,16 @@ class TypecastPageBuilder {
|
|
90
118
|
} else if (outputType instanceof TimestampType) {
|
91
119
|
pageBuilder.setTimestamp(outputColumn, StringCast.asTimestamp(value, timestampParser));
|
92
120
|
} else if (outputType instanceof JsonType) {
|
93
|
-
|
121
|
+
Value jsonValue = StringCast.asJson(value);
|
122
|
+
String jsonPath = new StringBuilder("$.").append(outputColumn.getName()).toString();
|
123
|
+
Value castedValue = jsonVisitor.visit(jsonPath, jsonValue);
|
124
|
+
pageBuilder.setJson(outputColumn, castedValue);
|
94
125
|
} else {
|
95
126
|
assert(false);
|
96
127
|
}
|
97
128
|
}
|
98
129
|
|
99
|
-
|
130
|
+
public void setFromTimestamp(Column outputColumn, Timestamp value, TimestampFormatter timestampFormatter)
|
100
131
|
{
|
101
132
|
Type outputType = outputColumn.getType();
|
102
133
|
if (outputType instanceof BooleanType) {
|
@@ -116,21 +147,23 @@ class TypecastPageBuilder {
|
|
116
147
|
}
|
117
148
|
}
|
118
149
|
|
119
|
-
|
150
|
+
public void setFromJson(Column outputColumn, Value value)
|
120
151
|
{
|
152
|
+
String jsonPath = new StringBuilder("$.").append(outputColumn.getName()).toString();
|
153
|
+
Value castedValue = jsonVisitor.visit(jsonPath, value);
|
121
154
|
Type outputType = outputColumn.getType();
|
122
155
|
if (outputType instanceof BooleanType) {
|
123
|
-
pageBuilder.setBoolean(outputColumn, JsonCast.asBoolean(
|
156
|
+
pageBuilder.setBoolean(outputColumn, JsonCast.asBoolean(castedValue));
|
124
157
|
} else if (outputType instanceof LongType) {
|
125
|
-
pageBuilder.setLong(outputColumn, JsonCast.asLong(
|
158
|
+
pageBuilder.setLong(outputColumn, JsonCast.asLong(castedValue));
|
126
159
|
} else if (outputType instanceof DoubleType) {
|
127
|
-
pageBuilder.setDouble(outputColumn, JsonCast.asDouble(
|
160
|
+
pageBuilder.setDouble(outputColumn, JsonCast.asDouble(castedValue));
|
128
161
|
} else if (outputType instanceof StringType) {
|
129
|
-
pageBuilder.setString(outputColumn, JsonCast.asString(
|
162
|
+
pageBuilder.setString(outputColumn, JsonCast.asString(castedValue));
|
130
163
|
} else if (outputType instanceof TimestampType) {
|
131
|
-
pageBuilder.setTimestamp(outputColumn, JsonCast.asTimestamp(
|
164
|
+
pageBuilder.setTimestamp(outputColumn, JsonCast.asTimestamp(castedValue));
|
132
165
|
} else if (outputType instanceof JsonType) {
|
133
|
-
pageBuilder.setJson(outputColumn, JsonCast.asJson(
|
166
|
+
pageBuilder.setJson(outputColumn, JsonCast.asJson(castedValue));
|
134
167
|
} else {
|
135
168
|
assert(false);
|
136
169
|
}
|
@@ -1,11 +1,8 @@
|
|
1
1
|
package org.embulk.filter.typecast;
|
2
2
|
|
3
3
|
import org.embulk.spi.*;
|
4
|
-
import org.embulk.spi.type.
|
5
|
-
import org.
|
6
|
-
import org.msgpack.value.MapValue;
|
7
|
-
import org.msgpack.value.Value;
|
8
|
-
import org.msgpack.value.ValueFactory;
|
4
|
+
import org.embulk.spi.type.StringType;
|
5
|
+
import org.embulk.spi.type.TimestampType;
|
9
6
|
|
10
7
|
import org.embulk.filter.typecast.TypecastFilterPlugin.ColumnConfig;
|
11
8
|
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
@@ -16,8 +13,6 @@ import org.joda.time.DateTimeZone;
|
|
16
13
|
import org.slf4j.Logger;
|
17
14
|
|
18
15
|
import java.util.HashMap;
|
19
|
-
import java.util.HashSet;
|
20
|
-
import java.util.Map;
|
21
16
|
|
22
17
|
public class ColumnVisitorImpl
|
23
18
|
implements ColumnVisitor
|
@@ -31,11 +26,10 @@ public class ColumnVisitorImpl
|
|
31
26
|
private final HashMap<String, Column> outputColumnMap = new HashMap<>();
|
32
27
|
private final HashMap<String, TimestampParser> timestampParserMap = new HashMap<>();
|
33
28
|
private final HashMap<String, TimestampFormatter> timestampFormatterMap = new HashMap<>();
|
34
|
-
private final
|
35
|
-
private final HashMap<String, Type> jsonPathTypeMap = new HashMap<>();
|
29
|
+
private final ColumnCaster columnCaster;
|
36
30
|
|
37
31
|
ColumnVisitorImpl(PluginTask task, Schema inputSchema, Schema outputSchema,
|
38
|
-
|
32
|
+
PageReader pageReader, PageBuilder pageBuilder)
|
39
33
|
{
|
40
34
|
this.task = task;
|
41
35
|
this.inputSchema = inputSchema;
|
@@ -43,11 +37,11 @@ public class ColumnVisitorImpl
|
|
43
37
|
this.pageReader = pageReader;
|
44
38
|
this.pageBuilder = pageBuilder;
|
45
39
|
|
40
|
+
this.columnCaster = new ColumnCaster(task, inputSchema, outputSchema, pageReader, pageBuilder);
|
41
|
+
|
46
42
|
buildOutputColumnMap();
|
47
43
|
buildTimestampParserMap();
|
48
44
|
buildTimestampFormatterMap();
|
49
|
-
buildShouldVisitJsonPathSet();;
|
50
|
-
buildJsonPathTypeMap();
|
51
45
|
}
|
52
46
|
|
53
47
|
private void buildOutputColumnMap()
|
@@ -60,132 +54,46 @@ public class ColumnVisitorImpl
|
|
60
54
|
|
61
55
|
private void buildTimestampParserMap()
|
62
56
|
{
|
63
|
-
// columnName
|
57
|
+
// columnName => TimestampParser
|
64
58
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
65
|
-
|
66
|
-
|
67
|
-
}
|
68
|
-
}
|
69
|
-
|
70
|
-
private TimestampParser getTimestampParser(ColumnConfig columnConfig, PluginTask task)
|
71
|
-
{
|
72
|
-
DateTimeZone timezone = columnConfig.getTimeZone().or(task.getDefaultTimeZone());
|
73
|
-
String format = columnConfig.getFormat().or(task.getDefaultTimestampFormat());
|
74
|
-
return new TimestampParser(task.getJRuby(), format, timezone);
|
75
|
-
}
|
76
|
-
|
77
|
-
private void buildTimestampFormatterMap()
|
78
|
-
{
|
79
|
-
// columnName or jsonPath => TimestampFormatter
|
80
|
-
for (ColumnConfig columnConfig : task.getColumns()) {
|
81
|
-
TimestampFormatter parser = getTimestampFormatter(columnConfig, task);
|
82
|
-
this.timestampFormatterMap.put(columnConfig.getName(), parser);
|
83
|
-
}
|
84
|
-
}
|
85
|
-
|
86
|
-
private TimestampFormatter getTimestampFormatter(ColumnConfig columnConfig, PluginTask task)
|
87
|
-
{
|
88
|
-
String format = columnConfig.getFormat().or(task.getDefaultTimestampFormat());
|
89
|
-
DateTimeZone timezone = columnConfig.getTimeZone().or(task.getDefaultTimeZone());
|
90
|
-
return new TimestampFormatter(task.getJRuby(), format, timezone);
|
91
|
-
}
|
92
|
-
|
93
|
-
private void buildShouldVisitJsonPathSet()
|
94
|
-
{
|
95
|
-
// json partial path => Boolean to avoid unnecessary type: json visit
|
96
|
-
for (ColumnConfig columnConfig : task.getColumns()) {
|
97
|
-
String name = columnConfig.getName();
|
98
|
-
if (!name.startsWith("$.")) {
|
99
|
-
continue;
|
59
|
+
if (columnConfig.getName().startsWith("$.")) {
|
60
|
+
continue; // type: json columns do not support type: timestamp
|
100
61
|
}
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
String[] arrayParts = parts[i].split("\\[");
|
106
|
-
partialPath.append(".").append(arrayParts[0]);
|
107
|
-
this.shouldVisitJsonPathSet.add(partialPath.toString());
|
108
|
-
for (int j = 1; j < arrayParts.length; j++) {
|
109
|
-
partialPath.append("[").append(arrayParts[j]);
|
110
|
-
this.shouldVisitJsonPathSet.add(partialPath.toString());
|
111
|
-
}
|
112
|
-
}
|
113
|
-
else {
|
114
|
-
partialPath.append(".").append(parts[i]);
|
115
|
-
this.shouldVisitJsonPathSet.add(partialPath.toString());
|
116
|
-
}
|
62
|
+
Column inputColumn = inputSchema.lookupColumn(columnConfig.getName());
|
63
|
+
if (inputColumn.getType() instanceof StringType && columnConfig.getType() instanceof TimestampType) {
|
64
|
+
TimestampParser parser = getTimestampParser(columnConfig, task);
|
65
|
+
this.timestampParserMap.put(columnConfig.getName(), parser);
|
117
66
|
}
|
118
67
|
}
|
119
68
|
}
|
120
69
|
|
121
|
-
private void
|
70
|
+
private void buildTimestampFormatterMap()
|
122
71
|
{
|
123
|
-
//
|
72
|
+
// columnName => TimestampFormatter
|
124
73
|
for (ColumnConfig columnConfig : task.getColumns()) {
|
125
|
-
|
126
|
-
|
127
|
-
|
74
|
+
if (columnConfig.getName().startsWith("$.")) {
|
75
|
+
continue; // type: json columns do not have type: timestamp
|
76
|
+
}
|
77
|
+
Column inputColumn = inputSchema.lookupColumn(columnConfig.getName());
|
78
|
+
if (inputColumn.getType() instanceof TimestampType && columnConfig.getType() instanceof StringType) {
|
79
|
+
TimestampFormatter parser = getTimestampFormatter(columnConfig, task);
|
80
|
+
this.timestampFormatterMap.put(columnConfig.getName(), parser);
|
128
81
|
}
|
129
|
-
Type type = columnConfig.getType();
|
130
|
-
this.jsonPathTypeMap.put(name, type);
|
131
82
|
}
|
132
83
|
}
|
133
84
|
|
134
|
-
private
|
85
|
+
private TimestampParser getTimestampParser(ColumnConfig columnConfig, PluginTask task)
|
135
86
|
{
|
136
|
-
|
87
|
+
DateTimeZone timezone = columnConfig.getTimeZone().or(task.getDefaultTimeZone());
|
88
|
+
String format = columnConfig.getFormat().or(task.getDefaultTimestampFormat());
|
89
|
+
return new TimestampParser(task.getJRuby(), format, timezone);
|
137
90
|
}
|
138
91
|
|
139
|
-
private
|
92
|
+
private TimestampFormatter getTimestampFormatter(ColumnConfig columnConfig, PluginTask task)
|
140
93
|
{
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
if (value.isArrayValue()) {
|
145
|
-
ArrayValue arrayValue = value.asArrayValue();
|
146
|
-
int size = arrayValue.size();
|
147
|
-
Value[] newValue = new Value[size];
|
148
|
-
for (int i = 0; i < size; i++) {
|
149
|
-
String k = new StringBuilder(jsonPath).append("[").append(Integer.toString(i)).append("]").toString();
|
150
|
-
Value v = arrayValue.get(i);
|
151
|
-
newValue[i] = castJsonRecursively(task, k, v);
|
152
|
-
}
|
153
|
-
return ValueFactory.newArray(newValue, true);
|
154
|
-
}
|
155
|
-
else if (value.isMapValue()) {
|
156
|
-
MapValue mapValue = value.asMapValue();
|
157
|
-
int size = mapValue.size() * 2;
|
158
|
-
Value[] newValue = new Value[size];
|
159
|
-
int i = 0;
|
160
|
-
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
161
|
-
Value k = entry.getKey();
|
162
|
-
Value v = entry.getValue();
|
163
|
-
String newPath = new StringBuilder(jsonPath).append(".").append(k.asStringValue().asString()).toString();
|
164
|
-
Value r = castJsonRecursively(task, newPath, v);
|
165
|
-
newValue[i++] = k;
|
166
|
-
newValue[i++] = r;
|
167
|
-
}
|
168
|
-
return ValueFactory.newMap(newValue, true);
|
169
|
-
}
|
170
|
-
else if (value.isBooleanValue()) {
|
171
|
-
Type outputType = jsonPathTypeMap.get(jsonPath);
|
172
|
-
return TypecastJsonBuilder.getFromBoolean(outputType, value.asBooleanValue().getBoolean());
|
173
|
-
}
|
174
|
-
else if (value.isIntegerValue()) {
|
175
|
-
Type outputType = jsonPathTypeMap.get(jsonPath);
|
176
|
-
return TypecastJsonBuilder.getFromLong(outputType, value.asIntegerValue().asLong());
|
177
|
-
}
|
178
|
-
else if (value.isFloatValue()) {
|
179
|
-
Type outputType = jsonPathTypeMap.get(jsonPath);
|
180
|
-
return TypecastJsonBuilder.getFromDouble(outputType, value.asFloatValue().toDouble());
|
181
|
-
}
|
182
|
-
else if (value.isStringValue()) {
|
183
|
-
Type outputType = jsonPathTypeMap.get(jsonPath);
|
184
|
-
return TypecastJsonBuilder.getFromString(outputType, value.asStringValue().asString());
|
185
|
-
}
|
186
|
-
else {
|
187
|
-
return value;
|
188
|
-
}
|
94
|
+
String format = columnConfig.getFormat().or(task.getDefaultTimestampFormat());
|
95
|
+
DateTimeZone timezone = columnConfig.getTimeZone().or(task.getDefaultTimeZone());
|
96
|
+
return new TimestampFormatter(task.getJRuby(), format, timezone);
|
189
97
|
}
|
190
98
|
|
191
99
|
private interface PageBuildable
|
@@ -217,7 +125,7 @@ public class ColumnVisitorImpl
|
|
217
125
|
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
218
126
|
PageBuildable op = new PageBuildable() {
|
219
127
|
public void run() throws DataException {
|
220
|
-
|
128
|
+
columnCaster.setFromBoolean(outputColumn, pageReader.getBoolean(inputColumn));
|
221
129
|
}
|
222
130
|
};
|
223
131
|
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
@@ -229,7 +137,7 @@ public class ColumnVisitorImpl
|
|
229
137
|
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
230
138
|
PageBuildable op = new PageBuildable() {
|
231
139
|
public void run() throws DataException {
|
232
|
-
|
140
|
+
columnCaster.setFromLong(outputColumn, pageReader.getLong(inputColumn));
|
233
141
|
}
|
234
142
|
};
|
235
143
|
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
@@ -241,7 +149,7 @@ public class ColumnVisitorImpl
|
|
241
149
|
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
242
150
|
PageBuildable op = new PageBuildable() {
|
243
151
|
public void run() throws DataException {
|
244
|
-
|
152
|
+
columnCaster.setFromDouble(outputColumn, pageReader.getDouble(inputColumn));
|
245
153
|
}
|
246
154
|
};
|
247
155
|
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
@@ -254,8 +162,7 @@ public class ColumnVisitorImpl
|
|
254
162
|
final TimestampParser timestampParser = timestampParserMap.get(inputColumn.getName());
|
255
163
|
PageBuildable op = new PageBuildable() {
|
256
164
|
public void run() throws DataException {
|
257
|
-
|
258
|
-
pageBuilder, outputColumn, pageReader.getString(inputColumn), timestampParser);
|
165
|
+
columnCaster.setFromString(outputColumn, pageReader.getString(inputColumn), timestampParser);
|
259
166
|
}
|
260
167
|
};
|
261
168
|
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
@@ -268,8 +175,7 @@ public class ColumnVisitorImpl
|
|
268
175
|
final TimestampFormatter timestampFormatter = timestampFormatterMap.get(inputColumn.getName());
|
269
176
|
PageBuildable op = new PageBuildable() {
|
270
177
|
public void run() throws DataException {
|
271
|
-
|
272
|
-
pageBuilder, outputColumn, pageReader.getTimestamp(inputColumn), timestampFormatter);
|
178
|
+
columnCaster.setFromTimestamp(outputColumn, pageReader.getTimestamp(inputColumn), timestampFormatter);
|
273
179
|
}
|
274
180
|
};
|
275
181
|
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
@@ -278,13 +184,10 @@ public class ColumnVisitorImpl
|
|
278
184
|
@Override
|
279
185
|
public void jsonColumn(final Column inputColumn)
|
280
186
|
{
|
281
|
-
String jsonPath = new StringBuilder("$.").append(inputColumn.getName()).toString();
|
282
|
-
Value value = pageReader.getJson(inputColumn);
|
283
|
-
final Value castedValue = castJsonRecursively(task, jsonPath, value);
|
284
187
|
final Column outputColumn = outputColumnMap.get(inputColumn.getName());
|
285
188
|
PageBuildable op = new PageBuildable() {
|
286
189
|
public void run() throws DataException {
|
287
|
-
|
190
|
+
columnCaster.setFromJson(outputColumn, pageReader.getJson(inputColumn));
|
288
191
|
}
|
289
192
|
};
|
290
193
|
withStopOnInvalidRecord(op, inputColumn, outputColumn);
|
@@ -4,21 +4,28 @@ import org.embulk.filter.typecast.cast.*;
|
|
4
4
|
|
5
5
|
import org.embulk.spi.DataException;
|
6
6
|
import org.embulk.spi.type.*;
|
7
|
+
import org.msgpack.value.BooleanValue;
|
8
|
+
import org.msgpack.value.IntegerValue;
|
9
|
+
import org.msgpack.value.FloatValue;
|
10
|
+
import org.msgpack.value.StringValue;
|
7
11
|
import org.msgpack.value.Value;
|
8
12
|
import org.msgpack.value.ValueFactory;
|
9
13
|
|
10
|
-
class
|
11
|
-
|
14
|
+
class JsonCaster
|
15
|
+
{
|
16
|
+
public JsonCaster()
|
17
|
+
{
|
18
|
+
}
|
19
|
+
|
20
|
+
public Value fromBoolean(Type outputType, BooleanValue value) {
|
12
21
|
if (outputType instanceof BooleanType) {
|
13
|
-
return
|
22
|
+
return value;
|
14
23
|
} else if (outputType instanceof LongType) {
|
15
|
-
return ValueFactory.newInteger(BooleanCast.asLong(value));
|
24
|
+
return ValueFactory.newInteger(BooleanCast.asLong(value.getBoolean()));
|
16
25
|
} else if (outputType instanceof DoubleType) {
|
17
|
-
return ValueFactory.newFloat(BooleanCast.asDouble(value));
|
26
|
+
return ValueFactory.newFloat(BooleanCast.asDouble(value.getBoolean()));
|
18
27
|
} else if (outputType instanceof StringType) {
|
19
|
-
return ValueFactory.newString(BooleanCast.asString(value));
|
20
|
-
} else if (outputType instanceof TimestampType) {
|
21
|
-
throw new DataException(String.format("no timestamp type in json: \"%s\"", value));
|
28
|
+
return ValueFactory.newString(BooleanCast.asString(value.getBoolean()));
|
22
29
|
} else if (outputType instanceof JsonType) {
|
23
30
|
throw new DataException(String.format("cannot cast boolean to json: \"%s\"", value));
|
24
31
|
} else {
|
@@ -27,18 +34,16 @@ class TypecastJsonBuilder {
|
|
27
34
|
}
|
28
35
|
}
|
29
36
|
|
30
|
-
|
37
|
+
public Value fromLong(Type outputType, IntegerValue value)
|
31
38
|
{
|
32
39
|
if (outputType instanceof BooleanType) {
|
33
|
-
return ValueFactory.newBoolean(LongCast.asBoolean(value));
|
40
|
+
return ValueFactory.newBoolean(LongCast.asBoolean(value.asLong()));
|
34
41
|
} else if (outputType instanceof LongType) {
|
35
|
-
return
|
42
|
+
return value;
|
36
43
|
} else if (outputType instanceof DoubleType) {
|
37
|
-
return ValueFactory.newFloat(LongCast.asDouble(value));
|
44
|
+
return ValueFactory.newFloat(LongCast.asDouble(value.asLong()));
|
38
45
|
} else if (outputType instanceof StringType) {
|
39
|
-
return ValueFactory.newString(LongCast.asString(value));
|
40
|
-
} else if (outputType instanceof TimestampType) {
|
41
|
-
throw new DataException(String.format("no timestamp type in json: \"%s\"", value));
|
46
|
+
return ValueFactory.newString(LongCast.asString(value.asLong()));
|
42
47
|
} else if (outputType instanceof JsonType) {
|
43
48
|
throw new DataException(String.format("cannot cast long to json:: \"%s\"", value));
|
44
49
|
} else {
|
@@ -47,18 +52,16 @@ class TypecastJsonBuilder {
|
|
47
52
|
}
|
48
53
|
}
|
49
54
|
|
50
|
-
|
55
|
+
public Value fromDouble(Type outputType, FloatValue value)
|
51
56
|
{
|
52
57
|
if (outputType instanceof BooleanType) {
|
53
|
-
return ValueFactory.newBoolean(DoubleCast.asBoolean(value));
|
58
|
+
return ValueFactory.newBoolean(DoubleCast.asBoolean(value.toDouble()));
|
54
59
|
} else if (outputType instanceof LongType) {
|
55
|
-
return ValueFactory.newInteger(DoubleCast.asLong(value));
|
60
|
+
return ValueFactory.newInteger(DoubleCast.asLong(value.toDouble()));
|
56
61
|
} else if (outputType instanceof DoubleType) {
|
57
|
-
return
|
62
|
+
return value;
|
58
63
|
} else if (outputType instanceof StringType) {
|
59
|
-
return ValueFactory.newString(DoubleCast.asString(value));
|
60
|
-
} else if (outputType instanceof TimestampType) {
|
61
|
-
throw new DataException(String.format("no timestamp type in json: \"%s\"", value));
|
64
|
+
return ValueFactory.newString(DoubleCast.asString(value.toDouble()));
|
62
65
|
} else if (outputType instanceof JsonType) {
|
63
66
|
throw new DataException(String.format("cannot cast double to json:: \"%s\"", value));
|
64
67
|
} else {
|
@@ -67,40 +70,18 @@ class TypecastJsonBuilder {
|
|
67
70
|
}
|
68
71
|
}
|
69
72
|
|
70
|
-
|
71
|
-
{
|
72
|
-
if (outputType instanceof BooleanType) {
|
73
|
-
return ValueFactory.newBoolean(StringCast.asBoolean(value));
|
74
|
-
} else if (outputType instanceof LongType) {
|
75
|
-
return ValueFactory.newInteger(StringCast.asLong(value));
|
76
|
-
} else if (outputType instanceof DoubleType) {
|
77
|
-
return ValueFactory.newFloat(StringCast.asDouble(value));
|
78
|
-
} else if (outputType instanceof StringType) {
|
79
|
-
return ValueFactory.newString(StringCast.asString(value));
|
80
|
-
} else if (outputType instanceof TimestampType) {
|
81
|
-
throw new DataException(String.format("no timestamp type in json: \"%s\"", value));
|
82
|
-
} else if (outputType instanceof JsonType) {
|
83
|
-
return StringCast.asJson(value);
|
84
|
-
} else {
|
85
|
-
assert(false);
|
86
|
-
return null;
|
87
|
-
}
|
88
|
-
}
|
89
|
-
|
90
|
-
static Value getFromJson(Type outputType, Value value)
|
73
|
+
public Value fromString(Type outputType, StringValue value)
|
91
74
|
{
|
92
75
|
if (outputType instanceof BooleanType) {
|
93
|
-
return ValueFactory.newBoolean(
|
76
|
+
return ValueFactory.newBoolean(StringCast.asBoolean(value.asString()));
|
94
77
|
} else if (outputType instanceof LongType) {
|
95
|
-
return ValueFactory.newInteger(
|
78
|
+
return ValueFactory.newInteger(StringCast.asLong(value.asString()));
|
96
79
|
} else if (outputType instanceof DoubleType) {
|
97
|
-
return ValueFactory.newFloat(
|
80
|
+
return ValueFactory.newFloat(StringCast.asDouble(value.asString()));
|
98
81
|
} else if (outputType instanceof StringType) {
|
99
|
-
return ValueFactory.newString(JsonCast.asString(value));
|
100
|
-
} else if (outputType instanceof TimestampType) {
|
101
|
-
throw new DataException(String.format("no timestamp type in json: \"%s\"", value));
|
102
|
-
} else if (outputType instanceof JsonType) {
|
103
82
|
return value;
|
83
|
+
} else if (outputType instanceof JsonType) {
|
84
|
+
return StringCast.asJson(value.asString());
|
104
85
|
} else {
|
105
86
|
assert(false);
|
106
87
|
return null;
|
@@ -0,0 +1,136 @@
|
|
1
|
+
package org.embulk.filter.typecast;
|
2
|
+
|
3
|
+
import org.embulk.spi.*;
|
4
|
+
import org.embulk.spi.type.Type;
|
5
|
+
import org.msgpack.value.ArrayValue;
|
6
|
+
import org.msgpack.value.MapValue;
|
7
|
+
import org.msgpack.value.Value;
|
8
|
+
import org.msgpack.value.ValueFactory;
|
9
|
+
|
10
|
+
import org.embulk.filter.typecast.TypecastFilterPlugin.ColumnConfig;
|
11
|
+
import org.embulk.filter.typecast.TypecastFilterPlugin.PluginTask;
|
12
|
+
|
13
|
+
import org.slf4j.Logger;
|
14
|
+
|
15
|
+
import java.util.HashMap;
|
16
|
+
import java.util.HashSet;
|
17
|
+
import java.util.Map;
|
18
|
+
|
19
|
+
public class JsonVisitor
|
20
|
+
{
|
21
|
+
private static final Logger logger = Exec.getLogger(TypecastFilterPlugin.class);
|
22
|
+
private final PluginTask task;
|
23
|
+
private final Schema inputSchema;
|
24
|
+
private final Schema outputSchema;
|
25
|
+
private final HashSet<String> shouldVisitSet = new HashSet<>();
|
26
|
+
private final HashMap<String, Type> jsonPathTypeMap = new HashMap<>();
|
27
|
+
private final JsonCaster jsonCaster = new JsonCaster();
|
28
|
+
|
29
|
+
JsonVisitor(PluginTask task, Schema inputSchema, Schema outputSchema)
|
30
|
+
{
|
31
|
+
this.task = task;
|
32
|
+
this.inputSchema = inputSchema;
|
33
|
+
this.outputSchema = outputSchema;
|
34
|
+
|
35
|
+
buildShouldVisitSet();
|
36
|
+
buildJsonPathTypeMap();
|
37
|
+
}
|
38
|
+
|
39
|
+
private void buildJsonPathTypeMap()
|
40
|
+
{
|
41
|
+
// json path => Type
|
42
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
43
|
+
String name = columnConfig.getName();
|
44
|
+
if (!name.startsWith("$.")) {
|
45
|
+
continue;
|
46
|
+
}
|
47
|
+
Type type = columnConfig.getType();
|
48
|
+
this.jsonPathTypeMap.put(name, type);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
private void buildShouldVisitSet()
|
53
|
+
{
|
54
|
+
// json partial path => Boolean to avoid unnecessary type: json visit
|
55
|
+
for (ColumnConfig columnConfig : task.getColumns()) {
|
56
|
+
String name = columnConfig.getName();
|
57
|
+
if (!name.startsWith("$.")) {
|
58
|
+
continue;
|
59
|
+
}
|
60
|
+
String[] parts = name.split("\\.");
|
61
|
+
StringBuilder partialPath = new StringBuilder("$");
|
62
|
+
for (int i = 1; i < parts.length; i++) {
|
63
|
+
if (parts[i].contains("[")) {
|
64
|
+
String[] arrayParts = parts[i].split("\\[");
|
65
|
+
partialPath.append(".").append(arrayParts[0]);
|
66
|
+
this.shouldVisitSet.add(partialPath.toString());
|
67
|
+
for (int j = 1; j < arrayParts.length; j++) {
|
68
|
+
partialPath.append("[").append(arrayParts[j]);
|
69
|
+
this.shouldVisitSet.add(partialPath.toString());
|
70
|
+
}
|
71
|
+
}
|
72
|
+
else {
|
73
|
+
partialPath.append(".").append(parts[i]);
|
74
|
+
this.shouldVisitSet.add(partialPath.toString());
|
75
|
+
}
|
76
|
+
}
|
77
|
+
}
|
78
|
+
}
|
79
|
+
|
80
|
+
private boolean shouldVisit(String jsonPath)
|
81
|
+
{
|
82
|
+
return shouldVisitSet.contains(jsonPath);
|
83
|
+
}
|
84
|
+
|
85
|
+
public Value visit(String jsonPath, Value value)
|
86
|
+
{
|
87
|
+
if (!shouldVisit(jsonPath)) {
|
88
|
+
return value;
|
89
|
+
}
|
90
|
+
if (value.isArrayValue()) {
|
91
|
+
ArrayValue arrayValue = value.asArrayValue();
|
92
|
+
int size = arrayValue.size();
|
93
|
+
Value[] newValue = new Value[size];
|
94
|
+
for (int i = 0; i < size; i++) {
|
95
|
+
String k = new StringBuilder(jsonPath).append("[").append(Integer.toString(i)).append("]").toString();
|
96
|
+
Value v = arrayValue.get(i);
|
97
|
+
newValue[i] = visit(k, v);
|
98
|
+
}
|
99
|
+
return ValueFactory.newArray(newValue, true);
|
100
|
+
}
|
101
|
+
else if (value.isMapValue()) {
|
102
|
+
MapValue mapValue = value.asMapValue();
|
103
|
+
int size = mapValue.size() * 2;
|
104
|
+
Value[] newValue = new Value[size];
|
105
|
+
int i = 0;
|
106
|
+
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
107
|
+
Value k = entry.getKey();
|
108
|
+
Value v = entry.getValue();
|
109
|
+
String newPath = new StringBuilder(jsonPath).append(".").append(k.asStringValue().asString()).toString();
|
110
|
+
Value r = visit(newPath, v);
|
111
|
+
newValue[i++] = k;
|
112
|
+
newValue[i++] = r;
|
113
|
+
}
|
114
|
+
return ValueFactory.newMap(newValue, true);
|
115
|
+
}
|
116
|
+
else if (value.isBooleanValue()) {
|
117
|
+
Type outputType = jsonPathTypeMap.get(jsonPath);
|
118
|
+
return jsonCaster.fromBoolean(outputType, value.asBooleanValue());
|
119
|
+
}
|
120
|
+
else if (value.isIntegerValue()) {
|
121
|
+
Type outputType = jsonPathTypeMap.get(jsonPath);
|
122
|
+
return jsonCaster.fromLong(outputType, value.asIntegerValue());
|
123
|
+
}
|
124
|
+
else if (value.isFloatValue()) {
|
125
|
+
Type outputType = jsonPathTypeMap.get(jsonPath);
|
126
|
+
return jsonCaster.fromDouble(outputType, value.asFloatValue());
|
127
|
+
}
|
128
|
+
else if (value.isStringValue()) {
|
129
|
+
Type outputType = jsonPathTypeMap.get(jsonPath);
|
130
|
+
return jsonCaster.fromString(outputType, value.asStringValue());
|
131
|
+
}
|
132
|
+
else {
|
133
|
+
return value;
|
134
|
+
}
|
135
|
+
}
|
136
|
+
}
|
@@ -4,15 +4,18 @@ import com.google.common.base.Optional;
|
|
4
4
|
import com.google.common.collect.ImmutableList;
|
5
5
|
import org.embulk.config.Config;
|
6
6
|
import org.embulk.config.ConfigDefault;
|
7
|
+
import org.embulk.config.ConfigException;
|
7
8
|
import org.embulk.config.ConfigInject;
|
8
9
|
import org.embulk.config.ConfigSource;
|
9
10
|
import org.embulk.config.Task;
|
10
11
|
import org.embulk.config.TaskSource;
|
11
12
|
|
12
13
|
import org.embulk.spi.*;
|
14
|
+
import org.embulk.spi.time.Timestamp;
|
13
15
|
import org.embulk.spi.time.TimestampFormatter;
|
14
16
|
import org.embulk.spi.time.TimestampParser;
|
15
17
|
|
18
|
+
import org.embulk.spi.type.TimestampType;
|
16
19
|
import org.embulk.spi.type.Type;
|
17
20
|
import org.joda.time.DateTimeZone;
|
18
21
|
import org.jruby.embed.ScriptingContainer;
|
@@ -86,7 +89,7 @@ public class TypecastFilterPlugin implements FilterPlugin
|
|
86
89
|
// throw if column does not exist
|
87
90
|
for (ColumnConfig columnConfig : columnConfigs) {
|
88
91
|
String name = columnConfig.getName();
|
89
|
-
if (name.startsWith("$.")) {
|
92
|
+
if (name.startsWith("$.")) { // check only top level column name
|
90
93
|
String firstName = name.split("\\.", 3)[1];
|
91
94
|
inputSchema.lookupColumn(firstName);
|
92
95
|
}
|
@@ -94,6 +97,13 @@ public class TypecastFilterPlugin implements FilterPlugin
|
|
94
97
|
inputSchema.lookupColumn(name);
|
95
98
|
}
|
96
99
|
}
|
100
|
+
// throw if timestamp is specified in json path
|
101
|
+
for (ColumnConfig columnConfig : columnConfigs) {
|
102
|
+
String name = columnConfig.getName();
|
103
|
+
if (name.startsWith("$.") && columnConfig.getType() instanceof TimestampType) {
|
104
|
+
throw new ConfigException(String.format("embulk-filter-typecast: timestamp type is not supported in json column: \"%s\"", name));
|
105
|
+
}
|
106
|
+
}
|
97
107
|
}
|
98
108
|
|
99
109
|
private Schema buildOuputSchema(final PluginTask task, final Schema inputSchema)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-typecast
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,7 @@ files:
|
|
52
52
|
- README.md
|
53
53
|
- build.gradle
|
54
54
|
- config/checkstyle/checkstyle.xml
|
55
|
+
- example/example.csv
|
55
56
|
- example/example.yml
|
56
57
|
- example/example2.yml
|
57
58
|
- gradle/wrapper/gradle-wrapper.jar
|
@@ -59,10 +60,11 @@ files:
|
|
59
60
|
- gradlew
|
60
61
|
- gradlew.bat
|
61
62
|
- lib/embulk/filter/typecast.rb
|
63
|
+
- src/main/java/org/embulk/filter/typecast/ColumnCaster.java
|
62
64
|
- src/main/java/org/embulk/filter/typecast/ColumnVisitorImpl.java
|
65
|
+
- src/main/java/org/embulk/filter/typecast/JsonCaster.java
|
66
|
+
- src/main/java/org/embulk/filter/typecast/JsonVisitor.java
|
63
67
|
- src/main/java/org/embulk/filter/typecast/TypecastFilterPlugin.java
|
64
|
-
- src/main/java/org/embulk/filter/typecast/TypecastJsonBuilder.java
|
65
|
-
- src/main/java/org/embulk/filter/typecast/TypecastPageBuilder.java
|
66
68
|
- src/main/java/org/embulk/filter/typecast/cast/BooleanCast.java
|
67
69
|
- src/main/java/org/embulk/filter/typecast/cast/DoubleCast.java
|
68
70
|
- src/main/java/org/embulk/filter/typecast/cast/JsonCast.java
|
@@ -70,7 +72,7 @@ files:
|
|
70
72
|
- src/main/java/org/embulk/filter/typecast/cast/StringCast.java
|
71
73
|
- src/main/java/org/embulk/filter/typecast/cast/TimestampCast.java
|
72
74
|
- src/test/java/org/embulk/filter/TestTypecastFilterPlugin.java
|
73
|
-
- classpath/embulk-filter-typecast-0.1.1.jar
|
75
|
+
- classpath/embulk-filter-typecast-0.1.2.jar
|
74
76
|
homepage: https://github.com/sonots/embulk-filter-typecast
|
75
77
|
licenses:
|
76
78
|
- MIT
|