embulk-filter-column 0.4.0 → 0.5.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +6 -0
- data/README.md +23 -1
- data/build.gradle +1 -1
- data/example/columns.yml +2 -1
- data/example/example.yml +35 -0
- data/example/json_add_columns.yml +31 -0
- data/example/json_columns.yml +23 -0
- data/example/json_drop_columns.yml +22 -0
- data/gradlew +3 -3
- data/lib/embulk/filter/column.rb +1 -1
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/filter/column/ColumnFilterPlugin.java +260 -0
- data/src/main/java/org/embulk/filter/column/ColumnVisitorImpl.java +275 -0
- data/src/main/java/org/embulk/filter/column/JsonColumn.java +104 -0
- data/src/main/java/org/embulk/filter/column/JsonVisitor.java +328 -0
- metadata +14 -6
- data/src/main/java/org/embulk/filter/ColumnFilterPlugin.java +0 -462
@@ -0,0 +1,275 @@
|
|
1
|
+
package org.embulk.filter.column;
|
2
|
+
|
3
|
+
import com.google.common.base.Throwables;
|
4
|
+
|
5
|
+
import org.embulk.filter.column.ColumnFilterPlugin.ColumnConfig;
|
6
|
+
import org.embulk.filter.column.ColumnFilterPlugin.PluginTask;
|
7
|
+
|
8
|
+
import org.embulk.spi.Column;
|
9
|
+
import org.embulk.spi.ColumnVisitor;
|
10
|
+
import org.embulk.spi.Exec;
|
11
|
+
import org.embulk.spi.PageBuilder;
|
12
|
+
import org.embulk.spi.PageReader;
|
13
|
+
import org.embulk.spi.Schema;
|
14
|
+
import org.embulk.spi.SchemaConfigException;
|
15
|
+
import org.embulk.spi.json.JsonParser;
|
16
|
+
import org.embulk.spi.time.Timestamp;
|
17
|
+
import org.embulk.spi.time.TimestampParseException;
|
18
|
+
import org.embulk.spi.time.TimestampParser;
|
19
|
+
import org.embulk.spi.type.BooleanType;
|
20
|
+
import org.embulk.spi.type.DoubleType;
|
21
|
+
import org.embulk.spi.type.JsonType;
|
22
|
+
import org.embulk.spi.type.LongType;
|
23
|
+
import org.embulk.spi.type.StringType;
|
24
|
+
import org.embulk.spi.type.TimestampType;
|
25
|
+
import org.embulk.spi.type.Type;
|
26
|
+
|
27
|
+
import org.joda.time.DateTimeZone;
|
28
|
+
import org.msgpack.value.Value;
|
29
|
+
import org.slf4j.Logger;
|
30
|
+
|
31
|
+
import java.util.HashMap;
|
32
|
+
import java.util.List;
|
33
|
+
|
34
|
+
public class ColumnVisitorImpl implements ColumnVisitor
|
35
|
+
{
|
36
|
+
private static final Logger logger = Exec.getLogger(ColumnFilterPlugin.class);
|
37
|
+
private final PluginTask task;
|
38
|
+
private final Schema inputSchema;
|
39
|
+
private final Schema outputSchema;
|
40
|
+
private final PageReader pageReader;
|
41
|
+
private final PageBuilder pageBuilder;
|
42
|
+
private final HashMap<Column, Column> outputInputColumnMap = new HashMap<>();
|
43
|
+
private final HashMap<Column, Object> outputDefaultMap = new HashMap<>();
|
44
|
+
private final JsonVisitor jsonVisitor;
|
45
|
+
|
46
|
+
ColumnVisitorImpl(PluginTask task, Schema inputSchema, Schema outputSchema, PageReader pageReader, PageBuilder pageBuilder)
|
47
|
+
{
|
48
|
+
this.task = task;
|
49
|
+
this.inputSchema = inputSchema;
|
50
|
+
this.outputSchema = outputSchema;
|
51
|
+
this.pageReader = pageReader;
|
52
|
+
this.pageBuilder = pageBuilder;
|
53
|
+
buildOutputInputColumnMap();
|
54
|
+
buildOutputDefaultMap();
|
55
|
+
this.jsonVisitor = new JsonVisitor(task, inputSchema, outputSchema);
|
56
|
+
}
|
57
|
+
|
58
|
+
// Map outputColumn => inputColumn
|
59
|
+
private void buildOutputInputColumnMap()
|
60
|
+
{
|
61
|
+
for (Column outputColumn : outputSchema.getColumns()) {
|
62
|
+
String name = outputColumn.getName();
|
63
|
+
String srcName = getSrc(name, task.getColumns());
|
64
|
+
if (srcName == null) {
|
65
|
+
srcName = getSrc(name, task.getAddColumns());
|
66
|
+
}
|
67
|
+
if (srcName == null) {
|
68
|
+
srcName = name;
|
69
|
+
}
|
70
|
+
Column inputColumn;
|
71
|
+
try {
|
72
|
+
inputColumn = inputSchema.lookupColumn(srcName);
|
73
|
+
}
|
74
|
+
catch (SchemaConfigException ex) {
|
75
|
+
inputColumn = null;
|
76
|
+
}
|
77
|
+
outputInputColumnMap.put(outputColumn, inputColumn); // NOTE: inputColumn would be null
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
// Map outputColumn => default value if present
|
82
|
+
private void buildOutputDefaultMap()
|
83
|
+
{
|
84
|
+
for (Column outputColumn : outputSchema.getColumns()) {
|
85
|
+
String name = outputColumn.getName();
|
86
|
+
Type type = outputColumn.getType();
|
87
|
+
|
88
|
+
Object defaultValue = getDefault(task, name, type, task.getColumns());
|
89
|
+
if (defaultValue == null) {
|
90
|
+
defaultValue = getDefault(task, name, type, task.getAddColumns());
|
91
|
+
}
|
92
|
+
if (defaultValue != null) {
|
93
|
+
outputDefaultMap.put(outputColumn, defaultValue);
|
94
|
+
}
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
static String getSrc(String name, List<ColumnConfig> columnConfigs)
|
99
|
+
{
|
100
|
+
for (ColumnConfig columnConfig : columnConfigs) {
|
101
|
+
if (columnConfig.getName().equals(name) &&
|
102
|
+
columnConfig.getSrc().isPresent()) {
|
103
|
+
return columnConfig.getSrc().get();
|
104
|
+
}
|
105
|
+
}
|
106
|
+
return null;
|
107
|
+
}
|
108
|
+
|
109
|
+
static Object getDefault(PluginTask task, String name, Type type, List<ColumnConfig> columnConfigs)
|
110
|
+
{
|
111
|
+
for (ColumnConfig columnConfig : columnConfigs) {
|
112
|
+
if (columnConfig.getName().equals(name)) {
|
113
|
+
return getDefault(task, name, type, columnConfig);
|
114
|
+
}
|
115
|
+
}
|
116
|
+
return null;
|
117
|
+
}
|
118
|
+
|
119
|
+
static Object getDefault(PluginTask task, String name, Type type, ColumnConfig columnConfig)
|
120
|
+
{
|
121
|
+
if (type instanceof BooleanType) {
|
122
|
+
if (columnConfig.getDefault().isPresent()) {
|
123
|
+
return (Boolean) columnConfig.getDefault().get();
|
124
|
+
}
|
125
|
+
}
|
126
|
+
else if (type instanceof LongType) {
|
127
|
+
if (columnConfig.getDefault().isPresent()) {
|
128
|
+
return new Long(columnConfig.getDefault().get().toString());
|
129
|
+
}
|
130
|
+
}
|
131
|
+
else if (type instanceof DoubleType) {
|
132
|
+
if (columnConfig.getDefault().isPresent()) {
|
133
|
+
return new Double(columnConfig.getDefault().get().toString());
|
134
|
+
}
|
135
|
+
}
|
136
|
+
else if (type instanceof StringType) {
|
137
|
+
if (columnConfig.getDefault().isPresent()) {
|
138
|
+
return columnConfig.getDefault().get();
|
139
|
+
}
|
140
|
+
}
|
141
|
+
else if (type instanceof JsonType) {
|
142
|
+
if (columnConfig.getDefault().isPresent()) {
|
143
|
+
JsonParser parser = new JsonParser();
|
144
|
+
return parser.parse((String) columnConfig.getDefault().get());
|
145
|
+
}
|
146
|
+
}
|
147
|
+
else if (type instanceof TimestampType) {
|
148
|
+
if (columnConfig.getDefault().isPresent()) {
|
149
|
+
String time = (String) columnConfig.getDefault().get();
|
150
|
+
String format = columnConfig.getFormat().or(task.getDefaultTimestampFormat());
|
151
|
+
DateTimeZone timezone = columnConfig.getTimeZone().or(task.getDefaultTimeZone());
|
152
|
+
TimestampParser parser = new TimestampParser(task.getJRuby(), format, timezone);
|
153
|
+
try {
|
154
|
+
Timestamp defaultValue = parser.parse(time);
|
155
|
+
return defaultValue;
|
156
|
+
}
|
157
|
+
catch (TimestampParseException ex) {
|
158
|
+
throw Throwables.propagate(ex);
|
159
|
+
}
|
160
|
+
}
|
161
|
+
}
|
162
|
+
return null;
|
163
|
+
}
|
164
|
+
|
165
|
+
@Override
|
166
|
+
public void booleanColumn(Column outputColumn)
|
167
|
+
{
|
168
|
+
Column inputColumn = outputInputColumnMap.get(outputColumn);
|
169
|
+
if (inputColumn == null || pageReader.isNull(inputColumn)) {
|
170
|
+
Boolean defaultValue = (Boolean) outputDefaultMap.get(outputColumn);
|
171
|
+
if (defaultValue == null) {
|
172
|
+
pageBuilder.setNull(outputColumn);
|
173
|
+
}
|
174
|
+
else {
|
175
|
+
pageBuilder.setBoolean(outputColumn, defaultValue.booleanValue());
|
176
|
+
}
|
177
|
+
}
|
178
|
+
else {
|
179
|
+
pageBuilder.setBoolean(outputColumn, pageReader.getBoolean(inputColumn));
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
@Override
|
184
|
+
public void longColumn(Column outputColumn)
|
185
|
+
{
|
186
|
+
Column inputColumn = outputInputColumnMap.get(outputColumn);
|
187
|
+
if (inputColumn == null || pageReader.isNull(inputColumn)) {
|
188
|
+
Long defaultValue = (Long) outputDefaultMap.get(outputColumn);
|
189
|
+
if (defaultValue == null) {
|
190
|
+
pageBuilder.setNull(outputColumn);
|
191
|
+
}
|
192
|
+
else {
|
193
|
+
pageBuilder.setLong(outputColumn, defaultValue.longValue());
|
194
|
+
}
|
195
|
+
}
|
196
|
+
else {
|
197
|
+
pageBuilder.setLong(outputColumn, pageReader.getLong(inputColumn));
|
198
|
+
}
|
199
|
+
}
|
200
|
+
|
201
|
+
@Override
|
202
|
+
public void doubleColumn(Column outputColumn)
|
203
|
+
{
|
204
|
+
Column inputColumn = outputInputColumnMap.get(outputColumn);
|
205
|
+
if (inputColumn == null || pageReader.isNull(inputColumn)) {
|
206
|
+
Double defaultValue = (Double) outputDefaultMap.get(outputColumn);
|
207
|
+
if (defaultValue == null) {
|
208
|
+
pageBuilder.setNull(outputColumn);
|
209
|
+
}
|
210
|
+
else {
|
211
|
+
pageBuilder.setDouble(outputColumn, defaultValue.doubleValue());
|
212
|
+
}
|
213
|
+
}
|
214
|
+
else {
|
215
|
+
pageBuilder.setDouble(outputColumn, pageReader.getDouble(inputColumn));
|
216
|
+
}
|
217
|
+
}
|
218
|
+
|
219
|
+
@Override
|
220
|
+
public void stringColumn(Column outputColumn)
|
221
|
+
{
|
222
|
+
Column inputColumn = outputInputColumnMap.get(outputColumn);
|
223
|
+
if (inputColumn == null || pageReader.isNull(inputColumn)) {
|
224
|
+
String defaultValue = (String) outputDefaultMap.get(outputColumn);
|
225
|
+
if (defaultValue == null) {
|
226
|
+
pageBuilder.setNull(outputColumn);
|
227
|
+
}
|
228
|
+
else {
|
229
|
+
pageBuilder.setString(outputColumn, defaultValue);
|
230
|
+
}
|
231
|
+
}
|
232
|
+
else {
|
233
|
+
pageBuilder.setString(outputColumn, pageReader.getString(inputColumn));
|
234
|
+
}
|
235
|
+
}
|
236
|
+
|
237
|
+
@Override
|
238
|
+
public void jsonColumn(Column outputColumn)
|
239
|
+
{
|
240
|
+
Column inputColumn = outputInputColumnMap.get(outputColumn);
|
241
|
+
if (inputColumn == null || pageReader.isNull(inputColumn)) {
|
242
|
+
Value defaultValue = (Value) outputDefaultMap.get(outputColumn);
|
243
|
+
if (defaultValue == null) {
|
244
|
+
pageBuilder.setNull(outputColumn);
|
245
|
+
}
|
246
|
+
else {
|
247
|
+
String jsonPath = new StringBuilder("$.").append(inputColumn.getName()).toString();
|
248
|
+
pageBuilder.setJson(outputColumn, jsonVisitor.visit(jsonPath, defaultValue));
|
249
|
+
}
|
250
|
+
}
|
251
|
+
else {
|
252
|
+
Value value = pageReader.getJson(inputColumn);
|
253
|
+
String jsonPath = new StringBuilder("$.").append(inputColumn.getName()).toString();
|
254
|
+
pageBuilder.setJson(outputColumn, jsonVisitor.visit(jsonPath, value));
|
255
|
+
}
|
256
|
+
}
|
257
|
+
|
258
|
+
@Override
|
259
|
+
public void timestampColumn(Column outputColumn)
|
260
|
+
{
|
261
|
+
Column inputColumn = outputInputColumnMap.get(outputColumn);
|
262
|
+
if (inputColumn == null || pageReader.isNull(inputColumn)) {
|
263
|
+
Timestamp defaultValue = (Timestamp) outputDefaultMap.get(outputColumn);
|
264
|
+
if (defaultValue == null) {
|
265
|
+
pageBuilder.setNull(outputColumn);
|
266
|
+
}
|
267
|
+
else {
|
268
|
+
pageBuilder.setTimestamp(outputColumn, defaultValue);
|
269
|
+
}
|
270
|
+
}
|
271
|
+
else {
|
272
|
+
pageBuilder.setTimestamp(outputColumn, pageReader.getTimestamp(inputColumn));
|
273
|
+
}
|
274
|
+
}
|
275
|
+
}
|
@@ -0,0 +1,104 @@
|
|
1
|
+
package org.embulk.filter.column;
|
2
|
+
|
3
|
+
import org.embulk.spi.type.Type;
|
4
|
+
import org.msgpack.value.StringValue;
|
5
|
+
import org.msgpack.value.Value;
|
6
|
+
import org.msgpack.value.ValueFactory;
|
7
|
+
|
8
|
+
public class JsonColumn
|
9
|
+
{
|
10
|
+
private final String name;
|
11
|
+
private final Type type;
|
12
|
+
private final Value defaultValue;
|
13
|
+
private String objectPath = null; // object path (like directory) of json path
|
14
|
+
private String elementPath = null; // element path (like leaf) of json path
|
15
|
+
private StringValue nameValue = null;
|
16
|
+
private StringValue objectPathValue = null;
|
17
|
+
private StringValue elementPathValue = null;
|
18
|
+
|
19
|
+
public JsonColumn(
|
20
|
+
String name,
|
21
|
+
Type type)
|
22
|
+
{
|
23
|
+
this(name, type, null);
|
24
|
+
}
|
25
|
+
|
26
|
+
public JsonColumn(
|
27
|
+
String name,
|
28
|
+
Type type,
|
29
|
+
Value defaultValue)
|
30
|
+
{
|
31
|
+
this.name = name;
|
32
|
+
this.type = type;
|
33
|
+
this.defaultValue = (defaultValue == null ? ValueFactory.newNil() : defaultValue);
|
34
|
+
this.objectPath = objectPath(name);
|
35
|
+
this.elementPath = elementPath(name);
|
36
|
+
this.nameValue = ValueFactory.newString(name);
|
37
|
+
this.objectPathValue = ValueFactory.newString(objectPath);
|
38
|
+
this.elementPathValue = ValueFactory.newString(elementPath);
|
39
|
+
}
|
40
|
+
|
41
|
+
public String getName()
|
42
|
+
{
|
43
|
+
return name;
|
44
|
+
}
|
45
|
+
|
46
|
+
public Type getType()
|
47
|
+
{
|
48
|
+
return type;
|
49
|
+
}
|
50
|
+
|
51
|
+
public Value getDefaultValue()
|
52
|
+
{
|
53
|
+
return defaultValue;
|
54
|
+
}
|
55
|
+
|
56
|
+
public String getObjectPath()
|
57
|
+
{
|
58
|
+
return objectPath;
|
59
|
+
}
|
60
|
+
|
61
|
+
public String getElementPath()
|
62
|
+
{
|
63
|
+
return elementPath;
|
64
|
+
}
|
65
|
+
|
66
|
+
public StringValue getNameValue()
|
67
|
+
{
|
68
|
+
return nameValue;
|
69
|
+
}
|
70
|
+
|
71
|
+
public StringValue getObjectPathValue()
|
72
|
+
{
|
73
|
+
return objectPathValue;
|
74
|
+
}
|
75
|
+
|
76
|
+
public StringValue getElementPathValue()
|
77
|
+
{
|
78
|
+
return elementPathValue;
|
79
|
+
}
|
80
|
+
|
81
|
+
public static String objectPath(String path)
|
82
|
+
{
|
83
|
+
String[] parts = path.split("\\.");
|
84
|
+
StringBuilder builder = new StringBuilder();
|
85
|
+
builder.append(parts[0]);
|
86
|
+
for (int i = 1; i < parts.length - 1; i++) {
|
87
|
+
builder.append(".").append(parts[i]);
|
88
|
+
}
|
89
|
+
if (parts[parts.length - 1].contains("[")) {
|
90
|
+
String[] arrayParts = parts[parts.length - 1].split("\\[");
|
91
|
+
builder.append(".").append(arrayParts[0]);
|
92
|
+
for (int j = 1; j < arrayParts.length - 1; j++) {
|
93
|
+
builder.append("[").append(arrayParts[j]);
|
94
|
+
}
|
95
|
+
}
|
96
|
+
return builder.toString();
|
97
|
+
}
|
98
|
+
|
99
|
+
public static String elementPath(String path)
|
100
|
+
{
|
101
|
+
String[] parts = path.split("\\.");
|
102
|
+
return parts[parts.length - 1];
|
103
|
+
}
|
104
|
+
}
|
@@ -0,0 +1,328 @@
|
|
1
|
+
package org.embulk.filter.column;
|
2
|
+
|
3
|
+
import org.embulk.config.ConfigException;
|
4
|
+
import org.embulk.filter.column.ColumnFilterPlugin.ColumnConfig;
|
5
|
+
import org.embulk.filter.column.ColumnFilterPlugin.PluginTask;
|
6
|
+
|
7
|
+
import org.embulk.spi.Exec;
|
8
|
+
import org.embulk.spi.Schema;
|
9
|
+
import org.embulk.spi.SchemaConfigException;
|
10
|
+
import org.embulk.spi.type.BooleanType;
|
11
|
+
import org.embulk.spi.type.DoubleType;
|
12
|
+
import org.embulk.spi.type.JsonType;
|
13
|
+
import org.embulk.spi.type.LongType;
|
14
|
+
import org.embulk.spi.type.StringType;
|
15
|
+
import org.embulk.spi.type.TimestampType;
|
16
|
+
import org.embulk.spi.type.Type;
|
17
|
+
import org.msgpack.value.ArrayValue;
|
18
|
+
import org.msgpack.value.MapValue;
|
19
|
+
import org.msgpack.value.Value;
|
20
|
+
import org.msgpack.value.ValueFactory;
|
21
|
+
|
22
|
+
import org.slf4j.Logger;
|
23
|
+
|
24
|
+
import java.util.ArrayList;
|
25
|
+
import java.util.HashMap;
|
26
|
+
import java.util.HashSet;
|
27
|
+
import java.util.LinkedHashMap;
|
28
|
+
import java.util.List;
|
29
|
+
import java.util.Map;
|
30
|
+
|
31
|
+
public class JsonVisitor
|
32
|
+
{
|
33
|
+
private static final Logger logger = Exec.getLogger(ColumnFilterPlugin.class);
|
34
|
+
private final PluginTask task;
|
35
|
+
private final Schema inputSchema;
|
36
|
+
private final Schema outputSchema;
|
37
|
+
private final HashSet<String> shouldVisitSet = new HashSet<>();
|
38
|
+
private final HashMap<String, LinkedHashMap<String, JsonColumn>> jsonColumns = new HashMap<>();
|
39
|
+
private final HashMap<String, LinkedHashMap<String, JsonColumn>> jsonAddColumns = new HashMap<>();
|
40
|
+
private final HashMap<String, HashSet<String>> jsonDropColumns = new HashMap<>();
|
41
|
+
|
42
|
+
JsonVisitor(PluginTask task, Schema inputSchema, Schema outputSchema)
|
43
|
+
{
|
44
|
+
this.task = task;
|
45
|
+
this.inputSchema = inputSchema;
|
46
|
+
this.outputSchema = outputSchema;
|
47
|
+
|
48
|
+
buildShouldVisitSet();
|
49
|
+
buildJsonSchema();
|
50
|
+
}
|
51
|
+
|
52
|
+
static Value getDefault(PluginTask task, String name, Type type, ColumnConfig columnConfig)
|
53
|
+
{
|
54
|
+
Object defaultValue = ColumnVisitorImpl.getDefault(task, name, type, columnConfig);
|
55
|
+
if (defaultValue == null) {
|
56
|
+
return ValueFactory.newNil();
|
57
|
+
}
|
58
|
+
if (type instanceof BooleanType) {
|
59
|
+
return ValueFactory.newBoolean((Boolean) defaultValue);
|
60
|
+
}
|
61
|
+
else if (type instanceof LongType) {
|
62
|
+
return ValueFactory.newInteger((Long) defaultValue);
|
63
|
+
}
|
64
|
+
else if (type instanceof DoubleType) {
|
65
|
+
return ValueFactory.newFloat((Double) defaultValue);
|
66
|
+
}
|
67
|
+
else if (type instanceof StringType) {
|
68
|
+
return ValueFactory.newString((String) defaultValue.toString());
|
69
|
+
}
|
70
|
+
else if (type instanceof JsonType) {
|
71
|
+
return (Value) defaultValue;
|
72
|
+
}
|
73
|
+
else if (type instanceof TimestampType) {
|
74
|
+
throw new ConfigException("type: timestamp is not available in json path");
|
75
|
+
}
|
76
|
+
else {
|
77
|
+
throw new ConfigException(String.format("type: '%s' is not supported", type));
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
private void jsonColumnsPut(String path, JsonColumn value)
|
82
|
+
{
|
83
|
+
String objectPath = JsonColumn.objectPath(path);
|
84
|
+
if (! jsonColumns.containsKey(objectPath)) {
|
85
|
+
jsonColumns.put(objectPath, new LinkedHashMap<String, JsonColumn>());
|
86
|
+
}
|
87
|
+
jsonColumns.get(objectPath).put(path, value);
|
88
|
+
}
|
89
|
+
|
90
|
+
private void jsonAddColumnsPut(String path, JsonColumn value)
|
91
|
+
{
|
92
|
+
String objectPath = JsonColumn.objectPath(path);
|
93
|
+
if (! jsonAddColumns.containsKey(objectPath)) {
|
94
|
+
jsonAddColumns.put(objectPath, new LinkedHashMap<String, JsonColumn>());
|
95
|
+
}
|
96
|
+
jsonAddColumns.get(objectPath).put(path, value);
|
97
|
+
}
|
98
|
+
|
99
|
+
private void jsonDropColumnsPut(String path)
|
100
|
+
{
|
101
|
+
String objectPath = JsonColumn.objectPath(path);
|
102
|
+
if (! jsonDropColumns.containsKey(objectPath)) {
|
103
|
+
jsonDropColumns.put(objectPath, new HashSet<String>());
|
104
|
+
}
|
105
|
+
jsonDropColumns.get(objectPath).add(path);
|
106
|
+
}
|
107
|
+
|
108
|
+
// build jsonColumns, jsonAddColumns, and jsonDropColumns
|
109
|
+
private void buildJsonSchema()
|
110
|
+
{
|
111
|
+
List<ColumnConfig> columns = task.getColumns();
|
112
|
+
List<ColumnConfig> addColumns = task.getAddColumns();
|
113
|
+
List<ColumnConfig> dropColumns = task.getDropColumns();
|
114
|
+
|
115
|
+
int i = 0;
|
116
|
+
if (dropColumns.size() > 0) {
|
117
|
+
for (ColumnConfig dropColumn : dropColumns) {
|
118
|
+
String name = dropColumn.getName();
|
119
|
+
// skip NON json path notation to build output schema
|
120
|
+
if (! name.startsWith("$.")) {
|
121
|
+
continue;
|
122
|
+
}
|
123
|
+
jsonDropColumnsPut(name);
|
124
|
+
}
|
125
|
+
}
|
126
|
+
else if (columns.size() > 0) {
|
127
|
+
for (ColumnConfig column : columns) {
|
128
|
+
String name = column.getName();
|
129
|
+
// skip NON json path notation to build output schema
|
130
|
+
if (! name.startsWith("$.")) {
|
131
|
+
continue;
|
132
|
+
}
|
133
|
+
if (column.getSrc().isPresent()) {
|
134
|
+
throw new ConfigException(String.format("columns: src is not supported for json path yet: '%s'", name));
|
135
|
+
}
|
136
|
+
else if (column.getType().isPresent() && column.getDefault().isPresent()) { // add column
|
137
|
+
Type type = column.getType().get();
|
138
|
+
Value defaultValue = getDefault(task, name, type, column);
|
139
|
+
jsonColumnsPut(name, new JsonColumn(name, type, defaultValue));
|
140
|
+
}
|
141
|
+
else {
|
142
|
+
Type type = column.getType().isPresent() ? column.getType().get() : null;
|
143
|
+
jsonColumnsPut(name, new JsonColumn(name, type));
|
144
|
+
}
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
// Add columns to last. If you want to add to head or middle, you can use `columns` option
|
149
|
+
if (addColumns.size() > 0) {
|
150
|
+
for (ColumnConfig column : addColumns) {
|
151
|
+
String name = column.getName();
|
152
|
+
// skip NON json path notation to build output schema
|
153
|
+
if (! name.startsWith("$.")) {
|
154
|
+
continue;
|
155
|
+
}
|
156
|
+
if (column.getSrc().isPresent()) {
|
157
|
+
throw new ConfigException(String.format("add_columns: src is not supported for json path yet: '%s'", name));
|
158
|
+
}
|
159
|
+
else if (column.getType().isPresent() && column.getDefault().isPresent()) { // add column
|
160
|
+
Type type = column.getType().get();
|
161
|
+
Value defaultValue = getDefault(task, name, type, column);
|
162
|
+
jsonAddColumnsPut(name, new JsonColumn(name, type, defaultValue));
|
163
|
+
}
|
164
|
+
else {
|
165
|
+
throw new SchemaConfigException(String.format("add_columns: Column '%s' does not have \"type\" and \"default\"", name));
|
166
|
+
}
|
167
|
+
}
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
// json partial path => Boolean to avoid unnecessary type: json visit
|
172
|
+
private void buildShouldVisitSet()
|
173
|
+
{
|
174
|
+
ArrayList<ColumnConfig> columnConfigs = new ArrayList<>(task.getColumns());
|
175
|
+
columnConfigs.addAll(task.getAddColumns());
|
176
|
+
columnConfigs.addAll(task.getDropColumns());
|
177
|
+
|
178
|
+
for (ColumnConfig columnConfig : columnConfigs) {
|
179
|
+
String name = columnConfig.getName();
|
180
|
+
if (!name.startsWith("$.")) {
|
181
|
+
continue;
|
182
|
+
}
|
183
|
+
String[] parts = name.split("\\.");
|
184
|
+
StringBuilder partialPath = new StringBuilder("$");
|
185
|
+
for (int i = 1; i < parts.length; i++) {
|
186
|
+
if (parts[i].contains("[")) {
|
187
|
+
String[] arrayParts = parts[i].split("\\[");
|
188
|
+
partialPath.append(".").append(arrayParts[0]);
|
189
|
+
this.shouldVisitSet.add(partialPath.toString());
|
190
|
+
for (int j = 1; j < arrayParts.length; j++) {
|
191
|
+
// Supports both [0] and [*]
|
192
|
+
partialPath.append("[").append(arrayParts[j]);
|
193
|
+
this.shouldVisitSet.add(partialPath.toString());
|
194
|
+
}
|
195
|
+
}
|
196
|
+
else {
|
197
|
+
partialPath.append(".").append(parts[i]);
|
198
|
+
this.shouldVisitSet.add(partialPath.toString());
|
199
|
+
}
|
200
|
+
}
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
boolean shouldVisit(String jsonPath)
|
205
|
+
{
|
206
|
+
return shouldVisitSet.contains(jsonPath);
|
207
|
+
}
|
208
|
+
|
209
|
+
String newArrayJsonPath(String rootPath, int i)
|
210
|
+
{
|
211
|
+
String newPath = new StringBuilder(rootPath).append("[").append(Integer.toString(i)).append("]").toString();
|
212
|
+
if (! shouldVisit(newPath)) {
|
213
|
+
newPath = new StringBuilder(rootPath).append("[*]").toString(); // try [*] too
|
214
|
+
}
|
215
|
+
return newPath;
|
216
|
+
}
|
217
|
+
|
218
|
+
String newMapJsonPath(String rootPath, Value elementPathValue)
|
219
|
+
{
|
220
|
+
String elementPath = elementPathValue.asStringValue().asString();
|
221
|
+
String newPath = new StringBuilder(rootPath).append(".").append(elementPath).toString();
|
222
|
+
return newPath;
|
223
|
+
}
|
224
|
+
|
225
|
+
Value visitArray(String rootPath, ArrayValue arrayValue)
|
226
|
+
{
|
227
|
+
int size = arrayValue.size();
|
228
|
+
ArrayList<Value> newValue = new ArrayList<>(size);
|
229
|
+
int j = 0;
|
230
|
+
if (this.jsonDropColumns.containsKey(rootPath)) {
|
231
|
+
HashSet<String> jsonDropColumns = this.jsonDropColumns.get(rootPath);
|
232
|
+
for (int i = 0; i < size; i++) {
|
233
|
+
String newPath = newArrayJsonPath(rootPath, i);
|
234
|
+
if (! jsonDropColumns.contains(newPath)) {
|
235
|
+
newValue.add(j++, visit(newPath, arrayValue.get(i)));
|
236
|
+
}
|
237
|
+
}
|
238
|
+
}
|
239
|
+
else if (this.jsonColumns.containsKey(rootPath)) {
|
240
|
+
LinkedHashMap<String, JsonColumn> jsonColumns = this.jsonColumns.get(rootPath);
|
241
|
+
for (int i = 0; i < size; i++) {
|
242
|
+
String newPath = newArrayJsonPath(rootPath, i);
|
243
|
+
if (jsonColumns.containsKey(newPath)) {
|
244
|
+
newValue.add(j++, visit(newPath, arrayValue.get(i)));
|
245
|
+
}
|
246
|
+
}
|
247
|
+
}
|
248
|
+
else {
|
249
|
+
for (int i = 0; i < size; i++) {
|
250
|
+
String newPath = newArrayJsonPath(rootPath, i);
|
251
|
+
newValue.add(j++, visit(newPath, arrayValue.get(i)));
|
252
|
+
}
|
253
|
+
}
|
254
|
+
if (this.jsonAddColumns.containsKey(rootPath)) {
|
255
|
+
for (JsonColumn jsonColumn : this.jsonAddColumns.get(rootPath).values()) {
|
256
|
+
newValue.add(j++, jsonColumn.getDefaultValue());
|
257
|
+
}
|
258
|
+
}
|
259
|
+
return ValueFactory.newArray(newValue.toArray(new Value[0]), true);
|
260
|
+
}
|
261
|
+
|
262
|
+
Value visitMap(String rootPath, MapValue mapValue)
|
263
|
+
{
|
264
|
+
int size = mapValue.size();
|
265
|
+
int i = 0;
|
266
|
+
ArrayList<Value> newValue = new ArrayList<>(size * 2);
|
267
|
+
if (this.jsonDropColumns.containsKey(rootPath)) {
|
268
|
+
HashSet<String> jsonDropColumns = this.jsonDropColumns.get(rootPath);
|
269
|
+
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
270
|
+
Value k = entry.getKey();
|
271
|
+
Value v = entry.getValue();
|
272
|
+
String newPath = newMapJsonPath(rootPath, k);
|
273
|
+
if (! jsonDropColumns.contains(newPath)) {
|
274
|
+
Value visited = visit(newPath, v);
|
275
|
+
newValue.add(i++, k);
|
276
|
+
newValue.add(i++, visited);
|
277
|
+
}
|
278
|
+
}
|
279
|
+
}
|
280
|
+
else if (this.jsonColumns.containsKey(rootPath)) {
|
281
|
+
Map<Value, Value> map = mapValue.map();
|
282
|
+
for (JsonColumn jsonColumn : jsonColumns.get(rootPath).values()) {
|
283
|
+
Value k = jsonColumn.getElementPathValue();
|
284
|
+
Value v = map.get(k);
|
285
|
+
String newPath = jsonColumn.getName();
|
286
|
+
Value visited = visit(newPath, v);
|
287
|
+
if (visited == null) {
|
288
|
+
visited = jsonColumn.getDefaultValue();
|
289
|
+
}
|
290
|
+
newValue.add(i++, k);
|
291
|
+
newValue.add(i++, visited);
|
292
|
+
}
|
293
|
+
}
|
294
|
+
else {
|
295
|
+
for (Map.Entry<Value, Value> entry : mapValue.entrySet()) {
|
296
|
+
Value k = entry.getKey();
|
297
|
+
Value v = entry.getValue();
|
298
|
+
String newPath = newMapJsonPath(rootPath, k);
|
299
|
+
Value visited = visit(newPath, v);
|
300
|
+
newValue.add(i++, k);
|
301
|
+
newValue.add(i++, visited);
|
302
|
+
}
|
303
|
+
}
|
304
|
+
if (this.jsonAddColumns.containsKey(rootPath)) {
|
305
|
+
for (JsonColumn jsonColumn : this.jsonAddColumns.get(rootPath).values()) {
|
306
|
+
newValue.add(i++, jsonColumn.getElementPathValue());
|
307
|
+
newValue.add(i++, jsonColumn.getDefaultValue());
|
308
|
+
}
|
309
|
+
}
|
310
|
+
return ValueFactory.newMap(newValue.toArray(new Value[0]), true);
|
311
|
+
}
|
312
|
+
|
313
|
+
public Value visit(String rootPath, Value value)
|
314
|
+
{
|
315
|
+
if (! shouldVisit(rootPath)) {
|
316
|
+
return value;
|
317
|
+
}
|
318
|
+
if (value.isArrayValue()) {
|
319
|
+
return visitArray(rootPath, value.asArrayValue());
|
320
|
+
}
|
321
|
+
else if (value.isMapValue()) {
|
322
|
+
return visitMap(rootPath, value.asMapValue());
|
323
|
+
}
|
324
|
+
else {
|
325
|
+
return value;
|
326
|
+
}
|
327
|
+
}
|
328
|
+
}
|