embulk-filter-column 0.4.0 → 0.5.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +6 -0
- data/README.md +23 -1
- data/build.gradle +1 -1
- data/example/columns.yml +2 -1
- data/example/example.yml +35 -0
- data/example/json_add_columns.yml +31 -0
- data/example/json_columns.yml +23 -0
- data/example/json_drop_columns.yml +22 -0
- data/gradlew +3 -3
- data/lib/embulk/filter/column.rb +1 -1
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/filter/column/ColumnFilterPlugin.java +260 -0
- data/src/main/java/org/embulk/filter/column/ColumnVisitorImpl.java +275 -0
- data/src/main/java/org/embulk/filter/column/JsonColumn.java +104 -0
- data/src/main/java/org/embulk/filter/column/JsonVisitor.java +328 -0
- metadata +14 -6
- data/src/main/java/org/embulk/filter/ColumnFilterPlugin.java +0 -462
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e88bb8b22f8f2030982764303175dd77b97a42a
|
4
|
+
data.tar.gz: e7b65b4cc79b92b3aa89596fc3e1253fa7897ace
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cc74b699dc85ec17ff45ab2d991f77219b7fa93ef03d2d444b21d36b83c47689b4d08ebe81dfc0e428134f9a380262367fd8d3cd33987b831d8628e64a79713
|
7
|
+
data.tar.gz: e2cb3d98ec5b678f67d94c8b70854f2b46c5db7544cfbe1f2713653229195c1c57ee2b28c8fe423ab345a01f16b4b7d68337ebfd4c93719512e68c7c35e7a1e2
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -109,6 +109,28 @@ VmjbjAA0tOoSEPv_vKAGMtD_0aXZji0abGe7_VXHmUQ,3962
|
|
109
109
|
C40P5H1WcBx-aWFDJCI8th6QPEI2DOUgupt_gB8UutE,7323
|
110
110
|
```
|
111
111
|
|
112
|
+
## JSONPath (like) name
|
113
|
+
|
114
|
+
For a `type: json` column, you can specify a [JSONPath](http://goessner.net/articles/JsonPath/) as the column's name, for example:
|
115
|
+
|
116
|
+
```
|
117
|
+
$.payload.key1
|
118
|
+
$.payload.array[0]
|
119
|
+
$.payload.array[*]
|
120
|
+
```
|
121
|
+
|
122
|
+
EXAMPLE:
|
123
|
+
|
124
|
+
* [example/json_columns.yml](example/json_columns.yml)
|
125
|
+
* [example/json_add_columns.yml](example/json_add_columns.yml)
|
126
|
+
* [example/json_drop_columns.yml](example/json_drop_columns.yml)
|
127
|
+
|
128
|
+
NOTE:
|
129
|
+
|
130
|
+
* JSONPath syntax is not fully supported
|
131
|
+
* Embulk's `type: json` cannot hold a timestamp value, so `type: timestamp` with `default` is not available for JSONPath names in `add_columns` or `columns`
|
132
|
+
* `src` for `add_columns` or `columns` is not supported yet
|
133
|
+
|
112
134
|
## ToDo
|
113
135
|
|
114
136
|
* Write test
|
@@ -119,7 +141,7 @@ Run example:
|
|
119
141
|
|
120
142
|
```
|
121
143
|
$ ./gradlew classpath
|
122
|
-
$ embulk
|
144
|
+
$ embulk preview -I lib example/example.yml
|
123
145
|
```
|
124
146
|
|
125
147
|
Run test:
|
data/build.gradle
CHANGED
data/example/columns.yml
CHANGED
data/example/example.yml
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# in:
|
2
|
+
# type: random
|
3
|
+
# rows: 100
|
4
|
+
# schema:
|
5
|
+
# id: primary_key
|
6
|
+
# name: string
|
7
|
+
# score: integer
|
8
|
+
in:
|
9
|
+
type: file
|
10
|
+
path_prefix: example/example.csv
|
11
|
+
parser:
|
12
|
+
type: csv
|
13
|
+
charset: UTF-8
|
14
|
+
newline: CRLF
|
15
|
+
null_string: 'NULL'
|
16
|
+
skip_header_lines: 1
|
17
|
+
comment_line_marker: '#'
|
18
|
+
columns:
|
19
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
20
|
+
- {name: id, type: long}
|
21
|
+
- {name: name, type: string}
|
22
|
+
- {name: score, type: double}
|
23
|
+
- {name: json, type: json}
|
24
|
+
filters:
|
25
|
+
- type: column
|
26
|
+
columns:
|
27
|
+
- {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
|
28
|
+
- {name: name, default: "foo"}
|
29
|
+
- {name: foo, default: 1, type: long}
|
30
|
+
- {name: id}
|
31
|
+
- {name: copy_score, src: score}
|
32
|
+
- {name: json, default: "{\"foo\":\"FOO\"}"}
|
33
|
+
- {name: $.json.foo}
|
34
|
+
out:
|
35
|
+
type: stdout
|
@@ -0,0 +1,31 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: 'NULL'
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: id, type: long}
|
14
|
+
- {name: name, type: string}
|
15
|
+
- {name: score, type: double}
|
16
|
+
- {name: json, type: json}
|
17
|
+
filters:
|
18
|
+
- type: column
|
19
|
+
default_timezone: "Asia/Tokyo"
|
20
|
+
default_timestamp_format: "%Y-%m-%d"
|
21
|
+
columns:
|
22
|
+
- {name: time}
|
23
|
+
- {name: id}
|
24
|
+
- {name: name}
|
25
|
+
- {name: score}
|
26
|
+
- {name: json, default: "{}"}
|
27
|
+
add_columns:
|
28
|
+
- {name: $.json.foo, type: long, default: 1}
|
29
|
+
- {name: $.json.d, type: string, default: "2015-07-13"}
|
30
|
+
out:
|
31
|
+
type: stdout
|
@@ -0,0 +1,23 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: 'NULL'
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: id, type: long}
|
14
|
+
- {name: name, type: string}
|
15
|
+
- {name: score, type: double}
|
16
|
+
- {name: json, type: json}
|
17
|
+
filters:
|
18
|
+
- type: column
|
19
|
+
columns:
|
20
|
+
- {name: json, default: "{\"foo\":\"FOO\"}"}
|
21
|
+
- {name: $.json.foo}
|
22
|
+
out:
|
23
|
+
type: stdout
|
@@ -0,0 +1,22 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: 'NULL'
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: id, type: long}
|
14
|
+
- {name: name, type: string}
|
15
|
+
- {name: score, type: double}
|
16
|
+
- {name: json, type: json}
|
17
|
+
filters:
|
18
|
+
- type: column
|
19
|
+
drop_columns:
|
20
|
+
- {name: $.json.foo }
|
21
|
+
out:
|
22
|
+
type: stdout
|
data/gradlew
CHANGED
@@ -112,8 +112,8 @@ fi
|
|
112
112
|
|
113
113
|
# For Cygwin, switch paths to Windows format before running java
|
114
114
|
if $cygwin ; then
|
115
|
-
APP_HOME=`cygpath --
|
116
|
-
CLASSPATH=`cygpath --
|
115
|
+
APP_HOME=`cygpath --name --mixed "$APP_HOME"`
|
116
|
+
CLASSPATH=`cygpath --name --mixed "$CLASSPATH"`
|
117
117
|
|
118
118
|
# We build the pattern for arguments to be converted via cygpath
|
119
119
|
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
@@ -134,7 +134,7 @@ if $cygwin ; then
|
|
134
134
|
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
|
135
135
|
|
136
136
|
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
|
137
|
-
eval `echo args$i`=`cygpath --
|
137
|
+
eval `echo args$i`=`cygpath --name --ignore --mixed "$arg"`
|
138
138
|
else
|
139
139
|
eval `echo args$i`="\"$arg\""
|
140
140
|
fi
|
data/lib/embulk/filter/column.rb
CHANGED
data/settings.gradle
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rootProject.name = 'embulk-filter-column'
|
@@ -0,0 +1,260 @@
|
|
1
|
+
package org.embulk.filter.column;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import com.google.common.collect.ImmutableList;
|
5
|
+
|
6
|
+
import org.embulk.config.Config;
|
7
|
+
import org.embulk.config.ConfigDefault;
|
8
|
+
import org.embulk.config.ConfigException;
|
9
|
+
import org.embulk.config.ConfigSource;
|
10
|
+
import org.embulk.config.Task;
|
11
|
+
import org.embulk.config.TaskSource;
|
12
|
+
|
13
|
+
import org.embulk.spi.Column;
|
14
|
+
import org.embulk.spi.Exec;
|
15
|
+
import org.embulk.spi.FilterPlugin;
|
16
|
+
import org.embulk.spi.Page;
|
17
|
+
import org.embulk.spi.PageBuilder;
|
18
|
+
import org.embulk.spi.PageOutput;
|
19
|
+
import org.embulk.spi.PageReader;
|
20
|
+
import org.embulk.spi.Schema;
|
21
|
+
import org.embulk.spi.SchemaConfigException;
|
22
|
+
import org.embulk.spi.time.TimestampParser;
|
23
|
+
import org.embulk.spi.type.Type;
|
24
|
+
|
25
|
+
import org.joda.time.DateTimeZone;
|
26
|
+
import org.slf4j.Logger;
|
27
|
+
|
28
|
+
import java.util.List;
|
29
|
+
|
30
|
+
public class ColumnFilterPlugin implements FilterPlugin
|
31
|
+
{
|
32
|
+
private static final Logger logger = Exec.getLogger(ColumnFilterPlugin.class);
|
33
|
+
|
34
|
+
public ColumnFilterPlugin()
|
35
|
+
{
|
36
|
+
}
|
37
|
+
|
38
|
+
// NOTE: This is not spi.ColumnConfig
|
39
|
+
interface ColumnConfig extends Task
|
40
|
+
{
|
41
|
+
@Config("name")
|
42
|
+
public String getName();
|
43
|
+
|
44
|
+
@Config("type")
|
45
|
+
@ConfigDefault("null")
|
46
|
+
public Optional<Type> getType(); // required only for addColumns
|
47
|
+
|
48
|
+
@Config("default")
|
49
|
+
@ConfigDefault("null")
|
50
|
+
public Optional<Object> getDefault();
|
51
|
+
|
52
|
+
@Config("format")
|
53
|
+
@ConfigDefault("null")
|
54
|
+
public Optional<String> getFormat();
|
55
|
+
|
56
|
+
@Config("timezone")
|
57
|
+
@ConfigDefault("null")
|
58
|
+
public Optional<DateTimeZone> getTimeZone();
|
59
|
+
|
60
|
+
@Config("src")
|
61
|
+
@ConfigDefault("null")
|
62
|
+
public Optional<String> getSrc();
|
63
|
+
}
|
64
|
+
|
65
|
+
interface PluginTask extends Task, TimestampParser.Task
|
66
|
+
{
|
67
|
+
@Config("columns")
|
68
|
+
@ConfigDefault("[]")
|
69
|
+
public List<ColumnConfig> getColumns();
|
70
|
+
|
71
|
+
@Config("add_columns")
|
72
|
+
@ConfigDefault("[]")
|
73
|
+
public List<ColumnConfig> getAddColumns();
|
74
|
+
|
75
|
+
@Config("drop_columns")
|
76
|
+
@ConfigDefault("[]")
|
77
|
+
public List<ColumnConfig> getDropColumns();
|
78
|
+
|
79
|
+
// See TimestampParser for default_timestamp_format, and default_timezone
|
80
|
+
}
|
81
|
+
|
82
|
+
@Override
|
83
|
+
public void transaction(final ConfigSource config, final Schema inputSchema,
|
84
|
+
final FilterPlugin.Control control)
|
85
|
+
{
|
86
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
87
|
+
|
88
|
+
configure(task);
|
89
|
+
Schema outputSchema = buildOutputSchema(task, inputSchema);
|
90
|
+
|
91
|
+
control.run(task.dump(), outputSchema);
|
92
|
+
}
|
93
|
+
|
94
|
+
private void configure(PluginTask task)
|
95
|
+
{
|
96
|
+
List<ColumnConfig> columns = task.getColumns();
|
97
|
+
List<ColumnConfig> addColumns = task.getAddColumns();
|
98
|
+
List<ColumnConfig> dropColumns = task.getDropColumns();
|
99
|
+
|
100
|
+
if (columns.size() == 0 && addColumns.size() == 0 && dropColumns.size() == 0) {
|
101
|
+
throw new ConfigException("One of \"columns\", \"add_columns\", \"drop_columns\" must be specified.");
|
102
|
+
}
|
103
|
+
|
104
|
+
if (columns.size() > 0 && dropColumns.size() > 0) {
|
105
|
+
throw new ConfigException("Either of \"columns\", \"drop_columns\" can be specified.");
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
109
|
+
private Schema buildOutputSchema(PluginTask task, Schema inputSchema)
|
110
|
+
{
|
111
|
+
List<ColumnConfig> columns = task.getColumns();
|
112
|
+
List<ColumnConfig> addColumns = task.getAddColumns();
|
113
|
+
List<ColumnConfig> dropColumns = task.getDropColumns();
|
114
|
+
|
115
|
+
// Automatically get column type from inputSchema for columns and dropColumns
|
116
|
+
ImmutableList.Builder<Column> builder = ImmutableList.builder();
|
117
|
+
int i = 0;
|
118
|
+
if (dropColumns.size() > 0) {
|
119
|
+
for (Column inputColumn : inputSchema.getColumns()) {
|
120
|
+
String name = inputColumn.getName();
|
121
|
+
boolean matched = false;
|
122
|
+
for (ColumnConfig dropColumn : dropColumns) {
|
123
|
+
// skip json path notation to build outputSchema
|
124
|
+
if (dropColumn.getName().startsWith("$.")) {
|
125
|
+
continue;
|
126
|
+
}
|
127
|
+
if (dropColumn.getName().equals(name)) {
|
128
|
+
matched = true;
|
129
|
+
break;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
if (! matched) {
|
133
|
+
Column outputColumn = new Column(i++, name, inputColumn.getType());
|
134
|
+
builder.add(outputColumn);
|
135
|
+
}
|
136
|
+
}
|
137
|
+
}
|
138
|
+
else if (columns.size() > 0) {
|
139
|
+
for (ColumnConfig column : columns) {
|
140
|
+
// skip json path notation to build output schema
|
141
|
+
if (column.getName().startsWith("$.")) {
|
142
|
+
continue;
|
143
|
+
}
|
144
|
+
if (column.getSrc().isPresent() && column.getSrc().get().startsWith("$.")) {
|
145
|
+
continue;
|
146
|
+
}
|
147
|
+
|
148
|
+
String name = column.getName();
|
149
|
+
Optional<Type> type = column.getType();
|
150
|
+
Optional<Object> defaultValue = column.getDefault();
|
151
|
+
Optional<String> src = column.getSrc();
|
152
|
+
|
153
|
+
String srcName = src.isPresent() ? src.get() : name;
|
154
|
+
Column inputColumn;
|
155
|
+
try {
|
156
|
+
inputColumn = inputSchema.lookupColumn(srcName);
|
157
|
+
}
|
158
|
+
catch (SchemaConfigException ex) {
|
159
|
+
inputColumn = null;
|
160
|
+
}
|
161
|
+
if (inputColumn != null) { // filter or copy column
|
162
|
+
Column outputColumn = new Column(i++, name, inputColumn.getType());
|
163
|
+
builder.add(outputColumn);
|
164
|
+
}
|
165
|
+
else if (type.isPresent() && defaultValue.isPresent()) { // add column
|
166
|
+
Column outputColumn = new Column(i++, name, type.get());
|
167
|
+
builder.add(outputColumn);
|
168
|
+
}
|
169
|
+
else {
|
170
|
+
throw new SchemaConfigException(String.format("columns: Column src '%s' is not found in inputSchema. Column '%s' does not have \"type\" and \"default\"", srcName, name));
|
171
|
+
}
|
172
|
+
}
|
173
|
+
}
|
174
|
+
else {
|
175
|
+
for (Column column : inputSchema.getColumns()) {
|
176
|
+
Column outputColumn = new Column(i++, column.getName(), column.getType());
|
177
|
+
builder.add(outputColumn);
|
178
|
+
}
|
179
|
+
}
|
180
|
+
|
181
|
+
// Add columns to last. If you want to add to head or middle, you can use `columns` option
|
182
|
+
if (addColumns.size() > 0) {
|
183
|
+
for (ColumnConfig column : addColumns) {
|
184
|
+
// skip json path notation to build output schema
|
185
|
+
if (column.getName().startsWith("$.")) {
|
186
|
+
continue;
|
187
|
+
}
|
188
|
+
if (column.getSrc().isPresent() && column.getSrc().get().startsWith("$.")) {
|
189
|
+
continue;
|
190
|
+
}
|
191
|
+
|
192
|
+
String name = column.getName();
|
193
|
+
Optional<Type> type = column.getType();
|
194
|
+
Optional<Object> defaultValue = column.getDefault();
|
195
|
+
Optional<String> src = column.getSrc();
|
196
|
+
|
197
|
+
String srcName = null;
|
198
|
+
Column inputColumn = null;
|
199
|
+
if (src.isPresent()) {
|
200
|
+
srcName = src.get();
|
201
|
+
try {
|
202
|
+
inputColumn = inputSchema.lookupColumn(srcName);
|
203
|
+
}
|
204
|
+
catch (SchemaConfigException ex) {
|
205
|
+
inputColumn = null;
|
206
|
+
}
|
207
|
+
}
|
208
|
+
if (inputColumn != null) { // copy column
|
209
|
+
Column outputColumn = new Column(i++, name, inputColumn.getType());
|
210
|
+
builder.add(outputColumn);
|
211
|
+
}
|
212
|
+
else if (type.isPresent() && defaultValue.isPresent()) { // add column
|
213
|
+
Column outputColumn = new Column(i++, name, type.get());
|
214
|
+
builder.add(outputColumn);
|
215
|
+
}
|
216
|
+
else {
|
217
|
+
throw new SchemaConfigException(String.format("add_columns: Column src '%s' is not found in inputSchema, Column '%s' does not have \"type\" and \"default\"", srcName, name));
|
218
|
+
}
|
219
|
+
}
|
220
|
+
}
|
221
|
+
|
222
|
+
return new Schema(builder.build());
|
223
|
+
}
|
224
|
+
|
225
|
+
@Override
|
226
|
+
public PageOutput open(final TaskSource taskSource, final Schema inputSchema,
|
227
|
+
final Schema outputSchema, final PageOutput output)
|
228
|
+
{
|
229
|
+
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
230
|
+
|
231
|
+
return new PageOutput() {
|
232
|
+
private PageReader pageReader = new PageReader(inputSchema);
|
233
|
+
private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
234
|
+
private ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, inputSchema, outputSchema, pageReader, pageBuilder);
|
235
|
+
|
236
|
+
@Override
|
237
|
+
public void finish()
|
238
|
+
{
|
239
|
+
pageBuilder.finish();
|
240
|
+
}
|
241
|
+
|
242
|
+
@Override
|
243
|
+
public void close()
|
244
|
+
{
|
245
|
+
pageBuilder.close();
|
246
|
+
}
|
247
|
+
|
248
|
+
@Override
|
249
|
+
public void add(Page page)
|
250
|
+
{
|
251
|
+
pageReader.setPage(page);
|
252
|
+
|
253
|
+
while (pageReader.nextRecord()) {
|
254
|
+
outputSchema.visitColumns(visitor);
|
255
|
+
pageBuilder.addRecord();
|
256
|
+
}
|
257
|
+
}
|
258
|
+
};
|
259
|
+
}
|
260
|
+
}
|