embulk-filter-column 0.4.0 → 0.5.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 572e638f806833f390196e60aa5791c0f23aff76
4
- data.tar.gz: db719c3a30d31e86bea2f75afaa09849c92ec631
3
+ metadata.gz: 1e88bb8b22f8f2030982764303175dd77b97a42a
4
+ data.tar.gz: e7b65b4cc79b92b3aa89596fc3e1253fa7897ace
5
5
  SHA512:
6
- metadata.gz: 6a4c06e967fc14763e386326166ae4b0ae9247fca8a158dc41e25a9bb430e26430d70fb2603abd29fedfb54f822a2fac152078628f62eb3fe816bb31a7882d45
7
- data.tar.gz: edb08b9d1b7fc6e53a41d86ab82a61a7f848737bf37f6baa0ec6048c5475c5040db4f165fb5306a21f4fef6e69b7afdce4f78855381c1c5b48e5aaec4fa3b303
6
+ metadata.gz: 7cc74b699dc85ec17ff45ab2d991f77219b7fa93ef03d2d444b21d36b83c47689b4d08ebe81dfc0e428134f9a380262367fd8d3cd33987b831d8628e64a79713
7
+ data.tar.gz: e2cb3d98ec5b678f67d94c8b70854f2b46c5db7544cfbe1f2713653229195c1c57ee2b28c8fe423ab345a01f16b4b7d68337ebfd4c93719512e68c7c35e7a1e2
data/.gitignore CHANGED
@@ -10,3 +10,4 @@ build/
10
10
  .tags
11
11
  .ruby-version
12
12
  *.iml
13
+ .DS_Store
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ # 0.5.0.pre1 (2016-05-24)
2
+
3
+ Enhancements:
4
+
5
+ * Support JSONPath (like) name
6
+
1
7
  # 0.4.0 (2016-02-01)
2
8
 
3
9
  Enhancements:
data/README.md CHANGED
@@ -109,6 +109,28 @@ VmjbjAA0tOoSEPv_vKAGMtD_0aXZji0abGe7_VXHmUQ,3962
109
109
  C40P5H1WcBx-aWFDJCI8th6QPEI2DOUgupt_gB8UutE,7323
110
110
  ```
111
111
 
112
+ ## JSONPath (like) name
113
+
114
+ For a `type: json` column, you can specify a [JSONPath](http://goessner.net/articles/JsonPath/) expression as the column's name, for example:
115
+
116
+ ```
117
+ $.payload.key1
118
+ $.payload.array[0]
119
+ $.payload.array[*]
120
+ ```
121
+
122
+ EXAMPLE:
123
+
124
+ * [example/json_columns.yml](example/json_columns.yml)
125
+ * [example/json_add_columns.yml](example/json_add_columns.yml)
126
+ * [example/json_drop_columns.yml](example/json_drop_columns.yml)
127
+
128
+ NOTE:
129
+
130
+ * JSONPath syntax is not fully supported
131
+ * Embulk's type: json cannot have a timestamp column, so `type: timestamp` for `add_columns` or `columns` with default is not available
132
+ * `src` for `add_columns` or `columns` is not supported yet
133
+
112
134
  ## ToDo
113
135
 
114
136
  * Write test
@@ -119,7 +141,7 @@ Run example:
119
141
 
120
142
  ```
121
143
  $ ./gradlew classpath
122
- $ embulk run -I lib example.yml
144
+ $ embulk preview -I lib example/example.yml
123
145
  ```
124
146
 
125
147
  Run test:
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.4.0"
16
+ version = "0.5.0.pre1"
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
data/example/columns.yml CHANGED
@@ -29,6 +29,7 @@ filters:
29
29
  - {name: foo, default: 1, type: long}
30
30
  - {name: id}
31
31
  - {name: copy_score, src: score}
32
- - {name: json, default: "{\"foo\": \"FOO\"}"}
32
+ - {name: json, default: "{\"foo\":\"FOO\"}"}
33
+ - {name: $.json.foo}
33
34
  out:
34
35
  type: stdout
@@ -0,0 +1,35 @@
1
+ # in:
2
+ # type: random
3
+ # rows: 100
4
+ # schema:
5
+ # id: primary_key
6
+ # name: string
7
+ # score: integer
8
+ in:
9
+ type: file
10
+ path_prefix: example/example.csv
11
+ parser:
12
+ type: csv
13
+ charset: UTF-8
14
+ newline: CRLF
15
+ null_string: 'NULL'
16
+ skip_header_lines: 1
17
+ comment_line_marker: '#'
18
+ columns:
19
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
20
+ - {name: id, type: long}
21
+ - {name: name, type: string}
22
+ - {name: score, type: double}
23
+ - {name: json, type: json}
24
+ filters:
25
+ - type: column
26
+ columns:
27
+ - {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
28
+ - {name: name, default: "foo"}
29
+ - {name: foo, default: 1, type: long}
30
+ - {name: id}
31
+ - {name: copy_score, src: score}
32
+ - {name: json, default: "{\"foo\":\"FOO\"}"}
33
+ - {name: $.json.foo}
34
+ out:
35
+ type: stdout
@@ -0,0 +1,31 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: id, type: long}
14
+ - {name: name, type: string}
15
+ - {name: score, type: double}
16
+ - {name: json, type: json}
17
+ filters:
18
+ - type: column
19
+ default_timezone: "Asia/Tokyo"
20
+ default_timestamp_format: "%Y-%m-%d"
21
+ columns:
22
+ - {name: time}
23
+ - {name: id}
24
+ - {name: name}
25
+ - {name: score}
26
+ - {name: json, default: "{}"}
27
+ add_columns:
28
+ - {name: $.json.foo, type: long, default: 1}
29
+ - {name: $.json.d, type: string, default: "2015-07-13"}
30
+ out:
31
+ type: stdout
@@ -0,0 +1,23 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: id, type: long}
14
+ - {name: name, type: string}
15
+ - {name: score, type: double}
16
+ - {name: json, type: json}
17
+ filters:
18
+ - type: column
19
+ columns:
20
+ - {name: json, default: "{\"foo\":\"FOO\"}"}
21
+ - {name: $.json.foo}
22
+ out:
23
+ type: stdout
@@ -0,0 +1,22 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: id, type: long}
14
+ - {name: name, type: string}
15
+ - {name: score, type: double}
16
+ - {name: json, type: json}
17
+ filters:
18
+ - type: column
19
+ drop_columns:
20
+ - {name: $.json.foo }
21
+ out:
22
+ type: stdout
data/gradlew CHANGED
@@ -112,8 +112,8 @@ fi
112
112
 
113
113
  # For Cygwin, switch paths to Windows format before running java
114
114
  if $cygwin ; then
115
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
115
+ APP_HOME=`cygpath --name --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --name --mixed "$CLASSPATH"`
117
117
 
118
118
  # We build the pattern for arguments to be converted via cygpath
119
119
  ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
@@ -134,7 +134,7 @@ if $cygwin ; then
134
134
  CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
135
 
136
136
  if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
137
+ eval `echo args$i`=`cygpath --name --ignore --mixed "$arg"`
138
138
  else
139
139
  eval `echo args$i`="\"$arg\""
140
140
  fi
@@ -1,3 +1,3 @@
1
1
  Embulk::JavaPlugin.register_filter(
2
- "column", "org.embulk.filter.ColumnFilterPlugin",
2
+ "column", "org.embulk.filter.column.ColumnFilterPlugin",
3
3
  File.expand_path('../../../../classpath', __FILE__))
data/settings.gradle ADDED
@@ -0,0 +1 @@
1
+ rootProject.name = 'embulk-filter-column'
@@ -0,0 +1,260 @@
1
+ package org.embulk.filter.column;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.collect.ImmutableList;
5
+
6
+ import org.embulk.config.Config;
7
+ import org.embulk.config.ConfigDefault;
8
+ import org.embulk.config.ConfigException;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.Task;
11
+ import org.embulk.config.TaskSource;
12
+
13
+ import org.embulk.spi.Column;
14
+ import org.embulk.spi.Exec;
15
+ import org.embulk.spi.FilterPlugin;
16
+ import org.embulk.spi.Page;
17
+ import org.embulk.spi.PageBuilder;
18
+ import org.embulk.spi.PageOutput;
19
+ import org.embulk.spi.PageReader;
20
+ import org.embulk.spi.Schema;
21
+ import org.embulk.spi.SchemaConfigException;
22
+ import org.embulk.spi.time.TimestampParser;
23
+ import org.embulk.spi.type.Type;
24
+
25
+ import org.joda.time.DateTimeZone;
26
+ import org.slf4j.Logger;
27
+
28
+ import java.util.List;
29
+
30
+ public class ColumnFilterPlugin implements FilterPlugin
31
+ {
32
+ private static final Logger logger = Exec.getLogger(ColumnFilterPlugin.class);
33
+
34
+ public ColumnFilterPlugin()
35
+ {
36
+ }
37
+
38
+ // NOTE: This is not spi.ColumnConfig
39
+ interface ColumnConfig extends Task
40
+ {
41
+ @Config("name")
42
+ public String getName();
43
+
44
+ @Config("type")
45
+ @ConfigDefault("null")
46
+ public Optional<Type> getType(); // required only for addColumns
47
+
48
+ @Config("default")
49
+ @ConfigDefault("null")
50
+ public Optional<Object> getDefault();
51
+
52
+ @Config("format")
53
+ @ConfigDefault("null")
54
+ public Optional<String> getFormat();
55
+
56
+ @Config("timezone")
57
+ @ConfigDefault("null")
58
+ public Optional<DateTimeZone> getTimeZone();
59
+
60
+ @Config("src")
61
+ @ConfigDefault("null")
62
+ public Optional<String> getSrc();
63
+ }
64
+
65
+ interface PluginTask extends Task, TimestampParser.Task
66
+ {
67
+ @Config("columns")
68
+ @ConfigDefault("[]")
69
+ public List<ColumnConfig> getColumns();
70
+
71
+ @Config("add_columns")
72
+ @ConfigDefault("[]")
73
+ public List<ColumnConfig> getAddColumns();
74
+
75
+ @Config("drop_columns")
76
+ @ConfigDefault("[]")
77
+ public List<ColumnConfig> getDropColumns();
78
+
79
+ // See TimestampParser for default_timestamp_format, and default_timezone
80
+ }
81
+
82
+ @Override
83
+ public void transaction(final ConfigSource config, final Schema inputSchema,
84
+ final FilterPlugin.Control control)
85
+ {
86
+ PluginTask task = config.loadConfig(PluginTask.class);
87
+
88
+ configure(task);
89
+ Schema outputSchema = buildOutputSchema(task, inputSchema);
90
+
91
+ control.run(task.dump(), outputSchema);
92
+ }
93
+
94
+ private void configure(PluginTask task)
95
+ {
96
+ List<ColumnConfig> columns = task.getColumns();
97
+ List<ColumnConfig> addColumns = task.getAddColumns();
98
+ List<ColumnConfig> dropColumns = task.getDropColumns();
99
+
100
+ if (columns.size() == 0 && addColumns.size() == 0 && dropColumns.size() == 0) {
101
+ throw new ConfigException("One of \"columns\", \"add_columns\", \"drop_columns\" must be specified.");
102
+ }
103
+
104
+ if (columns.size() > 0 && dropColumns.size() > 0) {
105
+ throw new ConfigException("Either of \"columns\", \"drop_columns\" can be specified.");
106
+ }
107
+ }
108
+
109
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema)
110
+ {
111
+ List<ColumnConfig> columns = task.getColumns();
112
+ List<ColumnConfig> addColumns = task.getAddColumns();
113
+ List<ColumnConfig> dropColumns = task.getDropColumns();
114
+
115
+ // Automatically get column type from inputSchema for columns and dropColumns
116
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
117
+ int i = 0;
118
+ if (dropColumns.size() > 0) {
119
+ for (Column inputColumn : inputSchema.getColumns()) {
120
+ String name = inputColumn.getName();
121
+ boolean matched = false;
122
+ for (ColumnConfig dropColumn : dropColumns) {
123
+ // skip json path notation to build outputSchema
124
+ if (dropColumn.getName().startsWith("$.")) {
125
+ continue;
126
+ }
127
+ if (dropColumn.getName().equals(name)) {
128
+ matched = true;
129
+ break;
130
+ }
131
+ }
132
+ if (! matched) {
133
+ Column outputColumn = new Column(i++, name, inputColumn.getType());
134
+ builder.add(outputColumn);
135
+ }
136
+ }
137
+ }
138
+ else if (columns.size() > 0) {
139
+ for (ColumnConfig column : columns) {
140
+ // skip json path notation to build output schema
141
+ if (column.getName().startsWith("$.")) {
142
+ continue;
143
+ }
144
+ if (column.getSrc().isPresent() && column.getSrc().get().startsWith("$.")) {
145
+ continue;
146
+ }
147
+
148
+ String name = column.getName();
149
+ Optional<Type> type = column.getType();
150
+ Optional<Object> defaultValue = column.getDefault();
151
+ Optional<String> src = column.getSrc();
152
+
153
+ String srcName = src.isPresent() ? src.get() : name;
154
+ Column inputColumn;
155
+ try {
156
+ inputColumn = inputSchema.lookupColumn(srcName);
157
+ }
158
+ catch (SchemaConfigException ex) {
159
+ inputColumn = null;
160
+ }
161
+ if (inputColumn != null) { // filter or copy column
162
+ Column outputColumn = new Column(i++, name, inputColumn.getType());
163
+ builder.add(outputColumn);
164
+ }
165
+ else if (type.isPresent() && defaultValue.isPresent()) { // add column
166
+ Column outputColumn = new Column(i++, name, type.get());
167
+ builder.add(outputColumn);
168
+ }
169
+ else {
170
+ throw new SchemaConfigException(String.format("columns: Column src '%s' is not found in inputSchema. Column '%s' does not have \"type\" and \"default\"", srcName, name));
171
+ }
172
+ }
173
+ }
174
+ else {
175
+ for (Column column : inputSchema.getColumns()) {
176
+ Column outputColumn = new Column(i++, column.getName(), column.getType());
177
+ builder.add(outputColumn);
178
+ }
179
+ }
180
+
181
+ // Add columns to last. If you want to add to head or middle, you can use `columns` option
182
+ if (addColumns.size() > 0) {
183
+ for (ColumnConfig column : addColumns) {
184
+ // skip json path notation to build output schema
185
+ if (column.getName().startsWith("$.")) {
186
+ continue;
187
+ }
188
+ if (column.getSrc().isPresent() && column.getSrc().get().startsWith("$.")) {
189
+ continue;
190
+ }
191
+
192
+ String name = column.getName();
193
+ Optional<Type> type = column.getType();
194
+ Optional<Object> defaultValue = column.getDefault();
195
+ Optional<String> src = column.getSrc();
196
+
197
+ String srcName = null;
198
+ Column inputColumn = null;
199
+ if (src.isPresent()) {
200
+ srcName = src.get();
201
+ try {
202
+ inputColumn = inputSchema.lookupColumn(srcName);
203
+ }
204
+ catch (SchemaConfigException ex) {
205
+ inputColumn = null;
206
+ }
207
+ }
208
+ if (inputColumn != null) { // copy column
209
+ Column outputColumn = new Column(i++, name, inputColumn.getType());
210
+ builder.add(outputColumn);
211
+ }
212
+ else if (type.isPresent() && defaultValue.isPresent()) { // add column
213
+ Column outputColumn = new Column(i++, name, type.get());
214
+ builder.add(outputColumn);
215
+ }
216
+ else {
217
+ throw new SchemaConfigException(String.format("add_columns: Column src '%s' is not found in inputSchema, Column '%s' does not have \"type\" and \"default\"", srcName, name));
218
+ }
219
+ }
220
+ }
221
+
222
+ return new Schema(builder.build());
223
+ }
224
+
225
+ @Override
226
+ public PageOutput open(final TaskSource taskSource, final Schema inputSchema,
227
+ final Schema outputSchema, final PageOutput output)
228
+ {
229
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
230
+
231
+ return new PageOutput() {
232
+ private PageReader pageReader = new PageReader(inputSchema);
233
+ private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
234
+ private ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, inputSchema, outputSchema, pageReader, pageBuilder);
235
+
236
+ @Override
237
+ public void finish()
238
+ {
239
+ pageBuilder.finish();
240
+ }
241
+
242
+ @Override
243
+ public void close()
244
+ {
245
+ pageBuilder.close();
246
+ }
247
+
248
+ @Override
249
+ public void add(Page page)
250
+ {
251
+ pageReader.setPage(page);
252
+
253
+ while (pageReader.nextRecord()) {
254
+ outputSchema.visitColumns(visitor);
255
+ pageBuilder.addRecord();
256
+ }
257
+ }
258
+ };
259
+ }
260
+ }