embulk-filter-column 0.4.0 → 0.5.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 572e638f806833f390196e60aa5791c0f23aff76
4
- data.tar.gz: db719c3a30d31e86bea2f75afaa09849c92ec631
3
+ metadata.gz: 1e88bb8b22f8f2030982764303175dd77b97a42a
4
+ data.tar.gz: e7b65b4cc79b92b3aa89596fc3e1253fa7897ace
5
5
  SHA512:
6
- metadata.gz: 6a4c06e967fc14763e386326166ae4b0ae9247fca8a158dc41e25a9bb430e26430d70fb2603abd29fedfb54f822a2fac152078628f62eb3fe816bb31a7882d45
7
- data.tar.gz: edb08b9d1b7fc6e53a41d86ab82a61a7f848737bf37f6baa0ec6048c5475c5040db4f165fb5306a21f4fef6e69b7afdce4f78855381c1c5b48e5aaec4fa3b303
6
+ metadata.gz: 7cc74b699dc85ec17ff45ab2d991f77219b7fa93ef03d2d444b21d36b83c47689b4d08ebe81dfc0e428134f9a380262367fd8d3cd33987b831d8628e64a79713
7
+ data.tar.gz: e2cb3d98ec5b678f67d94c8b70854f2b46c5db7544cfbe1f2713653229195c1c57ee2b28c8fe423ab345a01f16b4b7d68337ebfd4c93719512e68c7c35e7a1e2
data/.gitignore CHANGED
@@ -10,3 +10,4 @@ build/
10
10
  .tags
11
11
  .ruby-version
12
12
  *.iml
13
+ .DS_Store
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ # 0.5.0.pre1 (2016-05-24)
2
+
3
+ Enhancements:
4
+
5
+ * Support JSONPath (like) name
6
+
1
7
  # 0.4.0 (2016-02-01)
2
8
 
3
9
  Enhancements:
data/README.md CHANGED
@@ -109,6 +109,28 @@ VmjbjAA0tOoSEPv_vKAGMtD_0aXZji0abGe7_VXHmUQ,3962
109
109
  C40P5H1WcBx-aWFDJCI8th6QPEI2DOUgupt_gB8UutE,7323
110
110
  ```
111
111
 
112
+ ## JSONPath (like) name
113
+
114
+ For a `type: json` column, you can specify a [JSONPath](http://goessner.net/articles/JsonPath/) expression as the column's name, for example:
115
+
116
+ ```
117
+ $.payload.key1
118
+ $.payload.array[0]
119
+ $.payload.array[*]
120
+ ```
121
+
122
+ EXAMPLE:
123
+
124
+ * [example/json_columns.yml](example/json_columns.yml)
125
+ * [example/json_add_columns.yml](example/json_add_columns.yml)
126
+ * [example/json_drop_columns.yml](example/json_drop_columns.yml)
127
+
128
+ NOTE:
129
+
130
+ * JSONPath syntax is not fully supported
131
+ * Embulk's type: json cannot hold a timestamp value, so `type: timestamp` for `add_columns` or `columns` with a default is not available
132
+ * `src` for `add_columns` or `columns` is not supported yet
133
+
112
134
  ## ToDo
113
135
 
114
136
  * Write test
@@ -119,7 +141,7 @@ Run example:
119
141
 
120
142
  ```
121
143
  $ ./gradlew classpath
122
- $ embulk run -I lib example.yml
144
+ $ embulk preview -I lib example/example.yml
123
145
  ```
124
146
 
125
147
  Run test:
data/build.gradle CHANGED
@@ -13,7 +13,7 @@ configurations {
13
13
  provided
14
14
  }
15
15
 
16
- version = "0.4.0"
16
+ version = "0.5.0.pre1"
17
17
  sourceCompatibility = 1.7
18
18
  targetCompatibility = 1.7
19
19
 
data/example/columns.yml CHANGED
@@ -29,6 +29,7 @@ filters:
29
29
  - {name: foo, default: 1, type: long}
30
30
  - {name: id}
31
31
  - {name: copy_score, src: score}
32
- - {name: json, default: "{\"foo\": \"FOO\"}"}
32
+ - {name: json, default: "{\"foo\":\"FOO\"}"}
33
+ - {name: $.json.foo}
33
34
  out:
34
35
  type: stdout
@@ -0,0 +1,35 @@
1
+ # in:
2
+ # type: random
3
+ # rows: 100
4
+ # schema:
5
+ # id: primary_key
6
+ # name: string
7
+ # score: integer
8
+ in:
9
+ type: file
10
+ path_prefix: example/example.csv
11
+ parser:
12
+ type: csv
13
+ charset: UTF-8
14
+ newline: CRLF
15
+ null_string: 'NULL'
16
+ skip_header_lines: 1
17
+ comment_line_marker: '#'
18
+ columns:
19
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
20
+ - {name: id, type: long}
21
+ - {name: name, type: string}
22
+ - {name: score, type: double}
23
+ - {name: json, type: json}
24
+ filters:
25
+ - type: column
26
+ columns:
27
+ - {name: time, default: "2015-07-13", format: "%Y-%m-%d"}
28
+ - {name: name, default: "foo"}
29
+ - {name: foo, default: 1, type: long}
30
+ - {name: id}
31
+ - {name: copy_score, src: score}
32
+ - {name: json, default: "{\"foo\":\"FOO\"}"}
33
+ - {name: $.json.foo}
34
+ out:
35
+ type: stdout
@@ -0,0 +1,31 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: id, type: long}
14
+ - {name: name, type: string}
15
+ - {name: score, type: double}
16
+ - {name: json, type: json}
17
+ filters:
18
+ - type: column
19
+ default_timezone: "Asia/Tokyo"
20
+ default_timestamp_format: "%Y-%m-%d"
21
+ columns:
22
+ - {name: time}
23
+ - {name: id}
24
+ - {name: name}
25
+ - {name: score}
26
+ - {name: json, default: "{}"}
27
+ add_columns:
28
+ - {name: $.json.foo, type: long, default: 1}
29
+ - {name: $.json.d, type: string, default: "2015-07-13"}
30
+ out:
31
+ type: stdout
@@ -0,0 +1,23 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: id, type: long}
14
+ - {name: name, type: string}
15
+ - {name: score, type: double}
16
+ - {name: json, type: json}
17
+ filters:
18
+ - type: column
19
+ columns:
20
+ - {name: json, default: "{\"foo\":\"FOO\"}"}
21
+ - {name: $.json.foo}
22
+ out:
23
+ type: stdout
@@ -0,0 +1,22 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: id, type: long}
14
+ - {name: name, type: string}
15
+ - {name: score, type: double}
16
+ - {name: json, type: json}
17
+ filters:
18
+ - type: column
19
+ drop_columns:
20
+ - {name: $.json.foo }
21
+ out:
22
+ type: stdout
data/gradlew CHANGED
@@ -112,8 +112,8 @@ fi
112
112
 
113
113
  # For Cygwin, switch paths to Windows format before running java
114
114
  if $cygwin ; then
115
- APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
- CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
115
+ APP_HOME=`cygpath --name --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --name --mixed "$CLASSPATH"`
117
117
 
118
118
  # We build the pattern for arguments to be converted via cygpath
119
119
  ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
@@ -134,7 +134,7 @@ if $cygwin ; then
134
134
  CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
135
 
136
136
  if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
- eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
137
+ eval `echo args$i`=`cygpath --name --ignore --mixed "$arg"`
138
138
  else
139
139
  eval `echo args$i`="\"$arg\""
140
140
  fi
@@ -1,3 +1,3 @@
1
1
  Embulk::JavaPlugin.register_filter(
2
- "column", "org.embulk.filter.ColumnFilterPlugin",
2
+ "column", "org.embulk.filter.column.ColumnFilterPlugin",
3
3
  File.expand_path('../../../../classpath', __FILE__))
data/settings.gradle ADDED
@@ -0,0 +1 @@
1
+ rootProject.name = 'embulk-filter-column'
@@ -0,0 +1,260 @@
1
+ package org.embulk.filter.column;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.collect.ImmutableList;
5
+
6
+ import org.embulk.config.Config;
7
+ import org.embulk.config.ConfigDefault;
8
+ import org.embulk.config.ConfigException;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.Task;
11
+ import org.embulk.config.TaskSource;
12
+
13
+ import org.embulk.spi.Column;
14
+ import org.embulk.spi.Exec;
15
+ import org.embulk.spi.FilterPlugin;
16
+ import org.embulk.spi.Page;
17
+ import org.embulk.spi.PageBuilder;
18
+ import org.embulk.spi.PageOutput;
19
+ import org.embulk.spi.PageReader;
20
+ import org.embulk.spi.Schema;
21
+ import org.embulk.spi.SchemaConfigException;
22
+ import org.embulk.spi.time.TimestampParser;
23
+ import org.embulk.spi.type.Type;
24
+
25
+ import org.joda.time.DateTimeZone;
26
+ import org.slf4j.Logger;
27
+
28
+ import java.util.List;
29
+
30
+ public class ColumnFilterPlugin implements FilterPlugin
31
+ {
32
+ private static final Logger logger = Exec.getLogger(ColumnFilterPlugin.class);
33
+
34
+ public ColumnFilterPlugin()
35
+ {
36
+ }
37
+
38
+ // NOTE: This is not spi.ColumnConfig
39
+ interface ColumnConfig extends Task
40
+ {
41
+ @Config("name")
42
+ public String getName();
43
+
44
+ @Config("type")
45
+ @ConfigDefault("null")
46
+ public Optional<Type> getType(); // required only for addColumns
47
+
48
+ @Config("default")
49
+ @ConfigDefault("null")
50
+ public Optional<Object> getDefault();
51
+
52
+ @Config("format")
53
+ @ConfigDefault("null")
54
+ public Optional<String> getFormat();
55
+
56
+ @Config("timezone")
57
+ @ConfigDefault("null")
58
+ public Optional<DateTimeZone> getTimeZone();
59
+
60
+ @Config("src")
61
+ @ConfigDefault("null")
62
+ public Optional<String> getSrc();
63
+ }
64
+
65
+ interface PluginTask extends Task, TimestampParser.Task
66
+ {
67
+ @Config("columns")
68
+ @ConfigDefault("[]")
69
+ public List<ColumnConfig> getColumns();
70
+
71
+ @Config("add_columns")
72
+ @ConfigDefault("[]")
73
+ public List<ColumnConfig> getAddColumns();
74
+
75
+ @Config("drop_columns")
76
+ @ConfigDefault("[]")
77
+ public List<ColumnConfig> getDropColumns();
78
+
79
+ // See TimestampParser for default_timestamp_format, and default_timezone
80
+ }
81
+
82
+ @Override
83
+ public void transaction(final ConfigSource config, final Schema inputSchema,
84
+ final FilterPlugin.Control control)
85
+ {
86
+ PluginTask task = config.loadConfig(PluginTask.class);
87
+
88
+ configure(task);
89
+ Schema outputSchema = buildOutputSchema(task, inputSchema);
90
+
91
+ control.run(task.dump(), outputSchema);
92
+ }
93
+
94
+ private void configure(PluginTask task)
95
+ {
96
+ List<ColumnConfig> columns = task.getColumns();
97
+ List<ColumnConfig> addColumns = task.getAddColumns();
98
+ List<ColumnConfig> dropColumns = task.getDropColumns();
99
+
100
+ if (columns.size() == 0 && addColumns.size() == 0 && dropColumns.size() == 0) {
101
+ throw new ConfigException("One of \"columns\", \"add_columns\", \"drop_columns\" must be specified.");
102
+ }
103
+
104
+ if (columns.size() > 0 && dropColumns.size() > 0) {
105
+ throw new ConfigException("Either of \"columns\", \"drop_columns\" can be specified.");
106
+ }
107
+ }
108
+
109
+ private Schema buildOutputSchema(PluginTask task, Schema inputSchema)
110
+ {
111
+ List<ColumnConfig> columns = task.getColumns();
112
+ List<ColumnConfig> addColumns = task.getAddColumns();
113
+ List<ColumnConfig> dropColumns = task.getDropColumns();
114
+
115
+ // Automatically get column type from inputSchema for columns and dropColumns
116
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
117
+ int i = 0;
118
+ if (dropColumns.size() > 0) {
119
+ for (Column inputColumn : inputSchema.getColumns()) {
120
+ String name = inputColumn.getName();
121
+ boolean matched = false;
122
+ for (ColumnConfig dropColumn : dropColumns) {
123
+ // skip json path notation to build outputSchema
124
+ if (dropColumn.getName().startsWith("$.")) {
125
+ continue;
126
+ }
127
+ if (dropColumn.getName().equals(name)) {
128
+ matched = true;
129
+ break;
130
+ }
131
+ }
132
+ if (! matched) {
133
+ Column outputColumn = new Column(i++, name, inputColumn.getType());
134
+ builder.add(outputColumn);
135
+ }
136
+ }
137
+ }
138
+ else if (columns.size() > 0) {
139
+ for (ColumnConfig column : columns) {
140
+ // skip json path notation to build output schema
141
+ if (column.getName().startsWith("$.")) {
142
+ continue;
143
+ }
144
+ if (column.getSrc().isPresent() && column.getSrc().get().startsWith("$.")) {
145
+ continue;
146
+ }
147
+
148
+ String name = column.getName();
149
+ Optional<Type> type = column.getType();
150
+ Optional<Object> defaultValue = column.getDefault();
151
+ Optional<String> src = column.getSrc();
152
+
153
+ String srcName = src.isPresent() ? src.get() : name;
154
+ Column inputColumn;
155
+ try {
156
+ inputColumn = inputSchema.lookupColumn(srcName);
157
+ }
158
+ catch (SchemaConfigException ex) {
159
+ inputColumn = null;
160
+ }
161
+ if (inputColumn != null) { // filter or copy column
162
+ Column outputColumn = new Column(i++, name, inputColumn.getType());
163
+ builder.add(outputColumn);
164
+ }
165
+ else if (type.isPresent() && defaultValue.isPresent()) { // add column
166
+ Column outputColumn = new Column(i++, name, type.get());
167
+ builder.add(outputColumn);
168
+ }
169
+ else {
170
+ throw new SchemaConfigException(String.format("columns: Column src '%s' is not found in inputSchema. Column '%s' does not have \"type\" and \"default\"", srcName, name));
171
+ }
172
+ }
173
+ }
174
+ else {
175
+ for (Column column : inputSchema.getColumns()) {
176
+ Column outputColumn = new Column(i++, column.getName(), column.getType());
177
+ builder.add(outputColumn);
178
+ }
179
+ }
180
+
181
+ // Add columns to last. If you want to add to head or middle, you can use `columns` option
182
+ if (addColumns.size() > 0) {
183
+ for (ColumnConfig column : addColumns) {
184
+ // skip json path notation to build output schema
185
+ if (column.getName().startsWith("$.")) {
186
+ continue;
187
+ }
188
+ if (column.getSrc().isPresent() && column.getSrc().get().startsWith("$.")) {
189
+ continue;
190
+ }
191
+
192
+ String name = column.getName();
193
+ Optional<Type> type = column.getType();
194
+ Optional<Object> defaultValue = column.getDefault();
195
+ Optional<String> src = column.getSrc();
196
+
197
+ String srcName = null;
198
+ Column inputColumn = null;
199
+ if (src.isPresent()) {
200
+ srcName = src.get();
201
+ try {
202
+ inputColumn = inputSchema.lookupColumn(srcName);
203
+ }
204
+ catch (SchemaConfigException ex) {
205
+ inputColumn = null;
206
+ }
207
+ }
208
+ if (inputColumn != null) { // copy column
209
+ Column outputColumn = new Column(i++, name, inputColumn.getType());
210
+ builder.add(outputColumn);
211
+ }
212
+ else if (type.isPresent() && defaultValue.isPresent()) { // add column
213
+ Column outputColumn = new Column(i++, name, type.get());
214
+ builder.add(outputColumn);
215
+ }
216
+ else {
217
+ throw new SchemaConfigException(String.format("add_columns: Column src '%s' is not found in inputSchema, Column '%s' does not have \"type\" and \"default\"", srcName, name));
218
+ }
219
+ }
220
+ }
221
+
222
+ return new Schema(builder.build());
223
+ }
224
+
225
+ @Override
226
+ public PageOutput open(final TaskSource taskSource, final Schema inputSchema,
227
+ final Schema outputSchema, final PageOutput output)
228
+ {
229
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
230
+
231
+ return new PageOutput() {
232
+ private PageReader pageReader = new PageReader(inputSchema);
233
+ private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
234
+ private ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, inputSchema, outputSchema, pageReader, pageBuilder);
235
+
236
+ @Override
237
+ public void finish()
238
+ {
239
+ pageBuilder.finish();
240
+ }
241
+
242
+ @Override
243
+ public void close()
244
+ {
245
+ pageBuilder.close();
246
+ }
247
+
248
+ @Override
249
+ public void add(Page page)
250
+ {
251
+ pageReader.setPage(page);
252
+
253
+ while (pageReader.nextRecord()) {
254
+ outputSchema.visitColumns(visitor);
255
+ pageBuilder.addRecord();
256
+ }
257
+ }
258
+ };
259
+ }
260
+ }