embulk-filter-row 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 66c2553beb93fa0ec8460ac8645e62a9e54da8c9
4
- data.tar.gz: 59bd429b89b936989ff80488b4c9d9caabfb3cf7
3
+ metadata.gz: 87ce06dbda2e17a52f4825208c79414b8938fb58
4
+ data.tar.gz: eb2bccca46662779c8e43041520b2a6fd2f4075e
5
5
  SHA512:
6
- metadata.gz: cba7c85deadaefe03697785643f9612cb9e67de63895b773a05cff5e059c002731ef0abf096d1d71fbdc3d7bea5372cc56c71db7d572486c302f6e941a02bff1
7
- data.tar.gz: 6a8d533156ea62bb4273ce9260cfcea67d16f210780b82c53248a82d982b02fbd0b2fca4d562a56d322b43bab2f4f9d196ff8e6a7ce894947eba24cf11c0182d
6
+ metadata.gz: 70708739971679cbed722a954e37e47390d9640dfdda35f291ce18ce44cd755fb125adc7a1b04b578ba0eb8a32be4fc41b701364fe3f421c47d9a17c69ea5b60
7
+ data.tar.gz: 25265ac26fab5c139930be8e7976a984466ee9e0bc8cae3343b1bbff94bac63d0a9fab74683132e8d6d8a68a2fc93831e9abbfae7cbcf5f207202b7ffefed979
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ # 0.2.0 (2015-12-05)
2
+
3
+ Enhancements:
4
+
5
+ * Support OR condition
6
+
1
7
  # 0.1.4
2
8
 
3
9
  Fixes:
data/README.md CHANGED
@@ -6,7 +6,8 @@ A filter plugin for Embulk to filter out rows
6
6
 
7
7
  ## Configuration
8
8
 
9
- * **conditions**: select only rows which matches with conditions. (support only **AND** conditions)
9
+ * **condition**: AND or OR (string, default: AND).
10
+ * **conditions**: select only rows which match the conditions.
10
11
  * **column**: column name (string, required)
11
12
  * **operator** operator (string, optional, default: ==)
12
13
  * boolean operator
@@ -35,11 +36,12 @@ A filter plugin for Embulk to filter out rows
35
36
 
36
37
  NOTE: column type is automatically retrieved from input data (inputSchema)
37
38
 
38
- ## Example
39
+ ## Example (AND)
39
40
 
40
41
  ```yaml
41
42
  filters:
42
43
  - type: row
44
+ condition: AND
43
45
  conditions:
44
46
  - {column: foo, operator: "IS NOT NULL"}
45
47
  - {column: id, operator: ">=", argument: 10}
@@ -48,13 +50,41 @@ filters:
48
50
  - {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
49
51
  ```
50
52
 
51
- NOTE: column type is automatically retrieved from input data (inputSchema)
53
+ ## Example (OR)
54
+
55
+ ```yaml
56
+ filters:
57
+ - type: row
58
+ condition: OR
59
+ conditions:
60
+ - {column: a, operator: "IS NOT NULL"}
61
+ - {column: b, operator: "IS NOT NULL"}
62
+ ```
63
+
64
+ ## Example (AND of OR)
65
+
66
+ embulk-filter-row does not directly support complex conditions such as `((A OR B) AND (C OR D))`, but you should be able to express most complex conditions by combining multiple filters like this:
67
+
68
+ ```yaml
69
+ filters:
70
+ - type: row
71
+ condition: OR
72
+ conditions:
73
+ - {column: a, operator: "IS NOT NULL"}
74
+ - {column: b, operator: "IS NOT NULL"}
75
+ - type: row
76
+ condition: OR
77
+ conditions:
78
+ - {column: c, operator: "IS NOT NULL"}
79
+ - {column: d, operator: "IS NOT NULL"}
80
+ ```
81
+
82
+ This is equivalent to `((A OR B) AND (C OR D))`.
52
83
 
53
- ## ToDo
84
+ ## Not Supported: More Complex Conditions
54
85
 
55
- * Support OR condition
56
- * It should be better to think using Query engine like [Apache Drill](https://drill.apache.org/) or [Presto](https://prestodb.io/)
57
- * With them, it is possible to send a query to local files, even to S3 files.
86
+ * It would be better to consider using a query engine like [Apache Drill](https://drill.apache.org/) or [Presto](https://prestodb.io/)
87
+ * With them, it is possible to send a query to local files, even to S3 files.
58
88
 
59
89
  ## ChangeLog
60
90
 
@@ -66,7 +96,7 @@ Run example:
66
96
 
67
97
  ```
68
98
  $ ./gradlew classpath
69
- $ embulk run -I lib example.yml
99
+ $ embulk run -I lib example/and.yml
70
100
  ```
71
101
 
72
102
  Run test:
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
12
12
  provided
13
13
  }
14
14
 
15
- version = "0.1.4"
15
+ version = "0.2.0"
16
16
  sourceCompatibility = 1.7
17
17
  targetCompatibility = 1.7
18
18
 
@@ -7,7 +7,7 @@
7
7
  # score: integer
8
8
  in:
9
9
  type: file
10
- path_prefix: example.csv
10
+ path_prefix: example/example.csv
11
11
  parser:
12
12
  type: csv
13
13
  charset: UTF-8
data/example/or.yml ADDED
@@ -0,0 +1,28 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: foo, type: string}
14
+ - {name: bar, type: string}
15
+ - {name: flag, type: boolean}
16
+ - {name: id, type: long}
17
+ - {name: name, type: string}
18
+ - {name: score, type: double}
19
+ filters:
20
+ - type: row
21
+ condition: OR
22
+ conditions:
23
+ - {column: name, operator: ==, argument: "Vqjht6YEUBsMPXmoW1iOGFROZF27pBzz0TUkOKeDXEY"}
24
+ - {column: score, operator: ==, argument: 43}
25
+ - {column: id, operator: ==, argument: 97}
26
+ - {column: flag, operator: ==, argument: false}
27
+ out:
28
+ type: stdout
@@ -4,6 +4,7 @@ import org.embulk.config.Config;
4
4
  import org.embulk.config.ConfigDefault;
5
5
  import org.embulk.config.ConfigDiff;
6
6
  import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.ConfigException;
7
8
  import org.embulk.config.Task;
8
9
  import org.embulk.config.TaskSource;
9
10
 
@@ -51,6 +52,10 @@ public class RowFilterPlugin implements FilterPlugin
51
52
 
52
53
  public interface PluginTask extends Task, TimestampParser.Task
53
54
  {
55
+ @Config("condition")
56
+ @ConfigDefault("\"AND\"")
57
+ public String getCondition();
58
+
54
59
  @Config("conditions")
55
60
  public List<ConditionConfig> getConditions();
56
61
  }
@@ -66,6 +71,11 @@ public class RowFilterPlugin implements FilterPlugin
66
71
  inputSchema.lookupColumn(columnName); // throw SchemaConfigException if not found
67
72
  }
68
73
 
74
+ String condition = task.getCondition().toLowerCase();
75
+ if (!condition.equals("or") && !condition.equals("and")) {
76
+ throw new ConfigException("condition must be either of \"or\" or \"and\".");
77
+ }
78
+
69
79
  Schema outputSchema = inputSchema;
70
80
  control.run(task.dump(), outputSchema);
71
81
  }
@@ -76,6 +86,8 @@ public class RowFilterPlugin implements FilterPlugin
76
86
  {
77
87
  PluginTask task = taskSource.loadTask(PluginTask.class);
78
88
 
89
+ final boolean orCondition = task.getCondition().toLowerCase().equals("or");
90
+
79
91
  final HashMap<String, List<Condition>> conditionMap = new HashMap<String, List<Condition>>();
80
92
  for (Column column : outputSchema.getColumns()) {
81
93
  String columnName = column.getName();
@@ -97,7 +109,8 @@ public class RowFilterPlugin implements FilterPlugin
97
109
  return new PageOutput() {
98
110
  private PageReader pageReader = new PageReader(inputSchema);
99
111
  private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
100
- private boolean shouldAddRecord = true;
112
+ private boolean shouldAddRecord;
113
+ private ColumnVisitor visitor = orCondition ? new ColumnVisitorOrImpl(pageBuilder) : new ColumnVisitorAndImpl(pageBuilder);
101
114
 
102
115
  @Override
103
116
  public void finish() {
@@ -113,18 +126,125 @@ public class RowFilterPlugin implements FilterPlugin
113
126
  public void add(Page page) {
114
127
  pageReader.setPage(page);
115
128
 
116
- ColumnVisitorImpl visitor = new ColumnVisitorImpl(pageBuilder);
117
129
  while (pageReader.nextRecord()) {
118
- shouldAddRecord = true;
130
+ shouldAddRecord = orCondition ? false : true;
119
131
  inputSchema.visitColumns(visitor);
120
132
  if (shouldAddRecord) pageBuilder.addRecord();
121
133
  }
122
134
  }
123
135
 
124
- class ColumnVisitorImpl implements ColumnVisitor {
136
+ class ColumnVisitorOrImpl implements ColumnVisitor {
137
+ private final PageBuilder pageBuilder;
138
+
139
+ ColumnVisitorOrImpl(PageBuilder pageBuilder) {
140
+ this.pageBuilder = pageBuilder;
141
+ }
142
+
143
+ @Override
144
+ public void booleanColumn(Column column) {
145
+ if (pageReader.isNull(column)) {
146
+ pageBuilder.setNull(column);
147
+ } else {
148
+ pageBuilder.setBoolean(column, pageReader.getBoolean(column));
149
+ }
150
+ if (shouldAddRecord) return;
151
+ List<Condition> conditionList = conditionMap.get(column.getName());
152
+ for (Condition _condition : conditionList) {
153
+ BooleanCondition condition = (BooleanCondition)_condition;
154
+ if (pageReader.isNull(column)) {
155
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
156
+ } else {
157
+ boolean subject = pageReader.getBoolean(column);
158
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
159
+ }
160
+ }
161
+ }
162
+
163
+ @Override
164
+ public void longColumn(Column column) {
165
+ if (pageReader.isNull(column)) {
166
+ pageBuilder.setNull(column);
167
+ } else {
168
+ pageBuilder.setLong(column, pageReader.getLong(column));
169
+ }
170
+ if (shouldAddRecord) return;
171
+ List<Condition> conditionList = conditionMap.get(column.getName());
172
+ for (Condition _condition : conditionList) {
173
+ LongCondition condition = (LongCondition)_condition;
174
+ if (pageReader.isNull(column)) {
175
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
176
+ } else {
177
+ long subject = pageReader.getLong(column);
178
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
179
+ }
180
+ }
181
+ }
182
+
183
+ @Override
184
+ public void doubleColumn(Column column) {
185
+ if (pageReader.isNull(column)) {
186
+ pageBuilder.setNull(column);
187
+ } else {
188
+ pageBuilder.setDouble(column, pageReader.getDouble(column));
189
+ }
190
+ if (shouldAddRecord) return;
191
+ List<Condition> conditionList = conditionMap.get(column.getName());
192
+ for (Condition _condition : conditionList) {
193
+ DoubleCondition condition = (DoubleCondition)_condition;
194
+ if (pageReader.isNull(column)) {
195
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
196
+ } else {
197
+ double subject = pageReader.getDouble(column);
198
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
199
+ }
200
+ }
201
+ }
202
+
203
+ @Override
204
+ public void stringColumn(Column column) {
205
+ if (pageReader.isNull(column)) {
206
+ pageBuilder.setNull(column);
207
+ } else {
208
+ pageBuilder.setString(column, pageReader.getString(column));
209
+ }
210
+ if (shouldAddRecord) return;
211
+ List<Condition> conditionList = conditionMap.get(column.getName());
212
+ for (Condition _condition : conditionList) {
213
+ StringCondition condition = (StringCondition)_condition;
214
+ if (pageReader.isNull(column)) {
215
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
216
+ } else {
217
+ String subject = pageReader.getString(column);
218
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
219
+ }
220
+ }
221
+ }
222
+
223
+ @Override
224
+ public void timestampColumn(Column column) {
225
+ if (pageReader.isNull(column)) {
226
+ pageBuilder.setNull(column);
227
+ } else {
228
+ pageBuilder.setTimestamp(column, pageReader.getTimestamp(column));
229
+ }
230
+ if (shouldAddRecord) return;
231
+ List<Condition> conditionList = conditionMap.get(column.getName());
232
+ for (Condition _condition : conditionList) {
233
+ TimestampCondition condition = (TimestampCondition)_condition;
234
+ if (pageReader.isNull(column)) {
235
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
236
+ } else {
237
+ Timestamp subject = pageReader.getTimestamp(column);
238
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
239
+ }
240
+ }
241
+ }
242
+ }
243
+
244
+ class ColumnVisitorAndImpl implements ColumnVisitor {
125
245
  private final PageBuilder pageBuilder;
126
246
 
127
- ColumnVisitorImpl(PageBuilder pageBuilder) {
247
+ ColumnVisitorAndImpl(PageBuilder pageBuilder) {
128
248
  this.pageBuilder = pageBuilder;
129
249
  }
130
250
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-row
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naotoshi Seo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-20 00:00:00.000000000 Z
11
+ date: 2015-12-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -51,7 +51,8 @@ files:
51
51
  - LICENSE.txt
52
52
  - README.md
53
53
  - build.gradle
54
- - example.yml
54
+ - example/and.yml
55
+ - example/or.yml
55
56
  - gradle/wrapper/gradle-wrapper.jar
56
57
  - gradle/wrapper/gradle-wrapper.properties
57
58
  - gradlew
@@ -73,7 +74,7 @@ files:
73
74
  - src/test/java/org/embulk/filter/row/TestLongCondition.java
74
75
  - src/test/java/org/embulk/filter/row/TestStringCondition.java
75
76
  - src/test/java/org/embulk/filter/row/TestTimestampCondition.java
76
- - classpath/embulk-filter-row-0.1.4.jar
77
+ - classpath/embulk-filter-row-0.2.0.jar
77
78
  homepage: https://github.com/sonots/embulk-filter-row
78
79
  licenses:
79
80
  - MIT