embulk-filter-row 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 66c2553beb93fa0ec8460ac8645e62a9e54da8c9
4
- data.tar.gz: 59bd429b89b936989ff80488b4c9d9caabfb3cf7
3
+ metadata.gz: 87ce06dbda2e17a52f4825208c79414b8938fb58
4
+ data.tar.gz: eb2bccca46662779c8e43041520b2a6fd2f4075e
5
5
  SHA512:
6
- metadata.gz: cba7c85deadaefe03697785643f9612cb9e67de63895b773a05cff5e059c002731ef0abf096d1d71fbdc3d7bea5372cc56c71db7d572486c302f6e941a02bff1
7
- data.tar.gz: 6a8d533156ea62bb4273ce9260cfcea67d16f210780b82c53248a82d982b02fbd0b2fca4d562a56d322b43bab2f4f9d196ff8e6a7ce894947eba24cf11c0182d
6
+ metadata.gz: 70708739971679cbed722a954e37e47390d9640dfdda35f291ce18ce44cd755fb125adc7a1b04b578ba0eb8a32be4fc41b701364fe3f421c47d9a17c69ea5b60
7
+ data.tar.gz: 25265ac26fab5c139930be8e7976a984466ee9e0bc8cae3343b1bbff94bac63d0a9fab74683132e8d6d8a68a2fc93831e9abbfae7cbcf5f207202b7ffefed979
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ # 0.2.0 (2015-12-05)
2
+
3
+ Enhancements:
4
+
5
+ * Support OR condition
6
+
1
7
  # 0.1.4
2
8
 
3
9
  Fixes:
data/README.md CHANGED
@@ -6,7 +6,8 @@ A filter plugin for Embulk to filter out rows
6
6
 
7
7
  ## Configuration
8
8
 
9
- * **conditions**: select only rows which matches with conditions. (support only **AND** conditions)
9
+ * **condition**: AND or OR (string, default: AND).
10
+ * **conditions**: select only rows which match the conditions.
10
11
  * **column**: column name (string, required)
11
12
  * **operator** operator (string, optional, default: ==)
12
13
  * boolean operator
@@ -35,11 +36,12 @@ A filter plugin for Embulk to filter out rows
35
36
 
36
37
  NOTE: column type is automatically retrieved from input data (inputSchema)
37
38
 
38
- ## Example
39
+ ## Example (AND)
39
40
 
40
41
  ```yaml
41
42
  filters:
42
43
  - type: row
44
+ condition: AND
43
45
  conditions:
44
46
  - {column: foo, operator: "IS NOT NULL"}
45
47
  - {column: id, operator: ">=", argument: 10}
@@ -48,13 +50,41 @@ filters:
48
50
  - {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
49
51
  ```
50
52
 
51
- NOTE: column type is automatically retrieved from input data (inputSchema)
53
+ ## Example (OR)
54
+
55
+ ```yaml
56
+ filters:
57
+ - type: row
58
+ condition: OR
59
+ conditions:
60
+ - {column: a, operator: "IS NOT NULL"}
61
+ - {column: b, operator: "IS NOT NULL"}
62
+ ```
63
+
64
+ ## Example (AND of OR)
65
+
66
+ embulk-filter-row does not directly support complex conditions such as `((A OR B) AND (C OR D))`, but you should be able to express most complex conditions by combining multiple filters, for example:
67
+
68
+ ```yaml
69
+ filters:
70
+ - type: row
71
+ condition: OR
72
+ conditions:
73
+ - {column: a, operator: "IS NOT NULL"}
74
+ - {column: b, operator: "IS NOT NULL"}
75
+ - type: row
76
+ condition: OR
77
+ conditions:
78
+ - {column: c, operator: "IS NOT NULL"}
79
+ - {column: d, operator: "IS NOT NULL"}
80
+ ```
81
+
82
+ This is equivalent to `((A OR B) AND (C OR D))`.
52
83
 
53
- ## ToDo
84
+ ## Not Supported: More Complex Conditions
54
85
 
55
- * Support OR condition
56
- * It should be better to think using Query engine like [Apache Drill](https://drill.apache.org/) or [Presto](https://prestodb.io/)
57
- * With them, it is possible to send a query to local files, even to S3 files.
86
+ * For more complex conditions, consider using a query engine such as [Apache Drill](https://drill.apache.org/) or [Presto](https://prestodb.io/)
87
+ * With them, it is possible to send a query to local files, even to S3 files.
58
88
 
59
89
  ## ChangeLog
60
90
 
@@ -66,7 +96,7 @@ Run example:
66
96
 
67
97
  ```
68
98
  $ ./gradlew classpath
69
- $ embulk run -I lib example.yml
99
+ $ embulk run -I lib example/and.yml
70
100
  ```
71
101
 
72
102
  Run test:
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
12
12
  provided
13
13
  }
14
14
 
15
- version = "0.1.4"
15
+ version = "0.2.0"
16
16
  sourceCompatibility = 1.7
17
17
  targetCompatibility = 1.7
18
18
 
@@ -7,7 +7,7 @@
7
7
  # score: integer
8
8
  in:
9
9
  type: file
10
- path_prefix: example.csv
10
+ path_prefix: example/example.csv
11
11
  parser:
12
12
  type: csv
13
13
  charset: UTF-8
data/example/or.yml ADDED
@@ -0,0 +1,28 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
13
+ - {name: foo, type: string}
14
+ - {name: bar, type: string}
15
+ - {name: flag, type: boolean}
16
+ - {name: id, type: long}
17
+ - {name: name, type: string}
18
+ - {name: score, type: double}
19
+ filters:
20
+ - type: row
21
+ condition: OR
22
+ conditions:
23
+ - {column: name, operator: ==, argument: "Vqjht6YEUBsMPXmoW1iOGFROZF27pBzz0TUkOKeDXEY"}
24
+ - {column: score, operator: ==, argument: 43}
25
+ - {column: id, operator: ==, argument: 97}
26
+ - {column: flag, operator: ==, argument: false}
27
+ out:
28
+ type: stdout
@@ -4,6 +4,7 @@ import org.embulk.config.Config;
4
4
  import org.embulk.config.ConfigDefault;
5
5
  import org.embulk.config.ConfigDiff;
6
6
  import org.embulk.config.ConfigSource;
7
+ import org.embulk.config.ConfigException;
7
8
  import org.embulk.config.Task;
8
9
  import org.embulk.config.TaskSource;
9
10
 
@@ -51,6 +52,10 @@ public class RowFilterPlugin implements FilterPlugin
51
52
 
52
53
  public interface PluginTask extends Task, TimestampParser.Task
53
54
  {
55
+ @Config("condition")
56
+ @ConfigDefault("\"AND\"")
57
+ public String getCondition();
58
+
54
59
  @Config("conditions")
55
60
  public List<ConditionConfig> getConditions();
56
61
  }
@@ -66,6 +71,11 @@ public class RowFilterPlugin implements FilterPlugin
66
71
  inputSchema.lookupColumn(columnName); // throw SchemaConfigException if not found
67
72
  }
68
73
 
74
+ String condition = task.getCondition().toLowerCase();
75
+ if (!condition.equals("or") && !condition.equals("and")) {
76
+ throw new ConfigException("condition must be either of \"or\" or \"and\".");
77
+ }
78
+
69
79
  Schema outputSchema = inputSchema;
70
80
  control.run(task.dump(), outputSchema);
71
81
  }
@@ -76,6 +86,8 @@ public class RowFilterPlugin implements FilterPlugin
76
86
  {
77
87
  PluginTask task = taskSource.loadTask(PluginTask.class);
78
88
 
89
+ final boolean orCondition = task.getCondition().toLowerCase().equals("or");
90
+
79
91
  final HashMap<String, List<Condition>> conditionMap = new HashMap<String, List<Condition>>();
80
92
  for (Column column : outputSchema.getColumns()) {
81
93
  String columnName = column.getName();
@@ -97,7 +109,8 @@ public class RowFilterPlugin implements FilterPlugin
97
109
  return new PageOutput() {
98
110
  private PageReader pageReader = new PageReader(inputSchema);
99
111
  private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
100
- private boolean shouldAddRecord = true;
112
+ private boolean shouldAddRecord;
113
+ private ColumnVisitor visitor = orCondition ? new ColumnVisitorOrImpl(pageBuilder) : new ColumnVisitorAndImpl(pageBuilder);
101
114
 
102
115
  @Override
103
116
  public void finish() {
@@ -113,18 +126,125 @@ public class RowFilterPlugin implements FilterPlugin
113
126
  public void add(Page page) {
114
127
  pageReader.setPage(page);
115
128
 
116
- ColumnVisitorImpl visitor = new ColumnVisitorImpl(pageBuilder);
117
129
  while (pageReader.nextRecord()) {
118
- shouldAddRecord = true;
130
+ shouldAddRecord = orCondition ? false : true;
119
131
  inputSchema.visitColumns(visitor);
120
132
  if (shouldAddRecord) pageBuilder.addRecord();
121
133
  }
122
134
  }
123
135
 
124
- class ColumnVisitorImpl implements ColumnVisitor {
136
+ class ColumnVisitorOrImpl implements ColumnVisitor {
137
+ private final PageBuilder pageBuilder;
138
+
139
+ ColumnVisitorOrImpl(PageBuilder pageBuilder) {
140
+ this.pageBuilder = pageBuilder;
141
+ }
142
+
143
+ @Override
144
+ public void booleanColumn(Column column) {
145
+ if (pageReader.isNull(column)) {
146
+ pageBuilder.setNull(column);
147
+ } else {
148
+ pageBuilder.setBoolean(column, pageReader.getBoolean(column));
149
+ }
150
+ if (shouldAddRecord) return;
151
+ List<Condition> conditionList = conditionMap.get(column.getName());
152
+ for (Condition _condition : conditionList) {
153
+ BooleanCondition condition = (BooleanCondition)_condition;
154
+ if (pageReader.isNull(column)) {
155
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
156
+ } else {
157
+ boolean subject = pageReader.getBoolean(column);
158
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
159
+ }
160
+ }
161
+ }
162
+
163
+ @Override
164
+ public void longColumn(Column column) {
165
+ if (pageReader.isNull(column)) {
166
+ pageBuilder.setNull(column);
167
+ } else {
168
+ pageBuilder.setLong(column, pageReader.getLong(column));
169
+ }
170
+ if (shouldAddRecord) return;
171
+ List<Condition> conditionList = conditionMap.get(column.getName());
172
+ for (Condition _condition : conditionList) {
173
+ LongCondition condition = (LongCondition)_condition;
174
+ if (pageReader.isNull(column)) {
175
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
176
+ } else {
177
+ long subject = pageReader.getLong(column);
178
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
179
+ }
180
+ }
181
+ }
182
+
183
+ @Override
184
+ public void doubleColumn(Column column) {
185
+ if (pageReader.isNull(column)) {
186
+ pageBuilder.setNull(column);
187
+ } else {
188
+ pageBuilder.setDouble(column, pageReader.getDouble(column));
189
+ }
190
+ if (shouldAddRecord) return;
191
+ List<Condition> conditionList = conditionMap.get(column.getName());
192
+ for (Condition _condition : conditionList) {
193
+ DoubleCondition condition = (DoubleCondition)_condition;
194
+ if (pageReader.isNull(column)) {
195
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
196
+ } else {
197
+ double subject = pageReader.getDouble(column);
198
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
199
+ }
200
+ }
201
+ }
202
+
203
+ @Override
204
+ public void stringColumn(Column column) {
205
+ if (pageReader.isNull(column)) {
206
+ pageBuilder.setNull(column);
207
+ } else {
208
+ pageBuilder.setString(column, pageReader.getString(column));
209
+ }
210
+ if (shouldAddRecord) return;
211
+ List<Condition> conditionList = conditionMap.get(column.getName());
212
+ for (Condition _condition : conditionList) {
213
+ StringCondition condition = (StringCondition)_condition;
214
+ if (pageReader.isNull(column)) {
215
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
216
+ } else {
217
+ String subject = pageReader.getString(column);
218
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
219
+ }
220
+ }
221
+ }
222
+
223
+ @Override
224
+ public void timestampColumn(Column column) {
225
+ if (pageReader.isNull(column)) {
226
+ pageBuilder.setNull(column);
227
+ } else {
228
+ pageBuilder.setTimestamp(column, pageReader.getTimestamp(column));
229
+ }
230
+ if (shouldAddRecord) return;
231
+ List<Condition> conditionList = conditionMap.get(column.getName());
232
+ for (Condition _condition : conditionList) {
233
+ TimestampCondition condition = (TimestampCondition)_condition;
234
+ if (pageReader.isNull(column)) {
235
+ if (condition.compare(null)) { shouldAddRecord = true; break; }
236
+ } else {
237
+ Timestamp subject = pageReader.getTimestamp(column);
238
+ if (condition.compare(subject)) { shouldAddRecord = true; break; }
239
+ }
240
+ }
241
+ }
242
+ }
243
+
244
+ class ColumnVisitorAndImpl implements ColumnVisitor {
125
245
  private final PageBuilder pageBuilder;
126
246
 
127
- ColumnVisitorImpl(PageBuilder pageBuilder) {
247
+ ColumnVisitorAndImpl(PageBuilder pageBuilder) {
128
248
  this.pageBuilder = pageBuilder;
129
249
  }
130
250
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-row
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naotoshi Seo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-20 00:00:00.000000000 Z
11
+ date: 2015-12-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -51,7 +51,8 @@ files:
51
51
  - LICENSE.txt
52
52
  - README.md
53
53
  - build.gradle
54
- - example.yml
54
+ - example/and.yml
55
+ - example/or.yml
55
56
  - gradle/wrapper/gradle-wrapper.jar
56
57
  - gradle/wrapper/gradle-wrapper.properties
57
58
  - gradlew
@@ -73,7 +74,7 @@ files:
73
74
  - src/test/java/org/embulk/filter/row/TestLongCondition.java
74
75
  - src/test/java/org/embulk/filter/row/TestStringCondition.java
75
76
  - src/test/java/org/embulk/filter/row/TestTimestampCondition.java
76
- - classpath/embulk-filter-row-0.1.4.jar
77
+ - classpath/embulk-filter-row-0.2.0.jar
77
78
  homepage: https://github.com/sonots/embulk-filter-row
78
79
  licenses:
79
80
  - MIT