embulk-filter-row 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +38 -8
- data/build.gradle +1 -1
- data/classpath/{embulk-filter-row-0.1.4.jar → embulk-filter-row-0.2.0.jar} +0 -0
- data/{example.yml → example/and.yml} +1 -1
- data/example/or.yml +28 -0
- data/src/main/java/org/embulk/filter/RowFilterPlugin.java +125 -5
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87ce06dbda2e17a52f4825208c79414b8938fb58
|
4
|
+
data.tar.gz: eb2bccca46662779c8e43041520b2a6fd2f4075e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 70708739971679cbed722a954e37e47390d9640dfdda35f291ce18ce44cd755fb125adc7a1b04b578ba0eb8a32be4fc41b701364fe3f421c47d9a17c69ea5b60
|
7
|
+
data.tar.gz: 25265ac26fab5c139930be8e7976a984466ee9e0bc8cae3343b1bbff94bac63d0a9fab74683132e8d6d8a68a2fc93831e9abbfae7cbcf5f207202b7ffefed979
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,8 @@ A filter plugin for Embulk to filter out rows
|
|
6
6
|
|
7
7
|
## Configuration
|
8
8
|
|
9
|
-
* **
|
9
|
+
* **condition**: AND or OR, case-insensitive (string, default: AND).
|
10
|
+
* **conditions**: select only rows which match the conditions.
|
10
11
|
* **column**: column name (string, required)
|
11
12
|
* **operator** operator (string, optional, default: ==)
|
12
13
|
* boolean operator
|
@@ -35,11 +36,12 @@ A filter plugin for Embulk to filter out rows
|
|
35
36
|
|
36
37
|
NOTE: column type is automatically retrieved from input data (inputSchema)
|
37
38
|
|
38
|
-
## Example
|
39
|
+
## Example (AND)
|
39
40
|
|
40
41
|
```yaml
|
41
42
|
filters:
|
42
43
|
- type: row
|
44
|
+
condition: AND
|
43
45
|
conditions:
|
44
46
|
- {column: foo, operator: "IS NOT NULL"}
|
45
47
|
- {column: id, operator: ">=", argument: 10}
|
@@ -48,13 +50,41 @@ filters:
|
|
48
50
|
- {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
|
49
51
|
```
|
50
52
|
|
51
|
-
|
53
|
+
## Example (OR)
|
54
|
+
|
55
|
+
```yaml
|
56
|
+
filters:
|
57
|
+
- type: row
|
58
|
+
condition: OR
|
59
|
+
conditions:
|
60
|
+
- {column: a, operator: "IS NOT NULL"}
|
61
|
+
- {column: b, operator: "IS NOT NULL"}
|
62
|
+
```
|
63
|
+
|
64
|
+
## Example (AND of OR)
|
65
|
+
|
66
|
+
embulk-filter-row does not directly support complex conditions such as `((A OR B) AND (C OR D))`, but you should be able to express most complex conditions by combining multiple filters, like this:
|
67
|
+
|
68
|
+
```yaml
|
69
|
+
filters:
|
70
|
+
- type: row
|
71
|
+
condition: OR
|
72
|
+
conditions:
|
73
|
+
- {column: a, operator: "IS NOT NULL"}
|
74
|
+
- {column: b, operator: "IS NOT NULL"}
|
75
|
+
- type: row
|
76
|
+
condition: OR
|
77
|
+
conditions:
|
78
|
+
- {column: c, operator: "IS NOT NULL"}
|
79
|
+
- {column: d, operator: "IS NOT NULL"}
|
80
|
+
```
|
81
|
+
|
82
|
+
This is equivalent to `((A OR B) AND (C OR D))`.
|
52
83
|
|
53
|
-
##
|
84
|
+
## Not Supported: More Complex Conditions
|
54
85
|
|
55
|
-
*
|
56
|
-
|
57
|
-
* With them, it is possible to send a query to local files, even to S3 files.
|
86
|
+
* For conditions more complex than this, it would be better to consider using a query engine such as [Apache Drill](https://drill.apache.org/) or [Presto](https://prestodb.io/).
|
87
|
+
* With them, it is possible to query local files, and even files on S3.
|
58
88
|
|
59
89
|
## ChangeLog
|
60
90
|
|
@@ -66,7 +96,7 @@ Run example:
|
|
66
96
|
|
67
97
|
```
|
68
98
|
$ ./gradlew classpath
|
69
|
-
$ embulk run -I lib example.yml
|
99
|
+
$ embulk run -I lib example/and.yml
|
70
100
|
```
|
71
101
|
|
72
102
|
Run test:
|
data/build.gradle
CHANGED
Binary file
|
data/example/or.yml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: 'NULL'
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: foo, type: string}
|
14
|
+
- {name: bar, type: string}
|
15
|
+
- {name: flag, type: boolean}
|
16
|
+
- {name: id, type: long}
|
17
|
+
- {name: name, type: string}
|
18
|
+
- {name: score, type: double}
|
19
|
+
filters:
|
20
|
+
- type: row
|
21
|
+
condition: OR
|
22
|
+
conditions:
|
23
|
+
- {column: name, operator: ==, argument: "Vqjht6YEUBsMPXmoW1iOGFROZF27pBzz0TUkOKeDXEY"}
|
24
|
+
- {column: score, operator: ==, argument: 43}
|
25
|
+
- {column: id, operator: ==, argument: 97}
|
26
|
+
- {column: flag, operator: ==, argument: false}
|
27
|
+
out:
|
28
|
+
type: stdout
|
@@ -4,6 +4,7 @@ import org.embulk.config.Config;
|
|
4
4
|
import org.embulk.config.ConfigDefault;
|
5
5
|
import org.embulk.config.ConfigDiff;
|
6
6
|
import org.embulk.config.ConfigSource;
|
7
|
+
import org.embulk.config.ConfigException;
|
7
8
|
import org.embulk.config.Task;
|
8
9
|
import org.embulk.config.TaskSource;
|
9
10
|
|
@@ -51,6 +52,10 @@ public class RowFilterPlugin implements FilterPlugin
|
|
51
52
|
|
52
53
|
public interface PluginTask extends Task, TimestampParser.Task
|
53
54
|
{
|
55
|
+
@Config("condition")
|
56
|
+
@ConfigDefault("\"AND\"")
|
57
|
+
public String getCondition();
|
58
|
+
|
54
59
|
@Config("conditions")
|
55
60
|
public List<ConditionConfig> getConditions();
|
56
61
|
}
|
@@ -66,6 +71,11 @@ public class RowFilterPlugin implements FilterPlugin
|
|
66
71
|
inputSchema.lookupColumn(columnName); // throw SchemaConfigException if not found
|
67
72
|
}
|
68
73
|
|
74
|
+
String condition = task.getCondition().toLowerCase();
|
75
|
+
if (!condition.equals("or") && !condition.equals("and")) {
|
76
|
+
throw new ConfigException("condition must be either of \"or\" or \"and\".");
|
77
|
+
}
|
78
|
+
|
69
79
|
Schema outputSchema = inputSchema;
|
70
80
|
control.run(task.dump(), outputSchema);
|
71
81
|
}
|
@@ -76,6 +86,8 @@ public class RowFilterPlugin implements FilterPlugin
|
|
76
86
|
{
|
77
87
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
78
88
|
|
89
|
+
final boolean orCondition = task.getCondition().toLowerCase().equals("or");
|
90
|
+
|
79
91
|
final HashMap<String, List<Condition>> conditionMap = new HashMap<String, List<Condition>>();
|
80
92
|
for (Column column : outputSchema.getColumns()) {
|
81
93
|
String columnName = column.getName();
|
@@ -97,7 +109,8 @@ public class RowFilterPlugin implements FilterPlugin
|
|
97
109
|
return new PageOutput() {
|
98
110
|
private PageReader pageReader = new PageReader(inputSchema);
|
99
111
|
private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
100
|
-
private boolean shouldAddRecord
|
112
|
+
private boolean shouldAddRecord;
|
113
|
+
private ColumnVisitor visitor = orCondition ? new ColumnVisitorOrImpl(pageBuilder) : new ColumnVisitorAndImpl(pageBuilder);
|
101
114
|
|
102
115
|
@Override
|
103
116
|
public void finish() {
|
@@ -113,18 +126,125 @@ public class RowFilterPlugin implements FilterPlugin
|
|
113
126
|
public void add(Page page) {
|
114
127
|
pageReader.setPage(page);
|
115
128
|
|
116
|
-
ColumnVisitorImpl visitor = new ColumnVisitorImpl(pageBuilder);
|
117
129
|
while (pageReader.nextRecord()) {
|
118
|
-
shouldAddRecord = true;
|
130
|
+
shouldAddRecord = orCondition ? false : true;
|
119
131
|
inputSchema.visitColumns(visitor);
|
120
132
|
if (shouldAddRecord) pageBuilder.addRecord();
|
121
133
|
}
|
122
134
|
}
|
123
135
|
|
124
|
-
class
|
136
|
+
class ColumnVisitorOrImpl implements ColumnVisitor {
|
137
|
+
private final PageBuilder pageBuilder;
|
138
|
+
|
139
|
+
ColumnVisitorOrImpl(PageBuilder pageBuilder) {
|
140
|
+
this.pageBuilder = pageBuilder;
|
141
|
+
}
|
142
|
+
|
143
|
+
@Override
|
144
|
+
public void booleanColumn(Column column) {
|
145
|
+
if (pageReader.isNull(column)) {
|
146
|
+
pageBuilder.setNull(column);
|
147
|
+
} else {
|
148
|
+
pageBuilder.setBoolean(column, pageReader.getBoolean(column));
|
149
|
+
}
|
150
|
+
if (shouldAddRecord) return;
|
151
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
152
|
+
for (Condition _condition : conditionList) {
|
153
|
+
BooleanCondition condition = (BooleanCondition)_condition;
|
154
|
+
if (pageReader.isNull(column)) {
|
155
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
156
|
+
} else {
|
157
|
+
boolean subject = pageReader.getBoolean(column);
|
158
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
159
|
+
}
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
@Override
|
164
|
+
public void longColumn(Column column) {
|
165
|
+
if (pageReader.isNull(column)) {
|
166
|
+
pageBuilder.setNull(column);
|
167
|
+
} else {
|
168
|
+
pageBuilder.setLong(column, pageReader.getLong(column));
|
169
|
+
}
|
170
|
+
if (shouldAddRecord) return;
|
171
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
172
|
+
for (Condition _condition : conditionList) {
|
173
|
+
LongCondition condition = (LongCondition)_condition;
|
174
|
+
if (pageReader.isNull(column)) {
|
175
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
176
|
+
} else {
|
177
|
+
long subject = pageReader.getLong(column);
|
178
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
179
|
+
}
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
@Override
|
184
|
+
public void doubleColumn(Column column) {
|
185
|
+
if (pageReader.isNull(column)) {
|
186
|
+
pageBuilder.setNull(column);
|
187
|
+
} else {
|
188
|
+
pageBuilder.setDouble(column, pageReader.getDouble(column));
|
189
|
+
}
|
190
|
+
if (shouldAddRecord) return;
|
191
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
192
|
+
for (Condition _condition : conditionList) {
|
193
|
+
DoubleCondition condition = (DoubleCondition)_condition;
|
194
|
+
if (pageReader.isNull(column)) {
|
195
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
196
|
+
} else {
|
197
|
+
double subject = pageReader.getDouble(column);
|
198
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
199
|
+
}
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
@Override
|
204
|
+
public void stringColumn(Column column) {
|
205
|
+
if (pageReader.isNull(column)) {
|
206
|
+
pageBuilder.setNull(column);
|
207
|
+
} else {
|
208
|
+
pageBuilder.setString(column, pageReader.getString(column));
|
209
|
+
}
|
210
|
+
if (shouldAddRecord) return;
|
211
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
212
|
+
for (Condition _condition : conditionList) {
|
213
|
+
StringCondition condition = (StringCondition)_condition;
|
214
|
+
if (pageReader.isNull(column)) {
|
215
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
216
|
+
} else {
|
217
|
+
String subject = pageReader.getString(column);
|
218
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
219
|
+
}
|
220
|
+
}
|
221
|
+
}
|
222
|
+
|
223
|
+
@Override
|
224
|
+
public void timestampColumn(Column column) {
|
225
|
+
if (pageReader.isNull(column)) {
|
226
|
+
pageBuilder.setNull(column);
|
227
|
+
} else {
|
228
|
+
pageBuilder.setTimestamp(column, pageReader.getTimestamp(column));
|
229
|
+
}
|
230
|
+
if (shouldAddRecord) return;
|
231
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
232
|
+
for (Condition _condition : conditionList) {
|
233
|
+
TimestampCondition condition = (TimestampCondition)_condition;
|
234
|
+
if (pageReader.isNull(column)) {
|
235
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
236
|
+
} else {
|
237
|
+
Timestamp subject = pageReader.getTimestamp(column);
|
238
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
}
|
243
|
+
|
244
|
+
class ColumnVisitorAndImpl implements ColumnVisitor {
|
125
245
|
private final PageBuilder pageBuilder;
|
126
246
|
|
127
|
-
|
247
|
+
ColumnVisitorAndImpl(PageBuilder pageBuilder) {
|
128
248
|
this.pageBuilder = pageBuilder;
|
129
249
|
}
|
130
250
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-row
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -51,7 +51,8 @@ files:
|
|
51
51
|
- LICENSE.txt
|
52
52
|
- README.md
|
53
53
|
- build.gradle
|
54
|
-
- example.yml
|
54
|
+
- example/and.yml
|
55
|
+
- example/or.yml
|
55
56
|
- gradle/wrapper/gradle-wrapper.jar
|
56
57
|
- gradle/wrapper/gradle-wrapper.properties
|
57
58
|
- gradlew
|
@@ -73,7 +74,7 @@ files:
|
|
73
74
|
- src/test/java/org/embulk/filter/row/TestLongCondition.java
|
74
75
|
- src/test/java/org/embulk/filter/row/TestStringCondition.java
|
75
76
|
- src/test/java/org/embulk/filter/row/TestTimestampCondition.java
|
76
|
-
- classpath/embulk-filter-row-0.
|
77
|
+
- classpath/embulk-filter-row-0.2.0.jar
|
77
78
|
homepage: https://github.com/sonots/embulk-filter-row
|
78
79
|
licenses:
|
79
80
|
- MIT
|