embulk-filter-row 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +38 -8
- data/build.gradle +1 -1
- data/classpath/{embulk-filter-row-0.1.4.jar → embulk-filter-row-0.2.0.jar} +0 -0
- data/{example.yml → example/and.yml} +1 -1
- data/example/or.yml +28 -0
- data/src/main/java/org/embulk/filter/RowFilterPlugin.java +125 -5
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87ce06dbda2e17a52f4825208c79414b8938fb58
|
4
|
+
data.tar.gz: eb2bccca46662779c8e43041520b2a6fd2f4075e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 70708739971679cbed722a954e37e47390d9640dfdda35f291ce18ce44cd755fb125adc7a1b04b578ba0eb8a32be4fc41b701364fe3f421c47d9a17c69ea5b60
|
7
|
+
data.tar.gz: 25265ac26fab5c139930be8e7976a984466ee9e0bc8cae3343b1bbff94bac63d0a9fab74683132e8d6d8a68a2fc93831e9abbfae7cbcf5f207202b7ffefed979
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,8 @@ A filter plugin for Embulk to filter out rows
|
|
6
6
|
|
7
7
|
## Configuration
|
8
8
|
|
9
|
-
* **
|
9
|
+
* **condition**: AND or OR (string, default: AND).
|
10
|
+
* **conditions**: select only rows which match the conditions.
|
10
11
|
* **column**: column name (string, required)
|
11
12
|
* **operator** operator (string, optional, default: ==)
|
12
13
|
* boolean operator
|
@@ -35,11 +36,12 @@ A filter plugin for Embulk to filter out rows
|
|
35
36
|
|
36
37
|
NOTE: column type is automatically retrieved from input data (inputSchema)
|
37
38
|
|
38
|
-
## Example
|
39
|
+
## Example (AND)
|
39
40
|
|
40
41
|
```yaml
|
41
42
|
filters:
|
42
43
|
- type: row
|
44
|
+
condition: AND
|
43
45
|
conditions:
|
44
46
|
- {column: foo, operator: "IS NOT NULL"}
|
45
47
|
- {column: id, operator: ">=", argument: 10}
|
@@ -48,13 +50,41 @@ filters:
|
|
48
50
|
- {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
|
49
51
|
```
|
50
52
|
|
51
|
-
|
53
|
+
## Example (OR)
|
54
|
+
|
55
|
+
```yaml
|
56
|
+
filters:
|
57
|
+
- type: row
|
58
|
+
condition: OR
|
59
|
+
conditions:
|
60
|
+
- {column: a, operator: "IS NOT NULL"}
|
61
|
+
- {column: b, operator: "IS NOT NULL"}
|
62
|
+
```
|
63
|
+
|
64
|
+
## Example (AND of OR)
|
65
|
+
|
66
|
+
embulk-filter-row does not directly support complex conditions such as `((A OR B) AND (C OR D))`, but you should be able to express most complex conditions by combining multiple filters, like this:
|
67
|
+
|
68
|
+
```yaml
|
69
|
+
filters:
|
70
|
+
- type: row
|
71
|
+
condition: OR
|
72
|
+
conditions:
|
73
|
+
- {column: a, operator: "IS NOT NULL"}
|
74
|
+
- {column: b, operator: "IS NOT NULL"}
|
75
|
+
- type: row
|
76
|
+
condition: OR
|
77
|
+
conditions:
|
78
|
+
- {column: c, operator: "IS NOT NULL"}
|
79
|
+
- {column: d, operator: "IS NOT NULL"}
|
80
|
+
```
|
81
|
+
|
82
|
+
This is equivalent to `((A OR B) AND (C OR D))`.
|
52
83
|
|
53
|
-
##
|
84
|
+
## Not Supported: More Complex Conditions
|
54
85
|
|
55
|
-
*
|
56
|
-
|
57
|
-
* With them, it is possible to send a query to local files, even to S3 files.
|
86
|
+
* For more complex conditions, consider using a query engine such as [Apache Drill](https://drill.apache.org/) or [Presto](https://prestodb.io/).
|
87
|
+
* With them, it is possible to send a query to local files, even to S3 files.
|
58
88
|
|
59
89
|
## ChangeLog
|
60
90
|
|
@@ -66,7 +96,7 @@ Run example:
|
|
66
96
|
|
67
97
|
```
|
68
98
|
$ ./gradlew classpath
|
69
|
-
$ embulk run -I lib example.yml
|
99
|
+
$ embulk run -I lib example/and.yml
|
70
100
|
```
|
71
101
|
|
72
102
|
Run test:
|
data/build.gradle
CHANGED
Binary file
|
data/example/or.yml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/example.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: 'NULL'
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: foo, type: string}
|
14
|
+
- {name: bar, type: string}
|
15
|
+
- {name: flag, type: boolean}
|
16
|
+
- {name: id, type: long}
|
17
|
+
- {name: name, type: string}
|
18
|
+
- {name: score, type: double}
|
19
|
+
filters:
|
20
|
+
- type: row
|
21
|
+
condition: OR
|
22
|
+
conditions:
|
23
|
+
- {column: name, operator: ==, argument: "Vqjht6YEUBsMPXmoW1iOGFROZF27pBzz0TUkOKeDXEY"}
|
24
|
+
- {column: score, operator: ==, argument: 43}
|
25
|
+
- {column: id, operator: ==, argument: 97}
|
26
|
+
- {column: flag, operator: ==, argument: false}
|
27
|
+
out:
|
28
|
+
type: stdout
|
@@ -4,6 +4,7 @@ import org.embulk.config.Config;
|
|
4
4
|
import org.embulk.config.ConfigDefault;
|
5
5
|
import org.embulk.config.ConfigDiff;
|
6
6
|
import org.embulk.config.ConfigSource;
|
7
|
+
import org.embulk.config.ConfigException;
|
7
8
|
import org.embulk.config.Task;
|
8
9
|
import org.embulk.config.TaskSource;
|
9
10
|
|
@@ -51,6 +52,10 @@ public class RowFilterPlugin implements FilterPlugin
|
|
51
52
|
|
52
53
|
public interface PluginTask extends Task, TimestampParser.Task
|
53
54
|
{
|
55
|
+
@Config("condition")
|
56
|
+
@ConfigDefault("\"AND\"")
|
57
|
+
public String getCondition();
|
58
|
+
|
54
59
|
@Config("conditions")
|
55
60
|
public List<ConditionConfig> getConditions();
|
56
61
|
}
|
@@ -66,6 +71,11 @@ public class RowFilterPlugin implements FilterPlugin
|
|
66
71
|
inputSchema.lookupColumn(columnName); // throw SchemaConfigException if not found
|
67
72
|
}
|
68
73
|
|
74
|
+
String condition = task.getCondition().toLowerCase();
|
75
|
+
if (!condition.equals("or") && !condition.equals("and")) {
|
76
|
+
throw new ConfigException("condition must be either of \"or\" or \"and\".");
|
77
|
+
}
|
78
|
+
|
69
79
|
Schema outputSchema = inputSchema;
|
70
80
|
control.run(task.dump(), outputSchema);
|
71
81
|
}
|
@@ -76,6 +86,8 @@ public class RowFilterPlugin implements FilterPlugin
|
|
76
86
|
{
|
77
87
|
PluginTask task = taskSource.loadTask(PluginTask.class);
|
78
88
|
|
89
|
+
final boolean orCondition = task.getCondition().toLowerCase().equals("or");
|
90
|
+
|
79
91
|
final HashMap<String, List<Condition>> conditionMap = new HashMap<String, List<Condition>>();
|
80
92
|
for (Column column : outputSchema.getColumns()) {
|
81
93
|
String columnName = column.getName();
|
@@ -97,7 +109,8 @@ public class RowFilterPlugin implements FilterPlugin
|
|
97
109
|
return new PageOutput() {
|
98
110
|
private PageReader pageReader = new PageReader(inputSchema);
|
99
111
|
private PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
100
|
-
private boolean shouldAddRecord
|
112
|
+
private boolean shouldAddRecord;
|
113
|
+
private ColumnVisitor visitor = orCondition ? new ColumnVisitorOrImpl(pageBuilder) : new ColumnVisitorAndImpl(pageBuilder);
|
101
114
|
|
102
115
|
@Override
|
103
116
|
public void finish() {
|
@@ -113,18 +126,125 @@ public class RowFilterPlugin implements FilterPlugin
|
|
113
126
|
public void add(Page page) {
|
114
127
|
pageReader.setPage(page);
|
115
128
|
|
116
|
-
ColumnVisitorImpl visitor = new ColumnVisitorImpl(pageBuilder);
|
117
129
|
while (pageReader.nextRecord()) {
|
118
|
-
shouldAddRecord = true;
|
130
|
+
shouldAddRecord = orCondition ? false : true;
|
119
131
|
inputSchema.visitColumns(visitor);
|
120
132
|
if (shouldAddRecord) pageBuilder.addRecord();
|
121
133
|
}
|
122
134
|
}
|
123
135
|
|
124
|
-
class
|
136
|
+
class ColumnVisitorOrImpl implements ColumnVisitor {
|
137
|
+
private final PageBuilder pageBuilder;
|
138
|
+
|
139
|
+
ColumnVisitorOrImpl(PageBuilder pageBuilder) {
|
140
|
+
this.pageBuilder = pageBuilder;
|
141
|
+
}
|
142
|
+
|
143
|
+
@Override
|
144
|
+
public void booleanColumn(Column column) {
|
145
|
+
if (pageReader.isNull(column)) {
|
146
|
+
pageBuilder.setNull(column);
|
147
|
+
} else {
|
148
|
+
pageBuilder.setBoolean(column, pageReader.getBoolean(column));
|
149
|
+
}
|
150
|
+
if (shouldAddRecord) return;
|
151
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
152
|
+
for (Condition _condition : conditionList) {
|
153
|
+
BooleanCondition condition = (BooleanCondition)_condition;
|
154
|
+
if (pageReader.isNull(column)) {
|
155
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
156
|
+
} else {
|
157
|
+
boolean subject = pageReader.getBoolean(column);
|
158
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
159
|
+
}
|
160
|
+
}
|
161
|
+
}
|
162
|
+
|
163
|
+
@Override
|
164
|
+
public void longColumn(Column column) {
|
165
|
+
if (pageReader.isNull(column)) {
|
166
|
+
pageBuilder.setNull(column);
|
167
|
+
} else {
|
168
|
+
pageBuilder.setLong(column, pageReader.getLong(column));
|
169
|
+
}
|
170
|
+
if (shouldAddRecord) return;
|
171
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
172
|
+
for (Condition _condition : conditionList) {
|
173
|
+
LongCondition condition = (LongCondition)_condition;
|
174
|
+
if (pageReader.isNull(column)) {
|
175
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
176
|
+
} else {
|
177
|
+
long subject = pageReader.getLong(column);
|
178
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
179
|
+
}
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
@Override
|
184
|
+
public void doubleColumn(Column column) {
|
185
|
+
if (pageReader.isNull(column)) {
|
186
|
+
pageBuilder.setNull(column);
|
187
|
+
} else {
|
188
|
+
pageBuilder.setDouble(column, pageReader.getDouble(column));
|
189
|
+
}
|
190
|
+
if (shouldAddRecord) return;
|
191
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
192
|
+
for (Condition _condition : conditionList) {
|
193
|
+
DoubleCondition condition = (DoubleCondition)_condition;
|
194
|
+
if (pageReader.isNull(column)) {
|
195
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
196
|
+
} else {
|
197
|
+
double subject = pageReader.getDouble(column);
|
198
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
199
|
+
}
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
@Override
|
204
|
+
public void stringColumn(Column column) {
|
205
|
+
if (pageReader.isNull(column)) {
|
206
|
+
pageBuilder.setNull(column);
|
207
|
+
} else {
|
208
|
+
pageBuilder.setString(column, pageReader.getString(column));
|
209
|
+
}
|
210
|
+
if (shouldAddRecord) return;
|
211
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
212
|
+
for (Condition _condition : conditionList) {
|
213
|
+
StringCondition condition = (StringCondition)_condition;
|
214
|
+
if (pageReader.isNull(column)) {
|
215
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
216
|
+
} else {
|
217
|
+
String subject = pageReader.getString(column);
|
218
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
219
|
+
}
|
220
|
+
}
|
221
|
+
}
|
222
|
+
|
223
|
+
@Override
|
224
|
+
public void timestampColumn(Column column) {
|
225
|
+
if (pageReader.isNull(column)) {
|
226
|
+
pageBuilder.setNull(column);
|
227
|
+
} else {
|
228
|
+
pageBuilder.setTimestamp(column, pageReader.getTimestamp(column));
|
229
|
+
}
|
230
|
+
if (shouldAddRecord) return;
|
231
|
+
List<Condition> conditionList = conditionMap.get(column.getName());
|
232
|
+
for (Condition _condition : conditionList) {
|
233
|
+
TimestampCondition condition = (TimestampCondition)_condition;
|
234
|
+
if (pageReader.isNull(column)) {
|
235
|
+
if (condition.compare(null)) { shouldAddRecord = true; break; }
|
236
|
+
} else {
|
237
|
+
Timestamp subject = pageReader.getTimestamp(column);
|
238
|
+
if (condition.compare(subject)) { shouldAddRecord = true; break; }
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
}
|
243
|
+
|
244
|
+
class ColumnVisitorAndImpl implements ColumnVisitor {
|
125
245
|
private final PageBuilder pageBuilder;
|
126
246
|
|
127
|
-
|
247
|
+
ColumnVisitorAndImpl(PageBuilder pageBuilder) {
|
128
248
|
this.pageBuilder = pageBuilder;
|
129
249
|
}
|
130
250
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-row
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -51,7 +51,8 @@ files:
|
|
51
51
|
- LICENSE.txt
|
52
52
|
- README.md
|
53
53
|
- build.gradle
|
54
|
-
- example.yml
|
54
|
+
- example/and.yml
|
55
|
+
- example/or.yml
|
55
56
|
- gradle/wrapper/gradle-wrapper.jar
|
56
57
|
- gradle/wrapper/gradle-wrapper.properties
|
57
58
|
- gradlew
|
@@ -73,7 +74,7 @@ files:
|
|
73
74
|
- src/test/java/org/embulk/filter/row/TestLongCondition.java
|
74
75
|
- src/test/java/org/embulk/filter/row/TestStringCondition.java
|
75
76
|
- src/test/java/org/embulk/filter/row/TestTimestampCondition.java
|
76
|
-
- classpath/embulk-filter-row-0.
|
77
|
+
- classpath/embulk-filter-row-0.2.0.jar
|
77
78
|
homepage: https://github.com/sonots/embulk-filter-row
|
78
79
|
licenses:
|
79
80
|
- MIT
|