embulk-filter-row 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -2
- data/README.md +119 -102
- data/build.gradle +4 -1
- data/example/regexp_multibyte.yml +24 -0
- data/src/main/java/org/embulk/filter/row/where/ParserExp.java +15 -7
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04f141506053778ed4ff092894f87c9f22acca3d
|
4
|
+
data.tar.gz: 8f7435bc29f4c0e97e44141771a4ce608edb429a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f27041d19e78633e4eedac7426d8d96e2180e047e0793f7c37b55929ba1f8da95f10966ac4643cd0dd690fe6e63ca8c82b95b0afaac64d2d96a89440f97bcf8
|
7
|
+
data.tar.gz: c12e1cca2eed39fa039180237808bc898e415af8a03b889c04ebe010eadcbbfd84cd82d4aaa62d18f2d17f9d71cb2382055e056c2ef30feebeceaf4ed53cc181
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
# 0.4.0 (2017-05-18)
|
2
|
+
|
3
|
+
Enhancements
|
4
|
+
|
5
|
+
* Use joni library for REGEXP operator to improve performance
|
6
|
+
* Note that this may introduce trivial incompatible changes
|
7
|
+
|
1
8
|
# 0.3.3 (2016-08-09)
|
2
9
|
|
3
10
|
Enhancements
|
@@ -19,8 +26,7 @@ Fxies:
|
|
19
26
|
|
20
27
|
Changes:
|
21
28
|
|
22
|
-
* Change identifier syntax from `[a-zA-Z$][a-zA-z0-9\.\-_]*` to `[a-zA-Z_][a-zA-z0-9_]*`
|
23
|
-
* Allow starting _. Disallow staring $. Diallow -. Disallow \.
|
29
|
+
* Change identifier syntax from `[a-zA-Z$][a-zA-z0-9\.\-_]*` to `[a-zA-Z_][a-zA-z0-9_]*`: allow a leading `_`, disallow a leading `$`, disallow `-`, and disallow `.` (dot).
|
24
30
|
|
25
31
|
# 0.3.0 (2016-08-06)
|
26
32
|
|
data/README.md
CHANGED
@@ -6,93 +6,18 @@ A filter plugin for Embulk to filter out rows
|
|
6
6
|
|
7
7
|
## Configuration
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
Following options are **deprecated**, and will be removed someday.
|
12
|
-
|
13
|
-
* **condition**: AND or OR (string, default: AND).
|
14
|
-
* **conditions**: select only rows which matches with conditions.
|
15
|
-
* **column**: column name (string, required)
|
16
|
-
* **operator** operator (string, optional, default: ==)
|
17
|
-
* boolean operator
|
18
|
-
* ==
|
19
|
-
* !=
|
20
|
-
* numeric operator (long, double, Timestamp)
|
21
|
-
* ==
|
22
|
-
* !=
|
23
|
-
* >
|
24
|
-
* >=
|
25
|
-
* <=
|
26
|
-
* <
|
27
|
-
* string operator
|
28
|
-
* ==
|
29
|
-
* !=
|
30
|
-
* start_with (or startsWith)
|
31
|
-
* end_with (or endsWith)
|
32
|
-
* include (or contains)
|
33
|
-
* unary operator
|
34
|
-
* "IS NULL"
|
35
|
-
* "IS NOT NULL"
|
36
|
-
* **argument**: argument for the operation (string, required for non-unary operators)
|
37
|
-
* **not**: not (boolean, optional, default: false)
|
38
|
-
* **format**: special option for timestamp column, specify the format of timestamp argument, parsed argument is compared with the column value as Timestamp object (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
39
|
-
* **timezone**: special option for timestamp column, specify the timezone of timestamp argument (string, default is `UTC`)
|
40
|
-
|
41
|
-
NOTE: column type is automatically retrieved from input data (inputSchema)
|
9
|
+
Versions >= 0.3.0 have a `where` option to support SQL-like syntax.
|
42
10
|
|
43
|
-
|
44
|
-
|
45
|
-
**Deprecated**
|
46
|
-
|
47
|
-
```yaml
|
48
|
-
filters:
|
49
|
-
- type: row
|
50
|
-
condition: AND
|
51
|
-
conditions:
|
52
|
-
- {column: foo, operator: "IS NOT NULL"}
|
53
|
-
- {column: id, operator: ">=", argument: 10}
|
54
|
-
- {column: id, operator: "<", argument: 20}
|
55
|
-
- {column: name, opeartor: "include", argument: foo, not: true}
|
56
|
-
- {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
|
57
|
-
```
|
58
|
-
|
59
|
-
## Example (OR)
|
11
|
+
* **where**: Select only rows that match the conditions written in SQL-like syntax. See [SQL-like Syntax](#sql-like-syntax)
|
60
12
|
|
61
|
-
|
13
|
+
## Example
|
62
14
|
|
63
15
|
```yaml
|
64
16
|
filters:
|
65
17
|
- type: row
|
66
|
-
|
67
|
-
conditions:
|
68
|
-
- {column: a, operator: "IS NOT NULL"}
|
69
|
-
- {column: b, operator: "IS NOT NULL"}
|
18
|
+
where: column1 = 'str'
|
70
19
|
```
|
71
20
|
|
72
|
-
## Example (AND of OR)
|
73
|
-
|
74
|
-
**Deprecated**
|
75
|
-
|
76
|
-
You can express a condition such as `(A OR B) AND (C OR D)` by combining multiple filters like
|
77
|
-
|
78
|
-
```yaml
|
79
|
-
filters:
|
80
|
-
- type: row
|
81
|
-
condition: OR
|
82
|
-
conditions:
|
83
|
-
- {column: a, operator: "IS NOT NULL"}
|
84
|
-
- {column: b, operator: "IS NOT NULL"}
|
85
|
-
- type: row
|
86
|
-
condition: OR
|
87
|
-
conditions:
|
88
|
-
- {column: c, operator: "IS NOT NULL"}
|
89
|
-
- {column: d, operator: "IS NOT NULL"}
|
90
|
-
```
|
91
|
-
|
92
|
-
## Example (WHERE)
|
93
|
-
|
94
|
-
Versions >= 0.3.0 suppors SQL-like syntax like
|
95
|
-
|
96
21
|
```yaml
|
97
22
|
filters:
|
98
23
|
- type: row
|
@@ -145,6 +70,8 @@ Characters surrounded by `'` such as `'foo'` is considered as a string literal
|
|
145
70
|
|
146
71
|
### Timestamp Literal
|
147
72
|
|
73
|
+
NOTE: It became possible to omit `TIMESTAMP` keyword on comparing with `timestamp` identifier (column) from version >= 0.3.3.
|
74
|
+
|
148
75
|
`TIMESTAMP ( NumberLiteral | StringLiteral )` such as `TIMESTAMP 1470433087.747123` or `TIMESTAMP '2016-08-06 06:38:07.747123 +0900'` is considered as a timestamp literal
|
149
76
|
|
150
77
|
Number is an epoch time since 1970-01-01 UTC with nanosecond resolution.
|
@@ -160,49 +87,47 @@ String is a timestamp string which matches with one of following format:
|
|
160
87
|
|
161
88
|
The time zone for formats without `%z` is UTC, and the time resolution is micro second (caused by limitation of Embulk TimestampParser).
|
162
89
|
|
163
|
-
It is possible to omit `TIMESTAMP` keyword on comparing with `timestamp` identifier from version >= 0.3.3.
|
164
|
-
|
165
90
|
### Json Literal
|
166
91
|
|
167
92
|
Not supported yet
|
168
93
|
|
169
94
|
### Identifier Literal
|
170
95
|
|
171
|
-
Characters matching with a regular expression `[a-zA-Z_][a-zA-z0-9_]*` such as `foobar`, and characters surrounded by `"` such as `"foo\"bar"` are considred as an identifier literal, that is, embulk's column name.
|
96
|
+
Characters matching the regular expression `[a-zA-Z_][a-zA-z0-9_]*` such as `foobar`, and characters surrounded by `"` such as `"foo-bar"`, `"foo.bar"`, and `"foo\"bar"` are considered an identifier literal, that is, an embulk column name.
|
172
97
|
|
173
98
|
## Operators
|
174
99
|
|
175
100
|
### Boolean Operator
|
176
101
|
|
177
|
-
*
|
178
|
-
*
|
102
|
+
* `=`
|
103
|
+
* `!=`
|
179
104
|
|
180
105
|
### Number Operator (Long and Double)
|
181
106
|
|
182
|
-
*
|
183
|
-
*
|
184
|
-
*
|
185
|
-
*
|
186
|
-
*
|
187
|
-
*
|
107
|
+
* `=`
|
108
|
+
* `!=`
|
109
|
+
* `>`
|
110
|
+
* `>=`
|
111
|
+
* `<=`
|
112
|
+
* `<`
|
188
113
|
|
189
114
|
### String Operator
|
190
115
|
|
191
|
-
*
|
192
|
-
*
|
193
|
-
* START_WITH
|
194
|
-
* END_WITH
|
195
|
-
* INCLUDE
|
196
|
-
* REGEXP
|
116
|
+
* `=`
|
117
|
+
* `!=`
|
118
|
+
* `START_WITH`
|
119
|
+
* `END_WITH`
|
120
|
+
* `INCLUDE`
|
121
|
+
* `REGEXP`
|
197
122
|
|
198
123
|
### Timestamp Operator
|
199
124
|
|
200
|
-
*
|
201
|
-
*
|
202
|
-
*
|
203
|
-
*
|
204
|
-
*
|
205
|
-
*
|
125
|
+
* `=`
|
126
|
+
* `!=`
|
127
|
+
* `>`
|
128
|
+
* `>=`
|
129
|
+
* `<=`
|
130
|
+
* `<`
|
206
131
|
|
207
132
|
### Json Operator
|
208
133
|
|
@@ -214,9 +139,101 @@ Not supported yet
|
|
214
139
|
* "xxx IS NOT NULL"
|
215
140
|
* "NOT xxx"
|
216
141
|
|
142
|
+
## Old Configuration
|
143
|
+
|
144
|
+
Versions >= 0.3.0 have a `where` option to support SQL-like syntax. Using it is recommended.
|
145
|
+
|
146
|
+
Following options are **deprecated**, and **will be removed someday**.
|
147
|
+
|
148
|
+
* **condition**: AND or OR (string, default: AND).
|
149
|
+
* **conditions**: select only rows that match the conditions.
|
150
|
+
* **column**: column name (string, required)
|
151
|
+
* **operator** operator (string, optional, default: ==)
|
152
|
+
* boolean operator
|
153
|
+
* `==`
|
154
|
+
* `!=`
|
155
|
+
* numeric operator (long, double, Timestamp)
|
156
|
+
* `==`
|
157
|
+
* `!=`
|
158
|
+
* `>`
|
159
|
+
* `>=`
|
160
|
+
* `<=`
|
161
|
+
* `<`
|
162
|
+
* string operator
|
163
|
+
* `==`
|
164
|
+
* `!=`
|
165
|
+
* `start_with` (or `startsWith`)
|
166
|
+
* `end_with` (or `endsWith`)
|
167
|
+
* `include` (or `contains`)
|
168
|
+
* unary operator
|
169
|
+
* `IS NULL`
|
170
|
+
* `IS NOT NULL`
|
171
|
+
* **argument**: argument for the operation (string, required for non-unary operators)
|
172
|
+
* **not**: not (boolean, optional, default: false)
|
173
|
+
* **format**: special option for timestamp column, specify the format of timestamp argument, parsed argument is compared with the column value as Timestamp object (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
174
|
+
* **timezone**: special option for timestamp column, specify the timezone of timestamp argument (string, default is `UTC`)
|
175
|
+
|
176
|
+
NOTE: column type is automatically retrieved from input data (inputSchema)
|
177
|
+
|
178
|
+
## Example (AND)
|
179
|
+
|
180
|
+
**Deprecated**
|
181
|
+
|
182
|
+
```yaml
|
183
|
+
filters:
|
184
|
+
- type: row
|
185
|
+
condition: AND
|
186
|
+
conditions:
|
187
|
+
- {column: foo, operator: "IS NOT NULL"}
|
188
|
+
- {column: id, operator: ">=", argument: 10}
|
189
|
+
- {column: id, operator: "<", argument: 20}
|
190
|
+
- {column: name, operator: "include", argument: foo, not: true}
|
191
|
+
- {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
|
192
|
+
```
|
193
|
+
|
194
|
+
## Example (OR)
|
195
|
+
|
196
|
+
**Deprecated**
|
197
|
+
|
198
|
+
```yaml
|
199
|
+
filters:
|
200
|
+
- type: row
|
201
|
+
condition: OR
|
202
|
+
conditions:
|
203
|
+
- {column: a, operator: "IS NOT NULL"}
|
204
|
+
- {column: b, operator: "IS NOT NULL"}
|
205
|
+
```
|
206
|
+
|
207
|
+
## Example (AND of OR)
|
208
|
+
|
209
|
+
**Deprecated**
|
210
|
+
|
211
|
+
You can express a condition such as `(A OR B) AND (C OR D)` by combining multiple filters like
|
212
|
+
|
213
|
+
```yaml
|
214
|
+
filters:
|
215
|
+
- type: row
|
216
|
+
condition: OR
|
217
|
+
conditions:
|
218
|
+
- {column: a, operator: "IS NOT NULL"}
|
219
|
+
- {column: b, operator: "IS NOT NULL"}
|
220
|
+
- type: row
|
221
|
+
condition: OR
|
222
|
+
conditions:
|
223
|
+
- {column: c, operator: "IS NOT NULL"}
|
224
|
+
- {column: d, operator: "IS NOT NULL"}
|
225
|
+
```
|
226
|
+
|
227
|
+
## Comparisons
|
228
|
+
|
229
|
+
* [embulk-filter-calcite](https://github.com/muga/embulk-filter-calcite)
|
230
|
+
* embulk-filter-calcite is a pretty nice plugin which enables us to write SQL query to filter embulk records.
|
231
|
+
* However, based on [my benchmark (Japanese)](http://qiita.com/sonots/items/a70482d29862de87624d), embulk-filter-calcite was 1471 times slower than embulk-filter-row in terms of the string `=` operator. Use `embulk-filter-row` (this plugin) when you need performance.
|
232
|
+
|
217
233
|
## ToDo
|
218
234
|
|
219
235
|
* Support filtering by values of `type: json` with JSONPath
|
236
|
+
* Support IN operator
|
220
237
|
|
221
238
|
## ChangeLog
|
222
239
|
|
data/build.gradle
CHANGED
@@ -15,7 +15,7 @@ configurations {
|
|
15
15
|
provided
|
16
16
|
}
|
17
17
|
|
18
|
-
version = "0.
|
18
|
+
version = "0.4.0"
|
19
19
|
sourceCompatibility = 1.7
|
20
20
|
targetCompatibility = 1.7
|
21
21
|
|
@@ -23,6 +23,9 @@ dependencies {
|
|
23
23
|
compile "org.embulk:embulk-core:0.8.+"
|
24
24
|
provided "org.embulk:embulk-core:0.8.+"
|
25
25
|
|
26
|
+
compile "org.jruby.joni:joni:2.1.11"
|
27
|
+
compile "org.jruby.jcodings:jcodings:1.0.18"
|
28
|
+
|
26
29
|
testCompile "junit:junit:4.+"
|
27
30
|
testCompile "org.embulk:embulk-core:0.8.+:tests"
|
28
31
|
testCompile "org.embulk:embulk-standards:0.8.+"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/regexp_multibyte.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: "NULL"
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: foo, type: string}
|
14
|
+
- {name: bar, type: string}
|
15
|
+
- {name: flag, type: boolean}
|
16
|
+
- {name: id, type: long}
|
17
|
+
- {name: name, type: string}
|
18
|
+
- {name: json, type: json}
|
19
|
+
- {name: score, type: double}
|
20
|
+
filters:
|
21
|
+
- type: row
|
22
|
+
where: name REGEXP '.*チ'
|
23
|
+
out:
|
24
|
+
type: stdout
|
@@ -4,8 +4,12 @@ import org.embulk.config.ConfigException;
|
|
4
4
|
import org.embulk.spi.PageReader;
|
5
5
|
import org.embulk.spi.time.Timestamp;
|
6
6
|
|
7
|
-
import
|
8
|
-
import
|
7
|
+
import org.jcodings.specific.UTF8Encoding;
|
8
|
+
import org.joni.Matcher;
|
9
|
+
import org.joni.Option;
|
10
|
+
import org.joni.Regex;
|
11
|
+
|
12
|
+
import java.nio.charset.StandardCharsets;
|
9
13
|
|
10
14
|
// Operation Node of AST (Abstract Syntax Tree)
|
11
15
|
public abstract class ParserExp extends ParserNode
|
@@ -282,12 +286,15 @@ class StringOpExp extends BinaryOpExp
|
|
282
286
|
|
283
287
|
class RegexpOpExp extends BinaryOpExp
|
284
288
|
{
|
285
|
-
|
289
|
+
Regex regex;
|
286
290
|
|
287
291
|
public RegexpOpExp(ParserLiteral left, ParserLiteral right, int operator)
|
288
292
|
{
|
289
293
|
super(left, right, operator);
|
290
|
-
|
294
|
+
|
295
|
+
byte[] pattern = (((StringLiteral)right).val).getBytes(StandardCharsets.UTF_8);
|
296
|
+
this.regex = new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE);
|
297
|
+
|
291
298
|
if (! left.isString()) {
|
292
299
|
throw new ConfigException(String.format("\"%s\" is not a String column", ((IdentifierLiteral)left).name));
|
293
300
|
}
|
@@ -300,9 +307,10 @@ class RegexpOpExp extends BinaryOpExp
|
|
300
307
|
|
301
308
|
public boolean eval(PageReader pageReader)
|
302
309
|
{
|
303
|
-
|
304
|
-
Matcher
|
305
|
-
|
310
|
+
byte[] l = left.getString(pageReader).getBytes(StandardCharsets.UTF_8);
|
311
|
+
Matcher matcher = regex.matcher(l);
|
312
|
+
int result = matcher.search(0, l.length, Option.DEFAULT);
|
313
|
+
return result != -1;
|
306
314
|
}
|
307
315
|
}
|
308
316
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-row
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -56,6 +56,8 @@ files:
|
|
56
56
|
- example/example.csv
|
57
57
|
- example/example.yml
|
58
58
|
- example/or.yml
|
59
|
+
- example/regexp_multibyte.csv
|
60
|
+
- example/regexp_multibyte.yml
|
59
61
|
- example/where.yml
|
60
62
|
- gradle/wrapper/gradle-wrapper.jar
|
61
63
|
- gradle/wrapper/gradle-wrapper.properties
|
@@ -94,7 +96,9 @@ files:
|
|
94
96
|
- src/test/java/org/embulk/filter/row/condition/TestTimestampCondition.java
|
95
97
|
- src/test/java/org/embulk/filter/row/where/TestParser.java
|
96
98
|
- src/test/java/org/embulk/filter/row/where/TestYylex.java
|
97
|
-
- classpath/embulk-filter-row-0.
|
99
|
+
- classpath/embulk-filter-row-0.4.0.jar
|
100
|
+
- classpath/jcodings-1.0.18.jar
|
101
|
+
- classpath/joni-2.1.11.jar
|
98
102
|
homepage: https://github.com/sonots/embulk-filter-row
|
99
103
|
licenses:
|
100
104
|
- MIT
|