embulk-filter-row 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -2
- data/README.md +119 -102
- data/build.gradle +4 -1
- data/example/regexp_multibyte.yml +24 -0
- data/src/main/java/org/embulk/filter/row/where/ParserExp.java +15 -7
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04f141506053778ed4ff092894f87c9f22acca3d
|
4
|
+
data.tar.gz: 8f7435bc29f4c0e97e44141771a4ce608edb429a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f27041d19e78633e4eedac7426d8d96e2180e047e0793f7c37b55929ba1f8da95f10966ac4643cd0dd690fe6e63ca8c82b95b0afaac64d2d96a89440f97bcf8
|
7
|
+
data.tar.gz: c12e1cca2eed39fa039180237808bc898e415af8a03b889c04ebe010eadcbbfd84cd82d4aaa62d18f2d17f9d71cb2382055e056c2ef30feebeceaf4ed53cc181
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
# 0.4.0 (2017-05-18)
|
2
|
+
|
3
|
+
Enhancements
|
4
|
+
|
5
|
+
* Use joni library for REGEXP operator to improve performance
|
6
|
+
* Note that this may introduce trivial incompatible changes
|
7
|
+
|
1
8
|
# 0.3.3 (2016-08-09)
|
2
9
|
|
3
10
|
Enhancements
|
@@ -19,8 +26,7 @@ Fxies:
|
|
19
26
|
|
20
27
|
Changes:
|
21
28
|
|
22
|
-
* Change identifier syntax from `[a-zA-Z$][a-zA-z0-9\.\-_]*` to `[a-zA-Z_][a-zA-z0-9_]*`
|
23
|
-
* Allow starting _. Disallow staring $. Diallow -. Disallow \.
|
29
|
+
* Change identifier syntax from `[a-zA-Z$][a-zA-z0-9\.\-_]*` to `[a-zA-Z_][a-zA-z0-9_]*`: allow a leading _, disallow a leading $, disallow -, and disallow . (dot).
|
24
30
|
|
25
31
|
# 0.3.0 (2016-08-06)
|
26
32
|
|
data/README.md
CHANGED
@@ -6,93 +6,18 @@ A filter plugin for Embulk to filter out rows
|
|
6
6
|
|
7
7
|
## Configuration
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
Following options are **deprecated**, and will be removed someday.
|
12
|
-
|
13
|
-
* **condition**: AND or OR (string, default: AND).
|
14
|
-
* **conditions**: select only rows which matches with conditions.
|
15
|
-
* **column**: column name (string, required)
|
16
|
-
* **operator** operator (string, optional, default: ==)
|
17
|
-
* boolean operator
|
18
|
-
* ==
|
19
|
-
* !=
|
20
|
-
* numeric operator (long, double, Timestamp)
|
21
|
-
* ==
|
22
|
-
* !=
|
23
|
-
* >
|
24
|
-
* >=
|
25
|
-
* <=
|
26
|
-
* <
|
27
|
-
* string operator
|
28
|
-
* ==
|
29
|
-
* !=
|
30
|
-
* start_with (or startsWith)
|
31
|
-
* end_with (or endsWith)
|
32
|
-
* include (or contains)
|
33
|
-
* unary operator
|
34
|
-
* "IS NULL"
|
35
|
-
* "IS NOT NULL"
|
36
|
-
* **argument**: argument for the operation (string, required for non-unary operators)
|
37
|
-
* **not**: not (boolean, optional, default: false)
|
38
|
-
* **format**: special option for timestamp column, specify the format of timestamp argument, parsed argument is compared with the column value as Timestamp object (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
39
|
-
* **timezone**: special option for timestamp column, specify the timezone of timestamp argument (string, default is `UTC`)
|
40
|
-
|
41
|
-
NOTE: column type is automatically retrieved from input data (inputSchema)
|
9
|
+
Versions >= 0.3.0 have a `where` option that supports SQL-like syntax.
|
42
10
|
|
43
|
-
|
44
|
-
|
45
|
-
**Deprecated**
|
46
|
-
|
47
|
-
```yaml
|
48
|
-
filters:
|
49
|
-
- type: row
|
50
|
-
condition: AND
|
51
|
-
conditions:
|
52
|
-
- {column: foo, operator: "IS NOT NULL"}
|
53
|
-
- {column: id, operator: ">=", argument: 10}
|
54
|
-
- {column: id, operator: "<", argument: 20}
|
55
|
-
- {column: name, opeartor: "include", argument: foo, not: true}
|
56
|
-
- {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
|
57
|
-
```
|
58
|
-
|
59
|
-
## Example (OR)
|
11
|
+
* **where**: Select only rows which match the conditions written in SQL-like syntax. See [SQL-like Syntax](#sql-like-syntax)
|
60
12
|
|
61
|
-
|
13
|
+
## Example
|
62
14
|
|
63
15
|
```yaml
|
64
16
|
filters:
|
65
17
|
- type: row
|
66
|
-
|
67
|
-
conditions:
|
68
|
-
- {column: a, operator: "IS NOT NULL"}
|
69
|
-
- {column: b, operator: "IS NOT NULL"}
|
18
|
+
where: column1 = 'str'
|
70
19
|
```
|
71
20
|
|
72
|
-
## Example (AND of OR)
|
73
|
-
|
74
|
-
**Deprecated**
|
75
|
-
|
76
|
-
You can express a condition such as `(A OR B) AND (C OR D)` by combining multiple filters like
|
77
|
-
|
78
|
-
```yaml
|
79
|
-
filters:
|
80
|
-
- type: row
|
81
|
-
condition: OR
|
82
|
-
conditions:
|
83
|
-
- {column: a, operator: "IS NOT NULL"}
|
84
|
-
- {column: b, operator: "IS NOT NULL"}
|
85
|
-
- type: row
|
86
|
-
condition: OR
|
87
|
-
conditions:
|
88
|
-
- {column: c, operator: "IS NOT NULL"}
|
89
|
-
- {column: d, operator: "IS NOT NULL"}
|
90
|
-
```
|
91
|
-
|
92
|
-
## Example (WHERE)
|
93
|
-
|
94
|
-
Versions >= 0.3.0 suppors SQL-like syntax like
|
95
|
-
|
96
21
|
```yaml
|
97
22
|
filters:
|
98
23
|
- type: row
|
@@ -145,6 +70,8 @@ Characters surrounded by `'` such as `'foo'` is considered as a string literal
|
|
145
70
|
|
146
71
|
### Timestamp Literal
|
147
72
|
|
73
|
+
NOTE: Since version 0.3.3, the `TIMESTAMP` keyword can be omitted when comparing with a `timestamp` identifier (column).
|
74
|
+
|
148
75
|
`TIMESTAMP ( NumberLiteral | StringLiteral )` such as `TIMESTAMP 1470433087.747123` or `TIMESTAMP '2016-08-06 06:38:07.747123 +0900'` is considered as a timestamp literal
|
149
76
|
|
150
77
|
Number is an epoch time since 1970-01-01 UTC with nano time resolution.
|
@@ -160,49 +87,47 @@ String is a timestamp string which matches with one of following format:
|
|
160
87
|
|
161
88
|
The time zone for formats without `%z` is UTC, and the time resolution is micro second (caused by limitation of Embulk TimestampParser).
|
162
89
|
|
163
|
-
It is possible to omit `TIMESTAMP` keyword on comparing with `timestamp` identifier from version >= 0.3.3.
|
164
|
-
|
165
90
|
### Json Literal
|
166
91
|
|
167
92
|
Not supported yet
|
168
93
|
|
169
94
|
### Identifier Literal
|
170
95
|
|
171
|
-
Characters matching with a regular expression `[a-zA-Z_][a-zA-z0-9_]*` such as `foobar`, and characters surrounded by `"` such as `"foo\"bar"` are considred as an identifier literal, that is, embulk's column name.
|
96
|
+
Characters matching the regular expression `[a-zA-Z_][a-zA-z0-9_]*` such as `foobar`, and characters surrounded by `"` such as `"foo-bar"`, `"foo.bar"`, and `"foo\"bar"` are considered an identifier literal, that is, embulk's column name.
|
172
97
|
|
173
98
|
## Operators
|
174
99
|
|
175
100
|
### Boolean Operator
|
176
101
|
|
177
|
-
*
|
178
|
-
*
|
102
|
+
* `=`
|
103
|
+
* `!=`
|
179
104
|
|
180
105
|
### Number Operator (Long and Double)
|
181
106
|
|
182
|
-
*
|
183
|
-
*
|
184
|
-
*
|
185
|
-
*
|
186
|
-
*
|
187
|
-
*
|
107
|
+
* `=`
|
108
|
+
* `!=`
|
109
|
+
* `>`
|
110
|
+
* `>=`
|
111
|
+
* `<=`
|
112
|
+
* `<`
|
188
113
|
|
189
114
|
### String Operator
|
190
115
|
|
191
|
-
*
|
192
|
-
*
|
193
|
-
* START_WITH
|
194
|
-
* END_WITH
|
195
|
-
* INCLUDE
|
196
|
-
* REGEXP
|
116
|
+
* `=`
|
117
|
+
* `!=`
|
118
|
+
* `START_WITH`
|
119
|
+
* `END_WITH`
|
120
|
+
* `INCLUDE`
|
121
|
+
* `REGEXP`
|
197
122
|
|
198
123
|
### Timestamp Operator
|
199
124
|
|
200
|
-
*
|
201
|
-
*
|
202
|
-
*
|
203
|
-
*
|
204
|
-
*
|
205
|
-
*
|
125
|
+
* `=`
|
126
|
+
* `!=`
|
127
|
+
* `>`
|
128
|
+
* `>=`
|
129
|
+
* `<=`
|
130
|
+
* `<`
|
206
131
|
|
207
132
|
### Json Operator
|
208
133
|
|
@@ -214,9 +139,101 @@ Not supported yet
|
|
214
139
|
* "xxx IS NOT NULL"
|
215
140
|
* "NOT xxx"
|
216
141
|
|
142
|
+
## Old Configuration
|
143
|
+
|
144
|
+
Versions >= 0.3.0 have a `where` option that supports SQL-like syntax. I recommend using it.
|
145
|
+
|
146
|
+
Following options are **deprecated**, and **will be removed someday**.
|
147
|
+
|
148
|
+
* **condition**: AND or OR (string, default: AND).
|
149
|
+
* **conditions**: select only rows which match the conditions.
|
150
|
+
* **column**: column name (string, required)
|
151
|
+
* **operator** operator (string, optional, default: ==)
|
152
|
+
* boolean operator
|
153
|
+
* `==`
|
154
|
+
* `!=`
|
155
|
+
* numeric operator (long, double, Timestamp)
|
156
|
+
* `==`
|
157
|
+
* `!=`
|
158
|
+
* `>`
|
159
|
+
* `>=`
|
160
|
+
* `<=`
|
161
|
+
* `<`
|
162
|
+
* string operator
|
163
|
+
* `==`
|
164
|
+
* `!=`
|
165
|
+
* `start_with` (or `startsWith`)
|
166
|
+
* `end_with` (or `endsWith`)
|
167
|
+
* `include` (or `contains`)
|
168
|
+
* unary operator
|
169
|
+
* `IS NULL`
|
170
|
+
* `IS NOT NULL`
|
171
|
+
* **argument**: argument for the operation (string, required for non-unary operators)
|
172
|
+
* **not**: not (boolean, optional, default: false)
|
173
|
+
* **format**: special option for timestamp column, specify the format of timestamp argument, parsed argument is compared with the column value as Timestamp object (string, default is `%Y-%m-%d %H:%M:%S.%N %z`)
|
174
|
+
* **timezone**: special option for timestamp column, specify the timezone of timestamp argument (string, default is `UTC`)
|
175
|
+
|
176
|
+
NOTE: column type is automatically retrieved from input data (inputSchema)
|
177
|
+
|
178
|
+
## Example (AND)
|
179
|
+
|
180
|
+
**Deprecated**
|
181
|
+
|
182
|
+
```yaml
|
183
|
+
filters:
|
184
|
+
- type: row
|
185
|
+
condition: AND
|
186
|
+
conditions:
|
187
|
+
- {column: foo, operator: "IS NOT NULL"}
|
188
|
+
- {column: id, operator: ">=", argument: 10}
|
189
|
+
- {column: id, operator: "<", argument: 20}
|
190
|
+
- {column: name, operator: "include", argument: foo, not: true}
|
191
|
+
- {column: time, operator: "==", argument: "2015-07-13", format: "%Y-%m-%d"}
|
192
|
+
```
|
193
|
+
|
194
|
+
## Example (OR)
|
195
|
+
|
196
|
+
**Deprecated**
|
197
|
+
|
198
|
+
```yaml
|
199
|
+
filters:
|
200
|
+
- type: row
|
201
|
+
condition: OR
|
202
|
+
conditions:
|
203
|
+
- {column: a, operator: "IS NOT NULL"}
|
204
|
+
- {column: b, operator: "IS NOT NULL"}
|
205
|
+
```
|
206
|
+
|
207
|
+
## Example (AND of OR)
|
208
|
+
|
209
|
+
**Deprecated**
|
210
|
+
|
211
|
+
You can express a condition such as `(A OR B) AND (C OR D)` by combining multiple filters like
|
212
|
+
|
213
|
+
```yaml
|
214
|
+
filters:
|
215
|
+
- type: row
|
216
|
+
condition: OR
|
217
|
+
conditions:
|
218
|
+
- {column: a, operator: "IS NOT NULL"}
|
219
|
+
- {column: b, operator: "IS NOT NULL"}
|
220
|
+
- type: row
|
221
|
+
condition: OR
|
222
|
+
conditions:
|
223
|
+
- {column: c, operator: "IS NOT NULL"}
|
224
|
+
- {column: d, operator: "IS NOT NULL"}
|
225
|
+
```
|
226
|
+
|
227
|
+
## Comparisons
|
228
|
+
|
229
|
+
* [embulk-filter-calcite](https://github.com/muga/embulk-filter-calcite)
|
230
|
+
* embulk-filter-calcite is a pretty nice plugin which enables us to write SQL query to filter embulk records.
|
231
|
+
* However, based on [my benchmark (Japanese)](http://qiita.com/sonots/items/a70482d29862de87624d), embulk-filter-calcite was 1471 times slower than embulk-filter-row in terms of the string `=` operator. Use `embulk-filter-row` (this plugin) when you need performance.
|
232
|
+
|
217
233
|
## ToDo
|
218
234
|
|
219
235
|
* Support filtering by values of `type: json` with JSONPath
|
236
|
+
* Support IN operator
|
220
237
|
|
221
238
|
## ChangeLog
|
222
239
|
|
data/build.gradle
CHANGED
@@ -15,7 +15,7 @@ configurations {
|
|
15
15
|
provided
|
16
16
|
}
|
17
17
|
|
18
|
-
version = "0.
|
18
|
+
version = "0.4.0"
|
19
19
|
sourceCompatibility = 1.7
|
20
20
|
targetCompatibility = 1.7
|
21
21
|
|
@@ -23,6 +23,9 @@ dependencies {
|
|
23
23
|
compile "org.embulk:embulk-core:0.8.+"
|
24
24
|
provided "org.embulk:embulk-core:0.8.+"
|
25
25
|
|
26
|
+
compile "org.jruby.joni:joni:2.1.11"
|
27
|
+
compile "org.jruby.jcodings:jcodings:1.0.18"
|
28
|
+
|
26
29
|
testCompile "junit:junit:4.+"
|
27
30
|
testCompile "org.embulk:embulk-core:0.8.+:tests"
|
28
31
|
testCompile "org.embulk:embulk-standards:0.8.+"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: example/regexp_multibyte.csv
|
4
|
+
parser:
|
5
|
+
type: csv
|
6
|
+
charset: UTF-8
|
7
|
+
newline: CRLF
|
8
|
+
null_string: "NULL"
|
9
|
+
skip_header_lines: 1
|
10
|
+
comment_line_marker: '#'
|
11
|
+
columns:
|
12
|
+
- {name: time, type: timestamp, format: "%Y-%m-%d"}
|
13
|
+
- {name: foo, type: string}
|
14
|
+
- {name: bar, type: string}
|
15
|
+
- {name: flag, type: boolean}
|
16
|
+
- {name: id, type: long}
|
17
|
+
- {name: name, type: string}
|
18
|
+
- {name: json, type: json}
|
19
|
+
- {name: score, type: double}
|
20
|
+
filters:
|
21
|
+
- type: row
|
22
|
+
where: name REGEXP '.*チ'
|
23
|
+
out:
|
24
|
+
type: stdout
|
@@ -4,8 +4,12 @@ import org.embulk.config.ConfigException;
|
|
4
4
|
import org.embulk.spi.PageReader;
|
5
5
|
import org.embulk.spi.time.Timestamp;
|
6
6
|
|
7
|
-
import
|
8
|
-
import
|
7
|
+
import org.jcodings.specific.UTF8Encoding;
|
8
|
+
import org.joni.Matcher;
|
9
|
+
import org.joni.Option;
|
10
|
+
import org.joni.Regex;
|
11
|
+
|
12
|
+
import java.nio.charset.StandardCharsets;
|
9
13
|
|
10
14
|
// Operation Node of AST (Abstract Syntax Tree)
|
11
15
|
public abstract class ParserExp extends ParserNode
|
@@ -282,12 +286,15 @@ class StringOpExp extends BinaryOpExp
|
|
282
286
|
|
283
287
|
class RegexpOpExp extends BinaryOpExp
|
284
288
|
{
|
285
|
-
|
289
|
+
Regex regex;
|
286
290
|
|
287
291
|
public RegexpOpExp(ParserLiteral left, ParserLiteral right, int operator)
|
288
292
|
{
|
289
293
|
super(left, right, operator);
|
290
|
-
|
294
|
+
|
295
|
+
byte[] pattern = (((StringLiteral)right).val).getBytes(StandardCharsets.UTF_8);
|
296
|
+
this.regex = new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE);
|
297
|
+
|
291
298
|
if (! left.isString()) {
|
292
299
|
throw new ConfigException(String.format("\"%s\" is not a String column", ((IdentifierLiteral)left).name));
|
293
300
|
}
|
@@ -300,9 +307,10 @@ class RegexpOpExp extends BinaryOpExp
|
|
300
307
|
|
301
308
|
public boolean eval(PageReader pageReader)
|
302
309
|
{
|
303
|
-
|
304
|
-
Matcher
|
305
|
-
|
310
|
+
byte[] l = left.getString(pageReader).getBytes(StandardCharsets.UTF_8);
|
311
|
+
Matcher matcher = regex.matcher(l);
|
312
|
+
int result = matcher.search(0, l.length, Option.DEFAULT);
|
313
|
+
return result != -1;
|
306
314
|
}
|
307
315
|
}
|
308
316
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-row
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naotoshi Seo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -56,6 +56,8 @@ files:
|
|
56
56
|
- example/example.csv
|
57
57
|
- example/example.yml
|
58
58
|
- example/or.yml
|
59
|
+
- example/regexp_multibyte.csv
|
60
|
+
- example/regexp_multibyte.yml
|
59
61
|
- example/where.yml
|
60
62
|
- gradle/wrapper/gradle-wrapper.jar
|
61
63
|
- gradle/wrapper/gradle-wrapper.properties
|
@@ -94,7 +96,9 @@ files:
|
|
94
96
|
- src/test/java/org/embulk/filter/row/condition/TestTimestampCondition.java
|
95
97
|
- src/test/java/org/embulk/filter/row/where/TestParser.java
|
96
98
|
- src/test/java/org/embulk/filter/row/where/TestYylex.java
|
97
|
-
- classpath/embulk-filter-row-0.
|
99
|
+
- classpath/embulk-filter-row-0.4.0.jar
|
100
|
+
- classpath/jcodings-1.0.18.jar
|
101
|
+
- classpath/joni-2.1.11.jar
|
98
102
|
homepage: https://github.com/sonots/embulk-filter-row
|
99
103
|
licenses:
|
100
104
|
- MIT
|