embulk-filter-expand_json 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +0 -2
- data/CHANGELOG.md +5 -0
- data/README.md +32 -1
- data/bench/.bundle/config +2 -0
- data/bench/.gitignore +2 -0
- data/bench/Gemfile +5 -0
- data/bench/Gemfile.lock +22 -0
- data/bench/config_raw.yml +14 -0
- data/bench/config_with_lru_cache.yml +27 -0
- data/bench/config_with_noop_cache.yml +28 -0
- data/bench/run.sh +57 -0
- data/build.gradle +18 -15
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/filter/expand_json/ExpandJsonFilterPlugin.java +55 -4
- data/src/main/java/org/embulk/filter/expand_json/FilteredPageOutput.java +0 -3
- data/src/test/java/org/embulk/filter/expand_json/MyNOOPCache.java +9 -0
- data/src/test/java/org/embulk/filter/expand_json/TestExpandJsonFilterPlugin.java +840 -2
- metadata +27 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a6ff1c6a5264a85b4aa62b7a7719c20f5a987c1
|
4
|
+
data.tar.gz: f00862e0f575be8a4f7b1ed938f56370a32d9b0b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 787eb4e46c979414d5ee986ec7e1deb2cef85477ac48d0b85aa55762de4dd6ac133d1fa877490e432c393b86cf61d40dac4fd021613bf012ced550200c26d3dc
|
7
|
+
data.tar.gz: 3f3352f3eb7dfd14b506bda85e0c15ac038d5b1495d618eb623ca6f686fc75ec823a28e1d2d1dde6e5e5c8f4f5b0f86169df1cb006a8b796adf8bb90ada6a9b5
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
0.3.0 (2019-05-02)
|
2
|
+
==================
|
3
|
+
- [Enhancement] Introduce `cache_provider` option.
|
4
|
+
- https://github.com/civitaspo/embulk-filter-expand_json/pull/41
|
5
|
+
|
1
6
|
0.2.2 (2017-09-14)
|
2
7
|
==================
|
3
8
|
- [Enhancement] Use TimestampParser's constructor without JRuby ScriptingContainer
|
data/README.md
CHANGED
@@ -21,6 +21,8 @@ expand columns having json into multiple columns
|
|
21
21
|
- **keep_expanding_json_column**: Do not remove the expanding JSON column from the input schema if this is true (false by default)
|
22
22
|
- **default_timezone**: Time zone of timestamp columns if values don’t include time zone description (`UTC` by default)
|
23
23
|
- **stop_on_invalid_record**: Stop bulk load transaction if an invalid record is included (false by default)
|
24
|
+
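- **cache_provider**: Cache provider name for JsonPath. `"LRU"` and `"NOOP"` are built-in (case-insensitive). You can also specify the fully qualified class name of a user-defined implementation of JsonPath's `Cache` interface. (string, default: `"LRU"`)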
|
25
|
+
- `"NOOP"` will become the default in a future release.
|
24
26
|
|
25
27
|
---
|
26
28
|
**type of the column**
|
@@ -53,6 +55,7 @@ filters:
|
|
53
55
|
- {name: "profile.like_words[0]", type: string}
|
54
56
|
```
|
55
57
|
|
58
|
+
|
56
59
|
## Note
|
57
60
|
- If the value evaluated by JsonPath is Array or Hash, the value is written as JSON.
|
58
61
|
|
@@ -61,13 +64,41 @@ filters:
|
|
61
64
|
- use to evaluate [JsonPath](http://goessner.net/articles/JsonPath/)
|
62
65
|
- [Apache License Version 2.0](https://github.com/jayway/JsonPath/blob/master/LICENSE)
|
63
66
|
|
67
|
+
## Development
|
64
68
|
|
65
|
-
|
69
|
+
### Run Example
|
70
|
+
|
71
|
+
```
|
72
|
+
./gradlew classpath
|
73
|
+
embulk run -Ilib ./example/config.yml
|
74
|
+
```
|
75
|
+
|
76
|
+
|
77
|
+
### Build
|
66
78
|
|
67
79
|
```
|
68
80
|
$ ./gradlew gem # -t to watch change of files and rebuild continuously
|
69
81
|
```
|
70
82
|
|
83
|
+
## Benchmark for `cache_provider` option
|
84
|
+
|
85
|
+
In some cases, `cache_provider: NOOP` makes this plugin about 3 times faster (https://github.com/civitaspo/embulk-filter-expand_json/pull/41/).
|
86
|
+
So we benchmarked the `cache_provider` option. In our case, `cache_provider: NOOP` improved throughput by about 1.5 times.
|
87
|
+
|
88
|
+
|use `expand_json` filter|cache_provider|Time taken|records/s|
|
89
|
+
|:---|:---|:---|:---|
|
90
|
+
|`false`|none|7.62s|1,325,459/s|
|
91
|
+
|`true`|`"LRU"`|2m9s|78,025/s|
|
92
|
+
|`true`|`"NOOP"`|1m25s|118,476/s|
|
93
|
+
|
94
|
+
|
95
|
+
You can reproduce the benchmark as follows.
|
96
|
+
|
97
|
+
```
|
98
|
+
./gradlew classpath
|
99
|
+
./bench/run.sh
|
100
|
+
```
|
101
|
+
|
71
102
|
## Contributor
|
72
103
|
- @Civitaspo
|
73
104
|
- @muga
|
data/bench/.gitignore
ADDED
data/bench/Gemfile
ADDED
data/bench/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
embulk (0.9.16-java)
|
5
|
+
bundler (>= 1.10.6)
|
6
|
+
liquid (~> 4.0.0)
|
7
|
+
msgpack (~> 1.1.0)
|
8
|
+
embulk-filter-speedometer (0.3.4)
|
9
|
+
embulk-parser-none (0.2.0)
|
10
|
+
liquid (4.0.3)
|
11
|
+
msgpack (1.1.0-java)
|
12
|
+
|
13
|
+
PLATFORMS
|
14
|
+
java
|
15
|
+
|
16
|
+
DEPENDENCIES
|
17
|
+
embulk
|
18
|
+
embulk-filter-speedometer
|
19
|
+
embulk-parser-none
|
20
|
+
|
21
|
+
BUNDLED WITH
|
22
|
+
1.16.0
|
@@ -0,0 +1,27 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: data.jsonl
|
4
|
+
parser:
|
5
|
+
type: none
|
6
|
+
column_name: payload
|
7
|
+
|
8
|
+
filters:
|
9
|
+
- type: speedometer
|
10
|
+
log_interval_seconds: 1
|
11
|
+
- type: expand_json
|
12
|
+
json_column_name: payload
|
13
|
+
root: "$."
|
14
|
+
expanded_columns:
|
15
|
+
- {name: "phone_numbers", type: string}
|
16
|
+
- {name: "app_id", type: long}
|
17
|
+
- {name: "point", type: double}
|
18
|
+
- {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
|
19
|
+
- {name: "profile.anniversary.et", type: string}
|
20
|
+
- {name: "profile.anniversary", type: string}
|
21
|
+
- {name: "profile.like_words[1]", type: string}
|
22
|
+
- {name: "profile.like_words[2]", type: string}
|
23
|
+
- {name: "profile.like_words", type: string}
|
24
|
+
|
25
|
+
out:
|
26
|
+
type: "null"
|
27
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: data.jsonl
|
4
|
+
parser:
|
5
|
+
type: none
|
6
|
+
column_name: payload
|
7
|
+
|
8
|
+
filters:
|
9
|
+
- type: speedometer
|
10
|
+
log_interval_seconds: 1
|
11
|
+
- type: expand_json
|
12
|
+
json_column_name: payload
|
13
|
+
cache_provider: noop
|
14
|
+
root: "$."
|
15
|
+
expanded_columns:
|
16
|
+
- {name: "phone_numbers", type: string}
|
17
|
+
- {name: "app_id", type: long}
|
18
|
+
- {name: "point", type: double}
|
19
|
+
- {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
|
20
|
+
- {name: "profile.anniversary.et", type: string}
|
21
|
+
- {name: "profile.anniversary", type: string}
|
22
|
+
- {name: "profile.like_words[1]", type: string}
|
23
|
+
- {name: "profile.like_words[2]", type: string}
|
24
|
+
- {name: "profile.like_words", type: string}
|
25
|
+
|
26
|
+
out:
|
27
|
+
type: "null"
|
28
|
+
|
data/bench/run.sh
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
BENCH_ROOT=$(cd $(dirname $0); pwd)
|
4
|
+
DATA_FILE=data.jsonl
|
5
|
+
TMP_DATA_FILE=tmp.jsonl
|
6
|
+
|
7
|
+
function now() {
|
8
|
+
date +"%FT%T%:z"
|
9
|
+
}
|
10
|
+
|
11
|
+
echo "[$(now)] Preparing ..."
|
12
|
+
(
|
13
|
+
cd $BENCH_ROOT
|
14
|
+
embulk bundle
|
15
|
+
|
16
|
+
if [ -f $DATA_FILE ]; then
|
17
|
+
rm -f $DATA_FILE
|
18
|
+
fi
|
19
|
+
if [ -f $TMP_DATA_FILE ]; then
|
20
|
+
rm -f $TMP_DATA_FILE
|
21
|
+
fi
|
22
|
+
for n in {1..100}; do
|
23
|
+
cat ../example/data.tsv | cut -f5 >> $TMP_DATA_FILE
|
24
|
+
done
|
25
|
+
for n in {1..1000}; do
|
26
|
+
cat $TMP_DATA_FILE >> $DATA_FILE
|
27
|
+
done
|
28
|
+
)
|
29
|
+
|
30
|
+
echo "[$(now)] Run No expand_json"
|
31
|
+
(
|
32
|
+
cd $BENCH_ROOT
|
33
|
+
time embulk run -I ../lib -b . config_raw.yml
|
34
|
+
)
|
35
|
+
|
36
|
+
echo "[$(now)] Run Default (LRUCache)"
|
37
|
+
(
|
38
|
+
cd $BENCH_ROOT
|
39
|
+
time embulk run -I ../lib -b . config_with_lru_cache.yml
|
40
|
+
)
|
41
|
+
|
42
|
+
echo "[$(now)] Run with NOOPCache"
|
43
|
+
(
|
44
|
+
cd $BENCH_ROOT
|
45
|
+
time embulk run -I ../lib -b . config_with_noop_cache.yml
|
46
|
+
)
|
47
|
+
|
48
|
+
echo "[$(now)] Teardown..."
|
49
|
+
(
|
50
|
+
cd $BENCH_ROOT
|
51
|
+
if [ -f $DATA_FILE ]; then
|
52
|
+
rm -f $DATA_FILE
|
53
|
+
fi
|
54
|
+
if [ -f $TMP_DATA_FILE ]; then
|
55
|
+
rm -f $TMP_DATA_FILE
|
56
|
+
fi
|
57
|
+
)
|
data/build.gradle
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
plugins {
|
2
2
|
id "com.jfrog.bintray" version "1.1"
|
3
|
-
id "com.github.jruby-gradle.base" version "
|
3
|
+
id "com.github.jruby-gradle.base" version "1.5.0"
|
4
4
|
id "com.github.kt3k.coveralls" version "2.4.0"
|
5
5
|
id "jacoco"
|
6
6
|
id "java"
|
@@ -15,16 +15,16 @@ configurations {
|
|
15
15
|
provided
|
16
16
|
}
|
17
17
|
|
18
|
-
version = "0.
|
19
|
-
sourceCompatibility = 1.
|
20
|
-
targetCompatibility = 1.
|
18
|
+
version = "0.3.0"
|
19
|
+
sourceCompatibility = 1.8
|
20
|
+
targetCompatibility = 1.8
|
21
21
|
|
22
22
|
dependencies {
|
23
|
-
compile "org.embulk:embulk-core:0.
|
24
|
-
provided "org.embulk:embulk-core:0.
|
23
|
+
compile "org.embulk:embulk-core:0.9.+"
|
24
|
+
provided "org.embulk:embulk-core:0.9.+"
|
25
25
|
compile "com.jayway.jsonpath:json-path:2.+"
|
26
26
|
testCompile "junit:junit:4.+"
|
27
|
-
testCompile "org.embulk:embulk-core:0.
|
27
|
+
testCompile "org.embulk:embulk-core:0.9.+:tests"
|
28
28
|
}
|
29
29
|
|
30
30
|
jacocoTestReport {
|
@@ -59,19 +59,23 @@ task checkstyle(type: Checkstyle) {
|
|
59
59
|
}
|
60
60
|
|
61
61
|
task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
|
62
|
-
jrubyArgs "-
|
63
|
-
script "
|
62
|
+
jrubyArgs "-S"
|
63
|
+
script "gem"
|
64
|
+
scriptArgs "build", "${project.name}.gemspec"
|
64
65
|
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
65
66
|
}
|
66
67
|
|
67
68
|
task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
|
68
|
-
jrubyArgs "-
|
69
|
-
script "
|
69
|
+
jrubyArgs "-S"
|
70
|
+
script "gem"
|
71
|
+
scriptArgs "push", "pkg/${project.name}-${project.version}.gem"
|
70
72
|
}
|
71
73
|
|
72
|
-
task "package"(dependsOn: ["gemspec", "classpath"])
|
73
|
-
|
74
|
-
|
74
|
+
task "package"(dependsOn: ["gemspec", "classpath"]) {
|
75
|
+
doLast {
|
76
|
+
println "> Build succeeded."
|
77
|
+
println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
|
78
|
+
}
|
75
79
|
}
|
76
80
|
|
77
81
|
task gemspec {
|
@@ -93,7 +97,6 @@ Gem::Specification.new do |spec|
|
|
93
97
|
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
94
98
|
spec.require_paths = ["lib"]
|
95
99
|
|
96
|
-
#spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
|
97
100
|
spec.add_development_dependency 'bundler', ['~> 1.0']
|
98
101
|
spec.add_development_dependency 'rake', ['>= 10.0']
|
99
102
|
end
|
data/settings.gradle
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rootProject.name = "embulk-filter-expand_json"
|
@@ -1,6 +1,11 @@
|
|
1
1
|
package org.embulk.filter.expand_json;
|
2
2
|
|
3
3
|
import com.google.common.collect.ImmutableList;
|
4
|
+
import com.jayway.jsonpath.JsonPathException;
|
5
|
+
import com.jayway.jsonpath.spi.cache.Cache;
|
6
|
+
import com.jayway.jsonpath.spi.cache.CacheProvider;
|
7
|
+
import com.jayway.jsonpath.spi.cache.LRUCache;
|
8
|
+
import com.jayway.jsonpath.spi.cache.NOOPCache;
|
4
9
|
import org.embulk.config.Config;
|
5
10
|
import org.embulk.config.ConfigDefault;
|
6
11
|
import org.embulk.config.ConfigException;
|
@@ -19,6 +24,8 @@ import org.slf4j.Logger;
|
|
19
24
|
|
20
25
|
import java.util.ArrayList;
|
21
26
|
import java.util.List;
|
27
|
+
import java.util.Locale;
|
28
|
+
import java.util.Optional;
|
22
29
|
|
23
30
|
public class ExpandJsonFilterPlugin
|
24
31
|
implements FilterPlugin
|
@@ -29,14 +36,14 @@ public class ExpandJsonFilterPlugin
|
|
29
36
|
extends Task, TimestampParser.Task
|
30
37
|
{
|
31
38
|
@Config("json_column_name")
|
32
|
-
|
39
|
+
String getJsonColumnName();
|
33
40
|
|
34
41
|
@Config("root")
|
35
42
|
@ConfigDefault("\"$.\"")
|
36
|
-
|
43
|
+
String getRoot();
|
37
44
|
|
38
45
|
@Config("expanded_columns")
|
39
|
-
|
46
|
+
List<ColumnConfig> getExpandedColumns();
|
40
47
|
|
41
48
|
// default_timezone option from TimestampParser.Task
|
42
49
|
|
@@ -46,7 +53,11 @@ public class ExpandJsonFilterPlugin
|
|
46
53
|
|
47
54
|
@Config("keep_expanding_json_column")
|
48
55
|
@ConfigDefault("false")
|
49
|
-
|
56
|
+
boolean getKeepExpandingJsonColumn();
|
57
|
+
|
58
|
+
@Config("cache_provider")
|
59
|
+
@ConfigDefault("null")
|
60
|
+
Optional<String> getCacheProviderName();
|
50
61
|
}
|
51
62
|
|
52
63
|
@Override
|
@@ -60,6 +71,9 @@ public class ExpandJsonFilterPlugin
|
|
60
71
|
|
61
72
|
PluginTask task = config.loadConfig(PluginTask.class);
|
62
73
|
|
74
|
+
// set cache provider
|
75
|
+
task.getCacheProviderName().ifPresent(this::setCacheProvider);
|
76
|
+
|
63
77
|
// check if a column specified as json_column_name option exists or not
|
64
78
|
Column jsonColumn = inputSchema.lookupColumn(task.getJsonColumnName());
|
65
79
|
if (jsonColumn.getType() != Types.STRING && jsonColumn.getType() != Types.JSON) {
|
@@ -79,6 +93,8 @@ public class ExpandJsonFilterPlugin
|
|
79
93
|
final Schema outputSchema, final PageOutput output)
|
80
94
|
{
|
81
95
|
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
96
|
+
// set cache provider for mapreduce executor.
|
97
|
+
task.getCacheProviderName().ifPresent(this::setCacheProviderOrIgnore);
|
82
98
|
return new FilteredPageOutput(task, inputSchema, outputSchema, output);
|
83
99
|
}
|
84
100
|
|
@@ -152,4 +168,39 @@ public class ExpandJsonFilterPlugin
|
|
152
168
|
columnList.add(columnName);
|
153
169
|
}
|
154
170
|
}
|
171
|
+
|
172
|
+
private void setCacheProvider(String cacheProviderName)
|
173
|
+
{
|
174
|
+
String upperCacheProviderName = cacheProviderName.toUpperCase(Locale.ENGLISH);
|
175
|
+
switch (upperCacheProviderName)
|
176
|
+
{
|
177
|
+
case "LRU":
|
178
|
+
CacheProvider.setCache(new LRUCache(400));
|
179
|
+
break;
|
180
|
+
|
181
|
+
case "NOOP":
|
182
|
+
CacheProvider.setCache(new NOOPCache());
|
183
|
+
break;
|
184
|
+
|
185
|
+
default:
|
186
|
+
try {
|
187
|
+
Class<?> klass = Class.forName(cacheProviderName);
|
188
|
+
Cache cache = (Cache) klass.newInstance();
|
189
|
+
CacheProvider.setCache(cache);
|
190
|
+
}
|
191
|
+
catch (ClassNotFoundException | IllegalAccessException | InstantiationException | ClassCastException e) {
|
192
|
+
throw new ConfigException(String.format("Cache Provider '%s' is not supported: %s.", cacheProviderName, e.getMessage()), e);
|
193
|
+
}
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
private void setCacheProviderOrIgnore(String cacheProviderName)
|
198
|
+
{
|
199
|
+
try {
|
200
|
+
setCacheProvider(cacheProviderName);
|
201
|
+
}
|
202
|
+
catch (JsonPathException e) {
|
203
|
+
logger.debug("Cache:{} is already set.", CacheProvider.getCache().getClass());
|
204
|
+
}
|
205
|
+
}
|
155
206
|
}
|