embulk-filter-expand_json 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -2
- data/CHANGELOG.md +5 -0
- data/README.md +32 -1
- data/bench/.bundle/config +2 -0
- data/bench/.gitignore +2 -0
- data/bench/Gemfile +5 -0
- data/bench/Gemfile.lock +22 -0
- data/bench/config_raw.yml +14 -0
- data/bench/config_with_lru_cache.yml +27 -0
- data/bench/config_with_noop_cache.yml +28 -0
- data/bench/run.sh +57 -0
- data/build.gradle +18 -15
- data/settings.gradle +1 -0
- data/src/main/java/org/embulk/filter/expand_json/ExpandJsonFilterPlugin.java +55 -4
- data/src/main/java/org/embulk/filter/expand_json/FilteredPageOutput.java +0 -3
- data/src/test/java/org/embulk/filter/expand_json/MyNOOPCache.java +9 -0
- data/src/test/java/org/embulk/filter/expand_json/TestExpandJsonFilterPlugin.java +840 -2
- metadata +27 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9a6ff1c6a5264a85b4aa62b7a7719c20f5a987c1
|
4
|
+
data.tar.gz: f00862e0f575be8a4f7b1ed938f56370a32d9b0b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 787eb4e46c979414d5ee986ec7e1deb2cef85477ac48d0b85aa55762de4dd6ac133d1fa877490e432c393b86cf61d40dac4fd021613bf012ced550200c26d3dc
|
7
|
+
data.tar.gz: 3f3352f3eb7dfd14b506bda85e0c15ac038d5b1495d618eb623ca6f686fc75ec823a28e1d2d1dde6e5e5c8f4f5b0f86169df1cb006a8b796adf8bb90ada6a9b5
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
0.3.0 (2019-05-02)
|
2
|
+
==================
|
3
|
+
- [Enhancement] Introduce `cache_provider` option.
|
4
|
+
- https://github.com/civitaspo/embulk-filter-expand_json/pull/41
|
5
|
+
|
1
6
|
0.2.2 (2017-09-14)
|
2
7
|
==================
|
3
8
|
- [Enhancement] Use TimestampParser's constructor without JRuby ScriptingContainer
|
data/README.md
CHANGED
@@ -21,6 +21,8 @@ expand columns having json into multiple columns
|
|
21
21
|
- **keep_expanding_json_column**: Do not remove the expanding JSON column from the input schema if this is true (false by default)
|
22
22
|
- **default_timezone**: Time zone of timestamp columns if values don’t include time zone description (`UTC` by default)
|
23
23
|
- **stop_on_invalid_record**: Stop bulk load transaction if an invalid record is included (false by default)
|
24
|
+
- **cache_provider**: Cache provider name for JsonPath. `"LRU"` and `"NOOP"` are built-in. You can also specify the fully qualified class name of a user-defined implementation of JsonPath's `Cache` interface. (string, default: `"LRU"`)
|
25
|
+
- `"NOOP"` will become the default in a future release.
|
24
26
|
|
25
27
|
---
|
26
28
|
**type of the column**
|
@@ -53,6 +55,7 @@ filters:
|
|
53
55
|
- {name: "profile.like_words[0]", type: string}
|
54
56
|
```
|
55
57
|
|
58
|
+
|
56
59
|
## Note
|
57
60
|
- If the value evaluated by JsonPath is Array or Hash, the value is written as JSON.
|
58
61
|
|
@@ -61,13 +64,41 @@ filters:
|
|
61
64
|
- use to evaluate [JsonPath](http://goessner.net/articles/JsonPath/)
|
62
65
|
- [Apache License Version 2.0](https://github.com/jayway/JsonPath/blob/master/LICENSE)
|
63
66
|
|
67
|
+
## Development
|
64
68
|
|
65
|
-
|
69
|
+
### Run Example
|
70
|
+
|
71
|
+
```
|
72
|
+
./gradlew classpath
|
73
|
+
embulk run -Ilib ./example/config.yml
|
74
|
+
```
|
75
|
+
|
76
|
+
|
77
|
+
### Build
|
66
78
|
|
67
79
|
```
|
68
80
|
$ ./gradlew gem # -t to watch change of files and rebuild continuously
|
69
81
|
```
|
70
82
|
|
83
|
+
## Benchmark for `cache_provider` option
|
84
|
+
|
85
|
+
In some cases, `cache_provider: NOOP` makes this plugin about 3 times faster (https://github.com/civitaspo/embulk-filter-expand_json/pull/41/).
|
86
|
+
So we benchmarked the `cache_provider` option. In our case, `cache_provider: noop` made the plugin about 1.5 times faster.
|
87
|
+
|
88
|
+
|use `expand_json` filter|cache_provider|Time taken|records/s|
|
89
|
+
|:---|:---|:---|:---|
|
90
|
+
|`false`|none|7.62s|1,325,459/s|
|
91
|
+
|`true`|`"LRU"`|2m9s|78,025/s|
|
92
|
+
|`true`|`"NOOP"`|1m25s|118,476/s|
|
93
|
+
|
94
|
+
|
95
|
+
You can reproduce the benchmark as follows.
|
96
|
+
|
97
|
+
```
|
98
|
+
./gradlew classpath
|
99
|
+
./bench/run.sh
|
100
|
+
```
|
101
|
+
|
71
102
|
## Contributor
|
72
103
|
- @Civitaspo
|
73
104
|
- @muga
|
data/bench/.gitignore
ADDED
data/bench/Gemfile
ADDED
data/bench/Gemfile.lock
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
GEM
|
2
|
+
remote: https://rubygems.org/
|
3
|
+
specs:
|
4
|
+
embulk (0.9.16-java)
|
5
|
+
bundler (>= 1.10.6)
|
6
|
+
liquid (~> 4.0.0)
|
7
|
+
msgpack (~> 1.1.0)
|
8
|
+
embulk-filter-speedometer (0.3.4)
|
9
|
+
embulk-parser-none (0.2.0)
|
10
|
+
liquid (4.0.3)
|
11
|
+
msgpack (1.1.0-java)
|
12
|
+
|
13
|
+
PLATFORMS
|
14
|
+
java
|
15
|
+
|
16
|
+
DEPENDENCIES
|
17
|
+
embulk
|
18
|
+
embulk-filter-speedometer
|
19
|
+
embulk-parser-none
|
20
|
+
|
21
|
+
BUNDLED WITH
|
22
|
+
1.16.0
|
@@ -0,0 +1,27 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: data.jsonl
|
4
|
+
parser:
|
5
|
+
type: none
|
6
|
+
column_name: payload
|
7
|
+
|
8
|
+
filters:
|
9
|
+
- type: speedometer
|
10
|
+
log_interval_seconds: 1
|
11
|
+
- type: expand_json
|
12
|
+
json_column_name: payload
|
13
|
+
root: "$."
|
14
|
+
expanded_columns:
|
15
|
+
- {name: "phone_numbers", type: string}
|
16
|
+
- {name: "app_id", type: long}
|
17
|
+
- {name: "point", type: double}
|
18
|
+
- {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
|
19
|
+
- {name: "profile.anniversary.et", type: string}
|
20
|
+
- {name: "profile.anniversary", type: string}
|
21
|
+
- {name: "profile.like_words[1]", type: string}
|
22
|
+
- {name: "profile.like_words[2]", type: string}
|
23
|
+
- {name: "profile.like_words", type: string}
|
24
|
+
|
25
|
+
out:
|
26
|
+
type: "null"
|
27
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: data.jsonl
|
4
|
+
parser:
|
5
|
+
type: none
|
6
|
+
column_name: payload
|
7
|
+
|
8
|
+
filters:
|
9
|
+
- type: speedometer
|
10
|
+
log_interval_seconds: 1
|
11
|
+
- type: expand_json
|
12
|
+
json_column_name: payload
|
13
|
+
cache_provider: noop
|
14
|
+
root: "$."
|
15
|
+
expanded_columns:
|
16
|
+
- {name: "phone_numbers", type: string}
|
17
|
+
- {name: "app_id", type: long}
|
18
|
+
- {name: "point", type: double}
|
19
|
+
- {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
|
20
|
+
- {name: "profile.anniversary.et", type: string}
|
21
|
+
- {name: "profile.anniversary", type: string}
|
22
|
+
- {name: "profile.like_words[1]", type: string}
|
23
|
+
- {name: "profile.like_words[2]", type: string}
|
24
|
+
- {name: "profile.like_words", type: string}
|
25
|
+
|
26
|
+
out:
|
27
|
+
type: "null"
|
28
|
+
|
data/bench/run.sh
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
BENCH_ROOT=$(cd $(dirname $0); pwd)
|
4
|
+
DATA_FILE=data.jsonl
|
5
|
+
TMP_DATA_FILE=tmp.jsonl
|
6
|
+
|
7
|
+
function now() {
|
8
|
+
date +"%FT%T%:z"
|
9
|
+
}
|
10
|
+
|
11
|
+
echo "[$(now)] Preparing ..."
|
12
|
+
(
|
13
|
+
cd $BENCH_ROOT
|
14
|
+
embulk bundle
|
15
|
+
|
16
|
+
if [ -f $DATA_FILE ]; then
|
17
|
+
rm -f $DATA_FILE
|
18
|
+
fi
|
19
|
+
if [ -f $TMP_DATA_FILE ]; then
|
20
|
+
rm -f $TMP_DATA_FILE
|
21
|
+
fi
|
22
|
+
for n in {1..100}; do
|
23
|
+
cat ../example/data.tsv | cut -f5 >> $TMP_DATA_FILE
|
24
|
+
done
|
25
|
+
for n in {1..1000}; do
|
26
|
+
cat $TMP_DATA_FILE >> $DATA_FILE
|
27
|
+
done
|
28
|
+
)
|
29
|
+
|
30
|
+
echo "[$(now)] Run No expand_json"
|
31
|
+
(
|
32
|
+
cd $BENCH_ROOT
|
33
|
+
time embulk run -I ../lib -b . config_raw.yml
|
34
|
+
)
|
35
|
+
|
36
|
+
echo "[$(now)] Run Default (LRUCache)"
|
37
|
+
(
|
38
|
+
cd $BENCH_ROOT
|
39
|
+
time embulk run -I ../lib -b . config_with_lru_cache.yml
|
40
|
+
)
|
41
|
+
|
42
|
+
echo "[$(now)] Run with NOOPCache"
|
43
|
+
(
|
44
|
+
cd $BENCH_ROOT
|
45
|
+
time embulk run -I ../lib -b . config_with_noop_cache.yml
|
46
|
+
)
|
47
|
+
|
48
|
+
echo "[$(now)] Teardown..."
|
49
|
+
(
|
50
|
+
cd $BENCH_ROOT
|
51
|
+
if [ -f $DATA_FILE ]; then
|
52
|
+
rm -f $DATA_FILE
|
53
|
+
fi
|
54
|
+
if [ -f $TMP_DATA_FILE ]; then
|
55
|
+
rm -f $TMP_DATA_FILE
|
56
|
+
fi
|
57
|
+
)
|
data/build.gradle
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
plugins {
|
2
2
|
id "com.jfrog.bintray" version "1.1"
|
3
|
-
id "com.github.jruby-gradle.base" version "
|
3
|
+
id "com.github.jruby-gradle.base" version "1.5.0"
|
4
4
|
id "com.github.kt3k.coveralls" version "2.4.0"
|
5
5
|
id "jacoco"
|
6
6
|
id "java"
|
@@ -15,16 +15,16 @@ configurations {
|
|
15
15
|
provided
|
16
16
|
}
|
17
17
|
|
18
|
-
version = "0.
|
19
|
-
sourceCompatibility = 1.
|
20
|
-
targetCompatibility = 1.
|
18
|
+
version = "0.3.0"
|
19
|
+
sourceCompatibility = 1.8
|
20
|
+
targetCompatibility = 1.8
|
21
21
|
|
22
22
|
dependencies {
|
23
|
-
compile "org.embulk:embulk-core:0.
|
24
|
-
provided "org.embulk:embulk-core:0.
|
23
|
+
compile "org.embulk:embulk-core:0.9.+"
|
24
|
+
provided "org.embulk:embulk-core:0.9.+"
|
25
25
|
compile "com.jayway.jsonpath:json-path:2.+"
|
26
26
|
testCompile "junit:junit:4.+"
|
27
|
-
testCompile "org.embulk:embulk-core:0.
|
27
|
+
testCompile "org.embulk:embulk-core:0.9.+:tests"
|
28
28
|
}
|
29
29
|
|
30
30
|
jacocoTestReport {
|
@@ -59,19 +59,23 @@ task checkstyle(type: Checkstyle) {
|
|
59
59
|
}
|
60
60
|
|
61
61
|
task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
|
62
|
-
jrubyArgs "-
|
63
|
-
script "
|
62
|
+
jrubyArgs "-S"
|
63
|
+
script "gem"
|
64
|
+
scriptArgs "build", "${project.name}.gemspec"
|
64
65
|
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
65
66
|
}
|
66
67
|
|
67
68
|
task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
|
68
|
-
jrubyArgs "-
|
69
|
-
script "
|
69
|
+
jrubyArgs "-S"
|
70
|
+
script "gem"
|
71
|
+
scriptArgs "push", "pkg/${project.name}-${project.version}.gem"
|
70
72
|
}
|
71
73
|
|
72
|
-
task "package"(dependsOn: ["gemspec", "classpath"])
|
73
|
-
|
74
|
-
|
74
|
+
task "package"(dependsOn: ["gemspec", "classpath"]) {
|
75
|
+
doLast {
|
76
|
+
println "> Build succeeded."
|
77
|
+
println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
|
78
|
+
}
|
75
79
|
}
|
76
80
|
|
77
81
|
task gemspec {
|
@@ -93,7 +97,6 @@ Gem::Specification.new do |spec|
|
|
93
97
|
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
94
98
|
spec.require_paths = ["lib"]
|
95
99
|
|
96
|
-
#spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
|
97
100
|
spec.add_development_dependency 'bundler', ['~> 1.0']
|
98
101
|
spec.add_development_dependency 'rake', ['>= 10.0']
|
99
102
|
end
|
data/settings.gradle
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rootProject.name = "embulk-filter-expand_json"
|
@@ -1,6 +1,11 @@
|
|
1
1
|
package org.embulk.filter.expand_json;
|
2
2
|
|
3
3
|
import com.google.common.collect.ImmutableList;
|
4
|
+
import com.jayway.jsonpath.JsonPathException;
|
5
|
+
import com.jayway.jsonpath.spi.cache.Cache;
|
6
|
+
import com.jayway.jsonpath.spi.cache.CacheProvider;
|
7
|
+
import com.jayway.jsonpath.spi.cache.LRUCache;
|
8
|
+
import com.jayway.jsonpath.spi.cache.NOOPCache;
|
4
9
|
import org.embulk.config.Config;
|
5
10
|
import org.embulk.config.ConfigDefault;
|
6
11
|
import org.embulk.config.ConfigException;
|
@@ -19,6 +24,8 @@ import org.slf4j.Logger;
|
|
19
24
|
|
20
25
|
import java.util.ArrayList;
|
21
26
|
import java.util.List;
|
27
|
+
import java.util.Locale;
|
28
|
+
import java.util.Optional;
|
22
29
|
|
23
30
|
public class ExpandJsonFilterPlugin
|
24
31
|
implements FilterPlugin
|
@@ -29,14 +36,14 @@ public class ExpandJsonFilterPlugin
|
|
29
36
|
extends Task, TimestampParser.Task
|
30
37
|
{
|
31
38
|
@Config("json_column_name")
|
32
|
-
|
39
|
+
String getJsonColumnName();
|
33
40
|
|
34
41
|
@Config("root")
|
35
42
|
@ConfigDefault("\"$.\"")
|
36
|
-
|
43
|
+
String getRoot();
|
37
44
|
|
38
45
|
@Config("expanded_columns")
|
39
|
-
|
46
|
+
List<ColumnConfig> getExpandedColumns();
|
40
47
|
|
41
48
|
// default_timezone option from TimestampParser.Task
|
42
49
|
|
@@ -46,7 +53,11 @@ public class ExpandJsonFilterPlugin
|
|
46
53
|
|
47
54
|
@Config("keep_expanding_json_column")
|
48
55
|
@ConfigDefault("false")
|
49
|
-
|
56
|
+
boolean getKeepExpandingJsonColumn();
|
57
|
+
|
58
|
+
@Config("cache_provider")
|
59
|
+
@ConfigDefault("null")
|
60
|
+
Optional<String> getCacheProviderName();
|
50
61
|
}
|
51
62
|
|
52
63
|
@Override
|
@@ -60,6 +71,9 @@ public class ExpandJsonFilterPlugin
|
|
60
71
|
|
61
72
|
PluginTask task = config.loadConfig(PluginTask.class);
|
62
73
|
|
74
|
+
// set cache provider
|
75
|
+
task.getCacheProviderName().ifPresent(this::setCacheProvider);
|
76
|
+
|
63
77
|
// check if a column specified as json_column_name option exists or not
|
64
78
|
Column jsonColumn = inputSchema.lookupColumn(task.getJsonColumnName());
|
65
79
|
if (jsonColumn.getType() != Types.STRING && jsonColumn.getType() != Types.JSON) {
|
@@ -79,6 +93,8 @@ public class ExpandJsonFilterPlugin
|
|
79
93
|
final Schema outputSchema, final PageOutput output)
|
80
94
|
{
|
81
95
|
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
96
|
+
// set cache provider for mapreduce executor.
|
97
|
+
task.getCacheProviderName().ifPresent(this::setCacheProviderOrIgnore);
|
82
98
|
return new FilteredPageOutput(task, inputSchema, outputSchema, output);
|
83
99
|
}
|
84
100
|
|
@@ -152,4 +168,39 @@ public class ExpandJsonFilterPlugin
|
|
152
168
|
columnList.add(columnName);
|
153
169
|
}
|
154
170
|
}
|
171
|
+
|
172
|
+
private void setCacheProvider(String cacheProviderName)
|
173
|
+
{
|
174
|
+
String upperCacheProviderName = cacheProviderName.toUpperCase(Locale.ENGLISH);
|
175
|
+
switch (upperCacheProviderName)
|
176
|
+
{
|
177
|
+
case "LRU":
|
178
|
+
CacheProvider.setCache(new LRUCache(400));
|
179
|
+
break;
|
180
|
+
|
181
|
+
case "NOOP":
|
182
|
+
CacheProvider.setCache(new NOOPCache());
|
183
|
+
break;
|
184
|
+
|
185
|
+
default:
|
186
|
+
try {
|
187
|
+
Class<?> klass = Class.forName(cacheProviderName);
|
188
|
+
Cache cache = (Cache) klass.newInstance();
|
189
|
+
CacheProvider.setCache(cache);
|
190
|
+
}
|
191
|
+
catch (ClassNotFoundException | IllegalAccessException | InstantiationException | ClassCastException e) {
|
192
|
+
throw new ConfigException(String.format("Cache Provider '%s' is not supported: %s.", cacheProviderName, e.getMessage()), e);
|
193
|
+
}
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
private void setCacheProviderOrIgnore(String cacheProviderName)
|
198
|
+
{
|
199
|
+
try {
|
200
|
+
setCacheProvider(cacheProviderName);
|
201
|
+
}
|
202
|
+
catch (JsonPathException e) {
|
203
|
+
logger.debug("Cache:{} is already set.", CacheProvider.getCache().getClass());
|
204
|
+
}
|
205
|
+
}
|
155
206
|
}
|