embulk-filter-expand_json 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c82efb599e6e9e1cd6f02219354383e4d7d1b966
4
- data.tar.gz: 1111d92e7fcf2e9df01ac7b0e7e2a27439d71421
3
+ metadata.gz: 9a6ff1c6a5264a85b4aa62b7a7719c20f5a987c1
4
+ data.tar.gz: f00862e0f575be8a4f7b1ed938f56370a32d9b0b
5
5
  SHA512:
6
- metadata.gz: 0e274e928beefad9aacd00b8ef11cbfb5b3772018841d4f7775afd0f03fcb70124ebbea0e5317b401d7dc77b493ab1d56dc445430ebb68a12d7c7798a8fdb2b2
7
- data.tar.gz: afebf67519d74a5a86c0e9ad217b2afe66bf7011c6faca6c1f6dc2291ee07afd5f5acb089b168632e2a14b58fc399ceb9ad0dd555923b50e79f9e45acc8e49c3
6
+ metadata.gz: 787eb4e46c979414d5ee986ec7e1deb2cef85477ac48d0b85aa55762de4dd6ac133d1fa877490e432c393b86cf61d40dac4fd021613bf012ced550200c26d3dc
7
+ data.tar.gz: 3f3352f3eb7dfd14b506bda85e0c15ac038d5b1495d618eb623ca6f686fc75ec823a28e1d2d1dde6e5e5c8f4f5b0f86169df1cb006a8b796adf8bb90ada6a9b5
@@ -1,8 +1,6 @@
1
1
  dist: precise
2
2
  language: java
3
3
  jdk:
4
- - openjdk7
5
- - oraclejdk7
6
4
  - oraclejdk8
7
5
  script:
8
6
  - ./gradlew test
@@ -1,3 +1,8 @@
1
+ 0.3.0 (2019-05-02)
2
+ ==================
3
+ - [Enhancement] Introduce `cache_provider` option.
4
+ - https://github.com/civitaspo/embulk-filter-expand_json/pull/41
5
+
1
6
  0.2.2 (2017-09-14)
2
7
  ==================
3
8
  - [Enhancement] Use TimestampParser's constructor without JRuby ScriptingContainer
data/README.md CHANGED
@@ -21,6 +21,8 @@ expand columns having json into multiple columns
21
21
  - **keep_expanding_json_column**: Do not remove the expanded JSON column from the input schema if it's true (false by default)
22
22
  - **default_timezone**: Time zone of timestamp columns if values don’t include time zone description (`UTC` by default)
23
23
  - **stop_on_invalid_record**: Stop bulk load transaction if an invalid record is included (false by default)
24
+ - **cache_provider**: Cache provider name for JsonPath. `"LRU"` and `"NOOP"` are built-in. You can also specify the class name of a user-defined cache implementation. (string, default: `"LRU"`)
25
+ - `"NOOP"` will become the default in the future.
24
26
 
25
27
  ---
26
28
  **type of the column**
@@ -53,6 +55,7 @@ filters:
53
55
  - {name: "profile.like_words[0]", type: string}
54
56
  ```
55
57
 
58
+
56
59
  ## Note
57
60
  - If the value evaluated by JsonPath is Array or Hash, the value is written as JSON.
58
61
 
@@ -61,13 +64,41 @@ filters:
61
64
  - use to evaluate [JsonPath](http://goessner.net/articles/JsonPath/)
62
65
  - [Apache License Version 2.0](https://github.com/jayway/JsonPath/blob/master/LICENSE)
63
66
 
67
+ ## Development
64
68
 
65
- ## Build
69
+ ### Run Example
70
+
71
+ ```
72
+ ./gradlew classpath
73
+ embulk run -Ilib ./example/config.yml
74
+ ```
75
+
76
+
77
+ ### Build
66
78
 
67
79
  ```
68
80
  $ ./gradlew gem # -t to watch change of files and rebuild continuously
69
81
  ```
70
82
 
83
+ ## Benchmark for `cache_provider` option
84
+
85
+ In some cases, `cache_provider: NOOP` improves the performance of this plugin by 3 times (https://github.com/civitaspo/embulk-filter-expand_json/pull/41/).
86
+ So we ran a benchmark of the `cache_provider` option. In our case, `cache_provider: noop` improves the performance by 1.5 times.
87
+
88
+ |use `expand_json` filter|cache_provider|Time taken|records/s|
89
+ |:---|:---|:---|:---|
90
+ |`false`|none|7.62s|1,325,459/s|
91
+ |`true`|`"LRU"`|2m9s|78,025/s|
92
+ |`true`|`"NOOP"`|1m25s|118,476/s|
93
+
94
+
95
+ You can reproduce the benchmark as follows.
96
+
97
+ ```
98
+ ./gradlew classpath
99
+ ./bench/run.sh
100
+ ```
101
+
71
102
  ## Contributor
72
103
  - @Civitaspo
73
104
  - @muga
@@ -0,0 +1,2 @@
1
+ ---
2
+ BUNDLE_PATH: "vendor/bundle"
@@ -0,0 +1,2 @@
1
+ /vendor/
2
+ *.jsonl
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'embulk'
4
+ gem 'embulk-parser-none'
5
+ gem 'embulk-filter-speedometer'
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ embulk (0.9.16-java)
5
+ bundler (>= 1.10.6)
6
+ liquid (~> 4.0.0)
7
+ msgpack (~> 1.1.0)
8
+ embulk-filter-speedometer (0.3.4)
9
+ embulk-parser-none (0.2.0)
10
+ liquid (4.0.3)
11
+ msgpack (1.1.0-java)
12
+
13
+ PLATFORMS
14
+ java
15
+
16
+ DEPENDENCIES
17
+ embulk
18
+ embulk-filter-speedometer
19
+ embulk-parser-none
20
+
21
+ BUNDLED WITH
22
+ 1.16.0
@@ -0,0 +1,14 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data.jsonl
4
+ parser:
5
+ type: none
6
+ column_name: payload
7
+
8
+ filters:
9
+ - type: speedometer
10
+ log_interval_seconds: 1
11
+
12
+ out:
13
+ type: "null"
14
+
@@ -0,0 +1,27 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data.jsonl
4
+ parser:
5
+ type: none
6
+ column_name: payload
7
+
8
+ filters:
9
+ - type: speedometer
10
+ log_interval_seconds: 1
11
+ - type: expand_json
12
+ json_column_name: payload
13
+ root: "$."
14
+ expanded_columns:
15
+ - {name: "phone_numbers", type: string}
16
+ - {name: "app_id", type: long}
17
+ - {name: "point", type: double}
18
+ - {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
19
+ - {name: "profile.anniversary.et", type: string}
20
+ - {name: "profile.anniversary", type: string}
21
+ - {name: "profile.like_words[1]", type: string}
22
+ - {name: "profile.like_words[2]", type: string}
23
+ - {name: "profile.like_words", type: string}
24
+
25
+ out:
26
+ type: "null"
27
+
@@ -0,0 +1,28 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data.jsonl
4
+ parser:
5
+ type: none
6
+ column_name: payload
7
+
8
+ filters:
9
+ - type: speedometer
10
+ log_interval_seconds: 1
11
+ - type: expand_json
12
+ json_column_name: payload
13
+ cache_provider: noop
14
+ root: "$."
15
+ expanded_columns:
16
+ - {name: "phone_numbers", type: string}
17
+ - {name: "app_id", type: long}
18
+ - {name: "point", type: double}
19
+ - {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
20
+ - {name: "profile.anniversary.et", type: string}
21
+ - {name: "profile.anniversary", type: string}
22
+ - {name: "profile.like_words[1]", type: string}
23
+ - {name: "profile.like_words[2]", type: string}
24
+ - {name: "profile.like_words", type: string}
25
+
26
+ out:
27
+ type: "null"
28
+
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env bash
2
+
3
+ BENCH_ROOT=$(cd $(dirname $0); pwd)
4
+ DATA_FILE=data.jsonl
5
+ TMP_DATA_FILE=tmp.jsonl
6
+
7
+ function now() {
8
+ date +"%FT%T%:z"
9
+ }
10
+
11
+ echo "[$(now)] Preparing ..."
12
+ (
13
+ cd $BENCH_ROOT
14
+ embulk bundle
15
+
16
+ if [ -f $DATA_FILE ]; then
17
+ rm -f $DATA_FILE
18
+ fi
19
+ if [ -f $TMP_DATA_FILE ]; then
20
+ rm -f $TMP_DATA_FILE
21
+ fi
22
+ for n in {1..100}; do
23
+ cat ../example/data.tsv | cut -f5 >> $TMP_DATA_FILE
24
+ done
25
+ for n in {1..1000}; do
26
+ cat $TMP_DATA_FILE >> $DATA_FILE
27
+ done
28
+ )
29
+
30
+ echo "[$(now)] Run No expand_json"
31
+ (
32
+ cd $BENCH_ROOT
33
+ time embulk run -I ../lib -b . config_raw.yml
34
+ )
35
+
36
+ echo "[$(now)] Run Default (LRUCache)"
37
+ (
38
+ cd $BENCH_ROOT
39
+ time embulk run -I ../lib -b . config_with_lru_cache.yml
40
+ )
41
+
42
+ echo "[$(now)] Run with NOOPCache"
43
+ (
44
+ cd $BENCH_ROOT
45
+ time embulk run -I ../lib -b . config_with_noop_cache.yml
46
+ )
47
+
48
+ echo "[$(now)] Teardown..."
49
+ (
50
+ cd $BENCH_ROOT
51
+ if [ -f $DATA_FILE ]; then
52
+ rm -f $DATA_FILE
53
+ fi
54
+ if [ -f $TMP_DATA_FILE ]; then
55
+ rm -f $TMP_DATA_FILE
56
+ fi
57
+ )
@@ -1,6 +1,6 @@
1
1
  plugins {
2
2
  id "com.jfrog.bintray" version "1.1"
3
- id "com.github.jruby-gradle.base" version "0.1.5"
3
+ id "com.github.jruby-gradle.base" version "1.5.0"
4
4
  id "com.github.kt3k.coveralls" version "2.4.0"
5
5
  id "jacoco"
6
6
  id "java"
@@ -15,16 +15,16 @@ configurations {
15
15
  provided
16
16
  }
17
17
 
18
- version = "0.2.2"
19
- sourceCompatibility = 1.7
20
- targetCompatibility = 1.7
18
+ version = "0.3.0"
19
+ sourceCompatibility = 1.8
20
+ targetCompatibility = 1.8
21
21
 
22
22
  dependencies {
23
- compile "org.embulk:embulk-core:0.8.32"
24
- provided "org.embulk:embulk-core:0.8.32"
23
+ compile "org.embulk:embulk-core:0.9.+"
24
+ provided "org.embulk:embulk-core:0.9.+"
25
25
  compile "com.jayway.jsonpath:json-path:2.+"
26
26
  testCompile "junit:junit:4.+"
27
- testCompile "org.embulk:embulk-core:0.8.32:tests"
27
+ testCompile "org.embulk:embulk-core:0.9.+:tests"
28
28
  }
29
29
 
30
30
  jacocoTestReport {
@@ -59,19 +59,23 @@ task checkstyle(type: Checkstyle) {
59
59
  }
60
60
 
61
61
  task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
62
- jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
63
- script "${project.name}.gemspec"
62
+ jrubyArgs "-S"
63
+ script "gem"
64
+ scriptArgs "build", "${project.name}.gemspec"
64
65
  doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
65
66
  }
66
67
 
67
68
  task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
68
- jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
69
- script "pkg/${project.name}-${project.version}.gem"
69
+ jrubyArgs "-S"
70
+ script "gem"
71
+ scriptArgs "push", "pkg/${project.name}-${project.version}.gem"
70
72
  }
71
73
 
72
- task "package"(dependsOn: ["gemspec", "classpath"]) << {
73
- println "> Build succeeded."
74
- println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
74
+ task "package"(dependsOn: ["gemspec", "classpath"]) {
75
+ doLast {
76
+ println "> Build succeeded."
77
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
78
+ }
75
79
  }
76
80
 
77
81
  task gemspec {
@@ -93,7 +97,6 @@ Gem::Specification.new do |spec|
93
97
  spec.test_files = spec.files.grep(%r"^(test|spec)/")
94
98
  spec.require_paths = ["lib"]
95
99
 
96
- #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
97
100
  spec.add_development_dependency 'bundler', ['~> 1.0']
98
101
  spec.add_development_dependency 'rake', ['>= 10.0']
99
102
  end
@@ -0,0 +1 @@
1
+ rootProject.name = "embulk-filter-expand_json"
@@ -1,6 +1,11 @@
1
1
  package org.embulk.filter.expand_json;
2
2
 
3
3
  import com.google.common.collect.ImmutableList;
4
+ import com.jayway.jsonpath.JsonPathException;
5
+ import com.jayway.jsonpath.spi.cache.Cache;
6
+ import com.jayway.jsonpath.spi.cache.CacheProvider;
7
+ import com.jayway.jsonpath.spi.cache.LRUCache;
8
+ import com.jayway.jsonpath.spi.cache.NOOPCache;
4
9
  import org.embulk.config.Config;
5
10
  import org.embulk.config.ConfigDefault;
6
11
  import org.embulk.config.ConfigException;
@@ -19,6 +24,8 @@ import org.slf4j.Logger;
19
24
 
20
25
  import java.util.ArrayList;
21
26
  import java.util.List;
27
+ import java.util.Locale;
28
+ import java.util.Optional;
22
29
 
23
30
  public class ExpandJsonFilterPlugin
24
31
  implements FilterPlugin
@@ -29,14 +36,14 @@ public class ExpandJsonFilterPlugin
29
36
  extends Task, TimestampParser.Task
30
37
  {
31
38
  @Config("json_column_name")
32
- public String getJsonColumnName();
39
+ String getJsonColumnName();
33
40
 
34
41
  @Config("root")
35
42
  @ConfigDefault("\"$.\"")
36
- public String getRoot();
43
+ String getRoot();
37
44
 
38
45
  @Config("expanded_columns")
39
- public List<ColumnConfig> getExpandedColumns();
46
+ List<ColumnConfig> getExpandedColumns();
40
47
 
41
48
  // default_timezone option from TimestampParser.Task
42
49
 
@@ -46,7 +53,11 @@ public class ExpandJsonFilterPlugin
46
53
 
47
54
  @Config("keep_expanding_json_column")
48
55
  @ConfigDefault("false")
49
- public boolean getKeepExpandingJsonColumn();
56
+ boolean getKeepExpandingJsonColumn();
57
+
58
+ @Config("cache_provider")
59
+ @ConfigDefault("null")
60
+ Optional<String> getCacheProviderName();
50
61
  }
51
62
 
52
63
  @Override
@@ -60,6 +71,9 @@ public class ExpandJsonFilterPlugin
60
71
 
61
72
  PluginTask task = config.loadConfig(PluginTask.class);
62
73
 
74
+ // set cache provider
75
+ task.getCacheProviderName().ifPresent(this::setCacheProvider);
76
+
63
77
  // check if a column specified as json_column_name option exists or not
64
78
  Column jsonColumn = inputSchema.lookupColumn(task.getJsonColumnName());
65
79
  if (jsonColumn.getType() != Types.STRING && jsonColumn.getType() != Types.JSON) {
@@ -79,6 +93,8 @@ public class ExpandJsonFilterPlugin
79
93
  final Schema outputSchema, final PageOutput output)
80
94
  {
81
95
  final PluginTask task = taskSource.loadTask(PluginTask.class);
96
+ // set cache provider for mapreduce executor.
97
+ task.getCacheProviderName().ifPresent(this::setCacheProviderOrIgnore);
82
98
  return new FilteredPageOutput(task, inputSchema, outputSchema, output);
83
99
  }
84
100
 
@@ -152,4 +168,39 @@ public class ExpandJsonFilterPlugin
152
168
  columnList.add(columnName);
153
169
  }
154
170
  }
171
+
172
+ private void setCacheProvider(String cacheProviderName)
173
+ {
174
+ String upperCacheProviderName = cacheProviderName.toUpperCase(Locale.ENGLISH);
175
+ switch (upperCacheProviderName)
176
+ {
177
+ case "LRU":
178
+ CacheProvider.setCache(new LRUCache(400));
179
+ break;
180
+
181
+ case "NOOP":
182
+ CacheProvider.setCache(new NOOPCache());
183
+ break;
184
+
185
+ default:
186
+ try {
187
+ Class<?> klass = Class.forName(cacheProviderName);
188
+ Cache cache = (Cache) klass.newInstance();
189
+ CacheProvider.setCache(cache);
190
+ }
191
+ catch (ClassNotFoundException | IllegalAccessException | InstantiationException | ClassCastException e) {
192
+ throw new ConfigException(String.format("Cache Provider '%s' is not supported: %s.", cacheProviderName, e.getMessage()), e);
193
+ }
194
+ }
195
+ }
196
+
197
+ private void setCacheProviderOrIgnore(String cacheProviderName)
198
+ {
199
+ try {
200
+ setCacheProvider(cacheProviderName);
201
+ }
202
+ catch (JsonPathException e) {
203
+ logger.debug("Cache:{} is already set.", CacheProvider.getCache().getClass());
204
+ }
205
+ }
155
206
  }
@@ -32,9 +32,6 @@ import java.util.Map;
32
32
 
33
33
  import static org.embulk.filter.expand_json.ExpandJsonFilterPlugin.PluginTask;
34
34
 
35
- /**
36
- * Created by takahiro.nakayama on 10/19/15.
37
- */
38
35
  public class FilteredPageOutput
39
36
  implements PageOutput
40
37
  {