embulk-filter-expand_json 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c82efb599e6e9e1cd6f02219354383e4d7d1b966
4
- data.tar.gz: 1111d92e7fcf2e9df01ac7b0e7e2a27439d71421
3
+ metadata.gz: 9a6ff1c6a5264a85b4aa62b7a7719c20f5a987c1
4
+ data.tar.gz: f00862e0f575be8a4f7b1ed938f56370a32d9b0b
5
5
  SHA512:
6
- metadata.gz: 0e274e928beefad9aacd00b8ef11cbfb5b3772018841d4f7775afd0f03fcb70124ebbea0e5317b401d7dc77b493ab1d56dc445430ebb68a12d7c7798a8fdb2b2
7
- data.tar.gz: afebf67519d74a5a86c0e9ad217b2afe66bf7011c6faca6c1f6dc2291ee07afd5f5acb089b168632e2a14b58fc399ceb9ad0dd555923b50e79f9e45acc8e49c3
6
+ metadata.gz: 787eb4e46c979414d5ee986ec7e1deb2cef85477ac48d0b85aa55762de4dd6ac133d1fa877490e432c393b86cf61d40dac4fd021613bf012ced550200c26d3dc
7
+ data.tar.gz: 3f3352f3eb7dfd14b506bda85e0c15ac038d5b1495d618eb623ca6f686fc75ec823a28e1d2d1dde6e5e5c8f4f5b0f86169df1cb006a8b796adf8bb90ada6a9b5
@@ -1,8 +1,6 @@
1
1
  dist: precise
2
2
  language: java
3
3
  jdk:
4
- - openjdk7
5
- - oraclejdk7
6
4
  - oraclejdk8
7
5
  script:
8
6
  - ./gradlew test
@@ -1,3 +1,8 @@
1
+ 0.3.0 (2019-05-02)
2
+ ==================
3
+ - [Enhancement] Introduce `cache_provider` option.
4
+ - https://github.com/civitaspo/embulk-filter-expand_json/pull/41
5
+
1
6
  0.2.2 (2017-09-14)
2
7
  ==================
3
8
  - [Enhancement] Use TimestampParser's constructor without JRuby ScriptingContainer
data/README.md CHANGED
@@ -21,6 +21,8 @@ expand columns having json into multiple columns
21
21
  - **keep_expanding_json_column**: Not remove the expanding json column from input schema if it's true (false by default)
22
22
  - **default_timezone**: Time zone of timestamp columns if values don’t include time zone description (`UTC` by default)
23
23
  - **stop_on_invalid_record**: Stop bulk load transaction if an invalid record is included (false by default)
24
+ - **cache_provider**: Cache provider name for JsonPath. `"LRU"` and `"NOOP"` are built-in; you can also specify the fully qualified name of a user-defined class that implements JsonPath's `Cache` interface. (string, default: `"LRU"`)
25
+ - `"NOOP"` will become the default in a future release.
24
26
 
25
27
  ---
26
28
  **type of the column**
@@ -53,6 +55,7 @@ filters:
53
55
  - {name: "profile.like_words[0]", type: string}
54
56
  ```
55
57
 
58
+
56
59
  ## Note
57
60
  - If the value evaluated by JsonPath is Array or Hash, the value is written as JSON.
58
61
 
@@ -61,13 +64,41 @@ filters:
61
64
  - use to evaluate [JsonPath](http://goessner.net/articles/JsonPath/)
62
65
  - [Apache License Version 2.0](https://github.com/jayway/JsonPath/blob/master/LICENSE)
63
66
 
67
+ ## Development
64
68
 
65
- ## Build
69
+ ### Run Example
70
+
71
+ ```
72
+ ./gradlew classpath
73
+ embulk run -Ilib ./example/config.yml
74
+ ```
75
+
76
+
77
+ ### Build
66
78
 
67
79
  ```
68
80
  $ ./gradlew gem # -t to watch change of files and rebuild continuously
69
81
  ```
70
82
 
83
+ ## Benchmark for `cache_provider` option
84
+
85
+ In some cases, `cache_provider: NOOP` makes this plugin about 3 times faster (https://github.com/civitaspo/embulk-filter-expand_json/pull/41/).
86
+ We therefore benchmarked the `cache_provider` option. In our case, `cache_provider: noop` improved throughput by about 1.5 times.
87
+
88
+ |use `expand_json` filter|cache_provider|Time taken|records/s|
89
+ |:---|:---|:---|:---|
90
+ |`false`|none|7.62s|1,325,459/s|
91
+ |`true`|`"LRU"`|2m9s|78,025/s|
92
+ |`true`|`"NOOP"`|1m25s|118,476/s|
93
+
94
+
95
+ You can reproduce the benchmark as follows.
96
+
97
+ ```
98
+ ./gradlew classpath
99
+ ./bench/run.sh
100
+ ```
101
+
71
102
  ## Contributor
72
103
  - @Civitaspo
73
104
  - @muga
@@ -0,0 +1,2 @@
1
+ ---
2
+ BUNDLE_PATH: "vendor/bundle"
@@ -0,0 +1,2 @@
1
+ /vendor/
2
+ *.jsonl
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'embulk'
4
+ gem 'embulk-parser-none'
5
+ gem 'embulk-filter-speedometer'
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ embulk (0.9.16-java)
5
+ bundler (>= 1.10.6)
6
+ liquid (~> 4.0.0)
7
+ msgpack (~> 1.1.0)
8
+ embulk-filter-speedometer (0.3.4)
9
+ embulk-parser-none (0.2.0)
10
+ liquid (4.0.3)
11
+ msgpack (1.1.0-java)
12
+
13
+ PLATFORMS
14
+ java
15
+
16
+ DEPENDENCIES
17
+ embulk
18
+ embulk-filter-speedometer
19
+ embulk-parser-none
20
+
21
+ BUNDLED WITH
22
+ 1.16.0
@@ -0,0 +1,14 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data.jsonl
4
+ parser:
5
+ type: none
6
+ column_name: payload
7
+
8
+ filters:
9
+ - type: speedometer
10
+ log_interval_seconds: 1
11
+
12
+ out:
13
+ type: "null"
14
+
@@ -0,0 +1,27 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data.jsonl
4
+ parser:
5
+ type: none
6
+ column_name: payload
7
+
8
+ filters:
9
+ - type: speedometer
10
+ log_interval_seconds: 1
11
+ - type: expand_json
12
+ json_column_name: payload
13
+ root: "$."
14
+ expanded_columns:
15
+ - {name: "phone_numbers", type: string}
16
+ - {name: "app_id", type: long}
17
+ - {name: "point", type: double}
18
+ - {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
19
+ - {name: "profile.anniversary.et", type: string}
20
+ - {name: "profile.anniversary", type: string}
21
+ - {name: "profile.like_words[1]", type: string}
22
+ - {name: "profile.like_words[2]", type: string}
23
+ - {name: "profile.like_words", type: string}
24
+
25
+ out:
26
+ type: "null"
27
+
@@ -0,0 +1,28 @@
1
+ in:
2
+ type: file
3
+ path_prefix: data.jsonl
4
+ parser:
5
+ type: none
6
+ column_name: payload
7
+
8
+ filters:
9
+ - type: speedometer
10
+ log_interval_seconds: 1
11
+ - type: expand_json
12
+ json_column_name: payload
13
+ cache_provider: noop
14
+ root: "$."
15
+ expanded_columns:
16
+ - {name: "phone_numbers", type: string}
17
+ - {name: "app_id", type: long}
18
+ - {name: "point", type: double}
19
+ - {name: "created_at", type: timestamp, format: "%Y-%m-%d"}
20
+ - {name: "profile.anniversary.et", type: string}
21
+ - {name: "profile.anniversary", type: string}
22
+ - {name: "profile.like_words[1]", type: string}
23
+ - {name: "profile.like_words[2]", type: string}
24
+ - {name: "profile.like_words", type: string}
25
+
26
+ out:
27
+ type: "null"
28
+
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env bash
2
+
3
+ BENCH_ROOT=$(cd $(dirname $0); pwd)
4
+ DATA_FILE=data.jsonl
5
+ TMP_DATA_FILE=tmp.jsonl
6
+
7
+ function now() {
8
+ date +"%FT%T%:z"
9
+ }
10
+
11
+ echo "[$(now)] Preparing ..."
12
+ (
13
+ cd $BENCH_ROOT
14
+ embulk bundle
15
+
16
+ if [ -f $DATA_FILE ]; then
17
+ rm -f $DATA_FILE
18
+ fi
19
+ if [ -f $TMP_DATA_FILE ]; then
20
+ rm -f $TMP_DATA_FILE
21
+ fi
22
+ for n in {1..100}; do
23
+ cat ../example/data.tsv | cut -f5 >> $TMP_DATA_FILE
24
+ done
25
+ for n in {1..1000}; do
26
+ cat $TMP_DATA_FILE >> $DATA_FILE
27
+ done
28
+ )
29
+
30
+ echo "[$(now)] Run No expand_json"
31
+ (
32
+ cd $BENCH_ROOT
33
+ time embulk run -I ../lib -b . config_raw.yml
34
+ )
35
+
36
+ echo "[$(now)] Run Default (LRUCache)"
37
+ (
38
+ cd $BENCH_ROOT
39
+ time embulk run -I ../lib -b . config_with_lru_cache.yml
40
+ )
41
+
42
+ echo "[$(now)] Run with NOOPCache"
43
+ (
44
+ cd $BENCH_ROOT
45
+ time embulk run -I ../lib -b . config_with_noop_cache.yml
46
+ )
47
+
48
+ echo "[$(now)] Teardown..."
49
+ (
50
+ cd $BENCH_ROOT
51
+ if [ -f $DATA_FILE ]; then
52
+ rm -f $DATA_FILE
53
+ fi
54
+ if [ -f $TMP_DATA_FILE ]; then
55
+ rm -f $TMP_DATA_FILE
56
+ fi
57
+ )
@@ -1,6 +1,6 @@
1
1
  plugins {
2
2
  id "com.jfrog.bintray" version "1.1"
3
- id "com.github.jruby-gradle.base" version "0.1.5"
3
+ id "com.github.jruby-gradle.base" version "1.5.0"
4
4
  id "com.github.kt3k.coveralls" version "2.4.0"
5
5
  id "jacoco"
6
6
  id "java"
@@ -15,16 +15,16 @@ configurations {
15
15
  provided
16
16
  }
17
17
 
18
- version = "0.2.2"
19
- sourceCompatibility = 1.7
20
- targetCompatibility = 1.7
18
+ version = "0.3.0"
19
+ sourceCompatibility = 1.8
20
+ targetCompatibility = 1.8
21
21
 
22
22
  dependencies {
23
- compile "org.embulk:embulk-core:0.8.32"
24
- provided "org.embulk:embulk-core:0.8.32"
23
+ compile "org.embulk:embulk-core:0.9.+"
24
+ provided "org.embulk:embulk-core:0.9.+"
25
25
  compile "com.jayway.jsonpath:json-path:2.+"
26
26
  testCompile "junit:junit:4.+"
27
- testCompile "org.embulk:embulk-core:0.8.32:tests"
27
+ testCompile "org.embulk:embulk-core:0.9.+:tests"
28
28
  }
29
29
 
30
30
  jacocoTestReport {
@@ -59,19 +59,23 @@ task checkstyle(type: Checkstyle) {
59
59
  }
60
60
 
61
61
  task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
62
- jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
63
- script "${project.name}.gemspec"
62
+ jrubyArgs "-S"
63
+ script "gem"
64
+ scriptArgs "build", "${project.name}.gemspec"
64
65
  doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
65
66
  }
66
67
 
67
68
  task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
68
- jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
69
- script "pkg/${project.name}-${project.version}.gem"
69
+ jrubyArgs "-S"
70
+ script "gem"
71
+ scriptArgs "push", "pkg/${project.name}-${project.version}.gem"
70
72
  }
71
73
 
72
- task "package"(dependsOn: ["gemspec", "classpath"]) << {
73
- println "> Build succeeded."
74
- println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
74
+ task "package"(dependsOn: ["gemspec", "classpath"]) {
75
+ doLast {
76
+ println "> Build succeeded."
77
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
78
+ }
75
79
  }
76
80
 
77
81
  task gemspec {
@@ -93,7 +97,6 @@ Gem::Specification.new do |spec|
93
97
  spec.test_files = spec.files.grep(%r"^(test|spec)/")
94
98
  spec.require_paths = ["lib"]
95
99
 
96
- #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
97
100
  spec.add_development_dependency 'bundler', ['~> 1.0']
98
101
  spec.add_development_dependency 'rake', ['>= 10.0']
99
102
  end
@@ -0,0 +1 @@
1
+ rootProject.name = "embulk-filter-expand_json"
@@ -1,6 +1,11 @@
1
1
  package org.embulk.filter.expand_json;
2
2
 
3
3
  import com.google.common.collect.ImmutableList;
4
+ import com.jayway.jsonpath.JsonPathException;
5
+ import com.jayway.jsonpath.spi.cache.Cache;
6
+ import com.jayway.jsonpath.spi.cache.CacheProvider;
7
+ import com.jayway.jsonpath.spi.cache.LRUCache;
8
+ import com.jayway.jsonpath.spi.cache.NOOPCache;
4
9
  import org.embulk.config.Config;
5
10
  import org.embulk.config.ConfigDefault;
6
11
  import org.embulk.config.ConfigException;
@@ -19,6 +24,8 @@ import org.slf4j.Logger;
19
24
 
20
25
  import java.util.ArrayList;
21
26
  import java.util.List;
27
+ import java.util.Locale;
28
+ import java.util.Optional;
22
29
 
23
30
  public class ExpandJsonFilterPlugin
24
31
  implements FilterPlugin
@@ -29,14 +36,14 @@ public class ExpandJsonFilterPlugin
29
36
  extends Task, TimestampParser.Task
30
37
  {
31
38
  @Config("json_column_name")
32
- public String getJsonColumnName();
39
+ String getJsonColumnName();
33
40
 
34
41
  @Config("root")
35
42
  @ConfigDefault("\"$.\"")
36
- public String getRoot();
43
+ String getRoot();
37
44
 
38
45
  @Config("expanded_columns")
39
- public List<ColumnConfig> getExpandedColumns();
46
+ List<ColumnConfig> getExpandedColumns();
40
47
 
41
48
  // default_timezone option from TimestampParser.Task
42
49
 
@@ -46,7 +53,11 @@ public class ExpandJsonFilterPlugin
46
53
 
47
54
  @Config("keep_expanding_json_column")
48
55
  @ConfigDefault("false")
49
- public boolean getKeepExpandingJsonColumn();
56
+ boolean getKeepExpandingJsonColumn();
57
+
58
+ @Config("cache_provider")
59
+ @ConfigDefault("null")
60
+ Optional<String> getCacheProviderName();
50
61
  }
51
62
 
52
63
  @Override
@@ -60,6 +71,9 @@ public class ExpandJsonFilterPlugin
60
71
 
61
72
  PluginTask task = config.loadConfig(PluginTask.class);
62
73
 
74
+ // set cache provider
75
+ task.getCacheProviderName().ifPresent(this::setCacheProvider);
76
+
63
77
  // check if a column specified as json_column_name option exists or not
64
78
  Column jsonColumn = inputSchema.lookupColumn(task.getJsonColumnName());
65
79
  if (jsonColumn.getType() != Types.STRING && jsonColumn.getType() != Types.JSON) {
@@ -79,6 +93,8 @@ public class ExpandJsonFilterPlugin
79
93
  final Schema outputSchema, final PageOutput output)
80
94
  {
81
95
  final PluginTask task = taskSource.loadTask(PluginTask.class);
96
+ // set cache provider for mapreduce executor.
97
+ task.getCacheProviderName().ifPresent(this::setCacheProviderOrIgnore);
82
98
  return new FilteredPageOutput(task, inputSchema, outputSchema, output);
83
99
  }
84
100
 
@@ -152,4 +168,39 @@ public class ExpandJsonFilterPlugin
152
168
  columnList.add(columnName);
153
169
  }
154
170
  }
171
+
172
+ private void setCacheProvider(String cacheProviderName)
173
+ {
174
+ String upperCacheProviderName = cacheProviderName.toUpperCase(Locale.ENGLISH);
175
+ switch (upperCacheProviderName)
176
+ {
177
+ case "LRU":
178
+ CacheProvider.setCache(new LRUCache(400));
179
+ break;
180
+
181
+ case "NOOP":
182
+ CacheProvider.setCache(new NOOPCache());
183
+ break;
184
+
185
+ default:
186
+ try {
187
+ Class<?> klass = Class.forName(cacheProviderName);
188
+ Cache cache = (Cache) klass.newInstance();
189
+ CacheProvider.setCache(cache);
190
+ }
191
+ catch (ClassNotFoundException | IllegalAccessException | InstantiationException | ClassCastException e) {
192
+ throw new ConfigException(String.format("Cache Provider '%s' is not supported: %s.", cacheProviderName, e.getMessage()), e);
193
+ }
194
+ }
195
+ }
196
+
197
+ private void setCacheProviderOrIgnore(String cacheProviderName)
198
+ {
199
+ try {
200
+ setCacheProvider(cacheProviderName);
201
+ }
202
+ catch (JsonPathException e) {
203
+ logger.debug("Cache:{} is already set.", CacheProvider.getCache().getClass());
204
+ }
205
+ }
155
206
  }
@@ -32,9 +32,6 @@ import java.util.Map;
32
32
 
33
33
  import static org.embulk.filter.expand_json.ExpandJsonFilterPlugin.PluginTask;
34
34
 
35
- /**
36
- * Created by takahiro.nakayama on 10/19/15.
37
- */
38
35
  public class FilteredPageOutput
39
36
  implements PageOutput
40
37
  {