embulk-filter-ruby_proc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 911204f43abd2acd2d9ac90317e474994fbdf221
4
+ data.tar.gz: 4521e06844679d721b5e9a21e7d01a368b600278
5
+ SHA512:
6
+ metadata.gz: 8da351fbf8fe0f39f9e29289bd0365bce6ecc03af0f833c2088fda6fdc9a7b0789781f99a70b44e452adda9824804e192e8256e1290e5f61154c5d497084f3d9
7
+ data.tar.gz: 2f1ad9b592fb2210ada815239e5bed14f1f4265c2fc43ee715102542e87f97dacef2998031ad4b44b693ecf0fc3aef1b6afdd0b3ea2ce4e3e9c6da32d2fbd600
@@ -0,0 +1,6 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
6
+ /example/out*
@@ -0,0 +1 @@
1
+ jruby-9.0.4.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,80 @@
1
+ # Ruby Proc filter plugin for Embulk
2
+
3
+ This plugin is inspired by [mgi166/embulk-filter-eval: Eval ruby code on filtering](https://github.com/mgi166/embulk-filter-eval "mgi166/embulk-filter-eval: Eval ruby code on filtering")
4
+
5
+ This plugin applies a Ruby proc to the values of the configured columns in each record.
6
+
7
+ ## Overview
8
+
9
+ * **Plugin type**: filter
10
+
11
+ ## Configuration
12
+
13
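+ - **columns**: filter definitions, one entry per column to rewrite (array of hashes, required). Each entry takes `name` and `proc`, and optionally `type`, `format`, and `skip_nil` (default: `true`; when `true`, the proc is not called for `nil` values)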
14
+ - **requires**: libraries to `require` before the procs are evaluated (array, default: `[]`)
15
+
16
+ ## Example
17
+
18
+ ### input
19
+ ```csv
20
+ id,account,time,purchase,comment,data
21
+ 1,32864,2015-01-27 19:23:49,20150127,embulk,"{\"foo\": \"bar\", \"events\": [{\"id\": 1, \"name\": \"Name1\"}, {\"id\": 2, \"name\": \"Name2\"}]}"
22
+ 2,14824,2015-01-27 19:01:23,20150127,embulk jruby,NULL
23
+ 3,27559,2015-01-28 02:20:02,20150128,"Embulk ""csv"" parser plugin",NULL
24
+ 4,11270,2015-01-29 11:54:36,20150129,NULL,NULL
25
+ ```
26
+
27
+ ### config
28
+ ```yaml
29
+ # ...
30
+
31
+ filters:
32
+ - type: ruby_proc
33
+ requires:
34
+ - cgi
35
+ columns:
36
+ - name: data
37
+ proc: |
38
+ ->(data) do
39
+ data["events"] = data["events"].map.with_index do |e, idx|
40
+ e.tap { |e_| e_["idx"] = idx }
41
+ end
42
+ data.to_json
43
+ end
44
+ - name: id
45
+ proc: |
46
+ ->(id) do
47
+ id * 2
48
+ end
49
+ type: string
50
+ - name: comment
51
+ proc: |
52
+ ->(comment, record) do
53
+ return [record["account"].to_s].to_json unless comment
54
+ comment.upcase.split(" ").map { |s| CGI.escape(s) }.to_json
55
+ end
56
+ skip_nil: false
57
+ type: json
58
+ target: events
59
+
60
+ # ...
61
+
62
+ ```
63
+
64
+ ### preview
65
+ ```
66
+ +-----------+--------------+-------------------------+-------------------------+------------------------------------------+------------------------------------------------------------------------------------------+
67
+ | id:string | account:long | time:timestamp | purchase:timestamp | comment:json | data:json |
68
+ +-----------+--------------+-------------------------+-------------------------+------------------------------------------+------------------------------------------------------------------------------------------+
69
+ | 2 | 32,864 | 2015-01-27 19:23:49 UTC | 2015-01-27 00:00:00 UTC | ["EMBULK"] | {"events":[{"id":1,"name":"Name1","idx":0},{"id":2,"name":"Name2","idx":1}],"foo":"bar"} |
70
+ | 4 | 14,824 | 2015-01-27 19:01:23 UTC | 2015-01-27 00:00:00 UTC | ["EMBULK","JRUBY"] | |
71
+ | 6 | 27,559 | 2015-01-28 02:20:02 UTC | 2015-01-28 00:00:00 UTC | ["EMBULK","%22CSV%22","PARSER","PLUGIN"] | |
72
+ | 8 | 11,270 | 2015-01-29 11:54:36 UTC | 2015-01-29 00:00:00 UTC | ["11270"] | |
73
+ +-----------+--------------+-------------------------+-------------------------+------------------------------------------+------------------------------------------------------------------------------------------+
74
+ ```
75
+
76
+ ## Build
77
+
78
+ ```
79
+ $ rake
80
+ ```
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task default: :build
@@ -0,0 +1,19 @@
# Gem specification for the embulk-filter-ruby_proc Embulk filter plugin.
Gem::Specification.new do |gem|
  # --- Identity -----------------------------------------------------------
  gem.name        = "embulk-filter-ruby_proc"
  gem.version     = "0.1.0"
  gem.summary     = "Ruby Proc filter plugin for Embulk"
  gem.description = "Filter each record by ruby proc"
  gem.licenses    = ["MIT"]
  gem.homepage    = "https://github.com/kakyoin.hierophant/embulk-filter-ruby_proc"

  # --- Maintainer ---------------------------------------------------------
  gem.authors = ["joker1007"]
  gem.email   = ["kakyoin.hierophant@gmail.com"]

  # --- Packaged files -----------------------------------------------------
  # Everything tracked by git, plus any jars found under classpath/.
  tracked_files = `git ls-files`.split("\n")
  bundled_jars  = Dir["classpath/*.jar"]
  gem.files         = tracked_files + bundled_jars
  # test_files is derived from gem.files, so it must be set after it.
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # --- Development-only dependencies --------------------------------------
  gem.add_development_dependency 'embulk', ['>= 0.8.1']
  gem.add_development_dependency 'bundler', ['>= 1.10.6']
  gem.add_development_dependency 'rake', ['>= 10.0']
end
@@ -0,0 +1,67 @@
1
+ in:
2
+ type: file
3
+ path_prefix: ./sample_
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: csv
8
+ delimiter: ','
9
+ quote: '"'
10
+ escape: '\'
11
+ null_string: 'NULL'
12
+ trim_if_not_quoted: false
13
+ skip_header_lines: 1
14
+ allow_extra_columns: false
15
+ allow_optional_columns: false
16
+ columns:
17
+ - {name: id, type: long}
18
+ - {name: account, type: long}
19
+ - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
20
+ - {name: purchase, type: timestamp, format: '%Y%m%d'}
21
+ - {name: comment, type: string}
22
+ - {name: data, type: json}
23
+
24
+ filters:
25
+ - type: ruby_proc
26
+ requires:
27
+ - cgi
28
+ columns:
29
+ - name: data
30
+ proc: |
31
+ ->(data) do
32
+ data["events"] = data["events"].map.with_index do |e, idx|
33
+ e.tap { |e_| e_["idx"] = idx }
34
+ end
35
+ data.to_json
36
+ end
37
+ - name: id
38
+ proc: |
39
+ ->(id) do
40
+ id * 2
41
+ end
42
+ type: string
43
+ - name: comment
44
+ proc: |
45
+ ->(comment, record) do
46
+ return [record["account"].to_s].to_json unless comment
47
+ comment.upcase.split(" ").map { |s| CGI.escape(s) }.to_json
48
+ end
49
+ skip_nil: false
50
+ type: json
51
+ target: events
52
+
53
+ out:
54
+ type: file
55
+ path_prefix: ./out_
56
+ file_ext: tsv
57
+ formatter:
58
+ type: csv
59
+ delimiter: "\t"
60
+ newline: CRLF
61
+ newline_in_field: LF
62
+ charset: UTF-8
63
+ quote_policy: MINIMAL
64
+ quote: '"'
65
+ escape: "\\"
66
+ null_string: 'NULL'
67
+ default_timezone: 'UTC'
@@ -0,0 +1,5 @@
1
+ id,account,time,purchase,comment,data
2
+ 1,32864,2015-01-27 19:23:49,20150127,embulk,"{\"foo\": \"bar\", \"events\": [{\"id\": 1, \"name\": \"Name1\"}, {\"id\": 2, \"name\": \"Name2\"}]}"
3
+ 2,14824,2015-01-27 19:01:23,20150127,embulk jruby,NULL
4
+ 3,27559,2015-01-28 02:20:02,20150128,"Embulk ""csv"" parser plugin",NULL
5
+ 4,11270,2015-01-29 11:54:36,20150129,NULL,NULL
@@ -0,0 +1,70 @@
module Embulk
  module Filter

    # Embulk filter that rewrites column values with user-supplied Ruby procs.
    #
    # Each entry of the `columns` config names an input column and carries the
    # Ruby source of a callable in `proc`. For every record the callable is
    # invoked with the column's current value (and, unless its arity is exactly
    # 1, the whole record as a Hash keyed by column name); its return value
    # replaces the column's value in the output record.
    #
    # NOTE(security): the `proc` source is run through `eval`, so it executes
    # with the full privileges of the Embulk process. Only run configurations
    # that come from a trusted source.
    class RubyProc < FilterPlugin
      Plugin.register_filter("ruby_proc", self)

      # Reads the plugin configuration and derives the output schema.
      #
      # Columns named by a `columns` entry take that entry's `type` and
      # `format` when given and otherwise keep the input column's values;
      # columns without an entry are passed through unchanged.
      #
      # @param config    plugin configuration (`columns` is required,
      #                  `requires` defaults to an empty array)
      # @param in_schema input schema, enumerated column by column
      # @yield [task, out_columns] the task hash and the output columns
      # @return whatever the given block returns
      def self.transaction(config, in_schema, &control)
        task = {
          "columns" => config.param("columns", :array),
          "requires" => config.param("requires", :array, default: []),
        }

        out_columns = in_schema.map do |col|
          # When several entries share a name, the first one defines the schema.
          target = task["columns"].find { |filter_col| filter_col["name"] == col.name }
          next col unless target

          type = target["type"] ? target["type"].to_sym : col.type
          Embulk::Column.new(index: col.index, name: col.name, type: type, format: target["format"] || col.format)
        end

        yield(task, out_columns)
      end

      # Loads the libraries listed in `requires`, then compiles every
      # configured proc and records its per-column options.
      #
      # @raise [ArgumentError] if a column entry has no `proc` source or the
      #   source does not evaluate to a callable
      def init
        # Libraries are loaded first so the proc sources may reference them.
        task["requires"].each { |lib| require lib }

        @procs = Hash[task["columns"].map { |col| [col["name"], build_proc(col)] }]

        # `skip_nil` defaults to true when the key is absent or nil.
        @skip_nils = Hash[task["columns"].map { |col|
          [col["name"], col["skip_nil"].nil? ? true : !!col["skip_nil"]]
        }]

        # Arity never changes after compilation, so decide the calling
        # convention once here instead of once per record.
        @unary = Hash[@procs.map { |name, pr| [name, pr.arity == 1] }]
      end

      def close
      end

      # Applies the configured procs to every record of the page and appends
      # the rewritten record to the page builder.
      #
      # Procs run in configuration order against one shared Hash per record,
      # so a two-argument proc sees values already rewritten by earlier procs.
      #
      # @param page enumerable of records (arrays of column values)
      def add(page)
        page.each do |record|
          record_hash = hashrize(record)

          @procs.each do |col, pr|
            # Entries naming a column that is not in the input schema are ignored.
            next unless record_hash.key?(col)

            value = record_hash[col]
            next if value.nil? && @skip_nils[col]

            record_hash[col] = @unary[col] ? pr.call(value) : pr.call(value, record_hash)
          end

          page_builder.add(record_hash.values)
        end
      end

      # Flushes the page builder once all pages have been added.
      def finish
        page_builder.finish
      end

      private

      # Maps input column names to the values of one record.
      def hashrize(record)
        @column_names ||= in_schema.names
        Hash[@column_names.zip(record)]
      end

      # Evaluates the `proc` source of one column entry and checks that the
      # result can be used by #add, so configuration mistakes fail here with
      # the column name instead of surfacing on the first record.
      def build_proc(col)
        unless col["proc"].is_a?(String)
          raise ArgumentError, "ruby_proc: column #{col["name"].inspect} requires a `proc` string"
        end

        eval(col["proc"]).tap do |callable|
          unless callable.respond_to?(:call) && callable.respond_to?(:arity)
            raise ArgumentError, "ruby_proc: `proc` of column #{col["name"].inspect} must evaluate to a proc or lambda"
          end
        end
      end
    end

  end
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-ruby_proc
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - joker1007
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-02-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: embulk
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.8.1
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: 0.8.1
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.10.6
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: 1.10.6
39
+ prerelease: false
40
+ type: :development
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '10.0'
53
+ prerelease: false
54
+ type: :development
55
+ description: Filter each record by ruby proc
56
+ email:
57
+ - kakyoin.hierophant@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".ruby-version"
64
+ - Gemfile
65
+ - LICENSE.txt
66
+ - README.md
67
+ - Rakefile
68
+ - embulk-filter-ruby_proc.gemspec
69
+ - example/config.yml
70
+ - example/sample_01.csv
71
+ - lib/embulk/filter/ruby_proc.rb
72
+ homepage: https://github.com/kakyoin.hierophant/embulk-filter-ruby_proc
73
+ licenses:
74
+ - MIT
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project:
92
+ rubygems_version: 2.4.8
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Ruby Proc filter plugin for Embulk
96
+ test_files: []