embulk-parser-jsonl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f98fb168c55e6a4c80f5d098bc72945091fdcf8d
4
+ data.tar.gz: 5e85f4e3d5aece158e1b55e381d40ee6eda1d69d
5
+ SHA512:
6
+ metadata.gz: 6452226f7de14018279af312af6f4105dfd98668506168f3acb4e3139519b6a93353e1bac454c0bc272bca96375c4a6fedb201b59f9b639ca0ec7b7774bd69bb
7
+ data.tar.gz: 1fa70b407a8ad5d8fa37a6467e621aa7e8fd937cedb908d85f3189bcb2afd6506b476c62433bc4ef47ad241b31fc47b46dfc50c77388d504294348540014c09d
@@ -0,0 +1,5 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # Jsonl parser plugin for Embulk
2
+
3
+ Parses JSON Lines (JSONL) files, which contain one JSON object per line, read by other file input plugins.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: parser
8
+ * **Guess supported**: yes
9
+
10
+ ## Configuration
11
+
12
+ - **type**: specify this parser as jsonl
13
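+ - **schema**: list of columns, each with a `name` and a `type` (`string`, `long`, `double`, `boolean`, or `timestamp`; a `timestamp` column also takes a `time_format`) (array, required)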
14
+
15
+ ## Example
16
+
17
+ ```yaml
18
+ in:
19
+ type: any file input plugin type
20
+ parser:
21
+ type: jsonl
22
+ schema:
23
+ - {name: first_name, type: string}
24
+ - {name: last_name, type: string}
25
+ - {name: age, type: long}
26
+ ```
27
+
28
+ Because guess is supported, you don't have to write the `parser:` section in the configuration file. After writing the `in:` section, you can let Embulk guess the `parser:` section using these commands:
29
+
30
+ ```
31
+ $ embulk install embulk-parser-jsonl
32
+ $ embulk guess -g jsonl config.yml -o guessed.yml
33
+ ```
34
+
35
+ ## Build
36
+
37
+ ```
38
+ $ rake
39
+ ```
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
# Running `rake` without arguments runs the `build` task.
task(default: [:build])
@@ -0,0 +1,18 @@
1
+
2
# Gem specification for the embulk-parser-jsonl plugin.
Gem::Specification.new do |spec|
  spec.name          = "embulk-parser-jsonl"
  spec.version       = "0.0.1"
  spec.authors       = ["Shunsuke Mikami"]
  spec.summary       = "Jsonl parser plugin for Embulk"
  spec.description   = "Parses Jsonl files read by other file input plugins."
  spec.email         = ["shun0102@gmail.com"]
  spec.licenses      = ["MIT"]
  spec.homepage      = "https://github.com/shun0102/embulk-parser-jsonl"

  # Package every file tracked by git plus any bundled jars. The -z flag makes
  # git emit NUL-separated, unquoted paths, so file names containing spaces,
  # newlines or non-ASCII characters are listed verbatim.
  spec.files         = `git ls-files -z`.split("\x0") + Dir["classpath/*.jar"]
  spec.test_files    = spec.files.grep(%r{^(test|spec)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency 'bundler', ['~> 1.0']
  spec.add_development_dependency 'rake', ['~> 10.0']
end
@@ -0,0 +1,85 @@
1
+ require 'json'
2
+
3
+ module Embulk
4
+ module Guess
5
+
6
+ # TODO implement guess plugin to make this command work:
7
+ # $ embulk guess -g "jsonl" partial-config.yml
8
+ #
9
+ # Depending on the file format the plugin uses, you can choose
10
+ # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
11
+ # or line guess (LineGuessPlugin).
12
+
13
+ require "embulk/parser/jsonl.rb"
14
+
15
+ #class JsonlParserGuessPlugin < GuessPlugin
16
+ # Plugin.register_guess("jsonl", self)
17
+ #
18
+ # def guess(config, sample_buffer)
19
+ # if sample_buffer[0,2] == GZIP_HEADER
20
+ # guessed = {}
21
+ # guessed["type"] = "jsonl"
22
+ # guessed["property1"] = "guessed-value"
23
+ # return {"parser" => guessed}
24
+ # else
25
+ # return {}
26
+ # end
27
+ # end
28
+ #end
29
+
30
+ #class JsonlParserGuessPlugin < TextGuessPlugin
31
+ # Plugin.register_guess("jsonl", self)
32
+ #
33
+ # def guess_text(config, sample_text)
34
+ # js = JSON.parse(sample_text) rescue nil
35
+ # if js && js["mykeyword"] == "keyword"
36
+ # guessed = {}
37
+ # guessed["type"] = "jsonl"
38
+ # guessed["property1"] = "guessed-value"
39
+ # return {"parser" => guessed}
40
+ # else
41
+ # return {}
42
+ # end
43
+ # end
44
+ #end
45
+
46
# Guess plugin that derives a `jsonl` parser schema from sample lines.
#
# Every sample line that parses as a JSON object contributes its keys as
# column names and its values as hints for the column types.
class JsonlParserGuessPlugin < LineGuessPlugin
  Plugin.register_guess("jsonl", self)

  # Builds the guessed parser configuration from the sample lines.
  #
  # Lines that are not valid JSON (blank lines, a truncated last line of the
  # sample) or that are not JSON objects are skipped instead of aborting the
  # guess.
  #
  # @param config [Object] partial configuration; not read by this method
  # @param sample_lines [#each] sample lines, each a String
  # @return [Hash] {"parser" => {"type" => "jsonl", "schema" => [...]}}, or
  #   an empty Hash when no sample line is a JSON object
  def guess_lines(config, sample_lines)
    columns = {}
    matched = false

    sample_lines.each do |line|
      record = parse_object(line)
      next unless record

      matched = true
      record.each do |name, value|
        if value.nil?
          # A null carries no type information: remember the column, but
          # never let it override a type learned from another line.
          columns[name] = nil unless columns.key?(name)
        else
          columns[name] = merge_types(columns[name], get_embulk_type(value))
        end
      end
    end

    return {} unless matched

    # Columns keep the order in which they were first seen; a column that
    # only ever held null falls back to "string".
    schema = columns.map do |name, type|
      {'name' => name, 'type' => type || "string"}
    end
    {"parser" => {"type" => "jsonl", "schema" => schema}}
  end

  private

  # Parses one line and returns it only when it is a JSON object.
  #
  # @param line [String]
  # @return [Hash, nil] the parsed object, or nil for invalid or non-object JSON
  def parse_object(line)
    parsed = JSON.parse(line)
    parsed.is_a?(Hash) ? parsed : nil
  rescue JSON::ParserError
    nil
  end

  # Combines the type known so far for a column with a newly observed one.
  #
  # @param current [String, nil] type guessed from earlier lines, if any
  # @param candidate [String] type guessed from the current value
  # @return [String] the same type when both agree, "double" for a mix of
  #   "long" and "double", "string" for any other conflict
  def merge_types(current, candidate)
    return candidate if current.nil? || current == candidate
    return "double" if [current, candidate].sort == %w[double long]

    "string"
  end

  # Maps a parsed JSON value to an Embulk column type name.
  #
  # @param val [Object] value produced by JSON.parse
  # @return [String] "boolean", "long", "double" or "string"
  def get_embulk_type(val)
    case val
    when TrueClass, FalseClass
      "boolean"
    when Integer
      "long"
    when Float
      "double"
    else
      "string"
    end
  end
end
84
+ end
85
+ end
@@ -0,0 +1,67 @@
1
require 'json'
require 'time'
2
+
3
+ module Embulk
4
+ module Parser
5
+
6
# Parser plugin that turns JSON Lines input (one JSON object per line) into
# records laid out according to the configured schema.
class JsonlParserPlugin < ParserPlugin
  Plugin.register_parser("jsonl", self)

  # Column types make_record knows how to convert.
  SUPPORTED_TYPES = %w[string long double boolean timestamp].freeze
  # Lower-cased textual values that count as true for boolean columns.
  TRUE_STRINGS = %w[yes true 1].freeze

  # Reads the configuration, builds the task and the output columns, and
  # yields both to the caller.
  #
  # @param config [Object] plugin configuration; must provide a "schema" array
  #   whose entries have "name" and "type" keys
  # @yield [task, columns] the task Hash and the Column list built from the schema
  # @raise [RuntimeError] when a schema entry uses an unsupported type
  def self.transaction(config, &control)
    parser_task = config.load_config(Java::LineDecoder::DecoderTask)
    task = {
      "decoder_task" => DataSource.from_java(parser_task.dump),
      "schema" => config.param("schema", :array)
    }

    # Fail fast on a bad schema: inside #run the same error would be caught
    # by the per-line rescue and every row would be skipped.
    task["schema"].each do |c|
      raise "Unsupported type #{c['type']}" unless SUPPORTED_TYPES.include?(c["type"])
    end

    columns = task["schema"].each_with_index.map do |c, i|
      Column.new(i, c["name"], c["type"].to_sym)
    end
    yield(task, columns)
  end

  # Restores the line decoder task stored in the task by .transaction.
  def init
    @decoder_task = task.param("decoder_task", :hash).load_task(Java::LineDecoder::DecoderTask)
  end

  # Decodes every line of every input file, parses it as JSON and adds the
  # resulting record to the page builder.
  #
  # A line that cannot be parsed or converted is skipped with a warning on
  # stderr, so one bad line does not abort the whole input.
  #
  # @param file_input [Object] file input whose @java_file_input is handed to
  #   the line decoder
  def run(file_input)
    decoder = Java::LineDecoder.new(file_input.instance_eval { @java_file_input }, @decoder_task)
    schema = task["schema"]

    while decoder.nextFile
      while (line = decoder.poll)
        begin
          page_builder.add(make_record(schema, JSON.parse(line)))
        rescue => e
          warn "embulk-parser-jsonl: skipped line (#{e.class}: #{e.message})"
        end
      end
    end
    page_builder.finish
  end

  private

  # Converts one parsed JSON object into an array of column values ordered
  # like the schema.
  #
  # Missing or null values become "" for string, 0 for long, 0.0 for double,
  # false for boolean and nil for timestamp columns.
  #
  # @param schema [Array<Hash>] column definitions with "name", "type" and,
  #   for timestamp columns, "time_format"
  # @param e [Hash] parsed JSON object
  # @return [Array] one value per schema column
  # @raise [TypeError] when the line is not a JSON object
  # @raise [RuntimeError] when a column has an unsupported type
  def make_record(schema, e)
    raise TypeError, "expected a JSON object, got #{e.class}" unless e.is_a?(Hash)

    schema.map do |c|
      val = e[c["name"]]
      v = val.nil? ? "" : val
      case c["type"]
      when "string"
        stringify(v)
      when "long"
        v.to_i
      when "double"
        v.to_f
      when "boolean"
        # to_s lets JSON true/false and numbers be compared like their
        # textual forms ("true", "1").
        TRUE_STRINGS.include?(v.to_s.downcase)
      when "timestamp"
        text = v.to_s
        text.empty? ? nil : Time.strptime(text, c["time_format"])
      else
        raise "Unsupported type #{c['type']}"
      end
    end
  end

  # Renders a JSON value as text for a string column: strings pass through
  # unchanged, objects and arrays are re-serialized as JSON, and other
  # scalars use to_s.
  #
  # @param value [Object]
  # @return [String]
  def stringify(value)
    case value
    when String
      value
    when Hash, Array
      value.to_json
    else
      value.to_s
    end
  end
end
65
+
66
+ end
67
+ end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-jsonl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Shunsuke Mikami
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses Jsonl files read by other file input plugins.
42
+ email:
43
+ - shun0102@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - embulk-parser-jsonl.gemspec
54
+ - lib/embulk/guess/jsonl.rb
55
+ - lib/embulk/parser/jsonl.rb
56
+ homepage: https://github.com/shun0102/embulk-parser-jsonl
57
+ licenses:
58
+ - MIT
59
+ metadata: {}
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ requirements: []
75
+ rubyforge_project:
76
+ rubygems_version: 2.2.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Jsonl parser plugin for Embulk
80
+ test_files: []