embulk-parser-ltsv 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 33a6060131190c76380a815b6b1a89406158eed4
4
+ data.tar.gz: 85f2463f7fb53ba2e395393083ddcf52f198f935
5
+ SHA512:
6
+ metadata.gz: cf27bffe35212b122a46e4d93526fbe8a1ed359c53fc9c8aac6ecfb53d53c7f7458e2512dbfd1d692515a8ed61864e98a2f90c0c48d8ad9c749211828cd1b423
7
+ data.tar.gz: f2a8d8e9c77aa26093f0b631f145b51b28c911dc7d627b3aeea13f082883b12fd3ad2063342fb734f0a538e456e3e1d06b7efcc474610a41b506c47423221d83
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /vendor/
6
+ /Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # Ltsv parser plugin for Embulk
2
+
3
+
4
+ ## Overview
5
+
6
+ * **Plugin type**: parser
7
+ * **Guess supported**: yes
8
+
9
+ ## Configuration
10
+
11
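+ - **schema**: list of columns, each with a `name`, a `type` and, for timestamp columns, an optional `time_format` (array, required)
12
+ - **null_value_pattern**: regular expression; values matching it are loaded as null (string, default: `null`)
13
+ - **null_empty_string**: when true, empty string values are loaded as null (boolean, default: `false`)
+ - **delimiter**: delimiter between fields (string, default: `"\t"`)
+ - **label_delimiter**: delimiter between a label and its value (string, default: `":"`)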
14
+
15
+ ## Example
16
+
17
+ ```yaml
18
+
19
+ in:
20
+ type: file
21
+ path_prefix: /Users/toyama-h/access_log-20150616.ltsv.gz
22
+ decoders:
23
+ - {type: gzip}
24
+ parser:
25
+ type: ltsv
26
+ charset: UTF-8
27
+ newline: CRLF
28
+ null_value_pattern: ^(-|null|NULL)$
29
+ schema:
30
+ - {name: host, type: string}
31
+ - {name: ip_address, type: string}
32
+ - {name: server, type: string}
33
+ - {name: remote_user, type: string}
34
+ - {name: log_time, type: timestamp, time_format: '%d/%b/%Y:%H:%M:%S %z'}
35
+ - {name: method, type: string}
36
+ - {name: path, type: string}
37
+ - {name: protocol, type: string}
38
+ - {name: status, type: long}
39
+ - {name: size, type: string}
40
+ - {name: referer, type: string}
41
+ - {name: user_agent, type: string}
42
+ - {name: response_time, type: long}
43
+ exec: {}
44
+ out: {type: stdout}
45
+
46
+
47
+ ```
48
+
49
+ Because this plugin supports guess, you don't have to write the `parser:` section in the configuration file. After writing the `in:` section, you can let Embulk guess the `parser:` section using these commands:
50
+
51
+ ```
52
+ $ embulk gem install embulk-parser-ltsv
53
+ $ embulk guess -g ltsv config.yml -o guessed.yml
54
+ ```
55
+
56
+ ## Build
57
+
58
+ ```
59
+ $ rake
60
+ ```
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task default: :build
@@ -0,0 +1,19 @@
1
+
2
# Gem specification for the embulk-parser-ltsv plugin.
Gem::Specification.new do |gem|
  # Identity
  gem.name    = "embulk-parser-ltsv"
  gem.version = "0.1.0"

  # Authorship and licensing
  gem.authors  = ["toyama0919"]
  gem.email    = ["toyama0919@gmail.com"]
  gem.licenses = ["MIT"]
  gem.homepage = "https://github.com/toyama0919/embulk-parser-ltsv"

  # Descriptions shown by the gem registry
  gem.summary     = "Ltsv parser plugin for Embulk"
  gem.description = "Parses Ltsv files read by other file input plugins."

  # Packaged files: everything tracked by git plus any bundled jars
  tracked_files = `git ls-files`.split("\n")
  bundled_jars  = Dir["classpath/*.jar"]
  gem.files         = tracked_files + bundled_jars
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # Only development dependencies are declared; there are no runtime ones.
  gem.add_development_dependency 'bundler', ['~> 1.0']
  gem.add_development_dependency 'rake', ['>= 10.0']
end
@@ -0,0 +1,53 @@
1
+ module Embulk
2
+ module Guess
3
+ class LtsvParserGuessPlugin < LineGuessPlugin
4
+ Plugin.register_guess("ltsv", self)
5
+
6
# Guesses the parser configuration from sampled LTSV lines.
#
# Every sampled value contributes to the guessed type of its label: a
# column keeps a specific type only when all of its samples agree. Samples
# that mix long and double become double; any other disagreement falls
# back to string, which can represent every value.
#
# @param config [Object] guess configuration (not read by this method)
# @param sample_lines [Array<String>] sampled lines, one LTSV record each
# @return [Hash] {"parser" => {"type" => "ltsv", "schema" => [...]}} where
#   each schema entry is {"name" => label, "type" => guessed type}
def guess_lines(config, sample_lines)
  columns = {}
  sample_lines.each do |line|
    line.split("\t").each do |pair|
      key, value = pair.split(":", 2)
      # A field without a label or without the ":" separator carries no
      # usable sample, so it must not create a column.
      next if key.nil? || key.empty? || value.nil?

      guessed = get_embulk_type(value)
      columns[key] = columns.key?(key) ? merge_embulk_types(columns[key], guessed) : guessed
    end
  end

  schema = columns.map { |name, type| { 'name' => name, 'type' => type } }
  { "parser" => { "type" => "ltsv", "schema" => schema } }
end

private

# Reconciles the types guessed for the same column from two samples.
#
# @param current [String] type guessed so far
# @param candidate [String] type guessed from the next sample
# @return [String] a type able to hold both samples
def merge_embulk_types(current, candidate)
  return current if current == candidate
  return "double" if [current, candidate].sort == %w[double long]

  "string"
end

# Classifies a single sampled value.
#
# Checks run in this order: double, long, timestamp, boolean; anything
# else (including nil and the empty string) is reported as string.
#
# @param val [String, nil] raw field value
# @return [String] one of "double", "long", "timestamp", "boolean", "string"
def get_embulk_type(val)
  text = val.to_s
  # \Z (not \z) keeps tolerating a single trailing newline, as `$` did.
  return "double" if text =~ /\A[-+]?\d+\.\d+\Z/
  return "long" if text =~ /\A[-+]?\d+\Z/
  return "timestamp" if time_like?(text)
  return "boolean" if text =~ /\A(true|false)\Z/i

  "string"
end

# Tells whether Time.parse accepts the text.
#
# NOTE(review): Time.parse is lenient, so some loosely date-like values may
# be classified as timestamps; any failure is treated as "not a timestamp".
#
# @param text [String] candidate value
# @return [Boolean]
def time_like?(text)
  Time.parse(text)
  true
rescue StandardError
  false
end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,103 @@
1
+ module Embulk
2
+ module Parser
3
+
4
+ class Ltsv < ParserPlugin
5
+ Plugin.register_parser("ltsv", self)
6
+
7
# Builds the parser task from the plugin configuration and yields it
# together with the output columns derived from the configured schema.
#
# @param config the plugin configuration to read options from
# @yield [task, columns] the task hash and the list of Column objects
def self.transaction(config, &control)
  decoder_config = config.load_config(Java::LineDecoder::DecoderTask)

  task = {}
  task["decoder_task"]       = DataSource.from_java(decoder_config.dump)
  task["schema"]             = config.param("schema", :array)
  task["null_value_pattern"] = config.param("null_value_pattern", :string, default: nil)
  task["null_empty_string"]  = config.param("null_empty_string", :bool, default: false)
  task["delimiter"]          = config.param("delimiter", :string, default: "\t")
  task["label_delimiter"]    = config.param("label_delimiter", :string, default: ":")

  # One Column per schema entry, indexed by its position in the schema
  columns = task["schema"].map.with_index do |column, index|
    Column.new(index, column["name"], column["type"].to_sym)
  end

  yield(task, columns)
end
22
+
23
# Copies the task options into instance variables used while parsing.
# The null value pattern is compiled to a Regexp only when one is set.
def init
  pattern_source = task["null_value_pattern"]

  @delimiter = task["delimiter"]
  @label_delimiter = task["label_delimiter"]
  @null_value_pattern = (Regexp.new(pattern_source) if pattern_source)
  @null_empty_string = task["null_empty_string"]

  decoder_source = task.param("decoder_task", :hash)
  @decoder_task = decoder_source.load_task(Java::LineDecoder::DecoderTask)
end
30
+
31
# Reads every line of every input file, splits it into labelled fields and
# adds one record per line to the page builder.
#
# Fields are collected pair by pair, so a field that lacks the label
# delimiter is stored with a nil value instead of shifting the labels and
# values of the other fields or discarding the whole line.
#
# @param file_input the file input wrapper; its @java_file_input is handed
#   to the line decoder
def run(file_input)
  decoder = Java::LineDecoder.new(file_input.instance_eval { @java_file_input }, @decoder_task)

  while decoder.nextFile
    while (line = decoder.poll)
      begin
        record = {}
        line.split(@delimiter).each do |pair|
          label, value = pair.split(@label_delimiter, 2)
          next if label.nil? # empty field, e.g. two consecutive delimiters

          record[label] = value
        end
        page_builder.add(make_record(record))
      rescue => e
        # Best effort: a line that cannot be converted is reported and skipped.
        # NOTE(review): this prints to stdout; consider the plugin logger.
        puts "\n#{e.message}\n#{e.backtrace.join("\n")}"
      end
    end
  end
  page_builder.finish
end
48
+
49
+ private
50
+
51
# Converts a parsed line into a row ordered like the configured schema.
#
# @param e [Hash] label => raw value pairs of one line
# @return [Array] one converted value per schema column
def make_record(e)
  @task["schema"].map { |column| convert_value(e, column) }
end
56
+
57
# Converts the raw value of one column to the column's configured type.
#
# @param e [Hash] label => raw value pairs of one line
# @param c [Hash] schema entry with "name", "type" and optional "time_format"
# @return the converted value, or nil when the value is missing or null
# @raise [RuntimeError] when the column type is not supported
def convert_value(e, c)
  raw = convert_value_to_nil(e[c["name"]])
  return nil if raw.nil?

  type = c["type"]
  return raw if type == "string"
  return raw.to_i if type == "long"
  return raw.to_f if type == "double"
  return ["yes", "true", "1"].include?(raw.downcase) if type == "boolean"
  raise "Unsupported type #{c['type']}" unless type == "timestamp"

  # Timestamp: an empty value yields nil; otherwise use the explicit
  # format when the column declares one.
  return nil if raw.empty?

  format = c["time_format"]
  format ? Time.strptime(raw, format) : Time.parse(raw)
end
79
+
80
# Maps values that should be treated as null to nil.
#
# @param value [String, nil] raw field value
# @return the value itself, or nil when it is an empty string and
#   @null_empty_string is set, or when it matches @null_value_pattern
def convert_value_to_nil(value)
  # Missing values pass through untouched
  return value unless value
  return nil if @null_empty_string && value == ''
  return nil if @null_value_pattern && match_regexp(@null_value_pattern, value)

  value
end
89
+
90
# Matches a string against a pattern, tolerating invalid byte sequences.
#
# When matching fails because the string contains an invalid byte
# sequence, the offending bytes are replaced with "?" and the match is
# attempted once more. Any other ArgumentError is re-raised unchanged.
#
# @param regexp [Regexp] pattern to match against
# @param string [String] value to test
# @return [MatchData, nil] result of the match
# @raise [ArgumentError] when matching fails for any other reason
def match_regexp(regexp, string)
  regexp.match(string)
rescue ArgumentError => e
  # start_with? is nil-safe, unlike index(...).zero?, which raised
  # NoMethodError whenever the message did not contain the phrase.
  raise unless e.message.start_with?("invalid byte sequence in")

  # NOTE(review): `log` is not defined in the visible code — confirm the
  # plugin base class provides it.
  log.info "invalid byte sequence is replaced in `#{string}`"
  regexp.match(string.scrub('?'))
end
101
+ end
102
+ end
103
+ end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-ltsv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses Ltsv files read by other file input plugins.
42
+ email:
43
+ - toyama0919@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - embulk-parser-ltsv.gemspec
54
+ - lib/embulk/guess/ltsv.rb
55
+ - lib/embulk/parser/ltsv.rb
56
+ homepage: https://github.com/toyama0919/embulk-parser-ltsv
57
+ licenses:
58
+ - MIT
59
+ metadata: {}
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ requirements: []
75
+ rubyforge_project:
76
+ rubygems_version: 2.4.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Ltsv parser plugin for Embulk
80
+ test_files: []
81
+ has_rdoc: