embulk-parser-ltsv 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 33a6060131190c76380a815b6b1a89406158eed4
4
+ data.tar.gz: 85f2463f7fb53ba2e395393083ddcf52f198f935
5
+ SHA512:
6
+ metadata.gz: cf27bffe35212b122a46e4d93526fbe8a1ed359c53fc9c8aac6ecfb53d53c7f7458e2512dbfd1d692515a8ed61864e98a2f90c0c48d8ad9c749211828cd1b423
7
+ data.tar.gz: f2a8d8e9c77aa26093f0b631f145b51b28c911dc7d627b3aeea13f082883b12fd3ad2063342fb734f0a538e456e3e1d06b7efcc474610a41b506c47423221d83
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /vendor/
6
+ /Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # Ltsv parser plugin for Embulk
2
+
3
+
4
+ ## Overview
5
+
6
+ * **Plugin type**: parser
7
+ * **Guess supported**: yes
8
+
9
+ ## Configuration
10
+
11
+ - **option1**: description (integer, required)
12
+ - **option2**: description (string, default: `"myvalue"`)
13
+ - **null_value_pattern**: null value pattern. (string, default: `null`)
14
+
15
+ ## Example
16
+
17
+ ```yaml
18
+
19
+ in:
20
+ type: file
21
+ path_prefix: /Users/toyama-h/access_log-20150616.ltsv.gz
22
+ decoders:
23
+ - {type: gzip}
24
+ parser:
25
+ type: ltsv
26
+ charset: UTF-8
27
+ newline: CRLF
28
+ null_value_pattern: ^(-|null|NULL)$
29
+ schema:
30
+ - {name: host, type: string}
31
+ - {name: ip_address, type: string}
32
+ - {name: server, type: string}
33
+ - {name: remote_user, type: string}
34
+ - {name: log_time, type: timestamp, time_format: '%d/%b/%Y:%H:%M:%S %z'}
35
+ - {name: method, type: string}
36
+ - {name: path, type: string}
37
+ - {name: protocol, type: string}
38
+ - {name: status, type: long}
39
+ - {name: size, type: string}
40
+ - {name: referer, type: string}
41
+ - {name: user_agent, type: string}
42
+ - {name: response_time, type: long}
43
+ exec: {}
44
+ out: {type: stdout}
45
+
46
+
47
+ ```
48
+
49
+ (If guess supported) you don't have to write `parser:` section in the configuration file. After writing `in:` section, you can let embulk guess `parser:` section using this command:
50
+
51
+ ```
52
+ $ embulk gem install embulk-parser-ltsv
53
+ $ embulk guess -g ltsv config.yml -o guessed.yml
54
+ ```
55
+
56
+ ## Build
57
+
58
+ ```
59
+ $ rake
60
+ ```
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task default: :build
@@ -0,0 +1,19 @@
1
+
2
+ Gem::Specification.new do |spec|
3
+ spec.name = "embulk-parser-ltsv"
4
+ spec.version = "0.1.0"
5
+ spec.authors = ["toyama0919"]
6
+ spec.summary = "Ltsv parser plugin for Embulk"
7
+ spec.description = "Parses Ltsv files read by other file input plugins."
8
+ spec.email = ["toyama0919@gmail.com"]
9
+ spec.licenses = ["MIT"]
10
+ spec.homepage = "https://github.com/toyama0919/embulk-parser-ltsv"
11
+
12
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
13
+ spec.test_files = spec.files.grep(%r{^(test|spec)/})
14
+ spec.require_paths = ["lib"]
15
+
16
+ #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
17
+ spec.add_development_dependency 'bundler', ['~> 1.0']
18
+ spec.add_development_dependency 'rake', ['>= 10.0']
19
+ end
@@ -0,0 +1,53 @@
1
+ module Embulk
2
+ module Guess
3
+ class LtsvParserGuessPlugin < LineGuessPlugin
4
+ Plugin.register_guess("ltsv", self)
5
+
6
+ def guess_lines(config, sample_lines)
7
+ columns = {}
8
+ sample_lines.each do |line|
9
+ hash = {}
10
+ array = line.split("\t").each { |pair|
11
+ key, value = pair.split(":", 2)
12
+ hash[key] = value
13
+ }
14
+
15
+ hash.each do |k, v|
16
+ columns[k] = get_embulk_type(v)
17
+ end
18
+ end
19
+ schema = []
20
+ columns.each do |k,v|
21
+ schema << {'name' => k, 'type' => v}
22
+ end
23
+ guessed = {}
24
+ guessed["type"] = "ltsv"
25
+ guessed["schema"] = schema
26
+ return {"parser" => guessed}
27
+ end
28
+
29
+ private
30
+
31
+ def get_embulk_type(val)
32
+ if val =~ /^\d+\.\d+$/
33
+ return "double"
34
+ end
35
+
36
+ if val =~ /^\d+$/
37
+ return "long"
38
+ end
39
+
40
+ begin
41
+ Time.parse(val)
42
+ return "timestamp"
43
+ rescue => e
44
+ end
45
+
46
+ if val =~ /^(true|false)$/i
47
+ return "boolean"
48
+ end
49
+ return "string"
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,103 @@
1
+ module Embulk
2
+ module Parser
3
+
4
+ class Ltsv < ParserPlugin
5
+ Plugin.register_parser("ltsv", self)
6
+
7
+ def self.transaction(config, &control)
8
+ parser_task = config.load_config(Java::LineDecoder::DecoderTask)
9
+ task = {
10
+ "decoder_task" => DataSource.from_java(parser_task.dump),
11
+ "schema" => config.param("schema", :array),
12
+ "null_value_pattern" => config.param("null_value_pattern", :string, default: nil),
13
+ "null_empty_string" => config.param("null_empty_string", :bool, default: false),
14
+ "delimiter" => config.param("delimiter", :string, default: "\t"),
15
+ "label_delimiter" => config.param("label_delimiter", :string, default: ":")
16
+ }
17
+ columns = task["schema"].each_with_index.map do |c, i|
18
+ Column.new(i, c["name"], c["type"].to_sym)
19
+ end
20
+ yield(task, columns)
21
+ end
22
+
23
+ def init
24
+ @delimiter = task["delimiter"]
25
+ @label_delimiter = task["label_delimiter"]
26
+ @null_value_pattern = task["null_value_pattern"] ? Regexp.new(task["null_value_pattern"]) : nil
27
+ @null_empty_string = task["null_empty_string"]
28
+ @decoder_task = task.param("decoder_task", :hash).load_task(Java::LineDecoder::DecoderTask)
29
+ end
30
+
31
+ def run(file_input)
32
+ decoder = Java::LineDecoder.new(file_input.instance_eval { @java_file_input }, @decoder_task)
33
+
34
+ while decoder.nextFile
35
+ while line = decoder.poll
36
+ begin
37
+ array = line.split(@delimiter).map { |pair|
38
+ pair.split(@label_delimiter, 2)
39
+ }
40
+ @page_builder.add(make_record(Hash[*array.flatten]))
41
+ rescue => e
42
+ puts "\n#{e.message}\n#{e.backtrace.join("\n")}"
43
+ end
44
+ end
45
+ end
46
+ page_builder.finish
47
+ end
48
+
49
+ private
50
+
51
+ def make_record(e)
52
+ @task["schema"].map do |c|
53
+ convert_value(e, c)
54
+ end
55
+ end
56
+
57
+ def convert_value(e, c)
58
+ v = convert_value_to_nil(e[c["name"]])
59
+ return nil if v.nil?
60
+ case c["type"]
61
+ when "string"
62
+ v
63
+ when "long"
64
+ v.to_i
65
+ when "double"
66
+ v.to_f
67
+ when "boolean"
68
+ ["yes", "true", "1"].include?(v.downcase)
69
+ when "timestamp"
70
+ if v.empty?
71
+ nil
72
+ else
73
+ c["time_format"] ? Time.strptime(v, c["time_format"]) : Time.parse(v)
74
+ end
75
+ else
76
+ raise "Unsupported type #{c['type']}"
77
+ end
78
+ end
79
+
80
+ def convert_value_to_nil(value)
81
+ if value and @null_empty_string
82
+ value = (value == '') ? nil : value
83
+ end
84
+ if value and @null_value_pattern
85
+ value = match_regexp(@null_value_pattern, value) ? nil : value
86
+ end
87
+ value
88
+ end
89
+
90
+ def match_regexp(regexp, string)
91
+ begin
92
+ return regexp.match(string)
93
+ rescue ArgumentError => e
94
+ raise e unless e.message.index("invalid byte sequence in".freeze).zero?
95
+ log.info "invalid byte sequence is replaced in `#{string}`"
96
+ string = string.scrub('?')
97
+ retry
98
+ end
99
+ return true
100
+ end
101
+ end
102
+ end
103
+ end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-ltsv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-08-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Parses Ltsv files read by other file input plugins.
42
+ email:
43
+ - toyama0919@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - embulk-parser-ltsv.gemspec
54
+ - lib/embulk/guess/ltsv.rb
55
+ - lib/embulk/parser/ltsv.rb
56
+ homepage: https://github.com/toyama0919/embulk-parser-ltsv
57
+ licenses:
58
+ - MIT
59
+ metadata: {}
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ requirements: []
75
+ rubyforge_project:
76
+ rubygems_version: 2.4.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Ltsv parser plugin for Embulk
80
+ test_files: []
81
+ has_rdoc: