embulk-parser-fixed 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: de264839a5caf9e90ab09ac915ac9554d2407418
4
+ data.tar.gz: bf15e9204a7c44c0d97ab63613ddb6984670d751
5
+ SHA512:
6
+ metadata.gz: e612d874078ca2409fa39503e45cf74d41f2dd183e9f8d69ca6b249de10a767c9f981afd18be651b9c3a2c4b58c064aa7ab0477ed678370cd9a55254d21f6322
7
+ data.tar.gz: aba8fd7dd76f21e303aabce2edc95b8b5b0ce2da256d7c366136ddca7e838c11b805ed5ae5a259f52119766bab73b54618652d60c06cada2b7b0b2e1f36d6e8d
@@ -0,0 +1,5 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
@@ -0,0 +1 @@
1
+ jruby-9.1.5.0
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - jruby-9.1.5.0
4
+ jdk:
5
+ - oraclejdk8
6
+ env:
7
+ global:
8
+ - JRUBY_OPTS="-Xcli.debug=true --debug"
9
+
10
+ gemfile:
11
+ - gemfiles/embulk-latest
12
+
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,36 @@
1
+ [![Build Status](https://travis-ci.org/kakoni/embulk-parser-fixed.svg?branch=master)](https://travis-ci.org/kakoni/embulk-parser-fixed)
2
+
3
+ # Fixed width parser plugin for Embulk
4
+
5
+ Fixed width parser. Useful for parsing fixed width format files.
6
+ For example, it can transform the line `FirstSecond Third` into `{key: "First", key2: "Second", key3: "Third"}`.
7
+
8
+ ## Overview
9
+
10
+ * **Plugin type**: parser
11
+ * **Guess supported**: no
12
+
13
+ ## Configuration
14
+
15
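+ - **columns**: declares the list of columns. Each entry has a `name`, a `type`, and a `pos` giving the column's position in the input line as a zero-based, inclusive character range (for example, `0..2` covers the first three characters). Values are emitted in the order the columns are listed. (array, default: `[]`)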
16
+ - **strip_whitespace**: Strip whitespace from parsed values. (bool, default: true)
17
+
18
+ ## Example
19
+
20
+ ```yaml
21
+ in:
22
+ type: any file input plugin type
23
+ parser:
24
+     type: fixed
25
+ columns:
26
+ - {name: first, type: string, pos: 0..1}
27
+ - {name: second, type: string, pos: 3..7}
28
+ - {name: third, type: string, pos: 10..12}
29
+
30
+ ```
31
+
32
+ ## Install plugin
33
+
34
+ ```
35
+ $ embulk gem install embulk-parser-fixed
36
+ ```
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+
# Running `rake` with no arguments runs the test task.
task :default => :test

desc "Run tests"
task :test do
  # Launch the test runner script with colored output enabled.
  ruby "test/run-test.rb", "--use-color=yes"
end
@@ -0,0 +1,21 @@
1
+
# Gem specification for the embulk-parser-fixed plugin.
Gem::Specification.new do |s|
  # Identity and descriptive metadata.
  s.name        = "embulk-parser-fixed"
  s.version     = "0.1.0"
  s.summary     = "Fixed width parser plugin for Embulk"
  s.description = "Parses fixed width files read by other file input plugins."
  s.homepage    = "https://github.com/kakoni/embulk-parser-fixed"
  s.licenses    = ["MIT"]
  s.authors     = ["Karri Niemel\u{e4}"]
  s.email       = ["kakoni@gmail.com"]

  # Package everything tracked by git plus any jars under classpath/.
  tracked_files = `git ls-files`.split("\n")
  bundled_jars  = Dir["classpath/*.jar"]
  s.files         = tracked_files + bundled_jars
  s.test_files    = s.files.grep(%r{^(test|spec)/})
  s.require_paths = ["lib"]

  # Everything below is needed only to develop and test the plugin.
  s.add_development_dependency("embulk", [">= 0.8.9"])
  s.add_development_dependency("bundler", [">= 1.10.6"])
  s.add_development_dependency("rake", [">= 10.0"])
  s.add_development_dependency("test-unit")
  s.add_development_dependency("test-unit-rr")
end
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec :path => '../'
3
+
4
+ gem "embulk", "~> 0.8.15"
@@ -0,0 +1,63 @@
module Embulk
  # Placeholder namespace for a guess plugin. Nothing is implemented here:
  # the module body contains only the commented-out templates below.
  module Guess

    # TODO: implement a guess plugin so that this command works:
    #   $ embulk guess -g "unpack" partial-config.yml
    #
    # NOTE(review): the templates still use the name "unpack", while the
    # parser in this gem is registered as "fixed" -- confirm the intended
    # name before enabling any of them.
    #
    # Pick the base class that matches the file format: binary guess
    # (GuessPlugin), text guess (TextGuessPlugin), or line guess
    # (LineGuessPlugin).

    #require "embulk/parser/unpack.rb"

    # Template 1: guess from the raw sample bytes.
    #class Unpack < GuessPlugin
    #  Plugin.register_guess("unpack", self)
    #
    #  def guess(config, sample_buffer)
    #    if sample_buffer[0,2] == GZIP_HEADER
    #      guessed = {}
    #      guessed["type"] = "unpack"
    #      guessed["property1"] = "guessed-value"
    #      return {"parser" => guessed}
    #    else
    #      return {}
    #    end
    #  end
    #end

    # Template 2: guess from the decoded sample text.
    #class Unpack < TextGuessPlugin
    #  Plugin.register_guess("unpack", self)
    #
    #  def guess_text(config, sample_text)
    #    js = JSON.parse(sample_text) rescue nil
    #    if js && js["mykeyword"] == "keyword"
    #      guessed = {}
    #      guessed["type"] = "unpack"
    #      guessed["property1"] = "guessed-value"
    #      return {"parser" => guessed}
    #    else
    #      return {}
    #    end
    #  end
    #end

    # Template 3: guess from the individual sample lines.
    #class Unpack < LineGuessPlugin
    #  Plugin.register_guess("unpack", self)
    #
    #  def guess_lines(config, sample_lines)
    #    all_line_matched = sample_lines.all? do |line|
    #      line =~ /mypattern/
    #    end
    #    if all_line_matched
    #      guessed = {}
    #      guessed["type"] = "unpack"
    #      guessed["property1"] = "guessed-value"
    #      return {"parser" => guessed}
    #    else
    #      return {}
    #    end
    #  end
    #end

  end
end
@@ -0,0 +1,58 @@
1
+ module Embulk
2
+ module Parser
3
+
4
+ class Fixed < ParserPlugin
5
+ Plugin.register_parser("fixed", self)
6
+
7
+ def self.transaction(config, &control)
8
+ decoder_task = config.load_config(Java::LineDecoder::DecoderTask)
9
+
10
+ # configuration code:
11
+ task = {
12
+ "decoder" => DataSource.from_java(decoder_task.dump),
13
+ "schema" => config.param("columns", :array, default: []),
14
+ "strip_whitespace" => config.param("strip_whitespace", :bool, default: true)
15
+ }
16
+
17
+ columns = []
18
+ task["schema"].each do |column|
19
+ name = column["name"]
20
+ type = column["type"].to_sym
21
+ columns << Column.new(nil, name, type)
22
+ end
23
+
24
+ yield(task, columns)
25
+ end
26
+
27
+ def init
28
+ @decoder = task.param("decoder", :hash).load_task(Java::LineDecoder::DecoderTask)
29
+ @schema = @task["schema"]
30
+ @strip_whitespace = task["strip_whitespace"]
31
+ end
32
+
33
+
34
+ def run(file_input)
35
+ decoder = Java::LineDecoder.new(file_input.to_java, @decoder)
36
+ while decoder.nextFile
37
+ while line = decoder.poll
38
+ process_line(line)
39
+ end
40
+ end
41
+
42
+ page_builder.finish
43
+ end
44
+
45
+ private
46
+
47
+ def process_line(line)
48
+ values = @schema.map do |column|
49
+ range = Range.new(*column["pos"].split("..").map(&:to_i)) if column["pos"]
50
+ val = line.slice(range)
51
+ val.strip if @strip_whitespace
52
+ end
53
+ page_builder.add(values)
54
+ end
55
+
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,45 @@
# Test helpers that temporarily redirect output written through both the Ruby
# global streams ($stdout / $stderr) and the JVM streams (System.out /
# System.err).
module CaptureIo
  # Runs the block and returns the text written to the selected stream
  # (:out or :err) while it ran.
  def capture(output = :out, &block)
    swap_io(output, &block).last
  end

  # Runs the block with both :out and :err redirected and returns the block's
  # own result.
  def silence(&block)
    result = nil
    swap_io(:out) do
      result = swap_io(:err, &block).first
    end
    result
  end

  # Redirects the selected stream into in-memory buffers for the duration of
  # the block, restoring the previous streams afterwards (also on error).
  #
  # Returns a two-element array: the block's result and the captured text
  # (Ruby-side output followed by Java-side output).
  def swap_io(output = :out, &block)
    java_import 'java.io.PrintStream'
    java_import 'java.io.ByteArrayOutputStream'
    java_import 'java.lang.System'

    # Anything other than :out falls back to $stderr, as in the ternary form.
    ruby_original_stream = (output == :out ? $stdout : $stderr).dup
    java_original_stream = System.send(output) # :out or :err
    ruby_buf = StringIO.new
    java_buf = ByteArrayOutputStream.new

    if output == :out
      $stdout = ruby_buf
      System.setOut(PrintStream.new(java_buf))
    elsif output == :err
      $stderr = ruby_buf
      System.setErr(PrintStream.new(java_buf))
    end

    block_result = block.call
    [block_result, ruby_buf.string + java_buf.toString]
  ensure
    if output == :out
      $stdout = ruby_original_stream
      System.setOut(java_original_stream)
    elsif output == :err
      $stderr = ruby_original_stream
      System.setErr(java_original_stream)
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require "prepare_embulk"
2
+ require "embulk/parser/fixed"
3
+ require "embulk/data_source"
4
+
5
+
module Embulk
  module Parser
    # Unit tests for the fixed-width parser plugin.
    class FixedTest < Test::Unit::TestCase

      # Exercises Fixed#process_line directly against a mocked page builder.
      class TestProcessLine < self

        # One line is cut into one stripped value per configured column.
        def test_foo
          expected_record = ["eka", "Toka", "Kolmas", "Neljas"]
          mock(page_builder).add(expected_record)

          plugin.send(:process_line, "ekaTokaKolmas Neljas")
        end

        # Memoized plugin instance built from the task, schema and page
        # builder defined below.
        def plugin
          @plugin ||= Fixed.new(DataSource[task], schema, page_builder)
        end

        # Memoized bare object that stands in for the real page builder.
        def page_builder
          @page_builder ||= Object.new
        end

        # Task settings in the shape produced by Fixed.transaction.
        def task
          decoder_options = {"Charset" => "UTF-8", "Newline" => "CRLF"}
          {
            "decoder" => decoder_options,
            "schema" => columns,
            "strip_whitespace" => true,
          }
        end

        # Column declarations: name plus the inclusive character range.
        def columns
          layout = [
            ["foo", "0..2"],
            ["bar", "3..6"],
            ["baz", "7..12"],
            ["qux", "14..19"],
          ]
          layout.map do |name, pos|
            {"name" => name, "type" => :string, "pos" => pos}
          end
        end

        # Column objects matching the declarations above.
        def schema
          columns.map { |spec| Column.new(nil, spec["name"], spec["type"].to_sym) }
        end

      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require "capture_io"
2
+
# Test helpers for invoking Embulk commands from within the test process.
module EmbulkRunHelper
  include CaptureIo

  # Runs `embulk guess` on seed_path, writing the result to dest_path, with
  # output silenced.
  #
  # NOTE(review): the guess plugin name passed here is "query_string", not a
  # name related to this parser -- confirm whether that is intended.
  def embulk_guess(seed_path, dest_path)
    silence do
      embulk_exec(["guess", "-g", "query_string", seed_path.to_s, "-o", dest_path.to_s])
    end
  end

  # Runs `embulk run` with the given configuration file.
  def embulk_run(yaml_path)
    embulk_exec(["run", yaml_path.to_s])
  end

  # Passes the given command-line arguments to Embulk.run.
  def embulk_exec(cli_options = [])
    Embulk.run(cli_options)
  end
end
@@ -0,0 +1,5 @@
1
+ require "embulk"
2
+
3
+ Embulk.setup
4
+
5
+ require "embulk/command/embulk_run"
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby

# Entry point for the test suite: puts lib/ and test/ on the load path and
# runs every test found under test/.
project_root = File.expand_path("..", File.dirname(__FILE__))
lib_dir = File.join(project_root, "lib")
test_dir = File.join(project_root, "test")

require "test-unit"
require "test/unit/rr"

# Prepend both directories; test/ ends up ahead of lib/.
$LOAD_PATH.unshift(test_dir, lib_dir)

# Use 5000 for this test-unit setting unless the caller already set it.
ENV["TEST_UNIT_MAX_DIFF_TARGET_STRING_SIZE"] ||= "5000"

runner_args = ARGV + ["--collector=dir"]
exit Test::Unit::AutoRunner.run(true, test_dir, runner_args)
metadata ADDED
@@ -0,0 +1,135 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-fixed
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Karri Niemelä
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-12-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.8.9
19
+ name: embulk
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.8.9
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.10.6
33
+ name: bundler
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.6
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '10.0'
47
+ name: rake
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ name: test-unit
62
+ prerelease: false
63
+ type: :development
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ name: test-unit-rr
76
+ prerelease: false
77
+ type: :development
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Parses fixed width files read by other file input plugins.
84
+ email:
85
+ - kakoni@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".ruby-version"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - embulk-parser-fixed.gemspec
98
+ - gemfiles/embulk-latest
99
+ - lib/embulk/guess/unpack.rb
100
+ - lib/embulk/parser/fixed.rb
101
+ - test/capture_io.rb
102
+ - test/embulk/parser/test_fixed.rb
103
+ - test/embulk_run_helper.rb
104
+ - test/prepare_embulk.rb
105
+ - test/run-test.rb
106
+ homepage: https://github.com/kakoni/embulk-parser-fixed
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project:
126
+ rubygems_version: 2.6.6
127
+ signing_key:
128
+ specification_version: 4
129
+ summary: Fixed width parser plugin for Embulk
130
+ test_files:
131
+ - test/capture_io.rb
132
+ - test/embulk/parser/test_fixed.rb
133
+ - test/embulk_run_helper.rb
134
+ - test/prepare_embulk.rb
135
+ - test/run-test.rb