embulk-parser-xml 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5a19b485eaa8f3a9143e2d1b2eb8a07a6b7fe616
4
+ data.tar.gz: 859ef953a4d17ca9b24c8060c6950231887949db
5
+ SHA512:
6
+ metadata.gz: 5c3ada6ad8d47dc867bf62c4bb3f60c4a690a84b69e9e41a048f87d0ba5dc0bd19f0ca92f90fdcc515c4d648821b983a01bc1e22c9aa7d0fde02017fccb556ed
7
+ data.tar.gz: 621bde05d19a7f3f58199ae681a891a39ec552d9b0a3a5b751f97983a711879e40312128d47b66ad49c0f5f4147722b157f759945534917186212ece394fddab
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.0.4.0
data/README.md CHANGED
@@ -10,9 +10,15 @@ Read data from input as xml and fetch each entries to output.
10
10
  * **Load all or nothing**: yes
11
11
  * **Resume supported**: no
12
12
 
13
+ ## Types
14
+
15
+ - **xml**: Find rows by SAX.
16
+ - **xpath**: Finds rows by XPath, so you can process XML with more complex conditions than the *xml* type allows.
13
17
 
14
18
  ## Configuration
15
19
 
20
+ ### XML
21
+
16
22
  ```yaml
17
23
  parser:
18
24
  type: xml
@@ -32,10 +38,41 @@ If you need to parse column as timestamp type, *schema* supports 2 optional para
32
38
  schema:
33
39
  - {name: timestamp_column, type: timestamp, format: "%Y-%m-%d", timezone: "+0000"}
34
40
  ```
41
+
42
+ - **format**: timestamp format to parse, required.
43
+ - **timezone**: the timestamp is parsed in this timezone; `"+0900"` is used by default.
44
+
45
+
46
+ ### Xpath
47
+
48
+ ```yaml
49
+ parser:
50
+ type: xpath
51
+ root: //data/students/student
52
+ schema:
53
+ - {path: name, type: string, name: name}
54
+ - {path: age, type: long, name: age}
55
+ ```
56
+
57
+ - **type**: specify this plugin as `xpath`.
58
+ - **root**: root node from which each entry is fetched, specified as an XPath expression; `/` is used by default.
59
+ - **schema**: specify the attribute of table and data type, required.
60
+ - **namespaces**: XML namespaces passed to the XPath evaluation; empty by default.
61
+
62
+
63
+ If you need to parse column as timestamp type, *schema* supports 2 optional parameters:
64
+
65
+ ```yaml
66
+ schema:
67
+ - {name: timestamp_column, type: timestamp, format: "%Y-%m-%d", timezone: "+0000"}
68
+ ```
69
+
35
70
  - **format**: timestamp format to parse, required.
36
71
  - **timezone**: the timestamp is parsed in this timezone; `"+0900"` is used by default.
37
72
 
38
- Then you can fetch entries from the following xml:
73
+
74
+
75
+ Here is an example XML document:
39
76
 
40
77
  ```xml
41
78
  <data>
@@ -59,4 +96,4 @@ Then you can fetch entries from the following xml:
59
96
  </student>
60
97
  </students>
61
98
  </data>
62
- ```
99
+ ```
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
 
5
5
  Gem::Specification.new do |spec|
6
6
  spec.name = "embulk-parser-xml"
7
- spec.version = "0.0.5"
7
+ spec.version = "0.0.7"
8
8
  spec.authors = ["Takuma kanari"]
9
9
  spec.email = ["chemtrails.t@gmail.com"]
10
10
  spec.summary = %q{Embulk parser plugin for XML}
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
16
16
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
17
  spec.require_paths = ["lib"]
18
18
 
19
- spec.add_dependency "nokogiri", "~> 1.6"
19
+ spec.add_dependency "nokogiri", "~> 1.4.0"
20
20
  spec.add_development_dependency "bundler", "~> 1.0"
21
+ spec.add_development_dependency 'embulk', ['>= 0.8.8']
21
22
  spec.add_development_dependency "rake", "~> 10.0"
22
23
  end
@@ -20,7 +20,7 @@ module Embulk
20
20
  end
21
21
  task = {
22
22
  :schema => schema_serialized,
23
- :root_to_route => config.param("root", :string).split("/")
23
+ :root => config.param("root", :string).split("/")
24
24
  }
25
25
  columns = schema.each_with_index.map do |c, i|
26
26
  Column.new(i, c["name"], c["type"].to_sym)
@@ -29,15 +29,11 @@ module Embulk
29
29
  end
30
30
 
31
31
  def run(file_input)
32
- on_new_record = lambda {|record|
33
- @page_builder.add(record)
34
- }
35
- doc = RecordBinder.new(@task["root_to_route"],
36
- @task["schema"], on_new_record)
32
+ doc = RecordBinder.new(@task["root"], @task["schema"], @page_builder)
37
33
  parser = Nokogiri::XML::SAX::Parser.new(doc)
38
34
  while file = file_input.next_file
39
35
  data = file.read
40
- if !data.nil? && !data.empty?
36
+ unless data.nil?
41
37
  doc.clear
42
38
  parser.parse(data)
43
39
  end
@@ -48,10 +44,10 @@ module Embulk
48
44
 
49
45
  class RecordBinder < Nokogiri::XML::SAX::Document
50
46
 
51
- def initialize(route, schema, on_new_record)
47
+ def initialize(route, schema, page_builder)
52
48
  @route = route
53
49
  @schema = schema
54
- @on_new_record = on_new_record
50
+ @page_builder = page_builder
55
51
  clear
56
52
  super()
57
53
  end
@@ -89,7 +85,7 @@ module Embulk
89
85
  if @enter
90
86
  if name == @route.last
91
87
  @enter = false
92
- @on_new_record.call(@current_data.map{|k, v| v})
88
+ @page_builder.add(@current_data.map{|k, v| v})
93
89
  @current_data = new_map_by_schema
94
90
  elsif !@current_element_name.nil? && @schema.key?(name)
95
91
  @current_data[name] = convert(@current_data[name], @schema[name])
@@ -131,6 +127,5 @@ module Embulk
131
127
  end
132
128
  end
133
129
  end
134
-
135
130
  end
136
131
  end
@@ -0,0 +1,60 @@
1
+ require "nokogiri"
2
+
3
+ module Embulk
4
+ module Parser
5
+
6
+ class XPathParserPlugin < ParserPlugin
7
+ Plugin.register_parser("xpath", self)
8
+
9
+ def self.transaction(config, &control)
10
+ schema = config.param("schema", :array)
11
+ task = {
12
+ :schema => schema,
13
+ :root => config.param("root", :string, default: "/"),
14
+ :namespaces => config.param("namespaces", :hash, default: {})
15
+ }
16
+ columns = schema.each_with_index.map do |c, i|
17
+ path, name = c["path"], c["name"]
18
+ Column.new(i, name.nil? ? path : name, c["type"].to_sym)
19
+ end
20
+ yield(task, columns)
21
+ end
22
+
23
+ def run(file_input)
24
+ while file = file_input.next_file
25
+ data = file.read
26
+ if !data.nil? && !data.empty?
27
+ Nokogiri::XML(data).xpath(@task["root"], @task["namespaces"]).each do |item|
28
+ dest = @task["schema"].inject([]) do |memo, s|
29
+ es = item.xpath(s["path"], @namespaces)
30
+ memo << convert(es.empty? ? nil : es.text, s["type"])
31
+ memo
32
+ end
33
+ @page_builder.add(dest)
34
+ end
35
+ end
36
+ end
37
+ @page_builder.finish
38
+ end
39
+
40
+ private
41
+ def convert(val, type)
42
+ v = val.nil? ? "" : val
43
+ case type
44
+ when "string"
45
+ v
46
+ when "long"
47
+ v.to_i
48
+ when "double"
49
+ v.to_f
50
+ when "boolean"
51
+ ["yes", "true", "1"].include?(v.downcase)
52
+ when "timestamp"
53
+ v.empty? ? nil : Time.strptime(v, c["format"])
54
+ else
55
+ raise "Unsupported type '#{type}'"
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
metadata CHANGED
@@ -1,64 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
5
- prerelease:
4
+ version: 0.0.7
6
5
  platform: ruby
7
6
  authors:
8
7
  - Takuma kanari
9
- autorequire:
8
+ autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2016-02-27 00:00:00.000000000 Z
11
+ date: 2016-12-15 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: nokogiri
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: '1.6'
22
- type: :runtime
23
- prerelease: false
19
+ version: 1.4.0
24
20
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
21
  requirements:
27
- - - ~>
22
+ - - "~>"
28
23
  - !ruby/object:Gem::Version
29
- version: '1.6'
24
+ version: 1.4.0
25
+ prerelease: false
26
+ type: :runtime
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: bundler
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ~>
31
+ - - "~>"
36
32
  - !ruby/object:Gem::Version
37
33
  version: '1.0'
38
- type: :development
39
- prerelease: false
40
34
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
35
  requirements:
43
- - - ~>
36
+ - - "~>"
44
37
  - !ruby/object:Gem::Version
45
38
  version: '1.0'
39
+ prerelease: false
40
+ type: :development
41
+ - !ruby/object:Gem::Dependency
42
+ name: embulk
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 0.8.8
48
+ version_requirements: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.8.8
53
+ prerelease: false
54
+ type: :development
46
55
  - !ruby/object:Gem::Dependency
47
56
  name: rake
48
57
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
58
  requirements:
51
- - - ~>
59
+ - - "~>"
52
60
  - !ruby/object:Gem::Version
53
61
  version: '10.0'
54
- type: :development
55
- prerelease: false
56
62
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
63
  requirements:
59
- - - ~>
64
+ - - "~>"
60
65
  - !ruby/object:Gem::Version
61
66
  version: '10.0'
67
+ prerelease: false
68
+ type: :development
62
69
  description: XML parser plugin is Embulk plugin to fetch entries in xml format.
63
70
  email:
64
71
  - chemtrails.t@gmail.com
@@ -66,36 +73,37 @@ executables: []
66
73
  extensions: []
67
74
  extra_rdoc_files: []
68
75
  files:
69
- - .gitignore
76
+ - ".gitignore"
77
+ - ".ruby-version"
70
78
  - Gemfile
71
79
  - LICENSE.txt
72
80
  - README.md
73
81
  - Rakefile
74
82
  - embulk-parser-xml.gemspec
75
83
  - lib/embulk/parser/xml.rb
84
+ - lib/embulk/parser/xpath.rb
76
85
  homepage: https://github.com/takumakanari/embulk-parser-xml
77
86
  licenses:
78
87
  - MIT
79
- post_install_message:
88
+ metadata: {}
89
+ post_install_message:
80
90
  rdoc_options: []
81
91
  require_paths:
82
92
  - lib
83
93
  required_ruby_version: !ruby/object:Gem::Requirement
84
- none: false
85
94
  requirements:
86
- - - ! '>='
95
+ - - ">="
87
96
  - !ruby/object:Gem::Version
88
97
  version: '0'
89
98
  required_rubygems_version: !ruby/object:Gem::Requirement
90
- none: false
91
99
  requirements:
92
- - - ! '>='
100
+ - - ">="
93
101
  - !ruby/object:Gem::Version
94
102
  version: '0'
95
103
  requirements: []
96
- rubyforge_project:
97
- rubygems_version: 1.8.23
98
- signing_key:
99
- specification_version: 3
104
+ rubyforge_project:
105
+ rubygems_version: 2.6.6
106
+ signing_key:
107
+ specification_version: 4
100
108
  summary: Embulk parser plugin for XML
101
109
  test_files: []