embulk-parser-xml 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: da39918e4e87ef06f5a8a3f1c321062ef00f10b4
4
- data.tar.gz: e440ca6333be465a791abeb0b0998e02f7251440
3
+ metadata.gz: 31b1d69fab1c3bfcf1e676508a8f7e8c644db85e
4
+ data.tar.gz: 86d1162369def6860f2471c008b0842469db303a
5
5
  SHA512:
6
- metadata.gz: 010d16da3d32201b038fe0f219eae33ae19115bf2d465e904c30035b5d1b093197b8067b9dc11593c4e8ab6bfed8360ff640342e7626c508c74a9d14ce746be8
7
- data.tar.gz: 89467d469d2d7547b03c04102a8c4a1216318daad0a463d03b7ccb3749e013acfcfb632d89e42ec7654ddd1825d089ac28f48c0158d5a325a513942cd963f50e
6
+ metadata.gz: 2f81442fd1695ccb607a1f056a190432a3ed94b146a4650cbcf2d9d6ebe10e716a2068e41860ee18a275c36223b1bec51ce3bca4718700229079d452d069ae64
7
+ data.tar.gz: 6e8c4e50183802026bddab9eea6b298ff2f06af6ac8a773cbf0ae3afda50feaa095127163052c5aac648cd1364ef6d899f96208ef2b10f83674feba04c6c5422
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
 
5
5
  Gem::Specification.new do |spec|
6
6
  spec.name = "embulk-parser-xml"
7
- spec.version = "0.0.1"
7
+ spec.version = "0.0.2"
8
8
  spec.authors = ["Takuma kanari"]
9
9
  spec.email = ["chemtrails.t@gmail.com"]
10
10
  spec.summary = %q{Embulk parser plugin for XML}
@@ -16,6 +16,7 @@ Gem::Specification.new do |spec|
16
16
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
17
  spec.require_paths = ["lib"]
18
18
 
19
+ spec.add_dependency "nokogiri", "~> 1.6"
19
20
  spec.add_development_dependency "bundler", "~> 1.0"
20
21
  spec.add_development_dependency "rake", "~> 10.0"
21
22
  end
@@ -1,4 +1,4 @@
1
- require "rexml/document"
1
+ require "nokogiri"
2
2
 
3
3
  module Embulk
4
4
  module Parser
@@ -7,54 +7,111 @@ module Embulk
7
7
  Plugin.register_parser("xml", self)
8
8
 
9
9
  def self.transaction(config, &control)
10
+ schema = config.param("schema", :array)
11
+ schema_serialized = schema.inject({}) do |memo, s|
12
+ memo[s["name"]] = s["type"]
13
+ memo
14
+ end
10
15
  task = {
11
- :schema => config.param("schema", :array),
12
- :root => config.param("root", :string)
16
+ :schema => schema_serialized,
17
+ :root_to_route => config.param("root", :string).split("/")
13
18
  }
14
- columns = task[:schema].each_with_index.map do |c, i|
19
+ columns = schema.each_with_index.map do |c, i|
15
20
  Column.new(i, c["name"], c["type"].to_sym)
16
21
  end
17
22
  yield(task, columns)
18
23
  end
19
24
 
20
25
  def run(file_input)
21
- schema = @task["schema"]
22
- root = @task["root"]
26
+ on_new_record = lambda {|record|
27
+ @page_builder.add(record)
28
+ }
29
+ doc = RecordBinder.new(@task["root_to_route"],
30
+ @task["schema"], on_new_record)
31
+ parser = Nokogiri::XML::SAX::Parser.new(doc)
23
32
  while file = file_input.next_file
24
- REXML::Document.new(file.read).elements.each(root) do |e|
25
- dest = {}
26
- e.elements.each do |d|
27
- dest[d.name] = d.text
28
- end
29
- @page_builder.add(make_record(schema, dest))
30
- end
33
+ parser.parse(file.read)
34
+ doc.clear
31
35
  end
32
36
  @page_builder.finish
33
37
  end
38
+ end
34
39
 
35
- private
40
+ class RecordBinder < Nokogiri::XML::SAX::Document
36
41
 
37
- def make_record(schema, e)
38
- schema.map do |c|
39
- name = c["name"]
40
- val = e[name]
41
-
42
- v = val.nil? ? "" : val
43
- type = c["type"]
44
- case type
45
- when "string"
46
- v
47
- when "long"
48
- v.to_i
49
- when "double"
50
- v.to_f
51
- when "boolean"
52
- ["yes", "true", "1"].include?(v.downcase)
53
- when "timestamp"
54
- v.empty? ? nil : Time.strptime(v, c["format"])
42
+ def initialize(route, schema, on_new_record)
43
+ @route = route
44
+ @schema = schema
45
+ @on_new_record = on_new_record
46
+ clear
47
+ super()
48
+ end
49
+
50
+ def clear
51
+ @find_route_idx = 0
52
+ @enter = false
53
+ @current_element_name = nil
54
+ @current_data = new_map_by_schema
55
+ end
56
+
57
+ def start_element(name, attributes = [])
58
+ if !@enter
59
+ if name == @route[@find_route_idx]
60
+ if @find_route_idx == @route.size - 1
61
+ @enter = true
55
62
  else
56
- raise "Unsupported type #{type}"
63
+ @find_route_idx += 1
64
+ end
57
65
  end
66
+ else
67
+ @current_element_name = (@schema[name].nil?) ? nil : name
68
+ end
69
+ end
70
+
71
+ def characters(string)
72
+ return if !@enter || string.strip.size == 0 || @current_element_name.nil?
73
+ val = @current_data[@current_element_name]
74
+ val = "" if val.nil?
75
+ val += string
76
+ @current_data[@current_element_name] = val
77
+ end
78
+
79
+ def end_element(name, attributes = [])
80
+ if @enter
81
+ if name == @route.last
82
+ @enter = false
83
+ @on_new_record.call(@current_data.map{|k, v| v})
84
+ @current_data = new_map_by_schema
85
+ elsif !@current_element_name.nil?
86
+ @current_data[name] = convert(@current_data[name], @schema[name])
87
+ end
88
+ end
89
+ end
90
+
91
+ private
92
+
93
+ def new_map_by_schema
94
+ @schema.keys.inject({}) do |memo, k|
95
+ memo[k] = nil
96
+ memo
97
+ end
98
+ end
99
+
100
+ def convert(val, type)
101
+ v = val.nil? ? "" : val
102
+ case type
103
+ when "string"
104
+ v
105
+ when "long"
106
+ v.to_i
107
+ when "double"
108
+ v.to_f
109
+ when "boolean"
110
+ ["yes", "true", "1"].include?(v.downcase)
111
+ when "timestamp"
112
+ v.empty? ? nil : Time.strptime(v, c["format"])
113
+ else
114
+ raise "Unsupported type #{type}"
58
115
  end
59
116
  end
60
117
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takuma kanari
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-14 00:00:00.000000000 Z
11
+ date: 2015-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement