embulk-parser-xml 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: da39918e4e87ef06f5a8a3f1c321062ef00f10b4
4
- data.tar.gz: e440ca6333be465a791abeb0b0998e02f7251440
3
+ metadata.gz: 31b1d69fab1c3bfcf1e676508a8f7e8c644db85e
4
+ data.tar.gz: 86d1162369def6860f2471c008b0842469db303a
5
5
  SHA512:
6
- metadata.gz: 010d16da3d32201b038fe0f219eae33ae19115bf2d465e904c30035b5d1b093197b8067b9dc11593c4e8ab6bfed8360ff640342e7626c508c74a9d14ce746be8
7
- data.tar.gz: 89467d469d2d7547b03c04102a8c4a1216318daad0a463d03b7ccb3749e013acfcfb632d89e42ec7654ddd1825d089ac28f48c0158d5a325a513942cd963f50e
6
+ metadata.gz: 2f81442fd1695ccb607a1f056a190432a3ed94b146a4650cbcf2d9d6ebe10e716a2068e41860ee18a275c36223b1bec51ce3bca4718700229079d452d069ae64
7
+ data.tar.gz: 6e8c4e50183802026bddab9eea6b298ff2f06af6ac8a773cbf0ae3afda50feaa095127163052c5aac648cd1364ef6d899f96208ef2b10f83674feba04c6c5422
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
 
5
5
  Gem::Specification.new do |spec|
6
6
  spec.name = "embulk-parser-xml"
7
- spec.version = "0.0.1"
7
+ spec.version = "0.0.2"
8
8
  spec.authors = ["Takuma kanari"]
9
9
  spec.email = ["chemtrails.t@gmail.com"]
10
10
  spec.summary = %q{Embulk parser plugin for XML}
@@ -16,6 +16,7 @@ Gem::Specification.new do |spec|
16
16
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
17
  spec.require_paths = ["lib"]
18
18
 
19
+ spec.add_dependency "nokogiri", "~> 1.6"
19
20
  spec.add_development_dependency "bundler", "~> 1.0"
20
21
  spec.add_development_dependency "rake", "~> 10.0"
21
22
  end
@@ -1,4 +1,4 @@
1
- require "rexml/document"
1
+ require "nokogiri"
2
2
 
3
3
  module Embulk
4
4
  module Parser
@@ -7,54 +7,111 @@ module Embulk
7
7
  Plugin.register_parser("xml", self)
8
8
 
9
9
  def self.transaction(config, &control)
10
+ schema = config.param("schema", :array)
11
+ schema_serialized = schema.inject({}) do |memo, s|
12
+ memo[s["name"]] = s["type"]
13
+ memo
14
+ end
10
15
  task = {
11
- :schema => config.param("schema", :array),
12
- :root => config.param("root", :string)
16
+ :schema => schema_serialized,
17
+ :root_to_route => config.param("root", :string).split("/")
13
18
  }
14
- columns = task[:schema].each_with_index.map do |c, i|
19
+ columns = schema.each_with_index.map do |c, i|
15
20
  Column.new(i, c["name"], c["type"].to_sym)
16
21
  end
17
22
  yield(task, columns)
18
23
  end
19
24
 
20
25
  def run(file_input)
21
- schema = @task["schema"]
22
- root = @task["root"]
26
+ on_new_record = lambda {|record|
27
+ @page_builder.add(record)
28
+ }
29
+ doc = RecordBinder.new(@task["root_to_route"],
30
+ @task["schema"], on_new_record)
31
+ parser = Nokogiri::XML::SAX::Parser.new(doc)
23
32
  while file = file_input.next_file
24
- REXML::Document.new(file.read).elements.each(root) do |e|
25
- dest = {}
26
- e.elements.each do |d|
27
- dest[d.name] = d.text
28
- end
29
- @page_builder.add(make_record(schema, dest))
30
- end
33
+ parser.parse(file.read)
34
+ doc.clear
31
35
  end
32
36
  @page_builder.finish
33
37
  end
38
+ end
34
39
 
35
- private
40
+ class RecordBinder < Nokogiri::XML::SAX::Document
36
41
 
37
- def make_record(schema, e)
38
- schema.map do |c|
39
- name = c["name"]
40
- val = e[name]
41
-
42
- v = val.nil? ? "" : val
43
- type = c["type"]
44
- case type
45
- when "string"
46
- v
47
- when "long"
48
- v.to_i
49
- when "double"
50
- v.to_f
51
- when "boolean"
52
- ["yes", "true", "1"].include?(v.downcase)
53
- when "timestamp"
54
- v.empty? ? nil : Time.strptime(v, c["format"])
42
+ def initialize(route, schema, on_new_record)
43
+ @route = route
44
+ @schema = schema
45
+ @on_new_record = on_new_record
46
+ clear
47
+ super()
48
+ end
49
+
50
+ def clear
51
+ @find_route_idx = 0
52
+ @enter = false
53
+ @current_element_name = nil
54
+ @current_data = new_map_by_schema
55
+ end
56
+
57
+ def start_element(name, attributes = [])
58
+ if !@enter
59
+ if name == @route[@find_route_idx]
60
+ if @find_route_idx == @route.size - 1
61
+ @enter = true
55
62
  else
56
- raise "Unsupported type #{type}"
63
+ @find_route_idx += 1
64
+ end
57
65
  end
66
+ else
67
+ @current_element_name = (@schema[name].nil?) ? nil : name
68
+ end
69
+ end
70
+
71
+ def characters(string)
72
+ return if !@enter || string.strip.size == 0 || @current_element_name.nil?
73
+ val = @current_data[@current_element_name]
74
+ val = "" if val.nil?
75
+ val += string
76
+ @current_data[@current_element_name] = val
77
+ end
78
+
79
+ def end_element(name, attributes = [])
80
+ if @enter
81
+ if name == @route.last
82
+ @enter = false
83
+ @on_new_record.call(@current_data.map{|k, v| v})
84
+ @current_data = new_map_by_schema
85
+ elsif !@current_element_name.nil?
86
+ @current_data[name] = convert(@current_data[name], @schema[name])
87
+ end
88
+ end
89
+ end
90
+
91
+ private
92
+
93
+ def new_map_by_schema
94
+ @schema.keys.inject({}) do |memo, k|
95
+ memo[k] = nil
96
+ memo
97
+ end
98
+ end
99
+
100
+ def convert(val, type)
101
+ v = val.nil? ? "" : val
102
+ case type
103
+ when "string"
104
+ v
105
+ when "long"
106
+ v.to_i
107
+ when "double"
108
+ v.to_f
109
+ when "boolean"
110
+ ["yes", "true", "1"].include?(v.downcase)
111
+ when "timestamp"
112
+ v.empty? ? nil : Time.strptime(v, c["format"])
113
+ else
114
+ raise "Unsupported type #{type}"
58
115
  end
59
116
  end
60
117
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takuma kanari
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-14 00:00:00.000000000 Z
11
+ date: 2015-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement