embulk-parser-xml 0.0.5 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5a19b485eaa8f3a9143e2d1b2eb8a07a6b7fe616
4
+ data.tar.gz: 859ef953a4d17ca9b24c8060c6950231887949db
5
+ SHA512:
6
+ metadata.gz: 5c3ada6ad8d47dc867bf62c4bb3f60c4a690a84b69e9e41a048f87d0ba5dc0bd19f0ca92f90fdcc515c4d648821b983a01bc1e22c9aa7d0fde02017fccb556ed
7
+ data.tar.gz: 621bde05d19a7f3f58199ae681a891a39ec552d9b0a3a5b751f97983a711879e40312128d47b66ad49c0f5f4147722b157f759945534917186212ece394fddab
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.0.4.0
data/README.md CHANGED
@@ -10,9 +10,15 @@ Read data from input as xml and fetch each entries to output.
10
10
  * **Load all or nothing**: yes
11
11
  * **Resume supported**: no
12
12
 
13
+ ## Types
14
+
15
+ - **xml**: Find rows by SAX.
16
+ - **xpath**: Finds rows by XPath, so you can process XML with more complex conditions than the *xml* type.
13
17
 
14
18
  ## Configuration
15
19
 
20
+ ### XML
21
+
16
22
  ```yaml
17
23
  parser:
18
24
  type: xml
@@ -32,10 +38,41 @@ If you need to parse column as timestamp type, *schema* supports 2 optional para
32
38
  schema:
33
39
  - {name: timestamp_column, type: timestamp, format: "%Y-%m-%d", timezone: "+0000"}
34
40
  ```
41
+
42
+ - **format**: timestamp format to parse, required.
43
+ - **timezone**: timestamp will be parsed in this timezone, `"+0900"` is used by default.
44
+
45
+
46
+ ### Xpath
47
+
48
+ ```yaml
49
+ parser:
50
+ type: xpath
51
+ root: //data/students/student
52
+ schema:
53
+ - {path: name, type: string, name: name}
54
+ - {path: age, type: long, name: age}
55
+ ```
56
+
57
+ - **type**: specify this plugin as `xpath` .
58
+ - **root**: root path from which to start fetching entries, specified as an XPath expression; *'/'* is used by default.
59
+ - **schema**: specify the attribute of table and data type, required.
60
+ - **namespaces**: xml namespaces
61
+
62
+
63
+ If you need to parse column as timestamp type, *schema* supports 2 optional parameters:
64
+
65
+ ```yaml
66
+ schema:
67
+ - {name: timestamp_column, type: timestamp, format: "%Y-%m-%d", timezone: "+0000"}
68
+ ```
69
+
35
70
  - **format**: timestamp format to parse, required.
36
71
  - **timezone**: timestamp will be parsed in this timezone, `"+0900"` is used by default.
37
72
 
38
- Then you can fetch entries from the following xml:
73
+
74
+
75
+ Here is an XML example:
39
76
 
40
77
  ```xml
41
78
  <data>
@@ -59,4 +96,4 @@ Then you can fetch entries from the following xml:
59
96
  </student>
60
97
  </students>
61
98
  </data>
62
- ```
99
+ ```
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
 
5
5
  Gem::Specification.new do |spec|
6
6
  spec.name = "embulk-parser-xml"
7
- spec.version = "0.0.5"
7
+ spec.version = "0.0.7"
8
8
  spec.authors = ["Takuma kanari"]
9
9
  spec.email = ["chemtrails.t@gmail.com"]
10
10
  spec.summary = %q{Embulk parser plugin for XML}
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
16
16
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
17
  spec.require_paths = ["lib"]
18
18
 
19
- spec.add_dependency "nokogiri", "~> 1.6"
19
+ spec.add_dependency "nokogiri", "~> 1.4.0"
20
20
  spec.add_development_dependency "bundler", "~> 1.0"
21
+ spec.add_development_dependency 'embulk', ['>= 0.8.8']
21
22
  spec.add_development_dependency "rake", "~> 10.0"
22
23
  end
@@ -20,7 +20,7 @@ module Embulk
20
20
  end
21
21
  task = {
22
22
  :schema => schema_serialized,
23
- :root_to_route => config.param("root", :string).split("/")
23
+ :root => config.param("root", :string).split("/")
24
24
  }
25
25
  columns = schema.each_with_index.map do |c, i|
26
26
  Column.new(i, c["name"], c["type"].to_sym)
@@ -29,15 +29,11 @@ module Embulk
29
29
  end
30
30
 
31
31
  def run(file_input)
32
- on_new_record = lambda {|record|
33
- @page_builder.add(record)
34
- }
35
- doc = RecordBinder.new(@task["root_to_route"],
36
- @task["schema"], on_new_record)
32
+ doc = RecordBinder.new(@task["root"], @task["schema"], @page_builder)
37
33
  parser = Nokogiri::XML::SAX::Parser.new(doc)
38
34
  while file = file_input.next_file
39
35
  data = file.read
40
- if !data.nil? && !data.empty?
36
+ unless data.nil?
41
37
  doc.clear
42
38
  parser.parse(data)
43
39
  end
@@ -48,10 +44,10 @@ module Embulk
48
44
 
49
45
  class RecordBinder < Nokogiri::XML::SAX::Document
50
46
 
51
- def initialize(route, schema, on_new_record)
47
+ def initialize(route, schema, page_builder)
52
48
  @route = route
53
49
  @schema = schema
54
- @on_new_record = on_new_record
50
+ @page_builder = page_builder
55
51
  clear
56
52
  super()
57
53
  end
@@ -89,7 +85,7 @@ module Embulk
89
85
  if @enter
90
86
  if name == @route.last
91
87
  @enter = false
92
- @on_new_record.call(@current_data.map{|k, v| v})
88
+ @page_builder.add(@current_data.map{|k, v| v})
93
89
  @current_data = new_map_by_schema
94
90
  elsif !@current_element_name.nil? && @schema.key?(name)
95
91
  @current_data[name] = convert(@current_data[name], @schema[name])
@@ -131,6 +127,5 @@ module Embulk
131
127
  end
132
128
  end
133
129
  end
134
-
135
130
  end
136
131
  end
@@ -0,0 +1,60 @@
1
+ require "nokogiri"
2
+
3
+ module Embulk
4
+ module Parser
5
+
6
+ class XPathParserPlugin < ParserPlugin
7
+ Plugin.register_parser("xpath", self)
8
+
9
+ def self.transaction(config, &control)
10
+ schema = config.param("schema", :array)
11
+ task = {
12
+ :schema => schema,
13
+ :root => config.param("root", :string, default: "/"),
14
+ :namespaces => config.param("namespaces", :hash, default: {})
15
+ }
16
+ columns = schema.each_with_index.map do |c, i|
17
+ path, name = c["path"], c["name"]
18
+ Column.new(i, name.nil? ? path : name, c["type"].to_sym)
19
+ end
20
+ yield(task, columns)
21
+ end
22
+
23
+ def run(file_input)
24
+ while file = file_input.next_file
25
+ data = file.read
26
+ if !data.nil? && !data.empty?
27
+ Nokogiri::XML(data).xpath(@task["root"], @task["namespaces"]).each do |item|
28
+ dest = @task["schema"].inject([]) do |memo, s|
29
+ es = item.xpath(s["path"], @namespaces)
30
+ memo << convert(es.empty? ? nil : es.text, s["type"])
31
+ memo
32
+ end
33
+ @page_builder.add(dest)
34
+ end
35
+ end
36
+ end
37
+ @page_builder.finish
38
+ end
39
+
40
+ private
41
+ def convert(val, type)
42
+ v = val.nil? ? "" : val
43
+ case type
44
+ when "string"
45
+ v
46
+ when "long"
47
+ v.to_i
48
+ when "double"
49
+ v.to_f
50
+ when "boolean"
51
+ ["yes", "true", "1"].include?(v.downcase)
52
+ when "timestamp"
53
+ v.empty? ? nil : Time.strptime(v, c["format"])
54
+ else
55
+ raise "Unsupported type '#{type}'"
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
metadata CHANGED
@@ -1,64 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-parser-xml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
5
- prerelease:
4
+ version: 0.0.7
6
5
  platform: ruby
7
6
  authors:
8
7
  - Takuma kanari
9
- autorequire:
8
+ autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2016-02-27 00:00:00.000000000 Z
11
+ date: 2016-12-15 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: nokogiri
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: '1.6'
22
- type: :runtime
23
- prerelease: false
19
+ version: 1.4.0
24
20
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
21
  requirements:
27
- - - ~>
22
+ - - "~>"
28
23
  - !ruby/object:Gem::Version
29
- version: '1.6'
24
+ version: 1.4.0
25
+ prerelease: false
26
+ type: :runtime
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: bundler
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ~>
31
+ - - "~>"
36
32
  - !ruby/object:Gem::Version
37
33
  version: '1.0'
38
- type: :development
39
- prerelease: false
40
34
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
35
  requirements:
43
- - - ~>
36
+ - - "~>"
44
37
  - !ruby/object:Gem::Version
45
38
  version: '1.0'
39
+ prerelease: false
40
+ type: :development
41
+ - !ruby/object:Gem::Dependency
42
+ name: embulk
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 0.8.8
48
+ version_requirements: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 0.8.8
53
+ prerelease: false
54
+ type: :development
46
55
  - !ruby/object:Gem::Dependency
47
56
  name: rake
48
57
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
58
  requirements:
51
- - - ~>
59
+ - - "~>"
52
60
  - !ruby/object:Gem::Version
53
61
  version: '10.0'
54
- type: :development
55
- prerelease: false
56
62
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
63
  requirements:
59
- - - ~>
64
+ - - "~>"
60
65
  - !ruby/object:Gem::Version
61
66
  version: '10.0'
67
+ prerelease: false
68
+ type: :development
62
69
  description: XML parser plugin is Embulk plugin to fetch entries in xml format.
63
70
  email:
64
71
  - chemtrails.t@gmail.com
@@ -66,36 +73,37 @@ executables: []
66
73
  extensions: []
67
74
  extra_rdoc_files: []
68
75
  files:
69
- - .gitignore
76
+ - ".gitignore"
77
+ - ".ruby-version"
70
78
  - Gemfile
71
79
  - LICENSE.txt
72
80
  - README.md
73
81
  - Rakefile
74
82
  - embulk-parser-xml.gemspec
75
83
  - lib/embulk/parser/xml.rb
84
+ - lib/embulk/parser/xpath.rb
76
85
  homepage: https://github.com/takumakanari/embulk-parser-xml
77
86
  licenses:
78
87
  - MIT
79
- post_install_message:
88
+ metadata: {}
89
+ post_install_message:
80
90
  rdoc_options: []
81
91
  require_paths:
82
92
  - lib
83
93
  required_ruby_version: !ruby/object:Gem::Requirement
84
- none: false
85
94
  requirements:
86
- - - ! '>='
95
+ - - ">="
87
96
  - !ruby/object:Gem::Version
88
97
  version: '0'
89
98
  required_rubygems_version: !ruby/object:Gem::Requirement
90
- none: false
91
99
  requirements:
92
- - - ! '>='
100
+ - - ">="
93
101
  - !ruby/object:Gem::Version
94
102
  version: '0'
95
103
  requirements: []
96
- rubyforge_project:
97
- rubygems_version: 1.8.23
98
- signing_key:
99
- specification_version: 3
104
+ rubyforge_project:
105
+ rubygems_version: 2.6.6
106
+ signing_key:
107
+ specification_version: 4
100
108
  summary: Embulk parser plugin for XML
101
109
  test_files: []