embulk-parser-xml 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ruby-version +1 -0
- data/README.md +39 -2
- data/embulk-parser-xml.gemspec +3 -2
- data/lib/embulk/parser/xml.rb +6 -11
- data/lib/embulk/parser/xpath.rb +60 -0
- metadata +42 -34
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5a19b485eaa8f3a9143e2d1b2eb8a07a6b7fe616
|
4
|
+
data.tar.gz: 859ef953a4d17ca9b24c8060c6950231887949db
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5c3ada6ad8d47dc867bf62c4bb3f60c4a690a84b69e9e41a048f87d0ba5dc0bd19f0ca92f90fdcc515c4d648821b983a01bc1e22c9aa7d0fde02017fccb556ed
|
7
|
+
data.tar.gz: 621bde05d19a7f3f58199ae681a891a39ec552d9b0a3a5b751f97983a711879e40312128d47b66ad49c0f5f4147722b157f759945534917186212ece394fddab
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
jruby-9.0.4.0
|
data/README.md
CHANGED
@@ -10,9 +10,15 @@ Read data from input as xml and fetch each entries to output.
|
|
10
10
|
* **Load all or nothing**: yes
|
11
11
|
* **Resume supported**: no
|
12
12
|
|
13
|
+
## Types
|
14
|
+
|
15
|
+
- **xml**: Find rows by SAX.
|
16
|
+
- **xpath**: Find rows by XPath, so you can process XML with more complex conditions than the *xml* type allows.
|
13
17
|
|
14
18
|
## Configuration
|
15
19
|
|
20
|
+
### XML
|
21
|
+
|
16
22
|
```yaml
|
17
23
|
parser:
|
18
24
|
type: xml
|
@@ -32,10 +38,41 @@ If you need to parse column as timestamp type, *schema* supports 2 optional para
|
|
32
38
|
schema:
|
33
39
|
- {name: timestamp_column, type: timestamp, format: "%Y-%m-%d", timezone: "+0000"}
|
34
40
|
```
|
41
|
+
|
42
|
+
- **format**: timestamp format to parse, required.
|
43
|
+
- **timezone**: the timestamp is parsed in this timezone, `"+0900"` is used by default.
|
44
|
+
|
45
|
+
|
46
|
+
### Xpath
|
47
|
+
|
48
|
+
```yaml
|
49
|
+
parser:
|
50
|
+
type: xpath
|
51
|
+
root: //data/students/student
|
52
|
+
schema:
|
53
|
+
- {path: name, type: string, name: name}
|
54
|
+
- {path: age, type: long, name: age}
|
55
|
+
```
|
56
|
+
|
57
|
+
- **type**: specify this plugin as `xpath`.
|
58
|
+
- **root**: root property from which to start fetching each entry, specified as an XPath expression; `/` is used by default.
|
59
|
+
- **schema**: specify the attribute of table and data type, required.
|
60
|
+
- **namespaces**: XML namespaces, given as a hash that is passed to the XPath lookup; empty (`{}`) by default.
|
61
|
+
|
62
|
+
|
63
|
+
If you need to parse column as timestamp type, *schema* supports 2 optional parameters:
|
64
|
+
|
65
|
+
```yaml
|
66
|
+
schema:
|
67
|
+
- {name: timestamp_column, type: timestamp, format: "%Y-%m-%d", timezone: "+0000"}
|
68
|
+
```
|
69
|
+
|
35
70
|
- **format**: timestamp format to parse, required.
|
36
71
|
- **timezone**: the timestamp is parsed in this timezone, `"+0900"` is used by default.
|
37
72
|
|
38
|
-
|
73
|
+
|
74
|
+
|
75
|
+
Here is an example XML document:
|
39
76
|
|
40
77
|
```xml
|
41
78
|
<data>
|
@@ -59,4 +96,4 @@ Then you can fetch entries from the following xml:
|
|
59
96
|
</student>
|
60
97
|
</students>
|
61
98
|
</data>
|
62
|
-
```
|
99
|
+
```
|
data/embulk-parser-xml.gemspec
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
6
|
spec.name = "embulk-parser-xml"
|
7
|
-
spec.version = "0.0.
|
7
|
+
spec.version = "0.0.7"
|
8
8
|
spec.authors = ["Takuma kanari"]
|
9
9
|
spec.email = ["chemtrails.t@gmail.com"]
|
10
10
|
spec.summary = %q{Embulk parser plugin for XML}
|
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
|
|
16
16
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
17
|
spec.require_paths = ["lib"]
|
18
18
|
|
19
|
-
spec.add_dependency "nokogiri", "~> 1.
|
19
|
+
spec.add_dependency "nokogiri", "~> 1.4.0"
|
20
20
|
spec.add_development_dependency "bundler", "~> 1.0"
|
21
|
+
spec.add_development_dependency 'embulk', ['>= 0.8.8']
|
21
22
|
spec.add_development_dependency "rake", "~> 10.0"
|
22
23
|
end
|
data/lib/embulk/parser/xml.rb
CHANGED
@@ -20,7 +20,7 @@ module Embulk
|
|
20
20
|
end
|
21
21
|
task = {
|
22
22
|
:schema => schema_serialized,
|
23
|
-
:
|
23
|
+
:root => config.param("root", :string).split("/")
|
24
24
|
}
|
25
25
|
columns = schema.each_with_index.map do |c, i|
|
26
26
|
Column.new(i, c["name"], c["type"].to_sym)
|
@@ -29,15 +29,11 @@ module Embulk
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def run(file_input)
|
32
|
-
|
33
|
-
@page_builder.add(record)
|
34
|
-
}
|
35
|
-
doc = RecordBinder.new(@task["root_to_route"],
|
36
|
-
@task["schema"], on_new_record)
|
32
|
+
doc = RecordBinder.new(@task["root"], @task["schema"], @page_builder)
|
37
33
|
parser = Nokogiri::XML::SAX::Parser.new(doc)
|
38
34
|
while file = file_input.next_file
|
39
35
|
data = file.read
|
40
|
-
|
36
|
+
unless data.nil?
|
41
37
|
doc.clear
|
42
38
|
parser.parse(data)
|
43
39
|
end
|
@@ -48,10 +44,10 @@ module Embulk
|
|
48
44
|
|
49
45
|
class RecordBinder < Nokogiri::XML::SAX::Document
|
50
46
|
|
51
|
-
def initialize(route, schema,
|
47
|
+
def initialize(route, schema, page_builder)
|
52
48
|
@route = route
|
53
49
|
@schema = schema
|
54
|
-
@
|
50
|
+
@page_builder = page_builder
|
55
51
|
clear
|
56
52
|
super()
|
57
53
|
end
|
@@ -89,7 +85,7 @@ module Embulk
|
|
89
85
|
if @enter
|
90
86
|
if name == @route.last
|
91
87
|
@enter = false
|
92
|
-
@
|
88
|
+
@page_builder.add(@current_data.map{|k, v| v})
|
93
89
|
@current_data = new_map_by_schema
|
94
90
|
elsif !@current_element_name.nil? && @schema.key?(name)
|
95
91
|
@current_data[name] = convert(@current_data[name], @schema[name])
|
@@ -131,6 +127,5 @@ module Embulk
|
|
131
127
|
end
|
132
128
|
end
|
133
129
|
end
|
134
|
-
|
135
130
|
end
|
136
131
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module Embulk
|
4
|
+
module Parser
|
5
|
+
|
6
|
+
class XPathParserPlugin < ParserPlugin
|
7
|
+
Plugin.register_parser("xpath", self)
|
8
|
+
|
9
|
+
def self.transaction(config, &control)
|
10
|
+
schema = config.param("schema", :array)
|
11
|
+
task = {
|
12
|
+
:schema => schema,
|
13
|
+
:root => config.param("root", :string, default: "/"),
|
14
|
+
:namespaces => config.param("namespaces", :hash, default: {})
|
15
|
+
}
|
16
|
+
columns = schema.each_with_index.map do |c, i|
|
17
|
+
path, name = c["path"], c["name"]
|
18
|
+
Column.new(i, name.nil? ? path : name, c["type"].to_sym)
|
19
|
+
end
|
20
|
+
yield(task, columns)
|
21
|
+
end
|
22
|
+
|
23
|
+
def run(file_input)
|
24
|
+
while file = file_input.next_file
|
25
|
+
data = file.read
|
26
|
+
if !data.nil? && !data.empty?
|
27
|
+
Nokogiri::XML(data).xpath(@task["root"], @task["namespaces"]).each do |item|
|
28
|
+
dest = @task["schema"].inject([]) do |memo, s|
|
29
|
+
es = item.xpath(s["path"], @namespaces)
|
30
|
+
memo << convert(es.empty? ? nil : es.text, s["type"])
|
31
|
+
memo
|
32
|
+
end
|
33
|
+
@page_builder.add(dest)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
@page_builder.finish
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
def convert(val, type)
|
42
|
+
v = val.nil? ? "" : val
|
43
|
+
case type
|
44
|
+
when "string"
|
45
|
+
v
|
46
|
+
when "long"
|
47
|
+
v.to_i
|
48
|
+
when "double"
|
49
|
+
v.to_f
|
50
|
+
when "boolean"
|
51
|
+
["yes", "true", "1"].include?(v.downcase)
|
52
|
+
when "timestamp"
|
53
|
+
v.empty? ? nil : Time.strptime(v, c["format"])
|
54
|
+
else
|
55
|
+
raise "Unsupported type '#{type}'"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
metadata
CHANGED
@@ -1,64 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-xml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.7
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Takuma kanari
|
9
|
-
autorequire:
|
8
|
+
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2016-
|
11
|
+
date: 2016-12-15 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: nokogiri
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - ~>
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
22
|
-
type: :runtime
|
23
|
-
prerelease: false
|
19
|
+
version: 1.4.0
|
24
20
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
21
|
requirements:
|
27
|
-
- - ~>
|
22
|
+
- - "~>"
|
28
23
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
24
|
+
version: 1.4.0
|
25
|
+
prerelease: false
|
26
|
+
type: :runtime
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: bundler
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- - ~>
|
31
|
+
- - "~>"
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '1.0'
|
38
|
-
type: :development
|
39
|
-
prerelease: false
|
40
34
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
35
|
requirements:
|
43
|
-
- - ~>
|
36
|
+
- - "~>"
|
44
37
|
- !ruby/object:Gem::Version
|
45
38
|
version: '1.0'
|
39
|
+
prerelease: false
|
40
|
+
type: :development
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: embulk
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.8.8
|
48
|
+
version_requirements: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 0.8.8
|
53
|
+
prerelease: false
|
54
|
+
type: :development
|
46
55
|
- !ruby/object:Gem::Dependency
|
47
56
|
name: rake
|
48
57
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
58
|
requirements:
|
51
|
-
- - ~>
|
59
|
+
- - "~>"
|
52
60
|
- !ruby/object:Gem::Version
|
53
61
|
version: '10.0'
|
54
|
-
type: :development
|
55
|
-
prerelease: false
|
56
62
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
63
|
requirements:
|
59
|
-
- - ~>
|
64
|
+
- - "~>"
|
60
65
|
- !ruby/object:Gem::Version
|
61
66
|
version: '10.0'
|
67
|
+
prerelease: false
|
68
|
+
type: :development
|
62
69
|
description: XML parser plugin is Embulk plugin to fetch entries in xml format.
|
63
70
|
email:
|
64
71
|
- chemtrails.t@gmail.com
|
@@ -66,36 +73,37 @@ executables: []
|
|
66
73
|
extensions: []
|
67
74
|
extra_rdoc_files: []
|
68
75
|
files:
|
69
|
-
- .gitignore
|
76
|
+
- ".gitignore"
|
77
|
+
- ".ruby-version"
|
70
78
|
- Gemfile
|
71
79
|
- LICENSE.txt
|
72
80
|
- README.md
|
73
81
|
- Rakefile
|
74
82
|
- embulk-parser-xml.gemspec
|
75
83
|
- lib/embulk/parser/xml.rb
|
84
|
+
- lib/embulk/parser/xpath.rb
|
76
85
|
homepage: https://github.com/takumakanari/embulk-parser-xml
|
77
86
|
licenses:
|
78
87
|
- MIT
|
79
|
-
|
88
|
+
metadata: {}
|
89
|
+
post_install_message:
|
80
90
|
rdoc_options: []
|
81
91
|
require_paths:
|
82
92
|
- lib
|
83
93
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
-
none: false
|
85
94
|
requirements:
|
86
|
-
- -
|
95
|
+
- - ">="
|
87
96
|
- !ruby/object:Gem::Version
|
88
97
|
version: '0'
|
89
98
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
-
none: false
|
91
99
|
requirements:
|
92
|
-
- -
|
100
|
+
- - ">="
|
93
101
|
- !ruby/object:Gem::Version
|
94
102
|
version: '0'
|
95
103
|
requirements: []
|
96
|
-
rubyforge_project:
|
97
|
-
rubygems_version:
|
98
|
-
signing_key:
|
99
|
-
specification_version:
|
104
|
+
rubyforge_project:
|
105
|
+
rubygems_version: 2.6.6
|
106
|
+
signing_key:
|
107
|
+
specification_version: 4
|
100
108
|
summary: Embulk parser plugin for XML
|
101
109
|
test_files: []
|