eatr 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +426 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/rspec +17 -0
- data/bin/setup +8 -0
- data/eatr.gemspec +30 -0
- data/lib/eatr.rb +11 -0
- data/lib/eatr/csv/document.rb +49 -0
- data/lib/eatr/dot_generator.rb +28 -0
- data/lib/eatr/dot_template.dot +35 -0
- data/lib/eatr/parse_value.rb +25 -0
- data/lib/eatr/pipeline.rb +11 -0
- data/lib/eatr/pipeline_spec.rb +13 -0
- data/lib/eatr/schema.rb +94 -0
- data/lib/eatr/sql/table_generator.rb +52 -0
- data/lib/eatr/transformation/add_date_id.rb +20 -0
- data/lib/eatr/transformation_set.rb +27 -0
- data/lib/eatr/version.rb +3 -0
- data/lib/eatr/xml/document.rb +87 -0
- data/lib/eatr/xml/schema_generator.rb +69 -0
- data/sample.dot +42 -0
- metadata +142 -0
data/lib/eatr/xml/document.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
require "forwardable"
require "yaml"
require "nokogiri"
|
3
|
+
|
4
|
+
module Eatr
  module Xml
    # Raised by Document#parse when a required field's XPath matches nothing.
    NodeNotFound = Class.new(StandardError)

    # Reads an XML file and turns it into a list of struct instances, one
    # attribute per schema field, as described by a YAML schema file.
    class Document
      include ParseValue
      extend Forwardable

      attr_reader :schema

      def_delegator :schema, :transformation_pipeline

      # @param schema_path [String] path to the YAML schema definition
      def initialize(schema_path)
        # NOTE(review): YAML.load may instantiate arbitrary objects on older
        # Psych versions. If schema files can come from untrusted sources,
        # switch to YAML.safe_load after confirming no schema relies on
        # symbols, aliases or custom tags.
        @schema = Schema.new(YAML.load(File.read(schema_path)))
      end

      # Parses the XML file and returns the populated objects.
      #
      # The number of objects is the product of the match counts of every
      # top-level node field (each count is treated as at least 1).
      #
      # @param xml_document_path [String] path to the XML file
      # @return [Array] instances of the schema's struct
      # @raise [NodeNotFound] when a required field cannot be located
      def parse(xml_document_path)
        @namespaces = {}

        doc = read_document(xml_document_path)

        if @schema.remove_namespaces?
          doc.remove_namespaces!
        else
          @namespaces = doc.collect_namespaces
        end

        objects = Array.new(cardinality(doc)) { @schema.to_struct.new }

        @schema.fields.each do |field|
          objects = set_field(objects, doc, field)
        end

        objects
      end

      private

      # Parses the file in strict, no-network mode. The block form of
      # File.open guarantees the handle is closed even if parsing raises.
      def read_document(xml_document_path)
        File.open(xml_document_path) do |io|
          Nokogiri::XML(io) { |config| config.strict.nonet }
        end
      end

      # Product of the number of matches of each node field, where a field
      # with no matches still contributes a factor of 1.
      def cardinality(doc)
        @schema.fields.select(&:node?).inject(1) do |product, field|
          product * [doc.xpath(field.xpath, @namespaces).count, 1].max
        end
      end

      # Assigns the value(s) for +field+ onto +objects+ and returns +objects+.
      #
      # For a node field, the n-th matching element populates the n-th object
      # through the field's children; for a named field, every object receives
      # the same value.
      def set_field(objects, doc, field)
        if field.node?
          doc.xpath(field.xpath, @namespaces).each_with_index do |child_xml, idx|
            # NOTE(review): on recursive calls +objects+ holds a single
            # element, so a nested node field with more than one match would
            # index past it (objects[idx] is nil) - confirm nested nodes are
            # meant to be unsupported.
            field.children.each do |child|
              set_field([objects[idx]], child_xml, child)
            end
          end
        elsif field.name
          objects.each do |o|
            o.public_send("#{field.name}=", value_at(doc, field))
          end
        end

        objects
      end

      # Resolves a single field value: a truthy +field.value+ wins, otherwise
      # the content of the first node matching +field.xpath+ is run through
      # parse_value. Returns nil when nothing applies and the field is not
      # required.
      def value_at(doc, field)
        constant = field.value
        return constant if constant
        return unless field.xpath

        node = doc.at_xpath(field.xpath, @namespaces)
        return parse_value(field, node.content) if node

        raise NodeNotFound, "Unable to find '#{field.name}' using xpath '#{field.xpath}'" if field.required?
      end
    end
  end
end
|
data/lib/eatr/xml/schema_generator.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
module Eatr
  module Xml
    # Produces a starter YAML schema by walking the element structure of a
    # sample XML file.
    class SchemaGenerator
      # Matches a positional predicate such as "[3]" in an element path.
      INDEX_PREDICATE = /\[\d+\]/
      private_constant :INDEX_PREDICATE

      # @param xml_path [String] path to the sample XML file
      def initialize(xml_path)
        @xml_path = xml_path
      end

      # Builds the schema for the element found at +starting_point+.
      #
      # Namespaces are stripped from the document before it is inspected, and
      # the generated schema sets 'remove_namespaces' to true accordingly.
      #
      # @param starting_point [String] XPath of the element whose children
      #   become the schema fields
      # @return [String] YAML document with 'name', 'remove_namespaces' and
      #   'fields' keys
      # @raise [ArgumentError] when +starting_point+ matches no element
      def schema(starting_point)
        # Block form so the file handle is closed once parsing is done.
        doc = File.open(@xml_path) do |io|
          Nokogiri::XML(io) { |config| config.strict.nonet }
        end

        doc.remove_namespaces!

        root = doc.at_xpath(starting_point)
        unless root
          raise ArgumentError, "No element found at xpath '#{starting_point}' in '#{@xml_path}'"
        end

        fields = root.element_children.flat_map { |child| field_def(child) }

        schema = {
          'name' => '',
          'remove_namespaces' => true,
          'fields' => fields
        }

        YAML.dump(schema)
      end

      private

      # Describes +child+ as schema field definition(s).
      #
      # * Several children that all share one name: a 'node' entry whose
      #   'children' are derived from the first repeated element.
      # * Any other element with children: the flattened definitions of those
      #   children (no entry for the element itself).
      # * An element without children: a single optional string field.
      #
      # Names are prefixed with the underscored name of the immediate parent
      # only; +name_prefix+ from higher levels is not carried further down.
      #
      # @param child [#name, #path, #element_children] element to describe
      # @param name_prefix [String] prefix prepended to the generated name
      # @param xpath_relative_to [Regexp, nil] when given, the matching part
      #   of the element path is replaced with "."
      # @return [Hash, Array<Hash>]
      def field_def(child, name_prefix: '', xpath_relative_to: nil)
        children = child.element_children

        if unique_children_count(child) == 1 && children.count > 1
          first = children.first
          relative_path = relative_path_pattern(first.path)

          # NOTE(review): with xpath_relative_to set, 'xpath' is derived from
          # the container's path, while the absolute form uses the repeated
          # element's path - confirm this asymmetry is intended.
          {
            'node' => name_prefix + underscore(child.name),
            'xpath' => xpath_relative_to ? child.path.gsub(xpath_relative_to, ".") : first.path.gsub(INDEX_PREDICATE, ""),
            'children' => first.element_children.flat_map do |c|
              field_def(c, name_prefix: "#{underscore(child.name)}_", xpath_relative_to: relative_path)
            end
          }
        elsif unique_children_count(child) >= 1
          children.flat_map do |c|
            field_def(c, name_prefix: "#{underscore(child.name)}_", xpath_relative_to: xpath_relative_to)
          end
        else
          {
            'name' => name_prefix + underscore(child.name),
            'xpath' => xpath_relative_to ? child.path.gsub(xpath_relative_to, ".") : child.path,
            'type' => 'string',
            'required' => false
          }
        end
      end

      # Builds a pattern matching +path+ at the start of a string with any
      # numeric index in place of each positional predicate. The literal parts
      # are escaped so characters such as "." in element names are not read as
      # regex metacharacters.
      def relative_path_pattern(path)
        literal_parts = path.split(INDEX_PREDICATE, -1).map { |part| Regexp.escape(part) }
        Regexp.new('\A' + literal_parts.join('\[\d+\]'))
      end

      # Number of distinct element names among the direct children.
      def unique_children_count(element)
        element.element_children.map(&:name).uniq.count
      end

      # Converts CamelCase / dashed names to snake_case ("::" becomes "/").
      def underscore(str)
        str.gsub(/::/, '/').
          gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
          gsub(/([a-z\d])([A-Z])/, '\1_\2').
          tr("-", "_").
          downcase
      end
    end
  end
end
|
data/sample.dot
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
strict digraph g {
|
2
|
+
ranksep="1.6"
|
3
|
+
graph [
|
4
|
+
rankdir = "LR"
|
5
|
+
];
|
6
|
+
node [
|
7
|
+
fontsize = "16"
|
8
|
+
];
|
9
|
+
edge [
|
10
|
+
arrowhead = "none"
|
11
|
+
];
|
12
|
+
"books" [shape=none, margin=0, label=<
|
13
|
+
<table border="0" cellborder="1" cellspacing="0" cellpadding="4">
|
14
|
+
<tr><td bgcolor="lightblue">books</td></tr>
|
15
|
+
<tr><td port="id" align="left">id</td></tr>
|
16
|
+
<tr><td port="author" align="left">author</td></tr>
|
17
|
+
<tr><td port="library_id" align="left">library_id</td></tr>
|
18
|
+
<tr><td port="pages" align="left">pages</td></tr>
|
19
|
+
<tr><td port="for_sale" align="left">for_sale</td></tr>
|
20
|
+
<tr><td port="published_at" align="left">published_at</td></tr>
|
21
|
+
<tr><td port="rating" align="left">rating</td></tr>
|
22
|
+
<tr><td port="icbn" align="left">icbn</td></tr>
|
23
|
+
<tr><td port="summary" align="left">summary</td></tr>
|
24
|
+
<tr><td port="age" align="left">age</td></tr>
|
25
|
+
</table>>];
|
26
|
+
"chapters" [shape=none, margin=0, label=<
|
27
|
+
<table border="0" cellborder="1" cellspacing="0" cellpadding="4">
|
28
|
+
<tr><td bgcolor="lightblue">chapters</td></tr>
|
29
|
+
<tr><td port="book_id" align="left">book_id</td></tr>
|
30
|
+
<tr><td port="title" align="left">title</td></tr>
|
31
|
+
</table>>];
|
32
|
+
"libraries" [shape=none, margin=0, label=<
|
33
|
+
<table border="0" cellborder="1" cellspacing="0" cellpadding="4">
|
34
|
+
<tr><td bgcolor="lightblue">libraries</td></tr>
|
35
|
+
<tr><td port="id" align="left">id</td></tr>
|
36
|
+
<tr><td port="book_title" align="left">book_title</td></tr>
|
37
|
+
<tr><td port="desk_number" align="left">desk_number</td></tr>
|
38
|
+
</table>>];
|
39
|
+
"books":"id" -> "chapters":"book_id" [arrowhead="crow"];
|
40
|
+
"books":"library_id" -> "libraries":"id" [arrowhead="teeodot"];
|
41
|
+
"chapters":"book_id" -> "books":"id" [arrowhead="tee"];
|
42
|
+
}
|
metadata
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: eatr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Greggory Rothmeier
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-01-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.13'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.13'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0.10'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0.10'
|
83
|
+
description: Configuration-based document parsing and transformation framework.
|
84
|
+
email:
|
85
|
+
- greggroth@gmail.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- ".gitignore"
|
91
|
+
- ".rspec"
|
92
|
+
- ".travis.yml"
|
93
|
+
- CODE_OF_CONDUCT.md
|
94
|
+
- Gemfile
|
95
|
+
- LICENSE.txt
|
96
|
+
- README.md
|
97
|
+
- Rakefile
|
98
|
+
- bin/console
|
99
|
+
- bin/rspec
|
100
|
+
- bin/setup
|
101
|
+
- eatr.gemspec
|
102
|
+
- lib/eatr.rb
|
103
|
+
- lib/eatr/csv/document.rb
|
104
|
+
- lib/eatr/dot_generator.rb
|
105
|
+
- lib/eatr/dot_template.dot
|
106
|
+
- lib/eatr/parse_value.rb
|
107
|
+
- lib/eatr/pipeline.rb
|
108
|
+
- lib/eatr/pipeline_spec.rb
|
109
|
+
- lib/eatr/schema.rb
|
110
|
+
- lib/eatr/sql/table_generator.rb
|
111
|
+
- lib/eatr/transformation/add_date_id.rb
|
112
|
+
- lib/eatr/transformation_set.rb
|
113
|
+
- lib/eatr/version.rb
|
114
|
+
- lib/eatr/xml/document.rb
|
115
|
+
- lib/eatr/xml/schema_generator.rb
|
116
|
+
- sample.dot
|
117
|
+
homepage: ''
|
118
|
+
licenses:
|
119
|
+
- MIT
|
120
|
+
metadata: {}
|
121
|
+
post_install_message:
|
122
|
+
rdoc_options: []
|
123
|
+
require_paths:
|
124
|
+
- lib
|
125
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
|
+
requirements:
|
132
|
+
- - ">="
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: '0'
|
135
|
+
requirements: []
|
136
|
+
rubyforge_project:
|
137
|
+
rubygems_version: 2.5.1
|
138
|
+
signing_key:
|
139
|
+
specification_version: 4
|
140
|
+
summary: Configuration-based document parsing and transformation framework.
|
141
|
+
test_files: []
|
142
|
+
has_rdoc:
|