eatr 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +426 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/rspec +17 -0
- data/bin/setup +8 -0
- data/eatr.gemspec +30 -0
- data/lib/eatr.rb +11 -0
- data/lib/eatr/csv/document.rb +49 -0
- data/lib/eatr/dot_generator.rb +28 -0
- data/lib/eatr/dot_template.dot +35 -0
- data/lib/eatr/parse_value.rb +25 -0
- data/lib/eatr/pipeline.rb +11 -0
- data/lib/eatr/pipeline_spec.rb +13 -0
- data/lib/eatr/schema.rb +94 -0
- data/lib/eatr/sql/table_generator.rb +52 -0
- data/lib/eatr/transformation/add_date_id.rb +20 -0
- data/lib/eatr/transformation_set.rb +27 -0
- data/lib/eatr/version.rb +3 -0
- data/lib/eatr/xml/document.rb +87 -0
- data/lib/eatr/xml/schema_generator.rb +69 -0
- data/sample.dot +42 -0
- metadata +142 -0
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: sets up Bundler, loads the gem and opens an IRB session.
require "bundler/setup"
# The gem and its lib entry point are named "eatr" (see eatr.gemspec and
# lib/eatr.rb); the previous `require "eatl"` failed with LoadError.
require "eatr"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
|
data/bin/rspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

#
# Binstub generated by Bundler.
#
# 'rspec' ships inside a gem; this wrapper exists so it can be run from the
# project with the project's Gemfile in effect.
#

require "pathname"

# Point Bundler at the Gemfile two directories above this file's real
# location, unless the caller already chose one.
ENV["BUNDLE_GEMFILE"] ||= begin
  binstub = Pathname.new(__FILE__).realpath
  File.expand_path("../../Gemfile", binstub)
end

require "rubygems"
require "bundler/setup"

# Resolve the gem's executable and run it in this process.
rspec_executable = Gem.bin_path("rspec-core", "rspec")
load rspec_executable
|
data/bin/setup
ADDED
data/eatr.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'eatr/version'
|
5
|
+
|
6
|
+
# Gem metadata, packaged file list and dependencies for eatr.
Gem::Specification.new do |spec|
  # Summary and description share the same one-line text.
  tagline = "Configuration-based document parsing and transformation framework."

  spec.name     = "eatr"
  spec.version  = Eatr::VERSION
  spec.authors  = ["Greggory Rothmeier"]
  spec.email    = ["greggroth@gmail.com"]

  spec.summary     = tagline
  spec.description = tagline
  spec.homepage    = ""
  spec.license     = "MIT"

  # Package every git-tracked file except those under top-level
  # test/, spec/ or features/ directories.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency "nokogiri", "~> 1.6"

  # Development-only dependencies, declared in the original order.
  {
    "bundler" => "~> 1.13",
    "rake"    => "~> 10.0",
    "rspec"   => "~> 3.0",
    "pry"     => "~>0.10"
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|
data/lib/eatr.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Entry point for the eatr gem: loads the standard-library pieces the library
# code relies on, then the gem's own components.

# Standard library. The components below call YAML.load, DateTime and
# Forwardable without requiring them, so load them explicitly here rather
# than depending on another library having done so.
require "date"
require "forwardable"
require "yaml"

require "eatr/version"
require "eatr/parse_value"
require "eatr/schema"
require "eatr/xml/document"
require "eatr/csv/document"
require "eatr/pipeline"
require "eatr/transformation_set"
require "eatr/transformation/add_date_id"

# Top-level namespace for the gem.
module Eatr
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module Eatr
  module Csv
    # Raised when a required field has no value in a CSV row.
    ValueNotFound = Class.new(StandardError)

    # Reads a CSV file into struct instances described by a YAML schema file.
    class Document
      extend Forwardable
      include ParseValue

      attr_reader :schema

      def_delegator :schema, :transformation_pipeline

      # @param schema_path [String] path to the YAML schema file
      def initialize(schema_path)
        schema_definition = YAML.load(File.read(schema_path))
        @schema = Schema.new(schema_definition)
      end

      # Builds one struct per CSV row (the first row is treated as headers).
      #
      # @param csv_document_path [String] path to the CSV file
      # @return [Array] one populated struct per data row
      # @raise [ValueNotFound] when a required field has no value in a row
      def parse(csv_document_path)
        [].tap do |records|
          CSV.foreach(csv_document_path, headers: true) do |row|
            records << build_record(row)
          end
        end
      end

      private

      # Creates a fresh struct and assigns every schema field from the row.
      def build_record(row)
        @schema.fields.each_with_object(@schema.to_struct.new) do |field, record|
          record.public_send("#{field.name}=", value_at(row, field))
        end
      end

      # Resolves a field's value: a constant `value` from the schema wins,
      # then the parsed cell under the field's CSV header; a missing cell is
      # nil for optional fields and an error for required ones.
      def value_at(row, field)
        constant = field.value
        return constant if constant

        text = row[field.csv_header]
        return parse_value(field, text) if text
        return unless field.required?

        raise ValueNotFound, "Unable to find '#{field.name}' with header '#{field.csv_header}'"
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'erb'
require 'yaml'

module Eatr
  # Renders one or more schema files through an ERB template; the default
  # template (dot_template.dot next to this file) emits a Graphviz digraph
  # with one node per table and edges for belongs_to_one / has_many fields.
  class DotGenerator
    attr_reader :tables

    DEFAULT_TEMPLATE_PATH = "#{File.dirname(__FILE__)}/dot_template.dot"

    # @param schema_paths [String, Array<String>] one path or a (possibly
    #   nested) list of paths to YAML schema files
    # @param template_path [String] ERB template to render
    def initialize(schema_paths, template_path: DEFAULT_TEMPLATE_PATH)
      @tables = Array[schema_paths].flatten.map { |path| Schema.new(YAML.load(File.read(path))) }
      @template_path = template_path
    end

    # Renders the template against this object's binding, so the template
    # can call `tables` and the private helpers below.
    #
    # @return [String] the rendered document
    def to_dot
      build_erb(File.read(@template_path)).result(binding)
    end

    private

    # Builds the ERB instance with '-' trim mode. The positional form
    # `ERB.new(src, nil, '-')` is deprecated since Ruby 2.6 and rejected by
    # newer ERB releases, so the keyword form is used whenever the running
    # ERB supports it; the positional form remains for older Rubies.
    def build_erb(source)
      if ERB.instance_method(:initialize).parameters.include?([:key, :trim_mode])
        ERB.new(source, trim_mode: '-')
      else
        ERB.new(source, nil, '-')
      end
    end

    # True when the table named in a "table.column" reference is one of the
    # loaded schemas.
    def table_included?(belongs_to_str)
      table_name, _ = belongs_to_str.split('.')
      @tables.any? { |t| t.table_name == table_name }
    end

    # Converts a "table.column" reference into a quoted DOT port target,
    # i.e. "table":"column".
    def arrow_target(belongs_to_str)
      table_name, column = belongs_to_str.split('.')
      "\"#{table_name}\":\"#{column}\""
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
strict digraph g {
|
2
|
+
ranksep="1.6"
|
3
|
+
graph [
|
4
|
+
rankdir = "LR"
|
5
|
+
];
|
6
|
+
node [
|
7
|
+
fontsize = "16"
|
8
|
+
];
|
9
|
+
edge [
|
10
|
+
arrowhead = "none"
|
11
|
+
];
|
12
|
+
<%- tables.each do |table| -%>
|
13
|
+
"<%= table.table_name %>" [shape=none, margin=0, label=<
|
14
|
+
<table border="0" cellborder="1" cellspacing="0" cellpadding="4">
|
15
|
+
<tr><td bgcolor="lightblue"><%= table.table_name %></td></tr>
|
16
|
+
<%- table.flat_fields.each do |field| -%>
|
17
|
+
<tr><td port="<%= field.name %>" align="left"><%= field.name %></td></tr>
|
18
|
+
<%- end -%>
|
19
|
+
</table>>];
|
20
|
+
<%- end -%>
|
21
|
+
<%- tables.each do |table| -%>
|
22
|
+
<%- table.flat_fields.each do |field| -%>
|
23
|
+
<%- if field.belongs_to_one && table_included?(field.belongs_to_one)-%>
|
24
|
+
"<%= table.table_name %>":"<%= field.name %>" -> <%= arrow_target(field.belongs_to_one) %> [arrowhead="tee<%= 'odot' if !field.required? %>"];
|
25
|
+
<%- end -%>
|
26
|
+
<%- if field.has_many -%>
|
27
|
+
<%- field.has_many.each do |r| -%>
|
28
|
+
<%- if table_included?(r)-%>
|
29
|
+
"<%= table.table_name %>":"<%= field.name %>" -> <%= arrow_target(r) %> [arrowhead="crow<%= 'odot' if !field.required? %>"];
|
30
|
+
<%- end -%>
|
31
|
+
<%- end -%>
|
32
|
+
<%- end -%>
|
33
|
+
<%- end -%>
|
34
|
+
<%- end -%>
|
35
|
+
}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'date'
require 'yaml'

module Eatr
  # Mixin that converts the raw text read from a document into the Ruby value
  # implied by a schema field's type.
  module ParseValue
    # @param field [#type, #strptime, #max_length, #length] schema field
    # @param text [String] raw text extracted from the document
    # @return [Integer, Float, DateTime, Object, String] converted value:
    #   - 'integer' / 'float': String#to_i / String#to_f (non-numeric text
    #     yields 0 / 0.0 rather than an error)
    #   - 'timestamp': DateTime, parsed with the field's strptime format when
    #     one is given, otherwise with DateTime.parse
    #   - 'boolean': whatever YAML scalar the text denotes (e.g. "true",
    #     "false")
    #   - anything else: the text, cut to max_length or else to length
    #     characters when either is set
    def parse_value(field, text)
      case field.type
      when 'integer' then text.to_i
      when 'float' then text.to_f
      when 'timestamp'
        if field.strptime
          DateTime.strptime(text, field.strptime)
        else
          DateTime.parse(text)
        end
      when 'boolean'
        # The text comes from the parsed document, i.e. potentially untrusted
        # input, so use safe_load: it still understands true/false scalars
        # but refuses to instantiate arbitrary tagged Ruby objects.
        YAML.safe_load(text)
      else
        if field.max_length
          text[0...field.max_length]
        elsif field.length
          text[0...field.length]
        else
          text
        end
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require './lib/eatr/pipeline'
|
3
|
+
|
4
|
+
describe Eatr::Pipeline do
  # Two steps applied in order: upcase the string, then keep characters 1..3.
  let(:steps) do
    [
      lambda { |str| str.upcase },
      lambda { |str| str[1..3] }
    ]
  end

  subject(:pipeline) { described_class.new(steps) }

  it 'performs each action on an object' do
    result = pipeline.call("hello")

    expect(result).to eq("ELL")
  end
end
|
data/lib/eatr/schema.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
module Eatr
  # Read-only wrapper around a schema definition hash (string keys), giving
  # access to its fields, naming, and a Struct class for parsed records.
  class Schema
    # A single entry of the schema's 'fields' list.
    class Field
      # @param field_attributes [Hash] string-keyed attributes of the field
      def initialize(field_attributes)
        @field_attributes = field_attributes
      end

      # Plain attribute readers; each returns the raw value stored under the
      # same key, or nil when the key is absent. `type` is defined separately
      # below because it normalises its value.
      %w[
        name
        xpath
        csv_header
        strptime
        value
        max_length
        length
        belongs_to_one
        has_many
      ].each do |attribute|
        define_method(attribute) { @field_attributes[attribute] }
      end

      # @return [String] the declared type, lower-cased ('' when undeclared)
      def type
        @field_attributes['type'].to_s.downcase
      end

      # @return [Boolean] whether the field declares a 'node' key
      def node?
        @field_attributes.has_key?('node')
      end

      # @return [Object] the 'required' attribute; defaults to true
      def required?
        @field_attributes.fetch('required', true)
      end

      # @return [Array<Field>] fields nested under 'children' (empty if none)
      def children
        Array[*@field_attributes['children']].map { |f| Field.new(f) }
      end
    end

    # @param schema_hash [Hash] string-keyed schema definition
    def initialize(schema_hash)
      @schema = schema_hash
    end

    # @return [Array<Field>] top-level fields
    # @raise [KeyError] when the schema has no 'fields' key
    def fields
      @fields ||= @schema.fetch('fields').map { |f| Field.new(f) }
    end

    # @return [Array<Field>] named top-level fields followed by the children
    #   of every top-level field
    def flat_fields
      @flat_fields ||= fields.select(&:name).
        concat(fields.flat_map(&:children))
    end

    # @return [String] schema name; defaults to 'schema'
    def name
      @schema.fetch('name', 'schema')
    end

    # @return [String] table name; defaults to the schema name
    def table_name
      @schema.fetch('table_name', name)
    end

    # @return [Object] the 'remove_namespaces' setting; defaults to false
    def remove_namespaces?
      @schema.fetch('remove_namespaces', false)
    end

    # Struct class whose members are the names of flat_fields.
    #
    # A named struct (Struct::<CamelCasedName>) is created on first use and
    # reused afterwards, but only when its members match this schema. If a
    # different constant already occupies that name (another schema with the
    # same or default name, or an unrelated Struct constant), an anonymous
    # struct is returned instead of handing back a class with the wrong
    # members.
    #
    # @return [Class] a Struct subclass
    def to_struct
      @struct_klass ||= build_struct
    end

    # @return [Pipeline] pipeline built from the schema's 'transformations'
    #   list (empty list when absent)
    def transformation_pipeline
      Pipeline.new(TransformationSet.new(@schema.fetch('transformations', [])))
    end

    private

    # Finds or creates the Struct class for this schema (see #to_struct).
    def build_struct
      members = field_names

      # Look only at constants defined directly on Struct (inherit = false)
      # so a same-named top-level constant is never picked up by accident.
      unless Struct.const_defined?(constant_name, false)
        return Struct.new(constant_name, *members)
      end

      registered = Struct.const_get(constant_name, false)
      if registered.respond_to?(:members) && registered.members == members
        registered
      else
        Struct.new(*members)
      end
    end

    def constant_name
      constantize(name)
    end

    def field_names
      flat_fields.map { |f| f.name.to_sym }
    end

    # 'order_line' -> 'OrderLine'
    def constantize(underscore_name)
      underscore_name.split('_').map(&:capitalize).join
    end

  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Eatr
  module Sql
    # Produces a CREATE TABLE statement from a YAML schema file.
    class TableGenerator
      # @param schema_path [String] path to the YAML schema file
      def initialize(schema_path)
        definition = YAML.load(File.read(schema_path))
        @schema = Schema.new(definition)
      end

      # @return [String] the CREATE TABLE statement, one column per line
      #
      # NOTE(review): the whitespace inside these literals is reproduced as
      # it appears in the reviewed listing, which may have collapsed leading
      # indentation - confirm against the repository copy.
      def statement
        "CREATE TABLE #{@schema.table_name} (\n" \
          "#{column_defs.join(",\n ")}\n" \
          ");\n"
      end

      private

      # One "<name> <type><nullness>" fragment per flattened schema field.
      def column_defs
        @schema.flat_fields.map { |f| "#{f.name} #{type(f)}#{nullness(f)}" }
      end

      # SQL column type for a field; nil for types not listed here.
      def type(f)
        case f.type
        when nil, 'string', '' then string_type(f)
        when 'integer' then 'INT'
        when 'float' then 'REAL'
        when 'timestamp' then 'TIMESTAMP'
        when 'boolean' then 'BOOLEAN'
        end
      end

      # Fixed length wins over max_length; unbounded strings become TEXT.
      def string_type(f)
        return "CHAR(#{f.length})" if f.length
        return "VARCHAR(#{f.max_length})" if f.max_length

        'TEXT'
      end

      # " NOT NULL" for required fields, nil otherwise.
      def nullness(f)
        " NOT NULL" if f.required?
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Eatr
  module Transformation
    # Derives an integer date id (YYYYMMDD) from a date/time attribute and
    # stores it in another attribute of the same object.
    class AddDateId
      # @param args [Hash] must contain 'source' and 'destination' keys
      # @raise [KeyError] when either key is missing
      def initialize(args)
        @source, @destination = args.fetch('source'), args.fetch('destination')
      end

      # Accepts objects individually or as (nested) arrays. Objects whose
      # source attribute is nil are left untouched.
      #
      # @return [Array] the flattened list of objects that were passed in
      def call(*objs)
        objs.flatten.each do |record|
          stamp = record[@source]
          next if stamp.nil?

          record[@destination] = stamp.strftime('%Y%m%d').to_i
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Eatr
  # Enumerable collection of transformation objects built from a list of
  # string-keyed definitions of the form
  # { 'class' => 'Some::ClassName', 'args' => ... }.
  class TransformationSet
    include Enumerable

    # @param transformations [Array<Hash>] transformation definitions
    def initialize(transformations)
      @transformations = transformations
    end

    # Yields each instantiated transformation in definition order.
    #
    # Without a block an Enumerator is returned, following the usual
    # Enumerable convention (previously this raised LocalJumpError for a
    # non-empty set).
    #
    # @return [Array, Enumerator]
    def each(&block)
      return enum_for(:each) unless block_given?

      to_a.each(&block)
    end

    # Instantiates every definition. The class named under 'class' is looked
    # up from Object and built with the definition's 'args' when present,
    # otherwise with no arguments. New instances are created on every call.
    #
    # NOTE(review): class names come straight from the schema; this assumes
    # schema files are trusted - confirm with callers.
    #
    # @return [Array] the transformation instances
    # @raise [KeyError] when a definition has no 'class' key
    # @raise [NameError] when the named class cannot be resolved
    def to_a
      @transformations.map do |definition|
        klass = Object.const_get(definition.fetch('class'))
        args = definition['args']

        args ? klass.new(args) : klass.new
      end
    end
  end
end
|