reindeer-etl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/License.md +596 -0
- data/README.md +112 -0
- data/Rakefile +13 -0
- data/bin/console +15 -0
- data/bin/setup +7 -0
- data/etl/.gitignore +2 -0
- data/lib/reindeer-etl.rb +31 -0
- data/lib/reindeer-etl/destinations/csv_dest.rb +21 -0
- data/lib/reindeer-etl/errors.rb +3 -0
- data/lib/reindeer-etl/sources/base_source.rb +13 -0
- data/lib/reindeer-etl/sources/csv_source.rb +25 -0
- data/lib/reindeer-etl/sources/multi_source.rb +51 -0
- data/lib/reindeer-etl/transforms/recode.rb +61 -0
- data/lib/reindeer-etl/transforms/rename_fields.rb +26 -0
- data/lib/reindeer-etl/transforms/response_status.rb +46 -0
- data/lib/reindeer-etl/transforms/simple_transforms.rb +33 -0
- data/lib/reindeer-etl/version.rb +3 -0
- data/reindeer-etl.gemspec +37 -0
- data/test/lib/destinations_csv_dest_test.rb +0 -0
- data/test/lib/sources_base_source_test.rb +17 -0
- data/test/lib/sources_csv_source_test.rb +24 -0
- data/test/lib/sources_multi_source_test.rb +20 -0
- data/test/lib/transforms_recode_test.rb +34 -0
- data/test/lib/transforms_rename_fields_test.rb +0 -0
- data/test/lib/transforms_response_status_test.rb +22 -0
- data/test/minitest_helper.rb +7 -0
- data/test/reindeer_waterworks_test.rb +8 -0
- metadata +156 -0
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'set' # Array#to_set / Set are not autoloaded before Ruby 3.2

module ReindeerETL
  module Transforms
    ##
    # Mixin that adds two optional per-row column checks, configured through
    # the +:only+ and +:require+ options.
    #
    # The including object must call #st_initialize before #simple_transforms
    # so that the column sets exist.
    module SimpleTransforms
      ##
      # Read the +:only+ and +:require+ options and remember them as sets.
      #
      # Both keys are *removed* from +opts+, so the hash the caller passed in
      # is mutated and afterwards holds only the options this mixin does not
      # handle.
      #
      # opts:: Hash; +:only+ and +:require+ (when present) must respond to
      #        +to_set+, e.g. arrays of column names.
      def st_initialize(opts = {})
        @only_cols = (opts.delete(:only) || []).to_set
        @require_cols = (opts.delete(:require) || []).to_set
      end

      ##
      # Configurable transforms in source.
      #
      # Applies the +:only+ filter first and the +:require+ check second, each
      # only when it was configured. The row is modified in place and the same
      # object is returned.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when a required column is
      # missing (see #st_require_cols).
      def simple_transforms(row)
        st_only_cols(row) unless @only_cols.empty?
        st_require_cols(row) unless @require_cols.empty?
        row
      end

      ##
      # Filter out everything except the +:only+ columns.
      #
      # Deletes the other keys from +dict+ in place and returns +dict+.
      def st_only_cols(dict)
        # +keys+ returns a fresh array, so deleting while iterating is safe.
        dict.keys.each { |col| dict.delete(col) unless @only_cols.include?(col) }
        dict
      end

      ##
      # Require the +:require+ columns to be present as keys of +dict+.
      #
      # Returns nil when every required column is present; otherwise raises
      # ReindeerETL::Errors::RecordInvalid listing the missing columns.
      def st_require_cols(dict)
        missing_cols = (@require_cols - dict.keys.to_set).to_a
        return if missing_cols.empty?

        raise ReindeerETL::Errors::RecordInvalid,
              "Missing required columns: #{missing_cols}"
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for reindeer-etl.
# Put ./lib on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'reindeer-etl/version'

Gem::Specification.new do |gem|
  gem.name    = 'reindeer-etl'
  gem.version = ReindeerETL::VERSION
  gem.authors = ['William Hatt', 'Patrick Chung']
  gem.email   = ['hattb@ohsu.edu', 'chungp@ohsu.edu']

  gem.summary     = 'A simple ETL pipeline for use with project reindeer and LimeSurvey'
  gem.description = 'An ETL pipeline tool for automatic data modifications to LimeSurvey'
  gem.homepage    = 'https://github.com/OHSU-FM/reindeer-etl'
  gem.license     = 'GPL'

  # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
  # delete this section to allow pushing this gem to any host.
  #if spec.respond_to?(:metadata)
  #  spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  #else
  #  raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  #end

  # Package every git-tracked file except those under test/, spec/ or
  # features/; the test sources are listed separately as test_files.
  tracked_paths = `git ls-files -z`.split("\x0")
  gem.files = tracked_paths.reject do |path|
    path.match(%r{^(test|spec|features)/})
  end
  gem.test_files    = Dir.glob('test/**/*.rb')
  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  gem.require_paths = ['lib']

  gem.add_dependency 'rest-client', '~> 1.8'

  # Development-only dependencies, declared in their original order.
  {
    'minitest' => '~> 5.7',
    'bundler'  => '~> 1.9',
    'rake'     => '~> 10.0',
    'pry'      => '~> 0.10.1'
  }.each do |dep_name, constraint|
    gem.add_development_dependency dep_name, constraint
  end
end
|
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Exercises the :only and :require options through
# BaseSource#simple_transforms.
describe ReindeerETL::Sources::BaseSource do
  it 'must remove columns when asked' do
    source = ReindeerETL::Sources::BaseSource.new('', only: [:aa, :bb])
    record = { aa: 1, bb: 2, cc: 3 }

    # The row is expected to be filtered in place.
    source.simple_transforms(record)

    record.keys.must_equal [:aa, :bb]
  end

  it 'must raise error if required fields missing' do
    source = ReindeerETL::Sources::BaseSource.new('', require: [:aa])
    record = { cc: 3 }

    assert_raises(ReindeerETL::Errors::RecordInvalid) do
      source.simple_transforms(record)
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
##
# Shared check for the CSV fixtures: reads +path+ through CSVSource (built
# with +opts+) and expects
# * every yielded row to have exactly the keys "a".."i", in that order,
# * the "a" value of the Nth row to convert (via +to_i+) to N, counting from 1,
# * 13 rows in total.
def loads_as_hash(path, opts = {})
  reader = ReindeerETL::Sources::CSVSource.new(path, opts)
  rows_seen = 0

  reader.each do |record|
    record.keys.must_equal %w[a b c d e f g h i]
    rows_seen += 1
    rows_seen.must_equal record['a'].to_i
  end

  rows_seen.must_equal 13
end
|
13
|
+
|
14
|
+
# Runs the shared loads_as_hash check against a comma-separated and a
# tab-separated fixture.
describe ReindeerETL::Sources::CSVSource do
  it 'must yield each line of a csv file as a hash' do
    fixture = "#{$dir}/fixtures/comma_delimited.csv"
    loads_as_hash(fixture, col_sep: ',')
  end

  it 'must yield each line of a tabbed csv file as a hash' do
    fixture = "#{$dir}/fixtures/tab_delimited.csv"
    loads_as_hash(fixture, col_sep: "\t")
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Checks that MultiSource, given two CSV fixtures and the key 'a', yields
# rows carrying the columns "a".."l".
describe ReindeerETL::Sources::MultiSource do
  it 'must merge data from multiple sources' do
    fixtures = [
      "#{$dir}/fixtures/comma_delimited.csv",
      "#{$dir}/fixtures/comma_delimited_join_on_a.csv"
    ]
    merged = ReindeerETL::Sources::MultiSource.new(
      'a', fixtures, klass: ReindeerETL::Sources::CSVSource
    )
    expected_keys = %w[a b c d e f g h i j k l]
    collected = []

    merged.each do |record|
      collected << record
      record.keys.must_equal expected_keys
    end

    collected.count.must_equal 13
  end
end
|
19
|
+
|
20
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Covers value recoding, the missing-column error and constructor argument
# validation of the Recode transform.
describe ReindeerETL::Transforms::Recode do
  it 'must recode values' do
    record = { g: :a }
    recoder = ReindeerETL::Transforms::Recode.new(codes: { a: :b }, cols: [:g])

    recoder.process(record)

    record[:g].must_equal :b
  end

  it 'must raise an error if the column does not exist' do
    record = { hhh: :a }
    recoder = ReindeerETL::Transforms::Recode.new(codes: { a: :b }, cols: [:g])

    assert_raises(ReindeerETL::Errors::RecordInvalid) do
      recoder.process(record)
    end
  end

  it 'must raise an error if parameters are missing' do
    assert_raises(ArgumentError) do
      ReindeerETL::Transforms::Recode.new(codes: {})
    end
    assert_raises(ArgumentError) do
      ReindeerETL::Transforms::Recode.new(cols: {})
    end

    # With both options supplied, construction must not raise.
    ReindeerETL::Transforms::Recode.new(codes: { a: :b }, cols: [:hhh])
  end
end
|
33
|
+
|
34
|
+
|
File without changes
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Covers the RenameFields transform, built with the mapping
# aaa => bbb and ccc => ddd.
describe ReindeerETL::Transforms::RenameFields do
  before do
    mapping = { aaa: :bbb, ccc: :ddd }
    @transform = ReindeerETL::Transforms::RenameFields.new(mapping)
  end

  it 'must raise an error when the column is not in the row' do
    empty_record = {}

    assert_raises(ReindeerETL::Errors::RecordInvalid) do
      @transform.process(empty_record)
    end
  end

  it 'must rename fields' do
    renamed = @transform.process({ aaa: 555, ccc: 888 })
    field_names = renamed.keys

    field_names.include?(:aaa).must_equal false
    field_names.include?(:bbb).must_equal true
  end
end
|
22
|
+
|
metadata
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: reindeer-etl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- William Hatt
|
8
|
+
- Patrick Chung
|
9
|
+
autorequire:
|
10
|
+
bindir: exe
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-07-07 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rest-client
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '1.8'
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '1.8'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: minitest
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '5.7'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '5.7'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: bundler
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - "~>"
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '1.9'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '1.9'
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: rake
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '10.0'
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '10.0'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: pry
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 0.10.1
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: 0.10.1
|
84
|
+
description: An ETL pipeline tool for automatic data modifications to LimeSurvey
|
85
|
+
email:
|
86
|
+
- hattb@ohsu.edu
|
87
|
+
- chungp@ohsu.edu
|
88
|
+
executables: []
|
89
|
+
extensions: []
|
90
|
+
extra_rdoc_files: []
|
91
|
+
files:
|
92
|
+
- ".gitignore"
|
93
|
+
- ".travis.yml"
|
94
|
+
- CODE_OF_CONDUCT.md
|
95
|
+
- Gemfile
|
96
|
+
- License.md
|
97
|
+
- README.md
|
98
|
+
- Rakefile
|
99
|
+
- bin/console
|
100
|
+
- bin/setup
|
101
|
+
- etl/.gitignore
|
102
|
+
- lib/reindeer-etl.rb
|
103
|
+
- lib/reindeer-etl/destinations/csv_dest.rb
|
104
|
+
- lib/reindeer-etl/errors.rb
|
105
|
+
- lib/reindeer-etl/sources/base_source.rb
|
106
|
+
- lib/reindeer-etl/sources/csv_source.rb
|
107
|
+
- lib/reindeer-etl/sources/multi_source.rb
|
108
|
+
- lib/reindeer-etl/transforms/recode.rb
|
109
|
+
- lib/reindeer-etl/transforms/rename_fields.rb
|
110
|
+
- lib/reindeer-etl/transforms/response_status.rb
|
111
|
+
- lib/reindeer-etl/transforms/simple_transforms.rb
|
112
|
+
- lib/reindeer-etl/version.rb
|
113
|
+
- reindeer-etl.gemspec
|
114
|
+
- test/lib/destinations_csv_dest_test.rb
|
115
|
+
- test/lib/sources_base_source_test.rb
|
116
|
+
- test/lib/sources_csv_source_test.rb
|
117
|
+
- test/lib/sources_multi_source_test.rb
|
118
|
+
- test/lib/transforms_recode_test.rb
|
119
|
+
- test/lib/transforms_rename_fields_test.rb
|
120
|
+
- test/lib/transforms_response_status_test.rb
|
121
|
+
- test/minitest_helper.rb
|
122
|
+
- test/reindeer_waterworks_test.rb
|
123
|
+
homepage: https://github.com/OHSU-FM/reindeer-etl
|
124
|
+
licenses:
|
125
|
+
- GPL
|
126
|
+
metadata: {}
|
127
|
+
post_install_message:
|
128
|
+
rdoc_options: []
|
129
|
+
require_paths:
|
130
|
+
- lib
|
131
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
requirements: []
|
142
|
+
rubyforge_project:
|
143
|
+
rubygems_version: 2.4.6
|
144
|
+
signing_key:
|
145
|
+
specification_version: 4
|
146
|
+
summary: A simple ETL pipeline for use with project reindeer and LimeSurvey
|
147
|
+
test_files:
|
148
|
+
- test/lib/transforms_recode_test.rb
|
149
|
+
- test/lib/transforms_response_status_test.rb
|
150
|
+
- test/lib/sources_csv_source_test.rb
|
151
|
+
- test/lib/transforms_rename_fields_test.rb
|
152
|
+
- test/lib/sources_multi_source_test.rb
|
153
|
+
- test/lib/sources_base_source_test.rb
|
154
|
+
- test/lib/destinations_csv_dest_test.rb
|
155
|
+
- test/reindeer_waterworks_test.rb
|
156
|
+
- test/minitest_helper.rb
|