reindeer-etl 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,33 @@
1
require 'set'

module ReindeerETL
  module Transforms
    ##
    # Mixin with option-driven row transforms (column whitelist and
    # required-column validation). The including class must call
    # +st_initialize+ before +simple_transforms+ is used.
    module SimpleTransforms
      ##
      # Read the transform options out of +opts+.
      #
      # The +:only+ and +:require+ keys are *removed* from +opts+ (the hash
      # is mutated in place), so whatever remains can be handed on by the
      # caller. Each option may be a single column or a collection of columns.
      #
      # @param opts [Hash] options hash; +:only+ and +:require+ are consumed
      def st_initialize(opts = {})
        @only_cols = Array(opts.delete(:only) || []).to_set
        @require_cols = Array(opts.delete(:require) || []).to_set
      end

      ##
      # Apply every configured transform to +row+ (in place).
      #
      # @param row [Hash] the record to transform
      # @return [Hash] the same +row+ object
      # @raise [ReindeerETL::Errors::RecordInvalid] if a required column is missing
      def simple_transforms(row)
        st_only_cols(row) unless @only_cols.empty?
        st_require_cols(row) unless @require_cols.empty?
        row
      end

      ##
      # Filter out everything except the +:only+ columns (mutates +dict+).
      #
      # @param dict [Hash] the record to filter
      # @return [Hash] the same +dict+ object
      def st_only_cols(dict)
        # Iterate over a snapshot of the keys so deleting is safe.
        dict.keys.each { |col| dict.delete(col) unless @only_cols.include?(col) }
        dict
      end

      ##
      # Ensure every +:require+ column is present in +dict+.
      #
      # @param dict [Hash] the record to validate
      # @return [Hash] the same +dict+ object when nothing is missing
      # @raise [ReindeerETL::Errors::RecordInvalid] listing the missing columns
      def st_require_cols(dict)
        missing_cols = (@require_cols - dict.keys).to_a
        return dict if missing_cols.empty?

        raise ReindeerETL::Errors::RecordInvalid, "Missing required columns: #{missing_cols}"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module ReindeerETL
  # Gem version string; frozen so the constant cannot be mutated at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,37 @@
1
# Gem specification for reindeer-etl.
# Puts the gem's lib/ directory on the load path so the version constant can
# be required, then declares metadata, packaged files and dependencies.
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'reindeer-etl/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "reindeer-etl"
8
+ spec.version = ReindeerETL::VERSION
9
+ spec.authors = ["William Hatt", "Patrick Chung"]
10
+ spec.email = ["hattb@ohsu.edu", "chungp@ohsu.edu"]
11
+
12
+ spec.summary = %q{A simple ETL pipeline for use with project reindeer and LimeSurvey}
13
+ spec.description = %q{An ETL pipeline tool for automatic data modifications to LimeSurvey}
14
+ spec.homepage = "https://github.com/OHSU-FM/reindeer-etl"
15
+
# NOTE(review): "GPL" does not look like an SPDX license identifier (e.g.
# "GPL-2.0", "GPL-3.0") -- confirm the intended version with the authors.
spec.license = "GPL"
16
+
17
+ # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
18
+ # delete this section to allow pushing this gem to any host.
19
+ #if spec.respond_to?(:metadata)
20
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
21
+ #else
22
+ # raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
23
+ #end
24
+
25
# Packaged files: everything tracked by git except paths under test/, spec/
# or features/.
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
26
# Test files are globbed from disk rather than taken from git.
+ spec.test_files = Dir.glob("test/**/*.rb")
27
+ spec.bindir = "exe"
28
# Executables are the basenames of any packaged files under exe/.
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ["lib"]
30
+
31
# rest-client is the only runtime dependency; the rest are development-only.
+ spec.add_dependency "rest-client", "~> 1.8"
32
+ spec.add_development_dependency "minitest", "~> 5.7"
33
+ spec.add_development_dependency "bundler", "~> 1.9"
34
+ spec.add_development_dependency "rake", "~> 10.0"
35
+ spec.add_development_dependency "pry", "~> 0.10.1"
36
+
37
+ end
File without changes
@@ -0,0 +1,17 @@
1
+ require 'minitest_helper'
2
+
3
# Specs for the :only / :require options accepted by BaseSource.new and
# applied through #simple_transforms.
+ describe ReindeerETL::Sources::BaseSource do
4
# With :only set, simple_transforms must strip every other key from the row;
# the assertion is made on the original hash, so the change is in place.
+ it 'must remove columns when asked' do
5
+ transform = ReindeerETL::Sources::BaseSource.new '', :only=>[:aa, :bb]
6
+ row = {aa: 1, bb: 2, cc: 3}
7
+ transform.simple_transforms row
8
+ row.keys.must_equal [:aa, :bb]
9
+ end
10
+
11
# With :require set, a row lacking the column must raise RecordInvalid.
+ it 'must raise error if required fields missing' do
12
+ transform = ReindeerETL::Sources::BaseSource.new '', :require=>[:aa]
13
+ row = {cc: 3}
14
+ assert_raises(ReindeerETL::Errors::RecordInvalid){transform.simple_transforms row}
15
+ end
16
+
17
+ end
@@ -0,0 +1,24 @@
1
+ require 'minitest_helper'
2
+
3
# Shared assertion helper: reads +path+ through CSVSource with +opts+ and
# checks that every yielded row has the string keys a..i, that column 'a'
# (converted with to_i) equals the 1-based row position, and that exactly 13
# rows are yielded.
# NOTE(review): the expected headers and row count come from the fixture
# files, which are not visible here -- confirm against the fixtures.
+ def loads_as_hash path, opts={}
4
+ source = ReindeerETL::Sources::CSVSource.new(path, opts)
5
+ counter = 0
6
+ source.each do |row|
7
+ row.keys.must_equal %w{a b c d e f g h i}
8
+ counter += 1
9
+ counter.must_equal row['a'].to_i
10
+ end
11
+ counter.must_equal 13
12
+ end
13
+
14
# Runs the helper against a comma-delimited and a tab-delimited fixture,
# passing the matching :col_sep option each time.
+ describe ReindeerETL::Sources::CSVSource do
15
+ it 'must yield each line of a csv file as a hash' do
16
+ path = "#{$dir}/fixtures/comma_delimited.csv"
17
+ loads_as_hash(path, :col_sep=>',')
18
+ end
19
+
20
+ it 'must yield each line of a tabbed csv file as a hash' do
21
+ path = "#{$dir}/fixtures/tab_delimited.csv"
22
+ loads_as_hash(path, :col_sep=>"\t")
23
+ end
24
+ end
@@ -0,0 +1,20 @@
1
+ require 'minitest_helper'
2
+
3
# Spec for MultiSource: builds one source from two CSV fixture paths, using
# CSVSource as the per-path reader class (:klass).
# NOTE(review): the first argument 'a' is presumably the column the sources
# are joined on (the second fixture is named "..._join_on_a") -- confirm
# against MultiSource.
+ describe ReindeerETL::Sources::MultiSource do
4
+ it 'must merge data from multiple sources' do
5
+ klass = ReindeerETL::Sources::CSVSource
6
+ path1 = "#{$dir}/fixtures/comma_delimited.csv"
7
+ path2 = "#{$dir}/fixtures/comma_delimited_join_on_a.csv"
8
+ source = ReindeerETL::Sources::MultiSource.new('a', [path1, path2], :klass=>klass)
9
+ rows = []
10
# Every merged row is expected to expose the columns a..l, in this order.
+ keys = %w[a b c d e f g h i j k l]
11
+ source.each do |row|
12
+ rows.push row
13
+ row.keys.must_equal keys
14
+ end
15
# The merged source must yield 13 rows in total.
+ rows.count.must_equal 13
16
+ end
17
+
18
+ end
19
+
20
+
@@ -0,0 +1,34 @@
1
+ require 'minitest_helper'
2
+
3
# Specs for the Recode transform, configured with :codes (old => new value
# mapping) and :cols (columns to recode).
+ describe ReindeerETL::Transforms::Recode do
4
+
5
# process must rewrite the value in the listed column; the assertion reads
# the original row hash, so the change is made in place.
+ it 'must recode values' do
6
+ row = {g: :a}
7
+ transform = ReindeerETL::Transforms::Recode.new :codes=>{:a=>:b}, :cols=>[:g]
8
+ transform.process row
9
+ row[:g].must_equal :b
10
+ end
11
+
12
# A row that lacks a configured column must be rejected with RecordInvalid.
+ it 'must raise an error if the column does not exist' do
13
+ row = {hhh: :a}
14
+ transform = ReindeerETL::Transforms::Recode.new codes: {a: :b}, cols: [:g]
15
+
16
+ assert_raises(ReindeerETL::Errors::RecordInvalid){
17
+ transform.process row
18
+ }
19
+ end
20
+
21
# Omitting either :cols or :codes must raise ArgumentError; the final call
# supplies both and is expected to construct without raising.
+ it 'must raise an error if parameters are missing' do
22
+ assert_raises(ArgumentError){
23
+ ReindeerETL::Transforms::Recode.new codes: {}
24
+ }
25
+ assert_raises(ArgumentError){
26
+ ReindeerETL::Transforms::Recode.new cols: {}
27
+ }
28
+ ReindeerETL::Transforms::Recode.new codes: {a: :b}, cols: [:hhh]
29
+ end
30
+
31
+
32
+ end
33
+
34
+
File without changes
@@ -0,0 +1,22 @@
1
+ require 'minitest_helper'
2
+
3
# Specs for the RenameFields transform, built from an old-name => new-name
# mapping.
+ describe ReindeerETL::Transforms::RenameFields do
4
# Every example gets a fresh transform mapping :aaa to :bbb and :ccc to :ddd.
+ before do
5
+ cols = {:aaa=>:bbb, :ccc=>:ddd}
6
+ @transform = ReindeerETL::Transforms::RenameFields.new cols
7
+ end
8
+
9
# An empty row contains none of the source columns, so process must raise.
+ it 'must raise an error when the column is not in the row' do
10
+ row = {}
11
+ assert_raises(ReindeerETL::Errors::RecordInvalid){ @transform.process row }
12
+ end
13
+
14
# Uses the return value of process; only the :aaa -> :bbb rename is asserted,
# the :ccc -> :ddd rename is not checked here.
+ it 'must rename fields' do
15
+ row = {:aaa=>555, :ccc=>888}
16
+ row = @transform.process row
17
+ row.keys.include?(:aaa).must_equal false
18
+ row.keys.include?(:bbb).must_equal true
19
+ end
20
+
21
+ end
22
+
@@ -0,0 +1,7 @@
1
# Shared test bootstrap: loads pry and minitest (autorun, unit, pride), then
# the library under test via a path relative to this file.
+ require 'pry'
2
+ require 'minitest/autorun'
3
+ require 'minitest/unit'
4
+ require 'minitest/pride'
5
+
6
+ require File.expand_path('../../lib/reindeer-etl.rb', __FILE__)
7
# $dir is the absolute directory containing this helper; it is a global so
# that individual test files can build fixture paths from it.
+ $dir = File.dirname(File.expand_path(__FILE__))
@@ -0,0 +1,8 @@
1
+ require 'minitest_helper'
2
+
3
# Smoke test: the gem must define a non-nil ReindeerETL::VERSION constant.
+ class TestReindeerETL < Minitest::Test
4
+ def test_that_it_has_a_version_number
5
+ refute_nil ::ReindeerETL::VERSION
6
+ end
7
+
8
+ end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: reindeer-etl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - William Hatt
8
+ - Patrick Chung
9
+ autorequire:
10
+ bindir: exe
11
+ cert_chain: []
12
+ date: 2015-07-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rest-client
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.8'
28
+ - !ruby/object:Gem::Dependency
29
+ name: minitest
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '5.7'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '5.7'
42
+ - !ruby/object:Gem::Dependency
43
+ name: bundler
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '1.9'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '1.9'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rake
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: '10.0'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '10.0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: pry
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: 0.10.1
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: 0.10.1
84
+ description: An ETL pipeline tool for automatic data modifications to LimeSurvey
85
+ email:
86
+ - hattb@ohsu.edu
87
+ - chungp@ohsu.edu
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - ".gitignore"
93
+ - ".travis.yml"
94
+ - CODE_OF_CONDUCT.md
95
+ - Gemfile
96
+ - License.md
97
+ - README.md
98
+ - Rakefile
99
+ - bin/console
100
+ - bin/setup
101
+ - etl/.gitignore
102
+ - lib/reindeer-etl.rb
103
+ - lib/reindeer-etl/destinations/csv_dest.rb
104
+ - lib/reindeer-etl/errors.rb
105
+ - lib/reindeer-etl/sources/base_source.rb
106
+ - lib/reindeer-etl/sources/csv_source.rb
107
+ - lib/reindeer-etl/sources/multi_source.rb
108
+ - lib/reindeer-etl/transforms/recode.rb
109
+ - lib/reindeer-etl/transforms/rename_fields.rb
110
+ - lib/reindeer-etl/transforms/response_status.rb
111
+ - lib/reindeer-etl/transforms/simple_transforms.rb
112
+ - lib/reindeer-etl/version.rb
113
+ - reindeer-etl.gemspec
114
+ - test/lib/destinations_csv_dest_test.rb
115
+ - test/lib/sources_base_source_test.rb
116
+ - test/lib/sources_csv_source_test.rb
117
+ - test/lib/sources_multi_source_test.rb
118
+ - test/lib/transforms_recode_test.rb
119
+ - test/lib/transforms_rename_fields_test.rb
120
+ - test/lib/transforms_response_status_test.rb
121
+ - test/minitest_helper.rb
122
+ - test/reindeer_waterworks_test.rb
123
+ homepage: https://github.com/OHSU-FM/reindeer-etl
124
+ licenses:
125
+ - GPL
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project:
143
+ rubygems_version: 2.4.6
144
+ signing_key:
145
+ specification_version: 4
146
+ summary: A simple ETL pipeline for use with project reindeer and LimeSurvey
147
+ test_files:
148
+ - test/lib/transforms_recode_test.rb
149
+ - test/lib/transforms_response_status_test.rb
150
+ - test/lib/sources_csv_source_test.rb
151
+ - test/lib/transforms_rename_fields_test.rb
152
+ - test/lib/sources_multi_source_test.rb
153
+ - test/lib/sources_base_source_test.rb
154
+ - test/lib/destinations_csv_dest_test.rb
155
+ - test/reindeer_waterworks_test.rb
156
+ - test/minitest_helper.rb