reindeer-etl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/License.md +596 -0
- data/README.md +112 -0
- data/Rakefile +13 -0
- data/bin/console +15 -0
- data/bin/setup +7 -0
- data/etl/.gitignore +2 -0
- data/lib/reindeer-etl.rb +31 -0
- data/lib/reindeer-etl/destinations/csv_dest.rb +21 -0
- data/lib/reindeer-etl/errors.rb +3 -0
- data/lib/reindeer-etl/sources/base_source.rb +13 -0
- data/lib/reindeer-etl/sources/csv_source.rb +25 -0
- data/lib/reindeer-etl/sources/multi_source.rb +51 -0
- data/lib/reindeer-etl/transforms/recode.rb +61 -0
- data/lib/reindeer-etl/transforms/rename_fields.rb +26 -0
- data/lib/reindeer-etl/transforms/response_status.rb +46 -0
- data/lib/reindeer-etl/transforms/simple_transforms.rb +33 -0
- data/lib/reindeer-etl/version.rb +3 -0
- data/reindeer-etl.gemspec +37 -0
- data/test/lib/destinations_csv_dest_test.rb +0 -0
- data/test/lib/sources_base_source_test.rb +17 -0
- data/test/lib/sources_csv_source_test.rb +24 -0
- data/test/lib/sources_multi_source_test.rb +20 -0
- data/test/lib/transforms_recode_test.rb +34 -0
- data/test/lib/transforms_rename_fields_test.rb +0 -0
- data/test/lib/transforms_response_status_test.rb +22 -0
- data/test/minitest_helper.rb +7 -0
- data/test/reindeer_waterworks_test.rb +8 -0
- metadata +156 -0
data/README.md
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
[Gem Version](http://badge.fury.io/rb/reindeer-etl)
|
2
|
+
[License](License.md)
|
3
|
+

|
4
|
+
|
5
|
+
# ReindeerETL
|
6
|
+
|
7
|
+
Sources, Transforms and Destinations for the [Kiba](https://github.com/thbar/kiba) ETL gem
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'reindeer-etl'
|
15
|
+
```
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install reindeer-etl
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
### Simple Example
|
28
|
+
|
29
|
+
If you have a csv file that looks like this:
|
30
|
+
|
31
|
+
input.csv
|
32
|
+
|
33
|
+
```
|
34
|
+
a,b,c
|
35
|
+
1,2,3
|
36
|
+
4,5,6
|
37
|
+
```
|
38
|
+
|
39
|
+
In your kiba ETL script you can now do this:
|
40
|
+
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
require 'reindeer-etl'
|
44
|
+
|
45
|
+
only_fields = ['a', 'b']
|
46
|
+
|
47
|
+
# Open a csv file
|
48
|
+
source(ReindeerETL::Sources::CSVSource, './input.csv',
|
49
|
+
require: only_fields, only: only_fields)
|
50
|
+
|
51
|
+
# rename a column
|
52
|
+
transform(ReindeerETL::Transforms::RenameFields, {'b'=>'c'})
|
53
|
+
|
54
|
+
# Recode all 777 values as 888
|
55
|
+
transform ReindeerETL::Transforms::Recode, cols: ['a'],
|
56
|
+
codes: {'777'=>'888'}, ignore_all: true
|
57
|
+
|
58
|
+
# Write the file to disk
|
59
|
+
destination ReindeerETL::Destinations::CSVDest, './output.csv'
|
60
|
+
```
|
61
|
+
|
62
|
+
### Joining data from multiple sources
|
63
|
+
|
64
|
+
A slightly more complex example is where you have data in multiple CSV files and
|
65
|
+
you would like to join that information into a single ETL job.
|
66
|
+
|
67
|
+
input1.csv
|
68
|
+
```
|
69
|
+
a,b,c
|
70
|
+
1,2,3
|
71
|
+
4,5,6
|
72
|
+
```
|
73
|
+
|
74
|
+
input2.csv
|
75
|
+
```
|
76
|
+
a,e,f
|
77
|
+
1,7,8
|
78
|
+
4,10,11
|
79
|
+
```
|
80
|
+
|
81
|
+
reindeer.etl
|
82
|
+
```ruby
|
83
|
+
# Open both csv files and join their rows on column 'a'
|
84
|
+
source(ReindeerETL::Sources::MultiSource, 'a', ['./input1.csv', './input2.csv'])
|
85
|
+
|
86
|
+
# Write the file to disk
|
87
|
+
destination ReindeerETL::Destinations::CSVDest, './output.csv'
|
88
|
+
|
89
|
+
```
|
90
|
+
|
91
|
+
output.csv
|
92
|
+
```
|
93
|
+
a,b,c,e,f
|
94
|
+
1,2,3,7,8
|
95
|
+
4,5,6,10,11
|
96
|
+
```
|
97
|
+
|
98
|
+
### More examples coming soon
|
99
|
+
|
100
|
+
## Development
|
101
|
+
|
102
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
103
|
+
|
104
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
105
|
+
|
106
|
+
## Contributing
|
107
|
+
|
108
|
+
1. Fork it ( https://github.com/[my-github-username]/reindeer-etl/fork )
|
109
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
110
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
111
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
112
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler/gem_tasks'
require 'rake/testtask'

# Test task: picks up every *_test.rb file directly under test/ and test/lib/.
Rake::TestTask.new do |test_task|
  # Extra load-path entries for the test run.
  test_task.libs.push('lib/reindeer-etl', 'test')
  test_task.test_files = FileList['test/*_test.rb', 'test/lib/*_test.rb']
  test_task.verbose = true
end

# Running `rake` without arguments runs the tests.
task default: :test
|
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'bundler/setup'
require 'reindeer-etl'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console can be used as well.

# Pry is the interactive console used below; it must be listed in the Gemfile.
require 'pry'
Pry.config.color = true
Pry.start

# IRB alternative (disabled):
#require "irb"
#IRB.start
|
data/bin/setup
ADDED
data/etl/.gitignore
ADDED
data/lib/reindeer-etl.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Top-level namespace of the gem: sources, transforms and destinations for
# Kiba ETL jobs.
#
# The empty sub-namespaces are declared before the requires below so that each
# component file can reopen its namespace (e.g. `module ReindeerETL::Sources`)
# without raising NameError.
module ReindeerETL
  # Row sources (CSV, multi-file join, ...).
  module Sources
  end

  # Row transforms (recode, rename, response status, ...).
  module Transforms
  end

  # Row destinations (CSV, ...).
  module Destinations
  end
end

require 'rest-client'
require 'pp'

require 'reindeer-etl/version'
require 'reindeer-etl/errors'

require 'reindeer-etl/transforms/simple_transforms'
require 'reindeer-etl/transforms/recode'
require 'reindeer-etl/transforms/rename_fields'
require 'reindeer-etl/transforms/response_status'

require 'reindeer-etl/sources/base_source'
require 'reindeer-etl/sources/csv_source'
require 'reindeer-etl/sources/multi_source'

require 'reindeer-etl/destinations/csv_dest'
# NOTE(review): 'reindeer-etl/destinations/lime_survey_curl' used to be
# required here, but that file is not in the packaged file list, so requiring
# the gem raised LoadError. Restore the require once the file is shipped.
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module ReindeerETL
  module Destinations
    # Kiba destination that writes row hashes to a CSV file.
    #
    # The keys of the first row written become the header line; every row
    # (including the first) then contributes one line made of its values.
    class CSVDest
      # output_file - path of the CSV file to create (opened in 'w' mode).
      # delimiter   - column separator, ',' by default.
      def initialize(output_file, delimiter = ',')
        # Options must be passed as keywords: CSV.open takes **options, and a
        # positional Hash raises ArgumentError on Ruby 3.
        @csv = CSV.open(output_file, 'w', col_sep: delimiter)
        @headers_written = false
      end

      # Appends one row (a Hash) to the file, emitting the header line first
      # if this is the first row.
      def write(row)
        unless @headers_written
          @headers_written = true
          @csv << row.keys
        end
        @csv << row.values
      end

      # Closes the underlying CSV file.
      def close
        @csv.close
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module ReindeerETL
  module Sources
    # Kiba source that reads a CSV file and yields each record as a Hash.
    #
    # NOTE(review): @path and #simple_transforms are not defined in this
    # class; presumably BaseSource provides them - confirm there.
    class CSVSource < BaseSource
      # path - path of the CSV file to read.
      # opts - options hash; it is handed to BaseSource via `super` and then
      #        merged over the defaults {headers: true, col_sep: ','} to build
      #        the options passed to CSV.
      #        NOTE(review): every key left in opts is forwarded to CSV, which
      #        rejects options it does not know - verify how BaseSource treats
      #        non-CSV keys.
      def initialize(path, opts = {})
        super
        @csv_opts = { headers: true, col_sep: ',' }.merge(opts)
      end

      # Yields one Hash per CSV record after running simple_transforms on it.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when the header row contains
      # duplicate column names (checked once, on the first record).
      def each
        first_run = true
        # Double-splat the options: CSV.foreach takes keyword options, and a
        # positional Hash would be taken as the open mode on Ruby 3.
        CSV.foreach(@path, **@csv_opts) do |row|
          if first_run
            first_run = false
            if row.headers.count != row.headers.uniq.count
              raise ReindeerETL::Errors::RecordInvalid, 'Duplicate columns'
            end
          end
          row = row.to_hash
          simple_transforms(row)
          yield(row)
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module ReindeerETL
  module Sources
    # Source that joins the rows of several sources on a shared key column.
    #
    # Rows of the first source define the result set, in order. Rows of every
    # later source are merged into the first-source row that has the same key
    # value. Columns missing from a row after the join are filled with nil.
    class MultiSource
      # key   - name of the column used to join rows.
      # paths - list of arguments, one per underlying source; each is passed
      #         to klass.new.
      # opts  - :klass - source class to instantiate for each path
      #         (defaults to ReindeerETL::Sources::CSVSource).
      def initialize(key, paths, opts = {})
        @klass = opts[:klass] || ReindeerETL::Sources::CSVSource
        @key = key
        @sources = paths.map { |path| @klass.new(path) }
      end

      # Yields each joined row (a Hash).
      #
      # Raises ReindeerETL::Errors::RecordInvalid when the first row of a
      # source lacks the key column, or when a row of a later source has no
      # matching row in the first source.
      def each
        rows = []
        # key value => index in rows of the first row carrying that value, so
        # each join is a hash lookup instead of a scan over all rows.
        row_index = {}
        all_keys = Set.new

        @sources.each_with_index do |source, source_idx|
          idx = 0
          source.each do |row|
            if idx.zero?
              # Only the first row of each source is inspected for columns.
              all_keys.merge(row.keys)
              unless row.key?(@key)
                # 1-based position of the offending source.
                raise ReindeerETL::Errors::RecordInvalid,
                      "Path##{source_idx + 1} missing key: #{@key}"
              end
            end

            if source_idx.zero?
              row_index[row[@key]] ||= rows.size
              rows.push(row)
            else
              rindex = row_index[row[@key]]
              if rindex.nil?
                raise ReindeerETL::Errors::RecordInvalid,
                      "Unable to Join source##{source_idx} - row##{idx}"
              end
              rows[rindex] = rows[rindex].merge(row)
            end
            idx += 1
          end
        end

        rows.each do |row|
          # Pad columns that never got a value for this row.
          (all_keys - row.keys).each { |k| row[k] = nil }
          yield row
        end
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'set'

module ReindeerETL
  module Transforms
    # Transform that replaces coded values in selected columns.
    #
    # Options:
    #   :cols       - columns to recode; when omitted, every column of the
    #                 first processed row except those in :except is used.
    #   :except     - columns to skip when :cols is omitted.
    #   :codes      - Hash mapping old value => new value (required).
    #   :ignore     - values that are left untouched without raising.
    #   :ignore_all - when true, values that are neither in :codes nor in
    #                 :ignore are left untouched instead of raising.
    class Recode
      attr_accessor :cols

      # Raises ArgumentError when :cols is given as nil or :codes is empty.
      def initialize(opts = {})
        @cols = opts[:cols]
        @except = (opts[:except] || []).to_set
        @codes = opts[:codes] || {}
        @ignore_vals = opts[:ignore] || []
        @ignore_all = opts[:ignore_all] || false
        @error_on_unknown = !@ignore_all

        raise ArgumentError, ':cols array is empty' if @cols.nil? && opts.key?(:cols)
        @cols = @cols.to_set unless @cols.nil?
        raise ArgumentError, ':codes hash is empty' if @codes.empty?

        @acceptable_keys = (@codes.keys + @ignore_vals).to_set
        # Number of rows processed so far; used in error messages.
        @counter = 0
      end

      # Recodes the configured columns of row in place and returns row.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when a configured column is
      # missing from the row, or (unless :ignore_all) when a value is neither
      # a known code nor an ignored value.
      def process(row)
        # Column set is fixed from the first row when :cols was not given.
        @cols ||= row.keys.to_set - @except

        rset = row.keys.to_set
        unless @cols.subset?(rset)
          m_cols = @cols - rset
          raise ReindeerETL::Errors::RecordInvalid, "Missing columns: #{m_cols.to_a}"
        end

        @cols.each { |col| _update_row(row, col, row[col]) }

        @counter += 1
        row
      end

      private

      # Replaces row[col] when val has a code; raises on unknown values when
      # strict. This is the only validation step: the former _validate_val
      # helper read a misspelled flag and an unset ivar, so it never ran.
      def _update_row(row, col, val)
        if @acceptable_keys.include?(val)
          row[col] = @codes[val] if @codes.key?(val)
        elsif @error_on_unknown
          raise ReindeerETL::Errors::RecordInvalid,
                "Invalid value in recode: row# #{@counter} {#{col}:#{val}}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ReindeerETL
  module Transforms
    ##
    # A simple transform that renames columns
    class RenameFields
      # cols - Hash mapping current column name => new column name.
      def initialize(cols)
        @cols = cols
      end

      # Renames the configured columns of row in place and returns row.
      #
      # Raises ReindeerETL::Errors::RecordInvalid unless every column to be
      # renamed is present in the row.
      def process(row)
        unless @cols.keys.all? { |old_name| row.key?(old_name) }
          raise ReindeerETL::Errors::RecordInvalid, 'Missing columns in rename'
        end

        renames = @cols.reject { |old_name, new_name| old_name == new_name }
        # Pull every old value out before writing any new key, so overlapping
        # renames (swaps such as a->b, b->a, or chains such as a->b, b->c)
        # do not overwrite a value that still has to be moved.
        moved = renames.map { |old_name, new_name| [new_name, row.delete(old_name)] }
        moved.each { |new_name, value| row[new_name] = value }
        row
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'set'

module ReindeerETL
  module Transforms
    ##
    # Swap out old error codes with REP_CODE, add new columns with error codes
    class ResponseStatus
      # Values treated as error codes (compared after to_s).
      ERROR_CODES = %w[222 444 555 777 888 998 999].freeze
      # Status recorded when the value is not an error code.
      NO_CODE = '111'

      # What to replace a code with if one is found
      REP_CODE = '{question_not_shown}'
      # Prefix of the generated status columns.
      REP_COL_PREFIX = 'responseStatus_'

      # opts - :except - columns to leave untouched (no status column is
      #        generated for them).
      def initialize(opts = {})
        @except_cols = (opts[:except] || []).to_set
      end

      # For every non-excepted column, adds a "responseStatus_<name>" column
      # (underscores removed from <name>) holding "E<code>E", and replaces the
      # value with REP_CODE when it is an error code. Mutates and returns row.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when an :except column is
      # missing from the row or when a status column name is already taken.
      def process(row)
        row_keys = row.keys.to_set
        unless @except_cols.subset?(row_keys)
          x_cols = (@except_cols - row_keys).to_a
          raise ReindeerETL::Errors::RecordInvalid, "Missing except keys: #{x_cols}"
        end

        (row_keys - @except_cols).each do |k|
          new_col = "#{REP_COL_PREFIX}#{k.gsub('_', '')}"
          # Check the live row rather than the original key snapshot, so two
          # columns that map to the same status name (e.g. 'a_b' and 'ab')
          # are reported instead of silently overwriting each other.
          if row.key?(new_col)
            raise ReindeerETL::Errors::RecordInvalid, "Column #{new_col} already exists"
          end

          val = row[k]
          if _has_code?(val)
            row[k] = REP_CODE
            ecode = val.to_s
          else
            ecode = NO_CODE
          end
          row[new_col] = "E#{ecode}E"
        end

        row
      end

      private

      # True when val, as a string, is one of ERROR_CODES.
      def _has_code?(val)
        ERROR_CODES.include?(val.to_s)
      end
    end
  end
end
|