reindeer-etl 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/License.md +596 -0
- data/README.md +112 -0
- data/Rakefile +13 -0
- data/bin/console +15 -0
- data/bin/setup +7 -0
- data/etl/.gitignore +2 -0
- data/lib/reindeer-etl.rb +31 -0
- data/lib/reindeer-etl/destinations/csv_dest.rb +21 -0
- data/lib/reindeer-etl/errors.rb +3 -0
- data/lib/reindeer-etl/sources/base_source.rb +13 -0
- data/lib/reindeer-etl/sources/csv_source.rb +25 -0
- data/lib/reindeer-etl/sources/multi_source.rb +51 -0
- data/lib/reindeer-etl/transforms/recode.rb +61 -0
- data/lib/reindeer-etl/transforms/rename_fields.rb +26 -0
- data/lib/reindeer-etl/transforms/response_status.rb +46 -0
- data/lib/reindeer-etl/transforms/simple_transforms.rb +33 -0
- data/lib/reindeer-etl/version.rb +3 -0
- data/reindeer-etl.gemspec +37 -0
- data/test/lib/destinations_csv_dest_test.rb +0 -0
- data/test/lib/sources_base_source_test.rb +17 -0
- data/test/lib/sources_csv_source_test.rb +24 -0
- data/test/lib/sources_multi_source_test.rb +20 -0
- data/test/lib/transforms_recode_test.rb +34 -0
- data/test/lib/transforms_rename_fields_test.rb +0 -0
- data/test/lib/transforms_response_status_test.rb +22 -0
- data/test/minitest_helper.rb +7 -0
- data/test/reindeer_waterworks_test.rb +8 -0
- metadata +156 -0
data/README.md
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
[![Gem Version](https://badge.fury.io/rb/reindeer-etl.svg)](http://badge.fury.io/rb/reindeer-etl)
|
2
|
+
[![License](https://img.shields.io/badge/license-GPL-blue.svg)](License.md)
|
3
|
+
![Downloads](https://img.shields.io/gem/dt/reindeer-etl.svg)
|
4
|
+
|
5
|
+
# ReindeerETL
|
6
|
+
|
7
|
+
Sources, Transforms and Destinations for the [Kiba](https://github.com/thbar/kiba) ETL gem
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'reindeer-etl'
|
15
|
+
```
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install reindeer-etl
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
### Simple Example
|
28
|
+
|
29
|
+
If you have a csv file that looks like this:
|
30
|
+
|
31
|
+
input.csv
|
32
|
+
|
33
|
+
```
|
34
|
+
a,b,c
|
35
|
+
1,2,3
|
36
|
+
4,5,6
|
37
|
+
```
|
38
|
+
|
39
|
+
In your kiba ETL script you can now do this:
|
40
|
+
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
require 'reindeer-etl'
|
44
|
+
|
45
|
+
only_fields = ['a', 'b']
|
46
|
+
|
47
|
+
# Open a csv file
|
48
|
+
source(ReindeerETL::Sources::CSVSource, './input.csv',
|
49
|
+
require: only_fields, only: only_fields)
|
50
|
+
|
51
|
+
# rename a column
|
52
|
+
transform(ReindeerETL::Transforms::RenameFields, {'b'=>'c'})
|
53
|
+
|
54
|
+
# Recode all 777 values as 888
|
55
|
+
transform ReindeerETL::Transforms::Recode, cols: ['a'],
|
56
|
+
codes: {'777'=>'888'}, ignore_all: true
|
57
|
+
|
58
|
+
# Write the file to disk
|
59
|
+
destination ReindeerETL::Destinations::CSVDest, './output.csv'
|
60
|
+
```
|
61
|
+
|
62
|
+
### Joining data from multiple sources
|
63
|
+
|
64
|
+
A slightly more complex example is where you have data in multiple CSV files and
|
65
|
+
you would like to join that information into a single ETL job.
|
66
|
+
|
67
|
+
input1.csv
|
68
|
+
```
|
69
|
+
a,b,c
|
70
|
+
1,2,3
|
71
|
+
4,5,6
|
72
|
+
```
|
73
|
+
|
74
|
+
input2.csv
|
75
|
+
```
|
76
|
+
a,e,f
|
77
|
+
1,7,8
|
78
|
+
4,10,11
|
79
|
+
```
|
80
|
+
|
81
|
+
reindeer.etl
|
82
|
+
```ruby
|
83
|
+
# Join both csv files on their shared 'a' column
|
84
|
+
source(ReindeerETL::Sources::MultiSource, 'a', ['./input1.csv', './input2.csv'])
|
85
|
+
|
86
|
+
# Write the file to disk
|
87
|
+
destination ReindeerETL::Destinations::CSVDest, './output.csv'
|
88
|
+
|
89
|
+
```
|
90
|
+
|
91
|
+
output.csv
|
92
|
+
```
|
93
|
+
a,b,c,e,f
|
94
|
+
1,2,3,7,8
|
95
|
+
4,5,6,10,11
|
96
|
+
```
|
97
|
+
|
98
|
+
### More examples coming soon
|
99
|
+
|
100
|
+
## Development
|
101
|
+
|
102
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
103
|
+
|
104
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
105
|
+
|
106
|
+
## Contributing
|
107
|
+
|
108
|
+
1. Fork it ( https://github.com/[my-github-username]/reindeer-etl/fork )
|
109
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
110
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
111
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
112
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require 'rake/testtask'
|
3
|
+
# Test task: runs every *_test.rb file directly under test/ and test/lib/.
Rake::TestTask.new do |test_task|
  # Extra load-path entries for the test run.
  test_task.libs.push('lib/reindeer-etl', 'test')
  test_task.test_files = FileList['test/*_test.rb', 'test/lib/*_test.rb']
  test_task.verbose = true
end

# A bare `rake` invocation runs the test task.
task default: :test
|
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "reindeer-etl"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
require "pry"
|
11
|
+
# Turn on colored output for the Pry session.
Pry.config.color = true
|
12
|
+
# Open the interactive Pry prompt (bundler/setup and reindeer-etl are required above).
Pry.start
|
13
|
+
|
14
|
+
#require "irb"
|
15
|
+
#IRB.start
|
data/bin/setup
ADDED
data/etl/.gitignore
ADDED
data/lib/reindeer-etl.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
##
# Top-level namespace of the reindeer-etl gem. The nested modules are declared
# here, empty, so that the files required afterwards can reopen them with the
# compact `module ReindeerETL::Sources` form.
module ReindeerETL
  # Namespace for source classes (e.g. CSVSource, MultiSource).
  module Sources
  end

  # Namespace for transform classes (e.g. Recode, RenameFields, ResponseStatus).
  module Transforms
  end

  # Namespace for destination classes (e.g. CSVDest).
  module Destinations
  end
end
|
14
|
+
|
15
|
+
require 'rest-client'
|
16
|
+
require 'pp'
|
17
|
+
|
18
|
+
require 'reindeer-etl/version'
|
19
|
+
require 'reindeer-etl/errors'
|
20
|
+
|
21
|
+
require 'reindeer-etl/transforms/simple_transforms'
|
22
|
+
require 'reindeer-etl/transforms/recode'
|
23
|
+
require 'reindeer-etl/transforms/rename_fields'
|
24
|
+
require 'reindeer-etl/transforms/response_status'
|
25
|
+
|
26
|
+
require 'reindeer-etl/sources/base_source'
|
27
|
+
require 'reindeer-etl/sources/csv_source'
|
28
|
+
require 'reindeer-etl/sources/multi_source'
|
29
|
+
|
30
|
+
require 'reindeer-etl/destinations/csv_dest'
|
31
|
+
require 'reindeer-etl/destinations/lime_survey_curl'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module ReindeerETL
  module Destinations
    ##
    # Destination that writes row hashes to a CSV file.
    #
    # The keys of the first row written become the header line; every row
    # (including the first) then contributes one line made of its values.
    class CSVDest
      # output_file:: path of the file to write (opened in 'w' mode, so an
      #               existing file is truncated)
      # delimiter::   column separator, defaults to a comma
      def initialize(output_file, delimiter = ',')
        # Options must be passed as keywords: Ruby 3 no longer converts a
        # trailing positional Hash into keyword arguments, so the previous
        # `CSV.open(file, 'w', {col_sep: ...})` raised ArgumentError there.
        @csv = CSV.open(output_file, 'w', col_sep: delimiter)
        @headers_written = false
      end

      # Appends +row+ (a Hash) to the file, emitting the header line first
      # if this is the first row.
      def write(row)
        unless @headers_written
          @headers_written = true
          @csv << row.keys
        end
        @csv << row.values
      end

      # Closes the underlying file handle.
      def close
        @csv.close
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module ReindeerETL
  module Sources
    ##
    # Source that reads a CSV file and yields each record as a Hash keyed by
    # the header names.
    class CSVSource < BaseSource
      # path:: location of the CSV file
      # opts:: forwarded to BaseSource#initialize and merged over the default
      #        CSV options (headers: true, col_sep: ',')
      def initialize(path, opts = {})
        super
        @csv_opts = { headers: true, col_sep: ',' }.merge(opts)
      end

      # Yields every record of the file as a Hash.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when the header line
      # contains the same column name more than once.
      def each
        first_run = true
        # @path is presumably assigned by BaseSource#initialize - confirm.
        # Options are splatted as keywords: on Ruby 3 a positional Hash here
        # would be taken as the file mode instead of the CSV options.
        CSV.foreach(@path, **@csv_opts) do |row|
          if first_run
            first_run = false
            # Headers are identical for every record, so check them once.
            if row.headers.count != row.headers.uniq.count
              raise ReindeerETL::Errors::RecordInvalid, 'Duplicate columns'
            end
          end
          row = row.to_hash
          # simple_transforms is not defined in this class; presumably
          # inherited from BaseSource - confirm.
          simple_transforms(row)
          yield(row)
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module ReindeerETL
  module Sources
    ##
    # Source that joins the rows of several underlying sources on a shared
    # key column and yields the merged rows.
    #
    # The first source defines the set of rows; every row of each later
    # source is merged into the first-source row with the same key value.
    class MultiSource
      # key::   name of the column used to match rows between sources
      # paths:: one path per underlying source, in join order
      # opts::  :klass - source class instantiated for each path
      #         (defaults to ReindeerETL::Sources::CSVSource)
      def initialize(key, paths, opts = {})
        @klass = opts[:klass] || ReindeerETL::Sources::CSVSource
        @key = key
        @sources = paths.map { |path| @klass.new(path) }
      end

      # Yields each joined row. Columns seen in the first row of any source
      # but absent from a joined row are added with a nil value.
      #
      # Returns an Enumerator when called without a block.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when the first row of a
      # source lacks the key column, or when a row of a later source has no
      # matching row in the first source.
      def each
        return enum_for(:each) unless block_given?

        rows = []
        # Key value => position in +rows+ of the first row carrying it, so
        # each join is a hash lookup instead of a scan over all rows.
        # NOTE: lookup uses Hash equality (eql?), e.g. 1 and 1.0 do not match.
        positions = {}
        all_keys = Set.new

        @sources.each_with_index do |source, source_idx|
          idx = 0
          source.each do |row|
            if idx.zero?
              # Only the first row of each source is inspected for columns.
              all_keys.merge(row.keys)
              unless row.key?(@key)
                raise ReindeerETL::Errors::RecordInvalid,
                      "Path##{source_idx} missing key: #{@key}"
              end
            end

            if source_idx.zero?
              positions[row[@key]] ||= rows.size
              rows.push(row)
            else
              target = positions[row[@key]]
              if target.nil?
                raise ReindeerETL::Errors::RecordInvalid,
                      "Unable to Join source##{source_idx} - row##{idx}"
              end
              rows[target] = rows[target].merge(row)
            end
            idx += 1
          end
        end

        rows.each do |row|
          (all_keys - row.keys).each { |k| row[k] = nil }
          yield row
        end
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'set'

module ReindeerETL
  module Transforms
    ##
    # Transform that replaces coded values in selected columns.
    class Recode
      attr_accessor :cols

      # opts::
      #   :cols       - columns to recode; when the option is omitted, every
      #                 column of the first processed row except :except
      #   :except     - columns to skip when :cols is omitted
      #   :codes      - Hash of old value => new value (required, non-empty)
      #   :ignore     - values that are accepted and left untouched
      #   :ignore_all - when true, values found in neither :codes nor :ignore
      #                 are left untouched instead of raising
      #
      # Raises ArgumentError when :cols is given as nil or :codes is empty.
      def initialize(opts = {})
        @cols = opts[:cols]
        @except = (opts[:except] || []).to_set
        @codes = opts[:codes] || {}
        @ignore_vals = (opts[:ignore] || [])
        @ignore_all = (opts[:ignore_all] || false)
        @error_on_unknown = !@ignore_all

        raise ArgumentError, ':cols array is empty' if @cols.nil? && opts.key?(:cols)

        @cols = @cols.to_set unless @cols.nil?
        raise ArgumentError, ':codes hash is empty' if @codes.empty?

        @acceptable_keys = (@codes.keys + @ignore_vals).to_set
        # Number of rows processed so far; only used in error messages.
        @counter = 0
      end

      # Recodes +row+ in place and returns it.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when a configured column is
      # missing from the row, or when a value is unknown and :ignore_all is
      # not set.
      def process(row)
        # Resolved once from the first row, then reused for every later row.
        @cols ||= row.keys.to_set - @except

        # Raise error unless all columns are present
        rset = row.keys.to_set
        unless @cols.subset?(rset)
          m_cols = @cols - rset
          raise ReindeerETL::Errors::RecordInvalid, "Missing columns: #{m_cols.to_a}"
        end

        # Run recode. Unknown values are rejected inside _update_row; the
        # former separate validation step read a misspelled and an undefined
        # instance variable, so it never ran and has been removed.
        @cols.each { |col| _update_row(row, col, row[col]) }

        @counter += 1
        row
      end

      private

      # Replaces row[col] when +val+ has a code; raises on an unknown value
      # unless unknown values are being ignored.
      def _update_row(row, col, val)
        if @acceptable_keys.include?(val)
          row[col] = @codes[val] if @codes.key?(val)
        elsif @error_on_unknown
          raise ReindeerETL::Errors::RecordInvalid,
                "Invalid value in recode: row# #{@counter} {#{col}:#{val}}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'set'

module ReindeerETL
  module Transforms
    ##
    # A simple transform that renames columns
    class RenameFields
      # cols:: Hash of current column name => new column name
      def initialize(cols)
        @cols = cols
      end

      # Renames the configured columns of +row+ in place and returns it.
      #
      # Raises ReindeerETL::Errors::RecordInvalid unless every column to be
      # renamed is present in the row.
      def process(row)
        row_keys = row.keys.to_set
        req_keys = @cols.keys.to_set

        # raise an error unless all of req is in row
        unless req_keys.subset?(row_keys)
          raise ReindeerETL::Errors::RecordInvalid, 'Missing columns in rename'
        end

        # Remove every old column before writing any new one. Renaming one
        # pair at a time lost data whenever a new name was also an old name
        # (swaps such as {'a'=>'b','b'=>'a'} or chains such as
        # {'a'=>'b','b'=>'c'}).
        moved = @cols.reject { |old_name, new_name| old_name == new_name }
                     .map { |old_name, new_name| [new_name, row.delete(old_name)] }
        moved.each { |new_name, value| row[new_name] = value }

        row
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'set'

module ReindeerETL
  module Transforms
    ##
    # Swap out old error codes with REP_CODE, add new columns with error codes
    #
    # For every column not listed in :except, a companion column named
    # REP_COL_PREFIX + column name (underscores removed) is added. It holds
    # "E<code>E", where <code> is the error code found in the column, or
    # NO_CODE when the value is not an error code.
    class ResponseStatus
      # Values treated as error codes (compared as strings).
      ERROR_CODES = %w[222 444 555 777 888 998 999].freeze
      # Code recorded in the status column when the value is not an error code.
      NO_CODE = '111'

      # What to replace a code with if one is found
      REP_CODE = '{question_not_shown}'
      REP_COL_PREFIX = 'responseStatus_'

      # opts:: :except - columns that are neither inspected nor given a
      #        status column
      def initialize(opts = {})
        @except_cols = (opts[:except] || []).to_set
      end

      # Rewrites +row+ in place and returns it.
      #
      # Raises ReindeerETL::Errors::RecordInvalid when an :except column is
      # missing from the row or when a status column name is already taken.
      def process(row)
        row_keys = row.keys.to_set
        unless @except_cols.subset?(row_keys)
          x_cols = (@except_cols - row_keys).to_a
          raise ReindeerETL::Errors::RecordInvalid, "Missing except keys: #{x_cols}"
        end

        (row_keys - @except_cols).each do |k|
          new_col = "#{REP_COL_PREFIX}#{k.gsub('_', '')}"
          # Check the live row rather than the key snapshot, so a clash
          # between two generated names (e.g. from 'a_b' and 'ab') is
          # reported instead of silently overwriting the first status column.
          if row.key?(new_col)
            raise ReindeerETL::Errors::RecordInvalid, "Column #{new_col} already exists"
          end

          val = row[k]
          if _has_code?(val)
            row[k] = REP_CODE
            ecode = val.to_s
          else
            ecode = NO_CODE
          end
          row[new_col] = "E#{ecode}E"
        end

        row
      end

      private

      # True when +val+, converted to a string, is one of ERROR_CODES.
      def _has_code?(val)
        ERROR_CODES.include?(val.to_s)
      end
    end
  end
end
|