reindeer-etl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,112 @@
1
+ [![Gem Version](https://badge.fury.io/rb/reindeer-etl.svg)](http://badge.fury.io/rb/reindeer-etl)
2
+ [![License](https://img.shields.io/badge/license-GPL-blue.svg)](License.md)
3
+ ![Downloads](https://img.shields.io/gem/dt/reindeer-etl.svg)
4
+
5
+ # ReindeerETL
6
+
7
+ Sources, Transforms and Destinations for the [Kiba](https://github.com/thbar/kiba) ETL gem
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'reindeer-etl'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install reindeer-etl
24
+
25
+ ## Usage
26
+
27
+ ### Simple Example
28
+
29
+ If you have a csv file that looks like this:
30
+
31
+ input.csv
32
+
33
+ ```
34
+ a,b,c
35
+ 1,2,3
36
+ 4,5,6
37
+ ```
38
+
39
+ In your kiba ETL script you can now do this:
40
+
41
+
42
+ ```ruby
43
+ require 'reindeer-etl'
44
+
45
+ only_fields = ['a', 'b']
46
+
47
+ # Open a csv file
48
+ source(ReindeerETL::Sources::CSVSource, './input.csv',
49
+ require: only_fields, only: only_fields)
50
+
51
+ # rename a column
52
+ transform(ReindeerETL::Transforms::RenameFields, {'b'=>'c'})
53
+
54
+ # Recode all 777 values as 888
55
+ transform ReindeerETL::Transforms::Recode, cols: ['a'],
56
+ codes: {'777'=>'888'}, ignore_all: true
57
+
58
+ # Write the file to disk
59
+ destination ReindeerETL::Destinations::CSVDest, './output.csv'
60
+ ```
61
+
62
+ ### Joining data from multiple sources
63
+
64
+ A slightly more complex example is where you have data in multiple CSV files and
65
+ you would like to join that information into a single ETL job.
66
+
67
+ input1.csv
68
+ ```
69
+ a,b,c
70
+ 1,2,3
71
+ 4,5,6
72
+ ```
73
+
74
+ input2.csv
75
+ ```
76
+ a,e,f
77
+ 1,7,8
78
+ 4,10,11
79
+ ```
80
+
81
+ reindeer.etl
82
+ ```ruby
83
+ # Open both csv files and join their rows on column 'a'
84
+ source(ReindeerETL::Sources::MultiSource, 'a', ['./input1.csv', './input2.csv'])
85
+
86
+ # Write the file to disk
87
+ destination ReindeerETL::Destinations::CSVDest, './output.csv'
88
+
89
+ ```
90
+
91
+ output.csv
92
+ ```
93
+ a,b,c,e,f
94
+ 1,2,3,7,8
95
+ 4,5,6,10,11
96
+ ```
97
+
98
+ ### More examples coming soon
99
+
100
+ ## Development
101
+
102
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
103
+
104
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
105
+
106
+ ## Contributing
107
+
108
+ 1. Fork it ( https://github.com/[my-github-username]/reindeer-etl/fork )
109
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
110
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
111
+ 4. Push to the branch (`git push origin my-new-feature`)
112
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
# Configure the `test` task: put the gem's lib directory and the test
# directory on the load path and pick up every *_test.rb file found directly
# under test/ and test/lib/.
Rake::TestTask.new do |task|
  task.libs.push('lib/reindeer-etl', 'test')
  task.test_files = FileList['test/*_test.rb', 'test/lib/*_test.rb']
  task.verbose = true
end
12
+
13
+ task default: :test
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "reindeer-etl"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ require "pry"
11
+ Pry.config.color = true
12
+ Pry.start
13
+
14
+ #require "irb"
15
+ #IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/etl/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
@@ -0,0 +1,31 @@
1
# Root namespace of the reindeer-etl gem.
#
# The nested modules are declared empty here so that the files required
# afterwards can reopen them with the compact `module ReindeerETL::Sources`
# form, which only works once the namespace already exists.
module ReindeerETL
  # Namespace for classes that yield rows from `each`.
  module Sources; end

  # Namespace for classes that rewrite a row in `process`.
  module Transforms; end

  # Namespace for classes that receive rows through `write`.
  module Destinations; end
end
14
+
15
+ require 'rest-client'
16
+ require 'pp'
17
+
18
+ require 'reindeer-etl/version'
19
+ require 'reindeer-etl/errors'
20
+
21
+ require 'reindeer-etl/transforms/simple_transforms'
22
+ require 'reindeer-etl/transforms/recode'
23
+ require 'reindeer-etl/transforms/rename_fields'
24
+ require 'reindeer-etl/transforms/response_status'
25
+
26
+ require 'reindeer-etl/sources/base_source'
27
+ require 'reindeer-etl/sources/csv_source'
28
+ require 'reindeer-etl/sources/multi_source'
29
+
30
+ require 'reindeer-etl/destinations/csv_dest'
31
+ require 'reindeer-etl/destinations/lime_survey_curl'
@@ -0,0 +1,21 @@
1
+ require 'csv'
2
+
3
+ module ReindeerETL::Destinations
4
# Destination that writes row hashes to a delimited text file.
#
# The header line is taken from the keys of the first row written; every row,
# the first one included, then contributes one line made of its values.
class CSVDest
  # Opens (creating or truncating) the output file for writing.
  #
  # @param output_file [String] path of the file to write
  # @param delimiter [String] column separator, a comma by default
  def initialize(output_file, delimiter = ',')
    # Options are given as keywords: CSV.open declares `**options`, so a
    # positional Hash is rejected with ArgumentError on Ruby 3.
    @csv = CSV.open(output_file, 'w', col_sep: delimiter)
    @headers_written = false
  end

  # Appends one row, emitting the header line first if none was written yet.
  #
  # @param row [Hash] column name => value
  # @return [CSV] the underlying writer
  def write(row)
    unless @headers_written
      @csv << row.keys
      @headers_written = true
    end
    @csv << row.values
  end

  # Closes the output file.
  def close
    @csv.close
  end
end
21
+ end
@@ -0,0 +1,3 @@
1
+ module ReindeerETL::Errors
2
# Raised by the sources and transforms of this gem when a row cannot be
# handled: missing or duplicate columns, a value that is not recognised, or a
# row that cannot be joined.
class RecordInvalid < StandardError
end
3
+ end
@@ -0,0 +1,13 @@
1
+ require 'set'
2
+
3
+ module ReindeerETL::Sources
4
# Common parent for sources: remembers where the input lives and hands the
# options hash to the SimpleTransforms mixin.
class BaseSource
  include ReindeerETL::Transforms::SimpleTransforms

  # @param path location of the input, kept in @path for subclasses
  # @param opts [Hash] passed as-is to `st_initialize`
  def initialize(path, opts = {})
    @path = path
    st_initialize(opts)
  end
end
13
+ end
@@ -0,0 +1,25 @@
1
+ require 'csv'
2
+
3
+ module ReindeerETL::Sources
4
# Source that reads a CSV file and yields each record as a Hash after running
# it through `simple_transforms`.
class CSVSource < BaseSource
  # @param path path of the CSV file to read
  # @param opts [Hash] merged over the defaults `headers: true, col_sep: ','`
  #   and handed to CSV.foreach; also seen by the parent initializer
  def initialize(path, opts = {})
    super
    @csv_opts = { headers: true, col_sep: ',' }.merge(opts)
  end

  # Yields every record of the file as a Hash keyed by header.
  #
  # @yieldparam row [Hash]
  # @return [Enumerator] when called without a block
  # @raise [ReindeerETL::Errors::RecordInvalid] if a header name occurs twice
  def each
    return enum_for(:each) unless block_given?

    first_run = true
    # The options must be splatted: CSV.foreach takes them as keywords, and a
    # positional Hash would land in its `mode` parameter instead.
    CSV.foreach(@path, **@csv_opts) do |row|
      if first_run
        first_run = false
        # Checked once only; every row of a file shares the same headers.
        if row.headers.count != row.headers.uniq.count
          raise ReindeerETL::Errors::RecordInvalid, 'Duplicate columns'
        end
      end
      row = row.to_hash
      simple_transforms(row)
      yield(row)
    end
  end
end
25
+ end
@@ -0,0 +1,51 @@
1
+ require 'set'
2
+
3
+ module ReindeerETL::Sources
4
+
5
# Source that joins the rows of several underlying sources on one key column.
#
# Rows of the first source define the output rows and their order. Each row of
# every later source is merged into the first output row holding the same key
# value. Columns seen in the first row of any source are filled with nil on
# output rows that never received them.
class MultiSource
  # @param key name of the column the sources are joined on
  # @param paths [Array] one entry per source, each passed to `klass.new`
  # @param opts [Hash] `:klass` selects the source class
  #   (default ReindeerETL::Sources::CSVSource)
  def initialize(key, paths, opts = {})
    @klass = opts[:klass] || ReindeerETL::Sources::CSVSource
    @key = key
    @sources = paths.map { |path| @klass.new(path) }
  end

  # Reads every source completely, then yields the joined rows.
  #
  # @yieldparam row [Hash]
  # @return [Enumerator] when called without a block
  # @raise [ReindeerETL::Errors::RecordInvalid] if the first row of a source
  #   lacks the key column, or a later-source row matches no first-source row
  def each
    return enum_for(:each) unless block_given?

    rows = []
    # key value => position in `rows` of the first row carrying that value;
    # replaces a linear scan of `rows` for every joined row.
    position_of = {}
    all_keys = Set.new

    @sources.each_with_index do |source, source_idx|
      first_row_seen = false
      idx = 0
      source.each do |row|
        unless first_row_seen
          first_row_seen = true
          all_keys.merge(row.keys)
          unless row.key?(@key)
            # Report the source that is actually at fault (1-based).
            raise ReindeerETL::Errors::RecordInvalid,
                  "Path##{source_idx + 1} missing key: #{@key}"
          end
        end

        key_value = row[@key]
        if source_idx.zero?
          position_of[key_value] = rows.size unless position_of.key?(key_value)
          rows.push(row)
        else
          position = position_of[key_value]
          if position.nil?
            raise ReindeerETL::Errors::RecordInvalid,
                  "Unable to Join source##{source_idx} - row##{idx}"
          end
          rows[position] = rows[position].merge(row)
        end
        idx += 1
      end
    end

    rows.each do |row|
      (all_keys - row.keys).each { |column| row[column] = nil }
      yield row
    end
  end
end
50
+
51
+ end
@@ -0,0 +1,61 @@
1
+ module ReindeerETL::Transforms
2
# Transform that replaces values in selected columns according to a lookup
# table.
#
# Options:
#   :cols       columns to recode; when the key is absent, every column of the
#               first processed row except those in :except is used
#   :except     columns to leave out when :cols is not given
#   :codes      Hash of old value => new value (required, must not be empty)
#   :ignore     values that are accepted and left unchanged
#   :ignore_all when truthy, values that are neither in :codes nor in :ignore
#               are left unchanged instead of raising
class Recode
  attr_accessor :cols

  # @param opts [Hash] see the class description
  # @raise [ArgumentError] if :cols is passed as nil or :codes is empty
  def initialize(opts = {})
    @cols = opts[:cols]
    @except = (opts[:except] || []).to_set
    @codes = opts[:codes] || {}
    @ignore_vals = opts[:ignore] || []
    @ignore_all = opts[:ignore_all] || false
    @error_on_unknown = !@ignore_all

    raise ArgumentError, ':cols array is empty' if @cols.nil? && opts.key?(:cols)
    @cols = @cols.to_set unless @cols.nil?
    raise ArgumentError, ':codes hash is empty' if @codes.empty?

    @acceptable_keys = (@codes.keys + @ignore_vals).to_set
    # Number of rows processed so far; only used in error messages.
    @counter = 0
  end

  # Recodes the configured columns of +row+ in place.
  #
  # @param row [Hash] column name => value
  # @return [Hash] the same row object
  # @raise [ReindeerETL::Errors::RecordInvalid] if a configured column is
  #   missing, or a value is unknown and :ignore_all is not set
  def process(row)
    # Resolved lazily from the first row when :cols was not supplied.
    @cols ||= row.keys.to_set - @except

    missing = @cols.reject { |col| row.key?(col) }
    unless missing.empty?
      raise ReindeerETL::Errors::RecordInvalid, "Missing columns: #{missing}"
    end

    @cols.each { |col| _update_row(row, col, row[col]) }

    @counter += 1
    row
  end

  private

  # Applies the lookup to one cell. This is the single place where unknown
  # values are rejected, so no separate validation pass is needed.
  def _update_row(row, col, val)
    if @acceptable_keys.include?(val)
      row[col] = @codes[val] if @codes.key?(val)
    elsif @error_on_unknown
      raise ReindeerETL::Errors::RecordInvalid,
            "Invalid value in recode: row# #{@counter} {#{col}:#{val}}"
    end
  end
end
61
+ end
@@ -0,0 +1,26 @@
1
+ module ReindeerETL::Transforms
2
##
# A simple transform that renames columns.
#
# It is built from a Hash mapping each current column name to its new name.
# Entries whose old and new names are equal are left alone.
class RenameFields
  # @param cols [Hash] old column name => new column name
  def initialize(cols)
    @cols = cols
  end

  # Renames the configured columns of +row+ in place.
  #
  # @param row [Hash] column name => value
  # @return [Hash] the same row object
  # @raise [ReindeerETL::Errors::RecordInvalid] unless every old column name
  #   is present in the row
  def process(row)
    unless @cols.each_key.all? { |old_name| row.key?(old_name) }
      raise ReindeerETL::Errors::RecordInvalid, 'Missing columns in rename'
    end

    renames = @cols.reject { |old_name, new_name| old_name == new_name }
    # Detach every old value before writing any new key. Renaming one pair at
    # a time would overwrite a column that is itself still waiting to be
    # renamed, e.g. when swapping {'a' => 'b', 'b' => 'a'}.
    detached = renames.map { |old_name, new_name| [new_name, row.delete(old_name)] }
    detached.each { |new_name, value| row[new_name] = value }
    row
  end
end
25
+
26
+ end
@@ -0,0 +1,46 @@
1
+ module ReindeerETL::Transforms
2
##
# Swap out old error codes with REP_CODE, add new columns with error codes.
#
# For every column not listed in :except, a companion column named
# REP_COL_PREFIX plus the column name without underscores is added. It holds
# "E<code>E", where <code> is the error code found in the cell, or NO_CODE
# when the cell held no error code.
class ResponseStatus
  # Cell values (compared as strings) that count as error codes.
  ERROR_CODES = %w[222 444 555 777 888 998 999].freeze
  # Status recorded when the cell holds no error code.
  NO_CODE = '111'

  # What to replace a code with if one is found
  REP_CODE = '{question_not_shown}'
  REP_COL_PREFIX = 'responseStatus_'

  # @param opts [Hash] `:except` lists columns to leave untouched
  def initialize(opts = {})
    @except_cols = (opts[:except] || []).to_set
  end

  # Rewrites +row+ in place and adds the status columns.
  #
  # @param row [Hash] column name (String) => value
  # @return [Hash] the same row object
  # @raise [ReindeerETL::Errors::RecordInvalid] if an :except column is absent
  #   from the row, or a status column name is already taken
  def process(row)
    missing = @except_cols.reject { |col| row.key?(col) }
    unless missing.empty?
      raise ReindeerETL::Errors::RecordInvalid, "Missing except keys: #{missing}"
    end

    # `row.keys` is a snapshot, so adding status columns below is safe.
    row.keys.each do |col|
      next if @except_cols.include?(col)

      status_col = "#{REP_COL_PREFIX}#{col.gsub('_', '')}"
      # Checked against the live row so that two columns collapsing to the
      # same status name (e.g. 'a_b' and 'ab') are caught as well.
      if row.key?(status_col)
        raise ReindeerETL::Errors::RecordInvalid, "Column #{status_col} already exists"
      end

      value = row[col]
      if _has_code?(value)
        row[col] = REP_CODE
        code = value.to_s
      else
        code = NO_CODE
      end
      row[status_col] = "E#{code}E"
    end

    row
  end

  private

  # True when +val+, converted to a String, is one of ERROR_CODES.
  def _has_code?(val)
    ERROR_CODES.include?(val.to_s)
  end
end
46
+ end