reindeer-etl 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,112 @@
1
+ [![Gem Version](https://badge.fury.io/rb/reindeer-etl.svg)](http://badge.fury.io/rb/reindeer-etl)
2
+ [![License](https://img.shields.io/badge/license-GPL-blue.svg)](License.md)
3
+ ![Downloads](https://img.shields.io/gem/dt/reindeer-etl.svg)
4
+
5
+ # ReindeerETL
6
+
7
+ Sources, Transforms and Destinations for the [Kiba](https://github.com/thbar/kiba) ETL gem
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'reindeer-etl'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install reindeer-etl
24
+
25
+ ## Usage
26
+
27
+ ### Simple Example
28
+
29
+ If you have a csv file that looks like this:
30
+
31
+ input.csv
32
+
33
+ ```
34
+ a,b,c
35
+ 1,2,3
36
+ 4,5,6
37
+ ```
38
+
39
+ In your kiba ETL script you can now do this:
40
+
41
+
42
+ ```ruby
43
+ require 'reindeer-etl'
44
+
45
+ only_fields = ['a', 'b']
46
+
47
+ # Open a csv file
48
+ source(ReindeerETL::Sources::CSVSource, './input.csv',
49
+ require: only_fields, only: only_fields)
50
+
51
+ # rename a column
52
+ transform(ReindeerETL::Transforms::RenameFields, {'b'=>'c'})
53
+
54
+ # Recode all 777 values as 888
55
+ transform ReindeerETL::Transforms::Recode, cols: ['a'],
56
+ codes: {'777'=>'888'}, ignore_all: true
57
+
58
+ # Write the file to disk
59
+ destination ReindeerETL::Destinations::CSVDest, './output.csv'
60
+ ```
61
+
62
+ ### Joining data from multiple sources
63
+
64
+ A slightly more complex example is where you have data in multiple CSV files and
65
+ you would like to join that information into a single ETL job.
66
+
67
+ input1.csv
68
+ ```
69
+ a,b,c
70
+ 1,2,3
71
+ 4,5,6
72
+ ```
73
+
74
+ input2.csv
75
+ ```
76
+ a,e,f
77
+ 1,7,8
78
+ 4,10,11
79
+ ```
80
+
81
+ reindeer.etl
82
+ ```ruby
83
+ # Open both csv files and join their rows on the shared 'a' column
84
+ source(ReindeerETL::Sources::MultiSource, 'a', ['./input1.csv', './input2.csv'])
85
+
86
+ # Write the file to disk
87
+ destination ReindeerETL::Destinations::CSVDest, './output.csv'
88
+
89
+ ```
90
+
91
+ output.csv
92
+ ```
93
+ a,b,c,e,f
94
+ 1,2,3,7,8
95
+ 4,5,6,10,11
96
+ ```
97
+
98
+ ### More examples coming soon
99
+
100
+ ## Development
101
+
102
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
103
+
104
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
105
+
106
+ ## Contributing
107
+
108
+ 1. Fork it ( https://github.com/[my-github-username]/reindeer-etl/fork )
109
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
110
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
111
+ 4. Push to the branch (`git push origin my-new-feature`)
112
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
# Rake tasks for the gem: the tasks loaded from bundler/gem_tasks plus a
# test task that is also the default.
require 'bundler/gem_tasks'
require 'rake/testtask'

# Test task: puts the library and test directories on the load path and runs
# the *_test.rb files found directly under test/ and test/lib/.
Rake::TestTask.new do |task|
  task.libs.push('lib/reindeer-etl', 'test')
  task.test_files = FileList['test/*_test.rb', 'test/lib/*_test.rb']
  task.verbose = true
end

# A bare `rake` runs the test task.
task default: :test
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby

# Interactive console with the gem already loaded.
require 'bundler/setup'
require 'reindeer-etl'

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier. A different console can be used as well.

# Pry is the console started below; it has to be listed in the Gemfile.
require 'pry'
Pry.config.color = true
Pry.start

# To use IRB instead of Pry:
# require "irb"
# IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
#!/bin/bash
# Development setup script: installs the gem's dependencies with Bundler.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only.
IFS=$'\n\t'

bundle install

# Any other automated setup steps belong here.
data/etl/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *
2
+ !.gitignore
@@ -0,0 +1,31 @@
1
# Root namespace of the gem. The three nested modules are declared empty here
# so that the files required afterwards can reopen them.
module ReindeerETL
  # Namespace for row sources.
  module Sources; end

  # Namespace for row transforms.
  module Transforms; end

  # Namespace for row destinations.
  module Destinations; end
end
14
+
15
+ require 'rest-client'
16
+ require 'pp'
17
+
18
+ require 'reindeer-etl/version'
19
+ require 'reindeer-etl/errors'
20
+
21
+ require 'reindeer-etl/transforms/simple_transforms'
22
+ require 'reindeer-etl/transforms/recode'
23
+ require 'reindeer-etl/transforms/rename_fields'
24
+ require 'reindeer-etl/transforms/response_status'
25
+
26
+ require 'reindeer-etl/sources/base_source'
27
+ require 'reindeer-etl/sources/csv_source'
28
+ require 'reindeer-etl/sources/multi_source'
29
+
30
+ require 'reindeer-etl/destinations/csv_dest'
31
+ require 'reindeer-etl/destinations/lime_survey_curl'
@@ -0,0 +1,21 @@
1
+ require 'csv'
2
+
3
module ReindeerETL
  module Destinations
    ##
    # Destination that writes every row hash it receives as one line of a CSV
    # file. The header line is taken from the keys of the first row written;
    # each row then contributes its values in the row's own key order.
    class CSVDest
      # @param output_file [String] path of the file to write (opened in 'w' mode)
      # @param delimiter [String] column separator
      def initialize(output_file, delimiter = ',')
        # The separator must be passed as a keyword: CSV.open takes its
        # options as keyword arguments, and a positional options hash raises
        # ArgumentError on Ruby 3 and later.
        @csv = CSV.open(output_file, 'w', col_sep: delimiter)
        @headers_written = false
      end

      # Appends one row, preceded by the header line on the first call.
      #
      # @param row [Hash] column name => value
      def write(row)
        unless @headers_written
          @headers_written = true
          @csv << row.keys
        end
        @csv << row.values
      end

      # Closes the underlying file.
      def close
        @csv.close
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module ReindeerETL::Errors
  # Error raised by the sources and transforms of this gem when a row or its
  # set of columns fails a check (duplicate, missing or unjoinable columns,
  # unexpected values).
  RecordInvalid = Class.new(StandardError)
end
@@ -0,0 +1,13 @@
1
+ require 'set'
2
+
3
module ReindeerETL::Sources
  ##
  # Base class for sources that read from a single path. It mixes in
  # SimpleTransforms and initialises it with the options it was given.
  class BaseSource
    include ReindeerETL::Transforms::SimpleTransforms

    # @param path  location of the data; stored in @path for subclasses
    # @param opts [Hash] options handed unchanged to st_initialize
    #   (presumably provided by SimpleTransforms, which is defined in another
    #   file -- confirm there which keys it understands)
    def initialize(path, opts = {})
      @path = path
      st_initialize(opts)
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'csv'
2
+
3
module ReindeerETL::Sources
  ##
  # Source that reads a CSV file and yields each data row as a hash, after
  # running it through simple_transforms.
  class CSVSource < BaseSource
    # @param path [String] path of the CSV file
    # @param opts [Hash] passed to BaseSource and also merged over the default
    #   CSV options ({headers: true, col_sep: ','}).
    #   NOTE(review): the same hash feeds both st_initialize and CSV; any key
    #   CSV does not know will be rejected by CSV unless st_initialize removes
    #   it from the hash first -- confirm in SimpleTransforms.
    def initialize(path, opts = {})
      super
      @csv_opts = { headers: true, col_sep: ',' }.merge(opts)
    end

    # Yields every row of the file as a hash. Without a block an Enumerator
    # over the same rows is returned.
    #
    # @raise [ReindeerETL::Errors::RecordInvalid] when the header line
    #   contains the same column name more than once
    def each
      return enum_for(:each) unless block_given?

      headers_checked = false
      # Options are splatted as keywords: CSV.foreach's second positional
      # parameter is the file mode, so a positional hash breaks on Ruby 3+.
      CSV.foreach(@path, **@csv_opts) do |row|
        # Headers are the same for every row, so checking the first is enough.
        unless headers_checked
          headers_checked = true
          if row.headers.count != row.headers.uniq.count
            raise ReindeerETL::Errors::RecordInvalid.new('Duplicate columns')
          end
        end
        record = row.to_hash
        simple_transforms(record)
        yield(record)
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'set'
2
+
3
module ReindeerETL
  module Sources
    ##
    # Source that joins the rows of several underlying sources on a shared
    # key column. The rows of the first source form the result set; each row
    # of every later source is merged into the first result row that has the
    # same key value.
    class MultiSource
      # @param key   name of the column every source must contain
      # @param paths [Array] one entry per underlying source, each passed to
      #   the source class constructor
      # @param opts [Hash] :klass selects the source class used for every
      #   path (defaults to ReindeerETL::Sources::CSVSource)
      def initialize(key, paths, opts = {})
        @klass = opts[:klass] || ReindeerETL::Sources::CSVSource
        @key = key
        @sources = paths.map { |path| @klass.new(path) }
      end

      # Reads all sources completely, joins them and then yields every joined
      # row. Columns seen in the first row of any source but absent from a
      # joined row are added to it with a nil value. Without a block an
      # Enumerator over the joined rows is returned.
      #
      # @raise [ReindeerETL::Errors::RecordInvalid] when the first row of a
      #   source lacks the key column, or when a row of a later source has a
      #   key value that no row of the first source has
      def each
        return enum_for(:each) unless block_given?

        rows = []
        # key value => position in rows of the first row carrying that value.
        # Lookup is by hash equality, which matches == for the string values
        # a CSV source produces.
        positions = {}
        all_keys = Set.new

        @sources.each_with_index do |source, source_idx|
          idx = 0
          source.each do |row|
            if idx.zero?
              all_keys.merge(row.keys)
              unless row.key?(@key)
                # Sources are numbered from 1 in this message.
                raise ReindeerETL::Errors::RecordInvalid.new("Path##{source_idx + 1} missing key: #{@key}")
              end
            end

            if source_idx.zero?
              positions[row[@key]] = rows.size unless positions.key?(row[@key])
              rows.push(row)
            else
              target = positions[row[@key]]
              if target.nil?
                raise ReindeerETL::Errors::RecordInvalid.new("Unable to Join source##{source_idx} - row##{idx}")
              end
              rows[target] = rows[target].merge(row)
            end
            idx += 1
          end
        end

        rows.each do |row|
          (all_keys - row.keys).each { |k| row[k] = nil }
          yield row
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
require 'set'

module ReindeerETL
  module Transforms
    ##
    # Transform that replaces values in selected columns according to a
    # lookup table.
    #
    # Options:
    # [:codes]      Hash of old value => new value (required, non-empty)
    # [:cols]       columns to recode; when the option is not given, the
    #               columns are taken from the first processed row, minus
    #               :except
    # [:except]     columns to leave out when :cols is not given
    # [:ignore]     values that are accepted and left unchanged
    # [:ignore_all] when true, every value without a code is left unchanged;
    #               otherwise such a value (unless listed in :ignore) raises
    class Recode
      attr_accessor :cols

      # @raise [ArgumentError] when :cols is given but nil or empty, or when
      #   :codes is missing or empty
      def initialize(opts = {})
        @cols = opts[:cols]
        @except = (opts[:except] || []).to_set
        @codes = opts[:codes] || {}
        @ignore_vals = (opts[:ignore] || [])
        @ignore_all = (opts[:ignore_all] || false)
        @error_on_unknown = !@ignore_all

        # An explicitly supplied :cols must name at least one column.
        if opts.key?(:cols) && (@cols.nil? || @cols.empty?)
          raise ArgumentError.new(':cols array is empty')
        end
        @cols = @cols.to_set unless @cols.nil?
        raise ArgumentError.new(':codes hash is empty') if @codes.empty?
        @acceptable_keys = (@codes.keys + @ignore_vals).to_set
        # Number of rows processed successfully so far; used in error messages.
        @counter = 0
      end

      # Recodes the configured columns of +row+ in place and returns the row.
      #
      # @raise [ReindeerETL::Errors::RecordInvalid] when a configured column
      #   is absent from the row, or when a value is neither coded nor
      #   ignored and :ignore_all is off
      def process(row)
        # Without explicit :cols the first row decides which columns are used.
        @cols ||= row.keys.to_set - @except

        rset = row.keys.to_set
        unless @cols.subset?(rset)
          m_cols = @cols - rset
          raise ReindeerETL::Errors::RecordInvalid.new("Missing columns: #{m_cols.to_a}")
        end

        @cols.each { |col| _update_row(row, col, row[col]) }

        @counter += 1
        row
      end

      private

      # Replaces the value of one column, or raises when the value is unknown
      # and unknown values are not tolerated.
      def _update_row(row, col, val)
        if @codes.key?(val)
          row[col] = @codes[val]
        elsif @error_on_unknown && !@acceptable_keys.include?(val)
          raise ReindeerETL::Errors::RecordInvalid.new("Invalid value in recode: row# #{@counter} {#{col}:#{val}}")
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module ReindeerETL
  module Transforms
    ##
    # A simple transform that renames columns
    class RenameFields
      # @param cols [Hash] old column name => new column name
      def initialize(cols)
        @cols = cols
      end

      # Renames the configured columns of +row+ in place and returns the row.
      # Entries that map a name to itself are left untouched.
      #
      # @raise [ReindeerETL::Errors::RecordInvalid] unless every old column
      #   name is present in the row
      def process(row)
        unless @cols.each_key.all? { |old_name| row.key?(old_name) }
          raise ReindeerETL::Errors::RecordInvalid.new('Missing columns in rename')
        end

        # Take every old value out of the row before writing any new key, so
        # that chained or swapped renames ({'a'=>'b', 'b'=>'c'}) do not
        # overwrite a value that still has to be moved.
        moved = []
        @cols.each do |old_name, new_name|
          next if old_name == new_name
          moved << [new_name, row.delete(old_name)]
        end
        moved.each { |new_name, value| row[new_name] = value }
        row
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
require 'set'

module ReindeerETL
  module Transforms
    ##
    # Swap out old error codes with REP_CODE, add new columns with error codes
    #
    # For every column not listed in :except, a companion column named
    # REP_COL_PREFIX + the column name without underscores is added. It holds
    # "E<code>E", where <code> is the error code found in the column, or
    # NO_CODE when the value is not one of ERROR_CODES. A value that is an
    # error code is itself replaced with REP_CODE.
    class ResponseStatus
      ERROR_CODES = %w{222 444 555 777 888 998 999}.freeze
      NO_CODE = '111'

      # What to replace a code with if one is found
      REP_CODE = '{question_not_shown}'
      REP_COL_PREFIX = 'responseStatus_'

      # @param opts [Hash] :except lists the columns to leave untouched
      def initialize(opts = {})
        @except_cols = (opts[:except] || []).to_set
      end

      # Rewrites +row+ in place as described on the class and returns it.
      #
      # @raise [ReindeerETL::Errors::RecordInvalid] when an :except column is
      #   absent from the row, or when a companion column name is already
      #   taken -- by an original column or by the companion of another column
      def process(row)
        row_keys = row.keys.to_set
        unless @except_cols.subset?(row_keys)
          x_cols = (@except_cols - row_keys).to_a
          raise ReindeerETL::Errors::RecordInvalid.new("Missing except keys: #{x_cols}")
        end

        # Iterates over a snapshot of the keys, so adding columns is safe.
        (row_keys - @except_cols).each do |k|
          new_col = "#{REP_COL_PREFIX}#{k.to_s.delete('_')}"
          # Checked against the live row so that two columns whose names only
          # differ by underscores cannot overwrite each other's companion.
          if row.key?(new_col)
            raise ReindeerETL::Errors::RecordInvalid.new("Column #{new_col} already exists")
          end

          val = row[k]
          if _has_code?(val)
            row[k] = REP_CODE
            ecode = val.to_s
          else
            ecode = NO_CODE
          end
          row[new_col] = "E#{ecode}E"
        end

        row
      end

      private

      # True when the string form of +val+ is one of ERROR_CODES.
      def _has_code?(val)
        ERROR_CODES.include?(val.to_s)
      end
    end
  end
end