chicago-etl 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +16 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +21 -0
  6. data/Rakefile +42 -0
  7. data/VERSION +1 -0
  8. data/chicago-etl.gemspec +117 -0
  9. data/lib/chicago/etl/batch.rb +110 -0
  10. data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
  11. data/lib/chicago/etl/counter.rb +36 -0
  12. data/lib/chicago/etl/key_builder.rb +198 -0
  13. data/lib/chicago/etl/load_dataset_builder.rb +75 -0
  14. data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
  15. data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
  16. data/lib/chicago/etl/screens/column_screen.rb +59 -0
  17. data/lib/chicago/etl/screens/composite_screen.rb +17 -0
  18. data/lib/chicago/etl/screens/invalid_element.rb +27 -0
  19. data/lib/chicago/etl/screens/missing_value.rb +22 -0
  20. data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
  21. data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
  22. data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
  23. data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
  24. data/lib/chicago/etl/sink.rb +61 -0
  25. data/lib/chicago/etl/table_builder.rb +45 -0
  26. data/lib/chicago/etl/task_invocation.rb +32 -0
  27. data/lib/chicago/etl/tasks.rb +34 -0
  28. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
  29. data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
  30. data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
  31. data/lib/chicago/etl.rb +35 -0
  32. data/lib/chicago-etl.rb +0 -0
  33. data/spec/db_connections.yml.dist +4 -0
  34. data/spec/etl/batch_spec.rb +86 -0
  35. data/spec/etl/counter_spec.rb +44 -0
  36. data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
  37. data/spec/etl/key_builder_spec.rb +190 -0
  38. data/spec/etl/load_dataset_builder_spec.rb +86 -0
  39. data/spec/etl/mysql_dumpfile_spec.rb +42 -0
  40. data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
  41. data/spec/etl/screens/composite_screen_spec.rb +25 -0
  42. data/spec/etl/screens/invalid_element_spec.rb +27 -0
  43. data/spec/etl/screens/missing_value_spec.rb +58 -0
  44. data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
  45. data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
  46. data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
  47. data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
  48. data/spec/etl/sink_spec.rb +7 -0
  49. data/spec/etl/table_builder_spec.rb +22 -0
  50. data/spec/etl/task_spec.rb +87 -0
  51. data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
  52. data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
  53. data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
  54. data/spec/spec_helper.rb +20 -0
  55. metadata +245 -0
@@ -0,0 +1,75 @@
1
require 'set'

module Chicago
  module ETL
    # Builds a Sequel dataset for loading rows from a source database.
    #
    # A main table is declared with #table, related tables are joined
    # in via #denormalize, and individual output columns may be mapped
    # from arbitrary source columns/expressions via #provide. #build
    # then produces a dataset selecting the requested columns, each
    # qualified to the single table that defines it.
    class LoadDatasetBuilder
      # Creates a builder; an optional block is instance-evaluated via
      # #configure.
      def initialize(&block)
        @constructed_columns = {}
        @joins = []
        configure(&block) if block_given?
      end

      # Instance-evaluates the block against this builder.
      #
      # Returns self, so calls can be chained.
      def configure(&block)
        instance_eval(&block)
        self
      end

      # Declares the main table rows are loaded from. Returns self.
      def table(table_name)
        @table_name = table_name
        self
      end

      # Adds a LEFT OUTER JOIN to table on keys so its columns become
      # available for selection. Returns self.
      def denormalize(table, keys)
        @joins << [:left_outer, table, keys]
        self
      end

      # Declares that output column target_name is produced from
      # source_column, which may be a Symbol (qualified automatically
      # in #build) or any other selectable expression (used verbatim).
      # Returns self.
      def provide(target_name, source_column)
        @constructed_columns[target_name] = source_column
        self
      end

      # Builds a dataset over db selecting the given columns.
      #
      # Plain columns and Symbol-mapped columns are qualified to the
      # unique table providing them; other mapped expressions are
      # aliased as-is.
      #
      # Raises a RuntimeError if a column is ambiguous (defined by
      # more than one joined table) or not defined by any.
      def build(db, columns)
        dataset = @joins.inject(db[@table_name]) {|ds, join|
          ds.join_table(*join)
        }

        available_columns = available_columns_index(db, dataset)

        select_columns = columns.map {|name|
          if @constructed_columns[name].kind_of?(Symbol)
            qualify_column(available_columns, @constructed_columns[name]).as(name)
          elsif @constructed_columns[name]
            @constructed_columns[name].as(name)
          else
            qualify_column(available_columns, name)
          end
        }

        dataset.select(*select_columns)
      end

      private

      # Indexes column name => Set of tables (in the dataset) defining
      # that column, used to detect ambiguous/unknown columns.
      def available_columns_index(db, dataset)
        dataset.dependant_tables.inject({}) do |hsh, table|
          db[table].columns.each do |column|
            (hsh[column] ||= Set.new) << table
          end
          hsh
        end
      end

      # Qualifies name to the single table that defines it. :id falls
      # back to the main table (it is expected to exist everywhere).
      def qualify_column(available_columns, name)
        if available_columns[name] && available_columns[name].size == 1
          name.qualify(available_columns[name].first)
        elsif name == :id
          name.qualify(@table_name)
        else
          # FIX: corrected spelling in error message ("non-existant").
          raise "Column #{name} was either ambiguous or non-existent"
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
require 'chicago/etl/sink'

module Chicago
  module ETL
    # Wrapper around FasterCSV's output object, to convert values to a
    # format required by MySQL's LOAD DATA INFILE command.
    #
    # @api public
    class MysqlDumpfile < Sink
      # Creates a new writer.
      #
      # @param csv a FasterCSV output object
      # @param [Symbol] column_names columns to be output
      # @param key an optional key to ensure rows are written only once.
      def initialize(csv, column_names, key=nil)
        super(csv, column_names, key)
        @transformer = MysqlLoadFileValueTransformer.new
      end

      protected

      # Writes a row to the output.
      #
      # Each configured column is transformed to its LOAD DATA INFILE
      # representation before being appended to the CSV output.
      #
      # @param Hash row Only keys in column_names will be output.
      def write(row)
        transformed_values = @column_names.map do |column_name|
          @transformer.transform(row[column_name])
        end
        output << transformed_values
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
# FIX: Date and DateTime are referenced below but the date library was
# never required; load it explicitly so this file stands on its own.
require 'date'

module Chicago
  module ETL
    # Converts Ruby values into the textual representation expected in
    # a data file consumed by MySQL's LOAD DATA INFILE statement.
    class MysqlLoadFileValueTransformer
      # Transforms a value to be suitable for use in file in a LOAD
      # DATA INFILE mysql statement.
      #
      # nil becomes \N (MySQL NULL marker), booleans become "1"/"0",
      # times and dates are formatted as MySQL datetime/date strings,
      # and anything else is passed through unchanged.
      def transform(value)
        case value
        when nil
          "\\N"
        when true
          "1"
        when false
          "0"
        when Time, DateTime
          # DateTime must be matched before Date: DateTime < Date, so
          # the Date branch would otherwise swallow it.
          value.strftime("%Y-%m-%d %H:%M:%S")
        when Date
          value.strftime("%Y-%m-%d")
        else
          value
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module Chicago
  module ETL
    module Screens
      # Base class for data-quality screens that inspect one column of
      # a row: when the value fails the screen (#applies?), it is
      # overwritten with the column default and an error is logged.
      class ColumnScreen
        attr_reader :column, :table_name

        def initialize(table_name, column)
          @table_name = table_name
          @column = column
          # Error name is derived from the class name, e.g.
          # "MissingValue". NOTE(review): #titlecase is not core Ruby
          # (ActiveSupport/extlib) - confirm it is loaded.
          @error_name = self.class.name.split('::').last.sub(/Screen$/,'').titlecase
        end

        # Builds a CompositeScreen with one screen of this class per
        # column.
        def self.for_columns(table_name, columns)
          CompositeScreen.new(columns.map {|column| new(table_name, column) })
        end

        # Screens the column's value in row, mutating row and
        # appending to errors when the screen applies.
        #
        # Returns the [row, errors] pair so screens can be chained.
        def call(row, errors=[])
          value = row[column.database_name]

          if applies?(value)
            overwrite_value(row)
            log_error(value, errors)
          end

          [row, errors]
        end

        # Default severity; subclasses override.
        def severity
          1
        end

        private

        # Replaces the offending value with the column's default.
        def overwrite_value(row)
          row[column.database_name] = column.default_value
        end

        # Records an error entry for the offending value.
        def log_error(value, errors)
          errors << error_hash(value)
        end

        # Canonical error record; subclasses may merge extra detail.
        def error_hash(value)
          {
            :process_name => "StandardTransformations",
            :process_version => 2,
            :table => table_name.to_s,
            :column => column.database_name.to_s,
            :severity => severity,
            :error => @error_name
          }
        end

        # Abstract predicate: does this screen apply to value? The
        # base implementation never applies (returns nil).
        def applies?(value)
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Chicago
  module ETL
    module Screens
      # Chains several screens: each screen's resulting row and error
      # list are fed into the next screen in turn.
      class CompositeScreen
        # Accepts screens as individual arguments or (nested) arrays;
        # everything is flattened into one ordered list.
        def initialize(*screens)
          @screens = screens.flatten
        end

        # Threads row and errors through every screen and returns the
        # final [row, errors] pair.
        def call(row, errors=[])
          result = [row, errors]
          @screens.each do |screen|
            result = screen.call(*result)
          end
          result
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Chicago
  module ETL
    module Screens
      # Screens values that fall outside a column's declared set of
      # valid elements (compared case-insensitively as strings).
      class InvalidElement < ColumnScreen
        # Builds a CompositeScreen covering only those columns that
        # actually declare an element list.
        def self.for_columns(table_name, columns)
          screens = columns.
            select(&:elements).
            map {|column| new(table_name, column) }
          CompositeScreen.new(screens)
        end

        def severity
          3
        end

        # Applies when the column restricts its values and the given
        # value (stringified, lowercased) is not among them.
        def applies?(value)
          column.elements &&
            !column.elements.map(&:downcase).include?(value.to_s.downcase)
        end

        # Adds the offending value to the standard error record.
        def error_hash(value)
          detail = "'#{value}' is not a valid value."
          super(value).merge(:error_detail => detail)
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Chicago
  module ETL
    module Screens
      # Screens values that are missing: nil, or blank for string
      # columns. The value is replaced with the column default; an
      # error is only reported for columns where absence matters.
      class MissingValue < ColumnScreen
        # Missing descriptive columns are low severity (1); anything
        # else is more serious (2).
        def severity
          column.descriptive? ? 1 : 2
        end

        # Boolean and optional columns are corrected silently - no
        # error record is produced for them.
        def log_error(value, errors)
          return if column.column_type == :boolean || column.optional?
          errors << error_hash(value)
        end

        # Applies to nil, and to blank strings on string columns.
        # NOTE(review): #blank? is ActiveSupport, not core Ruby -
        # confirm it is loaded.
        def applies?(value)
          value.nil? ||
            (column.column_type == :string && value.blank?)
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module Chicago
  module ETL
    module Screens
      # Screens numeric values outside the column's [min, max] bounds,
      # and strings whose length is outside those bounds. Offending
      # values are reported but deliberately left in place.
      class OutOfBounds < ColumnScreen
        def severity
          2
        end

        # Applies to truthy values only; numeric columns are checked
        # by value, string columns by length.
        def applies?(value)
          return false unless value

          (column.numeric? && applies_to_numeric?(value)) ||
            (column.column_type == :string && applies_to_string?(value))
        end

        # Intentionally a no-op: out-of-bounds values are kept as-is,
        # unlike the base class which overwrites with the default.
        def overwrite_value(row)
        end

        private

        # True when the value violates a declared numeric bound.
        def applies_to_numeric?(value)
          below_minimum = column.min && value < column.min
          above_maximum = column.max && value > column.max
          below_minimum || above_maximum
        end

        # True when the string's length violates a declared bound.
        def applies_to_string?(value)
          too_short = column.min && value.length < column.min
          too_long = column.max && value.length > column.max
          too_short || too_long
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
require 'sequel'

module Chicago
  module ETL
    module SequelExtensions
      # Mixin for Sequel::Dataset that reports the tables a dataset
      # reads from, by inspecting its FROM clause, JOINs and compound
      # (UNION/INTERSECT/EXCEPT) datasets.
      module DependantTables
        # Returns an Array of table names used in this dataset.
        #
        # Handles joins and if the recurse flag is true, unions and
        # nested datasets.
        def dependant_tables(recurse=true)
          # Only the first FROM source is inspected; a dataset with
          # multiple FROM tables would be under-reported here.
          tables = extract_dependant_tables_in_clause(opts[:from].first, recurse)

          # opts[:compounds] holds [type, dataset, all?] triples for
          # UNION/INTERSECT/EXCEPT clauses.
          # NOTE(review): recurse is not forwarded into the compound
          # datasets, so they always recurse regardless of the flag -
          # confirm this asymmetry is intended.
          if opts[:compounds]
            tables += opts[:compounds].map {|(_, dataset, _)|
              dataset.dependant_tables
            }
          end

          if opts[:join]
            tables += opts[:join].map {|join|
              extract_dependant_tables_in_clause(join.table, recurse)
            }
          end

          # Nested results are flattened and de-duplicated.
          tables.flatten.uniq
        end

        private

        # Resolves a single FROM/JOIN clause entry to table names:
        # a Symbol is a table name; an aliased expression is unwrapped
        # and re-examined; a nested dataset recurses (when allowed);
        # anything else contributes no tables.
        def extract_dependant_tables_in_clause(clause, recurse)
          case clause
          when Symbol
            [clause]
          when Sequel::SQL::AliasedExpression
            extract_dependant_tables_in_clause(clause.expression, recurse)
          when Sequel::Dataset
            recurse ? clause.dependant_tables : []
          else
            []
          end
        end
      end
    end
  end
end

Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::DependantTables
@@ -0,0 +1,53 @@
1
module Chicago
  module ETL
    module SequelExtensions
      # Mixin for Sequel::Dataset that narrows a dataset to the rows
      # belonging to a given ETL batch, wherever an etl_batch_id
      # column is available on a FROM/JOIN source.
      module FilterToEtlBatch
        # Returns a dataset filtered to etl_batch: an etl_batch_id
        # condition is built for every FROM table and JOIN that has
        # such a column, and the conditions are OR-ed together.
        # Compound (UNION etc.) datasets are filtered recursively.
        def filter_to_etl_batch(etl_batch)
          conditions = (opts[:from] + (opts[:join] || [])).
            select {|e| has_etl_batch_column?(e) }.
            map {|e| make_etl_batch_filter(e, etl_batch) }

          # With no eligible sources, fall back to a plain copy.
          ds = conditions.any? ? filter(conditions.inject {|a,b| a | b}) : dup

          # NOTE(review): the compounds array is mutated in place; if
          # dup shares opts structure with the receiver this could
          # leak into the original dataset - verify against Sequel's
          # clone/opts semantics.
          if ds.opts[:compounds]
            ds.opts[:compounds].each do |compound|
              compound[1] = compound[1].filter_to_etl_batch(etl_batch)
            end
          end

          ds
        end

        private

        # Builds a {qualified etl_batch_id => batch id} filter hash,
        # qualifying the column to the table - or its alias, which is
        # what the generated SQL must reference.
        def make_etl_batch_filter(expression, etl_batch)
          table = case expression
          when Sequel::SQL::AliasedExpression
            expression.aliaz
          when Sequel::SQL::JoinClause
            expression.table_alias || expression.table
          else
            expression
          end

          {:etl_batch_id.qualify(table) => etl_batch.id}
        end

        # True if the table behind expression has an etl_batch_id
        # column: aliases and join clauses are unwrapped, plain
        # symbols are checked against the database schema.
        def has_etl_batch_column?(expression)
          case expression
          when Sequel::SQL::AliasedExpression
            has_etl_batch_column?(expression.expression)
          when Symbol
            db.schema(expression).map(&:first).include?(:etl_batch_id)
          when Sequel::SQL::JoinClause
            has_etl_batch_column?(expression.table)
          else
            false
          end
        end
      end
    end
  end
end

Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::FilterToEtlBatch
@@ -0,0 +1,19 @@
1
module Chicago
  module ETL
    module SequelExtensions
      # Adds MySQL LOAD DATA INFILE support to Sequel datasets.
      module LoadDataInfile
        # Loads the CSV data columns in filepath into this dataset's table.
        def load_csv_infile(filepath, columns)
          execute_dui(load_csv_infile_sql(filepath, columns))
        end

        # Builds the LOAD DATA INFILE statement for filepath/columns.
        #
        # Honours the dataset's :insert_ignore option: IGNORE skips
        # duplicate-key rows, REPLACE (the default) overwrites them.
        def load_csv_infile_sql(filepath, columns)
          duplicate_handling = opts[:insert_ignore] ? "IGNORE" : "REPLACE"
          column_list = "`#{columns.join('`,`')}`"
          "LOAD DATA INFILE '#{filepath}' #{duplicate_handling} INTO TABLE `#{opts[:from].first}` CHARACTER SET 'utf8' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\"' (#{column_list});"
        end
      end
    end
  end
end

Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::LoadDataInfile
@@ -0,0 +1,61 @@
1
require 'set'

module Chicago
  module ETL
    # An end point to write rows.
    #
    # @abstract
    # @api public
    class Sink
      # Returns the column names expected to be written to this sink.
      # @api public
      attr_reader :column_names

      # @abstract
      # @param output the destination rows are written to
      # @param column_names the columns expected in each row
      # @param unique_row_key optional key used to suppress duplicates
      def initialize(output, column_names, unique_row_key=nil)
        @output = output
        @column_names = column_names
        @written_rows = Set.new
        @unique_row_key = unique_row_key
      end

      # Writes a row to the output.
      #
      # Row will not be written to the output if it has already been
      # written, as identified by the unique row key.
      #
      # Should not be overridden by subclasses - overwrite write instead.
      def <<(row)
        return if written?(row)
        write row
        @written_rows << row[@unique_row_key]
      end

      # Flushes any remaining writes to the output.
      #
      # By default does nothing, subclasses should override where
      # necessary.
      def flush
      end

      # Returns true if this row has previously been written to the
      # output.
      #
      # Always returns false if no key to determine row uniqueness has
      # been provided.
      def written?(row)
        !@unique_row_key.nil? && @written_rows.include?(row[@unique_row_key])
      end

      protected

      attr_reader :output

      # @abstract Subclasses implement the actual write here.
      def write(row)
      end
    end
  end
end