chicago-etl 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +16 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +21 -0
- data/Rakefile +42 -0
- data/VERSION +1 -0
- data/chicago-etl.gemspec +117 -0
- data/lib/chicago/etl/batch.rb +110 -0
- data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
- data/lib/chicago/etl/counter.rb +36 -0
- data/lib/chicago/etl/key_builder.rb +198 -0
- data/lib/chicago/etl/load_dataset_builder.rb +75 -0
- data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
- data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
- data/lib/chicago/etl/screens/column_screen.rb +59 -0
- data/lib/chicago/etl/screens/composite_screen.rb +17 -0
- data/lib/chicago/etl/screens/invalid_element.rb +27 -0
- data/lib/chicago/etl/screens/missing_value.rb +22 -0
- data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
- data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
- data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
- data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
- data/lib/chicago/etl/sink.rb +61 -0
- data/lib/chicago/etl/table_builder.rb +45 -0
- data/lib/chicago/etl/task_invocation.rb +32 -0
- data/lib/chicago/etl/tasks.rb +34 -0
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
- data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
- data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
- data/lib/chicago/etl.rb +35 -0
- data/lib/chicago-etl.rb +0 -0
- data/spec/db_connections.yml.dist +4 -0
- data/spec/etl/batch_spec.rb +86 -0
- data/spec/etl/counter_spec.rb +44 -0
- data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
- data/spec/etl/key_builder_spec.rb +190 -0
- data/spec/etl/load_dataset_builder_spec.rb +86 -0
- data/spec/etl/mysql_dumpfile_spec.rb +42 -0
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
- data/spec/etl/screens/composite_screen_spec.rb +25 -0
- data/spec/etl/screens/invalid_element_spec.rb +27 -0
- data/spec/etl/screens/missing_value_spec.rb +58 -0
- data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
- data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
- data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
- data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
- data/spec/etl/sink_spec.rb +7 -0
- data/spec/etl/table_builder_spec.rb +22 -0
- data/spec/etl/task_spec.rb +87 -0
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
- data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
- data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
- data/spec/spec_helper.rb +20 -0
- metadata +245 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Chicago
  module ETL
    # Builds a Sequel dataset for extracting rows from a source table,
    # optionally denormalizing via left outer joins and mapping output
    # columns to explicit source columns or expressions.
    class LoadDatasetBuilder
      # Creates a builder.
      #
      # The optional block is instance_eval'd, so the DSL methods
      # (#table, #denormalize, #provide) can be called bare inside it.
      def initialize(&block)
        @constructed_columns = {}
        @joins = []
        configure(&block) if block_given?
      end

      # Configures this builder via an instance_eval'd DSL block.
      #
      # @return [LoadDatasetBuilder] self, for chaining.
      def configure(&block)
        instance_eval(&block)
        self
      end

      # Sets the main table rows are selected from.
      #
      # @return [LoadDatasetBuilder] self, for chaining.
      def table(table_name)
        @table_name = table_name
        self
      end

      # Adds a left outer join to +table+ on +keys+, making its columns
      # available for selection.
      #
      # @return [LoadDatasetBuilder] self, for chaining.
      def denormalize(table, keys)
        @joins << [:left_outer, table, keys]
        self
      end

      # Maps +target_name+ in the output to +source_column+, which may
      # be a Symbol (qualified automatically when building) or any other
      # Sequel expression (used as-is).
      #
      # @return [LoadDatasetBuilder] self, for chaining.
      def provide(target_name, source_column)
        @constructed_columns[target_name] = source_column
        self
      end

      # Builds a dataset over +db+ selecting +columns+.
      #
      # @param db a Sequel::Database
      # @param [Array<Symbol>] columns names of columns to select
      # @raise [RuntimeError] if a plain column cannot be resolved to
      #   exactly one table
      def build(db, columns)
        dataset = @joins.inject(db[@table_name]) {|ds, join|
          ds.join_table(*join)
        }

        available_columns = available_columns_index(db, dataset)

        select_columns = columns.map {|name|
          if @constructed_columns[name].kind_of?(Symbol)
            qualify_column(available_columns, @constructed_columns[name]).as(name)
          elsif @constructed_columns[name]
            @constructed_columns[name].as(name)
          else
            qualify_column(available_columns, name)
          end
        }

        dataset.select(*select_columns)
      end

      private

      # Maps each column name to the Set of tables in the dataset that
      # define it, so ambiguous names can be detected.
      def available_columns_index(db, dataset)
        dataset.dependant_tables.inject({}) do |hsh, table|
          db[table].columns.each do |column|
            (hsh[column] ||= Set.new) << table
          end
          hsh
        end
      end

      # Qualifies +name+ with its owning table. :id falls back to the
      # main table, since joined tables are likely to define it too.
      def qualify_column(available_columns, name)
        if available_columns[name] && available_columns[name].size == 1
          name.qualify(available_columns[name].first)
        elsif name == :id
          name.qualify(@table_name)
        else
          # Fixed typo in the original message: "non-existant".
          raise "Column #{name} was either ambiguous or non-existent"
        end
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'chicago/etl/sink'
|
2
|
+
|
3
|
+
module Chicago
  module ETL
    # Wrapper around FasterCSV's output object, to convert values to a
    # format required by MySQL's LOAD DATA INFILE command.
    #
    # @api public
    class MysqlDumpfile < Sink
      # Creates a new writer.
      #
      # @param csv a FasterCSV output object
      # @param [Symbol] column_names columns to be output
      # @param key an optional key to ensure rows are written only once.
      def initialize(csv, column_names, key=nil)
        super(csv, column_names, key)
        @transformer = MysqlLoadFileValueTransformer.new
      end

      protected

      # Writes a row to the output.
      #
      # Only keys listed in column_names are output; each value is
      # transformed into its LOAD DATA INFILE representation first.
      def write(row)
        transformed = @column_names.map do |name|
          @transformer.transform(row[name])
        end
        output << transformed
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Chicago
  module ETL
    # Converts Ruby values into the textual representation expected by
    # MySQL's LOAD DATA INFILE statement.
    class MysqlLoadFileValueTransformer
      # Transforms a value to be suitable for use in a file for a LOAD
      # DATA INFILE mysql statement.
      #
      # nil becomes \N, booleans become "1"/"0", times and dates are
      # formatted as MySQL datetime/date strings; any other value is
      # passed through untouched.
      def transform(value)
        if value.nil?
          "\\N"
        elsif value == true
          "1"
        elsif value == false
          "0"
        elsif value.kind_of?(Time) || value.kind_of?(DateTime)
          # DateTime must be checked before Date: DateTime < Date.
          value.strftime("%Y-%m-%d %H:%M:%S")
        elsif value.kind_of?(Date)
          value.strftime("%Y-%m-%d")
        else
          value
        end
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Chicago
  module ETL
    module Screens
      # Base class for data-quality screens that inspect one column of a
      # row, overwrite offending values and record an error hash
      # describing the problem.
      class ColumnScreen
        attr_reader :column, :table_name

        def initialize(table_name, column)
          @table_name = table_name
          @column = column
          # Human-readable error name derived from the class name, e.g.
          # Screens::MissingValue => "Missing Value".
          # NOTE(review): String#titlecase is not core Ruby — presumably
          # provided by an ActiveSupport-style extension; confirm.
          @error_name = self.class.name.split('::').last.sub(/Screen$/, '').titlecase
        end

        # Returns a CompositeScreen wrapping one screen of this class
        # per column.
        def self.for_columns(table_name, columns)
          CompositeScreen.new(columns.map {|column| new(table_name, column) })
        end

        # Screens the column's value in +row+. When the screen applies,
        # the value is overwritten and an error is logged.
        #
        # @return [Array] the (possibly modified) row and error list.
        def call(row, errors=[])
          value = row[column.database_name]

          if applies?(value)
            overwrite_value(row)
            log_error(value, errors)
          end

          [row, errors]
        end

        # Severity of this screen's errors; subclasses may override.
        def severity
          1
        end

        private

        # Replaces the offending value with the column's default.
        def overwrite_value(row)
          row[column.database_name] = column.default_value
        end

        def log_error(value, errors)
          errors << error_hash(value)
        end

        # The error record appended when this screen fires.
        def error_hash(value)
          {
            :process_name => "StandardTransformations",
            :process_version => 2,
            :table => table_name.to_s,
            :column => column.database_name.to_s,
            :severity => severity,
            :error => @error_name
          }
        end

        # Whether this screen applies to +value+. Abstract: the base
        # implementation returns nil, i.e. never applies.
        def applies?(value)
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Chicago
  module ETL
    module Screens
      # Chains several screens together: each screen's output row and
      # error list feed the next screen's input.
      class CompositeScreen
        # @param screens any number of screens (or arrays of screens),
        #   each responding to call(row, errors) and returning
        #   [row, errors].
        def initialize(*screens)
          @screens = screens.flatten
        end

        # Invokes each screen in turn, threading the row and error list
        # through the chain.
        #
        # @return [Array] the final row and accumulated errors.
        def call(row, errors=[])
          result = [row, errors]
          @screens.each do |screen|
            result = screen.call(result[0], result[1])
          end
          result
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Chicago
  module ETL
    module Screens
      # Screens values against a column's fixed list of permitted
      # elements, logging an error when the value isn't one of them.
      class InvalidElement < ColumnScreen
        # Builds a CompositeScreen covering only the columns that
        # declare an elements list.
        def self.for_columns(table_name, columns)
          screens = columns.select(&:elements).map {|column|
            new(table_name, column)
          }
          CompositeScreen.new(screens)
        end

        def severity
          3
        end

        # True when the column restricts values to a list of elements
        # and the value isn't in that list (case-insensitively).
        def applies?(value)
          return false unless column.elements
          permitted = column.elements.map(&:downcase)
          !permitted.include?(value.to_s.downcase)
        end

        def error_hash(value)
          super(value).merge(:error_detail => "'#{value}' is not a valid value.")
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Chicago
  module ETL
    module Screens
      # Screens for missing values: nil, or blank strings for string
      # columns.
      class MissingValue < ColumnScreen
        # Missing values in descriptive columns are less critical.
        def severity
          column.descriptive? ? 1 : 2
        end

        # Booleans and optional columns may legitimately be missing, so
        # no error is recorded for them (the value is still replaced by
        # the column default via overwrite_value).
        def log_error(value, errors)
          return if column.column_type == :boolean || column.optional?
          errors << error_hash(value)
        end

        # True when the value is nil, or blank for a string column.
        # NOTE(review): String#blank? looks like an ActiveSupport-style
        # extension — confirm it is loaded.
        def applies?(value)
          return true if value.nil?
          column.column_type == :string && value.blank?
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Chicago
  module ETL
    module Screens
      # Screens numeric and string values that fall outside the
      # column's declared min/max bounds (string bounds apply to
      # length).
      class OutOfBounds < ColumnScreen
        def severity
          2
        end

        # True for non-nil values that violate the column's bounds:
        # numeric columns compare the value itself, string columns
        # compare its length.
        def applies?(value)
          return false unless value

          (column.numeric? && out_of_bounds?(value)) ||
            (column.column_type == :string && out_of_bounds?(value.length))
        end

        # Out-of-bounds values are reported but deliberately left
        # intact, so this is a no-op override.
        def overwrite_value(row)
        end

        private

        # Shared bound check: true when +number+ is below min or above
        # max (either bound may be absent).
        def out_of_bounds?(number)
          (column.min && number < column.min) ||
            (column.max && number > column.max)
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
|
3
|
+
module Chicago
  module ETL
    module SequelExtensions
      # Adds Sequel::Dataset#dependant_tables, which inspects a
      # dataset's FROM clause, joins and compound (e.g. UNION) datasets
      # to list the tables it reads from.
      module DependantTables
        # Returns an Array of table names used in this dataset.
        #
        # Handles joins and, if the recurse flag is true, unions and
        # nested datasets.
        def dependant_tables(recurse=true)
          tables = tables_in_clause(opts[:from].first, recurse)

          Array(opts[:compounds]).each do |(_, dataset, _)|
            tables += dataset.dependant_tables
          end

          Array(opts[:join]).each do |join|
            tables += tables_in_clause(join.table, recurse)
          end

          tables.flatten.uniq
        end

        private

        # Extracts table names from a single FROM/JOIN clause element:
        # a bare Symbol names a table, aliased expressions are unwrapped,
        # and nested datasets are recursed into when +recurse+ is true.
        def tables_in_clause(clause, recurse)
          case clause
          when Symbol
            [clause]
          when Sequel::SQL::AliasedExpression
            tables_in_clause(clause.expression, recurse)
          when Sequel::Dataset
            recurse ? clause.dependant_tables : []
          else
            []
          end
        end
      end
    end
  end
end

Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::DependantTables
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Chicago
  module ETL
    module SequelExtensions
      # Adds Sequel::Dataset#filter_to_etl_batch, restricting a dataset
      # to rows belonging to a particular ETL batch.
      module FilterToEtlBatch
        # Returns a copy of this dataset filtered to +etl_batch+.
        #
        # An etl_batch_id condition is OR-ed together for every
        # FROM/JOIN source that has such a column; compound (UNION etc.)
        # datasets are filtered recursively.
        def filter_to_etl_batch(etl_batch)
          sources = opts[:from] + (opts[:join] || [])
          conditions = sources.
            select {|source| has_etl_batch_column?(source) }.
            map {|source| make_etl_batch_filter(source, etl_batch) }

          ds = if conditions.empty?
                 dup
               else
                 filter(conditions.inject {|a, b| a | b })
               end

          # Compound datasets are replaced in place with filtered
          # versions.
          (ds.opts[:compounds] || []).each do |compound|
            compound[1] = compound[1].filter_to_etl_batch(etl_batch)
          end

          ds
        end

        private

        # Builds a {qualified_etl_batch_id_column => batch id} filter
        # hash for the given FROM/JOIN expression.
        def make_etl_batch_filter(expression, etl_batch)
          table = case expression
                  when Sequel::SQL::AliasedExpression
                    expression.aliaz
                  when Sequel::SQL::JoinClause
                    expression.table_alias || expression.table
                  else
                    expression
                  end

          {:etl_batch_id.qualify(table) => etl_batch.id}
        end

        # True if the table behind +expression+ has an etl_batch_id
        # column (consulting the database schema for bare table names).
        def has_etl_batch_column?(expression)
          case expression
          when Sequel::SQL::AliasedExpression
            has_etl_batch_column?(expression.expression)
          when Symbol
            db.schema(expression).map(&:first).include?(:etl_batch_id)
          when Sequel::SQL::JoinClause
            has_etl_batch_column?(expression.table)
          else
            false
          end
        end
      end
    end
  end
end

Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::FilterToEtlBatch
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Chicago
  module ETL
    module SequelExtensions
      # Adds LOAD DATA INFILE support to Sequel datasets, for bulk
      # loading CSV dump files into MySQL tables.
      module LoadDataInfile
        # Loads the CSV data columns in filepath into this dataset's table.
        def load_csv_infile(filepath, columns)
          execute_dui(load_csv_infile_sql(filepath, columns))
        end

        # Returns the LOAD DATA INFILE statement for +filepath+ and
        # +columns+. Duplicate keys are REPLACEd unless the dataset has
        # the :insert_ignore option set, in which case they are IGNOREd.
        #
        # NOTE(review): filepath, table and column names are
        # interpolated directly into the SQL — callers must not pass
        # untrusted input.
        def load_csv_infile_sql(filepath, columns)
          duplicate_handling = opts[:insert_ignore] ? "IGNORE" : "REPLACE"
          column_list = "`#{columns.join('`,`')}`"
          "LOAD DATA INFILE '#{filepath}' #{duplicate_handling} INTO TABLE `#{opts[:from].first}` CHARACTER SET 'utf8' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\"' (#{column_list});"
        end
      end
    end
  end
end

Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::LoadDataInfile
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Chicago
  module ETL
    # An end point to write rows.
    #
    # Rows may optionally be deduplicated on a key, so each unique row
    # is written at most once.
    #
    # @abstract Subclasses override #write (and #flush when buffering).
    # @api public
    class Sink
      # Returns the column names expected to be written to this sink.
      # @api public
      attr_reader :column_names

      # @abstract
      # @param output the underlying object rows are written to.
      # @param [Array<Symbol>] column_names columns this sink accepts.
      # @param unique_row_key optional row key used to suppress
      #   duplicate writes; nil disables deduplication.
      def initialize(output, column_names, unique_row_key=nil)
        @output = output
        @column_names = column_names
        @written_rows = Set.new
        @unique_row_key = unique_row_key
      end

      # Writes a row to the output.
      #
      # Row will not be written to the output if it has already been
      # written, as identified by the unique row key.
      #
      # Should not be overridden by subclasses - overwrite write instead.
      def <<(row)
        return if written?(row)
        write(row)
        @written_rows << row[@unique_row_key]
      end

      # Flushes any remaining writes to the output.
      #
      # By default does nothing; subclasses should override where
      # necessary.
      def flush
      end

      # Returns true if this row has previously been written to the
      # output.
      #
      # Always returns false if no key to determine row uniqueness has
      # been provided.
      def written?(row)
        !@unique_row_key.nil? && @written_rows.include?(row[@unique_row_key])
      end

      protected

      attr_reader :output

      # @abstract Subclasses implement the actual writing.
      def write(row)
      end
    end
  end
end
|