chicago-etl 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +16 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +21 -0
- data/Rakefile +42 -0
- data/VERSION +1 -0
- data/chicago-etl.gemspec +117 -0
- data/lib/chicago/etl/batch.rb +110 -0
- data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
- data/lib/chicago/etl/counter.rb +36 -0
- data/lib/chicago/etl/key_builder.rb +198 -0
- data/lib/chicago/etl/load_dataset_builder.rb +75 -0
- data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
- data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
- data/lib/chicago/etl/screens/column_screen.rb +59 -0
- data/lib/chicago/etl/screens/composite_screen.rb +17 -0
- data/lib/chicago/etl/screens/invalid_element.rb +27 -0
- data/lib/chicago/etl/screens/missing_value.rb +22 -0
- data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
- data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
- data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
- data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
- data/lib/chicago/etl/sink.rb +61 -0
- data/lib/chicago/etl/table_builder.rb +45 -0
- data/lib/chicago/etl/task_invocation.rb +32 -0
- data/lib/chicago/etl/tasks.rb +34 -0
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
- data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
- data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
- data/lib/chicago/etl.rb +35 -0
- data/lib/chicago-etl.rb +0 -0
- data/spec/db_connections.yml.dist +4 -0
- data/spec/etl/batch_spec.rb +86 -0
- data/spec/etl/counter_spec.rb +44 -0
- data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
- data/spec/etl/key_builder_spec.rb +190 -0
- data/spec/etl/load_dataset_builder_spec.rb +86 -0
- data/spec/etl/mysql_dumpfile_spec.rb +42 -0
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
- data/spec/etl/screens/composite_screen_spec.rb +25 -0
- data/spec/etl/screens/invalid_element_spec.rb +27 -0
- data/spec/etl/screens/missing_value_spec.rb +58 -0
- data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
- data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
- data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
- data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
- data/spec/etl/sink_spec.rb +7 -0
- data/spec/etl/table_builder_spec.rb +22 -0
- data/spec/etl/task_spec.rb +87 -0
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
- data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
- data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
- data/spec/spec_helper.rb +20 -0
- metadata +245 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Fetch gems over HTTPS so that gem downloads cannot be tampered with
# in transit.
source "https://rubygems.org"

# Runtime dependency.
gem "chicagowarehouse", "~> 0.4"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2"
  gem "timecop"
  gem "yard"
  gem "flog"
  gem "jeweler"
  # Each coverage tool is restricted to a single MRI platform.
  gem "rcov", :platforms => :mri_18
  gem "simplecov", :platforms => :mri_19
  gem "ZenTest"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 notonthehighstreet.com
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
= chicago-etl
|
2
|
+
|
3
|
+
HIGHLY EXPERIMENTAL. If you use this, you'll find that things will vanish without warning and you'll be terrified.
|
4
|
+
|
5
|
+
An ETL pipeline for use with Chicago Warehouse.
|
6
|
+
|
7
|
+
== Contributing to chicago-etl
|
8
|
+
|
9
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
10
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
11
|
+
* Fork the project.
|
12
|
+
* Start a feature/bugfix branch.
|
13
|
+
* Commit and push until you are happy with your contribution.
|
14
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
15
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
|
16
|
+
|
17
|
+
== Copyright
|
18
|
+
|
19
|
+
Copyright (c) 2012 notonthehighstreet.com. See LICENSE.txt for
|
20
|
+
further details.
|
21
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Set up the default and development gem groups before anything else is
# required; if Bundler cannot do so, report why and exit with its
# status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'rake'
require 'jeweler'

# Gem packaging tasks. The block receives a Gem::Specification... see
# http://docs.rubygems.org/read/chapter/20 for more options.
Jeweler::Tasks.new do |gemspec|
  gemspec.name        = "chicago-etl"
  gemspec.homepage    = "http://github.com/notonthehighstreet/chicago-etl"
  gemspec.license     = "MIT"
  gemspec.summary     = "Chicago ETL"
  gemspec.description = "ETL tools for Chicago"
  gemspec.email       = "roland.swingler@gmail.com"
  gemspec.authors     = ["Roland Swingler"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# rake spec: run every spec file under spec/.
RSpec::Core::RakeTask.new(:spec) do |spec_task|
  spec_task.pattern = FileList['spec/**/*_spec.rb']
end

# rake rcov: the same specs with rcov enabled.
RSpec::Core::RakeTask.new(:rcov) do |rcov_task|
  rcov_task.pattern = 'spec/**/*_spec.rb'
  rcov_task.rcov = true
end

task :default => :spec

require 'yard'
YARD::Rake::YardocTask.new
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.9
|
data/chicago-etl.gemspec
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "chicago-etl"
|
8
|
+
s.version = "0.0.9"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Roland Swingler"]
|
12
|
+
s.date = "2013-02-19"
|
13
|
+
s.description = "ETL tools for Chicago"
|
14
|
+
s.email = "roland.swingler@gmail.com"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
"Gemfile",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"chicago-etl.gemspec",
|
28
|
+
"lib/chicago-etl.rb",
|
29
|
+
"lib/chicago/etl.rb",
|
30
|
+
"lib/chicago/etl/batch.rb",
|
31
|
+
"lib/chicago/etl/buffering_insert_writer.rb",
|
32
|
+
"lib/chicago/etl/counter.rb",
|
33
|
+
"lib/chicago/etl/key_builder.rb",
|
34
|
+
"lib/chicago/etl/load_dataset_builder.rb",
|
35
|
+
"lib/chicago/etl/mysql_dumpfile.rb",
|
36
|
+
"lib/chicago/etl/mysql_load_file_value_transformer.rb",
|
37
|
+
"lib/chicago/etl/screens/column_screen.rb",
|
38
|
+
"lib/chicago/etl/screens/composite_screen.rb",
|
39
|
+
"lib/chicago/etl/screens/invalid_element.rb",
|
40
|
+
"lib/chicago/etl/screens/missing_value.rb",
|
41
|
+
"lib/chicago/etl/screens/out_of_bounds.rb",
|
42
|
+
"lib/chicago/etl/sequel/dependant_tables.rb",
|
43
|
+
"lib/chicago/etl/sequel/filter_to_etl_batch.rb",
|
44
|
+
"lib/chicago/etl/sequel/load_data_infile.rb",
|
45
|
+
"lib/chicago/etl/sink.rb",
|
46
|
+
"lib/chicago/etl/table_builder.rb",
|
47
|
+
"lib/chicago/etl/task_invocation.rb",
|
48
|
+
"lib/chicago/etl/tasks.rb",
|
49
|
+
"lib/chicago/etl/transformations/add_insert_timestamp.rb",
|
50
|
+
"lib/chicago/etl/transformations/uk_post_code.rb",
|
51
|
+
"lib/chicago/etl/transformations/uk_post_code_field.rb",
|
52
|
+
"spec/db_connections.yml.dist",
|
53
|
+
"spec/etl/batch_spec.rb",
|
54
|
+
"spec/etl/counter_spec.rb",
|
55
|
+
"spec/etl/etl_batch_id_dataset_filter.rb",
|
56
|
+
"spec/etl/key_builder_spec.rb",
|
57
|
+
"spec/etl/load_dataset_builder_spec.rb",
|
58
|
+
"spec/etl/mysql_dumpfile_spec.rb",
|
59
|
+
"spec/etl/mysql_load_file_value_transformer_spec.rb",
|
60
|
+
"spec/etl/screens/composite_screen_spec.rb",
|
61
|
+
"spec/etl/screens/invalid_element_spec.rb",
|
62
|
+
"spec/etl/screens/missing_value_spec.rb",
|
63
|
+
"spec/etl/screens/out_of_bounds_spec.rb",
|
64
|
+
"spec/etl/sequel/dependant_tables_spec.rb",
|
65
|
+
"spec/etl/sequel/filter_to_etl_batch_spec.rb",
|
66
|
+
"spec/etl/sequel/load_data_infile_spec.rb",
|
67
|
+
"spec/etl/sink_spec.rb",
|
68
|
+
"spec/etl/table_builder_spec.rb",
|
69
|
+
"spec/etl/task_spec.rb",
|
70
|
+
"spec/etl/transformations/add_insert_timestamp_spec.rb",
|
71
|
+
"spec/etl/transformations/uk_post_code_field_spec.rb",
|
72
|
+
"spec/etl/transformations/uk_post_code_spec.rb",
|
73
|
+
"spec/spec_helper.rb"
|
74
|
+
]
|
75
|
+
s.homepage = "http://github.com/notonthehighstreet/chicago-etl"
|
76
|
+
s.licenses = ["MIT"]
|
77
|
+
s.require_paths = ["lib"]
|
78
|
+
s.rubygems_version = "1.8.25"
|
79
|
+
s.summary = "Chicago ETL"
|
80
|
+
|
81
|
+
if s.respond_to? :specification_version then
|
82
|
+
s.specification_version = 3
|
83
|
+
|
84
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
85
|
+
s.add_runtime_dependency(%q<chicagowarehouse>, ["~> 0.4"])
|
86
|
+
s.add_development_dependency(%q<rspec>, ["~> 2"])
|
87
|
+
s.add_development_dependency(%q<timecop>, [">= 0"])
|
88
|
+
s.add_development_dependency(%q<yard>, [">= 0"])
|
89
|
+
s.add_development_dependency(%q<flog>, [">= 0"])
|
90
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
91
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
92
|
+
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
93
|
+
s.add_development_dependency(%q<ZenTest>, [">= 0"])
|
94
|
+
else
|
95
|
+
s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
|
96
|
+
s.add_dependency(%q<rspec>, ["~> 2"])
|
97
|
+
s.add_dependency(%q<timecop>, [">= 0"])
|
98
|
+
s.add_dependency(%q<yard>, [">= 0"])
|
99
|
+
s.add_dependency(%q<flog>, [">= 0"])
|
100
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
101
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
102
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
103
|
+
s.add_dependency(%q<ZenTest>, [">= 0"])
|
104
|
+
end
|
105
|
+
else
|
106
|
+
s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
|
107
|
+
s.add_dependency(%q<rspec>, ["~> 2"])
|
108
|
+
s.add_dependency(%q<timecop>, [">= 0"])
|
109
|
+
s.add_dependency(%q<yard>, [">= 0"])
|
110
|
+
s.add_dependency(%q<flog>, [">= 0"])
|
111
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
112
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
113
|
+
s.add_dependency(%q<simplecov>, [">= 0"])
|
114
|
+
s.add_dependency(%q<ZenTest>, [">= 0"])
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
@@ -0,0 +1,110 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'logger'
|
3
|
+
|
4
|
+
module Chicago
|
5
|
+
module ETL
|
6
|
+
# A particular "run" of the ETL process.
|
7
|
+
#
|
8
|
+
# All ETL tasks should be executed in the context of a Batch.
|
9
|
+
#
|
10
|
+
# A batch creates a temporary directory under tmp/batches/:id
|
11
|
+
# where it stores various logs and extract files.
|
12
|
+
class Batch < Sequel::Model
  set_dataset :etl_batches

  one_to_many :task_invocations

  class << self
    # Returns the Batch that should be used for the ETL process.
    #
    # A new batch is returned, unless the previous batch did not
    # finish successfully, in which case that batch is returned.
    #
    # This should be used in preference to new or create.
    def instance
      # Look the previous batch up once, so that the nil check, the
      # finished? check and the returned object all refer to the same
      # row.
      previous = last_batch
      (previous.nil? || previous.finished?) ? new : previous
    end

    # Returns the last batch run (ordered by started_at), or nil if
    # this is the first batch.
    def last_batch
      order(:started_at).last
    end
  end

  # Deprecated. Performs the named task in the load stage.
  def load(task_name, &block)
    perform_task(:load, task_name, &block)
  end

  # Deprecated. Performs the named task in the transform stage.
  #
  # Transform tasks are recorded under their own stage. They used to
  # be recorded as :extract, which meant a transform sharing its name
  # with an already finished extract task was silently skipped.
  def transform(task_name, &block)
    perform_task(:transform, task_name, &block)
  end

  # Deprecated. Performs the named task in the extract stage.
  def extract(task_name, &block)
    perform_task(:extract, task_name, &block)
  end

  # Perform a named task if it hasn't already run successfully in
  # this batch.
  #
  # The task invocation is looked up by stage and name, and created
  # if there is no such invocation yet. The block is handed to the
  # invocation's perform method.
  def perform_task(stage, task_name, &block)
    task = find_or_create_task_invocation(stage, task_name)
    task.perform(&block) unless task.finished?
  end

  # Returns the directory files & batch logs will be written to:
  # tmp/batches/<id> under Chicago.project_root.
  def dir
    @dir ||= File.join(Chicago.project_root, "tmp", "batches", id.to_s)
  end

  # Starts this batch.
  #
  # Sets extracted_to to the given value, or today's date when none
  # is given, saves the batch and logs whether it was started or
  # resumed. Returns self.
  def start(extract_to=nil)
    self.extracted_to = extract_to || Date.today
    save
    # NOTE(review): "Started" is presumably the state of a freshly
    # created batch; any other state is reported as a resume.
    if state == "Started"
      log.info "Started ETL batch #{id}."
    else
      log.info "Resumed ETL batch #{id}."
    end
    self
  end

  # Finishes this batch, and sets the finished_at timestamp.
  def finish
    update(:state => "Finished", :finished_at => Time.now)
  end

  # Sets this batch to the Error state.
  def error
    update(:state => "Error")
  end

  # Returns true if this batch is finished.
  def finished?
    state == "Finished"
  end

  # Returns true if in the error state
  def in_error?
    state == "Error"
  end

  # Returns the logger for this batch, which writes to the file "log"
  # inside the batch directory.
  def log
    @log ||= Logger.new(File.join(dir, "log"))
  end

  # Creates the batch directory once the batch has been created.
  def after_create # :nodoc:
    FileUtils.mkdir_p(dir, :mode => 0777)
  end

  private

  # Returns the task invocation of this batch with the given stage and
  # name, adding a new one when none exists. The stage is stored
  # lower-cased and the name as a string.
  def find_or_create_task_invocation(stage, name)
    attrs = {:stage => stage.to_s.downcase, :name => name.to_s}
    task_invocations_dataset.filter(attrs).first || add_task_invocation(attrs)
  end
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'chicago/etl/sink'
|
2
|
+
|
3
|
+
module Chicago
|
4
|
+
module ETL
|
5
|
+
# Wrapper around a dataset to allow buffered inserts.
|
6
|
+
#
|
7
|
+
# @api public
|
8
|
+
class BufferingInsertWriter < Sink
  # The number of rows written before inserting to the DB.
  BUFFER_SIZE = 10_000

  # Creates a writer that buffers rows in an array and inserts them
  # into the given dataset. column_names and key are passed straight
  # on to Sink.
  def initialize(dataset, column_names, key=nil)
    super([], column_names, key)
    @dataset = dataset
  end

  # Imports every buffered row into the dataset through
  # insert_replace, then empties the buffer.
  def flush
    replacing_dataset = @dataset.insert_replace
    replacing_dataset.import(column_names, output)
    output.clear
  end

  protected

  # Buffers the row's values, in column order, and flushes once the
  # buffer holds BUFFER_SIZE rows.
  def write(row)
    values = @column_names.map {|column| row[column] }
    output << values
    flush if reached_buffer_limit?
  end

  private

  # True once the buffer holds at least BUFFER_SIZE rows.
  def reached_buffer_limit?
    output.size >= BUFFER_SIZE
  end
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
module Chicago
|
4
|
+
module ETL
|
5
|
+
# Provides a thread-safe wrapper around an incrementing number.
|
6
|
+
#
|
7
|
+
# Intended to be used for key builders, rather than using the
|
8
|
+
# database's AUTO INCREMENT functionality.
|
9
|
+
#
|
10
|
+
# @api private
|
11
|
+
class Counter
  # Returns the current number this counter is on.
  #
  # When the counter was created with a block this is nil until next
  # has been called for the first time.
  attr_reader :current

  # Creates a new counter, optionally with a starting count.
  #
  # If a block is given, the starting count is the value of the block
  # (or 0 if the block returns nil), and the block is not called
  # until the first call to next. Otherwise the count starts at
  # current_number, with nil treated as 0.
  def initialize(current_number=0, &block)
    @mutex = Mutex.new
    if block
      @block = block
    else
      @current = current_number || 0
    end
  end

  # Returns the next number.
  #
  # Modifies the current state of the counter.
  #
  # The lazy initialisation from the block happens inside the lock as
  # well as the increment: otherwise threads racing on the first call
  # could each re-seed the counter and be handed duplicate numbers.
  # The block therefore runs while the lock is held, and must not
  # call back into this counter.
  def next
    @mutex.synchronize do
      @current = (@block.call || 0) if @current.nil?
      @current += 1
    end
  end
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,198 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'chicago/etl/buffering_insert_writer'
|
3
|
+
|
4
|
+
module Chicago
|
5
|
+
module ETL
|
6
|
+
# Builds a surrogate key for a dimension record, without relying
|
7
|
+
# on the database's AUTO_INCREMENT functionality.
|
8
|
+
#
|
9
|
+
# We avoid AUTO_INCREMENT because we need to be able to get the
|
10
|
+
# key mappings without having anything to do with the database -
|
11
|
+
# this allows us to use bulk load.
|
12
|
+
#
|
13
|
+
# @api public
|
14
|
+
class KeyBuilder
  # Picks and constructs the key builder that suits a schema table.
  #
  # @api private
  class Factory
    attr_reader :table, :staging_db

    def initialize(table, staging_db)
      @table, @staging_db = table, staging_db
    end

    # Returns a key builder backed by the table's key table for a
    # dimension, a FactKeyBuilder for a fact, and nil for anything
    # else.
    def make
      return dimension_key_builder if dimension?
      FactKeyBuilder.new(staging_db[table.table_name]) if fact?
    end

    private

    # Builds the key builder for a dimension, together with the
    # buffered writer that new key mappings are sent to.
    def dimension_key_builder
      keys = staging_db[table.key_table_name]
      sink = BufferingInsertWriter.new(keys, [:original_id, :dimension_id])
      return IdentifiableDimensionKeyBuilder.new(keys, sink) if table.identifiable?
      HashingKeyBuilder.new(keys, sink, columns_to_hash)
    end

    def dimension?
      table.kind_of?(Chicago::Schema::Dimension)
    end

    def fact?
      table.kind_of?(Chicago::Schema::Fact)
    end

    # The natural key of the table if it has one, otherwise the names
    # of all of its columns.
    def columns_to_hash
      return table.natural_key unless table.natural_key.nil?
      table.columns.map {|column| column.name }
    end
  end

  # Returns an appropriate key builder for a schema table, using
  # the staging database for key management where necessary.
  def self.for_table(table, staging_db)
    Factory.new(table, staging_db).make
  end

  # key_table is the dataset holding the existing key mappings and
  # key_sink receives newly created mappings. New keys continue on
  # from the largest dimension_id in the key table.
  def initialize(key_table, key_sink)
    @key_table = key_table
    @new_keys = key_sink
    @counter = Counter.new { key_table.max(:dimension_id) }
  end

  # Returns a surrogate key, given a record row.
  #
  # Existing mappings are loaded from the key table on first use. A
  # row whose original key has not been seen before is given the next
  # key from the counter, and the new mapping is both sent to the key
  # sink and remembered for later rows.
  #
  # @raise Chicago::ETL::KeyError if the surrogate key cannot be
  #   determined from the row data.
  def key(row)
    fetch_cache unless @key_mapping
    row_id = original_key(row)
    known_key = @key_mapping[row_id]
    return known_key if known_key

    fresh_key = @counter.next
    @new_keys << {:original_id => key_for_insert(row_id), :dimension_id => fresh_key}
    @key_mapping[row_id] = fresh_key
  end

  # Returns the original key for the row.
  #
  # Overridden by subclasses; this implementation returns nil.
  def original_key(row)
  end

  # Flushes any newly created keys to the key table.
  def flush
    @new_keys.flush
  end

  protected

  attr_reader :key_table

  # Loads the mapping of original key to dimension_id from the key
  # table.
  def fetch_cache
    @key_mapping = key_table.select_hash(original_key_select_fragment, :dimension_id)
  end
end
|
112
|
+
|
113
|
+
# Key builder for identifiable dimensions.
|
114
|
+
#
|
115
|
+
# This should not be instantiated directly, use
|
116
|
+
# KeyBuilder.for_table.
|
117
|
+
#
|
118
|
+
# @api private
|
119
|
+
class IdentifiableDimensionKeyBuilder < KeyBuilder
  # Returns the surrogate key for the row, which must carry an
  # :original_id entry; a KeyError is raised otherwise.
  def key(row)
    return super if row.has_key?(:original_id)
    raise KeyError, "Row does not have an original_id field"
  end

  # The row's own :original_id is the original key.
  def original_key(row)
    row[:original_id]
  end

  # The original id is written to the key table unchanged.
  def key_for_insert(original_id)
    original_id
  end

  # The original id is read back from the key table unchanged.
  def original_key_select_fragment
    :original_id
  end
end
|
137
|
+
|
138
|
+
# Key builder for dimensions with natural keys, but no simple
|
139
|
+
# key.
|
140
|
+
#
|
141
|
+
# This should not be instantiated directly, use
|
142
|
+
# KeyBuilder.for_table.
|
143
|
+
#
|
144
|
+
# @api private
|
145
|
+
class HashingKeyBuilder < KeyBuilder
  # The columns whose values are hashed to form the original key.
  attr_reader :columns

  # Callable applied to each column value before hashing; by default
  # the value is converted to a string and upper-cased.
  attr_accessor :hash_preparation

  def initialize(key_table, key_sink, columns)
    super(key_table, key_sink)
    @columns = columns
    @hash_preparation = lambda {|value| value.to_s.upcase }
  end

  # Returns the upper-cased MD5 hex digest of the prepared column
  # values, joined without a separator.
  def original_key(row)
    prepared_values = columns.map {|name| hash_preparation.call(row[name]) }
    Digest::MD5.hexdigest(prepared_values.join).upcase
  end

  # The hex digest is written to the key table as an 0x... literal.
  def key_for_insert(original_id)
    hex_literal = "0x" + original_id
    hex_literal.lit
  end

  # Stored keys are read back through the hex SQL function, aliased
  # as original_id.
  def original_key_select_fragment
    hex_of_original_id = :hex.sql_function(:original_id)
    hex_of_original_id.as(:original_id)
  end
end
|
168
|
+
|
169
|
+
# Returns ids for Fact tables.
|
170
|
+
#
|
171
|
+
# Fact table surrogate ids are transient - there is no expectation
|
172
|
+
# that the same fact row will have the same id between
|
173
|
+
# invocations. This is ok, because all facts should have a natural
|
174
|
+
# key defined - the id generated by this is purely for convenience
|
175
|
+
# and linking to error rows.
|
176
|
+
#
|
177
|
+
# As a result fact keys aren't stored in a key table - they are
|
178
|
+
# never referenced by any other tables in the system.
|
179
|
+
#
|
180
|
+
# In addition, the same row passed twice will get a different id.
|
181
|
+
class FactKeyBuilder
  # db_table is the dataset of the fact table; ids continue on from
  # its largest existing id. key_sink is accepted but not used.
  def initialize(db_table, key_sink=nil)
    @db_table = db_table
    @counter = Counter.new { db_table.max(:id) }
  end

  # Returns an id given a row - the row actually has no bearing on
  # the id returned.
  def key(row)
    @counter.next
  end

  # No-op, provided for interface compatibility.
  def flush
    nil
  end
end
|
197
|
+
end
|
198
|
+
end
|