chicago-etl 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +16 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +21 -0
- data/Rakefile +42 -0
- data/VERSION +1 -0
- data/chicago-etl.gemspec +117 -0
- data/lib/chicago/etl/batch.rb +110 -0
- data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
- data/lib/chicago/etl/counter.rb +36 -0
- data/lib/chicago/etl/key_builder.rb +198 -0
- data/lib/chicago/etl/load_dataset_builder.rb +75 -0
- data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
- data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
- data/lib/chicago/etl/screens/column_screen.rb +59 -0
- data/lib/chicago/etl/screens/composite_screen.rb +17 -0
- data/lib/chicago/etl/screens/invalid_element.rb +27 -0
- data/lib/chicago/etl/screens/missing_value.rb +22 -0
- data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
- data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
- data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
- data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
- data/lib/chicago/etl/sink.rb +61 -0
- data/lib/chicago/etl/table_builder.rb +45 -0
- data/lib/chicago/etl/task_invocation.rb +32 -0
- data/lib/chicago/etl/tasks.rb +34 -0
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
- data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
- data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
- data/lib/chicago/etl.rb +35 -0
- data/lib/chicago-etl.rb +0 -0
- data/spec/db_connections.yml.dist +4 -0
- data/spec/etl/batch_spec.rb +86 -0
- data/spec/etl/counter_spec.rb +44 -0
- data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
- data/spec/etl/key_builder_spec.rb +190 -0
- data/spec/etl/load_dataset_builder_spec.rb +86 -0
- data/spec/etl/mysql_dumpfile_spec.rb +42 -0
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
- data/spec/etl/screens/composite_screen_spec.rb +25 -0
- data/spec/etl/screens/invalid_element_spec.rb +27 -0
- data/spec/etl/screens/missing_value_spec.rb +58 -0
- data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
- data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
- data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
- data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
- data/spec/etl/sink_spec.rb +7 -0
- data/spec/etl/table_builder_spec.rb +22 -0
- data/spec/etl/task_spec.rb +87 -0
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
- data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
- data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
- data/spec/spec_helper.rb +20 -0
- metadata +245 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
module Chicago
  module ETL
    # Creates the bookkeeping tables used by the ETL process.
    class TableBuilder
      # Builds the ETL tables in the given database.
      #
      # Shorthand for <tt>new(db).build</tt>.
      def self.build(db)
        new(db).build
      end

      def initialize(db) # :nodoc:
        @db = db
      end

      def build # :nodoc:
        create_batches_table
        create_task_invocations_table
      end

      private

      # Defines the etl_batches table: an id, the started/finished/
      # extracted-to timestamps and a state enum defaulting to "Started".
      def create_batches_table
        create_table :etl_batches do
          primary_key :id, :type => :integer, :unsigned => true
          timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
          timestamp :finished_at, :null => true, :default => nil
          timestamp :extracted_to, :null => true, :default => nil
          enum :state, :null => false, :elements => %w{Started Finished Error}, :default => "Started"
        end
      end

      # Defines the etl_task_invocations table, with a unique index over
      # the batch id, stage and name columns.
      def create_task_invocations_table
        create_table :etl_task_invocations do
          primary_key :id, :type => :integer, :unsigned => true
          integer :batch_id, :unsigned => true, :null => false
          enum :stage, :null => false, :elements => %w{Extract Transform Load}
          varchar :name, :null => false
          timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
          timestamp :finished_at, :null => true, :default => nil
          enum :state, :null => false, :elements => %w{Created Started Finished Error}, :default => "Created"
          smallint :attempts, :null => false, :unsigned => true

          index [:batch_id, :stage, :name], :unique => true
        end
      end

      # Creates +table+ (passing the "innodb" engine option) unless the
      # database already lists a table with that name.
      def create_table(table, &block)
        return if @db.tables.include?(table)

        @db.create_table(table, :engine => "innodb", &block)
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Chicago
  module ETL

    # Sequel model backed by the etl_task_invocations dataset; each
    # invocation is associated with a batch.
    class TaskInvocation < Sequel::Model
      set_dataset :etl_task_invocations
      many_to_one :batch

      # Runs the given block as this task.
      #
      # Raises a RuntimeError, without running the block, if the task
      # has already finished. Otherwise the task is marked "Started"
      # and its attempts count incremented before the block is yielded
      # to. If the block raises, the task is marked "Error", the batch
      # (when present) is told about the error and the exception is
      # re-raised. On success the task is marked "Finished" and
      # finished_at is set to the current time.
      def perform
        if finished?
          raise RuntimeError, "The task #{name} in batch #{batch_id} has already run"
        end

        update(:state => "Started", :attempts => attempts + 1)

        begin
          yield
        rescue Exception => failure
          # Every exception class is intercepted so the failure is
          # recorded; it is always re-raised afterwards.
          update(:state => "Error")
          batch.error if batch
          raise failure
        end

        update(:state => "Finished", :finished_at => Time.now)
      end

      # True once the task's state is "Finished".
      def finished?
        state == "Finished"
      end
    end

  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rake/tasklib'
|
2
|
+
|
3
|
+
module Chicago
  module ETL
    # Rake tasks supporting ETL in a Chicago project.
    #
    # Add the following to your project's Rakefile:
    #
    #   Chicago::ETL::RakeTasks.new(db, schema)
    #
    # The following tasks are then available:
    #
    # +db:create_etl_tables+:: defines the tables used for ETL batches
    #                          and the like
    class RakeTasks < Rake::TaskLib
      # Stores the database and schema, then defines the tasks
      # immediately.
      def initialize(db, schema)
        @db = db
        @schema = schema
        define
      end

      # Registers the tasks under the +db+ namespace.
      def define
        namespace :db do
          desc "Creates the etl tables"
          task(:create_etl_tables) { create_etl_tables }
        end
      end

      private

      # Body of the db:create_etl_tables task.
      def create_etl_tables
        Chicago::ETL::TableBuilder.build(@db)
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Chicago
  module ETL
    module Transformations
      # Row transformation that stamps each row with a fixed insert
      # time, stored under the :_inserted_at key.
      class AddInsertTimestamp
        # Creates the transformation.
        #
        # @param Time timestamp - the time to stamp rows with;
        #   defaults to the time the transformation is created. It is
        #   stored as UTC.
        def initialize(timestamp=Time.now)
          # getutc returns a new Time; Time#utc would convert the
          # caller's object to UTC in place.
          @insert_timestamp = timestamp.getutc
        end

        # Sets row[:_inserted_at] to the stored UTC timestamp.
        #
        # Returns the (mutated) row and the errors array as a pair.
        def call(row, errors=[])
          row[:_inserted_at] = @insert_timestamp
          [row, errors]
        end
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Chicago
  module ETL
    module Transformations
      # Row transformation that cleans and reformats a UK post code
      # held in one column of the row.
      #
      # The clean-up rules come from observed data-entry errors:
      # shift key slips (e.g. typing '!' where '1' was meant) and
      # digits typed where letters were intended (e.g. 0X -> OX for
      # Oxfordshire postcodes) are corrected.
      #
      # BFPO postcodes are left alone.
      class UkPostCode
        # Creates a new post code transformation.
        #
        # @param Symbol column_name - the name of the column
        #   containing the post code.
        #
        # @param Proc filter_block - an optional block, which takes a
        #   row. If the block returns false, the transformation will
        #   not be run. This can be useful to only run the
        #   transformation on UK addresses, based on a country field
        #   in the row for example.
        def initialize(column_name, &filter_block)
          @column_name = column_name
          @filter_block = filter_block
        end

        # Replaces the post code column of +row+ with its normalized
        # form, unless the filter block rejects the row.
        #
        # Returns the row and the errors array as a pair.
        def call(row, errors=[])
          skipped = @filter_block && !@filter_block.call(row)

          unless skipped
            field = UkPostCodeField.new
            row[@column_name] = field.normalize(row[@column_name])[:post_code]
          end

          [row, errors]
        end
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module Chicago
  module ETL
    module Transformations
      # Cleans a raw UK post code string and splits it into its parts.
      class UkPostCodeField
        # Matched against the cleaned post code with all whitespace
        # removed: group 1 is the outcode, group 2 the optional
        # incode. Anchored to the whole string.
        MATCH = /\A([A-Z][A-Z]?[0-9][0-9A-Z]?)(?:([0-9][A-Z]{2}))?\z/

        # Returns cleaned, formatted data about a UK Post Code.
        #
        # Example:
        #
        #     UkPostCodeField.new.normalize(" SW !2 4 GH")
        #     # => { :post_code => "SW12 4GH",
        #            :outcode   => "SW12",
        #            :incode    => "4GH" }
        #
        # Partial postcodes will be returned without the incode. BFPO
        # postcodes are supported, but have no incode or
        # outcode. Postcodes that do not follow the format, including
        # nil, will be returned as is, with an invalid key set.
        def normalize(raw_post_code)
          invalid = {:post_code => raw_post_code, :invalid => true}
          # A missing value cannot be cleaned; report it as invalid
          # rather than raising NoMethodError.
          return invalid if raw_post_code.nil?

          reformat(clean(raw_post_code)) || invalid
        end

        private

        # Strips and upcases the value, then undoes common typing
        # slips: shifted number-row characters become their digits,
        # a leading 0X/0L becomes OX/OL and a leading P0/S0/C0/Y0
        # becomes PO/SO/CO/YO.
        def clean(raw_post_code)
          raw_post_code.
            strip.
            upcase.
            tr('!"$%^&*()', '124567890').
            gsub("£", "3").
            sub(/\A0([XL])/, 'O\1').
            sub(/\A([PSCY])0/, '\1O')
        end

        # Formats BFPO codes as "BFPO <rest>"; everything else is
        # handled as a standard post code. Returns nil when a standard
        # post code does not match the expected format.
        def reformat(post_code)
          if post_code[0..3] == "BFPO"
            { :post_code => post_code.sub(/BFPO\s*/, "BFPO ") }
          else
            reformat_standard_post_code(post_code)
          end
        end

        # Returns the outcode, incode (nil for partial post codes) and
        # the space-separated post code, or nil if there is no match.
        def reformat_standard_post_code(post_code)
          match = post_code.gsub(/\s+/,'').match(MATCH)

          unless match.nil?
            { :outcode   => match[1],
              :incode    => match[2],
              :post_code => [match[1], match[2]].join(' ').strip }
          end
        end
      end
    end
  end
end
|
data/lib/chicago/etl.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
|
3
|
+
require 'chicago/etl/counter'
|
4
|
+
require 'chicago/etl/key_builder'
|
5
|
+
require 'chicago/etl/sink'
|
6
|
+
require 'chicago/etl/mysql_load_file_value_transformer'
|
7
|
+
require 'chicago/etl/buffering_insert_writer'
|
8
|
+
require 'chicago/etl/mysql_dumpfile'
|
9
|
+
|
10
|
+
require 'chicago/etl/load_dataset_builder'
|
11
|
+
|
12
|
+
# Sequel Extensions
|
13
|
+
require 'chicago/etl/sequel/filter_to_etl_batch'
|
14
|
+
require 'chicago/etl/sequel/load_data_infile'
|
15
|
+
require 'chicago/etl/sequel/dependant_tables'
|
16
|
+
|
17
|
+
# Screens
|
18
|
+
require 'chicago/etl/screens/column_screen'
|
19
|
+
require 'chicago/etl/screens/composite_screen'
|
20
|
+
require 'chicago/etl/screens/missing_value'
|
21
|
+
require 'chicago/etl/screens/invalid_element'
|
22
|
+
require 'chicago/etl/screens/out_of_bounds'
|
23
|
+
|
24
|
+
# Transformations
|
25
|
+
require 'chicago/etl/transformations/add_insert_timestamp'
|
26
|
+
require 'chicago/etl/transformations/uk_post_code'
|
27
|
+
require 'chicago/etl/transformations/uk_post_code_field'
|
28
|
+
|
29
|
+
module Chicago
  module ETL
    # Constants registered for autoloading, mapped to the files that
    # define them.
    { :TableBuilder   => 'chicago/etl/table_builder.rb',
      :Batch          => 'chicago/etl/batch.rb',
      :TaskInvocation => 'chicago/etl/task_invocation.rb'
    }.each do |constant, path|
      autoload constant, path
    end
  end
end
|
data/lib/chicago-etl.rb
ADDED
File without changes
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Chicago::ETL::Batch. Every example starts from a freshly
# built set of ETL tables in TEST_DB and without the spec tmp directory.
describe Chicago::ETL::Batch do
  before :each do
    TEST_DB.drop_table(*(TEST_DB.tables))
    ETL::TableBuilder.build(TEST_DB)
    ETL::Batch.db = TEST_DB
    Chicago.project_root = File.expand_path(File.join(File.dirname(__FILE__), ".."))
    tmpdir = File.expand_path(File.join(File.dirname(__FILE__), "..", "tmp"))
    # File.exist? is used because File.exists? is deprecated and no
    # longer available in Ruby 3.2+.
    FileUtils.rm_r(tmpdir) if File.exist?(tmpdir)
  end

  it "should return a new batch when instance is called and there are no outstanding batches in error" do
    ETL::Batch.instance.should be_new
  end

  it "should set the start timestamp of the batch to now when created" do
    ETL::Batch.instance.start.started_at.to_i.should == Time.now.to_i
  end

  it "should have a state of 'Started' when started" do
    ETL::Batch.instance.start.state.should == "Started"
  end

  it "should have a default extracted_to datetime of midnight (this morning)" do
    now = Time.now
    ETL::Batch.instance.start.extracted_to.should == Time.local(now.year, now.month, now.day, 0,0,0)
  end

  it "should be able to specify an extract to date" do
    now = Date.today - 1
    ETL::Batch.instance.start(now).extracted_to.should == Time.local(now.year, now.month, now.day, 0,0,0)
  end

  it "should create a directory tmp/batches/1 under the project root when created" do
    ETL::Batch.instance.start
    File.should be_directory(Chicago.project_root + "/tmp/batches/1")
  end

  it "should return the batch directory path from #dir" do
    ETL::Batch.instance.start.dir.should == Chicago.project_root + "/tmp/batches/1"
  end

  it "should set the finished_at timestamp when #finish is called" do
    batch = ETL::Batch.instance.start
    batch.finish
    batch.finished_at.should_not be_nil
    batch.state.should == "Finished"
  end

  it "should return true from #error? if in the error state" do
    batch = ETL::Batch.instance.start
    batch.error
    batch.should be_in_error
  end

  it "should not return a new batch if the last batch was not finished" do
    batch = ETL::Batch.instance.start
    # The expectation needs `.should`; a bare `==` comparison is
    # evaluated and discarded, so the example could never fail.
    ETL::Batch.instance.should == batch
  end

  it "should not return a new batch if the last batch ended in error" do
    batch = ETL::Batch.instance.start
    batch.error
    ETL::Batch.instance.should == batch
  end

  it "should create a log in tmp/batches/1/log" do
    ETL::Batch.instance.start
    File.read(Chicago.project_root + "/tmp/batches/1/log").
      should include("Started ETL batch 1.")
  end

  it "should perform a task only once" do
    batch = ETL::Batch.instance.start
    i = 0
    2.times { batch.perform_task("Transform", "Test") { i += 1} }
    i.should == 1
    batch.task_invocations_dataset.filter(:stage => "Transform", :name => "Test").count.should == 1
  end

  it "should not complain when given a symbol as the stage name" do
    batch = ETL::Batch.instance.start
    lambda { batch.perform_task(:transform, "Test") {} }.should_not raise_error(Sequel::DatabaseError)
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Chicago::ETL::Counter: initial value selection, #next,
# #current and concurrent use.
describe Chicago::ETL::Counter do
  it "returns the next available key" do
    counter = described_class.new(3)
    counter.next.should == 4
    counter.next.should == 5
  end

  it "can have the initial key set via a block" do
    described_class.new { 1 + 1 }.next.should == 3
  end

  it "defaults the counter to 0 if the block returns nil" do
    described_class.new { nil }.next.should == 1
  end

  it "prefers the block to the argument for setting initial state" do
    described_class.new(5) { 2 }.next.should == 3
  end

  it "can be constructed with no argument, implying 0" do
    counter = described_class.new
    counter.next.should == 1
  end

  it "updates keys in a thread-safe fashion" do
    counter = described_class.new

    # A fairly large number of iterations seems to be needed before
    # errors show up.
    workers = Array.new(3) do
      Thread.new { 100_000.times { counter.next } }
    end
    workers.each { |worker| worker.join }

    counter.next.should == 300_001
  end

  it "has a current value" do
    counter = described_class.new
    counter.current.should == 0
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Chicago
  module ETL
    # Restricts a dataset to rows belonging to a single ETL batch.
    class EtlBatchIdDatasetFilter
      # etl_batch_id is the batch id that rows are matched against.
      def initialize(etl_batch_id)
        @etl_batch_id = etl_batch_id
      end

      # Returns a new dataset, filtered by all tables where the etl
      # batch id matches.
      #
      # If none of the dataset's dependant tables has an etl_batch_id
      # column there is nothing to filter on, and the dataset is
      # returned unchanged instead of being filtered with nil.
      def filter(dataset)
        tables = filterable_tables(dataset)
        return dataset if tables.empty?

        dataset.filter(conditions(tables))
      end

      private

      # The dependant tables of the dataset whose schema includes an
      # :etl_batch_id column.
      def filterable_tables(dataset)
        dataset.dependant_tables.select {|t|
          dataset.db.schema(t).map(&:first).include?(:etl_batch_id)
        }
      end

      # Builds one etl_batch_id condition per table and combines them
      # with |.
      #
      # NOTE(review): Symbol#qualify and Hash#| are presumably provided
      # by Sequel's core extensions, making this an SQL OR - confirm.
      def conditions(tables)
        tables.
          map {|t| {:etl_batch_id.qualify(t) => @etl_batch_id} }.
          inject {|a,b| a | b}
      end
    end
  end
end
|
@@ -0,0 +1,190 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for Chicago::ETL::KeyBuilder, covering dimensions with an
# original_id column, dimensions keyed by a natural key, and facts.
describe Chicago::ETL::KeyBuilder do
  # The schema is defined once and shared by every example.
  before :all do
    @schema = Chicago::StarSchema.new

    @schema.define_dimension(:user) do
      columns do
        integer :original_id
      end
    end

    @schema.define_dimension(:address) do
      columns do
        string :line1
        string :post_code
      end

      natural_key :line1, :post_code
    end

    @schema.define_dimension(:random) do
      columns { string :foo }
    end

    @schema.define_fact(:addresses) do
      dimensions :user, :address
      natural_key :user, :address
    end
  end

  # By default every table lookup yields a dataset with no maximum key
  # and no stored mappings, and the insert writer is stubbed out.
  before :each do
    @db = stub(:staging_database).as_null_object
    @db.stub(:[]).and_return(stub(:max => nil, :select_hash => {}))
    @writer = stub(:writer).as_null_object
    Chicago::ETL::BufferingInsertWriter.stub(:new).and_return(@writer)
  end

  describe "for identifiable dimensions" do
    before :each do
      @dimension = @schema.dimension(:user)
    end

    it "returns an incrementing key, given a row" do
      key_builder = described_class.for_table(@dimension, @db)
      key_builder.key(:original_id => 2).should == 1
      key_builder.key(:original_id => 3).should == 2
    end

    it "returns the same key for the same record" do
      key_builder = described_class.for_table(@dimension, @db)
      key_builder.key(:original_id => 2).should == 1
      key_builder.key(:original_id => 2).should == 1
    end

    it "takes into account the current maximum key in the database" do
      @db.stub(:[]).with(:keys_dimension_user).and_return(stub(:max => 2, :select_hash => {}))
      key_builder = described_class.for_table(@dimension, @db)
      key_builder.key(:original_id => 1).should == 3
    end

    it "returns previously created keys" do
      stored_keys = stub(:dataset, :max => 1, :select_hash => {40 => 1})
      @db.stub(:[]).with(:keys_dimension_user).and_return(stored_keys)

      key_builder = described_class.for_table(@dimension, @db)
      key_builder.key(:original_id => 30).should == 2
      key_builder.key(:original_id => 40).should == 1
    end

    it "raises an error when original_id isn't present in the row" do
      key_builder = described_class.for_table(@dimension, @db)
      expect { key_builder.key(:foo => :bar) }.to raise_error(Chicago::ETL::KeyError)
    end

    it "flushes new keys to a key table" do
      pending
      stored_keys = stub(:dataset, :max => 1, :select_hash => {40 => 1})
      stored_keys.stub(:insert_replace => stored_keys)
      @db.stub(:[]).with(:keys_dimension_user).and_return(stored_keys)

      stored_keys.should_receive(:multi_insert).
        with([{:original_id => 30, :dimension_id => 2}])

      key_builder = described_class.for_table(@dimension, @db)
      key_builder.key(:original_id => 30)
      key_builder.key(:original_id => 40)
      key_builder.flush
    end

    it "flushes new keys only once" do
      pending
      stored_keys = stub(:dataset, :max => 1, :select_hash => {40 => 1})
      stored_keys.stub(:insert_replace => stored_keys)
      @db.stub(:[]).with(:keys_dimension_user).and_return(stored_keys)

      stored_keys.should_receive(:multi_insert).
        with([{:original_id => 30, :dimension_id => 2}])
      stored_keys.should_receive(:multi_insert).with([])

      key_builder = described_class.for_table(@dimension, @db)
      key_builder.key(:original_id => 30)
      key_builder.key(:original_id => 40)
      key_builder.flush
      key_builder.flush
    end

    it "replaces old mappings with new values" do
      pending
      stored_keys = stub(:dataset, :max => 1, :select_hash => {40 => 1}, :multi_insert => nil)
      @db.stub(:[]).with(:keys_dimension_user).and_return(stored_keys)

      stored_keys.should_receive(:insert_replace).and_return(stored_keys)
      described_class.for_table(@dimension, @db).flush
    end
  end

  describe "for non-identifiable dimensions with natural keys" do
    before :each do
      @builder = described_class.for_table(@schema.dimension(:address), @db)
    end

    it "returns an incrementing key, given a row" do
      @builder.key(:line1 => "some street", :post_code => "TW3 X45").should == 1
      @builder.key(:line1 => "some road", :post_code => "TW3 X45").should == 2
    end

    it "returns the same incrementing key, ignoring case" do
      @builder.key(:line1 => "some street", :post_code => "TW3 X45").should == 1
      @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").should == 1
    end

    it "can override default hash preparation" do
      @builder.hash_preparation = lambda {|value| value }

      @builder.key(:line1 => "some street", :post_code => "TW3 X45").should == 1
      @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").should == 2
    end

    it "inserts the hash as a binary literal" do
      # Checks the implementation directly: mock expectations fail
      # here for some reason, possibly because of the
      # Sequel::LiteralString.
      original_key = @builder.original_key(:line1 => "some street", :post_code => "TW3 X45")
      @builder.key_for_insert(original_key).should == "0x817860F2417EB83D81FEA9D82E6B213A".lit
    end

    it "selects the Hex version of the binary column for the cache" do
      stored_keys = stub(:dataset, :max => 1).as_null_object
      @db.stub(:[]).with(:keys_dimension_address).and_return(stored_keys)
      @builder = described_class.for_table(@schema.dimension(:address), @db)

      stored_keys.should_receive(:select_hash).with(:hex.sql_function(:original_id).as(:original_id), :dimension_id).and_return({})

      @builder.key(:line1 => "foo")
    end

    it "uses all columns as the natural key if one isn't defined" do
      key_builder = described_class.for_table(@schema.dimension(:random), @db)
      key_builder.original_key(:foo => "bar").should == "3D75EEC709B70A350E143492192A1736"
    end
  end

  describe "for facts" do
    before :each do
      @builder = described_class.for_table(@schema.fact(:addresses), @db)
    end

    it "increments the id, regardless of row equality" do
      @builder.key({}).should == 1
      @builder.key({}).should == 2
    end

    it "increments from the last id stored id in the fact table" do
      @db.stub(:[]).with(:facts_addresses).and_return(stub(:max => 100, :select_hash => {}))
      @builder = described_class.for_table(@schema.fact(:addresses), @db)
      @builder.key({}).should == 101
    end

    it "supports the flush interface as a no-op" do
      lambda { @builder.flush }.should_not raise_error
    end
  end
end
|