chicago-etl 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +16 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +21 -0
  6. data/Rakefile +42 -0
  7. data/VERSION +1 -0
  8. data/chicago-etl.gemspec +117 -0
  9. data/lib/chicago/etl/batch.rb +110 -0
  10. data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
  11. data/lib/chicago/etl/counter.rb +36 -0
  12. data/lib/chicago/etl/key_builder.rb +198 -0
  13. data/lib/chicago/etl/load_dataset_builder.rb +75 -0
  14. data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
  15. data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
  16. data/lib/chicago/etl/screens/column_screen.rb +59 -0
  17. data/lib/chicago/etl/screens/composite_screen.rb +17 -0
  18. data/lib/chicago/etl/screens/invalid_element.rb +27 -0
  19. data/lib/chicago/etl/screens/missing_value.rb +22 -0
  20. data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
  21. data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
  22. data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
  23. data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
  24. data/lib/chicago/etl/sink.rb +61 -0
  25. data/lib/chicago/etl/table_builder.rb +45 -0
  26. data/lib/chicago/etl/task_invocation.rb +32 -0
  27. data/lib/chicago/etl/tasks.rb +34 -0
  28. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
  29. data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
  30. data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
  31. data/lib/chicago/etl.rb +35 -0
  32. data/lib/chicago-etl.rb +0 -0
  33. data/spec/db_connections.yml.dist +4 -0
  34. data/spec/etl/batch_spec.rb +86 -0
  35. data/spec/etl/counter_spec.rb +44 -0
  36. data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
  37. data/spec/etl/key_builder_spec.rb +190 -0
  38. data/spec/etl/load_dataset_builder_spec.rb +86 -0
  39. data/spec/etl/mysql_dumpfile_spec.rb +42 -0
  40. data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
  41. data/spec/etl/screens/composite_screen_spec.rb +25 -0
  42. data/spec/etl/screens/invalid_element_spec.rb +27 -0
  43. data/spec/etl/screens/missing_value_spec.rb +58 -0
  44. data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
  45. data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
  46. data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
  47. data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
  48. data/spec/etl/sink_spec.rb +7 -0
  49. data/spec/etl/table_builder_spec.rb +22 -0
  50. data/spec/etl/task_spec.rb +87 -0
  51. data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
  52. data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
  53. data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
  54. data/spec/spec_helper.rb +20 -0
  55. metadata +245 -0
@@ -0,0 +1,45 @@
module Chicago
  module ETL
    # Builds the tables the ETL process uses for its own bookkeeping:
    # +etl_batches+ and +etl_task_invocations+.
    class TableBuilder
      # Creates the ETL tables in the given database. Tables that
      # already exist are left untouched.
      #
      # db must respond to +tables+ and +create_table+.
      def self.build(db)
        builder = new(db)
        builder.build
      end

      def initialize(db) # :nodoc:
        @db = db
      end

      def build # :nodoc:
        build_batches_table
        build_task_invocations_table
      end

      private

      # Defines the +etl_batches+ table.
      def build_batches_table
        create_table :etl_batches do
          primary_key :id, :type => :integer, :unsigned => true
          timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
          timestamp :finished_at, :null => true, :default => nil
          timestamp :extracted_to, :null => true, :default => nil
          enum :state, :null => false, :elements => %w{Started Finished Error}, :default => "Started"
        end
      end

      # Defines the +etl_task_invocations+ table. The unique index
      # allows only one row per batch/stage/name combination.
      def build_task_invocations_table
        create_table :etl_task_invocations do
          primary_key :id, :type => :integer, :unsigned => true
          integer :batch_id, :unsigned => true, :null => false
          enum :stage, :null => false, :elements => %w{Extract Transform Load}
          varchar :name, :null => false
          timestamp :started_at, :null => false, :default => :current_timestamp.sql_function
          timestamp :finished_at, :null => true, :default => nil
          enum :state, :null => false, :elements => %w{Created Started Finished Error}, :default => "Created"
          smallint :attempts, :null => false, :unsigned => true

          index [:batch_id, :stage, :name], :unique => true
        end
      end

      # Creates +table+ as an InnoDB table from the given schema block,
      # unless the database already lists a table with that name.
      def create_table(table, &block)
        return if @db.tables.include?(table)

        @db.create_table(table, :engine => "innodb", &block)
      end
    end
  end
end
@@ -0,0 +1,32 @@
module Chicago
  module ETL

    # Sequel model over the +etl_task_invocations+ table. Each
    # invocation belongs to a batch.
    class TaskInvocation < Sequel::Model
      set_dataset :etl_task_invocations
      many_to_one :batch

      # Executes a block of code as this task.
      #
      # Before yielding, the state is set to "Started" and the attempts
      # count is incremented. When the block returns, the state is set
      # to "Finished" and finished_at is set to the current time.
      #
      # If the block raises, the state is set to "Error", the batch (if
      # there is one) is put into error, and the exception is re-raised.
      #
      # Raises a RuntimeError without yielding if the task has already
      # finished.
      def perform
        if finished?
          raise RuntimeError, "The task #{name} in batch #{batch_id} has already run"
        end

        update(:state => "Started", :attempts => attempts + 1)

        begin
          yield
        rescue Exception => error
          # Exception is rescued (not only StandardError), so every kind
          # of failure is recorded before being propagated.
          update(:state => "Error")
          batch.error if batch
          raise error
        end

        update(:state => "Finished", :finished_at => Time.now)
      end

      # Returns true if this task has finished running successfully,
      # i.e. its state is "Finished".
      def finished?
        "Finished" == state
      end
    end

  end
end
@@ -0,0 +1,34 @@
1
+ require 'rake/tasklib'
2
+
module Chicago
  module ETL
    # ETL Rake tasks for a Chicago project.
    #
    # To use, add the following to your project's Rakefile:
    #
    #   Chicago::ETL::RakeTasks.new(db, schema)
    #
    # Provides the following tasks:
    #
    # +db:create_etl_tables+:: defines the tables used for ETL batches
    #                          and the like
    class RakeTasks < Rake::TaskLib
      # Stores the database and schema, then defines the tasks
      # immediately.
      def initialize(db, schema)
        @db, @schema = db, schema
        define
      end

      # Registers the tasks under the +db+ namespace. The task body
      # runs TableBuilder against the database given to the constructor.
      def define
        namespace(:db) do
          desc("Creates the etl tables")
          task(:create_etl_tables) { Chicago::ETL::TableBuilder.build(@db) }
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
module Chicago
  module ETL
    module Transformations
      # Transformation that stamps every row with the same insert
      # time, stored in UTC under the +:_inserted_at+ key.
      class AddInsertTimestamp
        # timestamp - the time to stamp rows with; defaults to the time
        #             this transformation is created.
        #
        # The UTC value is taken with +getutc+, which returns a new
        # object. +Time#utc+ would convert the caller's object in place.
        def initialize(timestamp=Time.now)
          @insert_timestamp = timestamp.getutc
        end

        # Sets +row[:_inserted_at]+ to the stored timestamp (modifying
        # the given row) and returns the row and errors as a pair.
        def call(row, errors=[])
          row[:_inserted_at] = @insert_timestamp
          [row, errors]
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # -*- coding: utf-8 -*-
2
+ module Chicago
3
+ module ETL
4
+ module Transformations
5
+ # Cleans and reformats UK-based postcodes.
6
+ #
7
+ # Transformations are based on observed errors in data entry, so
8
+ # shift key slips (i.e. typing '!' where '1' was meant) are
9
+ # corrected, as are use of numbers where letters were intended
10
+ # i.e. (0X -> OX for Oxfordshire postcodes).
11
+ #
12
+ # Leaves BFPO postcodes alone.
13
+ class UkPostCode
14
+ # Creates a new post code transformation.
15
+ #
16
+ # @param Symbol column_name - the name of the column
17
+ # containing the post code.
18
+ #
19
+ # @param Proc filter_block - an optional block, which takes a
20
+ # row. If the block returns false, the transformation will
21
+ # not be run. This can be useful to only run the
22
+ # transformation on UK addresses, based a country field in
23
+ # the row for example.
24
+ def initialize(column_name, &filter_block)
25
+ @column_name = column_name
26
+ @filter_block = filter_block
27
+ end
28
+
29
+ def call(row, errors=[])
30
+ return [row, errors] if @filter_block && !@filter_block.call(row)
31
+
32
+ row[@column_name] = UkPostCodeField.new.
33
+ normalize(row[@column_name])[:post_code]
34
+
35
+ [row, errors]
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,59 @@
1
+ # -*- coding: utf-8 -*-
2
+
module Chicago
  module ETL
    module Transformations
      # Cleans a raw UK post code and splits it into its parts.
      class UkPostCodeField
        # An outcode (e.g. "SW12") optionally followed by an incode
        # (e.g. "4GH"). Applied to a cleaned value with all whitespace
        # removed, and anchored to the whole string.
        MATCH = /\A([A-Z][A-Z]?[0-9][0-9A-Z]?)(?:([0-9][A-Z]{2}))?\z/

        # Returns cleaned, formatted data about a UK Post Code.
        #
        # Example:
        #
        #   UkPostCodeField.new.normalize(" SW !2 4 GH")
        #   # => { :post_code => "SW12 4GH",
        #          :outcode   => "SW12",
        #          :incode    => "4GH" }
        #
        # Partial postcodes will be returned without the incode. BFPO
        # postcodes are supported, but have no incode or
        # outcode. Postcodes that do not follow the format will be
        # returned as is, with an invalid key set. The same applies to
        # values that are not string-like, such as nil.
        def normalize(raw_post_code)
          invalid = {:post_code => raw_post_code, :invalid => true}
          # A missing value cannot be cleaned; report it as invalid
          # rather than raising NoMethodError.
          return invalid unless raw_post_code.respond_to?(:strip)

          reformat(clean(raw_post_code)) || invalid
        end

        private

        # Trims and upcases the value, then undoes common typing slips:
        # shifted digits (e.g. '!' for '1', '£' for '3') and a zero
        # typed for the letter O at the start of the post code.
        def clean(raw_post_code)
          raw_post_code.
            strip.
            upcase.
            tr('!"$%^&*()', '124567890').
            gsub("£", "3").
            sub(/\A0([XL])/, 'O\1').
            sub(/\A([PSCY])0/, '\1O')
        end

        # Formats a cleaned post code. BFPO codes only get their
        # spacing normalized; anything else is parsed as a standard
        # post code. Returns nil when a standard post code is malformed.
        def reformat(post_code)
          if post_code[0..3] == "BFPO"
            { :post_code => post_code.sub(/BFPO\s*/, "BFPO ") }
          else
            reformat_standard_post_code(post_code)
          end
        end

        # Splits a standard post code into outcode and incode, ignoring
        # any whitespace. Returns nil if it does not match MATCH.
        def reformat_standard_post_code(post_code)
          match = post_code.gsub(/\s+/,'').match(MATCH)
          return nil if match.nil?

          { :outcode => match[1],
            :incode => match[2],
            # A partial post code has no incode; strip drops the
            # trailing space the join would otherwise leave.
            :post_code => [match[1], match[2]].join(' ').strip }
        end
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require 'sequel'
2
+
3
+ require 'chicago/etl/counter'
4
+ require 'chicago/etl/key_builder'
5
+ require 'chicago/etl/sink'
6
+ require 'chicago/etl/mysql_load_file_value_transformer'
7
+ require 'chicago/etl/buffering_insert_writer'
8
+ require 'chicago/etl/mysql_dumpfile'
9
+
10
+ require 'chicago/etl/load_dataset_builder'
11
+
12
+ # Sequel Extensions
13
+ require 'chicago/etl/sequel/filter_to_etl_batch'
14
+ require 'chicago/etl/sequel/load_data_infile'
15
+ require 'chicago/etl/sequel/dependant_tables'
16
+
17
+ # Screens
18
+ require 'chicago/etl/screens/column_screen'
19
+ require 'chicago/etl/screens/composite_screen'
20
+ require 'chicago/etl/screens/missing_value'
21
+ require 'chicago/etl/screens/invalid_element'
22
+ require 'chicago/etl/screens/out_of_bounds'
23
+
24
+ # Transformations
25
+ require 'chicago/etl/transformations/add_insert_timestamp'
26
+ require 'chicago/etl/transformations/uk_post_code'
27
+ require 'chicago/etl/transformations/uk_post_code_field'
28
+
module Chicago
  module ETL
    # These constants are registered with autoload, so each file is
    # only loaded the first time its constant is referenced, unlike the
    # files required eagerly above.
    # NOTE(review): TaskInvocation subclasses Sequel::Model; presumably
    # loading is deferred so a database can be configured first -
    # confirm.
    autoload :TableBuilder,   'chicago/etl/table_builder.rb'
    autoload :Batch,          'chicago/etl/batch.rb'
    autoload :TaskInvocation, 'chicago/etl/task_invocation.rb'
  end
end
File without changes
@@ -0,0 +1,4 @@
1
+ adapter: mysql
2
+ username: root
3
+ socket: /tmp/mysql.sock
4
+ database: chicago_test_db
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+
# Specs for Chicago::ETL::Batch, run against TEST_DB.
describe Chicago::ETL::Batch do
  # Every example starts from freshly built ETL tables and without any
  # tmp directory left over from a previous example.
  before :each do
    TEST_DB.drop_table(*(TEST_DB.tables))
    ETL::TableBuilder.build(TEST_DB)
    ETL::Batch.db = TEST_DB
    Chicago.project_root = File.expand_path(File.join(File.dirname(__FILE__), ".."))
    tmpdir = File.expand_path(File.join(File.dirname(__FILE__), "..", "tmp"))
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
    FileUtils.rm_r(tmpdir) if File.exist?(tmpdir)
  end

  it "should return a new batch when instance is called and there are no outstanding batches in error" do
    ETL::Batch.instance.should be_new
  end

  it "should set the start timestamp of the batch to now when created" do
    ETL::Batch.instance.start.started_at.to_i.should == Time.now.to_i
  end

  it "should have a state of 'Started' when started" do
    ETL::Batch.instance.start.state.should == "Started"
  end

  it "should have a default extracted_to datetime of midnight (this morning)" do
    now = Time.now
    ETL::Batch.instance.start.extracted_to.should == Time.local(now.year, now.month, now.day, 0,0,0)
  end

  it "should be able to specify an extract to date" do
    now = Date.today - 1
    ETL::Batch.instance.start(now).extracted_to.should == Time.local(now.year, now.month, now.day, 0,0,0)
  end

  it "should create a directory tmp/batches/1 under the project root when created" do
    ETL::Batch.instance.start
    File.should be_directory(Chicago.project_root + "/tmp/batches/1")
  end

  it "should return the batch directory path from #dir" do
    ETL::Batch.instance.start.dir.should == Chicago.project_root + "/tmp/batches/1"
  end

  it "should set the finished_at timestamp when #finish is called" do
    batch = ETL::Batch.instance.start
    batch.finish
    batch.finished_at.should_not be_nil
    batch.state.should == "Finished"
  end

  # be_in_error is a predicate matcher, so it calls #in_error?.
  it "should return true from #in_error? if in the error state" do
    batch = ETL::Batch.instance.start
    batch.error
    batch.should be_in_error
  end

  it "should not return a new batch if the last batch was not finished" do
    batch = ETL::Batch.instance.start
    # Without .should the comparison result is discarded and the
    # example can never fail.
    ETL::Batch.instance.should == batch
  end

  it "should not return a new batch if the last batch ended in error" do
    batch = ETL::Batch.instance.start
    batch.error
    ETL::Batch.instance.should == batch
  end

  it "should create a log in tmp/batches/1/log" do
    ETL::Batch.instance.start
    File.read(Chicago.project_root + "/tmp/batches/1/log").
      should include("Started ETL batch 1.")
  end

  it "should perform a task only once" do
    batch = ETL::Batch.instance.start
    i = 0
    2.times { batch.perform_task("Transform", "Test") { i += 1} }
    i.should == 1
    batch.task_invocations_dataset.filter(:stage => "Transform", :name => "Test").count.should == 1
  end

  it "should not complain when given a symbol as the stage name" do
    batch = ETL::Batch.instance.start
    lambda { batch.perform_task(:transform, "Test") {} }.should_not raise_error(Sequel::DatabaseError)
  end
end
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
# Specs for Chicago::ETL::Counter: initial value handling, incrementing
# and behaviour under concurrent use.
describe Chicago::ETL::Counter do
  it "returns the next available key" do
    counter = described_class.new(3)

    counter.next.should == 4
    counter.next.should == 5
  end

  it "can have the initial key set via a block" do
    from_block = described_class.new { 1 + 1 }
    from_block.next.should == 3
  end

  it "defaults the counter to 0 if the block returns nil" do
    from_nil = described_class.new { nil }
    from_nil.next.should == 1
  end

  it "prefers the block to the argument for setting initial state" do
    both = described_class.new(5) { 2 }
    both.next.should == 3
  end

  it "can be constructed with no argument, implying 0" do
    described_class.new.next.should == 1
  end

  it "updates keys in a thread-safe fashion" do
    counter = described_class.new

    # The iteration count needs to be fairly large before lost updates
    # show up.
    workers = Array.new(3) do
      Thread.new { 100_000.times { counter.next } }
    end
    workers.each(&:join)

    counter.next.should == 300_001
  end

  it "has a current value" do
    described_class.new.current.should == 0
  end
end
@@ -0,0 +1,29 @@
module Chicago
  module ETL
    # Restricts a dataset to rows belonging to a single ETL batch.
    class EtlBatchIdDatasetFilter
      def initialize(etl_batch_id)
        @etl_batch_id = etl_batch_id
      end

      # Returns a new dataset, filtered so that the etl batch id
      # matches on any of the dataset's dependant tables that have an
      # +etl_batch_id+ column.
      #
      # NOTE(review): when no table has that column the condition is
      # nil and +dataset.filter(nil)+ is called - confirm that this is
      # the intended behaviour.
      def filter(dataset)
        batch_tables = filterable_tables(dataset)
        dataset.filter(conditions(batch_tables))
      end

      private

      # The dependant tables whose schema includes an +etl_batch_id+
      # column.
      def filterable_tables(dataset)
        dataset.dependant_tables.select do |table|
          column_names = dataset.db.schema(table).map(&:first)
          column_names.include?(:etl_batch_id)
        end
      end

      # One equality condition per table, combined with +|+.
      def conditions(tables)
        per_table = tables.map do |table|
          {:etl_batch_id.qualify(table) => @etl_batch_id}
        end

        per_table.inject { |combined, condition| combined | condition }
      end
    end
  end
end
@@ -0,0 +1,190 @@
1
+ require 'spec_helper'
2
+
# Specs for Chicago::ETL::KeyBuilder, covering the builders returned by
# for_table for three kinds of table: dimensions keyed by original_id,
# dimensions keyed by a hash of their natural key, and facts.
describe Chicago::ETL::KeyBuilder do
  # Schema shared by all examples: a :user dimension with an
  # original_id column, an :address dimension with an explicit natural
  # key, a :random dimension with no natural key, and an :addresses fact.
  before :all do
    @schema = Chicago::StarSchema.new
    @schema.define_dimension(:user) do
      columns { integer :original_id }
    end

    @schema.define_dimension(:address) do
      columns do
        string :line1
        string :post_code
      end

      natural_key :line1, :post_code
    end

    @schema.define_dimension(:random) do
      columns do
        string :foo
      end
    end

    @schema.define_fact(:addresses) do
      dimensions :user, :address
      natural_key :user, :address
    end
  end

  # By default any table lookup on the stubbed database returns a
  # dataset with no maximum key and no existing key mappings. Individual
  # examples override this for specific tables. The insert writer is
  # replaced with a null-object stub.
  before :each do
    @db = stub(:staging_database).as_null_object
    @db.stub(:[]).and_return(stub(:max => nil, :select_hash => {}))
    @writer = stub(:writer).as_null_object
    Chicago::ETL::BufferingInsertWriter.stub(:new).and_return(@writer)
  end

  describe "for identifiable dimensions" do
    before :each do
      @dimension = @schema.dimension(:user)
    end

    it "returns an incrementing key, given a row" do
      builder = described_class.for_table(@dimension, @db)
      builder.key(:original_id => 2).should == 1
      builder.key(:original_id => 3).should == 2
    end

    it "returns the same key for the same record" do
      builder = described_class.for_table(@dimension, @db)
      builder.key(:original_id => 2).should == 1
      builder.key(:original_id => 2).should == 1
    end

    it "takes into account the current maximum key in the database" do
      @db.stub(:[]).with(:keys_dimension_user).and_return(stub(:max => 2, :select_hash => {}))
      builder = described_class.for_table(@dimension, @db)
      builder.key(:original_id => 1).should == 3
    end

    it "returns previously created keys" do
      # The key table already maps original id 40 to dimension id 1.
      dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
      @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)

      builder = described_class.for_table(@dimension, @db)
      builder.key(:original_id => 30).should == 2
      builder.key(:original_id => 40).should == 1
    end

    it "raises an error when original_id isn't present in the row" do
      builder = described_class.for_table(@dimension, @db)
      expect { builder.key(:foo => :bar) }.to raise_error(Chicago::ETL::KeyError)
    end

    # The next three examples are marked pending.
    it "flushes new keys to a key table" do
      pending
      dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
      dataset.stub(:insert_replace => dataset)
      @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)

      dataset.should_receive(:multi_insert).
        with([{:original_id => 30, :dimension_id => 2}])

      builder = described_class.for_table(@dimension, @db)
      builder.key(:original_id => 30)
      builder.key(:original_id => 40)
      builder.flush
    end

    it "flushes new keys only once" do
      pending
      dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
      dataset.stub(:insert_replace => dataset)
      @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)

      dataset.should_receive(:multi_insert).
        with([{:original_id => 30, :dimension_id => 2}])
      dataset.should_receive(:multi_insert).with([])

      builder = described_class.for_table(@dimension, @db)
      builder.key(:original_id => 30)
      builder.key(:original_id => 40)
      builder.flush
      builder.flush
    end

    it "replaces old mappings with new values" do
      pending
      dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1}, :multi_insert => nil)
      @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)

      dataset.should_receive(:insert_replace).and_return(dataset)
      described_class.for_table(@dimension, @db).flush
    end
  end

  describe "for non-identifiable dimensions with natural keys" do
    before :each do
      @builder = described_class.for_table(@schema.dimension(:address), @db)
    end

    it "returns an incrementing key, given a row" do
      @builder.key(:line1 => "some street", :post_code => "TW3 X45").
        should == 1
      @builder.key(:line1 => "some road", :post_code => "TW3 X45").
        should == 2
    end

    it "returns the same incrementing key, ignoring case" do
      @builder.key(:line1 => "some street", :post_code => "TW3 X45").
        should == 1
      @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
        should == 1
    end

    it "can override default hash preparation" do
      # With an identity preparation step, values differing only in
      # case produce different keys.
      @builder.hash_preparation = lambda {|c| c }

      @builder.key(:line1 => "some street", :post_code => "TW3 X45").
        should == 1
      @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
        should == 2
    end

    it "inserts the hash as a binary literal" do
      # Yuck. Don't like the implementation test, but mock
      # expectations fail here for some reason, maybe because of the
      # Sequel::LiteralString?
      @builder.key_for_insert(@builder.original_key(:line1 => "some street", :post_code => "TW3 X45")).should == "0x817860F2417EB83D81FEA9D82E6B213A".lit
    end

    it "selects the Hex version of the binary column for the cache" do
      dataset = stub(:dataset, :max => 1).as_null_object
      @db.stub(:[]).with(:keys_dimension_address).and_return(dataset)
      # Rebuilt so the builder sees the dataset stubbed above.
      @builder = described_class.for_table(@schema.dimension(:address), @db)

      dataset.should_receive(:select_hash).with(:hex.sql_function(:original_id).as(:original_id), :dimension_id).and_return({})

      @builder.key(:line1 => "foo")
    end

    it "uses all columns as the natural key if one isn't defined" do
      described_class.
        for_table(@schema.dimension(:random), @db).
        original_key(:foo => "bar").
        should == "3D75EEC709B70A350E143492192A1736"
    end
  end

  describe "for facts" do
    before :each do
      @builder = described_class.for_table(@schema.fact(:addresses), @db)
    end

    it "increments the id, regardless of row equality" do
      @builder.key({}).should == 1
      @builder.key({}).should == 2
    end

    it "increments from the last id stored id in the fact table" do
      @db.stub(:[]).with(:facts_addresses).and_return(stub(:max => 100, :select_hash => {}))
      # Rebuilt so the builder sees the stubbed maximum id.
      @builder = described_class.for_table(@schema.fact(:addresses), @db)
      @builder.key({}).should == 101
    end

    it "supports the flush interface as a no-op" do
      lambda { @builder.flush }.should_not raise_error
    end
  end
end