chicago-etl 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +16 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +21 -0
  6. data/Rakefile +42 -0
  7. data/VERSION +1 -0
  8. data/chicago-etl.gemspec +117 -0
  9. data/lib/chicago/etl/batch.rb +110 -0
  10. data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
  11. data/lib/chicago/etl/counter.rb +36 -0
  12. data/lib/chicago/etl/key_builder.rb +198 -0
  13. data/lib/chicago/etl/load_dataset_builder.rb +75 -0
  14. data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
  15. data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
  16. data/lib/chicago/etl/screens/column_screen.rb +59 -0
  17. data/lib/chicago/etl/screens/composite_screen.rb +17 -0
  18. data/lib/chicago/etl/screens/invalid_element.rb +27 -0
  19. data/lib/chicago/etl/screens/missing_value.rb +22 -0
  20. data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
  21. data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
  22. data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
  23. data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
  24. data/lib/chicago/etl/sink.rb +61 -0
  25. data/lib/chicago/etl/table_builder.rb +45 -0
  26. data/lib/chicago/etl/task_invocation.rb +32 -0
  27. data/lib/chicago/etl/tasks.rb +34 -0
  28. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
  29. data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
  30. data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
  31. data/lib/chicago/etl.rb +35 -0
  32. data/lib/chicago-etl.rb +0 -0
  33. data/spec/db_connections.yml.dist +4 -0
  34. data/spec/etl/batch_spec.rb +86 -0
  35. data/spec/etl/counter_spec.rb +44 -0
  36. data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
  37. data/spec/etl/key_builder_spec.rb +190 -0
  38. data/spec/etl/load_dataset_builder_spec.rb +86 -0
  39. data/spec/etl/mysql_dumpfile_spec.rb +42 -0
  40. data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
  41. data/spec/etl/screens/composite_screen_spec.rb +25 -0
  42. data/spec/etl/screens/invalid_element_spec.rb +27 -0
  43. data/spec/etl/screens/missing_value_spec.rb +58 -0
  44. data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
  45. data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
  46. data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
  47. data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
  48. data/spec/etl/sink_spec.rb +7 -0
  49. data/spec/etl/table_builder_spec.rb +22 -0
  50. data/spec/etl/task_spec.rb +87 -0
  51. data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
  52. data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
  53. data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
  54. data/spec/spec_helper.rb +20 -0
  55. metadata +245 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
# Fetch gems over TLS: a plain-http source lets gem downloads be
# tampered with in transit.
source "https://rubygems.org"

# Runtime dependency.
gem "chicagowarehouse", "~> 0.4"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2"
  gem "timecop"
  gem "yard"
  gem "flog"
  gem "jeweler"
  # Coverage tooling differs per interpreter: rcov on MRI 1.8,
  # simplecov on MRI 1.9.
  gem "rcov", :platforms => :mri_18
  gem "simplecov", :platforms => :mri_19
  gem "ZenTest"
end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 notonthehighstreet.com
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,21 @@
1
+ = chicago-etl
2
+
3
+ HIGHLY EXPERIMENTAL. If you use this, you'll find that things will vanish without warning and you'll be terrified.
4
+
5
+ An ETL pipeline for use with Chicago Warehouse.
6
+
7
+ == Contributing to chicago-etl
8
+
9
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
10
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
11
+ * Fork the project.
12
+ * Start a feature/bugfix branch.
13
+ * Commit and push until you are happy with your contribution.
14
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
15
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
16
+
17
+ == Copyright
18
+
19
+ Copyright (c) 2012 notonthehighstreet.com. See LICENSE.txt for
20
+ further details.
21
+
data/Rakefile ADDED
@@ -0,0 +1,42 @@
1
# encoding: utf-8

require 'rubygems'
require 'bundler'

# Activate the default and development gem groups. If Bundler cannot
# resolve them, print its message plus a hint and exit with the status
# code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end
require 'rake'

require 'jeweler'
# Gem packaging tasks. The block argument is a Gem::Specification;
# see http://docs.rubygems.org/read/chapter/20 for more options.
Jeweler::Tasks.new do |gemspec|
  gemspec.name = "chicago-etl"
  gemspec.homepage = "http://github.com/notonthehighstreet/chicago-etl"
  gemspec.license = "MIT"
  gemspec.summary = "Chicago ETL"
  gemspec.description = "ETL tools for Chicago"
  gemspec.email = "roland.swingler@gmail.com"
  gemspec.authors = ["Roland Swingler"]
  # Dependencies are defined in the Gemfile.
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# rake spec - runs every *_spec.rb file under spec/.
RSpec::Core::RakeTask.new(:spec) do |t|
  t.pattern = FileList['spec/**/*_spec.rb']
end

# rake rcov - runs the same specs with rcov coverage switched on.
RSpec::Core::RakeTask.new(:rcov) do |t|
  t.pattern = 'spec/**/*_spec.rb'
  t.rcov = true
end

# A bare `rake` runs the specs.
task :default => :spec

require 'yard'
YARD::Rake::YardocTask.new
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.9
@@ -0,0 +1,117 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "chicago-etl"
8
+ s.version = "0.0.9"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Roland Swingler"]
12
+ s.date = "2013-02-19"
13
+ s.description = "ETL tools for Chicago"
14
+ s.email = "roland.swingler@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "chicago-etl.gemspec",
28
+ "lib/chicago-etl.rb",
29
+ "lib/chicago/etl.rb",
30
+ "lib/chicago/etl/batch.rb",
31
+ "lib/chicago/etl/buffering_insert_writer.rb",
32
+ "lib/chicago/etl/counter.rb",
33
+ "lib/chicago/etl/key_builder.rb",
34
+ "lib/chicago/etl/load_dataset_builder.rb",
35
+ "lib/chicago/etl/mysql_dumpfile.rb",
36
+ "lib/chicago/etl/mysql_load_file_value_transformer.rb",
37
+ "lib/chicago/etl/screens/column_screen.rb",
38
+ "lib/chicago/etl/screens/composite_screen.rb",
39
+ "lib/chicago/etl/screens/invalid_element.rb",
40
+ "lib/chicago/etl/screens/missing_value.rb",
41
+ "lib/chicago/etl/screens/out_of_bounds.rb",
42
+ "lib/chicago/etl/sequel/dependant_tables.rb",
43
+ "lib/chicago/etl/sequel/filter_to_etl_batch.rb",
44
+ "lib/chicago/etl/sequel/load_data_infile.rb",
45
+ "lib/chicago/etl/sink.rb",
46
+ "lib/chicago/etl/table_builder.rb",
47
+ "lib/chicago/etl/task_invocation.rb",
48
+ "lib/chicago/etl/tasks.rb",
49
+ "lib/chicago/etl/transformations/add_insert_timestamp.rb",
50
+ "lib/chicago/etl/transformations/uk_post_code.rb",
51
+ "lib/chicago/etl/transformations/uk_post_code_field.rb",
52
+ "spec/db_connections.yml.dist",
53
+ "spec/etl/batch_spec.rb",
54
+ "spec/etl/counter_spec.rb",
55
+ "spec/etl/etl_batch_id_dataset_filter.rb",
56
+ "spec/etl/key_builder_spec.rb",
57
+ "spec/etl/load_dataset_builder_spec.rb",
58
+ "spec/etl/mysql_dumpfile_spec.rb",
59
+ "spec/etl/mysql_load_file_value_transformer_spec.rb",
60
+ "spec/etl/screens/composite_screen_spec.rb",
61
+ "spec/etl/screens/invalid_element_spec.rb",
62
+ "spec/etl/screens/missing_value_spec.rb",
63
+ "spec/etl/screens/out_of_bounds_spec.rb",
64
+ "spec/etl/sequel/dependant_tables_spec.rb",
65
+ "spec/etl/sequel/filter_to_etl_batch_spec.rb",
66
+ "spec/etl/sequel/load_data_infile_spec.rb",
67
+ "spec/etl/sink_spec.rb",
68
+ "spec/etl/table_builder_spec.rb",
69
+ "spec/etl/task_spec.rb",
70
+ "spec/etl/transformations/add_insert_timestamp_spec.rb",
71
+ "spec/etl/transformations/uk_post_code_field_spec.rb",
72
+ "spec/etl/transformations/uk_post_code_spec.rb",
73
+ "spec/spec_helper.rb"
74
+ ]
75
+ s.homepage = "http://github.com/notonthehighstreet/chicago-etl"
76
+ s.licenses = ["MIT"]
77
+ s.require_paths = ["lib"]
78
+ s.rubygems_version = "1.8.25"
79
+ s.summary = "Chicago ETL"
80
+
81
+ if s.respond_to? :specification_version then
82
+ s.specification_version = 3
83
+
84
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
85
+ s.add_runtime_dependency(%q<chicagowarehouse>, ["~> 0.4"])
86
+ s.add_development_dependency(%q<rspec>, ["~> 2"])
87
+ s.add_development_dependency(%q<timecop>, [">= 0"])
88
+ s.add_development_dependency(%q<yard>, [">= 0"])
89
+ s.add_development_dependency(%q<flog>, [">= 0"])
90
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
91
+ s.add_development_dependency(%q<rcov>, [">= 0"])
92
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
93
+ s.add_development_dependency(%q<ZenTest>, [">= 0"])
94
+ else
95
+ s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
96
+ s.add_dependency(%q<rspec>, ["~> 2"])
97
+ s.add_dependency(%q<timecop>, [">= 0"])
98
+ s.add_dependency(%q<yard>, [">= 0"])
99
+ s.add_dependency(%q<flog>, [">= 0"])
100
+ s.add_dependency(%q<jeweler>, [">= 0"])
101
+ s.add_dependency(%q<rcov>, [">= 0"])
102
+ s.add_dependency(%q<simplecov>, [">= 0"])
103
+ s.add_dependency(%q<ZenTest>, [">= 0"])
104
+ end
105
+ else
106
+ s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
107
+ s.add_dependency(%q<rspec>, ["~> 2"])
108
+ s.add_dependency(%q<timecop>, [">= 0"])
109
+ s.add_dependency(%q<yard>, [">= 0"])
110
+ s.add_dependency(%q<flog>, [">= 0"])
111
+ s.add_dependency(%q<jeweler>, [">= 0"])
112
+ s.add_dependency(%q<rcov>, [">= 0"])
113
+ s.add_dependency(%q<simplecov>, [">= 0"])
114
+ s.add_dependency(%q<ZenTest>, [">= 0"])
115
+ end
116
+ end
117
+
@@ -0,0 +1,110 @@
1
+ require 'fileutils'
2
+ require 'logger'
3
+
4
module Chicago
  module ETL
    # A particular "run" of the ETL process.
    #
    # All ETL tasks should be executed in the context of a Batch.
    #
    # A batch creates a temporary directory under tmp/batches/:id
    # where it stores various logs and extract files.
    class Batch < Sequel::Model
      set_dataset :etl_batches

      one_to_many :task_invocations

      class << self
        # Returns the Batch that should be used for the ETL process.
        #
        # A new batch is returned, unless the previous batch did not
        # finish successfully, in which case the previous batch is
        # returned so that it can be resumed.
        #
        # This should be used in preference to new or create.
        def instance
          # Fetch the last batch once, so the finished? check and the
          # returned object come from the same query result.
          previous = last_batch
          (previous.nil? || previous.finished?) ? new : previous
        end

        # Returns the last batch run (ordered by started_at), or nil
        # if this is the first batch.
        def last_batch
          order(:started_at).last
        end
      end

      # Deprecated. Use perform_task(:load, task_name) instead.
      def load(task_name, &block)
        perform_task(:load, task_name, &block)
      end

      # Deprecated. Use perform_task(:transform, task_name) instead.
      #
      # Recorded under the "transform" stage. Previously this was
      # recorded as "extract", which made a transform task share its
      # invocation record with an extract task of the same name, so a
      # finished extract caused the transform to be skipped.
      def transform(task_name, &block)
        perform_task(:transform, task_name, &block)
      end

      # Deprecated. Use perform_task(:extract, task_name) instead.
      def extract(task_name, &block)
        perform_task(:extract, task_name, &block)
      end

      # Perform a named task if it hasn't already run successfully in
      # this batch.
      #
      # stage and task_name together identify the task invocation
      # record; the block is handed to the invocation's perform method.
      # Returns nil without running the block when the task has
      # already finished.
      def perform_task(stage, task_name, &block)
        task = find_or_create_task_invocation(stage, task_name)
        task.perform(&block) unless task.finished?
      end

      # Returns the directory files & batch logs will be written to.
      #
      # The path is built from this batch's id and memoized on first
      # use, so it should only be asked for once the batch has an id.
      def dir
        @dir ||= File.join(Chicago.project_root, "tmp", "batches", id.to_s)
      end

      # Starts this batch.
      #
      # Sets extracted_to to extract_to (today's date when not given),
      # saves the batch, and logs whether the batch was started or
      # resumed, depending on whether its state is "Started".
      #
      # Returns self.
      def start(extract_to=nil)
        self.extracted_to = extract_to || Date.today
        save
        if state == "Started"
          log.info "Started ETL batch #{id}."
        else
          log.info "Resumed ETL batch #{id}."
        end
        self
      end

      # Finishes this batch, and sets the finished_at timestamp.
      def finish
        update(:state => "Finished", :finished_at => Time.now)
      end

      # Sets this batch to the Error state.
      def error
        update(:state => "Error")
      end

      # Returns true if this batch is finished.
      def finished?
        state == "Finished"
      end

      # Returns true if in the error state
      def in_error?
        state == "Error"
      end

      # Returns the logger for this batch, which writes to the file
      # "log" inside the batch directory.
      def log
        @log ||= Logger.new(File.join(dir, "log"))
      end

      # Creates the batch directory once the record has been created.
      def after_create # :nodoc:
        # Sequel hooks must call super, otherwise hooks installed by
        # plugins or superclasses are silently skipped.
        super
        FileUtils.mkdir_p(dir, :mode => 0777)
      end

      private

      # Finds this batch's task invocation for the (downcased) stage
      # and name, creating it if it does not exist yet.
      def find_or_create_task_invocation(stage, name)
        attrs = {:stage => stage.to_s.downcase, :name => name.to_s}
        task_invocations_dataset.filter(attrs).first || add_task_invocation(attrs)
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'chicago/etl/sink'
2
+
3
module Chicago
  module ETL
    # Wrapper around a dataset to allow buffered inserts.
    #
    # Rows are collected in memory and written to the dataset in bulk,
    # either when BUFFER_SIZE rows have accumulated or when flush is
    # called explicitly.
    #
    # @api public
    class BufferingInsertWriter < Sink
      # The number of rows written before inserting to the DB.
      BUFFER_SIZE = 10_000

      # dataset is the dataset rows are inserted into; column_names
      # and key are passed on to Sink, together with an empty Array
      # that serves as the in-memory row buffer.
      def initialize(dataset, column_names, key=nil)
        super([], column_names, key)
        @dataset = dataset
      end

      # Writes all buffered rows to the dataset with a single
      # insert_replace import, then empties the buffer.
      def flush
        @dataset.insert_replace.import(column_names, output)
        output.clear
      end

      protected

      # Buffers the row's values, in column_names order, and flushes
      # once the buffer limit has been reached.
      def write(row)
        # Use the column_names reader, as flush does, rather than
        # reaching into the superclass's instance variable.
        output << column_names.map {|name| row[name] }
        flush if reached_buffer_limit?
      end

      private

      def reached_buffer_limit?
        output.size >= BUFFER_SIZE
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'thread'
2
+
3
module Chicago
  module ETL
    # Provides a thread-safe wrapper around an incrementing number.
    #
    # Intended to be used for key builders, rather than using the
    # database's AUTO INCREMENT functionality.
    #
    # @api private
    class Counter
      # Returns the current number this counter is on.
      #
      # This is nil for a counter created with a block until next has
      # been called for the first time.
      attr_reader :current

      # Creates a new counter, optionally with a starting count.
      #
      # If a block is given, it is called lazily, on the first call to
      # next, and its result (0 if it returns nil) is used as the
      # starting count; current_number is ignored in that case.
      # A nil current_number is treated as 0.
      def initialize(current_number=0, &block)
        @mutex = Mutex.new
        if block
          @block = block
        else
          @current = current_number || 0
        end
      end

      # Returns the next number.
      #
      # Modifies the current state of the counter.
      def next
        @mutex.synchronize do
          # The lazy seeding must happen under the lock: otherwise two
          # threads making the first call could both run the block,
          # and the slower one would reset a count that has already
          # been incremented, handing out duplicate numbers.
          @current = (@block.call || 0) if @current.nil?
          @current += 1
        end
      end
    end
  end
end
@@ -0,0 +1,198 @@
1
+ require 'digest/md5'
2
+ require 'chicago/etl/buffering_insert_writer'
3
+
4
module Chicago
  module ETL
    # Builds a surrogate key for a dimension record, without relying
    # on the database's AUTO_INCREMENT functionality.
    #
    # We avoid AUTO_INCREMENT because we need to be able to get the
    # key mappings without having anything to do with the database -
    # this allows us to use bulk load.
    #
    # This class is a template: subclasses supply original_key,
    # key_for_insert and original_key_select_fragment. Obtain
    # instances via KeyBuilder.for_table.
    #
    # @api public
    class KeyBuilder
      # Picks and constructs the key builder that suits a schema table.
      #
      # @api private
      class Factory
        attr_reader :table, :staging_db

        def initialize(table, staging_db)
          @table = table
          @staging_db = staging_db
        end

        # Returns a key builder for the table:
        #
        # * identifiable dimensions get an IdentifiableDimensionKeyBuilder
        # * other dimensions get a HashingKeyBuilder
        # * facts get a FactKeyBuilder
        #
        # Returns nil if the table is neither a dimension nor a fact.
        def make
          if dimension?
            key_table = staging_db[table.key_table_name]
            key_sink = BufferingInsertWriter.new(key_table,
                                                 [:original_id, :dimension_id])

            if table.identifiable?
              IdentifiableDimensionKeyBuilder.new(key_table, key_sink)
            else
              HashingKeyBuilder.new(key_table, key_sink, columns_to_hash)
            end
          elsif fact?
            FactKeyBuilder.new(staging_db[table.table_name])
          end
        end

        private

        def dimension?
          table.kind_of?(Chicago::Schema::Dimension)
        end

        def fact?
          table.kind_of?(Chicago::Schema::Fact)
        end

        # The table's natural key, or the names of all of its columns
        # when no natural key is defined.
        def columns_to_hash
          if table.natural_key.nil?
            table.columns.map(&:name)
          else
            table.natural_key
          end
        end
      end

      # Returns an appropriate key builder for a schema table, using
      # the staging database for key management where necessary.
      def self.for_table(table, staging_db)
        Factory.new(table, staging_db).make
      end

      # key_table is the dataset holding the existing
      # original_id -> dimension_id mappings; key_sink receives the
      # mappings created by this builder.
      def initialize(key_table, key_sink)
        @key_table = key_table
        @new_keys = key_sink
        # New keys continue from the highest dimension_id in the key
        # table; the lookup is deferred until the first key is needed.
        @counter = Counter.new { key_table.max(:dimension_id) }
      end

      # Returns a surrogate key, given a record row.
      #
      # Rows whose original key has been seen before get the key they
      # were given previously; otherwise a new key is allocated,
      # written to the key sink and remembered.
      #
      # @raise [KeyError] (from subclasses) if the surrogate key
      #   cannot be determined from the row data.
      def key(row)
        fetch_cache unless @key_mapping
        row_id = original_key(row)
        existing_key = @key_mapping[row_id]
        return existing_key if existing_key

        new_key = @counter.next
        @new_keys << {
          :original_id => key_for_insert(row_id),
          :dimension_id => new_key
        }
        @key_mapping[row_id] = new_key
      end

      # Returns the original key for the row.
      #
      # Overridden by subclasses.
      def original_key(row)
      end

      # Flushes any newly created keys to the key table.
      def flush
        @new_keys.flush
      end

      protected

      attr_reader :key_table

      # Loads the existing original key -> dimension_id mappings from
      # the key table into memory.
      def fetch_cache
        @key_mapping = key_table.
          select_hash(original_key_select_fragment, :dimension_id)
      end
    end

    # Key builder for identifiable dimensions.
    #
    # This should not be instantiated directly, use
    # KeyBuilder.for_table.
    #
    # @api private
    class IdentifiableDimensionKeyBuilder < KeyBuilder
      # See KeyBuilder#key.
      #
      # @raise [KeyError] if the row has no :original_id field.
      def key(row)
        raise KeyError, "Row does not have an original_id field" unless row.key?(:original_id)
        super
      end

      def original_key(row)
        row[:original_id]
      end

      # The original id is stored in the key table unchanged.
      def key_for_insert(original_id)
        original_id
      end

      def original_key_select_fragment
        :original_id
      end
    end

    # Key builder for dimensions with natural keys, but no simple
    # key.
    #
    # The original key is the upper-cased MD5 hex digest of the row's
    # prepared column values.
    #
    # This should not be instantiated directly, use
    # KeyBuilder.for_table.
    #
    # @api private
    class HashingKeyBuilder < KeyBuilder
      # The names of the row fields that are hashed.
      attr_reader :columns

      # A callable applied to each column value before hashing.
      # Defaults to converting the value to an upper-cased String.
      attr_accessor :hash_preparation

      def initialize(key_table, key_sink, columns)
        super(key_table, key_sink)
        @columns = columns
        @hash_preparation = lambda {|column| column.to_s.upcase }
      end

      # NOTE(review): the prepared values are joined without a
      # separator, so e.g. ("ab", "c") and ("a", "bc") hash to the
      # same key. Changing this would invalidate stored keys.
      def original_key(row)
        str = columns.map {|column| hash_preparation.call(row[column]) }.join
        Digest::MD5.hexdigest(str).upcase
      end

      # Returns the digest as a literal SQL hex value ("0x...").
      def key_for_insert(original_id)
        ("0x" + original_id).lit
      end

      # Selects the stored key back as hex, aliased as original_id, so
      # that it can be compared with the digests from original_key.
      def original_key_select_fragment
        :hex.sql_function(:original_id).as(:original_id)
      end
    end

    # Returns ids for Fact tables.
    #
    # Fact table surrogate ids are transient - there is no expectation
    # that the same fact row will have the same id between
    # invocations. This is ok, because all facts should have a natural
    # key defined - the id generated by this is purely for convenience
    # and linking to error rows.
    #
    # As a result fact keys aren't stored in a key table - they are
    # never referenced by any other tables in the system.
    #
    # In addition, the same row passed twice will get a different id.
    class FactKeyBuilder
      # db_table is the fact table's dataset; ids continue from its
      # current max(:id). key_sink is accepted but ignored, as fact
      # keys are never stored.
      def initialize(db_table, key_sink=nil)
        @db_table = db_table
        @counter = Counter.new { @db_table.max(:id) }
      end

      # Returns an id given a row - the row actually has no bearing on
      # the id returned.
      def key(row)
        @counter.next
      end

      # No-op, provided for interface compatibility.
      def flush
      end
    end
  end
end