activewarehouse-etl 0.9.1 → 0.9.5.rc1

Files changed (135)
  1. data/.gitignore +7 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +182 -150
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +9 -0
  6. data/README +18 -2
  7. data/Rakefile +35 -91
  8. data/active_support_logger.patch +78 -0
  9. data/activewarehouse-etl.gemspec +30 -0
  10. data/lib/etl.rb +10 -2
  11. data/lib/etl/batch/directives.rb +11 -1
  12. data/lib/etl/control/control.rb +2 -2
  13. data/lib/etl/control/destination.rb +27 -7
  14. data/lib/etl/control/destination/database_destination.rb +8 -6
  15. data/lib/etl/control/destination/excel_destination.rb +91 -0
  16. data/lib/etl/control/destination/file_destination.rb +6 -4
  17. data/lib/etl/control/destination/insert_update_database_destination.rb +133 -0
  18. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  19. data/lib/etl/control/source.rb +3 -2
  20. data/lib/etl/control/source/database_source.rb +14 -10
  21. data/lib/etl/control/source/file_source.rb +2 -2
  22. data/lib/etl/engine.rb +17 -15
  23. data/lib/etl/execution.rb +0 -1
  24. data/lib/etl/execution/batch.rb +3 -1
  25. data/lib/etl/execution/migration.rb +5 -0
  26. data/lib/etl/parser/delimited_parser.rb +20 -1
  27. data/lib/etl/parser/excel_parser.rb +112 -0
  28. data/lib/etl/processor/bulk_import_processor.rb +4 -2
  29. data/lib/etl/processor/database_join_processor.rb +68 -0
  30. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  31. data/lib/etl/processor/filter_row_processor.rb +51 -0
  32. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  33. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  34. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  35. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  36. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  37. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  38. data/lib/etl/processor/zip_file_processor.rb +27 -0
  39. data/lib/etl/transform/calculation_transform.rb +71 -0
  40. data/lib/etl/transform/foreign_key_lookup_transform.rb +25 -7
  41. data/lib/etl/transform/ordinalize_transform.rb +3 -1
  42. data/lib/etl/transform/split_fields_transform.rb +27 -0
  43. data/lib/etl/version.rb +1 -7
  44. data/test-matrix.yml +10 -0
  45. data/test/.gitignore +1 -0
  46. data/test/.ignore +2 -0
  47. data/test/all.ebf +6 -0
  48. data/test/apache_combined_log.ctl +11 -0
  49. data/test/batch_test.rb +41 -0
  50. data/test/batch_with_error.ebf +6 -0
  51. data/test/batched1.ctl +0 -0
  52. data/test/batched2.ctl +0 -0
  53. data/test/block_processor.ctl +6 -0
  54. data/test/block_processor_error.ctl +1 -0
  55. data/test/block_processor_pre_post_process.ctl +4 -0
  56. data/test/block_processor_remove_rows.ctl +5 -0
  57. data/test/block_processor_test.rb +38 -0
  58. data/test/config/Gemfile.rails-2.3.x +3 -0
  59. data/test/config/Gemfile.rails-2.3.x.lock +38 -0
  60. data/test/config/Gemfile.rails-3.0.x +3 -0
  61. data/test/config/Gemfile.rails-3.0.x.lock +49 -0
  62. data/test/config/common.rb +21 -0
  63. data/test/connection/mysql/connection.rb +9 -0
  64. data/test/connection/mysql/schema.sql +36 -0
  65. data/test/connection/postgresql/connection.rb +13 -0
  66. data/test/connection/postgresql/schema.sql +39 -0
  67. data/test/control_test.rb +43 -0
  68. data/test/data/apache_combined_log.txt +3 -0
  69. data/test/data/bulk_import.txt +3 -0
  70. data/test/data/bulk_import_with_empties.txt +3 -0
  71. data/test/data/decode.txt +3 -0
  72. data/test/data/delimited.txt +3 -0
  73. data/test/data/encode_source_latin1.txt +2 -0
  74. data/test/data/excel.xls +0 -0
  75. data/test/data/excel2.xls +0 -0
  76. data/test/data/fixed_width.txt +3 -0
  77. data/test/data/multiple_delimited_1.txt +3 -0
  78. data/test/data/multiple_delimited_2.txt +3 -0
  79. data/test/data/people.txt +3 -0
  80. data/test/data/sax.xml +14 -0
  81. data/test/data/xml.xml +16 -0
  82. data/test/date_dimension_builder_test.rb +96 -0
  83. data/test/delimited.ctl +30 -0
  84. data/test/delimited_absolute.ctl +33 -0
  85. data/test/delimited_destination_db.ctl +25 -0
  86. data/test/delimited_excel.ctl +31 -0
  87. data/test/delimited_insert_update.ctl +34 -0
  88. data/test/delimited_update.ctl +34 -0
  89. data/test/delimited_with_bulk_load.ctl +34 -0
  90. data/test/destination_test.rb +275 -0
  91. data/test/directive_test.rb +23 -0
  92. data/test/encode_processor_test.rb +32 -0
  93. data/test/engine_test.rb +32 -0
  94. data/test/errors.ctl +24 -0
  95. data/test/etl_test.rb +42 -0
  96. data/test/excel.ctl +24 -0
  97. data/test/excel2.ctl +25 -0
  98. data/test/fixed_width.ctl +35 -0
  99. data/test/generator_test.rb +14 -0
  100. data/test/inline_parser.ctl +17 -0
  101. data/test/mocks/mock_destination.rb +26 -0
  102. data/test/mocks/mock_source.rb +25 -0
  103. data/test/model_source.ctl +14 -0
  104. data/test/multiple_delimited.ctl +22 -0
  105. data/test/multiple_source_delimited.ctl +39 -0
  106. data/test/parser_test.rb +224 -0
  107. data/test/performance/delimited.ctl +30 -0
  108. data/test/processor_test.rb +44 -0
  109. data/test/row_processor_test.rb +17 -0
  110. data/test/sax.ctl +26 -0
  111. data/test/scd/1.txt +1 -0
  112. data/test/scd/2.txt +1 -0
  113. data/test/scd/3.txt +1 -0
  114. data/test/scd_test.rb +257 -0
  115. data/test/scd_test_type_1.ctl +43 -0
  116. data/test/scd_test_type_2.ctl +34 -0
  117. data/test/screen_test.rb +9 -0
  118. data/test/screen_test_error.ctl +3 -0
  119. data/test/screen_test_fatal.ctl +3 -0
  120. data/test/source_test.rb +139 -0
  121. data/test/test_helper.rb +34 -0
  122. data/test/transform_test.rb +101 -0
  123. data/test/vendor/adapter_extensions-0.5.0/CHANGELOG +26 -0
  124. data/test/vendor/adapter_extensions-0.5.0/LICENSE +16 -0
  125. data/test/vendor/adapter_extensions-0.5.0/README +7 -0
  126. data/test/vendor/adapter_extensions-0.5.0/Rakefile +158 -0
  127. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions.rb +12 -0
  128. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/abstract_adapter.rb +44 -0
  129. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/mysql_adapter.rb +63 -0
  130. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/postgresql_adapter.rb +52 -0
  131. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/sqlserver_adapter.rb +44 -0
  132. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/version.rb +10 -0
  133. data/test/xml.ctl +31 -0
  134. metadata +229 -70
  135. data/lib/etl/execution/record.rb +0 -18
data/active_support_logger.patch ADDED
@@ -0,0 +1,78 @@
+ Index: lib/active_support/clean_logger.rb
+ ===================================================================
+ --- lib/active_support/clean_logger.rb (revision 5963)
+ +++ lib/active_support/clean_logger.rb (working copy)
+ @@ -1,10 +1,21 @@
+ require 'logger'
+ require File.dirname(__FILE__) + '/core_ext/class/attribute_accessors'
+
+ -class Logger #:nodoc:
+ +# Extensions to the built in Ruby logger.
+ +#
+ +# If you want to use the default log formatter as defined in the Ruby core, then you
+ +# will need to set the formatter for the logger as in:
+ +#
+ +# logger.formatter = Formatter.new
+ +#
+ +# You can then specify the datetime format, for example:
+ +#
+ +# logger.datetime_format = "%Y-%m-%d"
+ +class Logger
+ + # Set to false to disable the silencer
+ cattr_accessor :silencer
+ self.silencer = true
+ -
+ +
+ # Silences the logger for the duration of the block.
+ def silence(temporary_level = Logger::ERROR)
+ if silencer
+ @@ -18,6 +29,35 @@
+ yield self
+ end
+ end
+ +
+ + alias :old_datetime_format= :datetime_format=
+ + # Logging date-time format (string passed to +strftime+). Ignored if the formatter
+ + # does not respond to datetime_format=.
+ + def datetime_format=(datetime_format)
+ + formatter.datetime_format = datetime_format if formatter.respond_to?(:datetime_format=)
+ + end
+ +
+ + alias :old_datetime_format :datetime_format
+ + # Get the logging datetime format. Returns nil if the formatter does not support
+ + # datetime formatting.
+ + def datetime_format
+ + formatter.datetime_format if formatter.respond_to?(:datetime_format)
+ + end
+ +
+ + alias :old_formatter :formatter
+ + # Get the current formatter. The default formatter is a SimpleFormatter which only
+ + # displays the log message
+ + def formatter
+ + @formatter ||= SimpleFormatter.new
+ + end
+ +
+ + # Simple formatter which only displays the message.
+ + class SimpleFormatter < Logger::Formatter
+ + # This method is invoked when a log event occurs
+ + def call(severity, timestamp, progname, msg)
+ + "#{msg}\n"
+ + end
+ + end
+
+ private
+ alias old_format_message format_message
+ @@ -28,11 +68,11 @@
+ # with Logger from 1.8.3 and vice versa.
+ if method_defined?(:formatter=)
+ def format_message(severity, timestamp, progname, msg)
+ - "#{msg}\n"
+ + formatter.call(severity, timestamp, progname, msg)
+ end
+ else
+ def format_message(severity, timestamp, msg, progname)
+ - "#{msg}\n"
+ + formatter.call(severity, timestamp, progname, msg)
+ end
+ end
+ end
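For context, the behaviour this patch documents can be exercised as follows; a minimal sketch assuming the patch has been applied to ActiveSupport's clean_logger (the log messages are made up):

  require 'logger'

  logger = Logger.new(STDOUT)

  # With the patch in place the default formatter is Logger::SimpleFormatter,
  # so only the message itself is printed.
  logger.info "Executing insert"

  # Switch back to the stock Ruby formatter and pick a datetime format,
  # exactly as the patched documentation describes.
  logger.formatter = Logger::Formatter.new
  logger.datetime_format = "%Y-%m-%d"
  logger.info "Executing insert"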
data/activewarehouse-etl.gemspec ADDED
@@ -0,0 +1,30 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib/', __FILE__)
+ $:.unshift lib unless $:.include?(lib)
+
+ require 'etl/version'
+
+ Gem::Specification.new do |s|
+ s.name = %q{activewarehouse-etl}
+ s.version = ETL::VERSION
+ s.platform = Gem::Platform::RUBY
+ s.authors = ["Anthony Eden", "Thibaut Barrère"]
+ s.email = ["thibaut.barrere@gmail.com"]
+ s.homepage = "https://github.com/activewarehouse/activewarehouse-etl"
+ s.summary = %q{Pure Ruby ETL package.}
+ s.description = %q{ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.}
+
+ s.required_rubygems_version = ">= 1.3.6"
+
+ s.add_runtime_dependency('rake', '>= 0.8.3')
+ s.add_runtime_dependency('activesupport', '>= 2.1.0')
+ s.add_runtime_dependency('activerecord', '>= 2.1.0')
+ s.add_runtime_dependency('fastercsv', '>= 1.2.0')
+ s.add_runtime_dependency('adapter_extensions', '>= 0.5.0')
+ s.add_runtime_dependency('spreadsheet')
+
+ s.files = `git ls-files`.split("\n")
+ s.test_files = `git ls-files -- {test}/*`.split("\n")
+ s.executables = %w(etl)
+ s.require_path = "lib"
+ end
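Since the gemspec resolves its version from lib/etl/version.rb and shells out to git ls-files for the file list, it can be sanity-checked from the root of a git checkout; a small sketch, not part of the gem itself:

  # Loads the spec file and prints a few of its fields.
  spec = Gem::Specification.load('activewarehouse-etl.gemspec')
  puts spec.name                                   # => "activewarehouse-etl"
  puts spec.version                                # picked up from ETL::VERSION
  puts spec.dependencies.map(&:name).sort.inspect  # runtime dependencies listed above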
data/lib/etl.rb CHANGED
@@ -33,13 +33,21 @@ require 'rubygems'
 
 unless defined?(REXML::VERSION)
 require 'rexml/rexml'
- REXML::VERSION = REXML::Version
+ unless defined?(REXML::VERSION)
+ REXML::VERSION = REXML::Version
+ end
 end
 
 require 'active_support'
 require 'active_record'
 require 'adapter_extensions'
- require 'faster_csv'
+
+ if RUBY_VERSION < '1.9'
+ require 'faster_csv'
+ CSV = FasterCSV unless defined?(CSV)
+ else
+ require 'csv'
+ end
 
 $:.unshift(File.dirname(__FILE__))
 
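With the conditional require above, the rest of the code can target the Ruby 1.9 CSV API on both interpreters, since FasterCSV mirrors it on 1.8; a minimal sketch of the idea (the file name is illustrative):

  # CSV is FasterCSV on Ruby 1.8 and the stdlib CSV on 1.9, so parser code
  # can use a single API regardless of interpreter.
  CSV.foreach('people.csv', :col_sep => ',') do |fields|
    puts fields.inspect
  end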
data/lib/etl/batch/directives.rb CHANGED
@@ -37,7 +37,17 @@ module ETL #:nodoc:
 protected
 # Execute the process
 def do_execute
+ current_batch = ETL::Engine.batch
 batch.engine.process(file)
+
+ job = ETL::Engine.batch
+ if (job.kind_of? ETL::Execution::Batch and
+ current_batch[:id] != job[:id])
+ job[:batch_id] = current_batch[:id]
+ job.save!
+ end
+
+ ETL::Engine.batch = current_batch
 end
 end
 
@@ -52,4 +62,4 @@ module ETL #:nodoc:
 end
 end
 end
- end
+ end
data/lib/etl/control/control.rb CHANGED
@@ -66,8 +66,8 @@ module ETL #:nodoc:
 sources << source_class.new(self, configuration, definition)
 break
 end
- raise ControlError, "A source was specified but no matching type was found"
 end
+ raise ControlError, "A source was specified but no matching type was found" if sources.empty?
 end
 end
 
@@ -100,8 +100,8 @@ module ETL #:nodoc:
 destinations << dest_class.new(self, configuration, mapping)
 break
 end
- raise ControlError, "A destination was specified but no matching destination type was found"
 end
+ raise ControlError, "A destination was specified but no matching destination type was found" if destinations.empty?
 end
 end
 
data/lib/etl/control/destination.rb CHANGED
@@ -129,11 +129,23 @@ module ETL #:nodoc:
 # missing, uses all of the row's fields.
 def scd_fields(row)
 @scd_fields ||= configuration[:scd_fields] || row.keys
+ ETL::Engine.logger.debug "@scd_fields is: #{@scd_fields.inspect}"
+ @scd_fields
+ end
+
+ # returns the fields that are required to identify an SCD
+ def scd_required_fields
+ if scd?
+ [scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
+ else
+ []
+ end
 end
 
 def non_scd_fields(row)
- @non_csd_fields ||= row.keys - natural_key - scd_fields(row) -
- [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
+ @non_scd_fields ||= row.keys - natural_key - scd_fields(row) - [primary_key] - scd_required_fields
+ ETL::Engine.logger.debug "@non_scd_fields is: #{@non_scd_fields.inspect}"
+ @non_scd_fields
 end
 
 def non_evolving_fields
@@ -280,7 +292,8 @@ module ETL #:nodoc:
 values << row[nk]
 end
 statement = statement.join(" AND ")
- ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
+ x=ActiveRecord::Base.send(:sanitize_sql_array, [statement, *values])
+ return x
 end
 
 # Do all the steps required when a SCD *has* changed. Exact steps
@@ -353,10 +366,10 @@ module ETL #:nodoc:
 q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
 q << " AND #{scd_latest_version_field}" if scd_type == 2
 
- #puts "looking for original record"
+ ETL::Engine.logger.debug "looking for original record"
 result = connection.select_one(q)
 
- #puts "Result: #{result.inspect}"
+ ETL::Engine.logger.debug "Result: #{result.inspect}"
 
 result ? ETL::Row[result.symbolize_keys!] : nil
 end
@@ -364,7 +377,14 @@ module ETL #:nodoc:
 # Check whether non-scd fields have changed since the last
 # load of this record.
 def has_scd_field_changes?(row)
- scd_fields(row).any? { |csd_field| row[csd_field].to_s != @existing_row[csd_field].to_s }
+ scd_fields(row).any? { |csd_field|
+ ETL::Engine.logger.debug "Row: #{row.inspect}"
+ ETL::Engine.logger.debug "Existing Row: #{@existing_row.inspect}"
+ ETL::Engine.logger.debug "comparing: #{row[csd_field].to_s} != #{@existing_row[csd_field].to_s}"
+ x=row[csd_field].to_s != @existing_row[csd_field].to_s
+ ETL::Engine.logger.debug x
+ x
+ }
 end
 
 # Check whether non-scd fields have changed since the last
@@ -417,4 +437,4 @@ module ETL #:nodoc:
 end
 end
 
- Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
+ Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
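To make the new set arithmetic concrete, here is a small worked example in plain Ruby; the field names are hypothetical and only illustrate what scd_required_fields and non_scd_fields return:

  row_keys            = [:id, :customer_key, :name, :city, :loaded_at,
                         :effective_date, :end_date, :latest_version]
  natural_key         = [:customer_key]
  scd_fields          = [:name, :city]                                  # configuration[:scd_fields]
  primary_key         = :id
  scd_required_fields = [:effective_date, :end_date, :latest_version]   # returned when scd? is true

  non_scd_fields = row_keys - natural_key - scd_fields - [primary_key] - scd_required_fields
  # => [:loaded_at]  -- everything not involved in the SCD bookkeeping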
data/lib/etl/control/destination/database_destination.rb CHANGED
@@ -38,8 +38,10 @@ module ETL #:nodoc:
 @target = configuration[:target]
 @table = configuration[:table]
 @truncate = configuration[:truncate] ||= false
- @unique = configuration[:unique]
- @order = mapping[:order] || order_from_source
+ @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique]
+ @unique.uniq! unless @unique.nil?
+ @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
+ @order.uniq! unless @order.nil?
 raise ControlError, "Order required in mapping" unless @order
 raise ControlError, "Table required" unless @table
 raise ControlError, "Target required" unless @target
@@ -59,10 +61,10 @@ module ETL #:nodoc:
 names = []
 values = []
 order.each do |name|
- names << "`#{name}`"
- values << conn.quote(row[name]) # TODO: this is probably not database agnostic
+ names << conn.quote_column_name(name)
+ values << conn.quote(row[name])
 end
- q = "INSERT INTO `#{table_name}` (#{names.join(',')}) VALUES (#{values.join(',')})"
+ q = "INSERT INTO #{conn.quote_table_name(table_name)} (#{names.join(',')}) VALUES (#{values.join(',')})"
 ETL::Engine.logger.debug("Executing insert: #{q}")
 conn.insert(q, "Insert row #{current_row}")
 @current_row += 1
@@ -92,4 +94,4 @@ module ETL #:nodoc:
 
 end
 end
- end
+ end
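Replacing the hard-coded backticks with the connection's own quoting methods is what makes the generated INSERT portable across adapters; a rough sketch of the difference (table and row values are illustrative, and a database connection is assumed to exist):

  require 'active_record'

  conn = ActiveRecord::Base.connection   # assumes a connection has been established
  # MySQL:      conn.quote_column_name("name")  # => `name`
  # PostgreSQL: conn.quote_column_name("name")  # => "name"
  names  = [:id, :name].map { |c| conn.quote_column_name(c) }
  values = [1, "O'Brien"].map { |v| conn.quote(v) }   # quote also escapes the apostrophe
  sql = "INSERT INTO #{conn.quote_table_name('people')} (#{names.join(',')}) VALUES (#{values.join(',')})"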
data/lib/etl/control/destination/excel_destination.rb ADDED
@@ -0,0 +1,91 @@
+ require 'spreadsheet'
+
+ module ETL
+ module Control
+ # Excel as the final destination.
+ class ExcelDestination < Destination
+ # The File to write to
+ attr_reader :file
+
+ # The output order
+ attr_reader :order
+
+ # Flag which indicates to append (default is to overwrite)
+ attr_accessor :append
+
+ # Initialize the object.
+ # * <tt>control</tt>: The Control object
+ # * <tt>configuration</tt>: The configuration map
+ # * <tt>mapping</tt>: The output mapping
+ #
+ # Configuration options:
+ # * <tt>:file<tt>: The file to write to (REQUIRED)
+ # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
+ # * <tt>:unique</tt>: Set to true to only write unique records
+ # * <tt>:append_rows</tt>: Array of rows to append
+ #
+ # Mapping options:
+ # * <tt>:order</tt>: The order array
+ def initialize(control, configuration, mapping={})
+ super
+ path = Pathname.new(configuration[:file])
+ @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path
+ @append = configuration[:append] ||= false
+ @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique]
+ @unique.uniq! unless @unique.nil?
+ @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
+ @order.uniq! unless @order.nil?
+ raise ControlError, "Order required in mapping" unless @order
+ end
+
+ # Close the destination. This will flush the buffer and close the underlying stream or connection.
+ def close
+ buffer << append_rows if append_rows
+ flush
+ book.write(file)
+ end
+
+ # Flush the destination buffer
+ def flush
+ #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
+ buffer.flatten.each_with_index do |row, index|
+ #puts "row change type: #{row.change_type}"
+ # check to see if this row's compound key constraint already exists
+ # note that the compound key constraint may not utilize virtual fields
+ next unless row_allowed?(row)
+
+ # add any virtual fields
+ add_virtuals!(row)
+
+ # collect all of the values using the order designated in the configuration
+ values = order.collect do |name|
+ value = row[name]
+ case value
+ when Date, Time, DateTime
+ value.to_s(:db)
+ else
+ value.to_s
+ end
+ end
+
+ # write the values
+ sheet.insert_row(index, values)
+ end
+ buffer.clear
+ #puts "After flush there are #{buffer.length} rows"
+ end
+
+ private
+ # Get the open file excel
+ def book
+ @book ||= ( append ? Spreadsheet.open(file) : Spreadsheet::Workbook.new(file) )
+ end
+
+ private
+ # Get the open sheet
+ def sheet
+ @sheet ||= ( append ? book.worksheet(0) : book.create_worksheet() )
+ end
+ end
+ end
+ end
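Behind the scenes the destination drives the spreadsheet gem; when not appending, the calls boil down to roughly the following sketch (file name and row contents are illustrative only):

  require 'spreadsheet'

  book  = Spreadsheet::Workbook.new
  sheet = book.create_worksheet
  sheet.insert_row(0, ['1', 'Bob', '2011-01-01'])  # one insert_row per buffered row, in #flush
  book.write('people.xls')                         # performed by #close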
data/lib/etl/control/destination/file_destination.rb CHANGED
@@ -40,14 +40,16 @@ module ETL #:nodoc:
 # * <tt>:order</tt>: The order array
 def initialize(control, configuration, mapping={})
 super
- @file = File.join(File.dirname(control.file), configuration[:file])
+ path = Pathname.new(configuration[:file])
+ @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path
 @append = configuration[:append] ||= false
 @separator = configuration[:separator] ||= ','
 @eol = configuration[:eol] ||= "\n"
 @enclose = configuration[:enclose]
- @unique = configuration[:unique]
-
- @order = mapping[:order] || order_from_source
+ @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique]
+ @unique.uniq! unless @unique.nil?
+ @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
+ @order.uniq! unless @order.nil?
 raise ControlError, "Order required in mapping" unless @order
 end
 
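The Pathname-based resolution means an absolute :file is now honoured instead of being joined onto the control file's directory; a quick sketch of the behaviour (paths are illustrative):

  require 'pathname'

  control_dir = File.dirname(File.expand_path('/etl/people.ctl'))

  ['output/people.txt', '/var/exports/people.txt'].each do |configured|
    path = Pathname.new(configured)
    file = path.absolute? ? path : Pathname.new(control_dir) + path
    puts file   # => /etl/output/people.txt, then /var/exports/people.txt
  end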
data/lib/etl/control/destination/insert_update_database_destination.rb ADDED
@@ -0,0 +1,133 @@
+ module ETL #:nodoc:
+ module Control #:nodoc:
+ # Destination which writes directly to a database. This is useful when you are dealing with
+ # a small amount of data. For larger amounts of data you should probably use the bulk
+ # loader if it is supported with your target database as it will use a much faster load
+ # method.
+ class InsertUpdateDatabaseDestination < Destination
+ # The target connection
+ attr_reader :target
+
+ # The table
+ attr_reader :table
+
+ # Specify the order from the source
+ attr_reader :order
+
+ # Specify the primarykey from the source
+ attr_reader :primarykey
+
+ # Set to true to truncate the destination table first
+ attr_reader :truncate
+
+ # Initialize the database destination
+ #
+ # * <tt>control</tt>: The ETL::Control::Control instance
+ # * <tt>configuration</tt>: The configuration Hash
+ # * <tt>mapping</tt>: The mapping
+ #
+ # Configuration options:
+ # * <tt>:database</tt>: The database name (REQUIRED)
+ # * <tt>:target</tt>: The target connection (REQUIRED)
+ # * <tt>:table</tt>: The table to write to (REQUIRED)
+ # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
+ # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
+ # * <tt>:append_rows</tt>: Array of rows to append
+ #
+ # Mapping options:
+ # * <tt>:order</tt>: The order of fields to write (REQUIRED)
+ # * <tt>:primarykey</tt>: The primary key of fields to select insert or update (REQUIRED)
+ def initialize(control, configuration, mapping={})
+ super
+ @target = configuration[:target]
+ @table = configuration[:table]
+ @truncate = configuration[:truncate] ||= false
+ @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique]
+ @unique.uniq! unless @unique.nil?
+ @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
+ @order.uniq! unless @order.nil?
+ @primarykey = mapping[:primarykey] ? mapping[:primarykey] + scd_required_fields : nil
+ @primarykey.uniq! unless @primarykey.nil?
+ raise ControlError, "Primarykey required in mapping" unless @primarykey
+ raise ControlError, "Order required in mapping" unless @order
+ raise ControlError, "Table required" unless @table
+ raise ControlError, "Target required" unless @target
+ end
+
+ # Flush the currently buffered data
+ def flush
+ conn.transaction do
+ buffer.flatten.each do |row|
+ # check to see if this row's compound key constraint already exists
+ # note that the compound key constraint may not utilize virtual fields
+ next unless row_allowed?(row)
+
+ # add any virtual fields
+ add_virtuals!(row)
+
+ primarykeyfilter = []
+ primarykey.each do |name|
+ primarykeyfilter << "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}"
+ end
+ q = "SELECT * FROM #{conn.quote_table_name(table_name)} WHERE #{primarykeyfilter.join(' AND ')}"
+ ETL::Engine.logger.debug("Executing select: #{q}")
+ res = conn.execute(q, "Select row #{current_row}")
+ none = true
+
+ case conn
+ when ActiveRecord::ConnectionAdapters::PostgreSQLAdapter;
+ res.each { none = false }
+ when ActiveRecord::ConnectionAdapters::MysqlAdapter;
+ res.each_hash { none = false }
+ else raise "Unsupported adapter #{conn.class} for this destination"
+ end
+
+ if none
+ names = []
+ values = []
+ order.each do |name|
+ names << conn.quote_column_name(name)
+ values << conn.quote(row[name])
+ end
+ q = "INSERT INTO #{conn.quote_table_name(table_name)} (#{names.join(',')}) VALUES (#{values.join(',')})"
+ ETL::Engine.logger.debug("Executing insert: #{q}")
+ conn.insert(q, "Insert row #{current_row}")
+ else
+ updatevalues = []
+ order.each do |name|
+ updatevalues << "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}"
+ end
+ q = "UPDATE #{conn.quote_table_name(table_name)} SET #{updatevalues.join(',')} WHERE #{primarykeyfilter.join(' AND ')}"
+ ETL::Engine.logger.debug("Executing update: #{q}")
+ conn.update(q, "Update row #{current_row}")
+ end
+ @current_row += 1
+ end
+ buffer.clear
+ end
+ end
+
+ # Close the connection
+ def close
+ buffer << append_rows if append_rows
+ flush
+ end
+
+ private
+ def conn
+ @conn ||= begin
+ conn = ETL::Engine.connection(target)
+ conn.truncate(table_name) if truncate
+ conn
+ rescue
+ raise RuntimeError, "Problem to connect to db"
+ end
+ end
+
+ def table_name
+ ETL::Engine.table(table, ETL::Engine.connection(target))
+ end
+
+ end
+ end
+ end
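For a buffered row such as {:id => 1, :name => 'Bob'} with :primarykey => [:id] and :order => [:id, :name], the flush above issues statements along these lines (the table name and MySQL-style quoting are illustrative only):

  select = "SELECT * FROM `people` WHERE `id` = 1"
  insert = "INSERT INTO `people` (`id`,`name`) VALUES (1,'Bob')"          # when the SELECT finds nothing
  update = "UPDATE `people` SET `id` = 1,`name` = 'Bob' WHERE `id` = 1"   # when a matching row exists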