activewarehouse-etl-sgonyea 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. data/.gitignore +9 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +236 -0
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +13 -0
  6. data/LICENSE +7 -0
  7. data/README.textile +111 -0
  8. data/Rakefile +103 -0
  9. data/TODO +28 -0
  10. data/active_support_logger.patch +78 -0
  11. data/activewarehouse-etl.gemspec +36 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/test-matrix.yml +10 -0
  111. data/test/.gitignore +1 -0
  112. data/test/.ignore +2 -0
  113. data/test/all.ebf +6 -0
  114. data/test/apache_combined_log.ctl +11 -0
  115. data/test/batch_test.rb +41 -0
  116. data/test/batch_with_error.ebf +6 -0
  117. data/test/batched1.ctl +0 -0
  118. data/test/batched2.ctl +0 -0
  119. data/test/block_processor.ctl +6 -0
  120. data/test/block_processor_error.ctl +1 -0
  121. data/test/block_processor_pre_post_process.ctl +4 -0
  122. data/test/block_processor_remove_rows.ctl +5 -0
  123. data/test/block_processor_test.rb +38 -0
  124. data/test/check_exist_processor_test.rb +92 -0
  125. data/test/check_unique_processor_test.rb +40 -0
  126. data/test/config/Gemfile.rails-2.3.x +3 -0
  127. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  128. data/test/config/Gemfile.rails-3.0.x +3 -0
  129. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  130. data/test/config/common.rb +29 -0
  131. data/test/connection/mysql/connection.rb +9 -0
  132. data/test/connection/mysql/schema.sql +37 -0
  133. data/test/connection/postgresql/connection.rb +13 -0
  134. data/test/connection/postgresql/schema.sql +40 -0
  135. data/test/control_test.rb +43 -0
  136. data/test/data/apache_combined_log.txt +3 -0
  137. data/test/data/bulk_import.txt +3 -0
  138. data/test/data/bulk_import_with_empties.txt +3 -0
  139. data/test/data/decode.txt +3 -0
  140. data/test/data/delimited.txt +3 -0
  141. data/test/data/encode_source_latin1.txt +2 -0
  142. data/test/data/excel.xls +0 -0
  143. data/test/data/excel2.xls +0 -0
  144. data/test/data/fixed_width.txt +3 -0
  145. data/test/data/multiple_delimited_1.txt +3 -0
  146. data/test/data/multiple_delimited_2.txt +3 -0
  147. data/test/data/nokogiri.xml +38 -0
  148. data/test/data/people.txt +3 -0
  149. data/test/data/sax.xml +14 -0
  150. data/test/data/xml.xml +16 -0
  151. data/test/database_join_processor_test.rb +43 -0
  152. data/test/date_dimension_builder_test.rb +96 -0
  153. data/test/delimited.ctl +30 -0
  154. data/test/delimited_absolute.ctl +31 -0
  155. data/test/delimited_destination_db.ctl +23 -0
  156. data/test/delimited_excel.ctl +31 -0
  157. data/test/delimited_insert_update.ctl +34 -0
  158. data/test/delimited_update.ctl +34 -0
  159. data/test/delimited_with_bulk_load.ctl +34 -0
  160. data/test/destination_test.rb +275 -0
  161. data/test/directive_test.rb +23 -0
  162. data/test/encode_processor_test.rb +32 -0
  163. data/test/engine_test.rb +78 -0
  164. data/test/ensure_fields_presence_processor_test.rb +28 -0
  165. data/test/errors.ctl +24 -0
  166. data/test/etl_test.rb +42 -0
  167. data/test/excel.ctl +24 -0
  168. data/test/excel2.ctl +25 -0
  169. data/test/fixed_width.ctl +35 -0
  170. data/test/foreign_key_lookup_transform_test.rb +50 -0
  171. data/test/generator_test.rb +14 -0
  172. data/test/inline_parser.ctl +17 -0
  173. data/test/mocks/mock_destination.rb +26 -0
  174. data/test/mocks/mock_source.rb +25 -0
  175. data/test/model_source.ctl +14 -0
  176. data/test/multiple_delimited.ctl +22 -0
  177. data/test/multiple_source_delimited.ctl +39 -0
  178. data/test/nokogiri_all.ctl +35 -0
  179. data/test/nokogiri_select.ctl +35 -0
  180. data/test/nokogiri_test.rb +35 -0
  181. data/test/parser_test.rb +224 -0
  182. data/test/performance/delimited.ctl +30 -0
  183. data/test/processor_test.rb +44 -0
  184. data/test/row_processor_test.rb +17 -0
  185. data/test/sax.ctl +26 -0
  186. data/test/scd/1.txt +1 -0
  187. data/test/scd/2.txt +1 -0
  188. data/test/scd/3.txt +1 -0
  189. data/test/scd_test.rb +257 -0
  190. data/test/scd_test_type_1.ctl +43 -0
  191. data/test/scd_test_type_2.ctl +34 -0
  192. data/test/screen_test.rb +9 -0
  193. data/test/screen_test_error.ctl +3 -0
  194. data/test/screen_test_fatal.ctl +3 -0
  195. data/test/source_test.rb +154 -0
  196. data/test/test_helper.rb +37 -0
  197. data/test/transform_test.rb +101 -0
  198. data/test/truncate_processor_test.rb +37 -0
  199. data/test/xml.ctl +31 -0
  200. metadata +370 -0
@@ -0,0 +1,438 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Base class for destinations.
4
+ class Destination
5
+ # Read-only accessor for the ETL::Control::Control instance
6
+ attr_reader :control
7
+
8
+ # Read-only accessor for the configuration Hash
9
+ attr_reader :configuration
10
+
11
+ # Read-only accessor for the destination mapping Hash
12
+ attr_reader :mapping
13
+
14
+ # Accessor to the buffer size
15
+ attr_accessor :buffer_size
16
+
17
+ # Unique flag.
18
+ attr_accessor :unique
19
+
20
+ # A condition for writing
21
+ attr_accessor :condition
22
+
23
+ # An array of rows to append to the destination
24
+ attr_accessor :append_rows
25
+
26
+ class << self
27
+ # Get the destination class for the specified name.
28
+ #
29
+ # For example if name is :database or 'database' then the
30
+ # DatabaseDestination class is returned
31
+ def class_for_name(name)
32
+ ETL::Control.const_get("#{name.to_s.camelize}Destination")
33
+ end
34
+ end
35
+
36
+ # Initialize the destination
37
+ #
38
+ # Arguments:
39
+ # * <tt>control</tt>: The ETL::Control::Control instance
40
+ # * <tt>configuration</tt>: The configuration Hash
41
+ # * <tt>mapping</tt>: The mapping Hash
42
+ #
43
+ # Options:
44
+ # * <tt>:buffer_size</tt>: The output buffer size (default 1000 records)
45
+ # * <tt>:condition</tt>: A conditional proc that must return true for the
46
+ # row to be written
47
+ # * <tt>:append_rows</tt>: An array of rows to append
48
+ def initialize(control, configuration, mapping)
49
+ @control = control
50
+ @configuration = configuration
51
+ @mapping = mapping
52
+ @buffer_size = configuration[:buffer_size] ||= 100
53
+ @condition = configuration[:condition]
54
+ @append_rows = configuration[:append_rows]
55
+ end
56
+
57
+ # Get the current row number
58
+ def current_row
59
+ @current_row ||= 1
60
+ end
61
+
62
+ # Write the given row
63
+ def write(row)
64
+ if @condition.nil? || @condition.call(row)
65
+ process_change(row)
66
+ end
67
+ flush if buffer.length >= buffer_size
68
+ end
69
+
70
+ # Abstract method
71
+ def flush
72
+ raise NotImplementedError, "flush method must be implemented by subclasses"
73
+ end
74
+
75
+ # Abstract method
76
+ def close
77
+ raise NotImplementedError, "close method must be implemented by subclasses"
78
+ end
79
+
80
+ def errors
81
+ @errors ||= []
82
+ end
83
+
84
+ protected
85
+ # Access the buffer
86
+ def buffer
87
+ @buffer ||= []
88
+ end
89
+
90
+ # Access the generators map
91
+ def generators
92
+ @generators ||= {}
93
+ end
94
+
95
+ # Get the order of elements from the source order
96
+ def order_from_source
97
+ order = []
98
+ control.sources.first.definition.each do |item|
99
+ case item
100
+ when Hash
101
+ order << item[:name]
102
+ else
103
+ order << item
104
+ end
105
+ end
106
+ order
107
+ end
108
+
109
+ # Return true if the row is allowed. The row will not be allowed if the
110
+ # :unique option is specified in the configuration and the compound key
111
+ # already exists
112
+ def row_allowed?(row)
113
+ if unique
114
+ key = (unique.collect { |k| row[k] }).join('|')
115
+ return false if compound_key_constraints[key]
116
+ compound_key_constraints[key] = 1
117
+ end
118
+ return true
119
+ end
120
+
121
+ # Get a hash of compound key contraints. This is used to determine if a
122
+ # row can be written when the unique option is specified
123
+ def compound_key_constraints
124
+ @compound_key_constraints ||= {}
125
+ end
126
+
127
+ # Return fields which are Slowly Changing Dimension fields.
128
+ # Uses the scd_fields specified in the configuration. If that's
129
+ # missing, uses all of the row's fields.
130
+ def scd_fields(row)
131
+ @scd_fields ||= configuration[:scd_fields] || row.keys
132
+ ETL::Engine.logger.debug "@scd_fields is: #{@scd_fields.inspect}"
133
+ @scd_fields
134
+ end
135
+
136
+ # returns the fields that are required to identify an SCD
137
+ def scd_required_fields
138
+ if scd? and scd_type == 2
139
+ [scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
140
+ else
141
+ []
142
+ end
143
+ end
144
+
145
+ def non_scd_fields(row)
146
+ @non_scd_fields ||= row.keys - natural_key - scd_fields(row) - [primary_key] - scd_required_fields
147
+ ETL::Engine.logger.debug "@non_scd_fields is: #{@non_scd_fields.inspect}"
148
+ @non_scd_fields
149
+ end
150
+
151
+ def non_evolving_fields
152
+ (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
153
+ end
154
+
155
+ def scd?
156
+ !configuration[:scd].nil?
157
+ end
158
+
159
+ def scd_type
160
+ scd? ? configuration[:scd][:type] : nil
161
+ end
162
+
163
+ # Get the Slowly Changing Dimension effective date field. Defaults to
164
+ # 'effective_date'.
165
+ def scd_effective_date_field
166
+ configuration[:scd][:effective_date_field] || :effective_date if scd?
167
+ end
168
+
169
+ # Get the Slowly Changing Dimension end date field. Defaults to
170
+ # 'end_date'.
171
+ def scd_end_date_field
172
+ configuration[:scd][:end_date_field] || :end_date if scd?
173
+ end
174
+
175
+ # Get the Slowly Changing Dimension latest version field. Defaults to
176
+ # 'latest_version'.
177
+ def scd_latest_version_field
178
+ configuration[:scd][:latest_version_field] || :latest_version if scd?
179
+ end
180
+
181
+ # Return the natural key field names, defaults to []
182
+ def natural_key
183
+ @natural_key ||= determine_natural_key
184
+ end
185
+
186
+ # Get the dimension table if specified
187
+ def dimension_table
188
+ @dimension_table ||= if scd?
189
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
190
+ end
191
+ end
192
+
193
+ # Get the dimension target if specified
194
+ def dimension_target
195
+ @dimension_target ||= if scd?
196
+ configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
197
+ end
198
+ end
199
+
200
+ # Process a row to determine the change type
201
+ def process_change(row)
202
+ ETL::Engine.logger.debug "Processing row: #{row.inspect}"
203
+ return unless row
204
+
205
+ # Change processing can only occur if the natural key exists in the row
206
+ ETL::Engine.logger.debug "Checking for natural key existence"
207
+ unless has_natural_key?(row)
208
+ buffer << row
209
+ return
210
+ end
211
+
212
+ @timestamp = Time.now
213
+
214
+ # See if the scd_fields of the current record have changed
215
+ # from the last time this record was loaded into the data
216
+ # warehouse. If they match then throw away this row (no need
217
+ # to process). If they do not match then the record is an
218
+ # 'update'. If the record doesn't exist then it is an 'insert'
219
+ ETL::Engine.logger.debug "Checking record for SCD change"
220
+ if @existing_row = preexisting_row(row)
221
+ if has_scd_field_changes?(row)
222
+ process_scd_change(row)
223
+ else
224
+ process_scd_match(row)
225
+ end
226
+ else
227
+ schedule_new_record(row)
228
+ end
229
+ end
230
+
231
+ # Add any virtual fields to the row. Virtual rows will get their value
232
+ # from one of the following:
233
+ # * If the mapping is a Class, then an object which implements the next
234
+ # method
235
+ # * If the mapping is a Symbol, then the XGenerator where X is the
236
+ # classified symbol
237
+ # * If the mapping is a Proc, then it will be called with the row
238
+ # * Otherwise the value itself will be assigned to the field
239
+ def add_virtuals!(row)
240
+ if mapping[:virtual]
241
+ mapping[:virtual].each do |key,value|
242
+ # If the row already has the virtual set, assume that's correct
243
+ next if row[key]
244
+ # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
245
+ case value
246
+ when Class
247
+ generator = generators[key] ||= value.new
248
+ row[key] = generator.next
249
+ when Symbol
250
+ generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
251
+ row[key] = generator.next
252
+ when Proc, Method
253
+ row[key] = value.call(row)
254
+ else
255
+ if value.is_a?(ETL::Generator::Generator)
256
+ row[key] = value.next
257
+ else
258
+ row[key] = value
259
+ end
260
+ end
261
+ end
262
+ end
263
+ end
264
+
265
+ private
266
+
267
+ # Determine the natural key. This method will always return an array
268
+ # of symbols. The default value is [].
269
+ def determine_natural_key
270
+ Array(configuration[:natural_key]).collect(&:to_sym)
271
+ end
272
+
273
+ # Check whether a natural key has been defined, and if so, whether
274
+ # this row has enough information to do searches based on that natural
275
+ # key.
276
+ #
277
+ # TODO: This should be factored out into
278
+ # ETL::Row#has_all_fields?(field_array) But that's not possible
279
+ # until *all* sources cast to ETL::Row, instead of sometimes
280
+ # using Hash
281
+ def has_natural_key?(row)
282
+ natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
283
+ end
284
+
285
+ # Helper for generating the SQL where clause that allows searching
286
+ # by a natural key
287
+ def natural_key_equality_for_row(row)
288
+ statement = []
289
+ values = []
290
+ natural_key.each do |nk|
291
+ statement << "#{nk} = #{ActiveRecord::Base.send(:quote_bound_value, row[nk], connection)}"
292
+ end
293
+ statement = statement.join(" AND ")
294
+ return statement
295
+ end
296
+
297
+ # Do all the steps required when a SCD *has* changed. Exact steps
298
+ # depend on what type of SCD we're handling.
299
+ def process_scd_change(row)
300
+ ETL::Engine.logger.debug "SCD fields do not match"
301
+
302
+ if scd_type == 2
303
+ # SCD Type 2: new row should be added and old row should be updated
304
+ ETL::Engine.logger.debug "type 2 SCD"
305
+
306
+ # To update the old row, we delete the version in the database
307
+ # and insert a new expired version
308
+
309
+ # If there is no truncate then the row will exist twice in the database
310
+ delete_outdated_record
311
+
312
+ ETL::Engine.logger.debug "expiring original record"
313
+ @existing_row[scd_end_date_field] = @timestamp
314
+ @existing_row[scd_latest_version_field] = false
315
+
316
+ buffer << @existing_row
317
+
318
+ elsif scd_type == 1
319
+ # SCD Type 1: only the new row should be added
320
+ ETL::Engine.logger.debug "type 1 SCD"
321
+
322
+ # Copy primary key, and other non-evolving fields over from
323
+ # original version of record
324
+ non_evolving_fields.each do |non_evolving_field|
325
+ row[non_evolving_field] = @existing_row[non_evolving_field]
326
+ end
327
+
328
+ # If there is no truncate then the row will exist twice in the database
329
+ delete_outdated_record
330
+ else
331
+ # SCD Type 3: not supported
332
+ ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
333
+ end
334
+
335
+ # In all cases, the latest, greatest version of the record
336
+ # should go into the load
337
+ schedule_new_record(row)
338
+ end
339
+
340
+ # Do all the steps required when a SCD has *not* changed. Exact
341
+ # steps depend on what type of SCD we're handling.
342
+ def process_scd_match(row)
343
+ ETL::Engine.logger.debug "SCD fields match"
344
+
345
+ if scd_type == 2 && has_non_scd_field_changes?(row)
346
+ ETL::Engine.logger.debug "Non-SCD field changes"
347
+ # Copy important data over from original version of record
348
+ row[primary_key] = @existing_row[primary_key]
349
+ row[scd_end_date_field] = @existing_row[scd_end_date_field]
350
+ row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
351
+ row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
352
+
353
+ # If there is no truncate then the row will exist twice in the database
354
+ delete_outdated_record
355
+
356
+ buffer << row
357
+ else
358
+ # The record is totally the same, so skip it
359
+ end
360
+ end
361
+
362
+ # Find the version of this row that already exists in the datawarehouse.
363
+ def preexisting_row(row)
364
+ q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
365
+ q << " AND #{scd_latest_version_field}" if scd_type == 2
366
+
367
+ ETL::Engine.logger.debug "looking for original record"
368
+ result = connection.select_one(q)
369
+
370
+ ETL::Engine.logger.debug "Result: #{result.inspect}"
371
+
372
+ result ? ETL::Row[result.symbolize_keys!] : nil
373
+ end
374
+
375
+ # Check whether non-scd fields have changed since the last
376
+ # load of this record.
377
+ def has_scd_field_changes?(row)
378
+ scd_fields(row).any? { |csd_field|
379
+ ETL::Engine.logger.debug "Row: #{row.inspect}"
380
+ ETL::Engine.logger.debug "Existing Row: #{@existing_row.inspect}"
381
+ ETL::Engine.logger.debug "comparing: #{row[csd_field].to_s} != #{@existing_row[csd_field].to_s}"
382
+ x=row[csd_field].to_s != @existing_row[csd_field].to_s
383
+ ETL::Engine.logger.debug x
384
+ x
385
+ }
386
+ end
387
+
388
+ # Check whether non-scd fields have changed since the last
389
+ # load of this record.
390
+ def has_non_scd_field_changes?(row)
391
+ non_scd_fields(row).any? { |non_csd_field| row[non_csd_field].to_s != @existing_row[non_csd_field].to_s }
392
+ end
393
+
394
+ # Grab, or re-use, a database connection for running queries directly
395
+ # during the destination processing.
396
+ def connection
397
+ @conn ||= ETL::Engine.connection(dimension_target)
398
+ end
399
+
400
+ # Utility for removing a row that has outdated information. Note
401
+ # that this deletes directly from the database, even if this is a file
402
+ # destination. It needs to do this because you can't do deletes in a
403
+ # bulk load.
404
+ def delete_outdated_record
405
+ ETL::Engine.logger.debug "deleting old row"
406
+
407
+ q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
408
+ connection.delete(q)
409
+ end
410
+
411
+ # Schedule the latest, greatest version of the row for insertion
412
+ # into the database
413
+ def schedule_new_record(row)
414
+ ETL::Engine.logger.debug "writing new record"
415
+ if scd_type == 2
416
+ row[scd_effective_date_field] = @timestamp
417
+ row[scd_end_date_field] = '9999-12-31 00:00:00'
418
+ row[scd_latest_version_field] = true
419
+ end
420
+ buffer << row
421
+ end
422
+
423
+ # Get the name of the primary key for this table. Asks the dimension
424
+ # model class for this information, but if that class hasn't been
425
+ # defined, just defaults to :id.
426
+ def primary_key
427
+ return @primary_key if @primary_key
428
+ @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
429
+ rescue NameError => e
430
+ ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
431
+ @primary_key = :id
432
+ end
433
+
434
+ end
435
+ end
436
+ end
437
+
438
+ Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
@@ -0,0 +1,113 @@
1
+ # This source file contains the ETL::Control::CsvDestination
2
+
3
+ module ETL #:nodoc:
4
+ module Control #:nodoc:
5
+ # CSV File as the final destination.
6
+ class CsvDestination < Destination
7
+ # The File to write to
8
+ attr_reader :file
9
+
10
+ # The output order
11
+ attr_reader :order
12
+
13
+ # Flag which indicates to append (default is to overwrite)
14
+ attr_accessor :append
15
+
16
+ # The separator
17
+ attr_accessor :separator
18
+
19
+ # The end of line marker
20
+ attr_accessor :eol
21
+
22
+ # The enclosure character
23
+ attr_accessor :enclose
24
+
25
+ # Initialize the object.
26
+ # * <tt>control</tt>: The Control object
27
+ # * <tt>configuration</tt>: The configuration map
28
+ # * <tt>mapping</tt>: The output mapping
29
+ #
30
+ # Configuration options:
31
+ # * <tt>:file<tt>: The file to write to (REQUIRED)
32
+ # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
33
+ # * <tt>:separator</tt>: Record separator (default is a comma)
34
+ # * <tt>:eol</tt>: End of line marker (default is \n)
35
+ # * <tt>:enclose</tt>: Set to true of false
36
+ # * <tt>:unique</tt>: Set to true to only write unique records
37
+ # * <tt>:append_rows</tt>: Array of rows to append
38
+ #
39
+ # Mapping options:
40
+ # * <tt>:order</tt>: The order array
41
+ def initialize(control, configuration, mapping={})
42
+ super
43
+ path = Pathname.new(configuration[:file])
44
+ @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path
45
+ @append = configuration[:append] ||= false
46
+ @separator = configuration[:separator] ||= ','
47
+ @eol = configuration[:eol] ||= "\n"
48
+ @enclose = true & configuration[:enclose]
49
+ @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique]
50
+ @unique.uniq! unless @unique.nil?
51
+ @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
52
+ @order.uniq! unless @order.nil?
53
+ raise ControlError, "Order required in mapping" unless @order
54
+ end
55
+
56
+ # Close the destination. This will flush the buffer and close the underlying stream or connection.
57
+ def close
58
+ buffer << append_rows if append_rows
59
+ flush
60
+ f.close
61
+ end
62
+
63
+ # Flush the destination buffer
64
+ def flush
65
+ #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
66
+ buffer.flatten.each do |row|
67
+ #puts "row change type: #{row.change_type}"
68
+ # check to see if this row's compound key constraint already exists
69
+ # note that the compound key constraint may not utilize virtual fields
70
+ next unless row_allowed?(row)
71
+
72
+ # add any virtual fields
73
+ add_virtuals!(row)
74
+
75
+ # collect all of the values using the order designated in the configuration
76
+ values = order.collect do |name|
77
+ value = row[name]
78
+ case value
79
+ when Date, Time, DateTime
80
+ value.to_s(:db)
81
+ else
82
+ value.to_s
83
+ end
84
+ end
85
+
86
+ f << values
87
+ end
88
+ f.flush
89
+ buffer.clear
90
+ #puts "After flush there are #{buffer.length} rows"
91
+ end
92
+
93
+ private
94
+ # Get the open file stream
95
+ def f
96
+ @f ||= FasterCSV.open(file, mode, options)
97
+ end
98
+
99
+ def options
100
+ @options ||= {
101
+ :col_sep => separator,
102
+ :row_sep => eol,
103
+ :force_quotes => enclose
104
+ }
105
+ end
106
+
107
+ # Get the appropriate mode to open the file stream
108
+ def mode
109
+ append ? 'a' : 'w'
110
+ end
111
+ end
112
+ end
113
+ end