activewarehouse-etl-sgonyea 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data/.gitignore +9 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +236 -0
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +13 -0
  6. data/LICENSE +7 -0
  7. data/README.textile +111 -0
  8. data/Rakefile +103 -0
  9. data/TODO +28 -0
  10. data/active_support_logger.patch +78 -0
  11. data/activewarehouse-etl.gemspec +36 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/test-matrix.yml +10 -0
  111. data/test/.gitignore +1 -0
  112. data/test/.ignore +2 -0
  113. data/test/all.ebf +6 -0
  114. data/test/apache_combined_log.ctl +11 -0
  115. data/test/batch_test.rb +41 -0
  116. data/test/batch_with_error.ebf +6 -0
  117. data/test/batched1.ctl +0 -0
  118. data/test/batched2.ctl +0 -0
  119. data/test/block_processor.ctl +6 -0
  120. data/test/block_processor_error.ctl +1 -0
  121. data/test/block_processor_pre_post_process.ctl +4 -0
  122. data/test/block_processor_remove_rows.ctl +5 -0
  123. data/test/block_processor_test.rb +38 -0
  124. data/test/check_exist_processor_test.rb +92 -0
  125. data/test/check_unique_processor_test.rb +40 -0
  126. data/test/config/Gemfile.rails-2.3.x +3 -0
  127. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  128. data/test/config/Gemfile.rails-3.0.x +3 -0
  129. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  130. data/test/config/common.rb +29 -0
  131. data/test/connection/mysql/connection.rb +9 -0
  132. data/test/connection/mysql/schema.sql +37 -0
  133. data/test/connection/postgresql/connection.rb +13 -0
  134. data/test/connection/postgresql/schema.sql +40 -0
  135. data/test/control_test.rb +43 -0
  136. data/test/data/apache_combined_log.txt +3 -0
  137. data/test/data/bulk_import.txt +3 -0
  138. data/test/data/bulk_import_with_empties.txt +3 -0
  139. data/test/data/decode.txt +3 -0
  140. data/test/data/delimited.txt +3 -0
  141. data/test/data/encode_source_latin1.txt +2 -0
  142. data/test/data/excel.xls +0 -0
  143. data/test/data/excel2.xls +0 -0
  144. data/test/data/fixed_width.txt +3 -0
  145. data/test/data/multiple_delimited_1.txt +3 -0
  146. data/test/data/multiple_delimited_2.txt +3 -0
  147. data/test/data/nokogiri.xml +38 -0
  148. data/test/data/people.txt +3 -0
  149. data/test/data/sax.xml +14 -0
  150. data/test/data/xml.xml +16 -0
  151. data/test/database_join_processor_test.rb +43 -0
  152. data/test/date_dimension_builder_test.rb +96 -0
  153. data/test/delimited.ctl +30 -0
  154. data/test/delimited_absolute.ctl +31 -0
  155. data/test/delimited_destination_db.ctl +23 -0
  156. data/test/delimited_excel.ctl +31 -0
  157. data/test/delimited_insert_update.ctl +34 -0
  158. data/test/delimited_update.ctl +34 -0
  159. data/test/delimited_with_bulk_load.ctl +34 -0
  160. data/test/destination_test.rb +275 -0
  161. data/test/directive_test.rb +23 -0
  162. data/test/encode_processor_test.rb +32 -0
  163. data/test/engine_test.rb +78 -0
  164. data/test/ensure_fields_presence_processor_test.rb +28 -0
  165. data/test/errors.ctl +24 -0
  166. data/test/etl_test.rb +42 -0
  167. data/test/excel.ctl +24 -0
  168. data/test/excel2.ctl +25 -0
  169. data/test/fixed_width.ctl +35 -0
  170. data/test/foreign_key_lookup_transform_test.rb +50 -0
  171. data/test/generator_test.rb +14 -0
  172. data/test/inline_parser.ctl +17 -0
  173. data/test/mocks/mock_destination.rb +26 -0
  174. data/test/mocks/mock_source.rb +25 -0
  175. data/test/model_source.ctl +14 -0
  176. data/test/multiple_delimited.ctl +22 -0
  177. data/test/multiple_source_delimited.ctl +39 -0
  178. data/test/nokogiri_all.ctl +35 -0
  179. data/test/nokogiri_select.ctl +35 -0
  180. data/test/nokogiri_test.rb +35 -0
  181. data/test/parser_test.rb +224 -0
  182. data/test/performance/delimited.ctl +30 -0
  183. data/test/processor_test.rb +44 -0
  184. data/test/row_processor_test.rb +17 -0
  185. data/test/sax.ctl +26 -0
  186. data/test/scd/1.txt +1 -0
  187. data/test/scd/2.txt +1 -0
  188. data/test/scd/3.txt +1 -0
  189. data/test/scd_test.rb +257 -0
  190. data/test/scd_test_type_1.ctl +43 -0
  191. data/test/scd_test_type_2.ctl +34 -0
  192. data/test/screen_test.rb +9 -0
  193. data/test/screen_test_error.ctl +3 -0
  194. data/test/screen_test_fatal.ctl +3 -0
  195. data/test/source_test.rb +154 -0
  196. data/test/test_helper.rb +37 -0
  197. data/test/transform_test.rb +101 -0
  198. data/test/truncate_processor_test.rb +37 -0
  199. data/test/xml.ctl +31 -0
  200. metadata +370 -0
@@ -0,0 +1,109 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Destination which updates existing rows in a database table by issuing one UPDATE
4
+ # statement per buffered row. This is useful when you are dealing with a small amount
5
+ # of data. For larger amounts of data you should probably use the bulk loader if it is
6
+ # supported with your target database as it will use a much faster load method.
7
+ class UpdateDatabaseDestination < Destination
8
+ # The target connection
9
+ attr_reader :target
10
+
11
+ # The table
12
+ attr_reader :table
13
+
14
+ # Specify the order from the source
15
+ attr_reader :order
16
+
17
+ # Specify the conditions from the source
18
+ attr_reader :conditions
19
+
20
# Initialize the update destination.
#
# * <tt>control</tt>: The ETL::Control::Control instance
# * <tt>configuration</tt>: The configuration Hash
# * <tt>mapping</tt>: The mapping Hash
#
# Configuration options:
# * <tt>:target</tt>: The target connection (REQUIRED)
# * <tt>:table</tt>: The table to update (REQUIRED)
# * <tt>:unique</tt>: Array of field names; the SCD effective date field is
#   appended to it. Omit it (or pass nil/false) to leave it unset.
# * <tt>:database</tt>, <tt>:append_rows</tt>: not read by this method;
#   presumably handled by the Destination superclass -- TODO confirm.
#
# Mapping options:
# * <tt>:order</tt>: The fields to write; the SCD required fields are
#   appended. Falls back to <tt>order_from_source</tt> when omitted.
# * <tt>:conditions</tt>: The conditions selecting the rows to update
#   (REQUIRED); the SCD required fields are appended.
#
# Raises ControlError when conditions, order, table or target is missing.
def initialize(control, configuration, mapping={})
  super

  @target = configuration[:target]
  @table = configuration[:table]

  # Truthiness checks (rather than nil? checks) keep an explicit
  # :unique => false from reaching false.uniq!.
  unique_fields = configuration[:unique]
  @unique = unique_fields ? unique_fields + [scd_effective_date_field] : unique_fields
  @unique.uniq! if @unique

  ordered_fields = mapping[:order]
  @order = ordered_fields ? ordered_fields + scd_required_fields : order_from_source
  @order.uniq! if @order

  update_conditions = mapping[:conditions]
  @conditions = update_conditions ? update_conditions + scd_required_fields : nil
  @conditions.uniq! if @conditions

  raise ControlError, "Conditions required in mapping" unless @conditions
  raise ControlError, "Order required in mapping" unless @order
  raise ControlError, "Table required" unless @table
  raise ControlError, "Target required" unless @target
end
51
+
52
# Flush the currently buffered data: inside a single transaction, issue one
# UPDATE statement for every buffered row that passes row_allowed?, then
# empty the buffer.
#
# Each condition Hash (<tt>:field</tt>, <tt>:comp</tt>, <tt>:value</tt>) is
# rendered to text and then evaluated as a double-quoted Ruby string, so a
# value such as '#{row[:id]}' is interpolated against the current row. When
# the text cannot be evaluated it is used verbatim.
#
# SECURITY: this relies on +eval+. Conditions must only come from trusted
# control files, never from external input.
def flush
  conn.transaction do
    buffer.flatten.each do |row|
      # check to see if this row's compound key constraint already exists
      # note that the compound key constraint may not utilize virtual fields
      next unless row_allowed?(row)

      # add any virtual fields
      add_virtuals!(row)

      # The locals +row+, +cond+ and +c+ keep their names on purpose: the
      # eval'd condition text is evaluated in this scope and may use them.
      conditionsfilter = conditions.map do |cond|
        c = " #{cond[:field]} #{cond[:comp]} #{cond[:value]} "
        begin
          eval('"' + c + '"')
        rescue StandardError, SyntaxError
          # SyntaxError is not a StandardError, so it must be listed
          # explicitly; it is raised e.g. when the value contains a double
          # quote. Fall back to the raw text in either case.
          c
        end
      end

      updatevalues = order.map do |name|
        "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}"
      end

      q = "UPDATE #{conn.quote_table_name(table_name)} SET #{updatevalues.join(',')} WHERE #{conditionsfilter.join(' AND ')}"
      ETL::Engine.logger.debug("Executing update: #{q}")
      conn.update(q, "Update row #{current_row}")
      @current_row += 1
    end
    buffer.clear
  end
end
86
+
87
# Finish writing: queue the configured append rows (if any) and flush the
# buffer. Despite the name, no connection is closed here.
def close
  extra_rows = append_rows
  buffer << extra_rows if extra_rows
  flush
end
92
+
93
+ private
94
# Get the connection for the target, looked up through ETL::Engine on first
# use and memoized afterwards. Any StandardError raised by the lookup is
# replaced by a RuntimeError.
def conn
  return @conn if @conn
  @conn = ETL::Engine.connection(target)
rescue
  raise RuntimeError, "Problem to connect to db"
end
102
+
103
# Resolve the configured table through ETL::Engine.table, passing the
# connection for the target.
def table_name
  target_connection = ETL::Engine.connection(target)
  ETL::Engine.table(table, target_connection)
end
106
+
107
+ end
108
+ end
109
+ end
@@ -0,0 +1,74 @@
1
+ require 'yaml'
2
+
3
+ module ETL #:nodoc:
4
+ module Control #:nodoc:
5
+ class YamlDestination < Destination
6
+ attr_reader :file, :append, :only, :except
7
# Initialize the object.
# * <tt>control</tt>: The Control object
# * <tt>configuration</tt>: The configuration map
# * <tt>mapping</tt>: The output mapping
#
# Configuration options:
# * <tt>:file</tt>: The file to write to (REQUIRED); it is resolved relative
#   to the directory of the control file
# * <tt>:append</tt>: Set to true to append to the file (default is to
#   overwrite). The default is also written back into the configuration.
# * <tt>:only</tt>: Keys to include in the output (all other keys are skipped)
# * <tt>:except</tt>: Keys to leave out of the output
#
# Raises ControlError when both :only and :except are given.
def initialize(control, configuration, mapping={})
  super
  control_dir = File.dirname(control.file)
  @file = File.join(control_dir, configuration[:file])
  @append = (configuration[:append] ||= false)
  @only, @except = configuration[:only], configuration[:except]
  if @only && @except
    raise ControlError, "the :only and :except options must be used seperately, do not specify both"
  end
end
25
+
26
# Close the destination: flush whatever is still buffered, then close the
# underlying file stream.
def close
  flush
  stream = f
  stream.close
end
31
+
32
# Flush the destination buffer: every allowed row is written to the file as
# its own YAML document, then the stream is flushed and the buffer emptied.
# Keys are filtered through the :only / :except options, and Date, Time and
# DateTime values are written in their :db string format.
def flush
  buffer.flatten.each do |row|
    # check to see if this row's compound key constraint already exists
    # note that the compound key constraint may not utilize virtual fields
    next unless row_allowed?(row)

    # add any virtual fields
    add_virtuals!(row)

    document = row.each_with_object({}) do |(key, value), doc|
      next if only && !only.include?(key)
      next if except && except.include?(key)

      doc[key] = case value
                 when Date, Time, DateTime then value.to_s(:db)
                 else value
                 end
    end

    # write the values
    YAML.dump(document, f)
  end
  f.flush
  buffer.clear
end
61
+
62
+ private
63
# Get the file stream, opening the file with the configured mode on first
# use and reusing the same stream afterwards.
def f
  @f = File.open(file, mode) unless @f
  @f
end
67
+
68
# Get the mode used to open the file stream: 'a' when appending,
# 'w' (overwrite) otherwise.
def mode
  if append
    'a'
  else
    'w'
  end
end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,132 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
# ETL source. The class mixes in Enumerable, so subclasses must implement the
# <tt>each</tt> method; they must also override <tt>local_directory</tt>.
class Source
  include Enumerable

  # The control object
  attr_accessor :control

  # The configuration Hash
  attr_accessor :configuration

  # The definition Hash
  attr_accessor :definition

  # True if the source data should be stored locally for archival
  # (defaults to true, see #initialize)
  attr_accessor :store_locally

  # Set the base directory where local files are stored. Only the writer is
  # generated here; the reader is defined explicitly further down so that it
  # can supply a default without redefining a generated method.
  attr_writer :local_base

  class << self
    # Convert the name to a Source class.
    #
    # For example if name is :database then this will return the
    # DatabaseSource class.
    def class_for_name(name)
      ETL::Control.const_get("#{name.to_s.camelize}Source")
    end
  end

  # Initialize the Source instance
  # * <tt>control</tt>: The control object
  # * <tt>configuration</tt>: The configuration hash
  # * <tt>definition</tt>: The source layout definition
  #
  # Configuration options:
  # * <tt>:store_locally</tt>: Set to false to not store source data
  #   locally (defaults to true)
  def initialize(control, configuration, definition)
    @control = control
    @configuration = configuration
    @definition = definition

    # Only a missing (nil) option falls back to true; an explicit false is kept.
    keep_copy = configuration[:store_locally]
    @store_locally = keep_copy.nil? ? true : keep_copy
  end

  # Get the array of errors that occur during reading from the source
  def errors
    @errors ||= []
  end

  # Get the timestamp value provided by the Engine
  def timestamp
    Engine.timestamp
  end

  # Get the local base, defaults to 'source_data'
  def local_base
    @local_base ||= 'source_data'
  end

  # The local directory for storing. This method must be overridden by
  # subclasses; the base implementation always raises.
  def local_directory
    raise "local_directory method is abstract"
  end

  # Return the local file for storing the raw source data, creating the
  # local directory if needed. The name is built from the current timestamp,
  # so you cannot expect to call this multiple times and reference the same
  # file.
  #
  # Optional sequence can be specified if there are multiple source files;
  # it is appended to the timestamp.
  def local_file(sequence=nil)
    stem = sequence ? "#{timestamp}#{sequence}" : timestamp.to_s

    target_dir = local_directory
    FileUtils.mkdir_p(target_dir)
    File.join(target_dir, "#{stem}.csv")
  end

  # Get the last fully written local file, i.e. the path of the newest
  # trigger file without its '.trig' extension
  def last_local_file
    File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
  end

  # Get the last local file trigger filename using timestamp in filenames.
  # Filename is in the format YYYYMMDDHHMMSS.csv.trig, but in the case of a
  # file source there is an unpadded sequence number before the file
  # extension. Names are compared as strings, so this may not return the
  # correct "last" file in that case (in particular when there are 10 or
  # more source files). However, at this point only the database source
  # calls the method, and it wouldn't make sense for a file source to use it
  # if multiple files are expected.
  #
  # Raises RuntimeError when no trigger file exists.
  def last_local_file_trigger
    trig_ext = '.csv.trig'

    # Basenames (extension snipped off) of all regular files that end in the
    # desired extension
    candidates = Dir.glob(File.join(local_directory, "*" + trig_ext)).
      select { |path| File.file?(path) }.
      map { |path| File.basename(path, trig_ext) }

    raise "Local cache trigger file not found" if candidates.empty?

    # The greatest name is the most recent timestamp
    File.join(local_directory, candidates.max + trig_ext)
  end

  # Get the local trigger file (a Pathname with '.trig' appended) that is
  # used to indicate that the file has been completely written
  def local_file_trigger(file)
    Pathname.new(file.to_s + '.trig')
  end

  # Return true if the source should read locally, as configured on the Engine
  def read_locally
    Engine.read_locally
  end

end
129
+ end
130
+ end
131
+
132
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,224 @@
1
+ require 'fileutils'
2
+
3
+ module ETL #:nodoc:
4
# Connection for database sources.
# NOTE(review): presumably exists so that database sources get their own
# ActiveRecord connection -- confirm against ETL::Engine.
class Source < ::ActiveRecord::Base #:nodoc:
end
7
+
8
+ module Control #:nodoc:
9
+ # Source object which extracts data from a database using ActiveRecord.
10
+ class DatabaseSource < Source
11
+ attr_accessor :target
12
+ attr_accessor :table
13
+
14
# Initialize the source.
#
# Arguments:
# * <tt>control</tt>: The ETL::Control::Control instance
# * <tt>configuration</tt>: The configuration Hash
# * <tt>definition</tt>: The source definition
#
# Configuration options read by this class:
# * <tt>:target</tt>: The target connection name; host and database are
#   looked up from the connection configuration stored under this name
# * <tt>:table</tt>: The source table name
# * <tt>:query</tt>: Optional complete SQL query; when given it is used
#   as-is and the options below are ignored for query building
# * <tt>:join</tt>: Optional join part for the query (ignored unless
#   specified)
# * <tt>:select</tt>: Optional select part for the query (defaults to
#   '*')
# * <tt>:conditions</tt>: Optional condition text for the WHERE clause
# * <tt>:group</tt>: Optional group by part for the query (ignored
#   unless specified)
# * <tt>:order</tt>: Optional order part for the query (ignored unless
#   specified)
# * <tt>:new_records_only</tt>: Specify the column to use when comparing
#   timestamps against the last successful ETL job execution for the
#   current control file.
# * <tt>:store_locally</tt>: Set to false to not store a copy of the
#   source data locally in a flat file (defaults to true)
def initialize(control, configuration, definition)
  super
  @target, @table, @query = configuration.values_at(:target, :table, :query)
end
46
+
47
# Get a String identifier for the source in the form host/database/table
def to_s
  [host, database, @table].join('/')
end
51
+
52
# Get the local directory to use: the local_base joined with the source
# identifier returned by to_s (db hostname, database name and table).
def local_directory
  base_dir = local_base
  File.join(base_dir, to_s)
end
57
+
58
# Get the :join configuration option (the join part of the query); nil when
# it is not configured
def join
  join_clause = configuration[:join]
  join_clause
end
62
+
63
# Get the :select configuration option (the select part of the query),
# falling back to '*' when it is not configured
def select
  select_list = configuration[:select]
  select_list ? select_list : '*'
end
67
+
68
# Get the :group configuration option (the group by part of the query); nil
# when it is not configured
def group
  group_clause = configuration[:group]
  group_clause
end
72
+
73
# Get the :order configuration option (the order part of the query); nil
# when it is not configured
def order
  order_clause = configuration[:order]
  order_clause
end
77
+
78
# Return the :new_records_only configuration option: the column used in the
# where clause to identify new rows; nil when it is not configured
def new_records_only
  timestamp_column = configuration[:new_records_only]
  timestamp_column
end
83
+
84
# Get the number of rows in the source.
#
# * <tt>use_cache</tt>: when true (the default) a previously computed count
#   is returned without counting again
#
# When data is stored or read locally the lines of the last local file are
# counted; otherwise a count query derived from the source query is run
# against the database.
def count(use_cache=true)
  return @count if @count && use_cache
  @count = if @store_locally || read_locally
    count_locally
  else
    connection.select_value(count_query(query))
  end
end

# Turn a SELECT statement into a count statement by replacing its select
# list with count(1). The FROM that ends the select list is the first one
# found outside of any parentheses, so subqueries in the select list, joins
# or conditions are left alone. Statements that do not start with SELECT, or
# in which no such FROM is found, are handled by the original greedy
# replacement.
# NOTE(review): parentheses inside SQL string literals are not recognised.
def count_query(sql)
  if sql =~ /\A\s*SELECT\b/i
    depth = 0
    sql.scan(/[()]|\bFROM\b/i) do |token|
      case token
      when '(' then depth += 1
      when ')' then depth -= 1
      else
        # $~ is the match for the current token, so begin(0) is its offset
        return "SELECT count(1) #{sql[$~.begin(0)..-1]}" if depth.zero?
      end
    end
  end
  sql.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM')
end
private :count_query
93
+
94
# Get the list of columns to read: the keys of the first row returned by
# the query. The result is memoized.
def columns
  return @columns if @columns

  rows = query_rows
  # weird default is required for writing to cache correctly
  @columns = rows.any? ? rows.first.keys : ['']
end
100
+
101
# Yields each row from the source as an ETL::Row.
#
# * If read_locally is set, rows are read from the last stored local file;
#   an error is raised if that file or its trigger file does not exist.
# * Otherwise, if the source stores data locally, the query result is first
#   written to a new local file and the rows are then read back from it.
# * Otherwise the rows are taken directly from the query result, with their
#   keys symbolized.
def each(&block)
  if read_locally
    # Read from the last stored source
    ETL::Engine.logger.debug "Reading from local cache"
    read_rows(last_local_file, &block)
  elsif @store_locally
    # Read from the original source through a fresh local copy
    file = local_file
    write_local(file)
    read_rows(file, &block)
  else
    # Read from the original source directly
    query_rows.each do |record|
      row = ETL::Row.new
      record.symbolize_keys.each_pair { |key, value| row[key] = value }
      row.source = self
      yield row
    end
  end
end
127
+
128
+ private
129
# Read rows from the local cache file and yield each one as an ETL::Row
# whose keys are the symbolized CSV headers.
#
# * <tt>file</tt>: path of the local CSV file (first line holds the headers)
#
# Raises RuntimeError when the file or its trigger file does not exist.
# Afterwards ETL::Engine.average_rows_per_second is updated from the
# Engine's rows_read and the elapsed time.
def read_rows(file)
  # File.exist? is used because File.exists? was removed in Ruby 3.2
  raise "Local cache file not found" unless File.exist?(file)
  raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

  t = Benchmark.realtime do
    # CSV.foreach closes the file when iteration ends
    CSV.foreach(file, :headers => true) do |row|
      result_row = ETL::Row.new
      result_row.source = self
      row.each do |header, field|
        result_row[header.to_sym] = field
      end
      yield result_row
    end
  end
  ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
end
146
+
147
# Count the lines of the last local file. Every physical line is counted.
# NOTE(review): write_local puts a header line first, so the result
# presumably includes that header -- confirm whether callers expect this.
def count_locally
  # File.foreach closes the file once all lines have been read
  File.foreach(last_local_file).count
end
152
+
153
# Write the query result to the local cache as CSV: a header line with the
# column names followed by one line per row. Once the data file is complete
# an empty trigger file is created next to it, and the elapsed time is
# logged.
#
# * <tt>file</tt>: path of the CSV file to write
def write_local(file)
  lines = 0
  t = Benchmark.realtime do
    CSV.open(file, 'w') do |csv|
      csv << columns
      query_rows.each do |row|
        csv << columns.map { |column| row[column.to_s] }
        lines += 1
      end
    end
    # The trigger file only marks completion, so it is left empty
    File.write(local_file_trigger(file), '')
  end
  ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
end
168
+
169
# Get the query to use. A query given in the configuration (or built by an
# earlier call) is returned as-is. Otherwise the statement is assembled from
# the select, table, join, conditions, group and order options; when
# new_records_only is set and a completed job exists for this control file,
# a condition on that column is added. The Engine's limit and offset are
# applied through the connection, newlines are replaced by spaces, and the
# result is logged and memoized.
def query
  return @query if @query

  sql = "SELECT #{select} FROM #{@table}"
  sql << " #{join}" if join

  filters = []
  if new_records_only
    last_completed = ETL::Execution::Job.maximum('created_at',
      :conditions => ['control_file = ? and completed_at is not null', control.file]
    )
    if last_completed
      filters << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
    end
  end
  filters << configuration[:conditions] if configuration[:conditions]

  sql << " WHERE #{filters.join(' AND ')}" unless filters.empty?
  sql << " GROUP BY #{group}" if group
  sql << " ORDER BY #{order}" if order

  limit, offset = ETL::Engine.limit, ETL::Engine.offset
  if limit || offset
    options = {}
    options[:limit] = limit if limit
    options[:offset] = offset if offset
    connection.add_limit_offset!(sql, options)
  end

  sql = sql.tr("\n", ' ')
  ETL::Engine.logger.info "Query: #{sql}"
  @query = sql
end
204
+
205
# Get the rows returned by running the query on the connection; the result
# is fetched once and memoized
def query_rows
  return @query_rows if @query_rows
  @query_rows = connection.select_all(query)
end
208
+
209
# Get the database connection to use: the one ETL::Engine provides for the
# configured target
def connection
  target_name = target
  ETL::Engine.connection(target_name)
end
213
+
214
# Get the host from the connection configuration stored under the target
# name, defaults to 'localhost'
def host
  settings = ETL::Base.configurations[target.to_s]
  settings['host'] || 'localhost'
end
218
+
219
# Get the database name from the connection configuration stored under the
# target name
def database
  settings = ETL::Base.configurations[target.to_s]
  settings['database']
end
222
+ end
223
+ end
224
+ end