balinterdi-activewarehouse-etl 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (159) hide show
  1. data/.gitignore +4 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +198 -0
  4. data/HOW_TO_RELEASE +4 -0
  5. data/LICENSE +7 -0
  6. data/README +85 -0
  7. data/Rakefile +169 -0
  8. data/TODO +28 -0
  9. data/VERSION +1 -0
  10. data/active_support_logger.patch +78 -0
  11. data/balinterdi-activewarehouse-etl.gemspec +221 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +78 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +55 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +420 -0
  26. data/lib/etl/control/destination/database_destination.rb +95 -0
  27. data/lib/etl/control/destination/file_destination.rb +124 -0
  28. data/lib/etl/control/source.rb +109 -0
  29. data/lib/etl/control/source/database_source.rb +220 -0
  30. data/lib/etl/control/source/enumerable_source.rb +11 -0
  31. data/lib/etl/control/source/file_source.rb +90 -0
  32. data/lib/etl/control/source/model_source.rb +39 -0
  33. data/lib/etl/core_ext.rb +1 -0
  34. data/lib/etl/core_ext/time.rb +5 -0
  35. data/lib/etl/core_ext/time/calculations.rb +42 -0
  36. data/lib/etl/engine.rb +556 -0
  37. data/lib/etl/execution.rb +20 -0
  38. data/lib/etl/execution/base.rb +9 -0
  39. data/lib/etl/execution/batch.rb +8 -0
  40. data/lib/etl/execution/job.rb +8 -0
  41. data/lib/etl/execution/migration.rb +85 -0
  42. data/lib/etl/generator.rb +2 -0
  43. data/lib/etl/generator/generator.rb +20 -0
  44. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  45. data/lib/etl/http_tools.rb +139 -0
  46. data/lib/etl/parser.rb +11 -0
  47. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  48. data/lib/etl/parser/delimited_parser.rb +75 -0
  49. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  50. data/lib/etl/parser/parser.rb +41 -0
  51. data/lib/etl/parser/sax_parser.rb +218 -0
  52. data/lib/etl/parser/xml_parser.rb +65 -0
  53. data/lib/etl/processor.rb +11 -0
  54. data/lib/etl/processor/block_processor.rb +14 -0
  55. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  56. data/lib/etl/processor/check_exist_processor.rb +80 -0
  57. data/lib/etl/processor/check_unique_processor.rb +35 -0
  58. data/lib/etl/processor/copy_field_processor.rb +26 -0
  59. data/lib/etl/processor/encode_processor.rb +55 -0
  60. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  61. data/lib/etl/processor/print_row_processor.rb +12 -0
  62. data/lib/etl/processor/processor.rb +25 -0
  63. data/lib/etl/processor/rename_processor.rb +24 -0
  64. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  65. data/lib/etl/processor/row_processor.rb +17 -0
  66. data/lib/etl/processor/sequence_processor.rb +23 -0
  67. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  68. data/lib/etl/processor/truncate_processor.rb +35 -0
  69. data/lib/etl/row.rb +20 -0
  70. data/lib/etl/screen.rb +14 -0
  71. data/lib/etl/screen/row_count_screen.rb +20 -0
  72. data/lib/etl/transform.rb +2 -0
  73. data/lib/etl/transform/block_transform.rb +13 -0
  74. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  75. data/lib/etl/transform/decode_transform.rb +51 -0
  76. data/lib/etl/transform/default_transform.rb +20 -0
  77. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  78. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  79. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  80. data/lib/etl/transform/sha1_transform.rb +13 -0
  81. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  82. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  83. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  84. data/lib/etl/transform/transform.rb +61 -0
  85. data/lib/etl/transform/trim_transform.rb +26 -0
  86. data/lib/etl/transform/type_transform.rb +35 -0
  87. data/lib/etl/util.rb +59 -0
  88. data/lib/etl/version.rb +9 -0
  89. data/test/.ignore +2 -0
  90. data/test/all.ebf +6 -0
  91. data/test/apache_combined_log.ctl +11 -0
  92. data/test/batch_test.rb +41 -0
  93. data/test/batch_with_error.ebf +6 -0
  94. data/test/batched1.ctl +0 -0
  95. data/test/batched2.ctl +0 -0
  96. data/test/block_processor.ctl +6 -0
  97. data/test/block_processor_error.ctl +1 -0
  98. data/test/block_processor_pre_post_process.ctl +4 -0
  99. data/test/block_processor_remove_rows.ctl +5 -0
  100. data/test/block_processor_test.rb +38 -0
  101. data/test/connection/native_mysql/connection.rb +9 -0
  102. data/test/connection/native_mysql/schema.sql +36 -0
  103. data/test/connection/postgresql/connection.rb +13 -0
  104. data/test/connection/postgresql/schema.sql +43 -0
  105. data/test/control_test.rb +43 -0
  106. data/test/data/apache_combined_log.txt +3 -0
  107. data/test/data/bulk_import.txt +3 -0
  108. data/test/data/bulk_import_with_empties.txt +3 -0
  109. data/test/data/decode.txt +3 -0
  110. data/test/data/delimited.txt +3 -0
  111. data/test/data/encode_source_latin1.txt +2 -0
  112. data/test/data/fixed_width.txt +3 -0
  113. data/test/data/multiple_delimited_1.txt +3 -0
  114. data/test/data/multiple_delimited_2.txt +3 -0
  115. data/test/data/people.txt +3 -0
  116. data/test/data/sax.xml +14 -0
  117. data/test/data/xml.xml +16 -0
  118. data/test/database.example.yml +18 -0
  119. data/test/database.mysql.yml +18 -0
  120. data/test/database.postgres.yml +18 -0
  121. data/test/database.yml +18 -0
  122. data/test/date_dimension_builder_test.rb +96 -0
  123. data/test/delimited.ctl +30 -0
  124. data/test/delimited_absolute.ctl +33 -0
  125. data/test/delimited_destination_db.ctl +25 -0
  126. data/test/delimited_with_bulk_load.ctl +34 -0
  127. data/test/destination_test.rb +171 -0
  128. data/test/directive_test.rb +23 -0
  129. data/test/encode_processor_test.rb +31 -0
  130. data/test/engine_test.rb +32 -0
  131. data/test/errors.ctl +24 -0
  132. data/test/etl_test.rb +42 -0
  133. data/test/fixed_width.ctl +35 -0
  134. data/test/generator_test.rb +14 -0
  135. data/test/inline_parser.ctl +17 -0
  136. data/test/mocks/mock_destination.rb +26 -0
  137. data/test/mocks/mock_source.rb +25 -0
  138. data/test/model_source.ctl +14 -0
  139. data/test/multiple_delimited.ctl +22 -0
  140. data/test/multiple_source_delimited.ctl +39 -0
  141. data/test/parser_test.rb +200 -0
  142. data/test/performance/delimited.ctl +30 -0
  143. data/test/processor_test.rb +38 -0
  144. data/test/row_processor_test.rb +17 -0
  145. data/test/sax.ctl +26 -0
  146. data/test/scd/1.txt +1 -0
  147. data/test/scd/2.txt +1 -0
  148. data/test/scd/3.txt +1 -0
  149. data/test/scd_test.rb +263 -0
  150. data/test/scd_test_type_1.ctl +43 -0
  151. data/test/scd_test_type_2.ctl +42 -0
  152. data/test/screen_test.rb +9 -0
  153. data/test/screen_test_error.ctl +3 -0
  154. data/test/screen_test_fatal.ctl +3 -0
  155. data/test/source_test.rb +139 -0
  156. data/test/test_helper.rb +33 -0
  157. data/test/transform_test.rb +101 -0
  158. data/test/xml.ctl +31 -0
  159. metadata +237 -0
@@ -0,0 +1,124 @@
1
+ # This source file contains the ETL::Control::FileDestination
2
+
3
+ module ETL #:nodoc:
4
+ module Control #:nodoc:
5
+ # File as the final destination.
6
+ class FileDestination < Destination
7
+ # The File to write to
8
+ attr_reader :file
9
+
10
+ # The output order
11
+ attr_reader :order
12
+
13
+ # Flag which indicates to append (default is to overwrite)
14
+ attr_accessor :append
15
+
16
+ # The separator
17
+ attr_accessor :separator
18
+
19
+ # The end of line marker
20
+ attr_accessor :eol
21
+
22
+ # The enclosure character
23
+ attr_accessor :enclose
24
+
25
+ # Initialize the object.
26
+ # * <tt>control</tt>: The Control object
27
+ # * <tt>configuration</tt>: The configuration map
28
+ # * <tt>mapping</tt>: The output mapping
29
+ #
30
+ # Configuration options:
31
+ # * <tt>:file</tt>: The file to write to (REQUIRED)
32
+ # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
33
+ # * <tt>:separator</tt>: Field separator placed between values (default is a comma)
34
+ # * <tt>:eol</tt>: End of line marker (default is \n)
35
+ # * <tt>:enclose</tt>: Enclosure character (default is none)
36
+ # * <tt>:unique</tt>: Set to true to only write unique records
37
+ # * <tt>:append_rows</tt>: Array of rows to append
38
+ #
39
+ # Mapping options:
40
+ # * <tt>:order</tt>: The order array
41
def initialize(control, configuration, mapping={})
  super

  # Fill in the defaults first. They are written back into the
  # configuration hash itself, so the hash reflects the values in use.
  configuration[:append] ||= false
  configuration[:separator] ||= ','
  configuration[:eol] ||= "\n"

  @file = configuration[:file]
  @append = configuration[:append]
  @separator = configuration[:separator]
  @eol = configuration[:eol]
  @enclose = configuration[:enclose]
  @unique = configuration[:unique]

  # An explicit :order in the mapping wins; otherwise fall back to
  # whatever order_from_source provides.
  @order = mapping[:order] || order_from_source
  unless @order
    raise ControlError, "Order required in mapping"
  end
end
53
+
54
+ # Close the destination. This will flush the buffer and close the underlying stream or connection.
55
def close
  # Appends the configured append_rows (if any) to the buffer, flushes the
  # buffer and closes the stream returned by +f+.
  #
  # If buffering or flushing raises, a stream that is already open is
  # closed before the error is re-raised, so the file handle is not leaked.
  begin
    buffer << append_rows if append_rows
    flush
  rescue StandardError
    # Only touch a stream that already exists; calling +f+ here would
    # lazily open a new one in the middle of error handling.
    @f.close if @f && !@f.closed?
    raise
  end
  f.close
end
60
+
61
+ # Flush the destination buffer
62
def flush
  # Writes every buffered row to the file stream and then empties the
  # buffer. For each row accepted by row_allowed? the virtual fields are
  # added, the values are collected in the configured order, escaped,
  # optionally enclosed, joined with the separator and terminated by eol.

  # The enclosure is matched literally. Escaping it keeps characters that
  # are special in a regexp (e.g. "|" or "(") from corrupting the output
  # or raising a RegexpError.
  enclose_pattern = enclose.nil? ? nil : Regexp.new(Regexp.escape(enclose))

  buffer.flatten.each do |row|
    # check to see if this row's compound key constraint already exists
    # note that the compound key constraint may not utilize virtual fields
    next unless row_allowed?(row)

    # add any virtual fields
    add_virtuals!(row)

    # collect all of the values using the order designated in the configuration
    values = order.map do |name|
      value = row[name]
      text = case value
             when Date, Time, DateTime then value.to_s(:db)
             else value.to_s
             end

      # Block-form gsub is used throughout so the replacement text is never
      # interpreted as a backreference template.
      text = text.gsub('\\') { '\\\\' }                      # double every backslash
      text = text.gsub(separator) { |match| "\\#{match}" }   # escape the separator
      text = text.gsub(/\n|\r/, '')                          # drop line breaks

      # enclose the value if required, escaping embedded enclosure characters
      if enclose_pattern
        text = enclose + text.gsub(enclose_pattern) { |match| "\\#{match}" } + enclose
      end
      text
    end

    # write the values joined by the separator defined in the configuration
    f.write(values.join(separator))

    # write the end-of-line
    f.write(eol)
  end
  f.flush
  buffer.clear
end
103
+
104
+ private
105
+ # Get the open file stream
106
def f
  # Lazily open the destination file on first use and reuse the same
  # stream afterwards.
  return @f if @f
  @f = open(file, mode)
end
109
+
110
def options
  # Memoized hash describing the separator, end-of-line marker and
  # whether an enclosure character is configured.
  return @options if @options

  settings = {}
  settings[:col_sep] = separator
  settings[:row_sep] = eol
  settings[:force_quotes] = !enclose.nil?
  @options = settings
end
117
+
118
+ # Get the appropriate mode to open the file stream
119
def mode
  # 'a' appends to an existing file, 'w' overwrites it.
  if append
    'a'
  else
    'w'
  end
end
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,109 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # ETL source. Subclasses must implement the <tt>each</tt> method.
4
+ class Source
5
+ include Enumerable
6
+
7
+ # The control object
8
+ attr_accessor :control
9
+
10
+ # The configuration Hash
11
+ attr_accessor :configuration
12
+
13
+ # The definition Hash
14
+ attr_accessor :definition
15
+
16
+ # Returns true if the source data should be stored locally for archival
17
+ # Default behavior will return true.
18
+ attr_accessor :store_locally
19
+
20
+ class << self
21
+ # Convert the name to a Source class.
22
+ #
23
+ # For example if name is :database then this will return a
24
+ # DatabaseSource class
25
def class_for_name(name)
  # Build the constant name (camelized name plus "Source") and look it
  # up inside ETL::Control.
  class_name = name.to_s.camelize + 'Source'
  ETL::Control.const_get(class_name)
end
28
+ end
29
+
30
+ # Initialize the Source instance
31
+ # * <tt>control</tt>: The control object
32
+ # * <tt>configuration</tt>: The configuration hash
33
+ # * <tt>definition</tt>: The source layout definition
34
+ #
35
+ # Configuration options:
36
+ # * <tt>:store_locally</tt>: Set to false to not store source data
37
+ # locally (defaults to true)
38
def initialize(control, configuration, definition)
  @control = control
  @configuration = configuration
  @definition = definition

  # Default to true only when the option was not supplied. The previous
  # `value || true` could never evaluate to false, so an explicit
  # :store_locally => false was silently ignored.
  store_locally = configuration[:store_locally]
  @store_locally = store_locally.nil? ? true : store_locally
end
45
+
46
+ # Get an array of errors that occur during reading from the source
47
def errors
  # Created on first access; the same array is returned on later calls.
  @errors = [] unless @errors
  @errors
end
50
+
51
+ # Get a timestamp value as a string
52
def timestamp
  # Delegates to the engine so every caller shares its timestamp.
  Engine.timestamp
end
55
+
56
+ # The base directory where local files are stored.
57
+ attr_accessor :local_base
58
+
59
+ # Get the local base, defaults to 'source_data'
60
def local_base
  # Fall back to 'source_data' until a base directory has been assigned.
  @local_base = 'source_data' unless @local_base
  @local_base
end
63
+
64
+ # The local directory for storing. This method must be overridden by
65
+ # subclasses
66
def local_directory
  # Abstract hook: the base implementation always raises.
  raise RuntimeError, "local_directory method is abstract"
end
69
+
70
+ # Return the local file for storing the raw source data. Each call to
71
+ # this method will result in a timestamped file, so you cannot expect
72
+ # to call it multiple times and reference the same file
73
+ #
74
+ # Optional sequence can be specified if there are multiple source files
75
def local_file(sequence=nil)
  # File name is the timestamp, followed by the sequence when one is given.
  stamp = timestamp.to_s
  suffix = sequence ? sequence.to_s : ''

  # Make sure the directory exists before handing the path back.
  target_dir = local_directory
  FileUtils.mkdir_p(target_dir)
  File.join(target_dir, "#{stamp}#{suffix}.csv")
end
83
+
84
+ # Get the last fully written local file
85
def last_local_file
  # The data file carries the trigger's name without the '.trig' suffix.
  directory = local_directory
  cached_name = File.basename(last_local_file_trigger, '.trig')
  File.join(directory, cached_name)
end
88
+
89
+ # Get the last local file trigger
90
def last_local_file_trigger
  # Returns the path of the trigger file that sorts last by name, or nil
  # when the local directory holds no '*.trig' files.
  #
  # Dir.glob only returns sorted results from Ruby 3.0 on; before that the
  # order depends on the platform, so taking `.last` of the raw result
  # could pick an arbitrary trigger. `max` makes the choice explicit.
  # NOTE(review): assumes trigger names sort chronologically — confirm
  # against the format produced by Engine.timestamp.
  Dir.glob(File.join(local_directory, '*.trig')).max
end
93
+
94
+ # Get the local trigger file that is used to indicate that the file has
95
+ # been completely written
96
def local_file_trigger(file)
  # The trigger sits next to the data file, with '.trig' appended.
  Pathname.new("#{file}.trig")
end
99
+
100
+ # Return true if the source should read locally.
101
def read_locally
  # The setting lives on the engine, not on the individual source.
  Engine.read_locally
end
104
+
105
+ end
106
+ end
107
+ end
108
+
109
# Load every source implementation. The list is sorted because Dir[] did not
# guarantee any particular order before Ruby 3.0, which made load order
# platform-dependent.
Dir[File.dirname(__FILE__) + "/source/*.rb"].sort.each { |file| require(file) }
@@ -0,0 +1,220 @@
1
+ require 'fileutils'
2
+
3
+ module ETL #:nodoc:
4
class Source < ::ActiveRecord::Base #:nodoc:
  # Deliberately empty; per the original note this class exists to hold
  # the connection for database sources.
end
7
+
8
+ module Control #:nodoc:
9
+ # Source object which extracts data from a database using ActiveRecord.
10
+ class DatabaseSource < Source
11
+ attr_accessor :target
12
+ attr_accessor :table
13
+
14
+ # Initialize the source.
15
+ #
16
+ # Arguments:
17
+ # * <tt>control</tt>: The ETL::Control::Control instance
18
+ # * <tt>configuration</tt>: The configuration Hash
19
+ # * <tt>definition</tt>: The source definition
20
+ #
21
+ # Required configuration options:
22
+ # * <tt>:target</tt>: The target connection
23
+ # * <tt>:table</tt>: The source table name
24
+ # * <tt>:database</tt>: The database name
25
+ #
26
+ # Other options:
27
+ # * <tt>:join</tt>: Optional join part for the query (ignored unless
28
+ # specified)
29
+ # * <tt>:select</tt>: Optional select part for the query (defaults to
30
+ # '*')
31
+ # * <tt>:group</tt>: Optional group by part for the query (ignored
32
+ # unless specified)
33
+ # * <tt>:order</tt>: Optional order part for the query (ignored unless
34
+ # specified)
35
+ # * <tt>:new_records_only</tt>: Specify the column to use when comparing
36
+ # timestamps against the last successful ETL job execution for the
37
+ # current control file.
38
+ # * <tt>:store_locally</tt>: Set to false to not store a copy of the
39
+ # source data locally in a flat file (defaults to true)
40
def initialize(control, configuration, definition)
  super
  # Keep the connection target and table name at hand for later queries.
  @target, @table = configuration.values_at(:target, :table)
end
45
+
46
+ # Get a String identifier for the source
47
+ def to_s
48
+ "#{host}/#{database}/#{table}"
49
+ end
50
+
51
+ # Get the local directory to use, which is a combination of the
52
+ # local_base, the db hostname the db database name and the db table.
53
def local_directory
  # One directory level each for base, host, database and table.
  segments = [local_base, host, database, configuration[:table]]
  File.join(*segments)
end
56
+
57
+ # Get the join part of the query, defaults to nil
58
def join
  # nil unless the configuration supplies a :join fragment.
  configuration[:join]
end
61
+
62
+ # Get the select part of the query, defaults to '*'
63
def select
  # Use the configured select list when present, otherwise every column.
  fields = configuration[:select]
  fields ? fields : '*'
end
66
+
67
+ # Get the group by part of the query, defaults to nil
68
def group
  # nil unless the configuration supplies a :group fragment.
  configuration[:group]
end
71
+
72
+ # Get the order for the query, defaults to nil
73
def order
  # nil unless the configuration supplies an :order fragment.
  configuration[:order]
end
76
+
77
+ # Return the column which is used in the where clause to identify
78
+ # new rows
79
def new_records_only
  # nil unless the configuration names a timestamp column.
  configuration[:new_records_only]
end
82
+
83
+ # Get the number of rows in the source
84
def count(use_cache=true)
  # Reuse the previous answer unless the caller asks for a fresh count.
  return @count if use_cache && @count

  @count =
    if store_locally || read_locally
      # Data is (or will be) in the local cache: count it there.
      count_locally
    else
      # Otherwise rewrite the select list of the query into a count.
      count_sql = query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM')
      connection.select_value(count_sql)
    end
end
92
+
93
+ # Get the list of columns to read, taken from the keys of the first row
94
+ # returned by the query (a single blank column name when there are no rows)
95
def columns
  return @columns if @columns

  rows = query_rows
  @columns =
    if rows.any?
      rows.first.keys
    else
      # weird default is required for writing to cache correctly
      ['']
    end
end
99
+
100
+ # Returns each row from the source. If read_locally is specified then
101
+ # this method will attempt to read from the last stored local file.
102
+ # If no locally stored file exists or if the trigger file for the last
103
+ # locally stored file does not exist then this method will raise an
104
+ # error.
105
def each(&block)
  if read_locally
    # Replay the last stored copy of the source.
    ETL::Engine.logger.debug "Reading from local cache"
    read_rows(last_local_file, &block)
  elsif store_locally
    # Snapshot the query result to a local file, then read it back.
    cache_file = local_file
    write_local(cache_file)
    read_rows(cache_file, &block)
  else
    # Stream straight from the query result.
    query_rows.each do |record|
      etl_row = ETL::Row.new(record.symbolize_keys)
      etl_row.source = self
      yield etl_row
    end
  end
end
123
+
124
+ private
125
+ # Read rows from the local cache
126
def read_rows(file)
  # Reads the cached CSV +file+ and yields one ETL::Row per record, keyed
  # by the symbolized header names, then updates the engine's average
  # rows-per-second figure.
  #
  # Raises a RuntimeError when the cache file or its trigger is missing.

  # File.exist? works on every Ruby version; the File.exists? alias was
  # removed in Ruby 3.2.
  raise "Local cache file not found" unless File.exist?(file)
  raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

  t = Benchmark.realtime do
    # Block form so the file handle is closed when reading finishes or fails.
    FasterCSV.open(file, :headers => true) do |csv|
      csv.each do |row|
        result_row = ETL::Row.new
        result_row.source = self
        row.each do |header, field|
          result_row[header.to_sym] = field
        end
        yield result_row
      end
    end
  end
  ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
end
142
+
143
def count_locally
  # Counts the lines of the last local cache file.
  #
  # File.foreach closes the file when iteration ends; the previous
  # File.open without a block left the handle open.
  # NOTE(review): write_local puts a header row first, so this figure
  # appears to include it — confirm whether callers expect that.
  counter = 0
  File.foreach(last_local_file) { counter += 1 }
  counter
end
148
+
149
+ # Write rows to the local cache
150
def write_local(file)
  written = 0
  elapsed = Benchmark.realtime do
    FasterCSV.open(file, 'w') do |csv|
      # Header row first, then one line per query row in column order.
      csv << columns
      query_rows.each do |record|
        csv << columns.map { |column| record[column.to_s] }
        written += 1
      end
    end
    # An empty trigger file marks the cache as completely written.
    File.open(local_file_trigger(file), 'w') {|marker| }
  end
  ETL::Engine.logger.info "Stored locally in #{elapsed}s (avg: #{written/elapsed} lines/sec)"
end
164
+
165
+ # Get the query to use
166
def query
  return @query if @query

  sql = "SELECT #{select} FROM #{configuration[:table]}"
  sql << " #{join}" if join

  filters = []
  if new_records_only
    # Restrict to rows newer than the last completed job for this control file.
    last_run = ETL::Execution::Job.maximum('created_at',
      :conditions => ['control_file = ? and completed_at is not null', control.file]
    )
    filters << "#{new_records_only} > #{connection.quote(last_run.to_s(:db))}" if last_run
  end

  extra = configuration[:conditions]
  filters << extra if extra
  sql << " WHERE #{filters.join(' AND ')}" unless filters.empty?

  sql << " GROUP BY #{group}" if group
  sql << " ORDER BY #{order}" if order

  limit = ETL::Engine.limit
  offset = ETL::Engine.offset
  if limit || offset
    paging = {}
    paging[:limit] = limit if limit
    paging[:offset] = offset if offset
    # Modifies sql in place.
    connection.add_limit_offset!(sql, paging)
  end

  # Collapse the statement onto a single line before logging and caching it.
  sql = sql.gsub(/\n/,' ')
  ETL::Engine.logger.info "Query: #{sql}"
  @query = sql
end
200
+
201
def query_rows
  # Run the query once and keep the result for later calls.
  return @query_rows if @query_rows
  @query_rows = connection.select_all(query)
end
204
+
205
+ # Get the database connection to use
206
def connection
  # Connections are looked up through the engine by target name.
  ETL::Engine.connection(target)
end
209
+
210
+ # Get the host, defaults to 'localhost'
211
def host
  # Read the target's settings and fall back to 'localhost' when no host is set.
  settings = ETL::Base.configurations[target.to_s]
  settings['host'] || 'localhost'
end
214
+
215
def database
  # Database name from the settings registered for the target.
  settings = ETL::Base.configurations[target.to_s]
  settings['database']
end
218
+ end
219
+ end
220
+ end