factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,95 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # Destination which writes directly to a database. This is useful when you are dealing with
4
+ # a small amount of data. For larger amounts of data you should probably use the bulk
5
+ # loader if it is supported with your target database as it will use a much faster load
6
+ # method.
7
class DatabaseDestination < Destination
  # The target connection
  attr_reader :target

  # The table
  attr_reader :table

  # Specify the order from the source
  attr_reader :order

  # Set to true to truncate the destination table first
  attr_reader :truncate

  # Initialize the database destination
  #
  # * <tt>control</tt>: The ETL::Control::Control instance
  # * <tt>configuration</tt>: The configuration Hash
  # * <tt>mapping</tt>: The mapping
  #
  # Configuration options:
  # * <tt>:database</tt>: The database name (not read by this class)
  # * <tt>:target</tt>: The target connection (REQUIRED)
  # * <tt>:table</tt>: The table to write to (REQUIRED)
  # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
  # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
  # * <tt>:append_rows</tt>: Array of rows to append
  #
  # Mapping options:
  # * <tt>:order</tt>: The order of fields to write (falls back to
  #   <tt>order_from_source</tt> when omitted)
  #
  # Raises ControlError when no order, table or target can be determined.
  def initialize(control, configuration, mapping={})
    super
    @target = configuration[:target]
    @table = configuration[:table]
    # NOTE: ||= also writes the default back into the configuration Hash
    @truncate = configuration[:truncate] ||= false
    @unique = configuration[:unique]
    @order = mapping[:order] || order_from_source
    raise ControlError, "Order required in mapping" unless @order
    raise ControlError, "Table required" unless @table
    raise ControlError, "Target required" unless @target
  end

  # Flush the currently buffered data. Every allowed row is written with its
  # own INSERT statement, all inside one transaction on the target connection.
  def flush
    conn.transaction do
      buffer.flatten.each do |row|
        # check to see if this row's compound key constraint already exists
        # note that the compound key constraint may not utilize virtual fields
        next unless row_allowed?(row)

        # add any virtual fields
        add_virtuals!(row)

        # Identifier quoting is delegated to the connection adapter instead of
        # hard-coding MySQL backticks, so the statement is valid on any adapter.
        names = order.map { |name| conn.quote_column_name(name.to_s) }
        values = order.map { |name| conn.quote(row[name]) }
        quoted_table = conn.quote_table_name(table_name.to_s)

        q = "INSERT INTO #{quoted_table} (#{names.join(',')}) VALUES (#{values.join(',')})"
        ETL::Engine.logger.debug("Executing insert: #{q}")
        conn.insert(q, "Insert row #{current_row}")
        @current_row += 1
      end
      buffer.clear
    end
  end

  # Close the destination: queue any configured append rows and flush the
  # buffer one last time.
  def close
    buffer << append_rows if append_rows
    flush
  end

  private

  # Lazily obtain the target connection. The destination table is truncated
  # the first time the connection is requested when <tt>truncate</tt> is set.
  def conn
    @conn ||= begin
      connection = ETL::Engine.connection(target)
      connection.truncate(table_name) if truncate
      connection
    end
  end

  # The table name as resolved by ETL::Engine.table for the target connection
  def table_name
    ETL::Engine.table(table, ETL::Engine.connection(target))
  end
end
94
+ end
95
+ end
@@ -0,0 +1,124 @@
1
+ # This source file contains the ETL::Control::FileDestination
2
+
3
+ module ETL #:nodoc:
4
+ module Control #:nodoc:
5
+ # File as the final destination.
6
class FileDestination < Destination
  # The File to write to
  attr_reader :file

  # The output order
  attr_reader :order

  # Flag which indicates to append (default is to overwrite)
  attr_accessor :append

  # The separator
  attr_accessor :separator

  # The end of line marker
  attr_accessor :eol

  # The enclosure character
  attr_accessor :enclose

  # Initialize the object.
  # * <tt>control</tt>: The Control object
  # * <tt>configuration</tt>: The configuration map
  # * <tt>mapping</tt>: The output mapping
  #
  # Configuration options:
  # * <tt>:file</tt>: The file to write to, resolved relative to the directory
  #   of the control file (REQUIRED)
  # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
  # * <tt>:separator</tt>: Record separator (default is a comma)
  # * <tt>:eol</tt>: End of line marker (default is \n)
  # * <tt>:enclose</tt>: Enclosure character (default is none)
  # * <tt>:unique</tt>: Set to true to only write unique records
  # * <tt>:append_rows</tt>: Array of rows to append
  #
  # Mapping options:
  # * <tt>:order</tt>: The order array (falls back to
  #   <tt>order_from_source</tt> when omitted)
  #
  # Raises ControlError when no order can be determined.
  def initialize(control, configuration, mapping={})
    super
    @file = File.join(File.dirname(control.file), configuration[:file])
    # NOTE: the ||= defaults below are also written back into the configuration Hash
    @append = configuration[:append] ||= false
    @separator = configuration[:separator] ||= ','
    @eol = configuration[:eol] ||= "\n"
    @enclose = configuration[:enclose]
    @unique = configuration[:unique]

    @order = mapping[:order] || order_from_source
    raise ControlError, "Order required in mapping" unless @order
  end

  # Close the destination. This will flush the buffer and close the underlying stream.
  def close
    buffer << append_rows if append_rows
    flush
    f.close
  end

  # Flush the destination buffer, writing one delimited line per allowed row.
  def flush
    buffer.flatten.each do |row|
      # check to see if this row's compound key constraint already exists
      # note that the compound key constraint may not utilize virtual fields
      next unless row_allowed?(row)

      # add any virtual fields
      add_virtuals!(row)

      # collect all of the values using the order designated in the configuration
      values = order.collect do |name|
        value = row[name]
        text = case value
               when Date, Time, DateTime then value.to_s(:db)
               else value.to_s
               end

        # Block-form gsub inserts the replacement verbatim, so a separator or
        # enclosure such as "&", "|" or a digit can neither be read as a
        # backreference nor as a regular expression metacharacter.
        text = text.gsub('\\') { '\\\\' }                      # double every backslash
        text = text.gsub(separator) { |found| "\\#{found}" }   # escape the separator
        text = text.delete("\r\n")                             # strip line breaks

        # enclose the value if required, escaping embedded enclosure characters
        text = enclose + text.gsub(enclose) { |found| "\\#{found}" } + enclose unless enclose.nil?
        text
      end

      # write the values joined by the separator defined in the configuration
      f.write(values.join(separator))

      # write the end-of-line
      f.write(eol)
    end
    f.flush
    buffer.clear
  end

  private

  # Get the open file stream, opening it on first use. File.open is used
  # rather than Kernel#open so the path is always treated as a plain file.
  def f
    @f ||= File.open(file, mode)
  end

  # CSV-style option Hash derived from the separator, eol and enclose
  # settings. Nothing in this class calls it.
  def options
    @options ||= {
      :col_sep => separator,
      :row_sep => eol,
      :force_quotes => !enclose.nil?
    }
  end

  # Get the appropriate mode to open the file stream
  def mode
    append ? 'a' : 'w'
  end
end
123
+ end
124
+ end
@@ -0,0 +1,109 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # ETL source. Subclasses must implement the <tt>each</tt> method.
4
class Source
  include Enumerable

  # The control object
  attr_accessor :control

  # The configuration Hash
  attr_accessor :configuration

  # The definition Hash
  attr_accessor :definition

  # Returns true if the source data should be stored locally for archival
  # Default behavior will return true.
  attr_accessor :store_locally

  # The base directory where local files are stored (the reader is defined
  # below and supplies the default).
  attr_writer :local_base

  class << self
    # Convert the name to a Source class.
    #
    # For example if name is :database then this will return a
    # DatabaseSource class. Relies on String#camelize being available.
    def class_for_name(name)
      ETL::Control.const_get("#{name.to_s.camelize}Source")
    end
  end

  # Initialize the Source instance
  # * <tt>control</tt>: The control object
  # * <tt>configuration</tt>: The configuration hash
  # * <tt>definition</tt>: The source layout definition
  #
  # Configuration options:
  # * <tt>:store_locally</tt>: Set to false to not store source data
  #   locally (defaults to true)
  def initialize(control, configuration, definition)
    @control = control
    @configuration = configuration
    @definition = definition

    # Only a missing (nil) setting falls back to true. The previous
    # `configuration[:store_locally] || true` discarded an explicit false.
    setting = configuration[:store_locally]
    @store_locally = setting.nil? ? true : setting
  end

  # Get an array of errors that occur during reading from the source
  def errors
    @errors ||= []
  end

  # Get a timestamp value as a string
  def timestamp
    Engine.timestamp
  end

  # Get the local base, defaults to 'source_data'
  def local_base
    @local_base ||= 'source_data'
  end

  # The local directory for storing. This method must be overriden by
  # subclasses
  def local_directory
    raise "local_directory method is abstract"
  end

  # Return the local file for storing the raw source data. Each call to
  # this method will result in a timestamped file, so you cannot expect
  # to call it multiple times and reference the same file
  #
  # Optional sequence can be specified if there are multiple source files.
  # The local directory is created if it does not exist yet.
  def local_file(sequence=nil)
    filename = timestamp.to_s
    filename += sequence.to_s if sequence

    local_dir = local_directory
    FileUtils.mkdir_p(local_dir)
    File.join(local_dir, "#{filename}.csv")
  end

  # Get the last fully written local file, derived from the newest trigger
  # file. Raises when the local directory contains no trigger file.
  def last_local_file
    trigger = last_local_file_trigger
    raise "Local cache trigger file not found" unless trigger
    File.join(local_directory, File.basename(trigger, '.trig'))
  end

  # Get the last local file trigger, or nil when there is none. Trigger
  # names start with a timestamp, so the lexicographically greatest path is
  # taken explicitly instead of trusting the order Dir.glob returns.
  def last_local_file_trigger
    Dir.glob(File.join(local_directory, '*.trig')).max
  end

  # Get the local trigger file that is used to indicate that the file has
  # been completely written
  def local_file_trigger(file)
    Pathname.new(file.to_s + '.trig')
  end

  # Return true if the source should read locally.
  def read_locally
    Engine.read_locally
  end
end
106
+ end
107
+ end
108
+
109
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,220 @@
1
+ require 'fileutils'
2
+
3
+ module ETL #:nodoc:
4
+ class Source < ::ActiveRecord::Base #:nodoc:
5
+ # Empty ActiveRecord model; per the original note it serves as the connection for database sources (no use of it is visible in this file)
6
+ end
7
+
8
+ module Control #:nodoc:
9
+ # Source object which extracts data from a database using ActiveRecord.
10
class DatabaseSource < Source
  # The name of the target connection
  attr_accessor :target
  # The source table name
  attr_accessor :table

  # Initialize the source.
  #
  # Arguments:
  # * <tt>control</tt>: The ETL::Control::Control instance
  # * <tt>configuration</tt>: The configuration Hash
  # * <tt>definition</tt>: The source definition
  #
  # Required configuration options:
  # * <tt>:target</tt>: The target connection
  # * <tt>:table</tt>: The source table name
  # * <tt>:database</tt>: The database name
  #
  # Other options:
  # * <tt>:join</tt>: Optional join part for the query (ignored unless
  #   specified)
  # * <tt>:select</tt>: Optional select part for the query (defaults to
  #   '*')
  # * <tt>:conditions</tt>: Optional SQL fragment that is added to the WHERE
  #   clause
  # * <tt>:group</tt>: Optional group by part for the query (ignored
  #   unless specified)
  # * <tt>:order</tt>: Optional order part for the query (ignored unless
  #   specified)
  # * <tt>:new_records_only</tt>: Specify the column to use when comparing
  #   timestamps against the last successful ETL job execution for the
  #   current control file.
  # * <tt>:store_locally</tt>: Set to false to not store a copy of the
  #   source data locally in a flat file (defaults to true)
  def initialize(control, configuration, definition)
    super
    @target = configuration[:target]
    @table = configuration[:table]
  end

  # Get a String identifier for the source
  def to_s
    "#{host}/#{database}/#{table}"
  end

  # Get the local directory to use, which is a combination of the
  # local_base, the db hostname the db database name and the db table.
  def local_directory
    File.join(local_base, host, database, configuration[:table])
  end

  # Get the join part of the query, defaults to nil
  def join
    configuration[:join]
  end

  # Get the select part of the query, defaults to '*'
  def select
    configuration[:select] || '*'
  end

  # Get the group by part of the query, defaults to nil
  def group
    configuration[:group]
  end

  # Get the order for the query, defaults to nil
  def order
    configuration[:order]
  end

  # Return the column which is used for in the where clause to identify
  # new rows
  def new_records_only
    configuration[:new_records_only]
  end

  # Get the number of rows in the source. The value is memoized; pass
  # <tt>false</tt> to force it to be recomputed.
  def count(use_cache=true)
    return @count if @count && use_cache
    if store_locally || read_locally
      @count = count_locally
    else
      @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
    end
  end

  # Get the list of columns to read, taken from the keys of the first
  # result row of the query.
  def columns
    # weird default is required for writing to cache correctly
    @columns ||= query_rows.any? ? query_rows.first.keys : ['']
  end

  # Returns each row from the source. If read_locally is specified then
  # this method will attempt to read from the last stored local file.
  # If no locally stored file exists or if the trigger file for the last
  # locally stored file does not exist then this method will raise an
  # error.
  def each(&block)
    if read_locally # Read from the last stored source
      ETL::Engine.logger.debug "Reading from local cache"
      read_rows(last_local_file, &block)
    else # Read from the original source
      if store_locally
        file = local_file
        write_local(file)
        read_rows(file, &block)
      else
        query_rows.each do |row|
          row = ETL::Row.new(row.symbolize_keys)
          row.source = self
          yield row
        end
      end
    end
  end

  private

  # Read rows from the local cache file, yielding one ETL::Row per CSV record.
  # Raises when either the cache file or its trigger file is missing.
  def read_rows(file)
    # File.exist? is used because File.exists? no longer exists in current Ruby
    raise "Local cache file not found" unless File.exist?(file)
    raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

    t = Benchmark.realtime do
      # foreach closes the file when iteration ends, even if the caller
      # stops early, instead of leaving the handle open
      FasterCSV.foreach(file, :headers => true) do |row|
        result_row = ETL::Row.new
        result_row.source = self
        row.each do |header, field|
          result_row[header.to_sym] = field
        end
        yield result_row
      end
    end
    ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
  end

  # Count the lines of the last local file. Every physical line is counted,
  # so a header line such as the one write_local emits is included.
  def count_locally
    counter = 0
    # File.foreach closes the file once all lines have been read
    File.foreach(last_local_file) { counter += 1 }
    counter
  end

  # Write rows to the local cache: a header line with the column names, one
  # line per query row, and finally an empty trigger file marking the cache
  # file as completely written.
  def write_local(file)
    lines = 0
    t = Benchmark.realtime do
      FasterCSV.open(file, 'w') do |f|
        f << columns
        query_rows.each do |row|
          f << columns.collect { |column| row[column.to_s] }
          lines += 1
        end
      end
      File.open(local_file_trigger(file), 'w') {|f| }
    end
    ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
  end

  # Get the query to use. The SQL is assembled once from the configuration
  # and then memoized.
  def query
    return @query if @query
    q = "SELECT #{select} FROM #{configuration[:table]}"
    q << " #{join}" if join

    conditions = []
    if new_records_only
      # restrict to rows newer than the last completed job for this control file
      last_completed = ETL::Execution::Job.maximum('created_at',
        :conditions => ['control_file = ? and completed_at is not null', control.file]
      )
      if last_completed
        conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
      end
    end

    conditions << configuration[:conditions] if configuration[:conditions]
    if conditions.length > 0
      q << " WHERE #{conditions.join(' AND ')}"
    end

    q << " GROUP BY #{group}" if group
    q << " ORDER BY #{order}" if order

    if ETL::Engine.limit || ETL::Engine.offset
      options = {}
      options[:limit] = ETL::Engine.limit if ETL::Engine.limit
      options[:offset] = ETL::Engine.offset if ETL::Engine.offset
      connection.add_limit_offset!(q, options)
    end

    q = q.gsub(/\n/,' ')
    ETL::Engine.logger.info "Query: #{q}"
    @query = q
  end

  # Run the query once and memoize the resulting rows
  def query_rows
    @query_rows ||= connection.select_all(query)
  end

  # Get the database connection to use
  def connection
    ETL::Engine.connection(target)
  end

  # Get the host, defaults to 'localhost'
  def host
    ETL::Base.configurations[target.to_s]['host'] || 'localhost'
  end

  # Get the database name from the configuration of the target connection
  def database
    ETL::Base.configurations[target.to_s]['database']
  end
end
219
+ end
220
+ end