darrell-activewarehouse-etl 0.9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
@@ -0,0 +1,220 @@
1
+ require 'fileutils'
2
+
3
module ETL #:nodoc:
  class Source < ::ActiveRecord::Base #:nodoc:
    # Connection for database sources
  end

  module Control #:nodoc:
    # Source object which extracts data from a database using ActiveRecord.
    #
    # Rows are either read straight from the database, or (when
    # <tt>store_locally</tt> / <tt>read_locally</tt> are in effect) written
    # to and read back from a CSV file under <tt>local_directory</tt>.
    class DatabaseSource < Source
      attr_accessor :target
      attr_accessor :table

      # Initialize the source.
      #
      # Arguments:
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>definition</tt>: The source definition
      #
      # Required configuration options:
      # * <tt>:target</tt>: The target connection
      # * <tt>:table</tt>: The source table name
      # * <tt>:database</tt>: The database name
      #
      # Other options:
      # * <tt>:join</tt>: Optional join part for the query (ignored unless
      #   specified)
      # * <tt>:select</tt>: Optional select part for the query (defaults to
      #   '*')
      # * <tt>:group</tt>: Optional group by part for the query (ignored
      #   unless specified)
      # * <tt>:order</tt>: Optional order part for the query (ignored unless
      #   specified)
      # * <tt>:conditions</tt>: Optional SQL fragment ANDed into the WHERE
      #   clause
      # * <tt>:new_records_only</tt>: Specify the column to use when comparing
      #   timestamps against the last successful ETL job execution for the
      #   current control file.
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally in a flat file (defaults to true)
      def initialize(control, configuration, definition)
        super
        @target = configuration[:target]
        @table = configuration[:table]
      end

      # Get a String identifier for the source
      def to_s
        "#{host}/#{database}/#{table}"
      end

      # Get the local directory to use, which is a combination of the
      # local_base, the db hostname the db database name and the db table.
      def local_directory
        File.join(local_base, host, database, configuration[:table])
      end

      # Get the join part of the query, defaults to nil
      def join
        configuration[:join]
      end

      # Get the select part of the query, defaults to '*'
      def select
        configuration[:select] || '*'
      end

      # Get the group by part of the query, defaults to nil
      def group
        configuration[:group]
      end

      # Get the order for the query, defaults to nil
      def order
        configuration[:order]
      end

      # Return the column which is used in the where clause to identify
      # new rows
      def new_records_only
        configuration[:new_records_only]
      end

      # Get the number of rows in the source. The value is memoized; pass
      # <tt>false</tt> to force a recount.
      #
      # When rows are stored or read locally the count is the number of lines
      # in the last local file; otherwise the query is rewritten into a
      # <tt>SELECT count(1)</tt> and executed.
      def count(use_cache=true)
        return @count if @count && use_cache
        if store_locally || read_locally
          @count = count_locally
        else
          # NOTE(review): the greedy match replaces everything up to the LAST
          # " FROM" in the query, so sub-selects after the main FROM would be
          # swallowed -- confirm no control file relies on that.
          @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
        end
      end

      # Get the list of columns to read. The names are taken from the keys of
      # the first row returned by the query.
      def columns
        # weird default is required for writing to cache correctly
        @columns ||= query_rows.any? ? query_rows.first.keys : ['']
      end

      # Returns each row from the source. If read_locally is specified then
      # this method will attempt to read from the last stored local file.
      # If no locally stored file exists or if the trigger file for the last
      # locally stored file does not exist then this method will raise an
      # error.
      def each(&block)
        if read_locally # Read from the last stored source
          ETL::Engine.logger.debug "Reading from local cache"
          read_rows(last_local_file, &block)
        else # Read from the original source
          if store_locally
            file = local_file
            write_local(file)
            read_rows(file, &block)
          else
            query_rows.each do |row|
              row = ETL::Row.new(row.symbolize_keys)
              row.source = self
              yield row
            end
          end
        end
      end

      private
      # Read rows from the local cache, yielding one ETL::Row per CSV record.
      #
      # Raises if either the cache file or its trigger file is missing.
      def read_rows(file)
        # File.exist? is used because File.exists? no longer exists in
        # current Ruby versions.
        raise "Local cache file not found" unless File.exist?(file)
        raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

        t = Benchmark.realtime do
          # Block form so the CSV handle is closed when reading finishes or
          # the consumer raises.
          FasterCSV.open(file, :headers => true) do |csv|
            csv.each do |row|
              result_row = ETL::Row.new
              result_row.source = self
              row.each do |header, field|
                result_row[header.to_sym] = field
              end
              yield result_row
            end
          end
        end
        ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / t
      end

      # Count the lines of the last local file.
      #
      # NOTE(review): write_local emits a header line first, so this count
      # appears to include the header -- confirm whether callers expect that.
      def count_locally
        counter = 0
        # File.foreach closes the file once iteration is done.
        File.foreach(last_local_file) { |line| counter += 1 }
        counter
      end

      # Write rows to the local cache, then create the trigger file which
      # marks the cache file as completely written.
      def write_local(file)
        lines = 0
        t = Benchmark.realtime do
          FasterCSV.open(file, 'w') do |f|
            f << columns
            query_rows.each do |row|
              f << columns.collect { |column| row[column.to_s] }
              lines += 1
            end
          end
          File.open(local_file_trigger(file), 'w') {|f| }
        end
        ETL::Engine.logger.info "Stored locally in #{t}s (avg: #{lines/t} lines/sec)"
      end

      # Get the query to use. The SQL is assembled once from the
      # configuration and memoized.
      def query
        return @query if @query
        q = "SELECT #{select} FROM #{configuration[:table]}"
        q << " #{join}" if join

        conditions = []
        if new_records_only
          # Only rows newer than the last completed job for this control file
          last_completed = ETL::Execution::Job.maximum('created_at',
            :conditions => ['control_file = ? and completed_at is not null', control.file]
          )
          if last_completed
            conditions << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
          end
        end

        conditions << configuration[:conditions] if configuration[:conditions]
        if conditions.length > 0
          q << " WHERE #{conditions.join(' AND ')}"
        end

        q << " GROUP BY #{group}" if group
        q << " ORDER BY #{order}" if order

        if ETL::Engine.limit || ETL::Engine.offset
          options = {}
          options[:limit] = ETL::Engine.limit if ETL::Engine.limit
          options[:offset] = ETL::Engine.offset if ETL::Engine.offset
          connection.add_limit_offset!(q, options)
        end

        q = q.gsub(/\n/,' ')
        ETL::Engine.logger.info "Query: #{q}"
        @query = q
      end

      # Execute the query once and memoize the resulting rows
      def query_rows
        @query_rows ||= connection.select_all(query)
      end

      # Get the database connection to use
      def connection
        ETL::Engine.connection(target)
      end

      # Get the host, defaults to 'localhost'
      def host
        ETL::Base.configurations[target.to_s]['host'] || 'localhost'
      end

      # Get the database name configured for the target connection
      def database
        ETL::Base.configurations[target.to_s]['database']
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source backed by whatever object is supplied under the
    # <tt>:enumerable</tt> configuration key.
    class EnumerableSource < ETL::Control::Source
      # Hand the given block to the configured enumerable's own +each+.
      def each(&block)
        enumerable = configuration[:enumerable]
        enumerable.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source. Rows are produced by a parser which is built from the
    # <tt>:parser</tt> configuration option.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * <tt>:file</tt>: The source file
      # * <tt>:parser</tt>: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
      # * <tt>:store_locally</tt>: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source
      def to_s
        file
      end

      # Get the local storage directory: the local base joined with the
      # source file's basename without its extension.
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Returns each row from the source. The first ETL::Engine.offset rows
      # (when an offset is set) are consumed without being yielded.
      def each
        skipped = 0
        copy_sources if store_locally
        @parser.each do |row|
          if ETL::Engine.offset && skipped < ETL::Engine.offset
            skipped += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      private
      # Copy source data to a local directory structure. Every file matched
      # by the (possibly glob) source path is copied to its own sequenced
      # local file, followed by an empty trigger file marking it complete.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        # Relative paths are resolved against the control file's directory
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |source_path|
          next if source_path.directory?
          lf = local_file(sequence)
          FileUtils.cp(source_path, lf)
          File.open(local_file_trigger(lf), 'w') { }
          sequence += 1
        end
      end

      # Configure the source: remember the file, build the parser and set
      # the number of lines to skip.
      #
      # Raises ControlError when <tt>:parser</tt> is of an unsupported type.
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
        end
        # Also writes the default back into the configuration hash
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source which reads its rows from the model named by the
    # <tt>:model</tt> configuration option.
    class ModelSource < Source

      # Column names, as symbols, taken from the definition: the elements of
      # an Array definition or the keys of a Hash definition.
      def columns
        names =
          if definition.is_a?(Array)
            definition
          elsif definition.is_a?(Hash)
            definition.keys
          else
            raise "Definition must be either an Array or a Hash"
          end
        names.collect { |name| name.to_sym }
      end

      # The configured model name
      def railsmodel
        configuration[:model]
      end

      # The configured ordering, falling back to "id"
      def order
        configuration[:order] || "id"
      end

      # Load every record of the model in the configured order and yield one
      # ETL::Row per record, populated by sending each column name to it.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        records = model_class.find(:all, :order => order)
        records.each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each { |column| result_row[column.to_sym] = record.send(column) }
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # ETL source. Subclasses must implement the <tt>each</tt> method.
    class Source
      include Enumerable

      # The control object
      attr_accessor :control

      # The configuration Hash
      attr_accessor :configuration

      # The definition Hash
      attr_accessor :definition

      # Returns true if the source data should be stored locally for archival
      # Default behavior will return true.
      attr_accessor :store_locally

      class << self
        # Convert the name to a Source class.
        #
        # For example if name is :database then this will return a
        # DatabaseSource class
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.camelize}Source")
        end
      end

      # Initialize the Source instance
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      #
      # Configuration options:
      # * <tt>:store_locally</tt>: Set to false to not store source data
      #   locally (defaults to true)
      def initialize(control, configuration, definition)
        @control = control
        @configuration = configuration
        @definition = definition

        # Only fall back to true when the option is absent; "value || true"
        # would turn an explicit false back into true.
        configured = configuration[:store_locally]
        @store_locally = configured.nil? ? true : configured
      end

      # Get an array of errors that occur during reading from the source
      def errors
        @errors ||= []
      end

      # Get a timestamp value as a string
      def timestamp
        Engine.timestamp
      end

      # The base directory where local files are stored.
      attr_writer :local_base

      # Get the local base, defaults to 'source_data'
      def local_base
        @local_base ||= 'source_data'
      end

      # The local directory for storing. This method must be overridden by
      # subclasses
      def local_directory
        raise "local_directory method is abstract"
      end

      # Return the local file for storing the raw source data. Each call to
      # this method will result in a timestamped file, so you cannot expect
      # to call it multiple times and reference the same file
      #
      # Optional sequence can be specified if there are multiple source files
      #
      # The local directory is created if it does not exist yet.
      def local_file(sequence=nil)
        filename = timestamp.to_s
        filename += sequence.to_s if sequence

        local_dir = local_directory
        FileUtils.mkdir_p(local_dir)
        File.join(local_dir, "#{filename}.csv")
      end

      # Get the last fully written local file, i.e. the file belonging to
      # the last trigger file.
      #
      # Raises a RuntimeError when the local directory holds no trigger file.
      def last_local_file
        trigger = last_local_file_trigger
        raise "Local cache trigger file not found" if trigger.nil?
        File.join(local_directory, File.basename(trigger, '.trig'))
      end

      # Get the last local file trigger, or nil when there is none.
      #
      # The matches are sorted explicitly because Dir.glob does not promise
      # an order on every Ruby version / platform. This presumes the
      # timestamped file names sort chronologically -- TODO confirm against
      # the format produced by Engine.timestamp.
      def last_local_file_trigger
        Dir.glob(File.join(local_directory, '*.trig')).sort.last
      end

      # Get the local trigger file that is used to indicate that the file has
      # been completely written
      def local_file_trigger(file)
        Pathname.new(file.to_s + '.trig')
      end

      # Return true if the source should read locally.
      def read_locally
        Engine.read_locally
      end

    end
  end
end
108
+
109
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,3 @@
1
+ require 'etl/control/control'
2
+ require 'etl/control/source'
3
+ require 'etl/control/destination'
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself
      #
      # Every <tt>offset_month</tt> parameter is the calendar month (1-12) in
      # which the fiscal year starts; it defaults to 10 (October).
      module Calculations
        # Week of the calendar year (1-52). The trailing day(s) that would
        # form a 53rd week are folded into week 52.
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end

        # Calendar quarter (1-4)
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year (1-52), with the same folding of a 53rd
        # week into week 52 as #week.
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end

        # Month of the fiscal year (1-12), where 1 is <tt>offset_month</tt>
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1-4)
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # The fiscal year: the following calendar year once
        # <tt>offset_month</tt> has been reached, otherwise the current one.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where 1 is the first day of
        # <tt>offset_month</tt>.
        #
        # Dates before the fiscal start of their calendar year belong to the
        # fiscal year which began in the previous calendar year, so the
        # length of that previous year's tail is added instead of a fixed
        # 365 (which is off by one around leap years).
        def fiscal_year_yday(offset_month=10)
          # yday of the first of a month, minus one, is the number of days
          # which precede that month in the given year.
          days_before_start = ::Time.utc(year, offset_month, 1).yday - 1
          shifted_year_day = yday - days_before_start
          return shifted_year_day if shifted_year_day > 0

          previous_year = year - 1
          days_in_previous_year = ::Time.utc(previous_year, 12, 31).yday
          previous_days_before_start = ::Time.utc(previous_year, offset_month, 1).yday - 1
          yday + (days_in_previous_year - previous_days_before_start)
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
# Mix the ETL calendar and fiscal-year calculations into the core Time class.
::Time.send(:include, ETL::CoreExtensions::Time::Calculations)
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'