darrell-activewarehouse-etl 0.9.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
@@ -0,0 +1,220 @@
1
+ require 'fileutils'
2
+
3
+ module ETL #:nodoc:
4
# ActiveRecord base class kept as the connection holder for database
# sources. It declares no behaviour of its own.
class Source < ::ActiveRecord::Base #:nodoc:
end
7
+
8
+ module Control #:nodoc:
9
+ # Source object which extracts data from a database using ActiveRecord.
10
+ class DatabaseSource < Source
11
+ attr_accessor :target
12
+ attr_accessor :table
13
+
14
# Build a database source.
#
# Arguments:
# * <tt>control</tt>: The ETL::Control::Control instance
# * <tt>configuration</tt>: The configuration Hash
# * <tt>definition</tt>: The source definition
#
# Configuration options read directly by this class:
# * <tt>:target</tt>: The target connection name
# * <tt>:table</tt>: The source table name
# * <tt>:join</tt>: Optional join part for the query (ignored unless
#   specified)
# * <tt>:select</tt>: Optional select part for the query (defaults to
#   '*')
# * <tt>:conditions</tt>: Optional SQL fragment added to the WHERE clause
# * <tt>:group</tt>: Optional group by part for the query (ignored
#   unless specified)
# * <tt>:order</tt>: Optional order part for the query (ignored unless
#   specified)
# * <tt>:new_records_only</tt>: The column to compare against the
#   timestamp of the last successful ETL job execution for the current
#   control file.
#
# All three arguments are forwarded unchanged to the superclass
# initializer.
def initialize(control, configuration, definition)
  super
  @target, @table = configuration.values_at(:target, :table)
end
45
+
46
# Identify the source as "host/database/table".
def to_s
  [host, database, table].join('/')
end
50
+
51
# Directory used for local copies of this source: the local base
# directory followed by the database host, the database name and the
# configured table name.
def local_directory
  segments = [local_base, host, database, configuration[:table]]
  File.join(*segments)
end
56
+
57
# The configured <tt>:join</tt> fragment of the query, or nil when none
# was given.
def join
  configuration[:join]
end
61
+
62
# The configured <tt>:select</tt> list of the query; falls back to '*'
# when none was given.
def select
  select_list = configuration[:select]
  select_list ? select_list : '*'
end
66
+
67
# The configured <tt>:group</tt> (GROUP BY) fragment of the query, or
# nil when none was given.
def group
  configuration[:group]
end
71
+
72
# The configured <tt>:order</tt> (ORDER BY) fragment of the query, or
# nil when none was given.
def order
  configuration[:order]
end
76
+
77
# The column used in the WHERE clause to identify new rows (the
# <tt>:new_records_only</tt> option), or nil when the option is unset.
def new_records_only
  configuration[:new_records_only]
end
82
+
83
# Get the number of rows in the source.
#
# * <tt>use_cache</tt>: when true (the default) a previously computed
#   count is returned without recomputing it.
#
# When the source stores or reads its data locally the rows are counted
# from the local file; otherwise a <tt>count(1)</tt> query derived from
# the source query is run against the database. The database result is
# converted to an Integer so both paths return the same type.
def count(use_cache=true)
  return @count if @count && use_cache
  @count = if store_locally || read_locally
    count_locally
  else
    # Only rewrite the leading select list: a greedy match over every
    # occurrence would swallow everything up to the last FROM and
    # corrupt queries that contain a sub-select.
    count_sql = query.sub(/\ASELECT .*? FROM/, 'SELECT count(1) FROM')
    connection.select_value(count_sql).to_i
  end
end
92
+
93
# Get the list of columns to read, taken from the keys of the first
# row returned by the source query. The result is memoized.
def columns
  return @columns if @columns
  rows = query_rows
  # weird default is required for writing to cache correctly
  @columns = rows.any? ? rows.first.keys : ['']
end
99
+
100
# Yield every row of the source as an ETL::Row.
#
# * When read_locally is set, rows come from the last stored local file;
#   read_rows raises if that file or its trigger file is missing.
# * Otherwise, when store_locally is set, the query result is first
#   written to a new local file and the rows are then read back from it.
# * Otherwise rows are yielded straight from the query result, with
#   their keys converted to symbols.
def each(&block)
  if read_locally # Read from the last stored source
    ETL::Engine.logger.debug "Reading from local cache"
    return read_rows(last_local_file, &block)
  end

  if store_locally # Read from the original source via a local copy
    cache_file = local_file
    write_local(cache_file)
    return read_rows(cache_file, &block)
  end

  query_rows.each do |record|
    wrapped = ETL::Row.new(record.symbolize_keys)
    wrapped.source = self
    yield wrapped
  end
end
123
+
124
+ private
125
# Read rows from the local cache file, yielding each one as an ETL::Row
# whose keys are the CSV headers converted to symbols.
#
# Raises a RuntimeError when either the cache file or its trigger file
# (the marker written once the cache file is complete) is missing.
# After reading, ETL::Engine.average_rows_per_second is updated from
# ETL::Engine.rows_read and the elapsed time.
def read_rows(file)
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
  raise "Local cache file not found" unless File.exist?(file)
  raise "Local cache trigger file not found" unless File.exist?(local_file_trigger(file))

  elapsed = Benchmark.realtime do
    # foreach closes the file when iteration ends; open(...).each left
    # the handle open.
    FasterCSV.foreach(file, :headers => true) do |row|
      result_row = ETL::Row.new
      result_row.source = self
      row.each do |header, field|
        result_row[header.to_sym] = field
      end
      yield result_row
    end
  end
  ETL::Engine.average_rows_per_second = ETL::Engine.rows_read / elapsed
end
142
+
143
# Count the data rows in the last fully written local cache file.
#
# The cache files written by this source start with a header row of
# column names, so that first line is not counted. The count is
# line-based: a quoted field containing a newline counts as extra rows.
def count_locally
  line_total = 0
  # File.foreach closes the file once iteration finishes.
  File.foreach(last_local_file) { line_total += 1 }
  line_total > 0 ? line_total - 1 : 0
end
148
+
149
# Write the query result to the given local cache file as CSV: a header
# row of column names followed by one line per result row. An empty
# trigger file is created afterwards to mark the cache file as complete,
# and the elapsed time is logged.
def write_local(file)
  written = 0
  elapsed = Benchmark.realtime do
    FasterCSV.open(file, 'w') do |csv|
      csv << columns
      query_rows.each do |record|
        csv << columns.collect { |name| record[name.to_s] }
        written += 1
      end
    end
    # Touch the trigger file; its presence is all that matters.
    File.open(local_file_trigger(file), 'w') { |_| }
  end
  ETL::Engine.logger.info "Stored locally in #{elapsed}s (avg: #{written/elapsed} lines/sec)"
end
164
+
165
# Build (once) and return the SQL query used to read the source.
#
# The statement is assembled from the configured select list, table,
# join, conditions, group and order fragments. When new_records_only is
# set, a condition restricting that column to values later than the
# creation time of the last completed job for this control file is
# added. ETL::Engine.limit / ETL::Engine.offset are applied through the
# connection adapter. Newlines are replaced by spaces and the final
# query is logged.
def query
  return @query if @query

  sql = "SELECT #{select} FROM #{configuration[:table]}"
  sql << " #{join}" if join

  filters = []
  if new_records_only
    last_completed = ETL::Execution::Job.maximum('created_at',
      :conditions => ['control_file = ? and completed_at is not null', control.file]
    )
    if last_completed
      filters << "#{new_records_only} > #{connection.quote(last_completed.to_s(:db))}"
    end
  end
  filters << configuration[:conditions] if configuration[:conditions]
  sql << " WHERE #{filters.join(' AND ')}" unless filters.empty?

  sql << " GROUP BY #{group}" if group
  sql << " ORDER BY #{order}" if order

  paging = {}
  paging[:limit] = ETL::Engine.limit if ETL::Engine.limit
  paging[:offset] = ETL::Engine.offset if ETL::Engine.offset
  connection.add_limit_offset!(sql, paging) unless paging.empty?

  sql = sql.gsub(/\n/,' ')
  ETL::Engine.logger.info "Query: #{sql}"
  @query = sql
end
200
+
201
# Run the source query through the connection and memoize the rows it
# returns.
def query_rows
  return @query_rows if @query_rows
  @query_rows = connection.select_all(query)
end
204
+
205
# Look up the database connection for this source's target through
# ETL::Engine.
def connection
  ETL::Engine.connection(target)
end
209
+
210
# Host of the target database as listed in ETL::Base.configurations;
# 'localhost' when the target's settings name no host.
def host
  settings = ETL::Base.configurations[target.to_s]
  settings['host'] || 'localhost'
end
214
+
215
# Name of the target database as listed in ETL::Base.configurations.
def database
  settings = ETL::Base.configurations[target.to_s]
  settings['database']
end
218
+ end
219
+ end
220
+ end
@@ -0,0 +1,11 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # Source backed by any object that responds to <tt>each</tt>, supplied
    # through the <tt>:enumerable</tt> configuration option.
    class EnumerableSource < ETL::Control::Source
      # Hand the given block to the configured enumerable's own
      # <tt>each</tt>.
      def each(&block)
        enumerable = configuration[:enumerable]
        enumerable.each(&block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ module ETL #:nodoc:
2
+ module Control #:nodoc:
3
+ # A File source.
4
+ class FileSource < Source
5
+ # The number of lines to skip, default is 0
6
+ attr_accessor :skip_lines
7
+
8
+ # Accessor for the underlying parser
9
+ attr_accessor :parser
10
+
11
+ # The source file
12
+ attr_accessor :file
13
+
14
# Build a file source and set up its parser.
#
# The three arguments are forwarded unchanged to the superclass
# initializer; the source is then configured from the configuration
# Hash.
#
# Configuration options:
# * <tt>:file</tt>: The source file
# * <tt>:parser</tt>: One of the following: a parser name as a String or
#   Symbol, a parser class, or a Hash with a :name key and optionally an
#   :options key. Whether or not the options are used depends on the
#   parser; see the documentation for each parser for the options it
#   accepts.
# * <tt>:skip_lines</tt>: The number of lines to skip (defaults to 0)
# * <tt>:store_locally</tt>: Set to false to not store a copy of the
#   source data locally for archival
def initialize(control, configuration, definition)
  super
  configure
end
30
+
31
# Identify the source by its configured file.
def to_s
  file
end
35
+
36
# Local storage directory: the local base directory followed by the
# source file's name without its directory and extension.
def local_directory
  stem = File.basename(file, File.extname(file))
  File.join(local_base, stem)
end
40
+
41
# Yield each parsed row of the source as an ETL::Row.
#
# The source files are first copied to local storage when store_locally
# is set. When ETL::Engine.offset is set, that many leading rows from
# the parser are skipped before any row is yielded.
def each
  skipped = 0
  copy_sources if store_locally
  @parser.each do |parsed|
    if ETL::Engine.offset && skipped < ETL::Engine.offset
      skipped += 1
      next
    end
    wrapped = ETL::Row[parsed]
    wrapped.source = self
    yield wrapped
  end
end
55
+
56
+ private
57
# Copy every file matching the source file pattern into the local
# directory structure. A relative pattern is resolved against the
# directory of the control file. Directories are skipped; each copied
# file gets its own sequence number and an empty trigger file marking
# the copy as complete.
def copy_sources
  pattern = Pathname.new(file)
  pattern = Pathname.new(File.dirname(control.file)) + pattern unless pattern.absolute?

  sequence = 0
  Pathname.glob(pattern).each do |source_path|
    unless source_path.directory?
      destination = local_file(sequence)
      FileUtils.cp(source_path, destination)
      File.open(local_file_trigger(destination), 'w') { |_| }
      sequence += 1
    end
  end
end
70
+
71
# Configure the source from its configuration Hash.
#
# Sets the source file, builds the parser and sets the number of lines
# to skip. The <tt>:parser</tt> option may be:
# * a Class, which is instantiated with this source;
# * a String or Symbol, resolved to a parser class by name;
# * a Hash with a :name key and an optional :options key, which are
#   used to resolve the parser class and passed to its constructor.
#
# Raises ControlError for any other kind of <tt>:parser</tt> value.
# Note that a missing <tt>:skip_lines</tt> option is written back into
# the configuration Hash as 0.
def configure
  @file = configuration[:file]
  parser_option = configuration[:parser]
  @parser = case parser_option
  when Class
    parser_option.new(self)
  when String, Symbol
    ETL::Parser::Parser.class_for_name(parser_option).new(self)
  when Hash
    name = parser_option[:name]
    options = parser_option[:options]
    ETL::Parser::Parser.class_for_name(name).new(self, options)
  else
    # The message lists every accepted form, including Hash.
    raise ControlError, "Configuration option :parser must be a Class, String, Symbol or Hash"
  end
  @skip_lines = configuration[:skip_lines] ||= 0
end
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,39 @@
1
+ #RAILS_ENV = 'development'
2
+ #require '../config/environment'
3
+
4
module ETL #:nodoc:
  module Control #:nodoc:
    # Source which reads its rows from a model class named by the
    # <tt>:model</tt> configuration option.
    class ModelSource < Source

      # Column names to read, as symbols. They are taken from the source
      # definition: its elements when it is an Array, its keys when it is
      # a Hash. Any other definition raises an error.
      def columns
        names = case definition
                when Array then definition
                when Hash then definition.keys
                else raise "Definition must be either an Array or a Hash"
                end
        names.collect(&:to_sym)
      end

      # The configured <tt>:model</tt> option.
      def railsmodel
        configuration[:model]
      end

      # The configured <tt>:order</tt> option, "id" when none was given.
      def order
        ordering = configuration[:order]
        ordering ? ordering : "id"
      end

      # Find all records of the model in the configured order and yield
      # one ETL::Row per record, holding the value of each column.
      def each(&block)
        model_class = railsmodel.to_s.camelize.constantize
        model_class.find(:all,:order=>order).each do |record|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each { |column| result_row[column.to_sym] = record.send(column) }
          yield result_row
        end
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
module ETL #:nodoc:
  module Control #:nodoc:
    # ETL source. Subclasses must implement the <tt>each</tt> method and,
    # when local storage is used, <tt>local_directory</tt>.
    class Source
      include Enumerable

      # The control object
      attr_accessor :control

      # The configuration Hash
      attr_accessor :configuration

      # The definition Hash
      attr_accessor :definition

      # True if the source data should be stored locally for archival.
      # Defaults to true unless the <tt>:store_locally</tt> configuration
      # option says otherwise.
      attr_accessor :store_locally

      # The base directory where local files are stored. Only the writer
      # is generated here; the reader below supplies the default.
      attr_writer :local_base

      class << self
        # Convert the name to a Source class.
        #
        # For example if name is :database then this will return a
        # DatabaseSource class
        def class_for_name(name)
          ETL::Control.const_get("#{name.to_s.camelize}Source")
        end
      end

      # Initialize the Source instance
      # * <tt>control</tt>: The control object
      # * <tt>configuration</tt>: The configuration hash
      # * <tt>definition</tt>: The source layout definition
      #
      # Configuration options:
      # * <tt>:store_locally</tt>: Set to false to not store source data
      #   locally (defaults to true)
      def initialize(control, configuration, definition)
        @control = control
        @configuration = configuration
        @definition = definition

        # Only fall back to true when the option is absent; `value || true`
        # would turn an explicit false back into true.
        setting = configuration[:store_locally]
        @store_locally = setting.nil? ? true : setting
      end

      # Get an array of errors that occur during reading from the source
      def errors
        @errors ||= []
      end

      # Get a timestamp value from the engine
      def timestamp
        Engine.timestamp
      end

      # Get the local base, defaults to 'source_data'
      def local_base
        @local_base ||= 'source_data'
      end

      # The local directory for storing. This method must be overridden by
      # subclasses
      def local_directory
        raise "local_directory method is abstract"
      end

      # Return the local file for storing the raw source data. The file
      # name is built from the current timestamp, so repeated calls are
      # not guaranteed to reference the same file. The local directory is
      # created if it does not exist.
      #
      # Optional sequence can be specified if there are multiple source
      # files; it is appended to the timestamp.
      def local_file(sequence=nil)
        filename = timestamp.to_s
        filename += sequence.to_s if sequence

        local_dir = local_directory
        FileUtils.mkdir_p(local_dir)
        File.join(local_dir, "#{filename}.csv")
      end

      # Get the last fully written local file, i.e. the file whose trigger
      # file sorts last. Raises when no trigger file exists.
      def last_local_file
        trigger = last_local_file_trigger
        raise "Local cache trigger file not found in #{local_directory}" unless trigger
        File.join(local_directory, File.basename(trigger, '.trig'))
      end

      # Get the last local file trigger, or nil when there is none. The
      # trigger names are compared explicitly because Dir.glob does not
      # guarantee an order on every Ruby version and platform.
      def last_local_file_trigger
        Dir.glob(File.join(local_directory, '*.trig')).max
      end

      # Get the local trigger file that is used to indicate that the file
      # has been completely written
      def local_file_trigger(file)
        Pathname.new(file.to_s + '.trig')
      end

      # Return true if the source should read locally.
      def read_locally
        Engine.read_locally
      end

    end
  end
end
108
+
109
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -0,0 +1,3 @@
1
+ require 'etl/control/control'
2
+ require 'etl/control/source'
3
+ require 'etl/control/destination'
@@ -0,0 +1,42 @@
1
+ #Updated by Jack Hong on 04/05/08
2
+
3
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Calendar and fiscal-year calculations meant to be mixed into
      # Time. The including object must respond to +year+, +month+,
      # +day+ and +yday+.
      #
      # Every fiscal method takes +offset_month+, the calendar month
      # (1-12) in which the fiscal year starts; the default is 10.
      module Calculations
        # Week of the calendar year (1-52), counted in 7-day blocks from
        # January 1st. The short 53rd block is folded into week 52.
        def week
          [((yday - 1) / 7) + 1, 52].min
        end

        # Calendar quarter (1-4).
        def quarter
          ((month - 1) / 3) + 1
        end

        # Week of the fiscal year (1-52), counted in 7-day blocks from
        # the first day of the fiscal year. The short 53rd block is
        # folded into week 52.
        def fiscal_year_week(offset_month=10)
          [((fiscal_year_yday(offset_month) - 1) / 7) + 1, 52].min
        end

        # Month of the fiscal year (1-12); the offset month itself is 1.
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Quarter of the fiscal year (1-4).
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # Fiscal year number: dates on or after the offset month belong
        # to the fiscal year named after the next calendar year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year, where the first day of the offset month
        # is day 1.
        #
        # The value is the real number of days since the fiscal year
        # began, so it stays correct when a February 29th falls inside
        # the fiscal year or inside the calendar months before its start
        # (a fixed 365-day correction is off by one in those cases).
        def fiscal_year_yday(offset_month=10)
          start_year = month >= offset_month ? year : year - 1
          # Both endpoints are UTC midnights, so the difference is a whole
          # number of days unaffected by daylight saving changes.
          elapsed = ::Time.utc(year, month, day) - ::Time.utc(start_year, offset_month, 1)
          (elapsed / 86_400).round + 1
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require File.dirname(__FILE__) + '/time/calculations'
2
+
3
# Mix the ETL calendar and fiscal-year calculations into the core Time
# class.
Time.send(:include, ETL::CoreExtensions::Time::Calculations)
@@ -0,0 +1 @@
1
+ require 'etl/core_ext/time'