activewarehouse-etl 0.9.1 → 0.9.5.rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/0.9-UPGRADE +6 -0
- data/CHANGELOG +182 -150
- data/Gemfile +4 -0
- data/HOW_TO_RELEASE +9 -0
- data/README +18 -2
- data/Rakefile +35 -91
- data/active_support_logger.patch +78 -0
- data/activewarehouse-etl.gemspec +30 -0
- data/lib/etl.rb +10 -2
- data/lib/etl/batch/directives.rb +11 -1
- data/lib/etl/control/control.rb +2 -2
- data/lib/etl/control/destination.rb +27 -7
- data/lib/etl/control/destination/database_destination.rb +8 -6
- data/lib/etl/control/destination/excel_destination.rb +91 -0
- data/lib/etl/control/destination/file_destination.rb +6 -4
- data/lib/etl/control/destination/insert_update_database_destination.rb +133 -0
- data/lib/etl/control/destination/update_database_destination.rb +109 -0
- data/lib/etl/control/source.rb +3 -2
- data/lib/etl/control/source/database_source.rb +14 -10
- data/lib/etl/control/source/file_source.rb +2 -2
- data/lib/etl/engine.rb +17 -15
- data/lib/etl/execution.rb +0 -1
- data/lib/etl/execution/batch.rb +3 -1
- data/lib/etl/execution/migration.rb +5 -0
- data/lib/etl/parser/delimited_parser.rb +20 -1
- data/lib/etl/parser/excel_parser.rb +112 -0
- data/lib/etl/processor/bulk_import_processor.rb +4 -2
- data/lib/etl/processor/database_join_processor.rb +68 -0
- data/lib/etl/processor/escape_csv_processor.rb +77 -0
- data/lib/etl/processor/filter_row_processor.rb +51 -0
- data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
- data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
- data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
- data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
- data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
- data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
- data/lib/etl/processor/zip_file_processor.rb +27 -0
- data/lib/etl/transform/calculation_transform.rb +71 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +25 -7
- data/lib/etl/transform/ordinalize_transform.rb +3 -1
- data/lib/etl/transform/split_fields_transform.rb +27 -0
- data/lib/etl/version.rb +1 -7
- data/test-matrix.yml +10 -0
- data/test/.gitignore +1 -0
- data/test/.ignore +2 -0
- data/test/all.ebf +6 -0
- data/test/apache_combined_log.ctl +11 -0
- data/test/batch_test.rb +41 -0
- data/test/batch_with_error.ebf +6 -0
- data/test/batched1.ctl +0 -0
- data/test/batched2.ctl +0 -0
- data/test/block_processor.ctl +6 -0
- data/test/block_processor_error.ctl +1 -0
- data/test/block_processor_pre_post_process.ctl +4 -0
- data/test/block_processor_remove_rows.ctl +5 -0
- data/test/block_processor_test.rb +38 -0
- data/test/config/Gemfile.rails-2.3.x +3 -0
- data/test/config/Gemfile.rails-2.3.x.lock +38 -0
- data/test/config/Gemfile.rails-3.0.x +3 -0
- data/test/config/Gemfile.rails-3.0.x.lock +49 -0
- data/test/config/common.rb +21 -0
- data/test/connection/mysql/connection.rb +9 -0
- data/test/connection/mysql/schema.sql +36 -0
- data/test/connection/postgresql/connection.rb +13 -0
- data/test/connection/postgresql/schema.sql +39 -0
- data/test/control_test.rb +43 -0
- data/test/data/apache_combined_log.txt +3 -0
- data/test/data/bulk_import.txt +3 -0
- data/test/data/bulk_import_with_empties.txt +3 -0
- data/test/data/decode.txt +3 -0
- data/test/data/delimited.txt +3 -0
- data/test/data/encode_source_latin1.txt +2 -0
- data/test/data/excel.xls +0 -0
- data/test/data/excel2.xls +0 -0
- data/test/data/fixed_width.txt +3 -0
- data/test/data/multiple_delimited_1.txt +3 -0
- data/test/data/multiple_delimited_2.txt +3 -0
- data/test/data/people.txt +3 -0
- data/test/data/sax.xml +14 -0
- data/test/data/xml.xml +16 -0
- data/test/date_dimension_builder_test.rb +96 -0
- data/test/delimited.ctl +30 -0
- data/test/delimited_absolute.ctl +33 -0
- data/test/delimited_destination_db.ctl +25 -0
- data/test/delimited_excel.ctl +31 -0
- data/test/delimited_insert_update.ctl +34 -0
- data/test/delimited_update.ctl +34 -0
- data/test/delimited_with_bulk_load.ctl +34 -0
- data/test/destination_test.rb +275 -0
- data/test/directive_test.rb +23 -0
- data/test/encode_processor_test.rb +32 -0
- data/test/engine_test.rb +32 -0
- data/test/errors.ctl +24 -0
- data/test/etl_test.rb +42 -0
- data/test/excel.ctl +24 -0
- data/test/excel2.ctl +25 -0
- data/test/fixed_width.ctl +35 -0
- data/test/generator_test.rb +14 -0
- data/test/inline_parser.ctl +17 -0
- data/test/mocks/mock_destination.rb +26 -0
- data/test/mocks/mock_source.rb +25 -0
- data/test/model_source.ctl +14 -0
- data/test/multiple_delimited.ctl +22 -0
- data/test/multiple_source_delimited.ctl +39 -0
- data/test/parser_test.rb +224 -0
- data/test/performance/delimited.ctl +30 -0
- data/test/processor_test.rb +44 -0
- data/test/row_processor_test.rb +17 -0
- data/test/sax.ctl +26 -0
- data/test/scd/1.txt +1 -0
- data/test/scd/2.txt +1 -0
- data/test/scd/3.txt +1 -0
- data/test/scd_test.rb +257 -0
- data/test/scd_test_type_1.ctl +43 -0
- data/test/scd_test_type_2.ctl +34 -0
- data/test/screen_test.rb +9 -0
- data/test/screen_test_error.ctl +3 -0
- data/test/screen_test_fatal.ctl +3 -0
- data/test/source_test.rb +139 -0
- data/test/test_helper.rb +34 -0
- data/test/transform_test.rb +101 -0
- data/test/vendor/adapter_extensions-0.5.0/CHANGELOG +26 -0
- data/test/vendor/adapter_extensions-0.5.0/LICENSE +16 -0
- data/test/vendor/adapter_extensions-0.5.0/README +7 -0
- data/test/vendor/adapter_extensions-0.5.0/Rakefile +158 -0
- data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions.rb +12 -0
- data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/abstract_adapter.rb +44 -0
- data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/mysql_adapter.rb +63 -0
- data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/postgresql_adapter.rb +52 -0
- data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/sqlserver_adapter.rb +44 -0
- data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/version.rb +10 -0
- data/test/xml.ctl +31 -0
- metadata +229 -70
- data/lib/etl/execution/record.rb +0 -18
@@ -0,0 +1,109 @@
|
|
1
|
+
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which writes directly to a database. This is useful when you are dealing with
    # a small amount of data. For larger amounts of data you should probably use the bulk
    # loader if it is supported with your target database as it will use a much faster load
    # method.
    class UpdateDatabaseDestination < Destination
      # The target connection
      attr_reader :target

      # The table
      attr_reader :table

      # Specify the order from the source
      attr_reader :order

      # Specify the conditions from the source
      attr_reader :conditions

      # Initialize the database destination
      #
      # * <tt>control</tt>: The ETL::Control::Control instance
      # * <tt>configuration</tt>: The configuration Hash
      # * <tt>mapping</tt>: The mapping
      #
      # Configuration options:
      # * <tt>:database</tt>: The database name (REQUIRED)
      # * <tt>:target</tt>: The target connection (REQUIRED)
      # * <tt>:table</tt>: The table to write to (REQUIRED)
      # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
      # * <tt>:append_rows</tt>: Array of rows to append
      #
      # Mapping options:
      # * <tt>:order</tt>: The order of fields to write (REQUIRED)
      # * <tt>:conditions</tt>: The conditions on the fields to update (REQUIRED)
      #
      # Raises ControlError when :conditions, :order, :table or :target is missing.
      def initialize(control, configuration, mapping={})
        super
        @target = configuration[:target]
        @table = configuration[:table]
        @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique]
        @unique.uniq! unless @unique.nil?
        @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
        @order.uniq! unless @order.nil?
        @conditions = mapping[:conditions] ? mapping[:conditions] + scd_required_fields : nil
        @conditions.uniq! unless @conditions.nil?
        raise ControlError, "Conditions required in mapping" unless @conditions
        raise ControlError, "Order required in mapping" unless @order
        raise ControlError, "Table required" unless @table
        raise ControlError, "Target required" unless @target
      end

      # Flush the currently buffered data: execute one UPDATE per buffered
      # row, all inside a single transaction, then clear the buffer.
      def flush
        conn.transaction do
          buffer.flatten.each do |row|
            # check to see if this row's compound key constraint already exists
            # note that the compound key constraint may not utilize virtual fields
            next unless row_allowed?(row)

            # add any virtual fields
            add_virtuals!(row)

            conditionsfilter = []
            conditions.each do |cond|
              c = " #{cond[:field]} #{cond[:comp]} #{cond[:value]} "
              condition = c
              begin
                # SECURITY NOTE(review): eval re-interpolates #{...}
                # expressions embedded in the condition strings from the
                # control file; any row data they reference is spliced into
                # the SQL unquoted. Do not feed untrusted data through
                # these conditions without sanitizing them.
                condition = eval('"' + c + '"')
              rescue StandardError
                # best effort: fall back to the literal condition string
              end
              conditionsfilter << condition
            end

            updatevalues = []
            order.each do |name|
              updatevalues << "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}"
            end
            q = "UPDATE #{conn.quote_table_name(table_name)} SET #{updatevalues.join(',')} WHERE #{conditionsfilter.join(' AND ')}"
            ETL::Engine.logger.debug("Executing update: #{q}")
            conn.update(q, "Update row #{current_row}")
            @current_row += 1
          end
          buffer.clear
        end
      end

      # Close the destination: append any configured extra rows, then flush.
      def close
        buffer << append_rows if append_rows
        flush
      end

      private
      # Lazily acquire and memoize the target connection.
      def conn
        @conn ||= begin
          ETL::Engine.connection(target)
        rescue => e
          # Preserve the underlying failure; the original raised a bare,
          # generic message that hid the real cause.
          raise RuntimeError, "Problem connecting to target database: #{e.message}"
        end
      end

      # Resolve the physical table name (the engine may remap it to a
      # temp table when temp tables are in use).
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end

    end
  end
end
|
data/lib/etl/control/source.rb
CHANGED
@@ -40,7 +40,8 @@ module ETL #:nodoc:
|
|
40
40
|
@configuration = configuration
|
41
41
|
@definition = definition
|
42
42
|
|
43
|
-
@store_locally =
|
43
|
+
@store_locally = true
|
44
|
+
@store_locally = configuration[:store_locally] unless configuration[:store_locally].nil?
|
44
45
|
end
|
45
46
|
|
46
47
|
# Get an array of errors that occur during reading from the source
|
@@ -106,4 +107,4 @@ module ETL #:nodoc:
|
|
106
107
|
end
|
107
108
|
end
|
108
109
|
|
109
|
-
Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
|
110
|
+
Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
|
@@ -41,17 +41,18 @@ module ETL #:nodoc:
|
|
41
41
|
super
|
42
42
|
@target = configuration[:target]
|
43
43
|
@table = configuration[:table]
|
44
|
+
@query = configuration[:query]
|
44
45
|
end
|
45
46
|
|
46
47
|
# Get a String identifier for the source
|
47
48
|
def to_s
|
48
|
-
"#{host}/#{database}/#{table}"
|
49
|
+
"#{host}/#{database}/#{@table}"
|
49
50
|
end
|
50
51
|
|
51
52
|
# Get the local directory to use, which is a combination of the
|
52
53
|
# local_base, the db hostname the db database name and the db table.
|
53
54
|
def local_directory
|
54
|
-
File.join(local_base,
|
55
|
+
File.join(local_base, to_s)
|
55
56
|
end
|
56
57
|
|
57
58
|
# Get the join part of the query, defaults to nil
|
@@ -83,7 +84,7 @@ module ETL #:nodoc:
|
|
83
84
|
# Get the number of rows in the source
|
84
85
|
def count(use_cache=true)
|
85
86
|
return @count if @count && use_cache
|
86
|
-
if store_locally || read_locally
|
87
|
+
if @store_locally || read_locally
|
87
88
|
@count = count_locally
|
88
89
|
else
|
89
90
|
@count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
|
@@ -107,13 +108,16 @@ module ETL #:nodoc:
|
|
107
108
|
ETL::Engine.logger.debug "Reading from local cache"
|
108
109
|
read_rows(last_local_file, &block)
|
109
110
|
else # Read from the original source
|
110
|
-
if store_locally
|
111
|
+
if @store_locally
|
111
112
|
file = local_file
|
112
113
|
write_local(file)
|
113
114
|
read_rows(file, &block)
|
114
115
|
else
|
115
|
-
query_rows.each do |
|
116
|
-
row = ETL::Row.new(
|
116
|
+
query_rows.each do |r|
|
117
|
+
row = ETL::Row.new()
|
118
|
+
r.symbolize_keys.each_pair { |key, value|
|
119
|
+
row[key] = value
|
120
|
+
}
|
117
121
|
row.source = self
|
118
122
|
yield row
|
119
123
|
end
|
@@ -128,7 +132,7 @@ module ETL #:nodoc:
|
|
128
132
|
raise "Local cache trigger file not found" unless File.exists?(local_file_trigger(file))
|
129
133
|
|
130
134
|
t = Benchmark.realtime do
|
131
|
-
|
135
|
+
CSV.open(file, :headers => true).each do |row|
|
132
136
|
result_row = ETL::Row.new
|
133
137
|
result_row.source = self
|
134
138
|
row.each do |header, field|
|
@@ -150,7 +154,7 @@ module ETL #:nodoc:
|
|
150
154
|
def write_local(file)
|
151
155
|
lines = 0
|
152
156
|
t = Benchmark.realtime do
|
153
|
-
|
157
|
+
CSV.open(file, 'w') do |f|
|
154
158
|
f << columns
|
155
159
|
query_rows.each do |row|
|
156
160
|
f << columns.collect { |column| row[column.to_s] }
|
@@ -165,7 +169,7 @@ module ETL #:nodoc:
|
|
165
169
|
# Get the query to use
|
166
170
|
def query
|
167
171
|
return @query if @query
|
168
|
-
q = "SELECT #{select} FROM #{
|
172
|
+
q = "SELECT #{select} FROM #{@table}"
|
169
173
|
q << " #{join}" if join
|
170
174
|
|
171
175
|
conditions = []
|
@@ -217,4 +221,4 @@ module ETL #:nodoc:
|
|
217
221
|
end
|
218
222
|
end
|
219
223
|
end
|
220
|
-
end
|
224
|
+
end
|
@@ -41,7 +41,7 @@ module ETL #:nodoc:
|
|
41
41
|
# Returns each row from the source
|
42
42
|
def each
|
43
43
|
count = 0
|
44
|
-
copy_sources if store_locally
|
44
|
+
copy_sources if @store_locally
|
45
45
|
@parser.each do |row|
|
46
46
|
if ETL::Engine.offset && count < ETL::Engine.offset
|
47
47
|
count += 1
|
@@ -87,4 +87,4 @@ module ETL #:nodoc:
|
|
87
87
|
end
|
88
88
|
end
|
89
89
|
end
|
90
|
-
end
|
90
|
+
end
|
data/lib/etl/engine.rb
CHANGED
@@ -32,7 +32,7 @@ module ETL #:nodoc:
|
|
32
32
|
options[:config] = 'config/database.yml' unless File.exist?(options[:config])
|
33
33
|
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
34
34
|
ActiveRecord::Base.configurations.merge!(database_configuration)
|
35
|
-
ETL::Base.configurations = database_configuration
|
35
|
+
ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
|
36
36
|
#puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"
|
37
37
|
|
38
38
|
require 'etl/execution'
|
@@ -173,17 +173,19 @@ module ETL #:nodoc:
|
|
173
173
|
# Modify the table name if necessary
|
174
174
|
def table(table_name, connection)
|
175
175
|
if use_temp_tables?
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
176
|
+
temp_table_name = "tmp_#{table_name}"
|
177
|
+
|
178
|
+
if temp_tables[temp_table_name].nil?
|
179
|
+
# Create the temp table and add it to the mapping
|
180
|
+
begin connection.drop_table(temp_table_name); rescue; end
|
181
|
+
connection.copy_table(table_name, temp_table_name)
|
182
|
+
temp_tables[temp_table_name] = {
|
183
|
+
:table => table_name,
|
184
|
+
:connection => connection
|
185
|
+
}
|
186
186
|
end
|
187
|
+
|
188
|
+
temp_table_name
|
187
189
|
else
|
188
190
|
table_name
|
189
191
|
end
|
@@ -308,7 +310,7 @@ module ETL #:nodoc:
|
|
308
310
|
|
309
311
|
sources.each do |source|
|
310
312
|
Engine.current_source = source
|
311
|
-
Engine.logger.debug "Processing source #{source}"
|
313
|
+
Engine.logger.debug "Processing source #{source.inspect}"
|
312
314
|
say "Source: #{source}"
|
313
315
|
say "Limiting enabled: #{Engine.limit}" if Engine.limit != nil
|
314
316
|
say "Offset enabled: #{Engine.offset}" if Engine.offset != nil
|
@@ -470,8 +472,8 @@ module ETL #:nodoc:
|
|
470
472
|
say "Avg transforms: #{Engine.rows_read/benchmarks[:transforms]} rows/sec" if benchmarks[:transforms] > 0
|
471
473
|
say "Avg writes: #{Engine.rows_read/benchmarks[:writes]} rows/sec" if benchmarks[:writes] > 0
|
472
474
|
|
473
|
-
say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
|
474
|
-
|
475
|
+
# say "Avg time writing execution records: #{ETL::Execution::Record.average_time_spent}"
|
476
|
+
#
|
475
477
|
# ETL::Transform::Transform.benchmarks.each do |klass, t|
|
476
478
|
# say "Avg #{klass}: #{Engine.rows_read/t} rows/sec"
|
477
479
|
# end
|
@@ -553,4 +555,4 @@ module ETL #:nodoc:
|
|
553
555
|
end
|
554
556
|
end
|
555
557
|
end
|
556
|
-
end
|
558
|
+
end
|
data/lib/etl/execution.rb
CHANGED
data/lib/etl/execution/batch.rb
CHANGED
@@ -74,6 +74,11 @@ module ETL #:nodoc:
|
|
74
74
|
def migration_4
|
75
75
|
connection.drop_table :records
|
76
76
|
end
|
77
|
+
|
78
|
+
def migration_5
|
79
|
+
connection.add_column :batches, :batch_id, :integer
|
80
|
+
connection.add_index :batches, :batch_id
|
81
|
+
end
|
77
82
|
|
78
83
|
# Update the schema info table, setting the version value
|
79
84
|
def update_schema_info(version)
|
@@ -10,13 +10,32 @@ module ETL #:nodoc:
|
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
|
+
# Derive Field objects from the header row of +file+ (a CSV file).
# Duplicate column names are disambiguated by appending "_<occurrence>"
# (e.g. two "amount" columns become :amount_1 and :amount_2); unique
# names are kept as-is.
#
# Returns an Array of Field whose names are symbols.
def get_fields_names(file)
  File.open(file) do |input|
    fields = CSV.parse(input.readline).first
    # Count total occurrences in one pass; the original re-scanned the
    # whole header per column (O(n^2)).
    totals = Hash.new(0)
    fields.each { |field| totals[field] += 1 }
    seen = Hash.new(0)
    fields.collect do |field|
      seen[field] += 1
      new_field = field + (totals[field] > 1 ? "_#{seen[field]}" : "")
      Field.new(new_field.to_sym)
    end
  end
end
|
27
|
+
|
13
28
|
# Returns each row.
|
14
29
|
def each
|
15
30
|
Dir.glob(file).each do |file|
|
16
31
|
ETL::Engine.logger.debug "parsing #{file}"
|
32
|
+
if fields.length == 0
|
33
|
+
ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
|
34
|
+
@fields = get_fields_names(file)
|
35
|
+
end
|
17
36
|
line = 0
|
18
37
|
lines_skipped = 0
|
19
|
-
|
38
|
+
CSV.foreach(file, options) do |raw_row|
|
20
39
|
if lines_skipped < source.skip_lines
|
21
40
|
ETL::Engine.logger.debug "skipping line"
|
22
41
|
lines_skipped += 1
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'spreadsheet'

module ETL
  module Parser
    # Parser for Excel (.xls) workbooks, built on the +spreadsheet+ gem.
    # Reads rows from one or more worksheets and yields them as hashes
    # keyed by the defined field names.
    class ExcelParser < ETL::Parser::Parser

      # When truthy, blank rows are skipped instead of validated/yielded.
      attr_accessor :ignore_blank_line

      # Initialize the parser
      # * <tt>source</tt>: The Source object
      # * <tt>options</tt>: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Returns each row
      def each
        Dir.glob(file).each do |file|
          ETL::Engine.logger.debug "parsing #{file}"
          line = 0
          lines_skipped = 0
          book = Spreadsheet.open file
          loopworksheets = []

          # No explicit worksheet indexes means "parse every worksheet".
          if worksheets.empty?
            loopworksheets = book.worksheets
          else
            worksheets.each do |index|
              loopworksheets << book.worksheet(index)
            end
          end

          loopworksheets.each do |sheet|
            sheet.each do |raw_row|
              if lines_skipped < source.skip_lines
                ETL::Engine.logger.debug "skipping line"
                lines_skipped += 1
                next
              end
              line += 1
              row = {}
              if self.ignore_blank_line and raw_row.empty?
                lines_skipped += 1
                next
              end
              validate_row(raw_row, line, file)
              raw_row.each_with_index do |value, index|
                f = fields[index]
                row[f.name] = value
              end
              yield row
            end
          end
        end
      end

      # Get an array of defined worksheets
      def worksheets
        @worksheets ||= []
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Ensure the row has exactly as many columns as the field definition.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        if row.length != fields.length
          raise_with_info( MismatchError,
            "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
            line, file
          )
        end
      end

      # Read worksheet indexes, blank-line handling and field definitions
      # from the source definition.
      def configure
        source.definition[:worksheets].each do |worksheet|
          # Integer() raises ArgumentError/TypeError on non-integer input,
          # so the original `if Integer(worksheet) ... else raise` made the
          # else branch unreachable and leaked the wrong exception type.
          # Convert explicitly and translate failures into DefinitionError.
          begin
            worksheets << Integer(worksheet)
          rescue ArgumentError, TypeError
            raise DefinitionError, "Each worksheet definition must be an integer"
          end
        end unless source.definition[:worksheets].nil?

        self.ignore_blank_line = source.definition[:ignore_blank_line]

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options)
          when Hash
            fields << Field.new(options[:name])
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      class Field #:nodoc:
        attr_reader :name
        def initialize(name)
          @name = name
        end
      end

    end
  end
end
|