darrell-activewarehouse-etl 0.9.1.4 → 0.9.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README +2 -1
- data/Rakefile +3 -3
- data/lib/etl/control/destination.rb +22 -12
- data/lib/etl/control/source.rb +3 -2
- data/lib/etl/control/source/database_source.rb +12 -8
- data/lib/etl/control/source/file_source.rb +2 -2
- data/lib/etl/engine.rb +2 -1
- data/lib/etl/parser/delimited_parser.rb +19 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- metadata +4 -3
data/README
CHANGED
@@ -86,6 +86,7 @@ The tests require:
|
|
86
86
|
- gem install shoulda
|
87
87
|
- gem install flexmock
|
88
88
|
- gem install pg (if you want to run the tests on pg)
|
89
|
+
- gem install spreadsheet
|
89
90
|
|
90
91
|
The tests subfolder contains example database.yml files for mysql and postgres.
|
91
92
|
|
@@ -96,4 +97,4 @@ To run the tests:
|
|
96
97
|
== Feedback
|
97
98
|
This is a work in progress. Comments should be made on the
|
98
99
|
activewarehouse-discuss mailing list at the moment. Contributions are always
|
99
|
-
welcome.
|
100
|
+
welcome.
|
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
|
|
9
9
|
|
10
10
|
module AWETL
|
11
11
|
PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
|
12
|
-
PKG_NAME = 'activewarehouse-etl'
|
12
|
+
PKG_NAME = 'darrell-activewarehouse-etl'
|
13
13
|
PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
|
14
14
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
15
15
|
PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
|
@@ -87,7 +87,7 @@ module AWETL
|
|
87
87
|
|
88
88
|
def self.spec(package_prefix = '')
|
89
89
|
Gem::Specification.new do |s|
|
90
|
-
s.name = 'activewarehouse-etl'
|
90
|
+
s.name = 'darrell-activewarehouse-etl'
|
91
91
|
s.version = AWETL::PKG_VERSION
|
92
92
|
s.summary = "Pure Ruby ETL package."
|
93
93
|
s.description = <<-EOF
|
@@ -172,4 +172,4 @@ task :reinstall => [:package] do
|
|
172
172
|
gem = windows ? 'gem.bat' : 'gem'
|
173
173
|
`#{sudo} #{gem} uninstall #{AWETL::PKG_NAME} -x`
|
174
174
|
`#{sudo} #{gem} install pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}`
|
175
|
-
end
|
175
|
+
end
|
@@ -209,7 +209,12 @@ module ETL #:nodoc:
|
|
209
209
|
return
|
210
210
|
end
|
211
211
|
|
212
|
-
@timestamp =
|
212
|
+
@timestamp = case configuration[:scd][:timestamp]
|
213
|
+
when Time, Date then configuration[:scd][:timestamp]
|
214
|
+
when Symbol then row[configuration[:scd][:timestamp]]
|
215
|
+
when nil then Time.now
|
216
|
+
else raise "Unknown timestamp: #{configuration[:scd][:timestamp].inspect}. Use Time or Date for a specific time, a symbol for a value from each row, or nil for the current time"
|
217
|
+
end
|
213
218
|
|
214
219
|
# See if the scd_fields of the current record have changed
|
215
220
|
# from the last time this record was loaded into the data
|
@@ -318,6 +323,12 @@ module ETL #:nodoc:
|
|
318
323
|
ETL::Engine.logger.debug "expiring original record"
|
319
324
|
@existing_row[scd_end_date_field] = @timestamp
|
320
325
|
@existing_row[scd_latest_version_field] = false
|
326
|
+
|
327
|
+
if configuration[:scd][:merge_nils]
|
328
|
+
scd_fields(row).each do |f|
|
329
|
+
row[f] ||= @existing_row[f]
|
330
|
+
end
|
331
|
+
end
|
321
332
|
|
322
333
|
buffer << @existing_row
|
323
334
|
|
@@ -381,17 +392,16 @@ module ETL #:nodoc:
|
|
381
392
|
# Check whether scd fields have changed since the last
|
382
393
|
# load of this record.
|
383
394
|
def has_scd_field_changes?(row)
|
384
|
-
scd_fields(row)
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
x
|
395
|
+
fields = scd_fields(row)
|
396
|
+
ETL::Engine.logger.debug " Row: %s" % row.slice(*fields).inspect
|
397
|
+
ETL::Engine.logger.debug "Existing Row: %s" % @existing_row.slice(*fields).inspect
|
398
|
+
|
399
|
+
fields.any? { |csd_field|
|
400
|
+
mismatch = configuration[:scd][:merge_nils] ? !row[csd_field].nil? : true
|
401
|
+
mismatch = mismatch && (row[csd_field].to_s != @existing_row[csd_field].to_s)
|
402
|
+
|
403
|
+
ETL::Engine.logger.debug "#{csd_field}: " + (mismatch ? row[csd_field].to_s + " != " + @existing_row[csd_field].to_s : @existing_row[csd_field].to_s)
|
404
|
+
mismatch
|
395
405
|
}
|
396
406
|
end
|
397
407
|
|
data/lib/etl/control/source.rb
CHANGED
@@ -40,7 +40,8 @@ module ETL #:nodoc:
|
|
40
40
|
@configuration = configuration
|
41
41
|
@definition = definition
|
42
42
|
|
43
|
-
@store_locally =
|
43
|
+
@store_locally = true
|
44
|
+
@store_locally = configuration[:store_locally] unless configuration[:store_locally].nil?
|
44
45
|
end
|
45
46
|
|
46
47
|
# Get an array of errors that occur during reading from the source
|
@@ -106,4 +107,4 @@ module ETL #:nodoc:
|
|
106
107
|
end
|
107
108
|
end
|
108
109
|
|
109
|
-
Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
|
110
|
+
Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
|
@@ -41,17 +41,18 @@ module ETL #:nodoc:
|
|
41
41
|
super
|
42
42
|
@target = configuration[:target]
|
43
43
|
@table = configuration[:table]
|
44
|
+
@query = configuration[:query]
|
44
45
|
end
|
45
46
|
|
46
47
|
# Get a String identifier for the source
|
47
48
|
def to_s
|
48
|
-
"#{host}/#{database}/#{table}"
|
49
|
+
"#{host}/#{database}/#{@table}"
|
49
50
|
end
|
50
51
|
|
51
52
|
# Get the local directory to use, which is a combination of the
|
52
53
|
# local_base, the db hostname, the db database name and the db table.
|
53
54
|
def local_directory
|
54
|
-
File.join(local_base,
|
55
|
+
File.join(local_base, to_s)
|
55
56
|
end
|
56
57
|
|
57
58
|
# Get the join part of the query, defaults to nil
|
@@ -83,7 +84,7 @@ module ETL #:nodoc:
|
|
83
84
|
# Get the number of rows in the source
|
84
85
|
def count(use_cache=true)
|
85
86
|
return @count if @count && use_cache
|
86
|
-
if store_locally || read_locally
|
87
|
+
if @store_locally || read_locally
|
87
88
|
@count = count_locally
|
88
89
|
else
|
89
90
|
@count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
|
@@ -107,13 +108,16 @@ module ETL #:nodoc:
|
|
107
108
|
ETL::Engine.logger.debug "Reading from local cache"
|
108
109
|
read_rows(last_local_file, &block)
|
109
110
|
else # Read from the original source
|
110
|
-
if store_locally
|
111
|
+
if @store_locally
|
111
112
|
file = local_file
|
112
113
|
write_local(file)
|
113
114
|
read_rows(file, &block)
|
114
115
|
else
|
115
|
-
query_rows.each do |
|
116
|
-
row = ETL::Row.new(
|
116
|
+
query_rows.each do |r|
|
117
|
+
row = ETL::Row.new()
|
118
|
+
r.symbolize_keys.each_pair { |key, value|
|
119
|
+
row[key] = value
|
120
|
+
}
|
117
121
|
row.source = self
|
118
122
|
yield row
|
119
123
|
end
|
@@ -165,7 +169,7 @@ module ETL #:nodoc:
|
|
165
169
|
# Get the query to use
|
166
170
|
def query
|
167
171
|
return @query if @query
|
168
|
-
q = "SELECT #{select} FROM #{
|
172
|
+
q = "SELECT #{select} FROM #{@table}"
|
169
173
|
q << " #{join}" if join
|
170
174
|
|
171
175
|
conditions = []
|
@@ -217,4 +221,4 @@ module ETL #:nodoc:
|
|
217
221
|
end
|
218
222
|
end
|
219
223
|
end
|
220
|
-
end
|
224
|
+
end
|
@@ -41,7 +41,7 @@ module ETL #:nodoc:
|
|
41
41
|
# Returns each row from the source
|
42
42
|
def each
|
43
43
|
count = 0
|
44
|
-
copy_sources if store_locally
|
44
|
+
copy_sources if @store_locally
|
45
45
|
@parser.each do |row|
|
46
46
|
if ETL::Engine.offset && count < ETL::Engine.offset
|
47
47
|
count += 1
|
@@ -87,4 +87,4 @@ module ETL #:nodoc:
|
|
87
87
|
end
|
88
88
|
end
|
89
89
|
end
|
90
|
-
end
|
90
|
+
end
|
data/lib/etl/engine.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'active_support/all'
|
1
2
|
module ETL #:nodoc:
|
2
3
|
|
3
4
|
class Base < ActiveRecord::Base
|
@@ -32,7 +33,7 @@ module ETL #:nodoc:
|
|
32
33
|
options[:config] = 'config/database.yml' unless File.exist?(options[:config])
|
33
34
|
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
34
35
|
ActiveRecord::Base.configurations.merge!(database_configuration)
|
35
|
-
ETL::Base.configurations = database_configuration
|
36
|
+
ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
|
36
37
|
#puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"
|
37
38
|
|
38
39
|
require 'etl/execution'
|
@@ -10,10 +10,29 @@ module ETL #:nodoc:
|
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
|
+
def get_fields_names(file)
|
14
|
+
File.open(file) do |input|
|
15
|
+
fields = FasterCSV.parse(input.readline).first
|
16
|
+
new_fields = []
|
17
|
+
fields.each_with_index do |field,index|
|
18
|
+
# compute which occurrence of this field name the current column is, counting from 1 (usually 1)
|
19
|
+
occurrence_index = fields[0..index].find_all { |e| e == field }.size
|
20
|
+
number_of_occurrences = fields.find_all { |e| e == field }.size
|
21
|
+
new_field = field + (number_of_occurrences > 1 ? "_#{occurrence_index}" : "")
|
22
|
+
new_fields << Field.new(new_field.to_sym)
|
23
|
+
end
|
24
|
+
return new_fields
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
13
28
|
# Returns each row.
|
14
29
|
def each
|
15
30
|
Dir.glob(file).each do |file|
|
16
31
|
ETL::Engine.logger.debug "parsing #{file}"
|
32
|
+
if fields.length == 0
|
33
|
+
ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
|
34
|
+
@fields = get_fields_names(file)
|
35
|
+
end
|
17
36
|
line = 0
|
18
37
|
lines_skipped = 0
|
19
38
|
FasterCSV.foreach(file, options) do |raw_row|
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'spreadsheet'
|
2
|
+
|
3
|
+
module ETL
|
4
|
+
module Parser
|
5
|
+
class ExcelParser < ETL::Parser::Parser
|
6
|
+
|
7
|
+
attr_accessor :ignore_blank_line
|
8
|
+
|
9
|
+
# Initialize the parser
|
10
|
+
# * <tt>source</tt>: The Source object
|
11
|
+
# * <tt>options</tt>: Parser options Hash
|
12
|
+
def initialize(source, options={})
|
13
|
+
super
|
14
|
+
configure
|
15
|
+
end
|
16
|
+
|
17
|
+
# Returns each row
|
18
|
+
def each
|
19
|
+
Dir.glob(file).each do |file|
|
20
|
+
ETL::Engine.logger.debug "parsing #{file}"
|
21
|
+
line = 0
|
22
|
+
lines_skipped = 0
|
23
|
+
book = Spreadsheet.open file
|
24
|
+
loopworksheets = []
|
25
|
+
|
26
|
+
if worksheets.empty?
|
27
|
+
loopworksheets = book.worksheets
|
28
|
+
else
|
29
|
+
worksheets.each do |index|
|
30
|
+
loopworksheets << book.worksheet(index)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
loopworksheets.each do |sheet|
|
35
|
+
sheet.each do |raw_row|
|
36
|
+
if lines_skipped < source.skip_lines
|
37
|
+
ETL::Engine.logger.debug "skipping line"
|
38
|
+
lines_skipped += 1
|
39
|
+
next
|
40
|
+
end
|
41
|
+
line += 1
|
42
|
+
row = {}
|
43
|
+
if self.ignore_blank_line and raw_row.empty?
|
44
|
+
lines_skipped += 1
|
45
|
+
next
|
46
|
+
end
|
47
|
+
validate_row(raw_row, line, file)
|
48
|
+
raw_row.each_with_index do |value, index|
|
49
|
+
f = fields[index]
|
50
|
+
row[f.name] = value
|
51
|
+
end
|
52
|
+
yield row
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# Get an array of defined worksheets
|
59
|
+
def worksheets
|
60
|
+
@worksheets ||= []
|
61
|
+
end
|
62
|
+
|
63
|
+
# Get an array of defined fields
|
64
|
+
def fields
|
65
|
+
@fields ||= []
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
def validate_row(row, line, file)
|
70
|
+
ETL::Engine.logger.debug "validating line #{line} in file #{file}"
|
71
|
+
if row.length != fields.length
|
72
|
+
raise_with_info( MismatchError,
|
73
|
+
"The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
|
74
|
+
line, file
|
75
|
+
)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
private
|
80
|
+
def configure
|
81
|
+
source.definition[:worksheets].each do |worksheet|
|
82
|
+
if Integer(worksheet)
|
83
|
+
worksheets << worksheet.to_i
|
84
|
+
else
|
85
|
+
raise DefinitionError, "Each worksheet definition must be an integer"
|
86
|
+
end
|
87
|
+
end unless source.definition[:worksheets].nil?
|
88
|
+
|
89
|
+
self.ignore_blank_line = source.definition[:ignore_blank_line]
|
90
|
+
|
91
|
+
source.definition[:fields].each do |options|
|
92
|
+
case options
|
93
|
+
when Symbol
|
94
|
+
fields << Field.new(options)
|
95
|
+
when Hash
|
96
|
+
fields << Field.new(options[:name])
|
97
|
+
else
|
98
|
+
raise DefinitionError, "Each field definition must either be a symbol or a hash"
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
class Field #:nodoc:
|
104
|
+
attr_reader :name
|
105
|
+
def initialize(name)
|
106
|
+
@name = name
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: darrell-activewarehouse-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 9
|
9
9
|
- 1
|
10
|
-
-
|
11
|
-
version: 0.9.1.
|
10
|
+
- 6
|
11
|
+
version: 0.9.1.6
|
12
12
|
platform: ruby
|
13
13
|
authors:
|
14
14
|
- Anthony Eden
|
@@ -171,6 +171,7 @@ files:
|
|
171
171
|
- lib/etl/generator/surrogate_key_generator.rb
|
172
172
|
- lib/etl/parser/apache_combined_log_parser.rb
|
173
173
|
- lib/etl/parser/delimited_parser.rb
|
174
|
+
- lib/etl/parser/excel_parser.rb
|
174
175
|
- lib/etl/parser/fixed_width_parser.rb
|
175
176
|
- lib/etl/parser/parser.rb
|
176
177
|
- lib/etl/parser/sax_parser.rb
|