darrell-activewarehouse-etl 0.9.1.4 → 0.9.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +2 -1
- data/Rakefile +3 -3
- data/lib/etl/control/destination.rb +22 -12
- data/lib/etl/control/source.rb +3 -2
- data/lib/etl/control/source/database_source.rb +12 -8
- data/lib/etl/control/source/file_source.rb +2 -2
- data/lib/etl/engine.rb +2 -1
- data/lib/etl/parser/delimited_parser.rb +19 -0
- data/lib/etl/parser/excel_parser.rb +112 -0
- metadata +4 -3
data/README
CHANGED
@@ -86,6 +86,7 @@ The tests require:
|
|
86
86
|
- gem install shoulda
|
87
87
|
- gem install flexmock
|
88
88
|
- gem install pg (if you want to run the tests on pg)
|
89
|
+
- gem install spreadsheet
|
89
90
|
|
90
91
|
The tests subfolder contains examples database.yml for mysql and postgres.
|
91
92
|
|
@@ -96,4 +97,4 @@ To run the tests:
|
|
96
97
|
== Feedback
|
97
98
|
This is a work in progress. Comments should be made on the
|
98
99
|
activewarehouse-discuss mailing list at the moment. Contributions are always
|
99
|
-
welcome.
|
100
|
+
welcome.
|
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
|
|
9
9
|
|
10
10
|
module AWETL
|
11
11
|
PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
|
12
|
-
PKG_NAME = 'activewarehouse-etl'
|
12
|
+
PKG_NAME = 'darrell-activewarehouse-etl'
|
13
13
|
PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
|
14
14
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
15
15
|
PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
|
@@ -87,7 +87,7 @@ module AWETL
|
|
87
87
|
|
88
88
|
def self.spec(package_prefix = '')
|
89
89
|
Gem::Specification.new do |s|
|
90
|
-
s.name = 'activewarehouse-etl'
|
90
|
+
s.name = 'darrell-activewarehouse-etl'
|
91
91
|
s.version = AWETL::PKG_VERSION
|
92
92
|
s.summary = "Pure Ruby ETL package."
|
93
93
|
s.description = <<-EOF
|
@@ -172,4 +172,4 @@ task :reinstall => [:package] do
|
|
172
172
|
gem = windows ? 'gem.bat' : 'gem'
|
173
173
|
`#{sudo} #{gem} uninstall #{AWETL::PKG_NAME} -x`
|
174
174
|
`#{sudo} #{gem} install pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}`
|
175
|
-
end
|
175
|
+
end
|
data/lib/etl/control/destination.rb
CHANGED
@@ -209,7 +209,12 @@ module ETL #:nodoc:
|
|
209
209
|
return
|
210
210
|
end
|
211
211
|
|
212
|
-
@timestamp =
|
212
|
+
@timestamp = case configuration[:scd][:timestamp]
|
213
|
+
when Time, Date then configuration[:scd][:timestamp]
|
214
|
+
when Symbol then row[configuration[:scd][:timestamp]]
|
215
|
+
when nil then Time.now
|
216
|
+
else raise "Unknown timestamp: #{configuration[:scd][:timestamp].inspect}. Use Time or Date for a specific time, a symbol for a value from each row, or nil for the current time"
|
217
|
+
end
|
213
218
|
|
214
219
|
# See if the scd_fields of the current record have changed
|
215
220
|
# from the last time this record was loaded into the data
|
@@ -318,6 +323,12 @@ module ETL #:nodoc:
|
|
318
323
|
ETL::Engine.logger.debug "expiring original record"
|
319
324
|
@existing_row[scd_end_date_field] = @timestamp
|
320
325
|
@existing_row[scd_latest_version_field] = false
|
326
|
+
|
327
|
+
if configuration[:scd][:merge_nils]
|
328
|
+
scd_fields(row).each do |f|
|
329
|
+
row[f] ||= @existing_row[f]
|
330
|
+
end
|
331
|
+
end
|
321
332
|
|
322
333
|
buffer << @existing_row
|
323
334
|
|
@@ -381,17 +392,16 @@ module ETL #:nodoc:
|
|
381
392
|
# Check whether non-scd fields have changed since the last
|
382
393
|
# load of this record.
|
383
394
|
def has_scd_field_changes?(row)
|
384
|
-
scd_fields(row)
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
x
|
395
|
+
fields = scd_fields(row)
|
396
|
+
ETL::Engine.logger.debug " Row: %s" % row.slice(*fields).inspect
|
397
|
+
ETL::Engine.logger.debug "Existing Row: %s" % @existing_row.slice(*fields).inspect
|
398
|
+
|
399
|
+
fields.any? { |csd_field|
|
400
|
+
mismatch = configuration[:scd][:merge_nils] ? !row[csd_field].nil? : true
|
401
|
+
mismatch = mismatch && (row[csd_field].to_s != @existing_row[csd_field].to_s)
|
402
|
+
|
403
|
+
ETL::Engine.logger.debug "#{csd_field}: " + (mismatch ? row[csd_field].to_s + " != " + @existing_row[csd_field].to_s : @existing_row[csd_field].to_s)
|
404
|
+
mismatch
|
395
405
|
}
|
396
406
|
end
|
397
407
|
|
data/lib/etl/control/source.rb
CHANGED
@@ -40,7 +40,8 @@ module ETL #:nodoc:
|
|
40
40
|
@configuration = configuration
|
41
41
|
@definition = definition
|
42
42
|
|
43
|
-
@store_locally =
|
43
|
+
@store_locally = true
|
44
|
+
@store_locally = configuration[:store_locally] unless configuration[:store_locally].nil?
|
44
45
|
end
|
45
46
|
|
46
47
|
# Get an array of errors that occur during reading from the source
|
@@ -106,4 +107,4 @@ module ETL #:nodoc:
|
|
106
107
|
end
|
107
108
|
end
|
108
109
|
|
109
|
-
Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
|
110
|
+
Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
|
data/lib/etl/control/source/database_source.rb
CHANGED
@@ -41,17 +41,18 @@ module ETL #:nodoc:
|
|
41
41
|
super
|
42
42
|
@target = configuration[:target]
|
43
43
|
@table = configuration[:table]
|
44
|
+
@query = configuration[:query]
|
44
45
|
end
|
45
46
|
|
46
47
|
# Get a String identifier for the source
|
47
48
|
def to_s
|
48
|
-
"#{host}/#{database}/#{table}"
|
49
|
+
"#{host}/#{database}/#{@table}"
|
49
50
|
end
|
50
51
|
|
51
52
|
# Get the local directory to use, which is a combination of the
|
52
53
|
# local_base, the db hostname the db database name and the db table.
|
53
54
|
def local_directory
|
54
|
-
File.join(local_base,
|
55
|
+
File.join(local_base, to_s)
|
55
56
|
end
|
56
57
|
|
57
58
|
# Get the join part of the query, defaults to nil
|
@@ -83,7 +84,7 @@ module ETL #:nodoc:
|
|
83
84
|
# Get the number of rows in the source
|
84
85
|
def count(use_cache=true)
|
85
86
|
return @count if @count && use_cache
|
86
|
-
if store_locally || read_locally
|
87
|
+
if @store_locally || read_locally
|
87
88
|
@count = count_locally
|
88
89
|
else
|
89
90
|
@count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
|
@@ -107,13 +108,16 @@ module ETL #:nodoc:
|
|
107
108
|
ETL::Engine.logger.debug "Reading from local cache"
|
108
109
|
read_rows(last_local_file, &block)
|
109
110
|
else # Read from the original source
|
110
|
-
if store_locally
|
111
|
+
if @store_locally
|
111
112
|
file = local_file
|
112
113
|
write_local(file)
|
113
114
|
read_rows(file, &block)
|
114
115
|
else
|
115
|
-
query_rows.each do |
|
116
|
-
row = ETL::Row.new(
|
116
|
+
query_rows.each do |r|
|
117
|
+
row = ETL::Row.new()
|
118
|
+
r.symbolize_keys.each_pair { |key, value|
|
119
|
+
row[key] = value
|
120
|
+
}
|
117
121
|
row.source = self
|
118
122
|
yield row
|
119
123
|
end
|
@@ -165,7 +169,7 @@ module ETL #:nodoc:
|
|
165
169
|
# Get the query to use
|
166
170
|
def query
|
167
171
|
return @query if @query
|
168
|
-
q = "SELECT #{select} FROM #{
|
172
|
+
q = "SELECT #{select} FROM #{@table}"
|
169
173
|
q << " #{join}" if join
|
170
174
|
|
171
175
|
conditions = []
|
@@ -217,4 +221,4 @@ module ETL #:nodoc:
|
|
217
221
|
end
|
218
222
|
end
|
219
223
|
end
|
220
|
-
end
|
224
|
+
end
|
data/lib/etl/control/source/file_source.rb
CHANGED
@@ -41,7 +41,7 @@ module ETL #:nodoc:
|
|
41
41
|
# Returns each row from the source
|
42
42
|
def each
|
43
43
|
count = 0
|
44
|
-
copy_sources if store_locally
|
44
|
+
copy_sources if @store_locally
|
45
45
|
@parser.each do |row|
|
46
46
|
if ETL::Engine.offset && count < ETL::Engine.offset
|
47
47
|
count += 1
|
@@ -87,4 +87,4 @@ module ETL #:nodoc:
|
|
87
87
|
end
|
88
88
|
end
|
89
89
|
end
|
90
|
-
end
|
90
|
+
end
|
data/lib/etl/engine.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'active_support/all'
|
1
2
|
module ETL #:nodoc:
|
2
3
|
|
3
4
|
class Base < ActiveRecord::Base
|
@@ -32,7 +33,7 @@ module ETL #:nodoc:
|
|
32
33
|
options[:config] = 'config/database.yml' unless File.exist?(options[:config])
|
33
34
|
database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
|
34
35
|
ActiveRecord::Base.configurations.merge!(database_configuration)
|
35
|
-
ETL::Base.configurations = database_configuration
|
36
|
+
ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
|
36
37
|
#puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"
|
37
38
|
|
38
39
|
require 'etl/execution'
|
data/lib/etl/parser/delimited_parser.rb
CHANGED
@@ -10,10 +10,29 @@ module ETL #:nodoc:
|
|
10
10
|
configure
|
11
11
|
end
|
12
12
|
|
13
|
+
def get_fields_names(file)
|
14
|
+
File.open(file) do |input|
|
15
|
+
fields = FasterCSV.parse(input.readline).first
|
16
|
+
new_fields = []
|
17
|
+
fields.each_with_index do |field,index|
|
18
|
+
# compute the index of occurrence of this specific occurrence of the field (usually, will be 1)
|
19
|
+
occurrence_index = fields[0..index].find_all { |e| e == field }.size
|
20
|
+
number_of_occurrences = fields.find_all { |e| e == field }.size
|
21
|
+
new_field = field + (number_of_occurrences > 1 ? "_#{occurrence_index}" : "")
|
22
|
+
new_fields << Field.new(new_field.to_sym)
|
23
|
+
end
|
24
|
+
return new_fields
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
13
28
|
# Returns each row.
|
14
29
|
def each
|
15
30
|
Dir.glob(file).each do |file|
|
16
31
|
ETL::Engine.logger.debug "parsing #{file}"
|
32
|
+
if fields.length == 0
|
33
|
+
ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
|
34
|
+
@fields = get_fields_names(file)
|
35
|
+
end
|
17
36
|
line = 0
|
18
37
|
lines_skipped = 0
|
19
38
|
FasterCSV.foreach(file, options) do |raw_row|
|
data/lib/etl/parser/excel_parser.rb
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'spreadsheet'
|
2
|
+
|
3
|
+
module ETL
|
4
|
+
module Parser
|
5
|
+
class ExcelParser < ETL::Parser::Parser
|
6
|
+
|
7
|
+
attr_accessor :ignore_blank_line
|
8
|
+
|
9
|
+
# Initialize the parser
|
10
|
+
# * <tt>source</tt>: The Source object
|
11
|
+
# * <tt>options</tt>: Parser options Hash
|
12
|
+
def initialize(source, options={})
|
13
|
+
super
|
14
|
+
configure
|
15
|
+
end
|
16
|
+
|
17
|
+
# Returns each row
|
18
|
+
def each
|
19
|
+
Dir.glob(file).each do |file|
|
20
|
+
ETL::Engine.logger.debug "parsing #{file}"
|
21
|
+
line = 0
|
22
|
+
lines_skipped = 0
|
23
|
+
book = Spreadsheet.open file
|
24
|
+
loopworksheets = []
|
25
|
+
|
26
|
+
if worksheets.empty?
|
27
|
+
loopworksheets = book.worksheets
|
28
|
+
else
|
29
|
+
worksheets.each do |index|
|
30
|
+
loopworksheets << book.worksheet(index)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
loopworksheets.each do |sheet|
|
35
|
+
sheet.each do |raw_row|
|
36
|
+
if lines_skipped < source.skip_lines
|
37
|
+
ETL::Engine.logger.debug "skipping line"
|
38
|
+
lines_skipped += 1
|
39
|
+
next
|
40
|
+
end
|
41
|
+
line += 1
|
42
|
+
row = {}
|
43
|
+
if self.ignore_blank_line and raw_row.empty?
|
44
|
+
lines_skipped += 1
|
45
|
+
next
|
46
|
+
end
|
47
|
+
validate_row(raw_row, line, file)
|
48
|
+
raw_row.each_with_index do |value, index|
|
49
|
+
f = fields[index]
|
50
|
+
row[f.name] = value
|
51
|
+
end
|
52
|
+
yield row
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# Get an array of defined worksheets
|
59
|
+
def worksheets
|
60
|
+
@worksheets ||= []
|
61
|
+
end
|
62
|
+
|
63
|
+
# Get an array of defined fields
|
64
|
+
def fields
|
65
|
+
@fields ||= []
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
def validate_row(row, line, file)
|
70
|
+
ETL::Engine.logger.debug "validating line #{line} in file #{file}"
|
71
|
+
if row.length != fields.length
|
72
|
+
raise_with_info( MismatchError,
|
73
|
+
"The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
|
74
|
+
line, file
|
75
|
+
)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
private
|
80
|
+
def configure
|
81
|
+
source.definition[:worksheets].each do |worksheet|
|
82
|
+
if Integer(worksheet)
|
83
|
+
worksheets << worksheet.to_i
|
84
|
+
else
|
85
|
+
raise DefinitionError, "Each worksheet definition must be an integer"
|
86
|
+
end
|
87
|
+
end unless source.definition[:worksheets].nil?
|
88
|
+
|
89
|
+
self.ignore_blank_line = source.definition[:ignore_blank_line]
|
90
|
+
|
91
|
+
source.definition[:fields].each do |options|
|
92
|
+
case options
|
93
|
+
when Symbol
|
94
|
+
fields << Field.new(options)
|
95
|
+
when Hash
|
96
|
+
fields << Field.new(options[:name])
|
97
|
+
else
|
98
|
+
raise DefinitionError, "Each field definition must either be a symbol or a hash"
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
class Field #:nodoc:
|
104
|
+
attr_reader :name
|
105
|
+
def initialize(name)
|
106
|
+
@name = name
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: darrell-activewarehouse-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 9
|
9
9
|
- 1
|
10
|
-
- 4
|
11
|
-
version: 0.9.1.4
|
10
|
+
- 6
|
11
|
+
version: 0.9.1.6
|
12
12
|
platform: ruby
|
13
13
|
authors:
|
14
14
|
- Anthony Eden
|
@@ -171,6 +171,7 @@ files:
|
|
171
171
|
- lib/etl/generator/surrogate_key_generator.rb
|
172
172
|
- lib/etl/parser/apache_combined_log_parser.rb
|
173
173
|
- lib/etl/parser/delimited_parser.rb
|
174
|
+
- lib/etl/parser/excel_parser.rb
|
174
175
|
- lib/etl/parser/fixed_width_parser.rb
|
175
176
|
- lib/etl/parser/parser.rb
|
176
177
|
- lib/etl/parser/sax_parser.rb
|