darrell-activewarehouse-etl 0.9.1.4 → 0.9.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -86,6 +86,7 @@ The tests require:
86
86
  - gem install shoulda
87
87
  - gem install flexmock
88
88
  - gem install pg (if you want to run the tests on pg)
89
+ - gem install spreadsheet
89
90
 
90
91
  The tests subfolder contains examples database.yml for mysql and postgres.
91
92
 
@@ -96,4 +97,4 @@ To run the tests:
96
97
  == Feedback
97
98
  This is a work in progress. Comments should be made on the
98
99
  activewarehouse-discuss mailing list at the moment. Contributions are always
99
- welcome.
100
+ welcome.
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
9
9
 
10
10
  module AWETL
11
11
  PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
12
- PKG_NAME = 'activewarehouse-etl'
12
+ PKG_NAME = 'darrell-activewarehouse-etl'
13
13
  PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
14
14
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
15
15
  PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
@@ -87,7 +87,7 @@ module AWETL
87
87
 
88
88
  def self.spec(package_prefix = '')
89
89
  Gem::Specification.new do |s|
90
- s.name = 'activewarehouse-etl'
90
+ s.name = 'darrell-activewarehouse-etl'
91
91
  s.version = AWETL::PKG_VERSION
92
92
  s.summary = "Pure Ruby ETL package."
93
93
  s.description = <<-EOF
@@ -172,4 +172,4 @@ task :reinstall => [:package] do
172
172
  gem = windows ? 'gem.bat' : 'gem'
173
173
  `#{sudo} #{gem} uninstall #{AWETL::PKG_NAME} -x`
174
174
  `#{sudo} #{gem} install pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}`
175
- end
175
+ end
@@ -209,7 +209,12 @@ module ETL #:nodoc:
209
209
  return
210
210
  end
211
211
 
212
- @timestamp = Time.now
212
+ @timestamp = case configuration[:scd][:timestamp]
213
+ when Time, Date then configuration[:scd][:timestamp]
214
+ when Symbol then row[configuration[:scd][:timestamp]]
215
+ when nil then Time.now
216
+ else raise "Unknown timestamp: #{configuration[:scd][:timestamp].inspect}. Use Time or Date for a specific time, a symbol for a value from each row, or nil for the current time"
217
+ end
213
218
 
214
219
  # See if the scd_fields of the current record have changed
215
220
  # from the last time this record was loaded into the data
@@ -318,6 +323,12 @@ module ETL #:nodoc:
318
323
  ETL::Engine.logger.debug "expiring original record"
319
324
  @existing_row[scd_end_date_field] = @timestamp
320
325
  @existing_row[scd_latest_version_field] = false
326
+
327
+ if configuration[:scd][:merge_nils]
328
+ scd_fields(row).each do |f|
329
+ row[f] ||= @existing_row[f]
330
+ end
331
+ end
321
332
 
322
333
  buffer << @existing_row
323
334
 
@@ -381,17 +392,16 @@ module ETL #:nodoc:
381
392
  # Check whether non-scd fields have changed since the last
382
393
  # load of this record.
383
394
  def has_scd_field_changes?(row)
384
- scd_fields(row).any? { |csd_field|
385
- ETL::Engine.logger.debug "Row: #{row.inspect}"
386
- ETL::Engine.logger.debug "Existing Row: #{@existing_row.inspect}"
387
- ETL::Engine.logger.debug "comparing: #{row[csd_field].to_s} != #{@existing_row[csd_field].to_s}"
388
- if row[csd_field].to_s != @existing_row[csd_field].to_s
389
- x=true
390
- else
391
- x=false
392
- end
393
- ETL::Engine.logger.debug "Fields differ?: #{x}"
394
- x
395
+ fields = scd_fields(row)
396
+ ETL::Engine.logger.debug " Row: %s" % row.slice(*fields).inspect
397
+ ETL::Engine.logger.debug "Existing Row: %s" % @existing_row.slice(*fields).inspect
398
+
399
+ fields.any? { |csd_field|
400
+ mismatch = configuration[:scd][:merge_nils] ? !row[csd_field].nil? : true
401
+ mismatch = mismatch && (row[csd_field].to_s != @existing_row[csd_field].to_s)
402
+
403
+ ETL::Engine.logger.debug "#{csd_field}: " + (mismatch ? row[csd_field].to_s + " != " + @existing_row[csd_field].to_s : @existing_row[csd_field].to_s)
404
+ mismatch
395
405
  }
396
406
  end
397
407
 
@@ -40,7 +40,8 @@ module ETL #:nodoc:
40
40
  @configuration = configuration
41
41
  @definition = definition
42
42
 
43
- @store_locally = configuration[:store_locally] || true
43
+ @store_locally = true
44
+ @store_locally = configuration[:store_locally] unless configuration[:store_locally].nil?
44
45
  end
45
46
 
46
47
  # Get an array of errors that occur during reading from the source
@@ -106,4 +107,4 @@ module ETL #:nodoc:
106
107
  end
107
108
  end
108
109
 
109
- Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
110
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -41,17 +41,18 @@ module ETL #:nodoc:
41
41
  super
42
42
  @target = configuration[:target]
43
43
  @table = configuration[:table]
44
+ @query = configuration[:query]
44
45
  end
45
46
 
46
47
  # Get a String identifier for the source
47
48
  def to_s
48
- "#{host}/#{database}/#{table}"
49
+ "#{host}/#{database}/#{@table}"
49
50
  end
50
51
 
51
52
  # Get the local directory to use, which is a combination of the
52
53
  # local_base, the db hostname the db database name and the db table.
53
54
  def local_directory
54
- File.join(local_base, host, database, configuration[:table])
55
+ File.join(local_base, to_s)
55
56
  end
56
57
 
57
58
  # Get the join part of the query, defaults to nil
@@ -83,7 +84,7 @@ module ETL #:nodoc:
83
84
  # Get the number of rows in the source
84
85
  def count(use_cache=true)
85
86
  return @count if @count && use_cache
86
- if store_locally || read_locally
87
+ if @store_locally || read_locally
87
88
  @count = count_locally
88
89
  else
89
90
  @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
@@ -107,13 +108,16 @@ module ETL #:nodoc:
107
108
  ETL::Engine.logger.debug "Reading from local cache"
108
109
  read_rows(last_local_file, &block)
109
110
  else # Read from the original source
110
- if store_locally
111
+ if @store_locally
111
112
  file = local_file
112
113
  write_local(file)
113
114
  read_rows(file, &block)
114
115
  else
115
- query_rows.each do |row|
116
- row = ETL::Row.new(row.symbolize_keys)
116
+ query_rows.each do |r|
117
+ row = ETL::Row.new()
118
+ r.symbolize_keys.each_pair { |key, value|
119
+ row[key] = value
120
+ }
117
121
  row.source = self
118
122
  yield row
119
123
  end
@@ -165,7 +169,7 @@ module ETL #:nodoc:
165
169
  # Get the query to use
166
170
  def query
167
171
  return @query if @query
168
- q = "SELECT #{select} FROM #{configuration[:table]}"
172
+ q = "SELECT #{select} FROM #{@table}"
169
173
  q << " #{join}" if join
170
174
 
171
175
  conditions = []
@@ -217,4 +221,4 @@ module ETL #:nodoc:
217
221
  end
218
222
  end
219
223
  end
220
- end
224
+ end
@@ -41,7 +41,7 @@ module ETL #:nodoc:
41
41
  # Returns each row from the source
42
42
  def each
43
43
  count = 0
44
- copy_sources if store_locally
44
+ copy_sources if @store_locally
45
45
  @parser.each do |row|
46
46
  if ETL::Engine.offset && count < ETL::Engine.offset
47
47
  count += 1
@@ -87,4 +87,4 @@ module ETL #:nodoc:
87
87
  end
88
88
  end
89
89
  end
90
- end
90
+ end
data/lib/etl/engine.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'active_support/all'
1
2
  module ETL #:nodoc:
2
3
 
3
4
  class Base < ActiveRecord::Base
@@ -32,7 +33,7 @@ module ETL #:nodoc:
32
33
  options[:config] = 'config/database.yml' unless File.exist?(options[:config])
33
34
  database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
34
35
  ActiveRecord::Base.configurations.merge!(database_configuration)
35
- ETL::Base.configurations = database_configuration
36
+ ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
36
37
  #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"
37
38
 
38
39
  require 'etl/execution'
@@ -10,10 +10,29 @@ module ETL #:nodoc:
10
10
  configure
11
11
  end
12
12
 
13
+ def get_fields_names(file)
14
+ File.open(file) do |input|
15
+ fields = FasterCSV.parse(input.readline).first
16
+ new_fields = []
17
+ fields.each_with_index do |field,index|
18
+ # compute the index of occurrence of this specific occurrence of the field (usually, will be 1)
19
+ occurrence_index = fields[0..index].find_all { |e| e == field }.size
20
+ number_of_occurrences = fields.find_all { |e| e == field }.size
21
+ new_field = field + (number_of_occurrences > 1 ? "_#{occurrence_index}" : "")
22
+ new_fields << Field.new(new_field.to_sym)
23
+ end
24
+ return new_fields
25
+ end
26
+ end
27
+
13
28
  # Returns each row.
14
29
  def each
15
30
  Dir.glob(file).each do |file|
16
31
  ETL::Engine.logger.debug "parsing #{file}"
32
+ if fields.length == 0
33
+ ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
34
+ @fields = get_fields_names(file)
35
+ end
17
36
  line = 0
18
37
  lines_skipped = 0
19
38
  FasterCSV.foreach(file, options) do |raw_row|
@@ -0,0 +1,112 @@
1
+ require 'spreadsheet'
2
+
3
+ module ETL
4
+ module Parser
5
+ class ExcelParser < ETL::Parser::Parser
6
+
7
+ attr_accessor :ignore_blank_line
8
+
9
+ # Initialize the parser
10
+ # * <tt>source</tt>: The Source object
11
+ # * <tt>options</tt>: Parser options Hash
12
+ def initialize(source, options={})
13
+ super
14
+ configure
15
+ end
16
+
17
+ # Returns each row
18
+ def each
19
+ Dir.glob(file).each do |file|
20
+ ETL::Engine.logger.debug "parsing #{file}"
21
+ line = 0
22
+ lines_skipped = 0
23
+ book = Spreadsheet.open file
24
+ loopworksheets = []
25
+
26
+ if worksheets.empty?
27
+ loopworksheets = book.worksheets
28
+ else
29
+ worksheets.each do |index|
30
+ loopworksheets << book.worksheet(index)
31
+ end
32
+ end
33
+
34
+ loopworksheets.each do |sheet|
35
+ sheet.each do |raw_row|
36
+ if lines_skipped < source.skip_lines
37
+ ETL::Engine.logger.debug "skipping line"
38
+ lines_skipped += 1
39
+ next
40
+ end
41
+ line += 1
42
+ row = {}
43
+ if self.ignore_blank_line and raw_row.empty?
44
+ lines_skipped += 1
45
+ next
46
+ end
47
+ validate_row(raw_row, line, file)
48
+ raw_row.each_with_index do |value, index|
49
+ f = fields[index]
50
+ row[f.name] = value
51
+ end
52
+ yield row
53
+ end
54
+ end
55
+ end
56
+ end
57
+
58
+ # Get an array of defined worksheets
59
+ def worksheets
60
+ @worksheets ||= []
61
+ end
62
+
63
+ # Get an array of defined fields
64
+ def fields
65
+ @fields ||= []
66
+ end
67
+
68
+ private
69
+ def validate_row(row, line, file)
70
+ ETL::Engine.logger.debug "validating line #{line} in file #{file}"
71
+ if row.length != fields.length
72
+ raise_with_info( MismatchError,
73
+ "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
74
+ line, file
75
+ )
76
+ end
77
+ end
78
+
79
+ private
80
+ def configure
81
+ source.definition[:worksheets].each do |worksheet|
82
+ if Integer(worksheet)
83
+ worksheets << worksheet.to_i
84
+ else
85
+ raise DefinitionError, "Each worksheet definition must be an integer"
86
+ end
87
+ end unless source.definition[:worksheets].nil?
88
+
89
+ self.ignore_blank_line = source.definition[:ignore_blank_line]
90
+
91
+ source.definition[:fields].each do |options|
92
+ case options
93
+ when Symbol
94
+ fields << Field.new(options)
95
+ when Hash
96
+ fields << Field.new(options[:name])
97
+ else
98
+ raise DefinitionError, "Each field definition must either be a symbol or a hash"
99
+ end
100
+ end
101
+ end
102
+
103
+ class Field #:nodoc:
104
+ attr_reader :name
105
+ def initialize(name)
106
+ @name = name
107
+ end
108
+ end
109
+
110
+ end
111
+ end
112
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: darrell-activewarehouse-etl
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 9
9
9
  - 1
10
- - 4
11
- version: 0.9.1.4
10
+ - 6
11
+ version: 0.9.1.6
12
12
  platform: ruby
13
13
  authors:
14
14
  - Anthony Eden
@@ -171,6 +171,7 @@ files:
171
171
  - lib/etl/generator/surrogate_key_generator.rb
172
172
  - lib/etl/parser/apache_combined_log_parser.rb
173
173
  - lib/etl/parser/delimited_parser.rb
174
+ - lib/etl/parser/excel_parser.rb
174
175
  - lib/etl/parser/fixed_width_parser.rb
175
176
  - lib/etl/parser/parser.rb
176
177
  - lib/etl/parser/sax_parser.rb