darrell-activewarehouse-etl 0.9.1.4 → 0.9.1.6

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -86,6 +86,7 @@ The tests require:
86
86
  - gem install shoulda
87
87
  - gem install flexmock
88
88
  - gem install pg (if you want to run the tests on pg)
89
+ - gem install spreadsheet
89
90
 
90
91
  The tests subfolder contains example database.yml files for MySQL and PostgreSQL.
91
92
 
@@ -96,4 +97,4 @@ To run the tests:
96
97
  == Feedback
97
98
  This is a work in progress. Comments should be made on the
98
99
  activewarehouse-discuss mailing list at the moment. Contributions are always
99
- welcome.
100
+ welcome.
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
9
9
 
10
10
  module AWETL
11
11
  PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
12
- PKG_NAME = 'activewarehouse-etl'
12
+ PKG_NAME = 'darrell-activewarehouse-etl'
13
13
  PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
14
14
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
15
15
  PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
@@ -87,7 +87,7 @@ module AWETL
87
87
 
88
88
  def self.spec(package_prefix = '')
89
89
  Gem::Specification.new do |s|
90
- s.name = 'activewarehouse-etl'
90
+ s.name = 'darrell-activewarehouse-etl'
91
91
  s.version = AWETL::PKG_VERSION
92
92
  s.summary = "Pure Ruby ETL package."
93
93
  s.description = <<-EOF
@@ -172,4 +172,4 @@ task :reinstall => [:package] do
172
172
  gem = windows ? 'gem.bat' : 'gem'
173
173
  `#{sudo} #{gem} uninstall #{AWETL::PKG_NAME} -x`
174
174
  `#{sudo} #{gem} install pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}`
175
- end
175
+ end
@@ -209,7 +209,12 @@ module ETL #:nodoc:
209
209
  return
210
210
  end
211
211
 
212
- @timestamp = Time.now
212
+ @timestamp = case configuration[:scd][:timestamp]
213
+ when Time, Date then configuration[:scd][:timestamp]
214
+ when Symbol then row[configuration[:scd][:timestamp]]
215
+ when nil then Time.now
216
+ else raise "Unknown timestamp: #{configuration[:scd][:timestamp].inspect}. Use Time or Date for a specific time, a symbol for a value from each row, or nil for the current time"
217
+ end
213
218
 
214
219
  # See if the scd_fields of the current record have changed
215
220
  # from the last time this record was loaded into the data
@@ -318,6 +323,12 @@ module ETL #:nodoc:
318
323
  ETL::Engine.logger.debug "expiring original record"
319
324
  @existing_row[scd_end_date_field] = @timestamp
320
325
  @existing_row[scd_latest_version_field] = false
326
+
327
+ if configuration[:scd][:merge_nils]
328
+ scd_fields(row).each do |f|
329
+ row[f] ||= @existing_row[f]
330
+ end
331
+ end
321
332
 
322
333
  buffer << @existing_row
323
334
 
@@ -381,17 +392,16 @@ module ETL #:nodoc:
381
392
  # Check whether scd fields have changed since the last
382
393
  # load of this record.
383
394
  def has_scd_field_changes?(row)
384
- scd_fields(row).any? { |csd_field|
385
- ETL::Engine.logger.debug "Row: #{row.inspect}"
386
- ETL::Engine.logger.debug "Existing Row: #{@existing_row.inspect}"
387
- ETL::Engine.logger.debug "comparing: #{row[csd_field].to_s} != #{@existing_row[csd_field].to_s}"
388
- if row[csd_field].to_s != @existing_row[csd_field].to_s
389
- x=true
390
- else
391
- x=false
392
- end
393
- ETL::Engine.logger.debug "Fields differ?: #{x}"
394
- x
395
+ fields = scd_fields(row)
396
+ ETL::Engine.logger.debug " Row: %s" % row.slice(*fields).inspect
397
+ ETL::Engine.logger.debug "Existing Row: %s" % @existing_row.slice(*fields).inspect
398
+
399
+ fields.any? { |csd_field|
400
+ mismatch = configuration[:scd][:merge_nils] ? !row[csd_field].nil? : true
401
+ mismatch = mismatch && (row[csd_field].to_s != @existing_row[csd_field].to_s)
402
+
403
+ ETL::Engine.logger.debug "#{csd_field}: " + (mismatch ? row[csd_field].to_s + " != " + @existing_row[csd_field].to_s : @existing_row[csd_field].to_s)
404
+ mismatch
395
405
  }
396
406
  end
397
407
 
@@ -40,7 +40,8 @@ module ETL #:nodoc:
40
40
  @configuration = configuration
41
41
  @definition = definition
42
42
 
43
- @store_locally = configuration[:store_locally] || true
43
+ @store_locally = true
44
+ @store_locally = configuration[:store_locally] unless configuration[:store_locally].nil?
44
45
  end
45
46
 
46
47
  # Get an array of errors that occur during reading from the source
@@ -106,4 +107,4 @@ module ETL #:nodoc:
106
107
  end
107
108
  end
108
109
 
109
- Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
110
+ Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
@@ -41,17 +41,18 @@ module ETL #:nodoc:
41
41
  super
42
42
  @target = configuration[:target]
43
43
  @table = configuration[:table]
44
+ @query = configuration[:query]
44
45
  end
45
46
 
46
47
  # Get a String identifier for the source
47
48
  def to_s
48
- "#{host}/#{database}/#{table}"
49
+ "#{host}/#{database}/#{@table}"
49
50
  end
50
51
 
51
52
  # Get the local directory to use, which is a combination of the
52
53
  # local_base, the db hostname the db database name and the db table.
53
54
  def local_directory
54
- File.join(local_base, host, database, configuration[:table])
55
+ File.join(local_base, to_s)
55
56
  end
56
57
 
57
58
  # Get the join part of the query, defaults to nil
@@ -83,7 +84,7 @@ module ETL #:nodoc:
83
84
  # Get the number of rows in the source
84
85
  def count(use_cache=true)
85
86
  return @count if @count && use_cache
86
- if store_locally || read_locally
87
+ if @store_locally || read_locally
87
88
  @count = count_locally
88
89
  else
89
90
  @count = connection.select_value(query.gsub(/SELECT .* FROM/, 'SELECT count(1) FROM'))
@@ -107,13 +108,16 @@ module ETL #:nodoc:
107
108
  ETL::Engine.logger.debug "Reading from local cache"
108
109
  read_rows(last_local_file, &block)
109
110
  else # Read from the original source
110
- if store_locally
111
+ if @store_locally
111
112
  file = local_file
112
113
  write_local(file)
113
114
  read_rows(file, &block)
114
115
  else
115
- query_rows.each do |row|
116
- row = ETL::Row.new(row.symbolize_keys)
116
+ query_rows.each do |r|
117
+ row = ETL::Row.new()
118
+ r.symbolize_keys.each_pair { |key, value|
119
+ row[key] = value
120
+ }
117
121
  row.source = self
118
122
  yield row
119
123
  end
@@ -165,7 +169,7 @@ module ETL #:nodoc:
165
169
  # Get the query to use
166
170
  def query
167
171
  return @query if @query
168
- q = "SELECT #{select} FROM #{configuration[:table]}"
172
+ q = "SELECT #{select} FROM #{@table}"
169
173
  q << " #{join}" if join
170
174
 
171
175
  conditions = []
@@ -217,4 +221,4 @@ module ETL #:nodoc:
217
221
  end
218
222
  end
219
223
  end
220
- end
224
+ end
@@ -41,7 +41,7 @@ module ETL #:nodoc:
41
41
  # Returns each row from the source
42
42
  def each
43
43
  count = 0
44
- copy_sources if store_locally
44
+ copy_sources if @store_locally
45
45
  @parser.each do |row|
46
46
  if ETL::Engine.offset && count < ETL::Engine.offset
47
47
  count += 1
@@ -87,4 +87,4 @@ module ETL #:nodoc:
87
87
  end
88
88
  end
89
89
  end
90
- end
90
+ end
data/lib/etl/engine.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'active_support/all'
1
2
  module ETL #:nodoc:
2
3
 
3
4
  class Base < ActiveRecord::Base
@@ -32,7 +33,7 @@ module ETL #:nodoc:
32
33
  options[:config] = 'config/database.yml' unless File.exist?(options[:config])
33
34
  database_configuration = YAML::load(ERB.new(IO.read(options[:config])).result + "\n")
34
35
  ActiveRecord::Base.configurations.merge!(database_configuration)
35
- ETL::Base.configurations = database_configuration
36
+ ETL::Base.configurations = HashWithIndifferentAccess.new(database_configuration)
36
37
  #puts "configurations in init: #{ActiveRecord::Base.configurations.inspect}"
37
38
 
38
39
  require 'etl/execution'
@@ -10,10 +10,29 @@ module ETL #:nodoc:
10
10
  configure
11
11
  end
12
12
 
13
# Build field definitions from the header (first) line of +file+.
#
# Header names that occur more than once receive a 1-based occurrence
# suffix ("name_1", "name_2", ...) so every resulting field name is unique;
# names that occur only once are used unchanged. A blank header cell is
# treated as an empty name instead of raising NoMethodError on nil.
#
# * <tt>file</tt>: Path of the delimited file whose first line holds the names
#
# Returns an Array of Field objects, one per header column, in column order.
# Returns an empty Array when the file is empty or its first line has no
# columns.
def get_fields_names(file)
  File.open(file) do |input|
    # gets returns nil at EOF, whereas readline would raise EOFError
    header = input.gets
    return [] if header.nil?

    names = (FasterCSV.parse(header).first || []).map { |name| name.to_s }

    # Count every name once up front instead of rescanning the whole header
    # for each column.
    totals = Hash.new(0)
    names.each { |name| totals[name] += 1 }

    seen = Hash.new(0)
    names.map do |name|
      seen[name] += 1
      unique_name = totals[name] > 1 ? "#{name}_#{seen[name]}" : name
      Field.new(unique_name.to_sym)
    end
  end
end
27
+
13
28
  # Returns each row.
14
29
  def each
15
30
  Dir.glob(file).each do |file|
16
31
  ETL::Engine.logger.debug "parsing #{file}"
32
+ if fields.length == 0
33
+ ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
34
+ @fields = get_fields_names(file)
35
+ end
17
36
  line = 0
18
37
  lines_skipped = 0
19
38
  FasterCSV.foreach(file, options) do |raw_row|
@@ -0,0 +1,112 @@
1
+ require 'spreadsheet'
2
+
3
module ETL
  module Parser
    # Parser that reads rows from Excel workbooks using the spreadsheet gem.
    #
    # Options are read from the source definition:
    # * <tt>:worksheets</tt>: Optional list of integer worksheet indexes to
    #   read. When omitted, every worksheet of the workbook is read.
    # * <tt>:ignore_blank_line</tt>: When truthy, empty rows are skipped.
    # * <tt>:fields</tt>: List of field definitions; each is either a Symbol
    #   or a Hash with a <tt>:name</tt> key.
    class ExcelParser < ETL::Parser::Parser

      # Holds the name of a single column.
      class Field #:nodoc:
        attr_reader :name
        def initialize(name)
          @name = name
        end
      end

      # When truthy, rows for which <tt>empty?</tt> is true are skipped.
      attr_accessor :ignore_blank_line

      # Initialize the parser
      # * <tt>source</tt>: The Source object
      # * <tt>options</tt>: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Yields each row of every matching workbook as a Hash keyed by field
      # name.
      #
      # The skip and line counters are reset per workbook file, not per
      # worksheet, so <tt>source.skip_lines</tt> is applied once per file.
      def each
        Dir.glob(file).each do |path|
          ETL::Engine.logger.debug "parsing #{path}"
          line = 0
          lines_skipped = 0
          book = Spreadsheet.open path

          sheets_to_read(book).each do |sheet|
            sheet.each do |raw_row|
              if lines_skipped < source.skip_lines
                ETL::Engine.logger.debug "skipping line"
                lines_skipped += 1
                next
              end
              line += 1
              if ignore_blank_line && raw_row.empty?
                lines_skipped += 1
                next
              end
              validate_row(raw_row, line, path)
              row = {}
              raw_row.each_with_index do |value, index|
                row[fields[index].name] = value
              end
              yield row
            end
          end
        end
      end

      # Get an array of defined worksheets
      def worksheets
        @worksheets ||= []
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Select the worksheets of +book+ to read: the configured indexes, or
      # all worksheets when none were configured.
      def sheets_to_read(book)
        return book.worksheets if worksheets.empty?
        worksheets.map { |index| book.worksheet(index) }
      end

      # Raise a MismatchError (through raise_with_info) when the number of
      # cells in +row+ differs from the number of defined fields.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        if row.length != fields.length
          raise_with_info( MismatchError,
            "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
            line, file
          )
        end
      end

      # Read the worksheet, blank-line and field settings from the source
      # definition.
      #
      # Raises DefinitionError when a worksheet is not an integer or a field
      # is neither a Symbol nor a Hash.
      def configure
        worksheet_definitions = source.definition[:worksheets]
        unless worksheet_definitions.nil?
          worksheet_definitions.each do |worksheet|
            worksheets << worksheet_index(worksheet)
          end
        end

        self.ignore_blank_line = source.definition[:ignore_blank_line]

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options)
          when Hash
            fields << Field.new(options[:name])
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      # Convert one worksheet definition to its integer index.
      #
      # Kernel#Integer never returns a falsy value: it raises ArgumentError
      # or TypeError on invalid input. Testing its result with +if+ therefore
      # can never reach an +else+ branch, so the failure is rescued here and
      # reported as a DefinitionError. The stored value is still produced by
      # to_i, as before.
      def worksheet_index(worksheet)
        Integer(worksheet)
        worksheet.to_i
      rescue ArgumentError, TypeError
        raise DefinitionError, "Each worksheet definition must be an integer"
      end

    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: darrell-activewarehouse-etl
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 9
9
9
  - 1
10
- - 4
11
- version: 0.9.1.4
10
+ - 6
11
+ version: 0.9.1.6
12
12
  platform: ruby
13
13
  authors:
14
14
  - Anthony Eden
@@ -171,6 +171,7 @@ files:
171
171
  - lib/etl/generator/surrogate_key_generator.rb
172
172
  - lib/etl/parser/apache_combined_log_parser.rb
173
173
  - lib/etl/parser/delimited_parser.rb
174
+ - lib/etl/parser/excel_parser.rb
174
175
  - lib/etl/parser/fixed_width_parser.rb
175
176
  - lib/etl/parser/parser.rb
176
177
  - lib/etl/parser/sax_parser.rb