activewarehouse-etl 0.9.5.rc1 → 1.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. data/.standalone_migrations +2 -0
  2. data/.travis.yml +15 -0
  3. data/CHANGELOG +10 -1
  4. data/HOW_TO_RELEASE +4 -0
  5. data/LICENSE +1 -1
  6. data/README.textile +111 -0
  7. data/Rakefile +37 -78
  8. data/activewarehouse-etl.gemspec +7 -4
  9. data/db/migrate/20120229203554_create_tables.rb +37 -0
  10. data/db/schema.rb +45 -0
  11. data/examples/database.example.yml +3 -3
  12. data/lib/etl.rb +16 -0
  13. data/lib/etl/commands/etl.rb +1 -0
  14. data/lib/etl/control/control.rb +1 -1
  15. data/lib/etl/control/destination.rb +5 -16
  16. data/lib/etl/control/destination/csv_destination.rb +122 -0
  17. data/lib/etl/control/destination/excel_destination.rb +1 -1
  18. data/lib/etl/control/destination/insert_update_database_destination.rb +6 -3
  19. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  20. data/lib/etl/control/source.rb +39 -4
  21. data/lib/etl/control/source/database_source.rb +6 -1
  22. data/lib/etl/control/source/file_source.rb +4 -0
  23. data/lib/etl/control/source/mysql_streamer.rb +31 -0
  24. data/lib/etl/engine.rb +40 -20
  25. data/lib/etl/parser/{delimited_parser.rb → csv_parser.rb} +3 -3
  26. data/lib/etl/parser/excel_parser.rb +1 -1
  27. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  28. data/lib/etl/processor/bulk_import_processor.rb +11 -0
  29. data/lib/etl/processor/check_exist_processor.rb +6 -6
  30. data/lib/etl/processor/check_unique_processor.rb +4 -0
  31. data/lib/etl/processor/database_join_processor.rb +25 -4
  32. data/lib/etl/processor/encode_processor.rb +0 -2
  33. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  34. data/lib/etl/processor/imapattachment_downloader_processor.rb +2 -2
  35. data/lib/etl/processor/pop3attachment_downloader_processor.rb +2 -2
  36. data/lib/etl/processor/row_processor.rb +10 -0
  37. data/lib/etl/processor/sftp_downloader_processor.rb +1 -1
  38. data/lib/etl/processor/sftp_uploader_processor.rb +1 -1
  39. data/lib/etl/processor/truncate_processor.rb +4 -1
  40. data/lib/etl/processor/zip_file_processor.rb +1 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +57 -15
  42. data/lib/etl/transform/md5_transform.rb +13 -0
  43. data/lib/etl/transform/{string_to_datetime_transform.rb → string_to_date_time_transform.rb} +0 -0
  44. data/lib/etl/version.rb +1 -1
  45. data/test/.gitignore +0 -1
  46. data/test/check_exist_processor_test.rb +89 -0
  47. data/test/check_unique_processor_test.rb +40 -0
  48. data/test/config/.gitignore +1 -0
  49. data/test/config/database.yml +28 -0
  50. data/test/config/{Gemfile.rails-3.0.x → gemfiles/Gemfile.rails-3.0.x} +1 -1
  51. data/test/config/{Gemfile.rails-2.3.x → gemfiles/Gemfile.rails-3.1.x} +1 -1
  52. data/test/config/gemfiles/Gemfile.rails-3.2.x +3 -0
  53. data/test/config/gemfiles/common.rb +29 -0
  54. data/test/control_test.rb +2 -2
  55. data/test/data/nokogiri.xml +38 -0
  56. data/test/database_join_processor_test.rb +43 -0
  57. data/test/delimited.ctl +1 -1
  58. data/test/delimited_absolute.ctl +1 -3
  59. data/test/delimited_destination_db.ctl +1 -3
  60. data/test/delimited_excel.ctl +1 -1
  61. data/test/delimited_insert_update.ctl +1 -1
  62. data/test/delimited_update.ctl +1 -1
  63. data/test/delimited_with_bulk_load.ctl +2 -2
  64. data/test/destination_test.rb +0 -4
  65. data/test/encode_processor_test.rb +2 -0
  66. data/test/engine_test.rb +65 -19
  67. data/test/ensure_fields_presence_processor_test.rb +33 -0
  68. data/test/foreign_key_lookup_transform_test.rb +50 -0
  69. data/test/multiple_delimited.ctl +1 -1
  70. data/test/multiple_source_delimited.ctl +2 -2
  71. data/test/nokogiri_all.ctl +35 -0
  72. data/test/nokogiri_select.ctl +35 -0
  73. data/test/nokogiri_test.rb +35 -0
  74. data/test/parser_test.rb +2 -2
  75. data/test/performance/delimited.ctl +1 -1
  76. data/test/processor_test.rb +0 -3
  77. data/test/scd_test.rb +2 -8
  78. data/test/scd_test_type_1.ctl +1 -1
  79. data/test/scd_test_type_2.ctl +1 -1
  80. data/test/screen_test.rb +2 -3
  81. data/test/source_test.rb +19 -6
  82. data/test/test_helper.rb +6 -8
  83. data/test/truncate_processor_test.rb +37 -0
  84. metadata +121 -144
  85. data/README +0 -101
  86. data/active_support_logger.patch +0 -78
  87. data/test-matrix.yml +0 -10
  88. data/test/config/Gemfile.rails-2.3.x.lock +0 -38
  89. data/test/config/Gemfile.rails-3.0.x.lock +0 -49
  90. data/test/config/common.rb +0 -21
  91. data/test/connection/mysql/connection.rb +0 -9
  92. data/test/connection/mysql/schema.sql +0 -36
  93. data/test/connection/postgresql/connection.rb +0 -13
  94. data/test/connection/postgresql/schema.sql +0 -39
  95. data/test/vendor/adapter_extensions-0.5.0/CHANGELOG +0 -26
  96. data/test/vendor/adapter_extensions-0.5.0/LICENSE +0 -16
  97. data/test/vendor/adapter_extensions-0.5.0/README +0 -7
  98. data/test/vendor/adapter_extensions-0.5.0/Rakefile +0 -158
  99. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions.rb +0 -12
  100. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/abstract_adapter.rb +0 -44
  101. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/mysql_adapter.rb +0 -63
  102. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/postgresql_adapter.rb +0 -52
  103. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/sqlserver_adapter.rb +0 -44
  104. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/version.rb +0 -10
@@ -1,7 +1,7 @@
1
1
  module ETL #:nodoc:
2
2
  module Parser #:nodoc:
3
- # Parses delimited files
4
- class DelimitedParser < ETL::Parser::Parser
3
+ # Parses CSV files
4
+ class CsvParser < ETL::Parser::Parser
5
5
  # Initialize the parser
6
6
  # * <tt>source</tt>: The Source object
7
7
  # * <tt>options</tt>: Hash of options for the parser, defaults to an empty hash
@@ -12,7 +12,7 @@ module ETL #:nodoc:
12
12
 
13
13
  def get_fields_names(file)
14
14
  File.open(file) do |input|
15
- fields = CSV.parse(input.readline).first
15
+ fields = CSV.parse(input.readline, options).first
16
16
  new_fields = []
17
17
  fields.each_with_index do |field,index|
18
18
  # compute the index of occurrence of this specific occurrence of the field (usually, will be 1)
@@ -1,4 +1,4 @@
1
- require 'spreadsheet'
1
+ optional_require 'spreadsheet'
2
2
 
3
3
  module ETL
4
4
  module Parser
@@ -0,0 +1,83 @@
1
+ optional_require 'nokogiri'
2
+ require 'open-uri'
3
+ optional_require 'zlib'
4
+
5
+ module ETL
6
+ module Parser
7
+ class NokogiriXmlParser < ETL::Parser::Parser
8
+ # Initialize the parser
9
+ # * <tt>source</tt>: The Source object
10
+ # * <tt>options</tt>: Parser options Hash
11
+ def initialize(source, options={})
12
+ super
13
+ configure
14
+ end
15
+
16
+ # Returns each row
17
+ def each
18
+ Dir.glob(file).each do |source|
19
+
20
+ doc = nil
21
+
22
+ gzip = false
23
+ magic = "1F8B".to_i(base=16) # Check for gzip archives
24
+ if File.exist?(source)
25
+ gzip = true if magic == (
26
+ File.open(source).read(2).unpack("H2H2").to_s.to_i(base=16))
27
+ end
28
+
29
+ if gzip
30
+ doc = Nokogiri::XML(Zlib::GzipReader.open(source))
31
+ else
32
+ doc = Nokogiri::XML(open(source))
33
+ end
34
+
35
+ doc.xpath(@collection_xpath).each do |nodeset|
36
+ row = {}
37
+
38
+ fields.each do |f|
39
+ value = nodeset.xpath(f.xpath).text
40
+ row[f.name] = value
41
+ end
42
+ yield row
43
+ end
44
+
45
+ end
46
+ end
47
+
48
+ # Get an array of defined fields
49
+ def fields
50
+ @fields ||= []
51
+ end
52
+
53
+ private
54
+ def configure
55
+ @collection_xpath = source.definition[:collection]
56
+ if @collection_xpath.nil?
57
+ raise ":collection => 'XPath' argument required"
58
+ end
59
+ source.definition[:fields].each do |options|
60
+ case options
61
+ when Symbol
62
+ fields << Field.new(options, options.to_s)
63
+ when Hash
64
+ options[:xpath] ||= options[:name]
65
+ fields << Field.new(options[:name], options[:xpath].to_s)
66
+ else
67
+ raise DefinitionError,
68
+ "Each field definition must either be an symbol " +
69
+ "or a hash of options for the field"
70
+ end
71
+ end
72
+ end
73
+
74
+ class Field
75
+ attr_reader :name, :xpath
76
+ def initialize(name, xpath)
77
+ @name = name
78
+ @xpath = xpath
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -23,6 +23,10 @@ module ETL #:nodoc:
23
23
  attr_accessor :line_separator
24
24
  # The string that indicates a NULL (defaults to an empty string)
25
25
  attr_accessor :null_string
26
+ # Boolean indicating whether to disable keys before the load and re-enable them afterwards (MySQL-only optimization)
27
+ attr_accessor :disable_keys
28
+ # replace existing records, not just insert
29
+ attr_accessor :replace
26
30
 
27
31
  # Initialize the processor.
28
32
  #
@@ -36,6 +40,7 @@ module ETL #:nodoc:
36
40
  # * <tt>:field_separator</tt>: The field separator. Defaults to a comma
37
41
  # * <tt>:line_separator</tt>: The line separator. Defaults to a newline
38
42
  # * <tt>:field_enclosure</tt>: The field enclosure characters
43
+ # * <tt>:disable_keys</tt>: Set to true to disable keys before the load, then re-enable them afterwards (MySQL-only optimization)
39
44
  def initialize(control, configuration)
40
45
  super
41
46
  @target = configuration[:target]
@@ -49,6 +54,8 @@ module ETL #:nodoc:
49
54
  @line_separator = (configuration[:line_separator] || "\n")
50
55
  @null_string = (configuration[:null_string] || "")
51
56
  @field_enclosure = configuration[:field_enclosure]
57
+ @disable_keys = configuration[:disable_keys] || false
58
+ @replace = configuration[:replace] || false
52
59
 
53
60
  raise ControlError, "Target must be specified" unless @target
54
61
  raise ControlError, "Table must be specified" unless @table
@@ -64,6 +71,10 @@ module ETL #:nodoc:
64
71
  conn.truncate(table_name) if truncate
65
72
  options = {}
66
73
  options[:columns] = columns
74
+
75
+ options[:disable_keys] = true if disable_keys
76
+ options[:replace] = true if replace
77
+
67
78
  if field_separator || field_enclosure || line_separator || null_string
68
79
  options[:fields] = {}
69
80
  options[:fields][:null_string] = null_string if null_string
@@ -21,11 +21,10 @@ module ETL #:nodoc:
21
21
 
22
22
  # Initialize the processor
23
23
  # Configuration options:
24
- # * <tt>:skip</tt>: A symbol or array of column names that should not
25
- # be checked
24
+ # * <tt>:columns</tt>: An array of symbols for columns that should be included in the query conditions. If this option is not specified then all of the columns in the row will be included in the conditions (unless :skip is specified).
25
+ # * <tt>:skip</tt>: A symbol or array of symbols that should not be included in the existence check. If this option is not specified then all of the columns will be included in the existence check (unless :columns is specified).
26
+ # * <tt>:target</tt>: The target connection
26
27
  # * <tt>:table</tt>: The table name
27
- # * <tt>:columns</tt>: An array of columns which represent the natural
28
- # key
29
28
  def initialize(control, configuration)
30
29
  super
31
30
  @skip = configuration[:skip] || []
@@ -58,6 +57,7 @@ module ETL #:nodoc:
58
57
  conn = ETL::Engine.connection(target)
59
58
  q = "SELECT * FROM #{table_name} WHERE "
60
59
  conditions = []
60
+ ensure_columns_available_in_row!(row, columns, 'for existence check')
61
61
  row.each do |k,v|
62
62
  if columns.nil? || columns.include?(k.to_sym)
63
63
  conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
@@ -66,12 +66,12 @@ module ETL #:nodoc:
66
66
  q << conditions.join(" AND ")
67
67
  q << " LIMIT 1"
68
68
 
69
- #puts "query: #{q}"
70
69
  result = conn.select_one(q)
71
70
  return row if result.nil?
72
71
  end
73
72
 
74
- private
73
+ private
74
+
75
75
  def table_name
76
76
  ETL::Engine.table(table, ETL::Engine.connection(target))
77
77
  end
@@ -23,7 +23,11 @@ module ETL #:nodoc:
23
23
 
24
24
  # Process the row. This implementation will only return a row if
25
25
  # its key combination has not already been seen.
26
+ #
27
+ # An error will be raised if the row doesn't include the keys.
26
28
  def process(row)
29
+ ensure_columns_available_in_row!(row, keys, 'for unicity check')
30
+
27
31
  key = (keys.collect { |k| row[k] }).join('|')
28
32
  unless compound_key_constraints[key]
29
33
  compound_key_constraints[key] = 1
@@ -21,6 +21,9 @@ module ETL
21
21
  @target = configuration[:target]
22
22
  @query = configuration[:query]
23
23
  @fields = configuration[:fields]
24
+ raise ControlError, ":target must be specified" unless @target
25
+ raise ControlError, ":query must be specified" unless @query
26
+ raise ControlError, ":fields must be specified" unless @fields
24
27
  end
25
28
 
26
29
  # Get a String identifier for the source
@@ -40,10 +43,28 @@ module ETL
40
43
  ETL::Engine.logger.debug("Executing select: #{q}")
41
44
  res = connection.execute(q)
42
45
 
43
- res.each_hash do |r|
44
- @fields.each do |field|
45
- row[field.to_sym] = r[field]
46
- end
46
+ # TODO - refactor this and move it (and similar code around) to adapter_extensions
47
+ case connection.class.name
48
+ when "ActiveRecord::ConnectionAdapters::PostgreSQLAdapter";
49
+ res.each do |r|
50
+ @fields.each do |field|
51
+ row[field.to_sym] = r[field.to_s]
52
+ end
53
+ end
54
+ when "ActiveRecord::ConnectionAdapters::Mysql2Adapter";
55
+ res.each(:as => :hash) do |r|
56
+ @fields.each do |field|
57
+ row[field.to_sym] = r[field.to_s]
58
+ end
59
+ end
60
+ when "ActiveRecord::ConnectionAdapters::MysqlAdapter";
61
+ res.each_hash do |r|
62
+ @fields.each do |field|
63
+ row[field.to_sym] = r[field.to_s]
64
+ end
65
+ end
66
+ res.free
67
+ else raise "Unsupported adapter #{connection.class} for this destination"
47
68
  end
48
69
 
49
70
  return row
@@ -1,5 +1,3 @@
1
- require 'iconv'
2
-
3
1
  module ETL #:nodoc:
4
2
  module Processor #:nodoc:
5
3
  # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
@@ -0,0 +1,24 @@
1
+ module ETL
2
+ module Processor
3
+ # Ensure that each specified field is available
4
+ class EnsureFieldsPresenceProcessor < ETL::Processor::RowProcessor
5
+
6
+ # Initialize the processor.
7
+ #
8
+ # Configuration options:
9
+ # * <tt>:fields</tt>: An array of keys whose presence should be verified in each row
10
+ def initialize(control, configuration)
11
+ super
12
+ @fields = configuration[:fields]
13
+ raise ControlError, ":fields must be specified" unless @fields
14
+ end
15
+
16
+ def process(row)
17
+ missing_fields = configuration[:fields].map(&:to_s) - row.keys.map(&:to_s)
18
+ raise(ETL::ControlError,
19
+ "Row missing required field(s) #{missing_fields.join(',')} in row. Available fields are : #{row.keys.join(',')}") unless missing_fields.empty?
20
+ row
21
+ end
22
+ end
23
+ end
24
+ end
@@ -1,5 +1,5 @@
1
- require 'net/imap'
2
- require 'tmail'
1
+ optional_require 'net/imap'
2
+ optional_require 'tmail'
3
3
 
4
4
  module ETL
5
5
  module Processor
@@ -1,5 +1,5 @@
1
- require 'net/pop'
2
- require 'tmail'
1
+ optional_require 'net/pop'
2
+ optional_require 'tmail'
3
3
 
4
4
  module ETL
5
5
  module Processor
@@ -12,6 +12,16 @@ module ETL #:nodoc:
12
12
  def process(row)
13
13
  raise "process_row is an abstract method"
14
14
  end
15
+
16
+ # Ensure a given row's keys include all the provided columns
17
+ # and raise an error using the provided message if they don't
18
+ def ensure_columns_available_in_row!(row, columns, message)
19
+ unless columns.nil?
20
+ columns.each do |k|
21
+ raise(ETL::ControlError, "Row missing required field #{k.inspect} #{message}") unless row.keys.include?(k)
22
+ end
23
+ end
24
+ end
15
25
  end
16
26
  end
17
27
  end
@@ -1,4 +1,4 @@
1
- require 'net/sftp'
1
+ optional_require 'net/sftp'
2
2
 
3
3
  module ETL
4
4
  module Processor
@@ -1,4 +1,4 @@
1
- require 'net/sftp'
1
+ optional_require 'net/sftp'
2
2
 
3
3
  module ETL
4
4
  module Processor
@@ -14,16 +14,19 @@ module ETL #:nodoc:
14
14
  # Options:
15
15
  # * <tt>:target</tt>: The target connection
16
16
  # * <tt>:table</tt>: The table name
17
+ # * <tt>:options</tt>: Optional truncate options
17
18
  def initialize(control, configuration)
18
19
  super
19
20
  #@file = File.join(File.dirname(control.file), configuration[:file])
20
21
  @target = configuration[:target] || {}
21
22
  @table = configuration[:table]
23
+ @options = configuration[:options]
22
24
  end
23
25
 
24
26
  def process
25
27
  conn = ETL::Engine.connection(target)
26
- conn.truncate(table_name)
28
+ @options ||= 'RESTART IDENTITY' if conn.class.name =~ /postgres/i
29
+ conn.truncate(table_name, @options)
27
30
  end
28
31
 
29
32
  private
@@ -1,4 +1,4 @@
1
- require 'zip/zip'
1
+ optional_require 'zip/zip'
2
2
 
3
3
  module ETL
4
4
  module Processor
@@ -16,6 +16,7 @@ module ETL #:nodoc:
16
16
  # for future use.
17
17
  # *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
18
18
  # *<tt>:default</tt>: A default foreign key to use if no foreign key is found
19
+ # *<tt>:cache</tt>: If true and the resolver responds to load_cache, load_cache will be called
19
20
  def initialize(control, name, configuration={})
20
21
  super
21
22
 
@@ -23,7 +24,10 @@ module ETL #:nodoc:
23
24
  @resolver = configuration[:resolver]
24
25
  @resolver = @resolver.new if @resolver.is_a?(Class)
25
26
  @default = configuration[:default]
26
- if configuration[:cache] ||= true
27
+
28
+ configuration[:cache] = true if configuration[:cache].nil?
29
+
30
+ if configuration[:cache]
27
31
  if resolver.respond_to?(:load_cache)
28
32
  resolver.load_cache
29
33
  else
@@ -87,44 +91,57 @@ class SQLResolver
87
91
  # referencing a connection defined in the ETL database.yml file or an actual
88
92
  # ActiveRecord connection instance. If the connection is not specified then
89
93
  # the ActiveRecord::Base.connection will be used.
90
- def initialize(table, field, connection=nil)
91
- @table = table
92
- @field = field
94
+ def initialize(atable, afield, connection=nil)
95
+ # puts "table: #{atable.inspect} field:#{afield.inspect}"
96
+ @table = atable
97
+ @field = afield
93
98
  @connection = (connection.respond_to?(:quote) ? connection : ETL::Engine.connection(connection)) if connection
94
99
  @connection ||= ActiveRecord::Base.connection
95
100
  end
101
+
96
102
  def resolve(value)
103
+ return nil if value.nil?
104
+ r = nil
97
105
  if @use_cache
98
- cache[cache_key(value)]
106
+ r = cache[value]
107
+ # puts "resolve failed: #{value.class.name}:#{value.inspect} from: #{@table}.#{@field}" unless r
99
108
  else
100
109
  q = "SELECT id FROM #{table_name} WHERE #{wheres(value)}"
101
- @connection.select_value(q)
110
+ # puts q
111
+ r = @connection.select_value(q)
102
112
  end
113
+ r
103
114
  end
115
+
104
116
  def table_name
105
117
  ETL::Engine.table(@table, @connection)
106
118
  end
119
+
107
120
  def cache
108
121
  @cache ||= {}
109
122
  end
123
+
110
124
  def load_cache
111
- @use_cache = true
112
125
  q = "SELECT id, #{field.join(', ')} FROM #{table_name}"
126
+ # puts q
113
127
  @connection.select_all(q).each do |record|
114
- cache[cache_key(record.values_at(*field))] = record['id']
128
+ ck = @field.kind_of?(Array) ? record.values_at(*@field) : record[@field]
129
+ # puts "load_cache key: #{ck.class.name}:#{ck.inspect}"
130
+ # puts " #{@field.class.name}:#{@field.inspect}"
131
+ # puts " #{record[@field].class.name}:#{record[@field].inspect}"
132
+ cache[ck] = record['id']
115
133
  end
134
+ @use_cache = true
116
135
  end
117
136
 
118
137
  private
138
+
119
139
  def field
120
- unless @field.kind_of?(Array)
121
- @field = [ @field ]
140
+ if @field.kind_of?(Array)
141
+ @field
142
+ else
143
+ [ @field ]
122
144
  end
123
- @field
124
- end
125
-
126
- def cache_key(value)
127
- value.hash
128
145
  end
129
146
 
130
147
  def wheres(value)
@@ -135,6 +152,31 @@ class SQLResolver
135
152
  end
136
153
  end
137
154
 
155
+ class IncrementalCacheSQLResolver < SQLResolver
156
+
157
+ def initialize(atable, afield, connection=nil)
158
+ super
159
+ end
160
+
161
+ def resolve(value)
162
+ return nil if value.nil?
163
+ r = cache[value]
164
+ unless r
165
+ q = "SELECT id FROM #{table_name} WHERE #{wheres(value)}"
166
+ r = @connection.select_value(q)
167
+ if r
168
+ cache[value] = r
169
+ end
170
+ end
171
+ r
172
+ end
173
+
174
+ def load_cache
175
+ @cache = {}
176
+ end
177
+
178
+ end
179
+
138
180
  class FlatFileResolver
139
181
  # Initialize the flat file resolver. Expects to open a comma-delimited file.
140
182
  # Returns the column with the given result_field_index.