activewarehouse-etl 0.9.5.rc1 → 1.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. data/.standalone_migrations +2 -0
  2. data/.travis.yml +15 -0
  3. data/CHANGELOG +10 -1
  4. data/HOW_TO_RELEASE +4 -0
  5. data/LICENSE +1 -1
  6. data/README.textile +111 -0
  7. data/Rakefile +37 -78
  8. data/activewarehouse-etl.gemspec +7 -4
  9. data/db/migrate/20120229203554_create_tables.rb +37 -0
  10. data/db/schema.rb +45 -0
  11. data/examples/database.example.yml +3 -3
  12. data/lib/etl.rb +16 -0
  13. data/lib/etl/commands/etl.rb +1 -0
  14. data/lib/etl/control/control.rb +1 -1
  15. data/lib/etl/control/destination.rb +5 -16
  16. data/lib/etl/control/destination/csv_destination.rb +122 -0
  17. data/lib/etl/control/destination/excel_destination.rb +1 -1
  18. data/lib/etl/control/destination/insert_update_database_destination.rb +6 -3
  19. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  20. data/lib/etl/control/source.rb +39 -4
  21. data/lib/etl/control/source/database_source.rb +6 -1
  22. data/lib/etl/control/source/file_source.rb +4 -0
  23. data/lib/etl/control/source/mysql_streamer.rb +31 -0
  24. data/lib/etl/engine.rb +40 -20
  25. data/lib/etl/parser/{delimited_parser.rb → csv_parser.rb} +3 -3
  26. data/lib/etl/parser/excel_parser.rb +1 -1
  27. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  28. data/lib/etl/processor/bulk_import_processor.rb +11 -0
  29. data/lib/etl/processor/check_exist_processor.rb +6 -6
  30. data/lib/etl/processor/check_unique_processor.rb +4 -0
  31. data/lib/etl/processor/database_join_processor.rb +25 -4
  32. data/lib/etl/processor/encode_processor.rb +0 -2
  33. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  34. data/lib/etl/processor/imapattachment_downloader_processor.rb +2 -2
  35. data/lib/etl/processor/pop3attachment_downloader_processor.rb +2 -2
  36. data/lib/etl/processor/row_processor.rb +10 -0
  37. data/lib/etl/processor/sftp_downloader_processor.rb +1 -1
  38. data/lib/etl/processor/sftp_uploader_processor.rb +1 -1
  39. data/lib/etl/processor/truncate_processor.rb +4 -1
  40. data/lib/etl/processor/zip_file_processor.rb +1 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +57 -15
  42. data/lib/etl/transform/md5_transform.rb +13 -0
  43. data/lib/etl/transform/{string_to_datetime_transform.rb → string_to_date_time_transform.rb} +0 -0
  44. data/lib/etl/version.rb +1 -1
  45. data/test/.gitignore +0 -1
  46. data/test/check_exist_processor_test.rb +89 -0
  47. data/test/check_unique_processor_test.rb +40 -0
  48. data/test/config/.gitignore +1 -0
  49. data/test/config/database.yml +28 -0
  50. data/test/config/{Gemfile.rails-3.0.x → gemfiles/Gemfile.rails-3.0.x} +1 -1
  51. data/test/config/{Gemfile.rails-2.3.x → gemfiles/Gemfile.rails-3.1.x} +1 -1
  52. data/test/config/gemfiles/Gemfile.rails-3.2.x +3 -0
  53. data/test/config/gemfiles/common.rb +29 -0
  54. data/test/control_test.rb +2 -2
  55. data/test/data/nokogiri.xml +38 -0
  56. data/test/database_join_processor_test.rb +43 -0
  57. data/test/delimited.ctl +1 -1
  58. data/test/delimited_absolute.ctl +1 -3
  59. data/test/delimited_destination_db.ctl +1 -3
  60. data/test/delimited_excel.ctl +1 -1
  61. data/test/delimited_insert_update.ctl +1 -1
  62. data/test/delimited_update.ctl +1 -1
  63. data/test/delimited_with_bulk_load.ctl +2 -2
  64. data/test/destination_test.rb +0 -4
  65. data/test/encode_processor_test.rb +2 -0
  66. data/test/engine_test.rb +65 -19
  67. data/test/ensure_fields_presence_processor_test.rb +33 -0
  68. data/test/foreign_key_lookup_transform_test.rb +50 -0
  69. data/test/multiple_delimited.ctl +1 -1
  70. data/test/multiple_source_delimited.ctl +2 -2
  71. data/test/nokogiri_all.ctl +35 -0
  72. data/test/nokogiri_select.ctl +35 -0
  73. data/test/nokogiri_test.rb +35 -0
  74. data/test/parser_test.rb +2 -2
  75. data/test/performance/delimited.ctl +1 -1
  76. data/test/processor_test.rb +0 -3
  77. data/test/scd_test.rb +2 -8
  78. data/test/scd_test_type_1.ctl +1 -1
  79. data/test/scd_test_type_2.ctl +1 -1
  80. data/test/screen_test.rb +2 -3
  81. data/test/source_test.rb +19 -6
  82. data/test/test_helper.rb +6 -8
  83. data/test/truncate_processor_test.rb +37 -0
  84. metadata +121 -144
  85. data/README +0 -101
  86. data/active_support_logger.patch +0 -78
  87. data/test-matrix.yml +0 -10
  88. data/test/config/Gemfile.rails-2.3.x.lock +0 -38
  89. data/test/config/Gemfile.rails-3.0.x.lock +0 -49
  90. data/test/config/common.rb +0 -21
  91. data/test/connection/mysql/connection.rb +0 -9
  92. data/test/connection/mysql/schema.sql +0 -36
  93. data/test/connection/postgresql/connection.rb +0 -13
  94. data/test/connection/postgresql/schema.sql +0 -39
  95. data/test/vendor/adapter_extensions-0.5.0/CHANGELOG +0 -26
  96. data/test/vendor/adapter_extensions-0.5.0/LICENSE +0 -16
  97. data/test/vendor/adapter_extensions-0.5.0/README +0 -7
  98. data/test/vendor/adapter_extensions-0.5.0/Rakefile +0 -158
  99. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions.rb +0 -12
  100. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/abstract_adapter.rb +0 -44
  101. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/mysql_adapter.rb +0 -63
  102. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/postgresql_adapter.rb +0 -52
  103. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/sqlserver_adapter.rb +0 -44
  104. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/version.rb +0 -10
@@ -1,7 +0,0 @@
1
- This library provides extensions to Rails' ActiveRecord adapters.
2
-
3
- As of version 0.5, adapter_extensions has dependencies on ActiveSupport and ActiveRecord 2.1.x or higher.
4
-
5
- To use the MySQL adapter extensions with Rails 2.x, you must patch the mysql_adapter with the mysql_adapter_opt_local_infile.patch.
6
-
7
- To execute the unit tests you must first construct an adapter_extensions_unittest database.
@@ -1,158 +0,0 @@
1
- require 'rake'
2
- require 'rake/testtask'
3
- require 'rake/rdoctask'
4
- require 'rake/packagetask'
5
- require 'rake/gempackagetask'
6
- require 'rake/contrib/rubyforgepublisher'
7
- require 'date'
8
-
9
- require File.join(File.dirname(__FILE__), 'lib/adapter_extensions', 'version')
10
-
11
- PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
12
- PKG_NAME = 'adapter_extensions'
13
- PKG_VERSION = AdapterExtensions::VERSION::STRING + PKG_BUILD
14
- PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
15
- PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
16
-
17
- RELEASE_NAME = "REL #{PKG_VERSION}"
18
-
19
- RUBY_FORGE_PROJECT = "activewarehouse"
20
- RUBY_FORGE_USER = "aeden"
21
-
22
- desc 'Default: run unit tests.'
23
- task :default => :test
24
-
25
- desc 'Test the ETL application.'
26
- Rake::TestTask.new(:test) do |t|
27
- t.libs << 'lib'
28
- t.pattern = 'test/**/*_test.rb'
29
- t.verbose = true
30
- # TODO: reset the database
31
- end
32
-
33
- namespace :rcov do
34
- desc 'Measures test coverage'
35
- task :test do
36
- rm_f 'coverage.data'
37
- mkdir 'coverage' unless File.exist?('coverage')
38
- rcov = "rcov --aggregate coverage.data --text-summary -Ilib"
39
- system("#{rcov} test/*_test.rb test/**/*_test.rb")
40
- system("open coverage/index.html") if PLATFORM['darwin']
41
- end
42
- end
43
-
44
- desc 'Generate documentation for the AdapterExtensions library.'
45
- Rake::RDocTask.new(:rdoc) do |rdoc|
46
- rdoc.rdoc_dir = 'rdoc'
47
- rdoc.title = 'Extensions for Rails adapters'
48
- rdoc.options << '--line-numbers' << '--inline-source'
49
- rdoc.rdoc_files.include('README')
50
- rdoc.rdoc_files.include('lib/**/*.rb')
51
- end
52
-
53
- PKG_FILES = FileList[
54
- 'CHANGELOG',
55
- 'README',
56
- 'LICENSE',
57
- 'Rakefile',
58
- 'doc/**/*',
59
- 'lib/**/*',
60
- ] - [ 'test' ]
61
-
62
- spec = Gem::Specification.new do |s|
63
- s.name = 'adapter_extensions'
64
- s.version = PKG_VERSION
65
- s.summary = "Extensions to Rails ActiveRecord adapters."
66
- s.description = <<-EOF
67
- Provides various extensions to the Rails ActiveRecord adapters.
68
- EOF
69
-
70
- s.add_dependency('rake', '>= 0.8.3')
71
- s.add_dependency('activesupport', '>= 2.1.0')
72
- s.add_dependency('activerecord', '>= 2.1.0')
73
- s.add_dependency('fastercsv', '>= 1.0.0')
74
-
75
- s.rdoc_options << '--exclude' << '.'
76
- s.has_rdoc = false
77
-
78
- s.files = PKG_FILES.to_a.delete_if {|f| f.include?('.svn')}
79
- s.require_path = 'lib'
80
-
81
- s.author = "Anthony Eden"
82
- s.email = "anthonyeden@gmail.com"
83
- s.homepage = "http://activewarehouse.rubyforge.org/adapter_extensions"
84
- s.rubyforge_project = "activewarehouse"
85
- end
86
-
87
- Rake::GemPackageTask.new(spec) do |pkg|
88
- pkg.gem_spec = spec
89
- pkg.need_tar = true
90
- pkg.need_zip = true
91
- end
92
-
93
- namespace :github do
94
- desc "Update Github Gemspec"
95
- task :update_gemspec do
96
- File.open(File.join(File.dirname(__FILE__), "#{spec.name}.gemspec"), "w"){|f| f << spec.to_ruby}
97
- end
98
- end
99
-
100
-
101
-
102
- desc "Generate code statistics"
103
- task :lines do
104
- lines, codelines, total_lines, total_codelines = 0, 0, 0, 0
105
-
106
- for file_name in FileList["lib/**/*.rb"]
107
- next if file_name =~ /vendor/
108
- f = File.open(file_name)
109
-
110
- while line = f.gets
111
- lines += 1
112
- next if line =~ /^\s*$/
113
- next if line =~ /^\s*#/
114
- codelines += 1
115
- end
116
- puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"
117
-
118
- total_lines += lines
119
- total_codelines += codelines
120
-
121
- lines, codelines = 0, 0
122
- end
123
-
124
- puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
125
- end
126
-
127
- desc "Publish the release files to RubyForge."
128
- task :release => [ :package ] do
129
- `rubyforge login`
130
-
131
- for ext in %w( gem tgz zip )
132
- release_command = "rubyforge add_release activewarehouse #{PKG_NAME} 'REL #{PKG_VERSION}' pkg/#{PKG_NAME}-#{PKG_VERSION}.#{ext}"
133
- puts release_command
134
- system(release_command)
135
- end
136
- end
137
-
138
- desc "Publish the API documentation"
139
- task :pdoc => [:rdoc] do
140
- Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/adapter_extensions/rdoc", "rdoc").upload
141
- end
142
-
143
- desc "Install the gem from a local generated package"
144
- task :install => [:package] do
145
- windows = RUBY_PLATFORM =~ /mswin/
146
- sudo = windows ? '' : 'sudo'
147
- gem = windows ? 'gem.bat' : 'gem'
148
- `#{sudo} #{gem} install pkg/#{PKG_NAME}-#{PKG_VERSION}`
149
- end
150
-
151
- desc "Reinstall the gem from a local package copy"
152
- task :reinstall => [:package] do
153
- windows = RUBY_PLATFORM =~ /mswin/
154
- sudo = windows ? '' : 'sudo'
155
- gem = windows ? 'gem.bat' : 'gem'
156
- `#{sudo} #{gem} uninstall #{PKG_NAME} -x`
157
- `#{sudo} #{gem} install pkg/#{PKG_NAME}-#{PKG_VERSION}`
158
- end
@@ -1,12 +0,0 @@
1
- # Extensions to the Rails ActiveRecord adapters.
2
- #
3
- # Requiring this file will require all of the necessary files to function.
4
-
5
- puts "Using AdapterExtensions"
6
-
7
- require 'rubygems'
8
- require 'active_support'
9
- require 'active_record'
10
-
11
- $:.unshift(File.dirname(__FILE__))
12
- Dir[File.dirname(__FILE__) + "/adapter_extensions/**/*.rb"].each { |file| require(file) }
@@ -1,44 +0,0 @@
1
- # This source file contains extensions to the abstract adapter.
2
- module ActiveRecord #:nodoc:
3
- module ConnectionAdapters #:nodoc:
4
- # Extensions to the AbstractAdapter. In some cases a default implementation
5
- # is provided, in others it is adapter-dependent and the method will
6
- # raise a NotImplementedError if the adapter does not implement that method
7
- class AbstractAdapter
8
- # Truncate the specified table
9
- def truncate(table_name)
10
- execute("TRUNCATE TABLE #{table_name}")
11
- end
12
-
13
- # Bulk loading interface. Load the data from the specified file into the
14
- # given table. Note that options will be adapter-dependent.
15
- def bulk_load(file, table_name, options={})
16
- raise ArgumentError, "#{file} does not exist" unless File.exist?(file)
17
- raise ArgumentError, "#{table_name} does not exist" unless tables.include?(table_name)
18
- do_bulk_load(file, table_name, options)
19
- end
20
-
21
- # SQL select into statement constructs a new table from the results
22
- # of a select. It is used to select data from a table and create a new
23
- # table with its result set at the same time. Note that this method
24
- # name does not necessarily match the implementation. E.g. MySQL's
25
- # version of this is 'CREATE TABLE ... AS SELECT ...'
26
- def support_select_into_table?
27
- false
28
- end
29
-
30
- # Add a chunk of SQL to the given query that will create a new table and
31
- # execute the select into that table.
32
- def add_select_into_table(new_table_name, sql_query)
33
- raise NotImplementedError, "add_select_into_table is an abstract method"
34
- end
35
-
36
- protected
37
-
38
- # for subclasses to implement
39
- def do_bulk_load(file, table_name, options={})
40
- raise NotImplementedError, "do_bulk_load is an abstract method"
41
- end
42
- end
43
- end
44
- end
@@ -1,63 +0,0 @@
1
- # Source code for the MysqlAdapter extensions.
2
- module ActiveRecord #:nodoc:
3
- module ConnectionAdapters #:nodoc:
4
- # Adds new functionality to ActiveRecord MysqlAdapter.
5
- class MysqlAdapter < AbstractAdapter
6
-
7
- def support_select_into_table?
8
- true
9
- end
10
-
11
- # Inserts an INTO table_name clause to the sql_query.
12
- def add_select_into_table(new_table_name, sql_query)
13
- "CREATE TABLE #{new_table_name} " + sql_query
14
- end
15
-
16
- # Copy the specified table.
17
- def copy_table(old_table_name, new_table_name)
18
- transaction do
19
- execute "CREATE TABLE #{new_table_name} LIKE #{old_table_name}"
20
- execute "INSERT INTO #{new_table_name} SELECT * FROM #{old_table_name}"
21
- end
22
- end
23
-
24
- protected
25
- # Call +bulk_load+, as that method wraps this method.
26
- #
27
- # Bulk load the data in the specified file. This implementation always uses the LOCAL keyword
28
- # so the file must be found locally, not on the remote server, to be loaded.
29
- #
30
- # Options:
31
- # * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file
32
- # * <tt>:columns</tt> -- Array of column names defining the source file column order
33
- # * <tt>:fields</tt> -- Hash of options for fields:
34
- # * <tt>:delimited_by</tt> -- The field delimiter
35
- # * <tt>:enclosed_by</tt> -- The field enclosure
36
- def do_bulk_load(file, table_name, options={})
37
- return if File.size(file) == 0
38
-
39
- # an unfortunate hack - setting the bulk load option after the connection has been
40
- # established does not seem to have any effect, and since the connection is made when
41
- # active-record is loaded, there's no chance for us to sneak it in earlier. So we
42
- # disconnect, set the option, then reconnect - fortunately, this only needs to happen once.
43
- unless @bulk_load_enabled
44
- disconnect!
45
- @connection.options(Mysql::OPT_LOCAL_INFILE, true)
46
- connect
47
- @bulk_load_enabled = true
48
- end
49
-
50
- q = "LOAD DATA LOCAL INFILE '#{file}' INTO TABLE #{table_name}"
51
- if options[:fields]
52
- q << " FIELDS"
53
- q << " TERMINATED BY '#{options[:fields][:delimited_by]}'" if options[:fields][:delimited_by]
54
- q << " ENCLOSED BY '#{options[:fields][:enclosed_by]}'" if options[:fields][:enclosed_by]
55
- end
56
- q << " IGNORE #{options[:ignore]} LINES" if options[:ignore]
57
- q << " (#{options[:columns].join(',')})" if options[:columns]
58
- execute(q)
59
- end
60
-
61
- end
62
- end
63
- end
@@ -1,52 +0,0 @@
1
- # Source code for the PostgreSQLAdapter extensions.
2
- module ActiveRecord #:nodoc:
3
- module ConnectionAdapters #:nodoc:
4
- # Adds new functionality to ActiveRecord PostgreSQLAdapter.
5
- class PostgreSQLAdapter < AbstractAdapter
6
- def support_select_into_table?
7
- true
8
- end
9
-
10
- # Inserts an INTO table_name clause to the sql_query.
11
- def add_select_into_table(new_table_name, sql_query)
12
- sql_query.sub(/FROM/i, "INTO #{new_table_name} FROM")
13
- end
14
-
15
- # Copy the specified table.
16
- def copy_table(old_table_name, new_table_name)
17
- execute add_select_into_table(new_table_name, "SELECT * FROM #{old_table_name}")
18
- end
19
-
20
- protected
21
- # Call +bulk_load+, as that method wraps this method.
22
- #
23
- # Bulk load the data in the specified file.
24
- #
25
- # Options:
26
- # * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file. In the case of PostgreSQL
27
- # only the first line will be ignored from the source file regardless of the number of lines specified.
28
- # * <tt>:columns</tt> -- Array of column names defining the source file column order
29
- # * <tt>:fields</tt> -- Hash of options for fields:
30
- # * <tt>:delimited_by</tt> -- The field delimiter
31
- # * <tt>:null_string</tt> -- The string that should be interpreted as NULL (in addition to \N)
32
- # * <tt>:enclosed_by</tt> -- The field enclosure
33
- def do_bulk_load(file, table_name, options={})
34
- q = "COPY #{table_name} "
35
- q << "(#{options[:columns].join(',')}) " if options[:columns]
36
- q << "FROM '#{File.expand_path(file)}' "
37
- if options[:fields]
38
- q << "WITH "
39
- q << "DELIMITER '#{options[:fields][:delimited_by]}' " if options[:fields][:delimited_by]
40
- q << "NULL '#{options[:fields][:null_string]}'" if options[:fields][:null_string]
41
- if options[:fields][:enclosed_by] || options[:ignore] && options[:ignore] > 0
42
- q << "CSV "
43
- q << "HEADER " if options[:ignore] && options[:ignore] > 0
44
- q << "QUOTE '#{options[:fields][:enclosed_by]}' " if options[:fields][:enclosed_by]
45
- end
46
- end
47
-
48
- execute(q)
49
- end
50
- end
51
- end
52
- end
@@ -1,44 +0,0 @@
1
- # Source code for the SQLServerAdapter extensions.
2
- module ActiveRecord #:nodoc:
3
- module ConnectionAdapters #:nodoc:
4
- # Adds new functionality to ActiveRecord SQLServerAdapter.
5
- class SQLServerAdapter < AbstractAdapter
6
- def support_select_into_table?
7
- true
8
- end
9
-
10
- # Inserts an INTO table_name clause to the sql_query.
11
- def add_select_into_table(new_table_name, sql_query)
12
- sql_query.sub(/FROM/i, "INTO #{new_table_name} FROM")
13
- end
14
-
15
- # Copy the specified table.
16
- def copy_table(old_table_name, new_table_name)
17
- execute add_select_into_table(new_table_name, "SELECT * FROM #{old_table_name}")
18
- end
19
-
20
- protected
21
- # Call +bulk_load+, as that method wraps this method.
22
- #
23
- # Bulk load the data in the specified file. This implementation relies
24
- # on bcp being in your PATH.
25
- #
26
- # Options:
27
- # * <tt>:ignore</tt> -- Ignore the specified number of lines from the source file
28
- # * <tt>:columns</tt> -- Array of column names defining the source file column order
29
- # * <tt>:fields</tt> -- Hash of options for fields:
30
- # * <tt>:delimited_by</tt> -- The field delimiter
31
- # * <tt>:enclosed_by</tt> -- The field enclosure
32
- def do_bulk_load(file, table_name, options={})
33
- env_name = options[:env] || RAILS_ENV
34
- config = ActiveRecord::Base.configurations[env_name]
35
- puts "Loading table \"#{table_name}\" from file \"#{filename}\""
36
- cmd = "bcp \"#{config['database']}.dbo.#{table_name}\" in " +
37
- "\"#{filename}\" -S \"#{config['host']}\" -c " +
38
- "-t \"#{options[:delimited_by]}\" -b10000 -a8192 -q -E -U \"#{config['username']}\" " +
39
- "-P \"#{config['password']}\" -e \"#{filename}.in.errors\""
40
- `#{cmd}`
41
- end
42
- end
43
- end
44
- end
@@ -1,10 +0,0 @@
1
- # Source file identifying the version of AdapterExtensions in this package
2
- module AdapterExtensions#:nodoc:
3
- module VERSION #:nodoc:
4
- MAJOR = 0
5
- MINOR = 5
6
- TINY = 0
7
-
8
- STRING = [MAJOR, MINOR, TINY].join('.')
9
- end
10
- end