activewarehouse-etl 0.9.5.rc1 → 1.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104)
  1. data/.standalone_migrations +2 -0
  2. data/.travis.yml +15 -0
  3. data/CHANGELOG +10 -1
  4. data/HOW_TO_RELEASE +4 -0
  5. data/LICENSE +1 -1
  6. data/README.textile +111 -0
  7. data/Rakefile +37 -78
  8. data/activewarehouse-etl.gemspec +7 -4
  9. data/db/migrate/20120229203554_create_tables.rb +37 -0
  10. data/db/schema.rb +45 -0
  11. data/examples/database.example.yml +3 -3
  12. data/lib/etl.rb +16 -0
  13. data/lib/etl/commands/etl.rb +1 -0
  14. data/lib/etl/control/control.rb +1 -1
  15. data/lib/etl/control/destination.rb +5 -16
  16. data/lib/etl/control/destination/csv_destination.rb +122 -0
  17. data/lib/etl/control/destination/excel_destination.rb +1 -1
  18. data/lib/etl/control/destination/insert_update_database_destination.rb +6 -3
  19. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  20. data/lib/etl/control/source.rb +39 -4
  21. data/lib/etl/control/source/database_source.rb +6 -1
  22. data/lib/etl/control/source/file_source.rb +4 -0
  23. data/lib/etl/control/source/mysql_streamer.rb +31 -0
  24. data/lib/etl/engine.rb +40 -20
  25. data/lib/etl/parser/{delimited_parser.rb → csv_parser.rb} +3 -3
  26. data/lib/etl/parser/excel_parser.rb +1 -1
  27. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  28. data/lib/etl/processor/bulk_import_processor.rb +11 -0
  29. data/lib/etl/processor/check_exist_processor.rb +6 -6
  30. data/lib/etl/processor/check_unique_processor.rb +4 -0
  31. data/lib/etl/processor/database_join_processor.rb +25 -4
  32. data/lib/etl/processor/encode_processor.rb +0 -2
  33. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  34. data/lib/etl/processor/imapattachment_downloader_processor.rb +2 -2
  35. data/lib/etl/processor/pop3attachment_downloader_processor.rb +2 -2
  36. data/lib/etl/processor/row_processor.rb +10 -0
  37. data/lib/etl/processor/sftp_downloader_processor.rb +1 -1
  38. data/lib/etl/processor/sftp_uploader_processor.rb +1 -1
  39. data/lib/etl/processor/truncate_processor.rb +4 -1
  40. data/lib/etl/processor/zip_file_processor.rb +1 -1
  41. data/lib/etl/transform/foreign_key_lookup_transform.rb +57 -15
  42. data/lib/etl/transform/md5_transform.rb +13 -0
  43. data/lib/etl/transform/{string_to_datetime_transform.rb → string_to_date_time_transform.rb} +0 -0
  44. data/lib/etl/version.rb +1 -1
  45. data/test/.gitignore +0 -1
  46. data/test/check_exist_processor_test.rb +89 -0
  47. data/test/check_unique_processor_test.rb +40 -0
  48. data/test/config/.gitignore +1 -0
  49. data/test/config/database.yml +28 -0
  50. data/test/config/{Gemfile.rails-3.0.x → gemfiles/Gemfile.rails-3.0.x} +1 -1
  51. data/test/config/{Gemfile.rails-2.3.x → gemfiles/Gemfile.rails-3.1.x} +1 -1
  52. data/test/config/gemfiles/Gemfile.rails-3.2.x +3 -0
  53. data/test/config/gemfiles/common.rb +29 -0
  54. data/test/control_test.rb +2 -2
  55. data/test/data/nokogiri.xml +38 -0
  56. data/test/database_join_processor_test.rb +43 -0
  57. data/test/delimited.ctl +1 -1
  58. data/test/delimited_absolute.ctl +1 -3
  59. data/test/delimited_destination_db.ctl +1 -3
  60. data/test/delimited_excel.ctl +1 -1
  61. data/test/delimited_insert_update.ctl +1 -1
  62. data/test/delimited_update.ctl +1 -1
  63. data/test/delimited_with_bulk_load.ctl +2 -2
  64. data/test/destination_test.rb +0 -4
  65. data/test/encode_processor_test.rb +2 -0
  66. data/test/engine_test.rb +65 -19
  67. data/test/ensure_fields_presence_processor_test.rb +33 -0
  68. data/test/foreign_key_lookup_transform_test.rb +50 -0
  69. data/test/multiple_delimited.ctl +1 -1
  70. data/test/multiple_source_delimited.ctl +2 -2
  71. data/test/nokogiri_all.ctl +35 -0
  72. data/test/nokogiri_select.ctl +35 -0
  73. data/test/nokogiri_test.rb +35 -0
  74. data/test/parser_test.rb +2 -2
  75. data/test/performance/delimited.ctl +1 -1
  76. data/test/processor_test.rb +0 -3
  77. data/test/scd_test.rb +2 -8
  78. data/test/scd_test_type_1.ctl +1 -1
  79. data/test/scd_test_type_2.ctl +1 -1
  80. data/test/screen_test.rb +2 -3
  81. data/test/source_test.rb +19 -6
  82. data/test/test_helper.rb +6 -8
  83. data/test/truncate_processor_test.rb +37 -0
  84. metadata +121 -144
  85. data/README +0 -101
  86. data/active_support_logger.patch +0 -78
  87. data/test-matrix.yml +0 -10
  88. data/test/config/Gemfile.rails-2.3.x.lock +0 -38
  89. data/test/config/Gemfile.rails-3.0.x.lock +0 -49
  90. data/test/config/common.rb +0 -21
  91. data/test/connection/mysql/connection.rb +0 -9
  92. data/test/connection/mysql/schema.sql +0 -36
  93. data/test/connection/postgresql/connection.rb +0 -13
  94. data/test/connection/postgresql/schema.sql +0 -39
  95. data/test/vendor/adapter_extensions-0.5.0/CHANGELOG +0 -26
  96. data/test/vendor/adapter_extensions-0.5.0/LICENSE +0 -16
  97. data/test/vendor/adapter_extensions-0.5.0/README +0 -7
  98. data/test/vendor/adapter_extensions-0.5.0/Rakefile +0 -158
  99. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions.rb +0 -12
  100. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/abstract_adapter.rb +0 -44
  101. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/mysql_adapter.rb +0 -63
  102. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/postgresql_adapter.rb +0 -52
  103. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/connection_adapters/sqlserver_adapter.rb +0 -44
  104. data/test/vendor/adapter_extensions-0.5.0/lib/adapter_extensions/version.rb +0 -10
data/README DELETED
@@ -1,101 +0,0 @@
1
- Ruby Extract-Transform-Load (ETL) tool.
2
-
3
- == Requirements
4
-
5
- * Ruby 1.8.5 or higher
6
- * Rubygems
7
-
8
- == Online Documentation
9
-
10
- Available at https://github.com/activewarehouse/activewarehouse-etl/wiki
11
-
12
- == Features
13
-
14
- Current supported features:
15
-
16
- * ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
17
- * Multiple source types. Current supported types:
18
- * Fixed-width and delimited text files
19
- * XML files through SAX
20
- * Apache combined log format
21
- * Multiple destination types - file and database destinations
22
- * Support for extracting from multiple sources in a single job
23
- * Support for writing to multiple destinations in a single job
24
- * A variety of built-in transformations are included:
25
- * Date-to-string, string-to-date, string-to-datetime, string-to-timestamp
26
- * Type transformation supporting strings, integers, floats and big decimals
27
- * Trim
28
- * SHA-1
29
- * Decode from an external decode file
30
- * Default replacement for empty values
31
- * Ordinalize
32
- * Hierarchy lookup
33
- * Foreign key lookup
34
- * Ruby blocks
35
- * Any custom transformation class
36
- * A variety of built-in row-level processors
37
- * Check exists processor to determine if the record already exists in the destination database
38
- * Check unique processor to determine whether a matching record was processed during this job execution
39
- * Copy field
40
- * Rename field
41
- * Hierarchy exploder which takes a tree structure defined through a parent id and explodes it into a hierarchy bridge table
42
- * Surrogate key generator including support for looking up the last surrogate key from the target table using a custom query
43
- * Sequence generator including support for context-sensitive sequences where the context can be defined as a combination of fields from the source data
44
- * New row-level processors can easily be defined and applied
45
- * Pre-processing
46
- * Truncate processor
47
- * Post-processing
48
- * Bulk import using native RDBMS bulk loader tools
49
- * Virtual fields - Add a field to the destination data which doesn't exist in the source data
50
- * Built in job and record meta data
51
- * Support for type 1 and type 2 slowly changing dimensions
52
- * Automated effective date and end date time stamping for type 2
53
- * CRC checking
54
-
55
- == Dependencies
56
- ActiveWarehouse ETL depends on the following gems:
57
- * ActiveSupport Gem
58
- * ActiveRecord Gem
59
- * FasterCSV Gem
60
- * AdapterExtensions Gem
61
-
62
- == Usage
63
- Once the ActiveWarehouse ETL gem is installed jobs can be invoked using the
64
- included `etl` script. The etl script includes several command line options
65
- and can process multiple control files at a time.
66
-
67
- Command line options:
68
- * <tt>--help, -h</tt>: Display the usage message.
69
- * <tt>--config, -c</tt>: Specify a database.yml configuration file to use.
70
- * <tt>--limit, -l</tt>: Specify a limit to the number of rows to process. This option is currently only applicable to database sources.
71
- * <tt>--offset, -o</tt>: Specify the start offset for reading from the source. This option is currently only applicable to database sources.
72
- * <tt>--newlog, -n</tt>: Instruct the engine to create a new ETL log rather than append to the last ETL log.
73
- * <tt>--skip-bulk-import, -s</tt>: Skip any bulk imports.
74
- * <tt>--read-locally</tt>: Read from the local cache (skip source extraction)
75
-
76
- == Control File Examples
77
- Control file examples can be found in the examples directory.
78
-
79
- == Running Tests
80
-
81
- Current state:
82
- - 11 failures on MySQL
83
- - 1 failure on Postgres
84
-
85
- The tests require:
86
- - gem install shoulda
87
- - gem install flexmock
88
- - gem install pg (if you want to run the tests on pg)
89
- - gem install spreadsheet
90
- - gem install tmail
91
-
92
- The tests subfolder contains examples database.yml for mysql and postgres.
93
-
94
- To run the tests:
95
- - rake test DB=postgresql (for postgres)
96
- - otherwise just rake test
97
-
98
- == Feedback
99
- This is a work in progress. Comments should be made on the
100
- activewarehouse-discuss mailing list at the moment. Contributions are always
101
- welcome.
data/active_support_logger.patch DELETED
@@ -1,78 +0,0 @@
1
- Index: lib/active_support/clean_logger.rb
2
- ===================================================================
3
- --- lib/active_support/clean_logger.rb (revision 5963)
4
- +++ lib/active_support/clean_logger.rb (working copy)
5
- @@ -1,10 +1,21 @@
6
- require 'logger'
7
- require File.dirname(__FILE__) + '/core_ext/class/attribute_accessors'
8
-
9
- -class Logger #:nodoc:
10
- +# Extensions to the built in Ruby logger.
11
- +#
12
- +# If you want to use the default log formatter as defined in the Ruby core, then you
13
- +# will need to set the formatter for the logger as in:
14
- +#
15
- +# logger.formatter = Formatter.new
16
- +#
17
- +# You can then specify the datetime format, for example:
18
- +#
19
- +# logger.datetime_format = "%Y-%m-%d"
20
- +class Logger
21
- + # Set to false to disable the silencer
22
- cattr_accessor :silencer
23
- self.silencer = true
24
- -
25
- +
26
- # Silences the logger for the duration of the block.
27
- def silence(temporary_level = Logger::ERROR)
28
- if silencer
29
- @@ -18,6 +29,35 @@
30
- yield self
31
- end
32
- end
33
- +
34
- + alias :old_datetime_format= :datetime_format=
35
- + # Logging date-time format (string passed to +strftime+). Ignored if the formatter
36
- + # does not respond to datetime_format=.
37
- + def datetime_format=(datetime_format)
38
- + formatter.datetime_format = datetime_format if formatter.respond_to?(:datetime_format=)
39
- + end
40
- +
41
- + alias :old_datetime_format :datetime_format
42
- + # Get the logging datetime format. Returns nil if the formatter does not support
43
- + # datetime formatting.
44
- + def datetime_format
45
- + formatter.datetime_format if formatter.respond_to?(:datetime_format)
46
- + end
47
- +
48
- + alias :old_formatter :formatter
49
- + # Get the current formatter. The default formatter is a SimpleFormatter which only
50
- + # displays the log message
51
- + def formatter
52
- + @formatter ||= SimpleFormatter.new
53
- + end
54
- +
55
- + # Simple formatter which only displays the message.
56
- + class SimpleFormatter < Logger::Formatter
57
- + # This method is invoked when a log event occurs
58
- + def call(severity, timestamp, progname, msg)
59
- + "#{msg}\n"
60
- + end
61
- + end
62
-
63
- private
64
- alias old_format_message format_message
65
- @@ -28,11 +68,11 @@
66
- # with Logger from 1.8.3 and vice versa.
67
- if method_defined?(:formatter=)
68
- def format_message(severity, timestamp, progname, msg)
69
- - "#{msg}\n"
70
- + formatter.call(severity, timestamp, progname, msg)
71
- end
72
- else
73
- def format_message(severity, timestamp, msg, progname)
74
- - "#{msg}\n"
75
- + formatter.call(severity, timestamp, progname, msg)
76
- end
77
- end
78
- end
data/test-matrix.yml DELETED
@@ -1,10 +0,0 @@
1
- rvm:
2
- - 1.8.7
3
- - 1.9.2
4
- # - jruby-1.6.2
5
- rails:
6
- - 2.3.x
7
- - 3.0.x
8
- database:
9
- - mysql
10
- - postgresql
data/test/config/Gemfile.rails-2.3.x.lock DELETED
@@ -1,38 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- activerecord (2.3.11)
5
- activesupport (= 2.3.11)
6
- activesupport (2.3.11)
7
- fastercsv (1.5.4)
8
- flexmock (0.9.0)
9
- mysql (2.8.1)
10
- net-sftp (2.0.5)
11
- net-ssh (>= 2.0.9)
12
- net-ssh (2.1.4)
13
- pg (0.11.0)
14
- rdoc (3.6.1)
15
- ruby-ole (1.2.11.1)
16
- shoulda (2.11.3)
17
- spreadsheet (0.6.5.4)
18
- ruby-ole (>= 1.0)
19
- tmail (1.2.7.1)
20
- zip (2.0.2)
21
-
22
- PLATFORMS
23
- java
24
- ruby
25
-
26
- DEPENDENCIES
27
- activerecord (= 2.3.11)
28
- activesupport (= 2.3.11)
29
- fastercsv (= 1.5.4)
30
- flexmock (= 0.9.0)
31
- mysql (= 2.8.1)
32
- net-sftp (= 2.0.5)
33
- pg (= 0.11.0)
34
- rdoc
35
- shoulda (= 2.11.3)
36
- spreadsheet (= 0.6.5.4)
37
- tmail (= 1.2.7.1)
38
- zip (= 2.0.2)
data/test/config/Gemfile.rails-3.0.x.lock DELETED
@@ -1,49 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- activemodel (3.0.7)
5
- activesupport (= 3.0.7)
6
- builder (~> 2.1.2)
7
- i18n (~> 0.5.0)
8
- activerecord (3.0.7)
9
- activemodel (= 3.0.7)
10
- activesupport (= 3.0.7)
11
- arel (~> 2.0.2)
12
- tzinfo (~> 0.3.23)
13
- activesupport (3.0.7)
14
- arel (2.0.10)
15
- builder (2.1.2)
16
- fastercsv (1.5.4)
17
- flexmock (0.9.0)
18
- i18n (0.5.0)
19
- mysql (2.8.1)
20
- net-sftp (2.0.5)
21
- net-ssh (>= 2.0.9)
22
- net-ssh (2.1.4)
23
- pg (0.11.0)
24
- rdoc (3.6.1)
25
- ruby-ole (1.2.11.1)
26
- shoulda (2.11.3)
27
- spreadsheet (0.6.5.4)
28
- ruby-ole (>= 1.0)
29
- tmail (1.2.7.1)
30
- tzinfo (0.3.27)
31
- zip (2.0.2)
32
-
33
- PLATFORMS
34
- java
35
- ruby
36
-
37
- DEPENDENCIES
38
- activerecord (= 3.0.7)
39
- activesupport (= 3.0.7)
40
- fastercsv (= 1.5.4)
41
- flexmock (= 0.9.0)
42
- mysql (= 2.8.1)
43
- net-sftp (= 2.0.5)
44
- pg (= 0.11.0)
45
- rdoc
46
- shoulda (= 2.11.3)
47
- spreadsheet (= 0.6.5.4)
48
- tmail (= 1.2.7.1)
49
- zip (= 2.0.2)
data/test/config/common.rb DELETED
@@ -1,21 +0,0 @@
1
- def common_gemfile(rails_version)
2
- source :rubygems
3
-
4
- # using explicit versions for the gems to avoid any weirdness later on
5
- gem "activesupport", rails_version
6
- gem "activerecord", rails_version
7
-
8
- gem "fastercsv", "1.5.4"
9
- gem "spreadsheet", "0.6.5.4"
10
- gem "tmail", "1.2.7.1"
11
- gem "net-sftp", "2.0.5"
12
- gem "zip", "2.0.2"
13
-
14
- gem "shoulda", "2.11.3"
15
- gem "flexmock", "0.9.0"
16
-
17
- gem "mysql", "2.8.1"
18
- gem "pg", "0.11.0"
19
-
20
- gem "rdoc"
21
- end
data/test/connection/mysql/connection.rb DELETED
@@ -1,9 +0,0 @@
1
- print "Using native MySQL\n"
2
-
3
- puts "Resetting database"
4
- conn = ETL::Engine.connection(:data_warehouse)
5
- conn.recreate_database(conn.current_database)
6
- conn.reconnect!
7
- lines = open(File.join(File.dirname(__FILE__), 'schema.sql')).readlines
8
- lines.join.split(';').each { |line| conn.execute(line) }
9
- conn.disconnect!
data/test/connection/mysql/schema.sql DELETED
@@ -1,36 +0,0 @@
1
- drop table if exists people;
2
- create table people (
3
- id int not null primary key,
4
- first_name char(255) not null,
5
- last_name char(255) not null,
6
- ssn char(64) not null
7
- );
8
- drop table if exists places;
9
- create table places (
10
- address text,
11
- city char(255),
12
- state char(255),
13
- country char(2)
14
- );
15
-
16
- drop table if exists person_dimension;
17
- create table person_dimension (
18
- id int not null primary key,
19
- first_name char(50),
20
- last_name char(50),
21
- address char(100),
22
- city char(50),
23
- state char(50),
24
- zip_code char(20),
25
- effective_date datetime,
26
- end_date datetime,
27
- latest_version boolean not null
28
- );
29
-
30
- drop table if exists truncate_test;
31
- create table truncate_test (
32
- x char(4)
33
- );
34
- insert into truncate_test (x) values ('a');
35
- insert into truncate_test (x) values ('b');
36
- insert into truncate_test (x) values ('c');
data/test/connection/postgresql/connection.rb DELETED
@@ -1,13 +0,0 @@
1
- print "Using PostgreSQL\n"
2
-
3
- puts "Resetting database"
4
- conn = ETL::Engine.connection(:data_warehouse)
5
-
6
- lines = open(File.join(File.dirname(__FILE__), 'schema.sql')).readlines
7
- lines.join.split(';').each_with_index do |line, index|
8
- begin
9
- conn.execute(line)
10
- rescue => e
11
- puts "failed to load line #{index}: #{e}"
12
- end
13
- end
data/test/connection/postgresql/schema.sql DELETED
@@ -1,39 +0,0 @@
1
- drop table people;
2
- create table people (
3
- id SERIAL PRIMARY KEY,
4
- first_name character varying(255) not null,
5
- /* null below allowed for bulk_import_with_empties.txt test */
6
- last_name character varying(255) null,
7
- ssn character varying(64) not null
8
- );
9
-
10
- drop table places;
11
- create table places (
12
- id SERIAL PRIMARY KEY,
13
- address text,
14
- city character varying(255),
15
- state character varying(255),
16
- country character varying(2)
17
- );
18
-
19
- drop table person_dimension;
20
- create table person_dimension (
21
- id SERIAL PRIMARY KEY,
22
- first_name character varying(50),
23
- last_name character varying(50),
24
- address character varying(100),
25
- city character varying(50),
26
- state character varying(50),
27
- zip_code character varying(20),
28
- effective_date timestamp without time zone,
29
- end_date timestamp without time zone,
30
- latest_version boolean not null
31
- );
32
-
33
- drop table truncate_test;
34
- create table truncate_test (
35
- x character varying(4)
36
- );
37
- insert into truncate_test (x) values ('a');
38
- insert into truncate_test (x) values ('b');
39
- insert into truncate_test (x) values ('c');
data/test/vendor/adapter_extensions-0.5.0/CHANGELOG DELETED
@@ -1,26 +0,0 @@
1
- 0.1.0 - March 5, 2007
2
- * Initial release
3
-
4
- 0.1.1 - March 5, 2007
5
- * Bug fixes
6
-
7
- 0.1.2 - March 5, 2007
8
- * Bug fixes
9
-
10
- 0.2.0 - March 6, 2007
11
- * SQL Server adapter included (Seth Ladd)
12
-
13
- 0.3.0 - March 8, 2007
14
- * PostgreSQL adapter included
15
- * Added tests for bulk loading
16
- * bulk_load method now handles table missing and file missing as error cases
17
-
18
- 0.3.1 - May 4, 2007
19
- * Added support for modifying SELECT statements to add an INSERT INTO.
20
-
21
- 0.4 - September 17, 2007
22
- * Added copy_table method that can copy the structure and data from one table to another. Currently implemented in MySQL (tested), PostgreSQL (tested) and SQL Server adapters (untested).
23
- * Added support for SELECT..INTO for PostgreSQL.
24
-
25
- 0.5 -
26
- * Updated dependencies for gem to current versions of ActiveRecord, ActiveSupport and Rake. May not be compatible with Rails versions less than 2.x.
data/test/vendor/adapter_extensions-0.5.0/LICENSE DELETED
@@ -1,16 +0,0 @@
1
- Copyright (c) 2007 Anthony Eden
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
4
- associated documentation files (the "Software"), to deal in the Software without restriction, including
5
- without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
6
- copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
7
- following conditions:
8
-
9
- The above copyright notice and this permission notice shall be included in all copies or substantial
10
- portions of the Software.
11
-
12
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
13
- LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
14
- NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
15
- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
16
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.