darrell-activewarehouse-etl 0.9.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +99 -0
  4. data/Rakefile +175 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl/batch/batch.rb +111 -0
  10. data/lib/etl/batch/directives.rb +55 -0
  11. data/lib/etl/batch.rb +2 -0
  12. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  13. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  14. data/lib/etl/builder.rb +2 -0
  15. data/lib/etl/commands/etl.rb +89 -0
  16. data/lib/etl/control/control.rb +405 -0
  17. data/lib/etl/control/destination/database_destination.rb +97 -0
  18. data/lib/etl/control/destination/file_destination.rb +126 -0
  19. data/lib/etl/control/destination.rb +448 -0
  20. data/lib/etl/control/source/database_source.rb +220 -0
  21. data/lib/etl/control/source/enumerable_source.rb +11 -0
  22. data/lib/etl/control/source/file_source.rb +90 -0
  23. data/lib/etl/control/source/model_source.rb +39 -0
  24. data/lib/etl/control/source.rb +109 -0
  25. data/lib/etl/control.rb +3 -0
  26. data/lib/etl/core_ext/time/calculations.rb +42 -0
  27. data/lib/etl/core_ext/time.rb +5 -0
  28. data/lib/etl/core_ext.rb +1 -0
  29. data/lib/etl/engine.rb +556 -0
  30. data/lib/etl/execution/base.rb +9 -0
  31. data/lib/etl/execution/batch.rb +8 -0
  32. data/lib/etl/execution/job.rb +8 -0
  33. data/lib/etl/execution/migration.rb +85 -0
  34. data/lib/etl/execution.rb +19 -0
  35. data/lib/etl/generator/generator.rb +20 -0
  36. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/http_tools.rb +139 -0
  39. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  40. data/lib/etl/parser/delimited_parser.rb +74 -0
  41. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  42. data/lib/etl/parser/parser.rb +41 -0
  43. data/lib/etl/parser/sax_parser.rb +218 -0
  44. data/lib/etl/parser/xml_parser.rb +65 -0
  45. data/lib/etl/parser.rb +11 -0
  46. data/lib/etl/processor/block_processor.rb +14 -0
  47. data/lib/etl/processor/bulk_import_processor.rb +83 -0
  48. data/lib/etl/processor/check_exist_processor.rb +80 -0
  49. data/lib/etl/processor/check_unique_processor.rb +35 -0
  50. data/lib/etl/processor/copy_field_processor.rb +26 -0
  51. data/lib/etl/processor/encode_processor.rb +55 -0
  52. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  53. data/lib/etl/processor/print_row_processor.rb +12 -0
  54. data/lib/etl/processor/processor.rb +25 -0
  55. data/lib/etl/processor/rename_processor.rb +24 -0
  56. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  57. data/lib/etl/processor/row_processor.rb +17 -0
  58. data/lib/etl/processor/sequence_processor.rb +23 -0
  59. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  60. data/lib/etl/processor/truncate_processor.rb +35 -0
  61. data/lib/etl/processor.rb +11 -0
  62. data/lib/etl/row.rb +20 -0
  63. data/lib/etl/screen/row_count_screen.rb +20 -0
  64. data/lib/etl/screen.rb +14 -0
  65. data/lib/etl/transform/block_transform.rb +13 -0
  66. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  67. data/lib/etl/transform/decode_transform.rb +51 -0
  68. data/lib/etl/transform/default_transform.rb +20 -0
  69. data/lib/etl/transform/foreign_key_lookup_transform.rb +169 -0
  70. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  71. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  72. data/lib/etl/transform/sha1_transform.rb +13 -0
  73. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  74. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  75. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  76. data/lib/etl/transform/transform.rb +61 -0
  77. data/lib/etl/transform/trim_transform.rb +26 -0
  78. data/lib/etl/transform/type_transform.rb +35 -0
  79. data/lib/etl/transform.rb +2 -0
  80. data/lib/etl/util.rb +59 -0
  81. data/lib/etl/version.rb +9 -0
  82. data/lib/etl.rb +83 -0
  83. metadata +245 -0
data/CHANGELOG ADDED
@@ -0,0 +1,198 @@
1
+ 0.1.0 - Dec 6, 2006
2
+ * Initial release
3
+
4
+ 0.2.0 - Dec 7, 2006
5
+ * Added an XML parser for source parsing
6
+ * Added support for compound key constraints in destinations via the
7
+ :unique => [] option
8
+ * Added ability to declare explicit columns in bulk import
9
+ * Added support for generators in destinations
10
+ * Added a SurrogateKeyGenerator for cases where the database doesn't support
11
+ auto generated surrogate keys
12
+
13
+ 0.3.0 - Dec 19, 2006
14
+ * Added support for calculated values in virtual fields with Proc
15
+
16
+ 0.4.0 - Jan 11, 2007
17
+ * Added :skip_lines option to file source configurations, which can be used
18
+ to skip the first n lines in the source data file
19
+ * Added better error handling in delimited parser - an error is now raised
20
+ if the expected and actual field lengths do not match
21
+ * Added :truncate option for database destination. Set to true to truncate
22
+ before importing data.
23
+ * Added support for :unique => [] option and virtual fields for the database
24
+ destination
25
+
26
+ 0.5.0 - Feb 17, 2007
27
+ * Changed require_gem to gem and added alias to allow for older versions of
28
+ rubygems.
29
+ * Added support for Hash in the source configuration where :name => :parser_name
30
+ defines the parser to use and :options => {} defines options to pass to the
31
+ parser.
32
+ * Added support for passing a custom Parser class in the source configuration.
33
+ * Removed the need to include Enumerable in each parser implementation.
34
+ * Added new date_to_string and string_to_date transformers.
35
+ * Implemented foreign_key_lookup transform including an ActiveRecordResolver.
36
+ * Added real time activity logging which is called when the etl bin script is
37
+ invoked.
38
+ * Improved error handling.
39
+ * Default logger level is now WARN.
40
+
41
+ 0.5.1 - Feb 18, 2007
42
+ * Fixed up truncate processor.
43
+ * Updated HOW_TO_RELEASE doc.
44
+
45
+ 0.5.2 - Feb 19, 2007
46
+ * Added error threshold.
47
+ * Fixed problem with transform error handling.
48
+
49
+ 0.6.0 - Mar 8, 2007
50
+ * Fixed missing method problem in validate in Control class.
51
+ * Removed control validation for now (source could be code in the control file).
52
+ * Transform interface now defined as taking 3 arguments, the field name, field
53
+ value and the row. This is not backwards compatible.
54
+ * Added HierarchyLookupTransform.
55
+ * Added DefaultTransform which will return a specified value if the initial
56
+ value is blank.
57
+ * Added row-level processing.
58
+ * Added HierarchyExploderProcessor which takes a single hierarchy row and
59
+ explodes it to multiple rows as used in a hierarchy bridge.
60
+ * Added ApacheCombinedLogParser which parses Apache Combined Log format,
61
+ including parsing of the
62
+ user agent string and the URI, returning a Hash.
63
+ * Fixed bug in SAX parser so that attributes are now set when the start_element
64
+ event is received.
65
+ * Added an HttpTools module which provides some parsing methods (for user agent
66
+ and URI).
67
+ * Database source now uses its own class for establishing an ActiveRecord
68
+ connection.
69
+ * Log files are now timestamped.
70
+ * Source files are now archived automatically during the extraction process
71
+ * Added a :condition option to the destination configuration Hash that accepts
72
+ a Proc with a single argument passed to it (the row).
73
+ * Added an :append_rows option to the destination configuration Hash that
74
+ accepts either a Hash (to append a single row) or an Array of Hashes (to
75
+ append multiple rows).
76
+ * Only print the read and written row counts if there is at least one source
77
+ and one destination respectively.
78
+ * Added a depends_on directive that accepts a list of arguments of either strings
79
+ or symbols. Each symbol is converted to a string and .ctl is appended;
80
+ strings are passed through directly. The dependencies are executed in the order
81
+ they are specified.
82
+ * The default field separator in the bulk loader is now a comma (was a tab).
83
+
84
+ 0.6.1 - Mar 22, 2007
85
+ * Added support for absolute paths in file sources
86
+ * Added CopyFieldProcessor
87
+
88
+ 0.7 - Apr 8, 2007
89
+ * Job execution is now tracked in a database. This means that ActiveRecord is
90
+ required regardless of the sources being used in the ETL scripts. An example
91
+ database configuration for the etl can be found in test/database.example.yml.
92
+ This file is loaded from either a.) the current working directory or b.) the
93
+ location specified using the -c command line argument when running the etl
94
+ command.
95
+ * etl script now supports the following command line arguments:
96
+ ** -h or --help: Prints the usage
97
+ ** -l or --limit: Specifies a limit for the number of source rows to read,
98
+ useful for testing your control files before executing a full ETL process
99
+ ** -o or --offset: Specifies a start offset for reading from the source, useful
100
+ for testing your control files before executing a full ETL process
101
+ ** -c or --config: Specify the database.yml file to configure the ETL
102
+ execution data store
103
+ ** -n or --newlog: Write to a new logfile rather than appending to the existing one
104
+ * Database source now supports specifying the select, join and order parts of
105
+ the query.
106
+ * Database source understands the limit argument specified on the etl command
107
+ line
108
+ * Added CheckExistProcessor
109
+ * Added CheckUniqueProcessor
110
+ * Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
111
+ conjunction with the CheckExistProcessor and CheckUniqueProcessor to provide
112
+ surrogate keys for all dimension records.
113
+ * Added SequenceProcessor
114
+ * Added OrdinalizeTransform
115
+ * Fixed a bug in the trim transform
116
+ * Sources now provide a trigger file which can be used to indicate that the
117
+ original source data has been completely extracted to the local file system.
118
+ This is useful if you need to recover from a failed ETL process.
119
+ * Updated README
120
+
121
+ 0.7.1 - Apr 8, 2007
122
+ * Fixed source caching
123
+
124
+ 0.7.2 - Apr 8, 2007
125
+ * Fixed quoting bug in CheckExistProcessor
126
+
127
+ 0.8.0 - Apr 12, 2007
128
+ * Source now available through the current row source accessor.
129
+ * Added new_rows_only configuration option to DatabaseSource. A date field must
130
+ be specified and only records that are greater than the date value in that
131
+ field, relative to the last successful
132
+ execution, will be returned from the source.
133
+ * Added an (untested) count feature which returns the number of rows for
134
+ processing.
135
+ * If no natural key is defined then an empty array will now be used, resulting
136
+ in the row being written to the output without going through change checks.
137
+ * Mapping argument in destination is now optional. An empty hash will be used
138
+ if the mapping hash is not specified. If the mapping hash is not specified
139
+ then the order will be determined using the originating source's order.
140
+ * ActiveRecord configurations loaded from database.yml by the etl tool will be
141
+ merged with ActiveRecord::Base.configurations.
142
+ * Fixed several bugs in how record change detection was implemented.
143
+ * Fixed how the read_locally functionality was implemented so that it will find
144
+ the last completed local source copy using the source's trigger file (untested).
145
+
146
+ 0.8.1 - Apr 12, 2007
147
+ * Added EnumerableSource
148
+ * Added :type configuration option to the source directive, allowing the source
149
+ type to be explicitly specified. The source type can be a string or symbol
150
+ (in which case the class will be constructed by appending Source to the type
151
+ name), a class (which will be instantiated and passed the control,
152
+ configuration and mapping) and finally an actual Source instance.
153
+
154
+ 0.8.2 - April 15, 2007
155
+ * Fixed bug with premature destination closing.
156
+ * Added indexes to execution records table.
157
+ * Added a PrintRowProcessor.
158
+ * Added support for conditions and "group by" in the database source.
159
+ * Added after_initialize hook in Processor base class.
160
+ * Added examples directory
161
+
162
+ 0.8.3 - May 13, 2007
163
+ * Added patches from Andy Triboletti
164
+
165
+ 0.8.4 - May 24, 2007
166
+ * Added fix for backslash in file writer
167
+
168
+ 0.9.0 - August 9, 2007
169
+ * Added support for batch processing through .ebf files. These files are
170
+ essentially control files that apply settings to an entire ETL process.
171
+ * Implemented support for screen blocks. These blocks can be used to test
172
+ the data and raise an error if the screens do not pass.
173
+ * Connections are now cached in a Hash available through
174
+ ETL::Engine.connection(name). This should be used rather than including
175
+ connection information in the control files.
176
+ * Implemented temp table support throughout.
177
+ * DateDimensionBuilder now included in ActiveWarehouse ETL directly.
178
+ * Time calculations for fiscal year now included in ActiveWarehouse ETL.
179
+
180
+ 0.9.1 -
181
+ * SQLResolver now uses ETL::Engine.table so it may utilize temp tables. (aeden)
182
+ * Added Thibaut Barrère's encode processor.
183
+ * Added MockSource and MockDestination test helpers (thbar)
184
+ * Added the block processor. Can call a block once (pre/post processor)
185
+ or once for each row (after_read/before_write row processor) (thbar)
186
+ * Changed temp table to use new AdapterExtension copy_table method (aeden)
187
+ * Added bin/etl.cmd windows batch - just add the bin folder to your PATH
188
+ and it will let you call etl on an unpacked/pistoned version of AW-ETL (thbar)
189
+ * Upgraded to support Rails 2.1. No longer compatible with older versions of Rails.
190
+ * Added ETL::Builder::TimeDimensionBuilder
191
+ * Added :default option to ForeignKeyLookupTransform that will be used if no
192
+ foreign key is found.
193
+ * Added :cache option to ForeignKeyLookupTransform that will preload the FK
194
+ mappings if the underlying resolver supports it. Currently supported by
195
+ SQLResolver.
196
+ * A Class extending ETL::Transform::Transform may now be passed as a transformer.
197
+ For example, in the control file you would define the transform as:
198
+ transform :a_field, MyTransform, {:option1 => 'option1'}.
data/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2006-2007 Anthony Eden
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,99 @@
1
+ Ruby Extract-Transform-Load (ETL) tool.
2
+
3
+ == Requirements
4
+
5
+ * Ruby 1.8.5 or higher
6
+ * Rubygems
7
+
8
+ == Online Documentation
9
+
10
+ Available at http://activewarehouse.rubyforge.org/docs/activewarehouse-etl.html
11
+
12
+ == Features
13
+
14
+ Current supported features:
15
+
16
+ * ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
17
+ * Multiple source types. Current supported types:
18
+ * Fixed-width and delimited text files
19
+ * XML files through SAX
20
+ * Apache combined log format
21
+ * Multiple destination types - file and database destinations
22
+ * Support for extracting from multiple sources in a single job
23
+ * Support for writing to multiple destinations in a single job
24
+ * A variety of built-in transformations are included:
25
+ * Date-to-string, string-to-date, string-to-datetime, string-to-timestamp
26
+ * Type transformation supporting strings, integers, floats and big decimals
27
+ * Trim
28
+ * SHA-1
29
+ * Decode from an external decode file
30
+ * Default replacement for empty values
31
+ * Ordinalize
32
+ * Hierarchy lookup
33
+ * Foreign key lookup
34
+ * Ruby blocks
35
+ * Any custom transformation class
36
+ * A variety of built-in row-level processors
37
+ * Check exists processor to determine if the record already exists in the destination database
38
+ * Check unique processor to determine whether a matching record was processed during this job execution
39
+ * Copy field
40
+ * Rename field
41
+ * Hierarchy exploder which takes a tree structure defined through a parent id and explodes it into a hierarchy bridge table
42
+ * Surrogate key generator including support for looking up the last surrogate key from the target table using a custom query
43
+ * Sequence generator including support for context-sensitive sequences where the context can be defined as a combination of fields from the source data
44
+ * New row-level processors can easily be defined and applied
45
+ * Pre-processing
46
+ * Truncate processor
47
+ * Post-processing
48
+ * Bulk import using native RDBMS bulk loader tools
49
+ * Virtual fields - Add a field to the destination data which doesn't exist in the source data
50
+ * Built in job and record meta data
51
+ * Support for type 1 and type 2 slowly changing dimensions
52
+ * Automated effective date and end date time stamping for type 2
53
+ * CRC checking
54
+
55
+ == Dependencies
56
+ ActiveWarehouse ETL depends on the following gems:
57
+ * ActiveSupport Gem
58
+ * ActiveRecord Gem
59
+ * FasterCSV Gem
60
+ * AdapterExtensions Gem
61
+
62
+ == Usage
63
+ Once the ActiveWarehouse ETL gem is installed jobs can be invoked using the
64
+ included `etl` script. The etl script includes several command line options
65
+ and can process multiple control files at a time.
66
+
67
+ Command line options:
68
+ * <tt>--help, -h</tt>: Display the usage message.
69
+ * <tt>--config, -c</tt>: Specify a database.yml configuration file to use.
70
+ * <tt>--limit, -l</tt>: Specify a limit to the number of rows to process. This option is currently only applicable to database sources.
71
+ * <tt>--offset, -o</tt>: Specify the start offset for reading from the source. This option is currently only applicable to database sources.
72
+ * <tt>--newlog, -n</tt>: Instruct the engine to create a new ETL log rather than append to the last ETL log.
73
+ * <tt>--skip-bulk-import, -s</tt>: Skip any bulk imports.
74
+ * <tt>--read-locally</tt>: Read from the local cache (skip source extraction)
75
+
76
+ == Control File Examples
77
+ Control file examples can be found in the examples directory.
78
+
79
+ == Running Tests
80
+
81
+ Current state:
82
+ - 11 failures on MySQL
83
+ - 1 failure on Postgres
84
+
85
+ The tests require:
86
+ - gem install shoulda
87
+ - gem install flexmock
88
+ - gem install pg (if you want to run the tests on pg)
89
+
90
+ The tests subfolder contains example database.yml files for mysql and postgres.
91
+
92
+ To run the tests:
93
+ - rake test DB=postgresql (for postgres)
94
+ - otherwise just rake test
95
+
96
+ == Feedback
97
+ This is a work in progress. Comments should be made on the
98
+ activewarehouse-discuss mailing list at the moment. Contributions are always
99
+ welcome.
data/Rakefile ADDED
@@ -0,0 +1,175 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/packagetask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
+ require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
9
+
10
# Packaging constants shared by the gem specification and the release tasks.
module AWETL
  # Optional build suffix: ".<PKG_BUILD>" when the PKG_BUILD environment
  # variable is set, otherwise an empty string.
  build_number = ENV['PKG_BUILD']
  PKG_BUILD = build_number ? '.' + build_number : ''
  PKG_NAME = 'activewarehouse-etl'
  # Library version followed by the optional build suffix.
  PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
  PKG_FILE_NAME = [PKG_NAME, PKG_VERSION].join('-')
  # Taken from the PKG_DESTINATION environment variable when it is set.
  PKG_DESTINATION = ENV.fetch("PKG_DESTINATION") { "../#{PKG_NAME}" }
end
17
+
18
+ # runcoderun
19
+
20
# Creates the two MySQL databases named etl_unittest and
# etl_unittest_execution, connecting as the "build" user.
task :create_extra_mysql_db do
  %w[etl_unittest etl_unittest_execution].each do |database_name|
    system 'mysqladmin', 'create', database_name, '-u', 'build'
  end
end

# Copies test/database.runcoderun.yml over test/database.yml.
task :copy_runcoderun_yml do
  test_dir = "#{File.dirname(__FILE__)}/test"
  # Arguments are passed to system separately (no shell string), so a
  # checkout path containing spaces or shell metacharacters is copied intact.
  system 'cp', "#{test_dir}/database.runcoderun.yml", "#{test_dir}/database.yml"
end
31
+
32
# Returns the value of the RUN_CODE_RUN environment variable (a String), or
# nil when the variable is not set. Used as a truthy/falsy switch when the
# default task is defined.
def runcoderun?
  ENV.fetch("RUN_CODE_RUN", nil)
end
35
+
36
# The default task always ends by running the tests. When runcoderun? is
# truthy the extra databases are created and the database.yml is copied
# first, and the task is left without a description.
ci_build = runcoderun?
desc 'Default: run unit tests.' unless ci_build
task :default => (ci_build ? [:create_extra_mysql_db, :copy_runcoderun_yml, :test] : :test)

desc 'Test the ETL application.'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push('lib')
  test_task.pattern = 'test/**/*_test.rb'
  test_task.verbose = true
  # TODO: reset the database
end
50
+
51
desc 'Generate documentation for the ETL application.'
Rake::RDocTask.new(:rdoc) do |doc_task|
  doc_task.rdoc_dir = 'rdoc'
  doc_task.title = 'ActiveWarehouse ETL'
  doc_task.options.push('--line-numbers', '--inline-source')
  # The README first, then every Ruby file under lib.
  ['README', 'lib/**/*.rb'].each { |pattern| doc_task.rdoc_files.include(pattern) }
end

namespace :rcov do
  desc 'Measures test coverage'
  task :test do
    # Start from a clean aggregate file and make sure the output directory exists.
    rm_f 'coverage.data'
    File.exist?('coverage') || mkdir('coverage')
    system("rcov --aggregate coverage.data --text-summary -Ilib test/*_test.rb")
    # system("open coverage/index.html") if PLATFORM['darwin']
  end
end
70
+
71
+ # Gem Spec
72
+
73
module AWETL
  # Builds the list of files shipped in the package. Each entry is prefixed
  # with +package_prefix+; the "<prefix>test" entry is subtracted from the
  # result.
  def self.package_files(package_prefix)
    shipped = %w[
      CHANGELOG LICENSE README TODO Rakefile
      bin/**/* doc/**/* lib/**/* examples/**/*
    ]
    patterns = shipped.map { |entry| "#{package_prefix}#{entry}" }
    FileList[*patterns] - ["#{package_prefix}test"]
  end

  # Builds the Gem::Specification for the package. +package_prefix+ is
  # prepended to the packaged file patterns and to the bin directory.
  def self.spec(package_prefix = '')
    Gem::Specification.new do |gem_spec|
      gem_spec.name = 'activewarehouse-etl'
      gem_spec.version = AWETL::PKG_VERSION
      gem_spec.summary = "Pure Ruby ETL package."
      gem_spec.description = <<-EOF
      ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
      EOF

      # Runtime dependencies, declared in a fixed order.
      [
        ['rake', '>= 0.8.3'],
        ['activesupport', '>= 2.1.0'],
        ['activerecord', '>= 2.1.0'],
        ['fastercsv', '>= 1.2.0'],
        ['adapter_extensions', '>= 0.5.0']
      ].each do |gem_name, requirement|
        gem_spec.add_dependency(gem_name, requirement)
      end

      gem_spec.rdoc_options.push('--exclude', '.')
      gem_spec.has_rdoc = false

      # Leave out anything whose path mentions .svn.
      gem_spec.files = package_files(package_prefix).to_a.reject { |path| path.include?('.svn') }
      gem_spec.require_path = 'lib'

      gem_spec.bindir = "#{package_prefix}bin" # Use these for applications.
      gem_spec.executables = ['etl']
      gem_spec.default_executable = "etl"

      gem_spec.author = "Anthony Eden"
      gem_spec.email = "anthonyeden@gmail.com"
      gem_spec.homepage = "http://activewarehouse.rubyforge.org/etl"
      gem_spec.rubyforge_project = "activewarehouse"
    end
  end
end
120
+
121
# Defines the packaging tasks for the gem; tar and zip archives are
# requested as well.
Rake::GemPackageTask.new(AWETL.spec) do |package_task|
  package_task.gem_spec = AWETL.spec
  package_task.need_tar = true
  package_task.need_zip = true
end
126
+
127
desc "Generate code statistics"
# Prints, for every Ruby file under lib (paths matching /vendor/ are
# skipped), the total number of lines and the number of code lines, followed
# by the grand totals. Blank lines and comment-only lines are not code lines.
task :lines do
  total_lines = 0
  total_codelines = 0

  FileList["lib/**/*.rb"].each do |file_name|
    next if file_name =~ /vendor/

    lines = 0
    codelines = 0
    # File.foreach closes the file once iteration ends, so no handle is
    # left open per source file.
    File.foreach(file_name) do |line|
      lines += 1
      next if line =~ /^\s*$/ # blank line
      next if line =~ /^\s*#/ # comment-only line
      codelines += 1
    end
    puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"

    total_lines += lines
    total_codelines += codelines
  end

  puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
end
151
+
152
desc "Publish the release files to RubyForge."
task :release => [ :package ] do
  `rubyforge login`

  # One add_release call per packaged artifact.
  %w( gem tgz zip ).each do |extension|
    artifact = "pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}.#{extension}"
    release_command = "rubyforge add_release activewarehouse #{AWETL::PKG_NAME} 'REL #{AWETL::PKG_VERSION}' #{artifact}"
    puts release_command
    system(release_command)
  end
end

desc "Publish the API documentation"
task :pdoc => [:rdoc] do
  publisher = Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc")
  publisher.upload
end

desc "Reinstall the gem from a local package copy"
task :reinstall => [:package] do
  # On mswin platforms there is no sudo and the gem command is gem.bat.
  on_windows = RUBY_PLATFORM =~ /mswin/
  sudo, gem = on_windows ? ['', 'gem.bat'] : ['sudo', 'gem']
  `#{sudo} #{gem} uninstall #{AWETL::PKG_NAME} -x`
  `#{sudo} #{gem} install pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}`
end
data/TODO ADDED
@@ -0,0 +1,28 @@
1
+ TODO
2
+
3
+ * Add built-in support for audit_dimension
4
+ * Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
5
+ * Provide greater control in error handling
6
+ ** Allow an error threshold
7
+ ** Don't die completely on a parse error, just stop processing that specific file if the error threshold is reached
8
+ ** Allow mismatch row length error in delimited parser to be ignored
9
+ * Improve error messages throughout, but especially in problems with the control files
10
+ * Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
11
+ * Check if a temp table exists and the last job run was successful, in which case skip during the current run
12
+ * Create models for each of the tables in each of the databases defined in ETL::Engine.connections
13
+
14
+ Audit Record
15
+
16
+ Process-Level
17
+ * Start Time
18
+ * End Time
19
+ * (Duration)
20
+ * Rows Read
21
+ * Rows Written
22
+ * Rows Rejected
23
+ * Errors
24
+ * Destination
25
+ Record-Level
26
+ * Source
27
+ * Timestamp
28
+ * Transformation Log
data/bin/etl ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #--
4
+ # Copyright (c) 2006 Anthony Eden
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # "Software"), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #++
25
+
26
# Put the lib directory that sits next to this script first on the load
# path, then load the library and the command-line entry point.
$LOAD_PATH.unshift("#{File.dirname(__FILE__)}/../lib/")
require 'etl'
require 'etl/commands/etl'
data/bin/etl.cmd ADDED
@@ -0,0 +1,8 @@
1
+ @echo off
2
+
3
+ rem The purpose of this Windows script is to let you use the etl command line with a non-gem version of AW-ETL (eg: unpacked gem, pistoned trunk).
4
+ rem Just add the current folder on top of your PATH variable to use it instead of the etl command provided with the gem release.
5
+
6
+ rem %~dp0 returns the absolute path where the current script is. We just append 'etl' to it, and forward all the arguments with %*
7
+
8
+ ruby "%~dp0etl" %*
@@ -0,0 +1,16 @@
1
+ etl_execution:
2
+ adapter: mysql
3
+ username: root
4
+ host: localhost
5
+ database: etl_execution
6
+ encoding: utf8
7
+ datawarehouse:
8
+ adapter: mysql
9
+ username: root
10
+ host: localhost
11
+ database: datawarehouse_development
12
+ operational:
13
+ adapter: mysql
14
+ username: root
15
+ host: localhost
16
+ database: operational_production
@@ -0,0 +1,111 @@
1
+ module ETL #:nodoc:
2
+ module Batch
3
# Scope in which a batch file is evaluated. Calls made inside the batch
# file (run, use_temp_tables) are forwarded to the wrapped batch.
class Context
  attr_reader :batch

  # Create a context for the given batch and return its binding, which is
  # used when evaluating the batch file.
  def self.create(batch)
    Context.new(batch).get_binding
  end

  def initialize(batch)
    @batch = batch
  end

  # Path of the batch file, as reported by the batch.
  def file
    batch.file
  end

  # Binding whose self is this context.
  def get_binding
    binding
  end

  # Ask the batch to run +file+, prefixed with the batch file's directory.
  def run(file)
    batch_dir = File.dirname(batch.file)
    batch.run(batch_dir + "/" + file)
  end

  # Forward the temp-table setting to the batch.
  def use_temp_tables(value=true)
    batch.use_temp_tables(value)
  end
end
34
# A batch: an ordered list of directives collected by evaluating a batch
# file, plus the engine that executes them.
class Batch
  # Path of the batch file this batch was created for.
  attr_accessor :file
  # Engine used to report progress; assigned by Batch.resolve.
  attr_accessor :engine

  class << self
    # Resolve the given object to an ETL::Batch::Batch instance and attach
    # +engine+ to it. Acceptable arguments are:
    # * The path to a batch file as a String
    # * A File object referencing the batch file
    # * The ETL::Batch::Batch object (which will just be returned)
    #
    # Raises a RuntimeError if any other type is given
    def resolve(batch, engine)
      batch = do_resolve(batch)
      batch.engine = engine
      batch
    end

    protected

    # Build a Batch for +batch_file+ (a path String or a File) and evaluate
    # the file's contents in a Context bound to that batch.
    #
    # NOTE: the batch file is executed as Ruby code via eval, so only
    # trusted batch files should be passed in.
    def parse(batch_file)
      batch_file = batch_file.path if batch_file.instance_of?(File)
      batch = ETL::Batch::Batch.new(batch_file)
      # Read the file verbatim. Re-joining already newline-terminated lines
      # with "\n" would double every line break, skewing line numbers in
      # errors and altering multi-line literals.
      eval(File.read(batch_file), Context.create(batch), batch_file)
      batch
    end

    # Dispatch on the argument type; see resolve for the accepted types.
    def do_resolve(batch)
      case batch
      when String, File
        # parse accepts a path or a File, so no extra File handle is opened
        # (and left unclosed) just to carry the path.
        ETL::Batch::Batch.parse(batch)
      when ETL::Batch::Batch
        batch
      else
        raise RuntimeError, "Batch must be a String, File or Batch object"
      end
    end
  end

  def initialize(file)
    @file = file
  end

  # Queue a Run directive for +file+.
  def run(file)
    directives << Run.new(self, file)
  end

  # Queue a UseTempTables directive.
  # NOTE(review): +value+ is not forwarded to the directive, so
  # use_temp_tables(false) still queues one - confirm against UseTempTables.
  def use_temp_tables(value = true)
    directives << UseTempTables.new(self)
  end

  # Execute every queued directive in order, surrounded by the
  # before_execute and after_execute hooks, reporting progress via the engine.
  def execute
    engine.say "Executing batch"
    before_execute
    directives.each { |directive| directive.execute }
    engine.say "Finishing batch"
    after_execute
    engine.say "Batch complete"
  end

  # Directives queued so far (created empty on first access).
  def directives
    @directives ||= []
  end

  # Hook called before the directives are executed; does nothing here.
  def before_execute

  end

  # Hook called after the directives are executed.
  def after_execute
    ETL::Engine.finish # TODO: should be moved to the directive?
    ETL::Engine.use_temp_tables = false # reset the temp tables
  end
end
110
+ end
111
+ end