etl 0.9.5.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (215) hide show
  1. data/.gitignore +12 -0
  2. data/.yardopts +5 -0
  3. data/0.9-UPGRADE +6 -0
  4. data/CHANGELOG +236 -0
  5. data/Gemfile +4 -0
  6. data/HOW_TO_RELEASE +13 -0
  7. data/LICENSE +7 -0
  8. data/README.textile +111 -0
  9. data/Rakefile +105 -0
  10. data/TODO +28 -0
  11. data/activewarehouse-etl.gemspec +38 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/spec/fixtures/all.ebf +6 -0
  111. data/spec/fixtures/apache_combined_log.ctl +11 -0
  112. data/spec/fixtures/batch_with_error.ebf +6 -0
  113. data/spec/fixtures/batched1.ctl +0 -0
  114. data/spec/fixtures/batched2.ctl +0 -0
  115. data/spec/fixtures/block_processor.ctl +6 -0
  116. data/spec/fixtures/block_processor_error.ctl +1 -0
  117. data/spec/fixtures/block_processor_pre_post_process.ctl +4 -0
  118. data/spec/fixtures/block_processor_remove_rows.ctl +5 -0
  119. data/spec/fixtures/data/apache_combined_log.txt +3 -0
  120. data/spec/fixtures/data/bulk_import.txt +3 -0
  121. data/spec/fixtures/data/bulk_import_with_empties.txt +3 -0
  122. data/spec/fixtures/data/decode.txt +3 -0
  123. data/spec/fixtures/data/delimited.txt +3 -0
  124. data/spec/fixtures/data/encode_source_latin1.txt +2 -0
  125. data/spec/fixtures/data/excel.xls +0 -0
  126. data/spec/fixtures/data/excel2.xls +0 -0
  127. data/spec/fixtures/data/fixed_width.txt +3 -0
  128. data/spec/fixtures/data/multiple_delimited_1.txt +3 -0
  129. data/spec/fixtures/data/multiple_delimited_2.txt +3 -0
  130. data/spec/fixtures/data/nokogiri.xml +38 -0
  131. data/spec/fixtures/data/people.txt +3 -0
  132. data/spec/fixtures/data/sax.xml +14 -0
  133. data/spec/fixtures/data/xml.xml +16 -0
  134. data/spec/fixtures/delimited.ctl +30 -0
  135. data/spec/fixtures/delimited_absolute.ctl +31 -0
  136. data/spec/fixtures/delimited_destination_db.ctl +23 -0
  137. data/spec/fixtures/delimited_excel.ctl +31 -0
  138. data/spec/fixtures/delimited_insert_update.ctl +34 -0
  139. data/spec/fixtures/delimited_update.ctl +34 -0
  140. data/spec/fixtures/delimited_with_bulk_load.ctl +34 -0
  141. data/spec/fixtures/errors.ctl +24 -0
  142. data/spec/fixtures/excel.ctl +24 -0
  143. data/spec/fixtures/excel2.ctl +25 -0
  144. data/spec/fixtures/fixed_width.ctl +35 -0
  145. data/spec/fixtures/inline_parser.ctl +17 -0
  146. data/spec/fixtures/model_source.ctl +14 -0
  147. data/spec/fixtures/multiple_delimited.ctl +22 -0
  148. data/spec/fixtures/multiple_source_delimited.ctl +39 -0
  149. data/spec/fixtures/nokogiri_all.ctl +35 -0
  150. data/spec/fixtures/nokogiri_select.ctl +35 -0
  151. data/spec/fixtures/output/.ignore +1 -0
  152. data/spec/fixtures/output/delimited.txt +3 -0
  153. data/spec/fixtures/output/encode_destination_utf-8.txt +2 -0
  154. data/spec/fixtures/output/fixed_width.txt +3 -0
  155. data/spec/fixtures/output/inline_parser.txt +3 -0
  156. data/spec/fixtures/output/multiple_source_delimited.txt +6 -0
  157. data/spec/fixtures/output/test_excel_destination.xls +0 -0
  158. data/spec/fixtures/output/test_file_destination.2.txt +2 -0
  159. data/spec/fixtures/output/test_file_destination.txt +2 -0
  160. data/spec/fixtures/output/test_multiple_unique.txt +1 -0
  161. data/spec/fixtures/output/test_unique.txt +2 -0
  162. data/spec/fixtures/sax.ctl +26 -0
  163. data/spec/fixtures/scd/1.txt +1 -0
  164. data/spec/fixtures/scd/2.txt +1 -0
  165. data/spec/fixtures/scd/3.txt +1 -0
  166. data/spec/fixtures/scd_test_type_1.ctl +43 -0
  167. data/spec/fixtures/scd_test_type_2.ctl +34 -0
  168. data/spec/fixtures/screen_test_error.ctl +3 -0
  169. data/spec/fixtures/screen_test_fatal.ctl +3 -0
  170. data/spec/fixtures/xml.ctl +31 -0
  171. data/spec/quality_spec.rb +11 -0
  172. data/spec/spec_helper.rb +10 -0
  173. data/spec/support/custom_fixtures.rb +54 -0
  174. data/spec/support/custom_matchers.rb +54 -0
  175. data/test-matrix.yml +10 -0
  176. data/test/.gitignore +1 -0
  177. data/test/.ignore +2 -0
  178. data/test/batch_test.rb +41 -0
  179. data/test/block_processor_test.rb +38 -0
  180. data/test/check_exist_processor_test.rb +92 -0
  181. data/test/check_unique_processor_test.rb +40 -0
  182. data/test/config/Gemfile.rails-2.3.x +3 -0
  183. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  184. data/test/config/Gemfile.rails-3.0.x +3 -0
  185. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  186. data/test/config/common.rb +29 -0
  187. data/test/connection/mysql/connection.rb +9 -0
  188. data/test/connection/mysql/schema.sql +37 -0
  189. data/test/connection/postgresql/connection.rb +13 -0
  190. data/test/connection/postgresql/schema.sql +40 -0
  191. data/test/control_test.rb +43 -0
  192. data/test/database_join_processor_test.rb +43 -0
  193. data/test/date_dimension_builder_test.rb +96 -0
  194. data/test/destination_test.rb +275 -0
  195. data/test/directive_test.rb +23 -0
  196. data/test/encode_processor_test.rb +32 -0
  197. data/test/engine_test.rb +78 -0
  198. data/test/ensure_fields_presence_processor_test.rb +28 -0
  199. data/test/etl_test.rb +42 -0
  200. data/test/foreign_key_lookup_transform_test.rb +50 -0
  201. data/test/generator_test.rb +14 -0
  202. data/test/mocks/mock_destination.rb +26 -0
  203. data/test/mocks/mock_source.rb +25 -0
  204. data/test/nokogiri_test.rb +35 -0
  205. data/test/parser_test.rb +224 -0
  206. data/test/performance/delimited.ctl +30 -0
  207. data/test/processor_test.rb +44 -0
  208. data/test/row_processor_test.rb +17 -0
  209. data/test/scd_test.rb +257 -0
  210. data/test/screen_test.rb +9 -0
  211. data/test/source_test.rb +154 -0
  212. data/test/test_helper.rb +37 -0
  213. data/test/transform_test.rb +101 -0
  214. data/test/truncate_processor_test.rb +37 -0
  215. metadata +510 -0
data/.gitignore ADDED
@@ -0,0 +1,12 @@
1
+ pkg/*
2
+ source_data
3
+ test/output/*
4
+ rdoc
5
+ .rvmrc
6
+ .bundle
7
+ *.gem
8
+ Gemfile.lock
9
+ *.rbc
10
+ *.log
11
+ doc
12
+ .yardoc
data/.yardopts ADDED
@@ -0,0 +1,5 @@
1
+ --title "Ruby ETL"
2
+ lib/**/*.rb
3
+ README.textile
4
+ CHANGELOG
5
+ TODO
data/0.9-UPGRADE ADDED
@@ -0,0 +1,6 @@
1
+ The 0.9 revision of ActiveWarehouse ETL significantly changes how connections are maintained. This release is not backwards compatible.
2
+
3
+ To upgrade, you must do the following:
4
+
5
+ 1.) All database connections used in ETL control files must be declared in database.yml in the directory that contains your ETL control files.
6
+ 2.) All sources, destinations, transforms and processors that use a database connection must include the configuration name/value pair of :target => 'name' where name is replaced with the connection name defined in database.yml. Connection information should no longer be included in control files.
data/CHANGELOG ADDED
@@ -0,0 +1,236 @@
1
+ 0.9.5 - unreleased
2
+ * BREAKING CHANGE: the TruncateProcessor will now RESTART IDENTITY when using postgres - this behaviour can be changed by passing :options => 'CONTINUE IDENTITY' (thbar)
3
+ * New EnsureFieldsPresenceProcessor (thbar)
4
+ * New CsvDestination based on FasterCSV (lgustafson)
5
+ * Fix #41: Allow cache to be turned off on ForeignKeyLookupTransform
6
+ * Fix #36: etl --read-locally picks wrong last file (lgustafson)
7
+ * BREAKING CHANGE: DelimitedParser has been renamed to CsvParser, to emphasize the fact that this parser relies on FasterCSV under the hood (which means it will choke on faulty data).
8
+ * Fixes for Rails 3 (gkfabs)
9
+ * Fixes for Ruby 1.9.2 (jlecour, byrnejb, thbar)
10
+ * improvements on CalculationTransform (gkfabs)
11
+ * batch can have children (gkfabs)
12
+ * read all columns by default in CsvParser (smeyfroi)
13
+ * Turn db config into a HashWithIndifferentAccess (smeyfroi)
14
+ * DatabaseJoinProcessor (gkfabs)
15
+ * ImapattachmentDownloaderProcessor (gkfabs)
16
+ * Pop3attachmentDownloaderProcessor (gkfabs)
17
+ * Enhancements and fixes on EscapeCSVProcessor (gkfabs)
18
+ * Bug correction on hash initialization DatabaseSource (gkfabs)
19
+ * FilterRowProcessor won't return nil (gkfabs)
20
+ * DatabaseSource fixes (gkfabs)
21
+ * New UpdateDatabaseDestination (gkfabs)
22
+ * New InsertUpdateDatabaseDestination (gkfabs)
23
+ * New Excel destination (gkfabs)
24
+ * New Excel parser (gkfabs)
25
+ * Add ability to use a query in DatabaseSource (gkfabs)
26
+ * Modified SQLResolver to allow for multiple key lookups on dimension
27
+ table (cdimartino)
28
+ * Add scd_required_fields which allows you to avoid specifying the scd fields when specifying :unique in a destination (darrell)
29
+ * More SCD debug logging (darrell)
30
+ * Make database destination table quoting database agnostic (darrell)
31
+ * Allow absolute paths in file destinations and file bulk imports (darrell)
32
+ * Bug fixes on engine, destination (sasikumargn, cdimartino, darrell)
33
+ * Code and tests clean-up (mainej, aeden, thbar)
34
+ * No more failure on tests (thbar, gkfabs)
35
+ * Add the ability to automatically run the tests on a matrix of configuration including Rails 2/3, Ruby 1.8.7/1.9.2, MySQL/Postgresql (thbar)
36
+ * Use bundler for gem packaging and release (thbar)
37
+
38
+ 0.9.1 - January 14, 2009
39
+ * SQLResolver now uses ETL::Engine.table so it may utilize temp tables. (aeden)
40
+ * Added Thibaut Barrère's encode processor.
41
+ * Added MockSource and MockDestination test helpers (thbar)
42
+ * Added the block processor. Can call a block once (pre/post processor)
43
+ or once for each row (after_read/before_write row processor) (thbar)
44
+ * Changed temp table to use new AdapterExtension copy_table method (aeden)
45
+ * Added bin/etl.cmd windows batch - just add the bin folder to your PATH
46
+ and it will let you call etl on an unpacked/pistoned version of AW-ETL (thbar)
47
+ * Upgraded to support Rails 2.1. No longer compatible with older versions of Rails.
48
+ * Added ETL::Builder::TimeDimensionBuilder
49
+ * Added :default option to ForeignKeyLookupTransform that will be used if no
50
+ foreign key is found.
51
+ * Added :cache option to ForeignKeyLookupTransform that will preload the FK
52
+ mappings if the underlying resolver supports it. Currently supported by
53
+ SQLResolver.
54
+ * A Class extending ETL::Transform::Transform may now be passed as a transformer.
55
+ For example, in the control file you would define the transform as:
56
+ transform :a_field, MyTransform, {:option1 => 'option1'}.
57
+ * Support Ruby 1.9 CSV library
58
+
59
+ 0.9.0 - August 9, 2007
60
+ * Added support for batch processing through .ebf files. These files are
61
+ essentially control files that apply settings to an entire ETL process.
62
+ * Implemented support for screen blocks. These blocks can be used to test
63
+ the data and raise an error if the screens do not pass.
64
+ * Connections are now cached in a Hash available through
65
+ ETL::Engine.connection(name). This should be used rather than including
66
+ connection information in the control files.
67
+ * Implemented temp table support throughout.
68
+ * DateDimensionBuilder now included in ActiveWarehouse ETL directly.
69
+ * Time calculations for fiscal year now included in ActiveWarehouse ETL.
70
+
71
+ 0.8.4 - May 24, 2007
72
+ * Added fix for backslash in file writer
73
+
74
+ 0.8.3 - May 13, 2007
75
+ * Added patches from Andy Triboletti
76
+
77
+ 0.8.2 - April 15, 2007
78
+ * Fixed bug with premature destination closing.
79
+ * Added indexes to execution records table.
80
+ * Added a PrintRowProcessor.
81
+ * Added support for conditions and "group by" in the database source.
82
+ * Added after_initialize hook in Processor base class.
83
+ * Added examples directory
84
+
85
+ 0.8.1 - Apr 12, 2007
86
+ * Added EnumerableSource
87
+ * Added :type configuration option to the source directive, allowing the source
88
+ type to be explicitly specified. The source type can be a string or symbol
89
+ (in which case the class will be constructed by appending Source to the type
90
+ name), a class (which will be instantiate and passed the control,
91
+ configuration and mapping) and finally an actual Source instance.
92
+
93
+ 0.8.0 - Apr 12, 2007
94
+ * Source now available through the current row source accessor.
95
+ * Added new_rows_only configuration option to DatabaseSource. A date field must
96
+ be specified and only records that are greater than the date value in that
97
+ field, relative to the last successful
98
+ execution, will be returned from the source.
99
+ * Added an (untested) count feature which returns the number of rows for
100
+ processing.
101
+ * If no natural key is defined then an empty array will now be used, resulting
102
+ in the row being written to the output without going through change checks.
103
+ * Mapping argument in destination is now optional. An empty hash will be used
104
+ if the mapping hash is not specified. If the mapping hash is not specified
105
+ then the order will be determined using the originating source's order.
106
+ * ActiveRecord configurations loaded from database.yml by the etl tool will be
107
+ merged with ActiveRecord::Base.configurations.
108
+ * Fixed several bugs in how record change detection was implemented.
109
+ * Fixed how the read_locally functionality was implemented so that it will find
110
+ the last completed local source copy using the source's trigger file (untested).
111
+
112
+ 0.7.2 - Apr 8, 2007
113
+ * Fixed quoting bug in CheckExistProcessor
114
+
115
+ 0.7.1 - Apr 8, 2007
116
+ * Fixed source caching
117
+
118
+ 0.7 - Apr 8, 2007
119
+ * Job execution is now tracked in a database. This means that ActiveRecord is
120
+ required regardless of the sources being used in the ETL scripts. An example
121
+ database configuration for the etl can be found in test/database.example.yml.
122
+ This file is loaded from either a.) the current working directory or b.) the
123
+ location specified using the -c command line argument when running the etl
124
+ command.
125
+ * etl script now supports the following command line arguments:
126
+ ** -h or --help: Prints the usage
127
+ ** -l or --limit: Specifies a limit for the number of source rows to read,
128
+ useful for testing your control files before executing a full ETL process
129
+ ** -o or --offset: Specifies a start offset for reading from the source, useful
130
+ for testing your control files before executing a full ETL process
131
+ ** -c or --config: Specify the database.yml file to configure the ETL
132
+ execution data store
133
+ ** -n or --newlog: Write to the logfile rather than appending to it
134
+ * Database source now supports specifying the select, join and order parts of
135
+ the query.
136
+ * Database source understands the limit argument specified on the etl command
137
+ line
138
+ * Added CheckExistProcessor
139
+ * Added CheckUniqueProcessor
140
+ * Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
141
+ conjunction with the CheckExistProcessor and CheckUniqueProcessor to provide
142
+ surrogate keys for all dimension records.
143
+ * Added SequenceProcessor
144
+ * Added OrdinalizeTransform
145
+ * Fixed a bug in the trim transform
146
+ * Sources now provide a trigger file which can be used to indicate that the
147
+ original source data has been completely extracted to the local file system.
148
+ This is useful if you need to recover from a failed ETL process.
149
+ * Updated README
150
+
151
+ 0.6.1 - Mar 22, 2007
152
+ * Added support for absolute paths in file sources
153
+ * Added CopyFieldProcessor
154
+
155
+ 0.6.0 - Mar 8, 2007
156
+ * Fixed missing method problem in validate in Control class.
157
+ * Removed control validation for now (source could be code in the control file).
158
+ * Transform interface now defined as taking 3 arguments, the field name, field
159
+ value and the row. This is not backwards compatible.
160
+ * Added HierarchyLookupTransform.
161
+ * Added DefaultTransform which will return a specified value if the initial
162
+ value is blank.
163
+ * Added row-level processing.
164
+ * Added HierarchyExploderProcessor which takes a single hierarchy row and
165
+ explodes it to multiple rows as used in a hierarchy bridge.
166
+ * Added ApacheCombinedLogParser which parses Apache Combined Log format,
167
+ including parsing of the
168
+ user agent string and the URI, returning a Hash.
169
+ * Fixed bug in SAX parser so that attributes are now set when the start_element
170
+ event is received.
171
+ * Added an HttpTools module which provides some parsing methods (for user agent
172
+ and URI).
173
+ * Database source now uses its own class for establishing an ActiveRecord
174
+ connection.
175
+ * Log files are now timestamped.
176
+ * Source files are now archived automatically during the extraction process
177
+ * Added a :condition option to the destination configuration Hash that accepts
178
+ a Proc with a single argument passed to it (the row).
179
+ * Added an :append_rows option to the destination configuration Hash that
180
+ accepts either a Hash (to append a single row) or an Array of Hashes (to
181
+ append multiple rows).
182
+ * Only print the read and written row counts if there is at least one source
183
+ and one destination respectively.
184
+ * Added a depends_on directive that accepts a list of arguments of either strings
185
+ or symbols. Each symbol is converted to a string and .ctl is appended;
186
+ strings are passed through directly. The dependencies are executed in the order
187
+ they are specified.
188
+ * The default field separator in the bulk loader is now a comma (was a tab).
189
+
190
+ 0.5.2 - Feb 19, 2007
191
+ * Added error threshold.
192
+ * Fixed problem with transform error handling.
193
+
194
+ 0.5.1 - Feb 18, 2007
195
+ * Fixed up truncate processor.
196
+ * Updated HOW_TO_RELEASE doc.
197
+
198
+ 0.5.0 - Feb 17, 2007
199
+ * Changed require_gem to gem and added alias to allow for older versions of
200
+ rubygems.
201
+ * Added support for Hash in the source configuration where :name => :parser_name
202
+ defines the parser to use and :options => {} defines options to pass to the
203
+ parser.
204
+ * Added support for passing a custom Parser class in the source configuration.
205
+ * Removed the need to include Enumerable in each parser implementation.
206
+ * Added new date_to_string and string_to_date transformers.
207
+ * Implemented foreign_key_lookup transform including an ActiveRecordResolver.
208
+ * Added real time activity logging which is called when the etl bin script is
209
+ invoked.
210
+ * Improved error handling.
211
+ * Default logger level is now WARN.
212
+
213
+ 0.4.0 - Jan 11, 2007
214
+ * Added :skip_lines option to file source configurations, which can be used
215
+ to skip the first n lines in the source data file
216
+ * Added better error handling in delimited parser - an error is now raised
217
+ if the expected and actual field lengths do not match
218
+ * Added :truncate option for database destination. Set to true to truncate
219
+ before importing data.
220
+ * Added support for :unique => [] option and virtual fields for the database
221
+ destination
222
+
223
+ 0.3.0 - Dec 19, 2006
224
+ * Added support for calculated values in virtual fields with Proc
225
+
226
+ 0.2.0 - Dec 7, 2006
227
+ * Added an XML parser for source parsing
228
+ * Added support for compound key constraints in destinations via the
229
+ :unique => [] option
230
+ * Added ability to declare explicit columns in bulk import
231
+ * Added support for generators in destinations
232
+ * Added a SurrogateKeyGenerator for cases where the database doesn't support
233
+ auto generated surrogate keys
234
+
235
+ 0.1.0 - Dec 6, 2006
236
+ * Initial release
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in activewarehouse-etl.gemspec
4
+ gemspec
data/HOW_TO_RELEASE ADDED
@@ -0,0 +1,13 @@
1
+ * update lib/etl/version
2
+ * push your changes
3
+ * then use bundler to build + git tag + push to rubygems
4
+
5
+ rake release
6
+
7
+ * if you remain stuck at "Pushed git commits and tags", the task may silently wait for your password. Check this if it's the case:
8
+
9
+ https://github.com/carlhuda/bundler/issues/980
10
+
11
+ * you can list changes using github:
12
+
13
+ https://github.com/activewarehouse/activewarehouse-etl/compare/release-0.9.1...master
data/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2006-2007 Anthony Eden
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,111 @@
1
+ h1. ActiveWarehouse-ETL
2
+
3
+ ActiveWarehouse-ETL is a Ruby Extract-Transform-Load (ETL) tool.
4
+
5
+ This tool is both usable and used in production in its current form - but be aware the project is under reorganization: a new team is shaping up and we're working mostly on making it easier for people to contribute first. Up-to-date documentation will only come later.
6
+
7
+ h2. Usage
8
+
9
+ The documentation is sparse and not everything is up to date, but here are some useful bits to get you started:
10
+
11
+ * read the "Introduction":https://github.com/activewarehouse/activewarehouse-etl/wiki/Documentation
12
+ * later on, refer to the "RDoc":http://rdoc.info/github/activewarehouse/activewarehouse-etl/master/frames (be sure to check out Processor and Transform)
13
+ * read the "source":https://github.com/activewarehouse/activewarehouse-etl/tree/master/lib/etl
14
+
15
+ If you're lost, please ask questions on the "Google Group":http://groups.google.com/group/activewarehouse-discuss and we'll take care of it.
16
+
17
+ One thing to keep in mind is that ActiveWarehouse-ETL is highly hackable: you can pretty much create all you need with extra ruby code, even if it's not currently supported.
18
+
19
+ h2. Compatibility
20
+
21
+ Current code should work with any combination of Rails 2, Rails 3, Ruby 1.8.7, Ruby 1.9.2, MySQL and Postgresql. If you meet any issue, drop a line on the "Google Group":http://groups.google.com/group/activewarehouse-discuss and/or "create an issue on github":https://github.com/activewarehouse/activewarehouse-etl/issues.
22
+
23
+ h2. Contributing
24
+
25
+ Fork on GitHub and after you've committed tested patches, send a pull request.
26
+
27
+ If you meet any error while trying to run the tests, or any failure, please drop a line on the "Google Group":http://groups.google.com/group/activewarehouse-discuss.
28
+
29
+ h3. Pre-requisites to running the tests
30
+
31
+ * install RVM and Bundler
32
+ * install MySQL and/or Postgresql (you can use brew for that)
33
+ * create test/config/database.mysql.yml and test/config/database.postgresql.yml based on "test/config/database.example.yml":https://github.com/activewarehouse/activewarehouse-etl/blob/master/test/config/database.example.yml
34
+ * create databases 'etl_unittest' and 'etl_unittest_execution' in each database, with access to the user given in the yml files
35
+
36
+ If you don't install both MySQL and Postgresql, edit "test/config/common.rb":https://github.com/activewarehouse/activewarehouse-etl/blob/master/test/config/common.rb to comment out either 'mysql' or 'pg', or the test task will raise errors.
37
+
38
+ h3. Run the tests
39
+
40
+ You can run the tests on a "combination of environments":https://github.com/activewarehouse/activewarehouse-etl/blob/master/test-matrix.yml using:
41
+
42
+ <pre>
43
+ rake test:matrix
44
+ </pre>
45
+
46
+ h2. Contributors
47
+
48
+ ActiveWarehouse-ETL is the work of many people since late 2006 - here is a list, in no particular order:
49
+
50
+ * Anthony Eden
51
+ * Chris DiMartino
52
+ * Darrell Fuhriman
53
+ * Fabien Carrion
54
+ * Jacob Maine
55
+ * James B. Byrne
56
+ * Jay Zeschin
57
+ * Jeremy Lecour
58
+ * Steve Meyfroidt
59
+ * Seth Ladd
60
+ * Thibaut Barrère
61
+ * Stephen Touset
62
+ * sasikumargn
63
+ * Andrew Kuklewicz
64
+ * Leif Gustafson
65
+ * Andrew Sodt
66
+ * Tyler Kiley
67
+
68
+ If your name should be on the list but isn't, please leave a comment!
69
+
70
+ h2. Features
71
+
72
+ Currently supported features:
73
+
74
+ * ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
75
+ * Multiple source types. Current supported types:
76
+ ** Fixed-width and delimited text files
77
+ ** XML files through SAX
78
+ ** Apache combined log format
79
+ * Multiple destination types - file and database destinations
80
+ * Support for extracting from multiple sources in a single job
81
+ * Support for writing to multiple destinations in a single job
82
+ * A variety of built-in transformations are included:
83
+ ** Date-to-string, string-to-date, string-to-datetime, string-to-timestamp
84
+ ** Type transformation supporting strings, integers, floats and big decimals
85
+ ** Trim
86
+ ** SHA-1
87
+ ** Decode from an external decode file
88
+ ** Default replacement for empty values
89
+ ** Ordinalize
90
+ ** Hierarchy lookup
91
+ ** Foreign key lookup
92
+ ** Ruby blocks
93
+ ** Any custom transformation class
94
+ * A variety of built-in row-level processors
95
+ ** Check exists processor to determine if the record already exists in the destination database
96
+ ** Check unique processor to determine whether a matching record was processed during this job execution
97
+ ** Copy field
98
+ ** Rename field
99
+ ** Hierarchy exploder which takes a tree structure defined through a parent id and explodes it into a hierarchy bridge table
100
+ ** Surrogate key generator including support for looking up the last surrogate key from the target table using a custom query
101
+ ** Sequence generator including support for context-sensitive sequences where the context can be defined as a combination of fields from the source data
102
+ ** New row-level processors can easily be defined and applied
103
+ * Pre-processing
104
+ ** Truncate processor
105
+ * Post-processing
106
+ ** Bulk import using native RDBMS bulk loader tools
107
+ * Virtual fields - Add a field to the destination data which doesn't exist in the source data
108
+ * Built in job and record meta data
109
+ * Support for type 1 and type 2 slowly changing dimensions
110
+ ** Automated effective date and end date time stamping for type 2
111
+ ** CRC checking
data/Rakefile ADDED
@@ -0,0 +1,105 @@
1
+ require 'bundler/setup'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'yard'
5
+
6
+ require 'rspec'
7
+ require 'rspec/core'
8
+ require 'rspec/core/rake_task'
9
+
10
+ Bundler::GemHelper.install_tasks
11
+
12
+ desc "Run Specs"
13
+ RSpec::Core::RakeTask.new(:spec) do |spec|
14
+ spec.pattern = "spec/**/*_spec.rb"
15
+ spec.verbose = true
16
+ spec.rspec_opts = ['--color']
17
+ end
18
+
19
+
20
+ desc "Generate YARD docs"
21
+ YARD::Rake::YardocTask.new(:yard)
22
+
23
+
24
+ namespace :test do
25
+ def run_tests(rvm, rails, database)
26
+ database_yml = File.dirname(__FILE__) + "/test/config/database.#{database}.yml"
27
+ FileUtils.cp(database_yml, 'test/config/database.yml')
28
+
29
+ puts
30
+ puts "============ Ruby #{rvm} - Rails #{rails} - Db #{database} ============="
31
+ puts
32
+
33
+ rvm_script = File.expand_path("~/.rvm/scripts/rvm")
34
+
35
+ # a bit hackish - source rvm as described here
36
+ # https://rvm.beginrescueend.com/workflow/scripting/
37
+ sh <<-BASH
38
+ source #{rvm_script}
39
+ export BUNDLE_GEMFILE=test/config/Gemfile.rails-#{rails}
40
+ rvm #{rvm}
41
+ bundle install
42
+ rake test
43
+ BASH
44
+ end
45
+
46
+ desc 'Run the tests in all combinations described in test-matrix.yml'
47
+ task :matrix do
48
+ # a la travis
49
+ require 'yaml'
50
+ data = YAML.load(IO.read(File.dirname(__FILE__) + '/test-matrix.yml'))
51
+ data['rvm'].each do |rvm|
52
+ data['rails'].each do |rails|
53
+ data['database'].each do |database|
54
+ run_tests(rvm, rails, database)
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
60
+
61
+ task :default => :test
62
+
63
+ desc 'Test the ETL application.'
64
+ Rake::TestTask.new(:test) do |t|
65
+ t.libs << 'lib' << '.'
66
+ t.pattern = 'test/**/*_test.rb'
67
+ t.verbose = true
68
+ # TODO: reset the database
69
+ end
70
+
71
+ namespace :rcov do
72
+ desc 'Measures test coverage'
73
+ task :test do
74
+ rm_f 'coverage.data'
75
+ mkdir 'coverage' unless File.exist?('coverage')
76
+ rcov = "rcov --aggregate coverage.data --text-summary -Ilib"
77
+ system("#{rcov} test/*_test.rb")
78
+ # system("open coverage/index.html") if PLATFORM['darwin']
79
+ end
80
+ end
81
+
82
+ desc "Generate code statistics"
83
+ task :lines do
84
+ lines, codelines, total_lines, total_codelines = 0, 0, 0, 0
85
+
86
+ for file_name in FileList["lib/**/*.rb"]
87
+ next if file_name =~ /vendor/
88
+ f = File.open(file_name)
89
+
90
+ while line = f.gets
91
+ lines += 1
92
+ next if line =~ /^\s*$/
93
+ next if line =~ /^\s*#/
94
+ codelines += 1
95
+ end
96
+ puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"
97
+
98
+ total_lines += lines
99
+ total_codelines += codelines
100
+
101
+ lines, codelines = 0, 0
102
+ end
103
+
104
+ puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
105
+ end