activewarehouse-etl-sgonyea 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data/.gitignore +9 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +236 -0
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +13 -0
  6. data/LICENSE +7 -0
  7. data/README.textile +111 -0
  8. data/Rakefile +103 -0
  9. data/TODO +28 -0
  10. data/active_support_logger.patch +78 -0
  11. data/activewarehouse-etl.gemspec +36 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/test-matrix.yml +10 -0
  111. data/test/.gitignore +1 -0
  112. data/test/.ignore +2 -0
  113. data/test/all.ebf +6 -0
  114. data/test/apache_combined_log.ctl +11 -0
  115. data/test/batch_test.rb +41 -0
  116. data/test/batch_with_error.ebf +6 -0
  117. data/test/batched1.ctl +0 -0
  118. data/test/batched2.ctl +0 -0
  119. data/test/block_processor.ctl +6 -0
  120. data/test/block_processor_error.ctl +1 -0
  121. data/test/block_processor_pre_post_process.ctl +4 -0
  122. data/test/block_processor_remove_rows.ctl +5 -0
  123. data/test/block_processor_test.rb +38 -0
  124. data/test/check_exist_processor_test.rb +92 -0
  125. data/test/check_unique_processor_test.rb +40 -0
  126. data/test/config/Gemfile.rails-2.3.x +3 -0
  127. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  128. data/test/config/Gemfile.rails-3.0.x +3 -0
  129. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  130. data/test/config/common.rb +29 -0
  131. data/test/connection/mysql/connection.rb +9 -0
  132. data/test/connection/mysql/schema.sql +37 -0
  133. data/test/connection/postgresql/connection.rb +13 -0
  134. data/test/connection/postgresql/schema.sql +40 -0
  135. data/test/control_test.rb +43 -0
  136. data/test/data/apache_combined_log.txt +3 -0
  137. data/test/data/bulk_import.txt +3 -0
  138. data/test/data/bulk_import_with_empties.txt +3 -0
  139. data/test/data/decode.txt +3 -0
  140. data/test/data/delimited.txt +3 -0
  141. data/test/data/encode_source_latin1.txt +2 -0
  142. data/test/data/excel.xls +0 -0
  143. data/test/data/excel2.xls +0 -0
  144. data/test/data/fixed_width.txt +3 -0
  145. data/test/data/multiple_delimited_1.txt +3 -0
  146. data/test/data/multiple_delimited_2.txt +3 -0
  147. data/test/data/nokogiri.xml +38 -0
  148. data/test/data/people.txt +3 -0
  149. data/test/data/sax.xml +14 -0
  150. data/test/data/xml.xml +16 -0
  151. data/test/database_join_processor_test.rb +43 -0
  152. data/test/date_dimension_builder_test.rb +96 -0
  153. data/test/delimited.ctl +30 -0
  154. data/test/delimited_absolute.ctl +31 -0
  155. data/test/delimited_destination_db.ctl +23 -0
  156. data/test/delimited_excel.ctl +31 -0
  157. data/test/delimited_insert_update.ctl +34 -0
  158. data/test/delimited_update.ctl +34 -0
  159. data/test/delimited_with_bulk_load.ctl +34 -0
  160. data/test/destination_test.rb +275 -0
  161. data/test/directive_test.rb +23 -0
  162. data/test/encode_processor_test.rb +32 -0
  163. data/test/engine_test.rb +78 -0
  164. data/test/ensure_fields_presence_processor_test.rb +28 -0
  165. data/test/errors.ctl +24 -0
  166. data/test/etl_test.rb +42 -0
  167. data/test/excel.ctl +24 -0
  168. data/test/excel2.ctl +25 -0
  169. data/test/fixed_width.ctl +35 -0
  170. data/test/foreign_key_lookup_transform_test.rb +50 -0
  171. data/test/generator_test.rb +14 -0
  172. data/test/inline_parser.ctl +17 -0
  173. data/test/mocks/mock_destination.rb +26 -0
  174. data/test/mocks/mock_source.rb +25 -0
  175. data/test/model_source.ctl +14 -0
  176. data/test/multiple_delimited.ctl +22 -0
  177. data/test/multiple_source_delimited.ctl +39 -0
  178. data/test/nokogiri_all.ctl +35 -0
  179. data/test/nokogiri_select.ctl +35 -0
  180. data/test/nokogiri_test.rb +35 -0
  181. data/test/parser_test.rb +224 -0
  182. data/test/performance/delimited.ctl +30 -0
  183. data/test/processor_test.rb +44 -0
  184. data/test/row_processor_test.rb +17 -0
  185. data/test/sax.ctl +26 -0
  186. data/test/scd/1.txt +1 -0
  187. data/test/scd/2.txt +1 -0
  188. data/test/scd/3.txt +1 -0
  189. data/test/scd_test.rb +257 -0
  190. data/test/scd_test_type_1.ctl +43 -0
  191. data/test/scd_test_type_2.ctl +34 -0
  192. data/test/screen_test.rb +9 -0
  193. data/test/screen_test_error.ctl +3 -0
  194. data/test/screen_test_fatal.ctl +3 -0
  195. data/test/source_test.rb +154 -0
  196. data/test/test_helper.rb +37 -0
  197. data/test/transform_test.rb +101 -0
  198. data/test/truncate_processor_test.rb +37 -0
  199. data/test/xml.ctl +31 -0
  200. metadata +370 -0
@@ -0,0 +1,35 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
+ # Test the Nokogiri XML parser
4
+ class ParserTest < Test::Unit::TestCase
5
+
6
+ # Test the DOM-based Nokogiri XML parser when all nodes are read.
7
+ def test_nokogiri_xml_parser_for_all_nodes
8
+ control = ETL::Control::Control.resolve(
9
+ File.dirname(__FILE__) + '/nokogiri_all.ctl')
10
+ parser = ETL::Parser::NokogiriXmlParser.new(control.sources.first)
11
+ rows = parser.collect { |row| row }
12
+ assert_equal 3, rows.length
13
+ assert_equal(
14
+ { :hair_colour=>"black",
15
+ :first_name=>"Bob",
16
+ :last_name=>"Smith",
17
+ :ssn=>"123456789", :age=>"24"}, rows.first)
18
+ end
19
+
20
+ # Test the DOM-based Nokogiri XML parser when only selected nodes are read.
21
+ def test_nokogiri_xml_parser_for_selected_nodes
22
+ control = ETL::Control::Control.resolve(
23
+ File.dirname(__FILE__) + '/nokogiri_select.ctl')
24
+ parser = ETL::Parser::NokogiriXmlParser.new(control.sources.first)
25
+ rows = parser.collect { |row| row }
26
+ assert_equal 2, rows.length
27
+ assert_equal(
28
+ { :age=>"37",
29
+ :hair_colour=>"black",
30
+ :first_name=>"Jake",
31
+ :last_name=>"Smithsonian",
32
+ :ssn=>"133244566"}, rows.last)
33
+ end
34
+
35
+ end
@@ -0,0 +1,224 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
+ # Test the flat text parsers
4
+ class ParserTest < Test::Unit::TestCase
5
+ # Test parsing delimited data
6
+ def test_csv_parser
7
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/delimited.ctl')
8
+ parser = ETL::Parser::CsvParser.new(control.sources.first)
9
+ rows = parser.collect { |row| row }
10
+ assert_equal 3, rows.length
11
+ assert_equal({:first_name=>"Chris", :last_name=>"Smith", :ssn=>"111223333", :age=>"24", :sex => 'M'}, rows.first)
12
+ end
13
+
14
+ # Test parsing fixed-width data
15
+ def test_fixed_width_parser
16
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/fixed_width.ctl')
17
+ parser = ETL::Parser::FixedWidthParser.new(control.sources.first)
18
+ rows = parser.collect { |row| row }
19
+ assert_equal 3, rows.length
20
+ assert_equal({:first_name=>"Bob", :last_name=>"Smith", :ssn=>"123445555", :age=>"23"}, rows.first)
21
+ end
22
+
23
+ # Test the DOM-based XML parser. Note that the DOM parser is slow and should
24
+ # probably be removed.
25
+ def test_xml_parser
26
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/xml.ctl')
27
+ parser = ETL::Parser::XmlParser.new(control.sources.first)
28
+ rows = parser.collect { |row| row }
29
+ assert_equal 2, rows.length
30
+ assert_equal({:first_name=>"Bob", :last_name=>"Smith", :ssn=>"123456789", :age=>"24"}, rows.first)
31
+ end
32
+
33
+ # Test an inline parser
34
+ def test_inline_parser
35
+ ETL::Engine.process(File.dirname(__FILE__) + '/inline_parser.ctl')
36
+ lines = open(File.dirname(__FILE__) + '/output/inline_parser.txt').readlines
37
+ assert_equal 3, lines.length
38
+ end
39
+
40
+ # Test the SAX parser (preferred for XML parsing)
41
+ def test_sax_parser
42
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/sax.ctl')
43
+ parser = control.sources.first.parser
44
+ rows = parser.collect { |row| row }
45
+ assert_equal 2, rows.length
46
+ assert_equal({:first_name=>"Bob", :last_name=>"Smith", :ssn=>"123456789", :age=>"24"}, rows.first)
47
+ end
48
+
49
+ # Test the Excel parser
50
+ def test_excel_parser
51
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/excel.ctl')
52
+ parser = control.sources.first.parser
53
+ rows = parser.collect { |row| row }
54
+ assert_equal 2, rows.length
55
+ assert_equal({:first_name=>"Bob", :last_name=>"Smith", :ssn=>123456789, :age=>24}, rows.first)
56
+ end
57
+
58
+ # Second test of the Excel parser
59
+ def test_excel2_parser
60
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/excel2.ctl')
61
+ parser = control.sources.first.parser
62
+ rows = parser.collect { |row| row }
63
+ assert_equal 2, rows.length
64
+ assert_equal({:first_name=>"John", :last_name=>"Doe", :ssn=>222114545, :age=>31}, rows[1])
65
+ end
66
+
67
+ # Test the Apache combined log format parser
68
+ def test_apache_combined_log_parser
69
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/apache_combined_log.ctl')
70
+ parser = ETL::Parser::ApacheCombinedLogParser.new(control.sources.first)
71
+ # first test the parse method
72
+ line = %Q(127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)")
73
+ fields = parser.parse(line)
74
+ assert_equal '127.0.0.1', fields[:ip_address]
75
+ assert_equal 'frank', fields[:user]
76
+ assert_equal Time.mktime(2000, 10, 10, 13, 55, 36), fields[:timestamp]
77
+ assert_equal 'GET /apache_pb.gif HTTP/1.0', fields[:request]
78
+ assert_equal '200', fields[:response_code]
79
+ assert_equal '2326', fields[:bytes]
80
+ assert_equal 'http://www.example.com/start.html', fields[:referrer]
81
+ assert_equal 'Mozilla/4.08 [en] (Win98; I ;Nav)', fields[:user_agent]
82
+ # now test the each method
83
+ rows = parser.collect { |row| row }
84
+ assert_equal 3, rows.length
85
+ assert_equal({
86
+ :user_agent=>"Mozilla/4.08 [en] (Win98; I ;Nav)",
87
+ :browser_version_minor=>nil,
88
+ :timestamp=>Time.mktime(2000, 10, 10, 13, 55, 36),
89
+ :ostype=>"Windows",
90
+ :request=>"GET /apache_pb.gif HTTP/1.0",
91
+ :referrer_host=>"www.example.com",
92
+ :referrer_domain=>"example.com",
93
+ :os=>"Win98",
94
+ :referrer_uri_path=>"/start.html",
95
+ :method=>"GET",
96
+ :response_code=>"200",
97
+ :referrer_port=>80,
98
+ :os_version=>nil,
99
+ :ip_address=>"127.0.0.1",
100
+ :referrer_scheme=>"http",
101
+ :bytes=>"2326",
102
+ :browser=>"Mozilla",
103
+ :identd=>nil,
104
+ :path=>"/apache_pb.gif",
105
+ :referrer=>"http://www.example.com/start.html",
106
+ :browser_version_major=>"4",
107
+ :user=>"frank"}, rows.first, 'Failed on first row')
108
+ assert_equal({
109
+ :user_agent=>"Mozilla/4.08 [en] (Win98; I ;Nav)",
110
+ :referrer_port=>80,
111
+ :timestamp=>Time.mktime(2000, 10, 11, 5, 22, 2),
112
+ :browser_version_minor=>nil,
113
+ :os_version=>nil,
114
+ :request=>"GET /apache_pb.gif HTTP/1.1",
115
+ :ostype=>"Windows",
116
+ :referrer_scheme=>"http",
117
+ :response_code=>"200",
118
+ :referrer_host=>"www.foo.com",
119
+ :referrer_domain=>"foo.com",
120
+ :ip_address=>"127.0.0.1",
121
+ :browser=>"Mozilla",
122
+ :bytes=>"2326",
123
+ :os=>"Win98",
124
+ :identd=>nil,
125
+ :browser_version_major=>"4",
126
+ :referrer=>"http://www.foo.com/",
127
+ :referrer_uri_path=>"/",
128
+ :method=>"GET",
129
+ :path=>"/apache_pb.gif",
130
+ :user=>"bob"}, rows[1], 'Failed on second row')
131
+ assert_equal({
132
+ :browser_version_major=>"4",
133
+ :browser_version_minor=>nil,
134
+ :referrer_port=>nil,
135
+ :request=>"GET /apache_pb.gif HTTP/1.1",
136
+ :ostype=>"Windows",
137
+ :os_version=>nil,
138
+ :response_code=>"200",
139
+ :referrer_host=>nil,
140
+ :referrer_scheme=>nil,
141
+ :bytes=>"2326",
142
+ :ip_address=>"127.0.0.1",
143
+ :browser=>"Mozilla",
144
+ :referrer=>nil,
145
+ :os=>"Win98",
146
+ :user=>"bob",
147
+ :user_agent=>"Mozilla/4.08 [en] (Win98; I ;Nav)",
148
+ :identd=>nil,
149
+ :referrer_uri_path=>nil,
150
+ :path=>"/apache_pb.gif",
151
+ :method=>"GET",
152
+ :timestamp=>Time.mktime(2000, 10, 11, 5, 52, 31)}, rows[2], 'Failed on third row')
153
+ end
154
+
155
+ # Test the user agent parser
156
+ def test_user_agent_parser
157
+ agents = <<-AGENTS
158
+ Mozilla/4.7 [en] (WinNT; U)
159
+ Mozilla/4.0 (compatible; MSIE 5.01; Windows NT)
160
+ Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461; .NET CLR 1.1.4322)
161
+ Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 4.0) Opera 5.11 [en]
162
+ Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.0.2) Gecko/20030208 Netscape/7.02
163
+ Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040612 Firefox/0.8
164
+ Mozilla/5.0 (compatible; Konqueror/3.2; Linux) (KHTML, like Gecko)
165
+ Lynx/2.8.4rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.6h
166
+ AGENTS
167
+ agents = agents.split("\n").collect { |s| s.strip }
168
+
169
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/apache_combined_log.ctl')
170
+ parser = ETL::Parser::ApacheCombinedLogParser.new(control.sources.first)
171
+ rows = parser.collect { |row| row }
172
+
173
+ assert_equal({:browser_version_major=>"4",
174
+ :browser_version_minor=>nil,
175
+ :ostype=>"Windows",
176
+ :os=>"WinNT",
177
+ :os_version=>nil,
178
+ :browser=>"Mozilla"}, parser.parse_user_agent(agents[0]), 'Agent 0 invalid'
179
+ )
180
+ assert_equal({:browser_version_major=>"5",
181
+ :browser_version_minor=>"01",
182
+ :ostype=>"Windows",
183
+ :os=>"Windows NT",
184
+ :os_version=>nil,
185
+ :browser=>"MSIE"}, parser.parse_user_agent(agents[1]), 'Agent 1 invalid'
186
+ )
187
+ assert_equal({:browser_version_major=>"6",
188
+ :browser_version_minor=>"0",
189
+ :ostype=>"Windows",
190
+ :os=>"Windows NT 5.0",
191
+ :os_version=>"5.0",
192
+ :browser=>"MSIE"}, parser.parse_user_agent(agents[2]), 'Agent 2 invalid'
193
+ )
194
+ assert_equal({:browser_version_major=>"5",
195
+ :browser_version_minor=>"0",
196
+ :ostype=>"Windows",
197
+ :os=>"Windows NT 4.0",
198
+ :os_version=>"4.0",
199
+ :browser=>"MSIE"}, parser.parse_user_agent(agents[3]), 'Agent 3 invalid'
200
+ )
201
+ assert_equal({:browser_version_major=>"7",
202
+ :browser_version_minor=>nil,
203
+ :ostype=>"Windows",
204
+ :os=>"Windows NT 5.0",
205
+ :os_version=>"5.0",
206
+ :browser=>"Netscape"}, parser.parse_user_agent(agents[4]), 'Agent 4 invalid'
207
+ )
208
+ assert_equal({:browser_version_major=>"0.8",
209
+ :browser_version_minor=>nil,
210
+ :ostype=>"Linux",
211
+ :os=>"Linux i686",
212
+ :os_version=>nil,
213
+ :browser=>"Firefox"}, parser.parse_user_agent(agents[5]), 'Agent 5 invalid'
214
+ )
215
+ # test fails here
216
+ # assert_equal({:browser_version_major=>"6",
217
+ # :browser_version_minor=>nil,
218
+ # :ostype=>"Linux",
219
+ # :os=>"Linux",
220
+ # :os_version=>nil,
221
+ # :browser=>"Konquerer"}, parser.parse_user_agent(agents[6]), 'Agent 6 invalid'
222
+ # )
223
+ end
224
+ end
@@ -0,0 +1,30 @@
1
+ # puts "executing delimited.ctl"
2
+
3
+ source :in, {
4
+ :file => 'delimited.txt',
5
+ :parser => :csv
6
+ },
7
+ [
8
+ :first_name,
9
+ :last_name,
10
+ :ssn,
11
+ {
12
+ :name => :age,
13
+ :type => :integer
14
+ },
15
+ :sex
16
+ ]
17
+
18
+ transform :ssn, :sha1
19
+ transform(:ssn){ |v| v[0,24] }
20
+ transform :sex, :decode, {:decode_table_path => 'delimited_decode.txt'}
21
+
22
+ destination :out, {
23
+ :file => 'delimited.out.txt'
24
+ },
25
+ {
26
+ :order => [:first_name, :last_name, :name, :ssn, :age, :sex],
27
+ :virtual => {
28
+ :name => Proc.new { |row| "#{row[:first_name]} #{row[:last_name]}" }
29
+ }
30
+ }
@@ -0,0 +1,44 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
+ class Person < ActiveRecord::Base
4
+ end
5
+
6
+ # Test pre- and post-processors
7
+ class ProcessorTest < Test::Unit::TestCase
8
+ # Test bulk import functionality
9
+
10
+ context "the bulk import processor" do
11
+ should "should import successfully" do
12
+ assert_nothing_raised { do_bulk_import }
13
+ assert_equal 3, Person.count
14
+ assert_equal "Foxworthy", Person.find(2).last_name
15
+ end
16
+ end
17
+
18
+ def test_bulk_import_with_empties
19
+ # this test ensures that a column with an empty value will still allow
20
+ # the row to be imported
21
+ # this doesn't apply to the id column though - untested
22
+ assert_nothing_raised { do_bulk_import('bulk_import_with_empties.txt') }
23
+ assert_equal 3, Person.count
24
+ assert Person.find(2).last_name.blank?
25
+ end
26
+
27
+ def test_truncate
28
+ # TODO: implement test
29
+ end
30
+
31
+ private
32
+
33
+ def do_bulk_import(file = 'bulk_import.txt')
34
+ control = ETL::Control::Control.new(File.join(File.dirname(__FILE__), 'delimited.ctl'))
35
+ configuration = {
36
+ :file => "data/#{file}",
37
+ :truncate => true,
38
+ :target => :data_warehouse,
39
+ :table => 'people'
40
+ }
41
+ processor = ETL::Processor::BulkImportProcessor.new(control, configuration)
42
+ processor.process
43
+ end
44
+ end
@@ -0,0 +1,17 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
+ # Test row processors
4
+ class RowProcessorTest < Test::Unit::TestCase
5
+ def test_copy_field_processor
6
+
7
+ end
8
+ def test_hierarchy_exploder_processor
9
+
10
+ end
11
+ def test_rename_processor
12
+
13
+ end
14
+ def test_sequence_processor
15
+
16
+ end
17
+ end
@@ -0,0 +1,26 @@
1
+ # puts "executing sax.ctl"
2
+
3
+ source :in, {
4
+ :file => 'data/sax.xml',
5
+ :parser => :sax
6
+ },
7
+ {
8
+ :write_trigger => 'people/person',
9
+ :fields => {
10
+ :first_name => 'people/person/first_name',
11
+ :last_name => 'people/person/last_name',
12
+ :ssn => 'people/person/social_security_number',
13
+ :age => 'people/person[age]'
14
+ }
15
+ }
16
+
17
+ transform :ssn, :sha1
18
+ transform(:ssn){ |v| v[0,24] }
19
+ transform :age, :type, {:type => :number}
20
+
21
+ destination :out, {
22
+ :file => 'output/sax.out.txt'
23
+ },
24
+ {
25
+ :order => [:first_name, :last_name, :ssn, :age]
26
+ }
@@ -0,0 +1 @@
1
+ Bob,Smith,200 South Drive,Boston,MA,32123
@@ -0,0 +1 @@
1
+ Bob,Smith,1010 SW 23rd St,Los Angeles,CA,90392
@@ -0,0 +1 @@
1
+ Bob,Smith,280 Pine Street,Los Angeles,CA,90392
@@ -0,0 +1,257 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+
3
+ class ScdTest < Test::Unit::TestCase
4
+ context "when working with a slowly changing dimension" do
5
+ setup do
6
+ @connection = ETL::Engine.connection(:data_warehouse)
7
+ @connection.delete("DELETE FROM person_dimension")
8
+ @end_of_time = DateTime.parse('9999-12-31 00:00:00')
9
+ end
10
+ context "of type 1" do
11
+ context "on run 1" do
12
+ setup do
13
+ do_type_1_run(1)
14
+ end
15
+ should "insert record" do
16
+ assert_equal 1, count_bobs
17
+ end
18
+ should "set the original address" do
19
+ assert_boston_address(find_bobs.first)
20
+ end
21
+ should "set the original id" do
22
+ assert_equal 1, find_bobs.first.id
23
+ end
24
+ should "skip the load if there is no change" do
25
+ do_type_1_run(1)
26
+ lines = lines_for('scd_test_type_1.txt')
27
+ assert lines.empty?, "scheduled load expected to be empty, was #{lines.size} records"
28
+ end
29
+ end
30
+ context "on run 2" do
31
+ setup do
32
+ do_type_1_run(1)
33
+ do_type_1_run(2)
34
+ end
35
+ should "delete the old record" do
36
+ assert_equal 1, count_bobs, "new record created, but old not deleted: #{find_bobs.inspect}"
37
+ end
38
+ should "update the address" do
39
+ assert_los_angeles_address(find_bobs.last)
40
+ end
41
+ should "keep id" do
42
+ assert_equal 1, find_bobs.first.id
43
+ end
44
+ should "only change once even if run again" do
45
+ do_type_1_run(2)
46
+ assert_equal 1, count_bobs
47
+ lines = lines_for('scd_test_type_1.txt')
48
+ assert lines.empty?, "scheduled load expected to be empty, was #{lines.size} records"
49
+ end
50
+ should "revert address on new record" do
51
+ do_type_1_run(1)
52
+ assert_boston_address(find_bobs.first)
53
+ end
54
+ should "keep record on revert" do
55
+ do_type_1_run(1)
56
+ assert_equal 1, count_bobs
57
+ end
58
+ end
59
+ end
60
+ context "of type 2" do
61
+ context "on run 1" do
62
+ setup do
63
+ do_type_2_run(1)
64
+ end
65
+ should "insert record" do
66
+ assert_equal 1, count_bobs
67
+ end
68
+ should "set the original record" do
69
+ assert_boston_address(find_bobs.first)
70
+ end
71
+ should "set the original id" do
72
+ assert_equal 1, find_bobs.first.id
73
+ end
74
+ should "set the effective date" do
75
+ # doing comparison on strings, as comparison on objects
76
+ # doesn't consider things equal for some yet to be understood
77
+ # reason
78
+ assert_equal current_datetime.to_s, find_bobs.first.effective_date.to_s
79
+ end
80
+ should "set the end date" do
81
+ assert_equal @end_of_time, find_bobs.first.end_date
82
+ end
83
+ should "set the latest version flag" do
84
+ assert find_bobs.first.latest_version?
85
+ end
86
+ should "skip the load if there is no change" do
87
+ do_type_2_run(1)
88
+ assert_equal 1, find_bobs.last.id, "scheduled load expected to be empty"
89
+ end
90
+
91
+ end
92
+ context "on run 2" do
93
+ setup do
94
+ do_type_2_run(1)
95
+ do_type_2_run(2)
96
+ end
97
+ should "insert new record" do
98
+ assert_equal 2, count_bobs
99
+ end
100
+ should "keep the primary key of the original version" do
101
+ assert_not_nil find_bobs.detect { |bob| 1 == bob.id }
102
+ end
103
+ should "increment the primary key for the new version" do
104
+ assert_not_nil find_bobs.detect { |bob| 2 == bob.id }
105
+ end
106
+ should "expire the old record" do
107
+ original_bob = find_bobs.detect { |bob| 1 == bob.id }
108
+ new_bob = find_bobs.detect { |bob| 2 == bob.id }
109
+ assert_equal new_bob.effective_date, original_bob.end_date
110
+ end
111
+ should "keep the address for the expired record" do
112
+ assert_boston_address(find_bobs.detect { |bob| 1 == bob.id })
113
+ end
114
+ should "update the address on the new record" do
115
+ assert_los_angeles_address(find_bobs.detect { |bob| 2 == bob.id })
116
+ end
117
+ should "activate the new record" do
118
+ # doing comparison on strings, as comparison on objects
119
+ # doesn't consider things equal for some yet to be understood
120
+ # reason
121
+ assert_equal current_datetime.to_s, find_bobs.detect { |bob| 2 == bob.id }.effective_date.to_s
122
+ end
123
+ should "set the end date for the new record" do
124
+ assert_equal @end_of_time, find_bobs.detect { |bob| 2 == bob.id }.end_date
125
+ end
126
+ should "shift the latest version" do
127
+ original_bob = find_bobs.detect { |bob| 1 == bob.id }
128
+ new_bob = find_bobs.detect { |bob| 2 == bob.id }
129
+ assert !original_bob.latest_version?
130
+ assert new_bob.latest_version?
131
+ end
132
+ should "only execute a change once" do
133
+ do_type_2_run(2)
134
+ assert_equal 2, count_bobs, "scheduled load expected to be empty"
135
+ end
136
+ should "insert new records on revert" do
137
+ do_type_2_run(1)
138
+ assert_equal 3, count_bobs
139
+ end
140
+ should "update address on new record on revert" do
141
+ do_type_2_run(1)
142
+ assert_boston_address(find_bobs.detect { |bob| 3 == bob.id })
143
+ end
144
+ should "only delete one row on an scd change" do
145
+ # Two records right now
146
+ assert_equal 2, count_bobs
147
+ do_type_2_run(1) # put third version in (same as first version, but that's irrelevant)
148
+ # was failing because first and second versions were being deleted.
149
+ assert_equal 3, count_bobs
150
+ end
151
+ end
152
+ context "on non sdc fields that change" do
153
+ setup do
154
+ do_type_2_run_with_only_city_state_zip_scd(1)
155
+ do_type_2_run_with_only_city_state_zip_scd(2)
156
+ end
157
+ should "not create an extra record" do
158
+ do_type_2_run_with_only_city_state_zip_scd(3)
159
+ assert_equal 2, count_bobs
160
+ end
161
+ should "keep id" do
162
+ do_type_2_run_with_only_city_state_zip_scd(3)
163
+ assert_not_nil find_bobs.detect { |bob| 2 == bob.id }
164
+ end
165
+ should "keep dates" do
166
+ old_bob = find_bobs.detect { |bob| 2 == bob.id }
167
+ do_type_2_run_with_only_city_state_zip_scd(3)
168
+ new_bob = find_bobs.detect { |bob| 2 == bob.id }
169
+ assert_equal old_bob.end_date, new_bob.end_date
170
+ assert_equal old_bob.effective_date, new_bob.effective_date
171
+ end
172
+ should "keep the latest version flag" do
173
+ do_type_2_run_with_only_city_state_zip_scd(3)
174
+ assert find_bobs.detect { |bob| 2 == bob.id }.latest_version?
175
+ end
176
+ should "treat non scd fields like type 1 fields" do
177
+ do_type_2_run_with_only_city_state_zip_scd(3)
178
+ assert_los_angeles_address(find_bobs.detect { |bob| 2 == bob.id }, "280 Pine Street")
179
+ end
180
+ should "skip load when there is no change" do
181
+ do_type_2_run_with_only_city_state_zip_scd(2)
182
+ assert_equal 2, count_bobs, "scheduled load expected to be empty"
183
+ end
184
+ end
185
+ end
186
+ end
187
+
188
+ def do_type_2_run(run_num)
189
+ ENV['run_number'] = run_num.to_s
190
+ assert_nothing_raised do
191
+ run_ctl_file("scd_test_type_2.ctl")
192
+ end
193
+ end
194
+
195
+ def do_type_2_run_with_only_city_state_zip_scd(run_num)
196
+ ENV['type_2_scd_fields'] = Marshal.dump([:city, :state, :zip_code])
197
+ do_type_2_run(run_num)
198
+ end
199
+
200
+ def do_type_1_run(run_num)
201
+ ENV['run_number'] = run_num.to_s
202
+ assert_nothing_raised do
203
+ run_ctl_file("scd_test_type_1.ctl")
204
+ end
205
+ end
206
+
207
+ def lines_for(file)
208
+ File.readlines(File.dirname(__FILE__) + "/output/#{file}")
209
+ end
210
+
211
+ def run_ctl_file(file)
212
+ ETL::Engine.process(File.dirname(__FILE__) + "/#{file}")
213
+ end
214
+
215
+ def count_bobs
216
+ @connection.select_value(
217
+ "SELECT count(*) FROM person_dimension WHERE first_name = 'Bob' and last_name = 'Smith'").to_i
218
+ end
219
+
220
+ def find_bobs
221
+ bobs = @connection.select_all(
222
+ "SELECT * FROM person_dimension WHERE first_name = 'Bob' and last_name = 'Smith'")
223
+ bobs.each do |bob|
224
+ def bob.id
225
+ self["id"].to_i
226
+ end
227
+ def bob.effective_date
228
+ DateTime.parse(self["effective_date"])
229
+ end
230
+ def bob.end_date
231
+ DateTime.parse(self["end_date"])
232
+ end
233
+ def bob.latest_version?
234
+ ActiveRecord::ConnectionAdapters::Column.value_to_boolean(self["latest_version"])
235
+ end
236
+ end
237
+ bobs
238
+ end
239
+
240
+ def current_datetime
241
+ DateTime.parse(Time.now.to_s(:db))
242
+ end
243
+
244
+ def assert_boston_address(bob, street = "200 South Drive")
245
+ assert_equal street, bob['address'], bob.inspect
246
+ assert_equal "Boston", bob['city'], bob.inspect
247
+ assert_equal "MA", bob['state'], bob.inspect
248
+ assert_equal "32123", bob['zip_code'], bob.inspect
249
+ end
250
+
251
+ def assert_los_angeles_address(bob, street = "1010 SW 23rd St")
252
+ assert_equal street, bob['address'], bob.inspect
253
+ assert_equal "Los Angeles", bob['city'], bob.inspect
254
+ assert_equal "CA", bob['state'], bob.inspect
255
+ assert_equal "90392", bob['zip_code'], bob.inspect
256
+ end
257
+ end