factorylabs-activewarehouse-etl 0.9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/CHANGELOG +198 -0
  2. data/LICENSE +7 -0
  3. data/README +85 -0
  4. data/Rakefile +153 -0
  5. data/TODO +28 -0
  6. data/bin/etl +28 -0
  7. data/bin/etl.cmd +8 -0
  8. data/examples/database.example.yml +16 -0
  9. data/lib/etl.rb +78 -0
  10. data/lib/etl/batch.rb +2 -0
  11. data/lib/etl/batch/batch.rb +111 -0
  12. data/lib/etl/batch/directives.rb +55 -0
  13. data/lib/etl/builder.rb +2 -0
  14. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  15. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  16. data/lib/etl/commands/etl.rb +89 -0
  17. data/lib/etl/control.rb +3 -0
  18. data/lib/etl/control/control.rb +405 -0
  19. data/lib/etl/control/destination.rb +420 -0
  20. data/lib/etl/control/destination/database_destination.rb +95 -0
  21. data/lib/etl/control/destination/file_destination.rb +124 -0
  22. data/lib/etl/control/source.rb +109 -0
  23. data/lib/etl/control/source/database_source.rb +220 -0
  24. data/lib/etl/control/source/enumerable_source.rb +11 -0
  25. data/lib/etl/control/source/file_source.rb +90 -0
  26. data/lib/etl/control/source/model_source.rb +39 -0
  27. data/lib/etl/core_ext.rb +1 -0
  28. data/lib/etl/core_ext/time.rb +5 -0
  29. data/lib/etl/core_ext/time/calculations.rb +42 -0
  30. data/lib/etl/engine.rb +556 -0
  31. data/lib/etl/execution.rb +20 -0
  32. data/lib/etl/execution/base.rb +9 -0
  33. data/lib/etl/execution/batch.rb +8 -0
  34. data/lib/etl/execution/job.rb +8 -0
  35. data/lib/etl/execution/migration.rb +85 -0
  36. data/lib/etl/execution/record.rb +18 -0
  37. data/lib/etl/generator.rb +2 -0
  38. data/lib/etl/generator/generator.rb +20 -0
  39. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  40. data/lib/etl/http_tools.rb +139 -0
  41. data/lib/etl/parser.rb +11 -0
  42. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  43. data/lib/etl/parser/delimited_parser.rb +74 -0
  44. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  45. data/lib/etl/parser/parser.rb +41 -0
  46. data/lib/etl/parser/sax_parser.rb +218 -0
  47. data/lib/etl/parser/xml_parser.rb +65 -0
  48. data/lib/etl/processor.rb +11 -0
  49. data/lib/etl/processor/block_processor.rb +14 -0
  50. data/lib/etl/processor/bulk_import_processor.rb +81 -0
  51. data/lib/etl/processor/check_exist_processor.rb +80 -0
  52. data/lib/etl/processor/check_unique_processor.rb +35 -0
  53. data/lib/etl/processor/copy_field_processor.rb +26 -0
  54. data/lib/etl/processor/encode_processor.rb +55 -0
  55. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  56. data/lib/etl/processor/print_row_processor.rb +12 -0
  57. data/lib/etl/processor/processor.rb +25 -0
  58. data/lib/etl/processor/rename_processor.rb +24 -0
  59. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  60. data/lib/etl/processor/row_processor.rb +17 -0
  61. data/lib/etl/processor/sequence_processor.rb +23 -0
  62. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  63. data/lib/etl/processor/truncate_processor.rb +35 -0
  64. data/lib/etl/row.rb +20 -0
  65. data/lib/etl/screen.rb +14 -0
  66. data/lib/etl/screen/row_count_screen.rb +20 -0
  67. data/lib/etl/transform.rb +2 -0
  68. data/lib/etl/transform/block_transform.rb +13 -0
  69. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  70. data/lib/etl/transform/decode_transform.rb +51 -0
  71. data/lib/etl/transform/default_transform.rb +20 -0
  72. data/lib/etl/transform/foreign_key_lookup_transform.rb +151 -0
  73. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  74. data/lib/etl/transform/ordinalize_transform.rb +12 -0
  75. data/lib/etl/transform/sha1_transform.rb +13 -0
  76. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  77. data/lib/etl/transform/string_to_datetime_transform.rb +14 -0
  78. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  79. data/lib/etl/transform/transform.rb +61 -0
  80. data/lib/etl/transform/trim_transform.rb +26 -0
  81. data/lib/etl/transform/type_transform.rb +35 -0
  82. data/lib/etl/util.rb +59 -0
  83. data/lib/etl/version.rb +9 -0
  84. metadata +195 -0
@@ -0,0 +1,14 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform a String representation of a date to a DateTime instance
4
+ class StringToDateTimeTransform < ETL::Transform::Transform
5
+ # Transform the value using DateTime.parse.
6
+ #
7
+ # WARNING: This transform is slow (due to the Ruby implementation), but use it if you need to
8
+ # parse timestamps before or after the values supported by Time.parse.
9
+ def transform(name, value, row)
10
+ DateTime.parse(value) unless value.nil?
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,11 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform a String representation of a date to a Time instance
4
+ class StringToTimeTransform < ETL::Transform::Transform
5
+ # Transform the value using Time.parse
6
+ def transform(name, value, row)
7
+ Time.parse(value) unless value.nil?
8
+ end
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,61 @@
1
+ module ETL#:nodoc:
2
+ module Transform#:nodoc:
3
+ # Base class for transforms.
4
+ #
5
+ # A transform converts one value to another value using some sort of algorithm.
6
+ #
7
+ # A simple transform has two arguments, the field to transform and the name of the transform:
8
+ #
9
+ # transform :ssn, :sha1
10
+ #
11
+ # Transforms can also be blocks:
12
+ #
13
+ # transform(:ssn){ |v| v[0,24] }
14
+ #
15
+ # Finally, a transform can include a configuration hash:
16
+ #
17
+ # transform :sex, :decode, {:decode_table_path => 'delimited_decode.txt'}
18
+ class Transform
19
+ class << self
20
+ # Transform the specified value using the given transforms. The transforms can either be
21
+ # Proc objects or objects which extend from Transform and implement the method <tt>transform(name, value, row)</tt>.
22
+ # Any other object will result in a ControlError being raised.
23
+ def transform(name, value, row, transforms)
24
+ transforms.each do |transform|
25
+ benchmarks[transform.class] ||= 0
26
+ benchmarks[transform.class] += Benchmark.realtime do
27
+ Engine.logger.debug "Transforming field #{name} with #{transform.inspect}"
28
+ case transform
29
+ when Proc
30
+ value = transform.call([name, value, row])
31
+ when Transform
32
+ value = transform.transform(name, value, row)
33
+ else
34
+ raise ControlError, "Unsupported transform configuration type: #{transform}"
35
+ end
36
+ end
37
+ end
38
+ value
39
+ end
40
+
41
+ def benchmarks
42
+ @benchmarks ||= {}
43
+ end
44
+ end
45
+
46
+ attr_reader :control, :name, :configuration
47
+
48
+ # Initialize the transform object with the given control object, field name and
49
+ # configuration hash
50
+ def initialize(control, name, configuration={})
51
+ @control = control
52
+ @name = name
53
+ @configuration = configuration
54
+ end
55
+
56
+ def transform(name, value, row)
57
+ raise "transform is an abstract method"
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,26 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform to trim string
4
+ class TrimTransform < ETL::Transform::Transform
5
+ # Configuration options:
6
+ # * <tt>:type</tt>: :left, :right or :both. Default is :both
7
+ def initialize(control, name, configuration={})
8
+ super
9
+ @type = (configuration[:type] || :both).to_sym
10
+ end
11
+ # Transform the value
12
+ def transform(name, value, row)
13
+ case @type
14
+ when :left
15
+ value.lstrip
16
+ when :right
17
+ value.rstrip
18
+ when :both
19
+ value.strip
20
+ else
21
+ raise "Trim type, if specified, must be :left, :right or :both"
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,35 @@
1
+ module ETL #:nodoc:
2
+ module Transform #:nodoc:
3
+ # Transform from one type to another
4
+ class TypeTransform < ETL::Transform::Transform
5
+ # Initialize the transformer.
6
+ #
7
+ # Configuration options:
8
+ # * <tt>:type</tt>: The type to convert to. Supported types:
9
+ # ** :string
10
+ # ** :number,:integer
11
+ # ** :float
12
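+ # ** :decimal (the <tt>:significant</tt> option, default 0, is passed to BigDecimal as the number of significant digits)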
13
+ def initialize(control, name, configuration={})
14
+ super
15
+ @type = configuration[:type]
16
+ @significant = configuration[:significant] ||= 0
17
+ end
18
+ # Transform the value
19
+ def transform(name, value, row)
20
+ case @type
21
+ when :string
22
+ value.to_s
23
+ when :number, :integer
24
+ value.to_i
25
+ when :float
26
+ value.to_f
27
+ when :decimal
28
+ BigDecimal.new(value.to_s, @significant)
29
+ else
30
+ raise "Unsupported type: #{@type}"
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
data/lib/etl/util.rb ADDED
@@ -0,0 +1,59 @@
1
+ module ETL
2
+ module Util
3
+ # Return the distance of time in words from the given from_time to the specified to_time. If to_time
4
+ # is not specified then Time.now is used. Seconds are always included; this method takes no
5
+ # include_seconds argument.
6
+ def distance_of_time_in_words(from_time, to_time=Time.now)
7
+ from_time = from_time.to_time if from_time.respond_to?(:to_time)
8
+ to_time = to_time.to_time if to_time.respond_to?(:to_time)
9
+ seconds = (to_time - from_time).round
10
+ distance_in_days = (seconds/(60*60*24)).round
11
+ seconds = seconds % (60*60*24)
12
+ distance_in_hours = (seconds/(60*60)).round
13
+ seconds = seconds % (60*60)
14
+ distance_in_minutes = (seconds/60).round
15
+ seconds = seconds % 60
16
+ distance_in_seconds = seconds
17
+
18
+ s = ''
19
+ s << "#{distance_in_days} days," if distance_in_days > 0
20
+ s << "#{distance_in_hours} hours, " if distance_in_hours > 0
21
+ s << "#{distance_in_minutes} minutes, " if distance_in_minutes > 0
22
+ s << "#{distance_in_seconds} seconds"
23
+ s
24
+ end
25
+
26
+ # Get the approximate distance of time in words from the given from_time
27
+ # to the given to_time. If to_time is not specified then it is set
28
+ # to Time.now. By default seconds are included...set the include_seconds
29
+ # argument to false to disable the seconds.
30
+ def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
31
+ from_time = from_time.to_time if from_time.respond_to?(:to_time)
32
+ to_time = to_time.to_time if to_time.respond_to?(:to_time)
33
+ distance_in_minutes = (((to_time - from_time).abs)/60).round
34
+ distance_in_seconds = ((to_time - from_time).abs).round
35
+
36
+ case distance_in_minutes
37
+ when 0..1
38
+ return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
39
+ case distance_in_seconds
40
+ when 0..4 then 'less than 5 seconds'
41
+ when 5..9 then 'less than 10 seconds'
42
+ when 10..19 then 'less than 20 seconds'
43
+ when 20..39 then 'half a minute'
44
+ when 40..59 then 'less than a minute'
45
+ else '1 minute'
46
+ end
47
+ when 2..44 then "#{distance_in_minutes} minutes"
48
+ when 45..89 then 'about 1 hour'
49
+ when 90..1439 then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
50
+ when 1440..2879 then '1 day'
51
+ when 2880..43199 then "#{(distance_in_minutes / 1440).round} days"
52
+ when 43200..86399 then 'about 1 month'
53
+ when 86400..525959 then "#{(distance_in_minutes / 43200).round} months"
54
+ when 525960..1051919 then 'about 1 year'
55
+ else "over #{(distance_in_minutes / 525960).round} years"
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,9 @@
1
+ module ETL#:nodoc:
2
+ module VERSION #:nodoc:
3
+ MAJOR = 0
4
+ MINOR = 9
5
+ TINY = 1
6
+
7
+ STRING = [MAJOR, MINOR, TINY].join('.')
8
+ end
9
+ end
metadata ADDED
@@ -0,0 +1,195 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: factorylabs-activewarehouse-etl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.9.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Anthony Eden
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-08-14 00:00:00 -07:00
13
+ default_executable: etl
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rake
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 0.7.1
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: activesupport
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: 1.3.1
32
+ version:
33
+ - !ruby/object:Gem::Dependency
34
+ name: activerecord
35
+ version_requirement:
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.14.4
41
+ version:
42
+ - !ruby/object:Gem::Dependency
43
+ name: fastercsv
44
+ version_requirement:
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 1.2.0
50
+ version:
51
+ - !ruby/object:Gem::Dependency
52
+ name: adapter_extensions
53
+ version_requirement:
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: 0.1.0
59
+ version:
60
+ description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
61
+ email: anthonyeden@gmail.com
62
+ executables:
63
+ - etl
64
+ extensions: []
65
+
66
+ extra_rdoc_files: []
67
+
68
+ files:
69
+ - CHANGELOG
70
+ - LICENSE
71
+ - README
72
+ - TODO
73
+ - Rakefile
74
+ - bin/etl
75
+ - bin/etl.cmd
76
+ - lib/etl
77
+ - lib/etl.rb
78
+ - lib/etl/batch
79
+ - lib/etl/batch.rb
80
+ - lib/etl/builder
81
+ - lib/etl/builder.rb
82
+ - lib/etl/commands
83
+ - lib/etl/control
84
+ - lib/etl/control.rb
85
+ - lib/etl/core_ext
86
+ - lib/etl/core_ext.rb
87
+ - lib/etl/engine.rb
88
+ - lib/etl/execution
89
+ - lib/etl/execution.rb
90
+ - lib/etl/generator
91
+ - lib/etl/generator.rb
92
+ - lib/etl/http_tools.rb
93
+ - lib/etl/parser
94
+ - lib/etl/parser.rb
95
+ - lib/etl/processor
96
+ - lib/etl/processor.rb
97
+ - lib/etl/row.rb
98
+ - lib/etl/screen
99
+ - lib/etl/screen.rb
100
+ - lib/etl/transform
101
+ - lib/etl/transform.rb
102
+ - lib/etl/util.rb
103
+ - lib/etl/version.rb
104
+ - lib/etl/batch/batch.rb
105
+ - lib/etl/batch/directives.rb
106
+ - lib/etl/builder/date_dimension_builder.rb
107
+ - lib/etl/builder/time_dimension_builder.rb
108
+ - lib/etl/commands/etl.rb
109
+ - lib/etl/control/control.rb
110
+ - lib/etl/control/destination
111
+ - lib/etl/control/destination.rb
112
+ - lib/etl/control/source
113
+ - lib/etl/control/source.rb
114
+ - lib/etl/control/destination/database_destination.rb
115
+ - lib/etl/control/destination/file_destination.rb
116
+ - lib/etl/control/source/database_source.rb
117
+ - lib/etl/control/source/enumerable_source.rb
118
+ - lib/etl/control/source/file_source.rb
119
+ - lib/etl/control/source/model_source.rb
120
+ - lib/etl/core_ext/time
121
+ - lib/etl/core_ext/time.rb
122
+ - lib/etl/core_ext/time/calculations.rb
123
+ - lib/etl/execution/base.rb
124
+ - lib/etl/execution/batch.rb
125
+ - lib/etl/execution/job.rb
126
+ - lib/etl/execution/migration.rb
127
+ - lib/etl/execution/record.rb
128
+ - lib/etl/generator/generator.rb
129
+ - lib/etl/generator/surrogate_key_generator.rb
130
+ - lib/etl/parser/apache_combined_log_parser.rb
131
+ - lib/etl/parser/delimited_parser.rb
132
+ - lib/etl/parser/fixed_width_parser.rb
133
+ - lib/etl/parser/parser.rb
134
+ - lib/etl/parser/sax_parser.rb
135
+ - lib/etl/parser/xml_parser.rb
136
+ - lib/etl/processor/block_processor.rb
137
+ - lib/etl/processor/bulk_import_processor.rb
138
+ - lib/etl/processor/check_exist_processor.rb
139
+ - lib/etl/processor/check_unique_processor.rb
140
+ - lib/etl/processor/copy_field_processor.rb
141
+ - lib/etl/processor/encode_processor.rb
142
+ - lib/etl/processor/hierarchy_exploder_processor.rb
143
+ - lib/etl/processor/print_row_processor.rb
144
+ - lib/etl/processor/processor.rb
145
+ - lib/etl/processor/rename_processor.rb
146
+ - lib/etl/processor/require_non_blank_processor.rb
147
+ - lib/etl/processor/row_processor.rb
148
+ - lib/etl/processor/sequence_processor.rb
149
+ - lib/etl/processor/surrogate_key_processor.rb
150
+ - lib/etl/processor/truncate_processor.rb
151
+ - lib/etl/screen/row_count_screen.rb
152
+ - lib/etl/transform/block_transform.rb
153
+ - lib/etl/transform/date_to_string_transform.rb
154
+ - lib/etl/transform/decode_transform.rb
155
+ - lib/etl/transform/default_transform.rb
156
+ - lib/etl/transform/foreign_key_lookup_transform.rb
157
+ - lib/etl/transform/hierarchy_lookup_transform.rb
158
+ - lib/etl/transform/ordinalize_transform.rb
159
+ - lib/etl/transform/sha1_transform.rb
160
+ - lib/etl/transform/string_to_date_transform.rb
161
+ - lib/etl/transform/string_to_datetime_transform.rb
162
+ - lib/etl/transform/string_to_time_transform.rb
163
+ - lib/etl/transform/transform.rb
164
+ - lib/etl/transform/trim_transform.rb
165
+ - lib/etl/transform/type_transform.rb
166
+ - examples/database.example.yml
167
+ has_rdoc: false
168
+ homepage: http://activewarehouse.rubyforge.org/etl
169
+ post_install_message:
170
+ rdoc_options:
171
+ - --exclude
172
+ - .
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: "0"
180
+ version:
181
+ required_rubygems_version: !ruby/object:Gem::Requirement
182
+ requirements:
183
+ - - ">="
184
+ - !ruby/object:Gem::Version
185
+ version: "0"
186
+ version:
187
+ requirements: []
188
+
189
+ rubyforge_project: activewarehouse
190
+ rubygems_version: 1.2.0
191
+ signing_key:
192
+ specification_version: 2
193
+ summary: Pure Ruby ETL package.
194
+ test_files: []
195
+