activewarehouse-etl 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +22 -2
- data/README +12 -0
- data/Rakefile +64 -59
- data/bin/etl +0 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +11 -1
- data/lib/etl.rb +9 -21
- data/lib/etl/builder.rb +2 -1
- data/lib/etl/builder/date_dimension_builder.rb +67 -54
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +1 -2
- data/lib/etl/control/control.rb +46 -18
- data/lib/etl/control/destination.rb +201 -138
- data/lib/etl/control/destination/database_destination.rb +10 -5
- data/lib/etl/control/source.rb +1 -1
- data/lib/etl/control/source/database_source.rb +8 -10
- data/lib/etl/core_ext/time/calculations.rb +4 -2
- data/lib/etl/engine.rb +35 -10
- data/lib/etl/execution/migration.rb +21 -9
- data/lib/etl/generator/generator.rb +1 -1
- data/lib/etl/http_tools.rb +21 -7
- data/lib/etl/parser/apache_combined_log_parser.rb +3 -1
- data/lib/etl/parser/delimited_parser.rb +1 -1
- data/lib/etl/parser/parser.rb +1 -1
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +5 -1
- data/lib/etl/processor/check_exist_processor.rb +1 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +67 -2
- data/lib/etl/transform/string_to_date_transform.rb +6 -1
- data/lib/etl/transform/string_to_datetime_transform.rb +1 -1
- data/lib/etl/transform/string_to_time_transform.rb +1 -1
- data/lib/etl/version.rb +1 -1
- metadata +94 -78
@@ -5,6 +5,9 @@ module ETL #:nodoc:
|
|
5
5
|
# The resolver to use if the foreign key is not found in the collection
|
6
6
|
attr_accessor :resolver
|
7
7
|
|
8
|
+
# The default foreign key to use if none is found.
|
9
|
+
attr_accessor :default
|
10
|
+
|
8
11
|
# Initialize the foreign key lookup transform.
|
9
12
|
#
|
10
13
|
# Configuration options:
|
@@ -12,12 +15,21 @@ module ETL #:nodoc:
|
|
12
15
|
# an empty Hash will be used. This Hash will be used to cache values that have been resolved already
|
13
16
|
# for future use.
|
14
17
|
# *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
|
18
|
+
# *<tt>:default</tt>: A default foreign key to use if no foreign key is found
|
15
19
|
def initialize(control, name, configuration={})
|
16
20
|
super
|
17
21
|
|
18
22
|
@collection = (configuration[:collection] || {})
|
19
23
|
@resolver = configuration[:resolver]
|
20
24
|
@resolver = @resolver.new if @resolver.is_a?(Class)
|
25
|
+
@default = configuration[:default]
|
26
|
+
if configuration[:cache] ||= true
|
27
|
+
if resolver.respond_to?(:load_cache)
|
28
|
+
resolver.load_cache
|
29
|
+
else
|
30
|
+
ETL::Engine.logger.info "#{resolver.class.name} does not support caching"
|
31
|
+
end
|
32
|
+
end
|
21
33
|
end
|
22
34
|
|
23
35
|
# Transform the value by resolving it to a foreign key
|
@@ -27,7 +39,8 @@ module ETL #:nodoc:
|
|
27
39
|
raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
|
28
40
|
raise ResolverError, "Resolver does not appear to respond to resolve method" unless resolver.respond_to?(:resolve)
|
29
41
|
fk = resolver.resolve(value)
|
30
|
-
|
42
|
+
fk ||= @default
|
43
|
+
raise ResolverError, "Unable to resolve #{value} to foreign key for #{name} in row #{ETL::Engine.rows_read}. You may want to specify a :default value." unless fk
|
31
44
|
@collection[value] = fk
|
32
45
|
end
|
33
46
|
fk
|
@@ -81,6 +94,58 @@ class SQLResolver
|
|
81
94
|
@connection ||= ActiveRecord::Base.connection
|
82
95
|
end
|
83
96
|
def resolve(value)
|
84
|
-
|
97
|
+
if @use_cache
|
98
|
+
cache[value]
|
99
|
+
else
|
100
|
+
q = "SELECT id FROM #{table_name} WHERE #{@field} = #{@connection.quote(value)}"
|
101
|
+
ETL::Engine.logger.debug("Executing query: #{q}")
|
102
|
+
@connection.select_value(q)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
def table_name
|
106
|
+
ETL::Engine.table(@table, @connection)
|
107
|
+
end
|
108
|
+
def cache
|
109
|
+
@cache ||= {}
|
110
|
+
end
|
111
|
+
def load_cache
|
112
|
+
@use_cache = true
|
113
|
+
q = "SELECT id, #{@field} FROM #{table_name}"
|
114
|
+
@connection.select_all(q).each do |record|
|
115
|
+
cache[record[@field]] = record['id']
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
class FlatFileResolver
|
121
|
+
# Initialize the flat file resolver. Expects to open a comma-delimited file.
|
122
|
+
# Returns the column with the given result_field_index.
|
123
|
+
#
|
124
|
+
# The matches argument is a Hash with the key as the column index to search and
|
125
|
+
# the value of the Hash as a String to match exactly. It will only match the first
|
126
|
+
# result.
|
127
|
+
def initialize(file, match_index, result_field_index)
|
128
|
+
@file = file
|
129
|
+
@match_index = match_index
|
130
|
+
@result_field_index = result_field_index
|
131
|
+
end
|
132
|
+
|
133
|
+
# Get the rows from the file specified in the initializer.
|
134
|
+
def rows
|
135
|
+
@rows ||= FasterCSV.read(@file)
|
136
|
+
end
|
137
|
+
protected :rows
|
138
|
+
|
139
|
+
# Match the row field from the column indicated by the match_index with the given
|
140
|
+
# value and return the field value from the column identified by the result_field_index.
|
141
|
+
def resolve(value)
|
142
|
+
rows.each do |row|
|
143
|
+
#puts "checking #{row.inspect} for #{value}"
|
144
|
+
if row[@match_index] == value
|
145
|
+
#puts "match found!, returning #{row[@result_field_index]}"
|
146
|
+
return row[@result_field_index]
|
147
|
+
end
|
148
|
+
end
|
149
|
+
nil
|
85
150
|
end
|
86
151
|
end
|
@@ -4,7 +4,12 @@ module ETL #:nodoc:
|
|
4
4
|
class StringToDateTransform < ETL::Transform::Transform
|
5
5
|
# Transform the value using Date.parse
|
6
6
|
def transform(name, value, row)
|
7
|
-
|
7
|
+
return value if value.nil?
|
8
|
+
begin
|
9
|
+
Date.parse(value)
|
10
|
+
rescue => e
|
11
|
+
return value
|
12
|
+
end
|
8
13
|
end
|
9
14
|
end
|
10
15
|
end
|
@@ -7,7 +7,7 @@ module ETL #:nodoc:
|
|
7
7
|
# WARNING: This transform is slow (due to the Ruby implementation), but if you need to
|
8
8
|
# parse timestamps before or after the values supported by the Time.parse.
|
9
9
|
def transform(name, value, row)
|
10
|
-
DateTime.parse(value)
|
10
|
+
DateTime.parse(value) unless value.nil?
|
11
11
|
end
|
12
12
|
end
|
13
13
|
end
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -1,33 +1,75 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.2
|
3
|
-
specification_version: 1
|
4
2
|
name: activewarehouse-etl
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.0
|
7
|
-
date: 2007-08-09 00:00:00 -04:00
|
8
|
-
summary: Pure Ruby ETL package.
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: anthonyeden@gmail.com
|
12
|
-
homepage: http://activewarehouse.rubyforge.org/etl
|
13
|
-
rubyforge_project: activewarehouse
|
14
|
-
description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
|
15
|
-
autorequire:
|
16
|
-
default_executable: etl
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: false
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.9.1
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Anthony Eden
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-01-14 00:00:00 -05:00
|
13
|
+
default_executable: etl
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rake
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.7.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activesupport
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.3.1
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: activerecord
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.14.4
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: fastercsv
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.2.0
|
54
|
+
version:
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: adapter_extensions
|
57
|
+
type: :runtime
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 0.1.0
|
64
|
+
version:
|
65
|
+
description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
|
66
|
+
email: anthonyeden@gmail.com
|
67
|
+
executables:
|
68
|
+
- etl
|
69
|
+
extensions: []
|
70
|
+
|
71
|
+
extra_rdoc_files: []
|
72
|
+
|
31
73
|
files:
|
32
74
|
- CHANGELOG
|
33
75
|
- LICENSE
|
@@ -35,6 +77,7 @@ files:
|
|
35
77
|
- TODO
|
36
78
|
- Rakefile
|
37
79
|
- bin/etl
|
80
|
+
- bin/etl.cmd
|
38
81
|
- lib/etl
|
39
82
|
- lib/etl/batch
|
40
83
|
- lib/etl/batch/batch.rb
|
@@ -42,6 +85,7 @@ files:
|
|
42
85
|
- lib/etl/batch.rb
|
43
86
|
- lib/etl/builder
|
44
87
|
- lib/etl/builder/date_dimension_builder.rb
|
88
|
+
- lib/etl/builder/time_dimension_builder.rb
|
45
89
|
- lib/etl/builder.rb
|
46
90
|
- lib/etl/commands
|
47
91
|
- lib/etl/commands/etl.rb
|
@@ -85,10 +129,12 @@ files:
|
|
85
129
|
- lib/etl/parser/xml_parser.rb
|
86
130
|
- lib/etl/parser.rb
|
87
131
|
- lib/etl/processor
|
132
|
+
- lib/etl/processor/block_processor.rb
|
88
133
|
- lib/etl/processor/bulk_import_processor.rb
|
89
134
|
- lib/etl/processor/check_exist_processor.rb
|
90
135
|
- lib/etl/processor/check_unique_processor.rb
|
91
136
|
- lib/etl/processor/copy_field_processor.rb
|
137
|
+
- lib/etl/processor/encode_processor.rb
|
92
138
|
- lib/etl/processor/hierarchy_exploder_processor.rb
|
93
139
|
- lib/etl/processor/print_row_processor.rb
|
94
140
|
- lib/etl/processor/processor.rb
|
@@ -123,62 +169,32 @@ files:
|
|
123
169
|
- lib/etl/version.rb
|
124
170
|
- lib/etl.rb
|
125
171
|
- examples/database.example.yml
|
126
|
-
|
127
|
-
|
172
|
+
has_rdoc: false
|
173
|
+
homepage: http://activewarehouse.rubyforge.org/etl
|
174
|
+
post_install_message:
|
128
175
|
rdoc_options:
|
129
176
|
- --exclude
|
130
177
|
- .
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
178
|
+
require_paths:
|
179
|
+
- lib
|
180
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
181
|
+
requirements:
|
182
|
+
- - ">="
|
183
|
+
- !ruby/object:Gem::Version
|
184
|
+
version: "0"
|
185
|
+
version:
|
186
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
|
+
requirements:
|
188
|
+
- - ">="
|
189
|
+
- !ruby/object:Gem::Version
|
190
|
+
version: "0"
|
191
|
+
version:
|
137
192
|
requirements: []
|
138
193
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
- !ruby/object:Gem::Version
|
147
|
-
version: 0.7.1
|
148
|
-
version:
|
149
|
-
- !ruby/object:Gem::Dependency
|
150
|
-
name: activesupport
|
151
|
-
version_requirement:
|
152
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
153
|
-
requirements:
|
154
|
-
- - ">="
|
155
|
-
- !ruby/object:Gem::Version
|
156
|
-
version: 1.3.1
|
157
|
-
version:
|
158
|
-
- !ruby/object:Gem::Dependency
|
159
|
-
name: activerecord
|
160
|
-
version_requirement:
|
161
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
162
|
-
requirements:
|
163
|
-
- - ">="
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: 1.14.4
|
166
|
-
version:
|
167
|
-
- !ruby/object:Gem::Dependency
|
168
|
-
name: fastercsv
|
169
|
-
version_requirement:
|
170
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
171
|
-
requirements:
|
172
|
-
- - ">="
|
173
|
-
- !ruby/object:Gem::Version
|
174
|
-
version: 1.2.0
|
175
|
-
version:
|
176
|
-
- !ruby/object:Gem::Dependency
|
177
|
-
name: adapter_extensions
|
178
|
-
version_requirement:
|
179
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
180
|
-
requirements:
|
181
|
-
- - ">="
|
182
|
-
- !ruby/object:Gem::Version
|
183
|
-
version: 0.1.0
|
184
|
-
version:
|
194
|
+
rubyforge_project: activewarehouse
|
195
|
+
rubygems_version: 1.3.1
|
196
|
+
signing_key:
|
197
|
+
specification_version: 2
|
198
|
+
summary: Pure Ruby ETL package.
|
199
|
+
test_files: []
|
200
|
+
|