activewarehouse-etl 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +22 -2
- data/README +12 -0
- data/Rakefile +64 -59
- data/bin/etl +0 -0
- data/bin/etl.cmd +8 -0
- data/examples/database.example.yml +11 -1
- data/lib/etl.rb +9 -21
- data/lib/etl/builder.rb +2 -1
- data/lib/etl/builder/date_dimension_builder.rb +67 -54
- data/lib/etl/builder/time_dimension_builder.rb +31 -0
- data/lib/etl/commands/etl.rb +1 -2
- data/lib/etl/control/control.rb +46 -18
- data/lib/etl/control/destination.rb +201 -138
- data/lib/etl/control/destination/database_destination.rb +10 -5
- data/lib/etl/control/source.rb +1 -1
- data/lib/etl/control/source/database_source.rb +8 -10
- data/lib/etl/core_ext/time/calculations.rb +4 -2
- data/lib/etl/engine.rb +35 -10
- data/lib/etl/execution/migration.rb +21 -9
- data/lib/etl/generator/generator.rb +1 -1
- data/lib/etl/http_tools.rb +21 -7
- data/lib/etl/parser/apache_combined_log_parser.rb +3 -1
- data/lib/etl/parser/delimited_parser.rb +1 -1
- data/lib/etl/parser/parser.rb +1 -1
- data/lib/etl/processor/block_processor.rb +14 -0
- data/lib/etl/processor/bulk_import_processor.rb +5 -1
- data/lib/etl/processor/check_exist_processor.rb +1 -0
- data/lib/etl/processor/encode_processor.rb +55 -0
- data/lib/etl/transform/date_to_string_transform.rb +1 -0
- data/lib/etl/transform/foreign_key_lookup_transform.rb +67 -2
- data/lib/etl/transform/string_to_date_transform.rb +6 -1
- data/lib/etl/transform/string_to_datetime_transform.rb +1 -1
- data/lib/etl/transform/string_to_time_transform.rb +1 -1
- data/lib/etl/version.rb +1 -1
- metadata +94 -78
@@ -5,6 +5,9 @@ module ETL #:nodoc:
|
|
5
5
|
# The resolver to use if the foreign key is not found in the collection
|
6
6
|
attr_accessor :resolver
|
7
7
|
|
8
|
+
# The default foreign key to use if none is found.
|
9
|
+
attr_accessor :default
|
10
|
+
|
8
11
|
# Initialize the foreign key lookup transform.
|
9
12
|
#
|
10
13
|
# Configuration options:
|
@@ -12,12 +15,21 @@ module ETL #:nodoc:
|
|
12
15
|
# an empty Hash will be used. This Hash will be used to cache values that have been resolved already
|
13
16
|
# for future use.
|
14
17
|
# *<tt>:resolver</tt>: Object or Class which implements the method resolve(value)
|
18
|
+
# *<tt>:default</tt>: A default foreign key to use if no foreign key is found
|
15
19
|
def initialize(control, name, configuration={})
|
16
20
|
super
|
17
21
|
|
18
22
|
@collection = (configuration[:collection] || {})
|
19
23
|
@resolver = configuration[:resolver]
|
20
24
|
@resolver = @resolver.new if @resolver.is_a?(Class)
|
25
|
+
@default = configuration[:default]
|
26
|
+
if configuration[:cache] ||= true
|
27
|
+
if resolver.respond_to?(:load_cache)
|
28
|
+
resolver.load_cache
|
29
|
+
else
|
30
|
+
ETL::Engine.logger.info "#{resolver.class.name} does not support caching"
|
31
|
+
end
|
32
|
+
end
|
21
33
|
end
|
22
34
|
|
23
35
|
# Transform the value by resolving it to a foreign key
|
@@ -27,7 +39,8 @@ module ETL #:nodoc:
|
|
27
39
|
raise ResolverError, "Foreign key for #{value} not found and no resolver specified" unless resolver
|
28
40
|
raise ResolverError, "Resolver does not appear to respond to resolve method" unless resolver.respond_to?(:resolve)
|
29
41
|
fk = resolver.resolve(value)
|
30
|
-
|
42
|
+
fk ||= @default
|
43
|
+
raise ResolverError, "Unable to resolve #{value} to foreign key for #{name} in row #{ETL::Engine.rows_read}. You may want to specify a :default value." unless fk
|
31
44
|
@collection[value] = fk
|
32
45
|
end
|
33
46
|
fk
|
@@ -81,6 +94,58 @@ class SQLResolver
|
|
81
94
|
@connection ||= ActiveRecord::Base.connection
|
82
95
|
end
|
83
96
|
def resolve(value)
|
84
|
-
|
97
|
+
if @use_cache
|
98
|
+
cache[value]
|
99
|
+
else
|
100
|
+
q = "SELECT id FROM #{table_name} WHERE #{@field} = #{@connection.quote(value)}"
|
101
|
+
ETL::Engine.logger.debug("Executing query: #{q}")
|
102
|
+
@connection.select_value(q)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
def table_name
|
106
|
+
ETL::Engine.table(@table, @connection)
|
107
|
+
end
|
108
|
+
def cache
|
109
|
+
@cache ||= {}
|
110
|
+
end
|
111
|
+
def load_cache
|
112
|
+
@use_cache = true
|
113
|
+
q = "SELECT id, #{@field} FROM #{table_name}"
|
114
|
+
@connection.select_all(q).each do |record|
|
115
|
+
cache[record[@field]] = record['id']
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
class FlatFileResolver
|
121
|
+
# Initialize the flat file resolver. Expects to open a comma-delimited file.
|
122
|
+
# Returns the column with the given result_field_index.
|
123
|
+
#
|
124
|
+
# The match_index argument is the index of the column to search; a row matches when
|
125
|
+
# that column equals the value passed to resolve exactly. Only the first matching row
|
126
|
+
# is used.
|
127
|
+
def initialize(file, match_index, result_field_index)
|
128
|
+
@file = file
|
129
|
+
@match_index = match_index
|
130
|
+
@result_field_index = result_field_index
|
131
|
+
end
|
132
|
+
|
133
|
+
# Get the rows from the file specified in the initializer.
|
134
|
+
def rows
|
135
|
+
@rows ||= FasterCSV.read(@file)
|
136
|
+
end
|
137
|
+
protected :rows
|
138
|
+
|
139
|
+
# Match the row field from the column indicated by the match_index with the given
|
140
|
+
# value and return the field value from the column identified by the result_field_index.
|
141
|
+
def resolve(value)
|
142
|
+
rows.each do |row|
|
143
|
+
#puts "checking #{row.inspect} for #{value}"
|
144
|
+
if row[@match_index] == value
|
145
|
+
#puts "match found!, returning #{row[@result_field_index]}"
|
146
|
+
return row[@result_field_index]
|
147
|
+
end
|
148
|
+
end
|
149
|
+
nil
|
85
150
|
end
|
86
151
|
end
|
@@ -4,7 +4,12 @@ module ETL #:nodoc:
|
|
4
4
|
class StringToDateTransform < ETL::Transform::Transform
|
5
5
|
# Transform the value using Date.parse
|
6
6
|
def transform(name, value, row)
|
7
|
-
|
7
|
+
return value if value.nil?
|
8
|
+
begin
|
9
|
+
Date.parse(value)
|
10
|
+
rescue => e
|
11
|
+
return value
|
12
|
+
end
|
8
13
|
end
|
9
14
|
end
|
10
15
|
end
|
@@ -7,7 +7,7 @@ module ETL #:nodoc:
|
|
7
7
|
# WARNING: This transform is slow (due to the Ruby implementation), but if you need to
|
8
8
|
# parse timestamps before or after the values supported by Time.parse, use this transform.
|
9
9
|
def transform(name, value, row)
|
10
|
-
DateTime.parse(value)
|
10
|
+
DateTime.parse(value) unless value.nil?
|
11
11
|
end
|
12
12
|
end
|
13
13
|
end
|
data/lib/etl/version.rb
CHANGED
metadata
CHANGED
@@ -1,33 +1,75 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.2
|
3
|
-
specification_version: 1
|
4
2
|
name: activewarehouse-etl
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.0
|
7
|
-
date: 2007-08-09 00:00:00 -04:00
|
8
|
-
summary: Pure Ruby ETL package.
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: anthonyeden@gmail.com
|
12
|
-
homepage: http://activewarehouse.rubyforge.org/etl
|
13
|
-
rubyforge_project: activewarehouse
|
14
|
-
description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
|
15
|
-
autorequire:
|
16
|
-
default_executable: etl
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: false
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.9.1
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Anthony Eden
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-01-14 00:00:00 -05:00
|
13
|
+
default_executable: etl
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rake
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.7.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: activesupport
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.3.1
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: activerecord
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.14.4
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: fastercsv
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.2.0
|
54
|
+
version:
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: adapter_extensions
|
57
|
+
type: :runtime
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 0.1.0
|
64
|
+
version:
|
65
|
+
description: ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
|
66
|
+
email: anthonyeden@gmail.com
|
67
|
+
executables:
|
68
|
+
- etl
|
69
|
+
extensions: []
|
70
|
+
|
71
|
+
extra_rdoc_files: []
|
72
|
+
|
31
73
|
files:
|
32
74
|
- CHANGELOG
|
33
75
|
- LICENSE
|
@@ -35,6 +77,7 @@ files:
|
|
35
77
|
- TODO
|
36
78
|
- Rakefile
|
37
79
|
- bin/etl
|
80
|
+
- bin/etl.cmd
|
38
81
|
- lib/etl
|
39
82
|
- lib/etl/batch
|
40
83
|
- lib/etl/batch/batch.rb
|
@@ -42,6 +85,7 @@ files:
|
|
42
85
|
- lib/etl/batch.rb
|
43
86
|
- lib/etl/builder
|
44
87
|
- lib/etl/builder/date_dimension_builder.rb
|
88
|
+
- lib/etl/builder/time_dimension_builder.rb
|
45
89
|
- lib/etl/builder.rb
|
46
90
|
- lib/etl/commands
|
47
91
|
- lib/etl/commands/etl.rb
|
@@ -85,10 +129,12 @@ files:
|
|
85
129
|
- lib/etl/parser/xml_parser.rb
|
86
130
|
- lib/etl/parser.rb
|
87
131
|
- lib/etl/processor
|
132
|
+
- lib/etl/processor/block_processor.rb
|
88
133
|
- lib/etl/processor/bulk_import_processor.rb
|
89
134
|
- lib/etl/processor/check_exist_processor.rb
|
90
135
|
- lib/etl/processor/check_unique_processor.rb
|
91
136
|
- lib/etl/processor/copy_field_processor.rb
|
137
|
+
- lib/etl/processor/encode_processor.rb
|
92
138
|
- lib/etl/processor/hierarchy_exploder_processor.rb
|
93
139
|
- lib/etl/processor/print_row_processor.rb
|
94
140
|
- lib/etl/processor/processor.rb
|
@@ -123,62 +169,32 @@ files:
|
|
123
169
|
- lib/etl/version.rb
|
124
170
|
- lib/etl.rb
|
125
171
|
- examples/database.example.yml
|
126
|
-
|
127
|
-
|
172
|
+
has_rdoc: false
|
173
|
+
homepage: http://activewarehouse.rubyforge.org/etl
|
174
|
+
post_install_message:
|
128
175
|
rdoc_options:
|
129
176
|
- --exclude
|
130
177
|
- .
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
178
|
+
require_paths:
|
179
|
+
- lib
|
180
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
181
|
+
requirements:
|
182
|
+
- - ">="
|
183
|
+
- !ruby/object:Gem::Version
|
184
|
+
version: "0"
|
185
|
+
version:
|
186
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
|
+
requirements:
|
188
|
+
- - ">="
|
189
|
+
- !ruby/object:Gem::Version
|
190
|
+
version: "0"
|
191
|
+
version:
|
137
192
|
requirements: []
|
138
193
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
- !ruby/object:Gem::Version
|
147
|
-
version: 0.7.1
|
148
|
-
version:
|
149
|
-
- !ruby/object:Gem::Dependency
|
150
|
-
name: activesupport
|
151
|
-
version_requirement:
|
152
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
153
|
-
requirements:
|
154
|
-
- - ">="
|
155
|
-
- !ruby/object:Gem::Version
|
156
|
-
version: 1.3.1
|
157
|
-
version:
|
158
|
-
- !ruby/object:Gem::Dependency
|
159
|
-
name: activerecord
|
160
|
-
version_requirement:
|
161
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
162
|
-
requirements:
|
163
|
-
- - ">="
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: 1.14.4
|
166
|
-
version:
|
167
|
-
- !ruby/object:Gem::Dependency
|
168
|
-
name: fastercsv
|
169
|
-
version_requirement:
|
170
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
171
|
-
requirements:
|
172
|
-
- - ">="
|
173
|
-
- !ruby/object:Gem::Version
|
174
|
-
version: 1.2.0
|
175
|
-
version:
|
176
|
-
- !ruby/object:Gem::Dependency
|
177
|
-
name: adapter_extensions
|
178
|
-
version_requirement:
|
179
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
180
|
-
requirements:
|
181
|
-
- - ">="
|
182
|
-
- !ruby/object:Gem::Version
|
183
|
-
version: 0.1.0
|
184
|
-
version:
|
194
|
+
rubyforge_project: activewarehouse
|
195
|
+
rubygems_version: 1.3.1
|
196
|
+
signing_key:
|
197
|
+
specification_version: 2
|
198
|
+
summary: Pure Ruby ETL package.
|
199
|
+
test_files: []
|
200
|
+
|