dataduck 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/DEV_README.md +17 -0
- data/README.md +1 -43
- data/lib/dataduck/commands.rb +5 -4
- data/lib/dataduck/destination.rb +2 -10
- data/lib/dataduck/etl.rb +6 -11
- data/lib/dataduck/redshift_destination.rb +8 -18
- data/lib/dataduck/source.rb +3 -0
- data/lib/dataduck/table.rb +29 -9
- data/lib/dataduck/version.rb +1 -1
- data/lib/templates/quickstart/table.rb.erb +2 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4dabe01cff2c6455751ab08c520d4bfaee62139
|
4
|
+
data.tar.gz: d20ef216bc631c445daad0767a51788b42b7f90f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2eacaf08c612c25ae8bf9b1b1d46d4a0312fe0024211d0a8306faa5a810b972a5c2aa8386c4b05b04a26d73093bcae5a89d72bcadef98f6ed7e062054d40410
|
7
|
+
data.tar.gz: 2c4c1aec2a0257ad3dcc4e9559436c39de0f747a6ec3fdb816afb7d096678d7c1f608269b6c8dc55d1f1aeac514153bdbec7dbb7d3c082699a89f28e16577b22
|
data/DEV_README.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Helpful things to remember when developing
|
2
|
+
|
3
|
+
## Publishing to Rubygems
|
4
|
+
|
5
|
+
Ensure the version number is updated (lib/dataduck/version.rb).
|
6
|
+
|
7
|
+
rspec
|
8
|
+
|
9
|
+
gem build dataduck.gemspec
|
10
|
+
|
11
|
+
gem push dataduck-VERSION.gem
|
12
|
+
|
13
|
+
## Requiring the local version of the Gem
|
14
|
+
|
15
|
+
Use something like this:
|
16
|
+
|
17
|
+
gem 'dataduck', '0.3.0', path: '/Users/jrp/projects/dataduck'
|
data/README.md
CHANGED
@@ -12,10 +12,6 @@ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon
|
|
12
12
|
|
13
13
|
## Installation
|
14
14
|
|
15
|
-
##### Example project
|
16
|
-
|
17
|
-
See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
|
18
|
-
|
19
15
|
##### Instructions for using DataDuck ETL
|
20
16
|
|
21
17
|
Create a new, empty directory. Inside this directory, create a file named Gemfile, and add the following to it:
|
@@ -40,45 +36,7 @@ If you'd like to run this regularly, such as every night, it's recommended to us
|
|
40
36
|
|
41
37
|
## Documentation
|
42
38
|
|
43
|
-
|
44
|
-
|
45
|
-
```ruby
|
46
|
-
class Decks < DataDuck::Table
|
47
|
-
source :my_database, ["id", "name", "user_id", "cards",
|
48
|
-
"num_wins", "num_losses", "created_at", "updated_at",
|
49
|
-
"is_drafted", "num_draft_wins", "num_draft_losses"]
|
50
|
-
|
51
|
-
transforms :calculate_num_totals
|
52
|
-
|
53
|
-
validates :validates_num_total
|
54
|
-
|
55
|
-
output({
|
56
|
-
:id => :integer,
|
57
|
-
:name => :string,
|
58
|
-
:user_id => :integer,
|
59
|
-
:num_wins => :integer,
|
60
|
-
:num_losses => :integer,
|
61
|
-
:num_total => :integer,
|
62
|
-
:num_draft_total => :integer,
|
63
|
-
:created_at => :datetime,
|
64
|
-
:updated_at => :datetime,
|
65
|
-
:is_drafted => :boolean,
|
66
|
-
# Note that num_draft_wins and num_draft_losses
|
67
|
-
# are not included in the output, but are used in
|
68
|
-
# the transformation.
|
69
|
-
})
|
70
|
-
|
71
|
-
def calculate_num_totals(row)
|
72
|
-
row[:num_total] = row[:num_wins] + row[:num_losses]
|
73
|
-
row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
|
74
|
-
row
|
75
|
-
end
|
76
|
-
|
77
|
-
def validates_num_total(row)
|
78
|
-
return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
|
79
|
-
end
|
80
|
-
end
|
81
|
-
```
|
39
|
+
Visit the [docs page](http://dataducketl.com/docs/overview/welcome) to read the documentation. The docs page is autogenerated from the files in this project's docs directory.
|
82
40
|
|
83
41
|
## Contributing
|
84
42
|
|
data/lib/dataduck/commands.rb
CHANGED
@@ -60,8 +60,7 @@ module DataDuck
|
|
60
60
|
|
61
61
|
def self.quickstart
|
62
62
|
puts "Welcome to DataDuck!"
|
63
|
-
puts "This quickstart wizard will
|
64
|
-
|
63
|
+
puts "This quickstart wizard will help you set up DataDuck."
|
65
64
|
|
66
65
|
puts "What kind of database would you like to source from?"
|
67
66
|
db_type = prompt_choices([
|
@@ -115,7 +114,7 @@ module DataDuck
|
|
115
114
|
config_obj = {
|
116
115
|
'sources' => {
|
117
116
|
'my_database' => {
|
118
|
-
'type' =>
|
117
|
+
'type' => db_type.to_s,
|
119
118
|
'host' => source_host,
|
120
119
|
'database' => source_database,
|
121
120
|
'port' => source_port,
|
@@ -170,9 +169,11 @@ module DataDuck
|
|
170
169
|
columns << [property_name.to_s, property_type.to_s, commented_out]
|
171
170
|
end
|
172
171
|
|
172
|
+
columns.sort! { |a, b| a[0] <=> b[0] }
|
173
|
+
|
173
174
|
table_name = table_name.to_s.downcase
|
174
175
|
table_name_camelcased = table_name.split('_').collect(&:capitalize).join
|
175
|
-
namespace = Namespace.new(
|
176
|
+
namespace = Namespace.new(table_name_camelcased: table_name_camelcased, table_name: table_name, columns: columns)
|
176
177
|
template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb", 'r').read
|
177
178
|
result = ERB.new(template).result(namespace.get_binding)
|
178
179
|
DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
|
data/lib/dataduck/destination.rb
CHANGED
@@ -8,16 +8,8 @@ module DataDuck
|
|
8
8
|
DataDuck.config['destinations'][name.to_s]
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
raise Exception.new("Must implement
|
13
|
-
end
|
14
|
-
|
15
|
-
def before_all_loads!
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
def after_all_loads!
|
20
|
-
# e.g. cleanup
|
11
|
+
def load_table!(table)
|
12
|
+
raise Exception.new("Must implement load_table! in subclass")
|
21
13
|
end
|
22
14
|
|
23
15
|
def self.destination(destination_name)
|
data/lib/dataduck/etl.rb
CHANGED
@@ -31,18 +31,13 @@ module DataDuck
|
|
31
31
|
def process!
|
32
32
|
puts "Processing ETL..."
|
33
33
|
|
34
|
-
table_instances = []
|
35
34
|
@tables.each do |table_class|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
self.class.destinations.each do |destination|
|
43
|
-
destination.before_all_loads!(table_instances)
|
44
|
-
destination.load_tables!(table_instances)
|
45
|
-
destination.after_all_loads!(table_instances)
|
35
|
+
table_to_etl = table_class.new
|
36
|
+
table_to_etl.extract!
|
37
|
+
table_to_etl.transform!
|
38
|
+
self.class.destinations.each do |destination|
|
39
|
+
destination.load_table!(table_to_etl)
|
40
|
+
end
|
46
41
|
end
|
47
42
|
end
|
48
43
|
end
|
@@ -144,24 +144,14 @@ module DataDuck
|
|
144
144
|
return s3_obj
|
145
145
|
end
|
146
146
|
|
147
|
-
def
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
self.create_staging_table!(table)
|
156
|
-
self.create_output_table_on_data_warehouse!(table)
|
157
|
-
self.run_query(self.copy_query(table, s3_object.s3_path))
|
158
|
-
self.merge_from_staging!(table)
|
159
|
-
self.drop_staging_table!(table)
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
def after_all_loads!(tables)
|
164
|
-
|
147
|
+
def load_table!(table)
|
148
|
+
puts "Loading table #{ table.name }..."
|
149
|
+
s3_object = self.upload_table_to_s3!(table)
|
150
|
+
self.create_staging_table!(table)
|
151
|
+
self.create_output_table_on_data_warehouse!(table)
|
152
|
+
self.run_query(self.copy_query(table, s3_object.s3_path))
|
153
|
+
self.merge_from_staging!(table)
|
154
|
+
self.drop_staging_table!(table)
|
165
155
|
end
|
166
156
|
|
167
157
|
def self.value_to_string(value)
|
data/lib/dataduck/source.rb
CHANGED
@@ -22,6 +22,9 @@ module DataDuck
|
|
22
22
|
if source_type == "postgresql"
|
23
23
|
DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
|
24
24
|
return DataDuck.sources[name]
|
25
|
+
elsif source_type == "mysql"
|
26
|
+
DataDuck.sources[name] = DataDuck::MysqlSource.new(configuration)
|
27
|
+
return DataDuck.sources[name]
|
25
28
|
else
|
26
29
|
raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
|
27
30
|
end
|
data/lib/dataduck/table.rb
CHANGED
@@ -4,10 +4,10 @@ module DataDuck
|
|
4
4
|
attr_accessor :sources
|
5
5
|
attr_accessor :output_schema
|
6
6
|
attr_accessor :actions
|
7
|
-
attr_accessor :errors
|
8
7
|
end
|
9
8
|
|
10
9
|
attr_accessor :data
|
10
|
+
attr_accessor :errors
|
11
11
|
|
12
12
|
def self.transforms(transformation_name)
|
13
13
|
self.actions ||= []
|
@@ -21,10 +21,20 @@ module DataDuck
|
|
21
21
|
end
|
22
22
|
singleton_class.send(:alias_method, :validate, :validates)
|
23
23
|
|
24
|
-
def self.source(source_name,
|
25
|
-
self.sources ||=
|
26
|
-
|
27
|
-
|
24
|
+
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
|
25
|
+
self.sources ||= []
|
26
|
+
|
27
|
+
source_spec = {}
|
28
|
+
if source_table_or_query.respond_to?(:to_s) && source_table_or_query.to_s.downcase.include?('select ')
|
29
|
+
source_spec = {query: source_table_or_query}
|
30
|
+
elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
|
31
|
+
source_spec = {columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name)}
|
32
|
+
else
|
33
|
+
source_spec = {columns: source_columns, table_name: source_table_or_query.to_s}
|
34
|
+
end
|
35
|
+
|
36
|
+
source_spec[:source] = DataDuck::Source.source(source_name)
|
37
|
+
self.sources << source_spec
|
28
38
|
end
|
29
39
|
|
30
40
|
def self.output(schema)
|
@@ -49,19 +59,29 @@ module DataDuck
|
|
49
59
|
|
50
60
|
self.errors ||= []
|
51
61
|
self.data = []
|
52
|
-
self.class.sources.
|
53
|
-
|
54
|
-
|
62
|
+
self.class.sources.each do |source_spec|
|
63
|
+
source = source_spec[:source]
|
64
|
+
my_query = self.extract_query(source_spec)
|
65
|
+
results = source.query(my_query)
|
55
66
|
self.data = results
|
56
67
|
end
|
57
68
|
self.data
|
58
69
|
end
|
59
70
|
|
71
|
+
def extract_query(source_spec)
|
72
|
+
if source_spec.has_key?(:query)
|
73
|
+
query
|
74
|
+
else
|
75
|
+
"SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
60
79
|
def transform!
|
61
80
|
puts "Transforming table #{ self.name }..."
|
62
81
|
|
63
82
|
self.errors ||= []
|
64
|
-
self.actions
|
83
|
+
self.class.actions ||= []
|
84
|
+
self.class.actions.each do |action|
|
65
85
|
action_type = action[0]
|
66
86
|
action_method_name = action[1]
|
67
87
|
if action_type == :transform
|
data/lib/dataduck/version.rb
CHANGED
data/lib/templates/quickstart/table.rb.erb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
class <%=
|
2
|
-
source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
|
1
|
+
class <%= table_name_camelcased %> < DataDuck::Table
|
2
|
+
source :my_database, :<%= table_name %>, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
|
3
3
|
|
4
4
|
output({<% columns.each do |col| %>
|
5
5
|
<%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataduck
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeff Pickhardt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- ".gitignore"
|
135
135
|
- ".rspec"
|
136
136
|
- ".ruby-version"
|
137
|
+
- DEV_README.md
|
137
138
|
- Gemfile
|
138
139
|
- README.md
|
139
140
|
- Rakefile
|