dataduck 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/DEV_README.md +17 -0
- data/README.md +1 -43
- data/lib/dataduck/commands.rb +5 -4
- data/lib/dataduck/destination.rb +2 -10
- data/lib/dataduck/etl.rb +6 -11
- data/lib/dataduck/redshift_destination.rb +8 -18
- data/lib/dataduck/source.rb +3 -0
- data/lib/dataduck/table.rb +29 -9
- data/lib/dataduck/version.rb +1 -1
- data/lib/templates/quickstart/table.rb.erb +2 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4dabe01cff2c6455751ab08c520d4bfaee62139
|
4
|
+
data.tar.gz: d20ef216bc631c445daad0767a51788b42b7f90f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2eacaf08c612c25ae8bf9b1b1d46d4a0312fe0024211d0a8306faa5a810b972a5c2aa8386c4b05b04a26d73093bcae5a89d72bcadef98f6ed7e062054d40410
|
7
|
+
data.tar.gz: 2c4c1aec2a0257ad3dcc4e9559436c39de0f747a6ec3fdb816afb7d096678d7c1f608269b6c8dc55d1f1aeac514153bdbec7dbb7d3c082699a89f28e16577b22
|
data/DEV_README.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Helpful things to remember when developing
|
2
|
+
|
3
|
+
## Publishing to Rubygems
|
4
|
+
|
5
|
+
Ensure the version number is updated (lib/dataduck/version.rb).
|
6
|
+
|
7
|
+
rspec
|
8
|
+
|
9
|
+
gem build dataduck.gemspec
|
10
|
+
|
11
|
+
gem push dataduck-VERSION.gem
|
12
|
+
|
13
|
+
## Requiring the local version of the Gem
|
14
|
+
|
15
|
+
Use something like this:
|
16
|
+
|
17
|
+
gem 'dataduck', '0.3.0', path: '/Users/jrp/projects/dataduck'
|
data/README.md
CHANGED
@@ -12,10 +12,6 @@ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon
|
|
12
12
|
|
13
13
|
## Installation
|
14
14
|
|
15
|
-
##### Example project
|
16
|
-
|
17
|
-
See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
|
18
|
-
|
19
15
|
##### Instructions for using DataDuck ETL
|
20
16
|
|
21
17
|
Create a new, empty directory. Inside this directory, create a file named Gemfile, and add the following to it:
|
@@ -40,45 +36,7 @@ If you'd like to run this regularly, such as every night, it's recommended to us
|
|
40
36
|
|
41
37
|
## Documentation
|
42
38
|
|
43
|
-
|
44
|
-
|
45
|
-
```ruby
|
46
|
-
class Decks < DataDuck::Table
|
47
|
-
source :my_database, ["id", "name", "user_id", "cards",
|
48
|
-
"num_wins", "num_losses", "created_at", "updated_at",
|
49
|
-
"is_drafted", "num_draft_wins", "num_draft_losses"]
|
50
|
-
|
51
|
-
transforms :calculate_num_totals
|
52
|
-
|
53
|
-
validates :validates_num_total
|
54
|
-
|
55
|
-
output({
|
56
|
-
:id => :integer,
|
57
|
-
:name => :string,
|
58
|
-
:user_id => :integer,
|
59
|
-
:num_wins => :integer,
|
60
|
-
:num_losses => :integer,
|
61
|
-
:num_total => :integer,
|
62
|
-
:num_draft_total => :integer,
|
63
|
-
:created_at => :datetime,
|
64
|
-
:updated_at => :datetime,
|
65
|
-
:is_drafted => :boolean,
|
66
|
-
# Note that num_draft_wins and num_draft_losses
|
67
|
-
# are not included in the output, but are used in
|
68
|
-
# the transformation.
|
69
|
-
})
|
70
|
-
|
71
|
-
def calculate_num_totals(row)
|
72
|
-
row[:num_total] = row[:num_wins] + row[:num_losses]
|
73
|
-
row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
|
74
|
-
row
|
75
|
-
end
|
76
|
-
|
77
|
-
def validates_num_total(row)
|
78
|
-
return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
|
79
|
-
end
|
80
|
-
end
|
81
|
-
```
|
39
|
+
Visit the [docs page](http://dataducketl.com/docs/overview/welcome) to read the documentation. The docs page is autogenerated from the files in this project's docs directory.
|
82
40
|
|
83
41
|
## Contributing
|
84
42
|
|
data/lib/dataduck/commands.rb
CHANGED
@@ -60,8 +60,7 @@ module DataDuck
|
|
60
60
|
|
61
61
|
def self.quickstart
|
62
62
|
puts "Welcome to DataDuck!"
|
63
|
-
puts "This quickstart wizard will
|
64
|
-
|
63
|
+
puts "This quickstart wizard will help you set up DataDuck."
|
65
64
|
|
66
65
|
puts "What kind of database would you like to source from?"
|
67
66
|
db_type = prompt_choices([
|
@@ -115,7 +114,7 @@ module DataDuck
|
|
115
114
|
config_obj = {
|
116
115
|
'sources' => {
|
117
116
|
'my_database' => {
|
118
|
-
'type' =>
|
117
|
+
'type' => db_type.to_s,
|
119
118
|
'host' => source_host,
|
120
119
|
'database' => source_database,
|
121
120
|
'port' => source_port,
|
@@ -170,9 +169,11 @@ module DataDuck
|
|
170
169
|
columns << [property_name.to_s, property_type.to_s, commented_out]
|
171
170
|
end
|
172
171
|
|
172
|
+
columns.sort! { |a, b| a[0] <=> b[0] }
|
173
|
+
|
173
174
|
table_name = table_name.to_s.downcase
|
174
175
|
table_name_camelcased = table_name.split('_').collect(&:capitalize).join
|
175
|
-
namespace = Namespace.new(
|
176
|
+
namespace = Namespace.new(table_name_camelcased: table_name_camelcased, table_name: table_name, columns: columns)
|
176
177
|
template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb", 'r').read
|
177
178
|
result = ERB.new(template).result(namespace.get_binding)
|
178
179
|
DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
|
data/lib/dataduck/destination.rb
CHANGED
@@ -8,16 +8,8 @@ module DataDuck
|
|
8
8
|
DataDuck.config['destinations'][name.to_s]
|
9
9
|
end
|
10
10
|
|
11
|
-
def
|
12
|
-
raise Exception.new("Must implement
|
13
|
-
end
|
14
|
-
|
15
|
-
def before_all_loads!
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
def after_all_loads!
|
20
|
-
# e.g. cleanup
|
11
|
+
def load_table!(table)
|
12
|
+
raise Exception.new("Must implement load_table! in subclass")
|
21
13
|
end
|
22
14
|
|
23
15
|
def self.destination(destination_name)
|
data/lib/dataduck/etl.rb
CHANGED
@@ -31,18 +31,13 @@ module DataDuck
|
|
31
31
|
def process!
|
32
32
|
puts "Processing ETL..."
|
33
33
|
|
34
|
-
table_instances = []
|
35
34
|
@tables.each do |table_class|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
self.class.destinations.each do |destination|
|
43
|
-
destination.before_all_loads!(table_instances)
|
44
|
-
destination.load_tables!(table_instances)
|
45
|
-
destination.after_all_loads!(table_instances)
|
35
|
+
table_to_etl = table_class.new
|
36
|
+
table_to_etl.extract!
|
37
|
+
table_to_etl.transform!
|
38
|
+
self.class.destinations.each do |destination|
|
39
|
+
destination.load_table!(table_to_etl)
|
40
|
+
end
|
46
41
|
end
|
47
42
|
end
|
48
43
|
end
|
@@ -144,24 +144,14 @@ module DataDuck
|
|
144
144
|
return s3_obj
|
145
145
|
end
|
146
146
|
|
147
|
-
def
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
self.create_staging_table!(table)
|
156
|
-
self.create_output_table_on_data_warehouse!(table)
|
157
|
-
self.run_query(self.copy_query(table, s3_object.s3_path))
|
158
|
-
self.merge_from_staging!(table)
|
159
|
-
self.drop_staging_table!(table)
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
def after_all_loads!(tables)
|
164
|
-
|
147
|
+
def load_table!(table)
|
148
|
+
puts "Loading table #{ table.name }..."
|
149
|
+
s3_object = self.upload_table_to_s3!(table)
|
150
|
+
self.create_staging_table!(table)
|
151
|
+
self.create_output_table_on_data_warehouse!(table)
|
152
|
+
self.run_query(self.copy_query(table, s3_object.s3_path))
|
153
|
+
self.merge_from_staging!(table)
|
154
|
+
self.drop_staging_table!(table)
|
165
155
|
end
|
166
156
|
|
167
157
|
def self.value_to_string(value)
|
data/lib/dataduck/source.rb
CHANGED
@@ -22,6 +22,9 @@ module DataDuck
|
|
22
22
|
if source_type == "postgresql"
|
23
23
|
DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
|
24
24
|
return DataDuck.sources[name]
|
25
|
+
elsif source_type == "mysql"
|
26
|
+
DataDuck.sources[name] = DataDuck::MysqlSource.new(configuration)
|
27
|
+
return DataDuck.sources[name]
|
25
28
|
else
|
26
29
|
raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
|
27
30
|
end
|
data/lib/dataduck/table.rb
CHANGED
@@ -4,10 +4,10 @@ module DataDuck
|
|
4
4
|
attr_accessor :sources
|
5
5
|
attr_accessor :output_schema
|
6
6
|
attr_accessor :actions
|
7
|
-
attr_accessor :errors
|
8
7
|
end
|
9
8
|
|
10
9
|
attr_accessor :data
|
10
|
+
attr_accessor :errors
|
11
11
|
|
12
12
|
def self.transforms(transformation_name)
|
13
13
|
self.actions ||= []
|
@@ -21,10 +21,20 @@ module DataDuck
|
|
21
21
|
end
|
22
22
|
singleton_class.send(:alias_method, :validate, :validates)
|
23
23
|
|
24
|
-
def self.source(source_name,
|
25
|
-
self.sources ||=
|
26
|
-
|
27
|
-
|
24
|
+
def self.source(source_name, source_table_or_query = nil, source_columns = nil)
|
25
|
+
self.sources ||= []
|
26
|
+
|
27
|
+
source_spec = {}
|
28
|
+
if source_table_or_query.respond_to?(:to_s) && source_table_or_query.to_s.downcase.include?('select ')
|
29
|
+
source_spec = {query: source_table_or_query}
|
30
|
+
elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
|
31
|
+
source_spec = {columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name)}
|
32
|
+
else
|
33
|
+
source_spec = {columns: source_columns, table_name: source_table_or_query.to_s}
|
34
|
+
end
|
35
|
+
|
36
|
+
source_spec[:source] = DataDuck::Source.source(source_name)
|
37
|
+
self.sources << source_spec
|
28
38
|
end
|
29
39
|
|
30
40
|
def self.output(schema)
|
@@ -49,19 +59,29 @@ module DataDuck
|
|
49
59
|
|
50
60
|
self.errors ||= []
|
51
61
|
self.data = []
|
52
|
-
self.class.sources.
|
53
|
-
|
54
|
-
|
62
|
+
self.class.sources.each do |source_spec|
|
63
|
+
source = source_spec[:source]
|
64
|
+
my_query = self.extract_query(source_spec)
|
65
|
+
results = source.query(my_query)
|
55
66
|
self.data = results
|
56
67
|
end
|
57
68
|
self.data
|
58
69
|
end
|
59
70
|
|
71
|
+
def extract_query(source_spec)
|
72
|
+
if source_spec.has_key?(:query)
|
73
|
+
query
|
74
|
+
else
|
75
|
+
"SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
60
79
|
def transform!
|
61
80
|
puts "Transforming table #{ self.name }..."
|
62
81
|
|
63
82
|
self.errors ||= []
|
64
|
-
self.actions
|
83
|
+
self.class.actions ||= []
|
84
|
+
self.class.actions.each do |action|
|
65
85
|
action_type = action[0]
|
66
86
|
action_method_name = action[1]
|
67
87
|
if action_type == :transform
|
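data/lib/dataduck/version.rb
CHANGED
(The version.rb hunk was not captured in this listing; the hunk below belongs to the quickstart template.)
data/lib/templates/quickstart/table.rb.erb
CHANGED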
@@ -1,5 +1,5 @@
|
|
1
|
-
class <%=
|
2
|
-
source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
|
1
|
+
class <%= table_name_camelcased %> < DataDuck::Table
|
2
|
+
source :my_database, :<%= table_name %>, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
|
3
3
|
|
4
4
|
output({<% columns.each do |col| %>
|
5
5
|
<%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataduck
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeff Pickhardt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- ".gitignore"
|
135
135
|
- ".rspec"
|
136
136
|
- ".ruby-version"
|
137
|
+
- DEV_README.md
|
137
138
|
- Gemfile
|
138
139
|
- README.md
|
139
140
|
- Rakefile
|