dataduck 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dcc9a5407d2bae97ab0ecb754f95d3c872b92cfe
4
- data.tar.gz: d32783d694d625367fb5f732602a6c2a997e8241
3
+ metadata.gz: a4dabe01cff2c6455751ab08c520d4bfaee62139
4
+ data.tar.gz: d20ef216bc631c445daad0767a51788b42b7f90f
5
5
  SHA512:
6
- metadata.gz: 184f298a735a3928a78d5b8e85e22e498d4e26b554d89a2b4201afa0e91d8b812872e0c72e4a6046970b6b8489ff451ad4b1fd93d2271314d133484762189470
7
- data.tar.gz: d9e4001147d98b51c3a481f894d52129349ba656d1f6b362845f37618e41b2bdf29005389b106df842ffb8ec02d0b84b45c683a561a49bc4674a88ce392fefcc
6
+ metadata.gz: d2eacaf08c612c25ae8bf9b1b1d46d4a0312fe0024211d0a8306faa5a810b972a5c2aa8386c4b05b04a26d73093bcae5a89d72bcadef98f6ed7e062054d40410
7
+ data.tar.gz: 2c4c1aec2a0257ad3dcc4e9559436c39de0f747a6ec3fdb816afb7d096678d7c1f608269b6c8dc55d1f1aeac514153bdbec7dbb7d3c082699a89f28e16577b22
@@ -0,0 +1,17 @@
1
+ # Helpful things to remember when developing
2
+
3
+ ## Publishing to Rubygems
4
+
5
+ Ensure the version number is updated (lib/dataduck/version.rb).
6
+
7
+ rspec
8
+
9
+ gem build dataduck.gemspec
10
+
11
+ gem push dataduck-VERSION.gem
12
+
13
+ ## Requiring the local version of the Gem
14
+
15
+ Use something like this:
16
+
17
+ gem 'dataduck', '0.3.0', path: '/Users/jrp/projects/dataduck'
data/README.md CHANGED
@@ -12,10 +12,6 @@ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon
12
12
 
13
13
  ## Installation
14
14
 
15
- ##### Example project
16
-
17
- See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
18
-
19
15
  ##### Instructions for using DataDuck ETL
20
16
 
21
17
  Create a new, empty directory. Inside this directory, create a file named Gemfile, and add the following to it:
@@ -40,45 +36,7 @@ If you'd like to run this regularly, such as every night, it's recommended to us
40
36
 
41
37
  ## Documentation
42
38
 
43
- Tables are defined in their own file under /src/tables. Here's an example table:
44
-
45
- ```ruby
46
- class Decks < DataDuck::Table
47
- source :my_database, ["id", "name", "user_id", "cards",
48
- "num_wins", "num_losses", "created_at", "updated_at",
49
- "is_drafted", "num_draft_wins", "num_draft_losses"]
50
-
51
- transforms :calculate_num_totals
52
-
53
- validates :validates_num_total
54
-
55
- output({
56
- :id => :integer,
57
- :name => :string,
58
- :user_id => :integer,
59
- :num_wins => :integer,
60
- :num_losses => :integer,
61
- :num_total => :integer,
62
- :num_draft_total => :integer,
63
- :created_at => :datetime,
64
- :updated_at => :datetime,
65
- :is_drafted => :boolean,
66
- # Note that num_draft_wins and num_draft_losses
67
- # are not included in the output, but are used in
68
- # the transformation.
69
- })
70
-
71
- def calculate_num_totals(row)
72
- row[:num_total] = row[:num_wins] + row[:num_losses]
73
- row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
74
- row
75
- end
76
-
77
- def validates_num_total(row)
78
- return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
79
- end
80
- end
81
- ```
39
+ Visit the [docs page](http://dataducketl.com/docs/overview/welcome) to read the documentation. The docs page is autogenerated from the files in this project's docs directory.
82
40
 
83
41
  ## Contributing
84
42
 
@@ -60,8 +60,7 @@ module DataDuck
60
60
 
61
61
  def self.quickstart
62
62
  puts "Welcome to DataDuck!"
63
- puts "This quickstart wizard will create your application, assuming the source is a Postgres database and the destination is an Amazon Redshift data warehouse."
64
-
63
+ puts "This quickstart wizard will help you set up DataDuck."
65
64
 
66
65
  puts "What kind of database would you like to source from?"
67
66
  db_type = prompt_choices([
@@ -115,7 +114,7 @@ module DataDuck
115
114
  config_obj = {
116
115
  'sources' => {
117
116
  'my_database' => {
118
- 'type' => 'postgresql',
117
+ 'type' => db_type.to_s,
119
118
  'host' => source_host,
120
119
  'database' => source_database,
121
120
  'port' => source_port,
@@ -170,9 +169,11 @@ module DataDuck
170
169
  columns << [property_name.to_s, property_type.to_s, commented_out]
171
170
  end
172
171
 
172
+ columns.sort! { |a, b| a[0] <=> b[0] }
173
+
173
174
  table_name = table_name.to_s.downcase
174
175
  table_name_camelcased = table_name.split('_').collect(&:capitalize).join
175
- namespace = Namespace.new(table_name: table_name_camelcased, columns: columns)
176
+ namespace = Namespace.new(table_name_camelcased: table_name_camelcased, table_name: table_name, columns: columns)
176
177
  template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb", 'r').read
177
178
  result = ERB.new(template).result(namespace.get_binding)
178
179
  DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
@@ -8,16 +8,8 @@ module DataDuck
8
8
  DataDuck.config['destinations'][name.to_s]
9
9
  end
10
10
 
11
- def load_tables!(tables)
12
- raise Exception.new("Must implement load_tables! in subclass")
13
- end
14
-
15
- def before_all_loads!
16
-
17
- end
18
-
19
- def after_all_loads!
20
- # e.g. cleanup
11
+ def load_table!(table)
12
+ raise Exception.new("Must implement load_table! in subclass")
21
13
  end
22
14
 
23
15
  def self.destination(destination_name)
@@ -31,18 +31,13 @@ module DataDuck
31
31
  def process!
32
32
  puts "Processing ETL..."
33
33
 
34
- table_instances = []
35
34
  @tables.each do |table_class|
36
- table_instance = table_class.new
37
- table_instances << table_instance
38
- table_instance.extract!
39
- table_instance.transform!
40
- end
41
-
42
- self.class.destinations.each do |destination|
43
- destination.before_all_loads!(table_instances)
44
- destination.load_tables!(table_instances)
45
- destination.after_all_loads!(table_instances)
35
+ table_to_etl = table_class.new
36
+ table_to_etl.extract!
37
+ table_to_etl.transform!
38
+ self.class.destinations.each do |destination|
39
+ destination.load_table!(table_to_etl)
40
+ end
46
41
  end
47
42
  end
48
43
  end
@@ -144,24 +144,14 @@ module DataDuck
144
144
  return s3_obj
145
145
  end
146
146
 
147
- def before_all_loads!(tables)
148
-
149
- end
150
-
151
- def load_tables!(tables)
152
- tables.each do |table|
153
- puts "Loading table #{ table.name }..."
154
- s3_object = self.upload_table_to_s3!(table)
155
- self.create_staging_table!(table)
156
- self.create_output_table_on_data_warehouse!(table)
157
- self.run_query(self.copy_query(table, s3_object.s3_path))
158
- self.merge_from_staging!(table)
159
- self.drop_staging_table!(table)
160
- end
161
- end
162
-
163
- def after_all_loads!(tables)
164
-
147
+ def load_table!(table)
148
+ puts "Loading table #{ table.name }..."
149
+ s3_object = self.upload_table_to_s3!(table)
150
+ self.create_staging_table!(table)
151
+ self.create_output_table_on_data_warehouse!(table)
152
+ self.run_query(self.copy_query(table, s3_object.s3_path))
153
+ self.merge_from_staging!(table)
154
+ self.drop_staging_table!(table)
165
155
  end
166
156
 
167
157
  def self.value_to_string(value)
@@ -22,6 +22,9 @@ module DataDuck
22
22
  if source_type == "postgresql"
23
23
  DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
24
24
  return DataDuck.sources[name]
25
+ elsif source_type == "mysql"
26
+ DataDuck.sources[name] = DataDuck::MysqlSource.new(configuration)
27
+ return DataDuck.sources[name]
25
28
  else
26
29
  raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
27
30
  end
@@ -4,10 +4,10 @@ module DataDuck
4
4
  attr_accessor :sources
5
5
  attr_accessor :output_schema
6
6
  attr_accessor :actions
7
- attr_accessor :errors
8
7
  end
9
8
 
10
9
  attr_accessor :data
10
+ attr_accessor :errors
11
11
 
12
12
  def self.transforms(transformation_name)
13
13
  self.actions ||= []
@@ -21,10 +21,20 @@ module DataDuck
21
21
  end
22
22
  singleton_class.send(:alias_method, :validate, :validates)
23
23
 
24
- def self.source(source_name, source_data = [])
25
- self.sources ||= {}
26
- source = DataDuck::Source.source(source_name)
27
- self.sources[source] = source_data
24
+ def self.source(source_name, source_table_or_query = nil, source_columns = nil)
25
+ self.sources ||= []
26
+
27
+ source_spec = {}
28
+ if source_table_or_query.respond_to?(:to_s) && source_table_or_query.to_s.downcase.include?('select ')
29
+ source_spec = {query: source_table_or_query}
30
+ elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
31
+ source_spec = {columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name)}
32
+ else
33
+ source_spec = {columns: source_columns, table_name: source_table_or_query.to_s}
34
+ end
35
+
36
+ source_spec[:source] = DataDuck::Source.source(source_name)
37
+ self.sources << source_spec
28
38
  end
29
39
 
30
40
  def self.output(schema)
@@ -49,19 +59,29 @@ module DataDuck
49
59
 
50
60
  self.errors ||= []
51
61
  self.data = []
52
- self.class.sources.each_pair do |source, source_columns|
53
- import_query = "SELECT \"#{ source_columns.sort.join('","') }\" FROM #{ self.name }"
54
- results = source.query(import_query)
62
+ self.class.sources.each do |source_spec|
63
+ source = source_spec[:source]
64
+ my_query = self.extract_query(source_spec)
65
+ results = source.query(my_query)
55
66
  self.data = results
56
67
  end
57
68
  self.data
58
69
  end
59
70
 
71
+ def extract_query(source_spec)
72
+ if source_spec.has_key?(:query)
73
+ query
74
+ else
75
+ "SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
76
+ end
77
+ end
78
+
60
79
  def transform!
61
80
  puts "Transforming table #{ self.name }..."
62
81
 
63
82
  self.errors ||= []
64
- self.actions.each do |action|
83
+ self.class.actions ||= []
84
+ self.class.actions.each do |action|
65
85
  action_type = action[0]
66
86
  action_method_name = action[1]
67
87
  if action_type == :transform
@@ -1,6 +1,6 @@
1
1
  module DataDuck
2
2
  VERSION_MAJOR = 0
3
- VERSION_MINOR = 3
3
+ VERSION_MINOR = 4
4
4
  VERSION_PATCH = 0
5
5
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
6
6
  end
@@ -1,5 +1,5 @@
1
- class <%= table_name %> < DataDuck::Table
2
- source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
1
+ class <%= table_name_camelcased %> < DataDuck::Table
2
+ source :my_database, :<%= table_name %>, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
3
3
 
4
4
  output({<% columns.each do |col| %>
5
5
  <%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-11 00:00:00.000000000 Z
11
+ date: 2015-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -134,6 +134,7 @@ files:
134
134
  - ".gitignore"
135
135
  - ".rspec"
136
136
  - ".ruby-version"
137
+ - DEV_README.md
137
138
  - Gemfile
138
139
  - README.md
139
140
  - Rakefile