dataduck 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dcc9a5407d2bae97ab0ecb754f95d3c872b92cfe
4
- data.tar.gz: d32783d694d625367fb5f732602a6c2a997e8241
3
+ metadata.gz: a4dabe01cff2c6455751ab08c520d4bfaee62139
4
+ data.tar.gz: d20ef216bc631c445daad0767a51788b42b7f90f
5
5
  SHA512:
6
- metadata.gz: 184f298a735a3928a78d5b8e85e22e498d4e26b554d89a2b4201afa0e91d8b812872e0c72e4a6046970b6b8489ff451ad4b1fd93d2271314d133484762189470
7
- data.tar.gz: d9e4001147d98b51c3a481f894d52129349ba656d1f6b362845f37618e41b2bdf29005389b106df842ffb8ec02d0b84b45c683a561a49bc4674a88ce392fefcc
6
+ metadata.gz: d2eacaf08c612c25ae8bf9b1b1d46d4a0312fe0024211d0a8306faa5a810b972a5c2aa8386c4b05b04a26d73093bcae5a89d72bcadef98f6ed7e062054d40410
7
+ data.tar.gz: 2c4c1aec2a0257ad3dcc4e9559436c39de0f747a6ec3fdb816afb7d096678d7c1f608269b6c8dc55d1f1aeac514153bdbec7dbb7d3c082699a89f28e16577b22
@@ -0,0 +1,17 @@
1
+ # Helpful things to remember when developing
2
+
3
+ ## Publishing to Rubygems
4
+
5
+ Ensure the version number is updated (lib/dataduck/version.rb).
6
+
7
+ rspec
8
+
9
+ gem build dataduck.gemspec
10
+
11
+ gem push dataduck-VERSION.gem
12
+
13
+ ## Requiring the local version of the Gem
14
+
15
+ Use something like this:
16
+
17
+ gem 'dataduck', '0.3.0', path: '/Users/jrp/projects/dataduck'
data/README.md CHANGED
@@ -12,10 +12,6 @@ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon
12
12
 
13
13
  ## Installation
14
14
 
15
- ##### Example project
16
-
17
- See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
18
-
19
15
  ##### Instructions for using DataDuck ETL
20
16
 
21
17
  Create a new, empty directory. Inside this directory, create a file named Gemfile, and add the following to it:
@@ -40,45 +36,7 @@ If you'd like to run this regularly, such as every night, it's recommended to us
40
36
 
41
37
  ## Documentation
42
38
 
43
- Tables are defined in their own file under /src/tables. Here's an example table:
44
-
45
- ```ruby
46
- class Decks < DataDuck::Table
47
- source :my_database, ["id", "name", "user_id", "cards",
48
- "num_wins", "num_losses", "created_at", "updated_at",
49
- "is_drafted", "num_draft_wins", "num_draft_losses"]
50
-
51
- transforms :calculate_num_totals
52
-
53
- validates :validates_num_total
54
-
55
- output({
56
- :id => :integer,
57
- :name => :string,
58
- :user_id => :integer,
59
- :num_wins => :integer,
60
- :num_losses => :integer,
61
- :num_total => :integer,
62
- :num_draft_total => :integer,
63
- :created_at => :datetime,
64
- :updated_at => :datetime,
65
- :is_drafted => :boolean,
66
- # Note that num_draft_wins and num_draft_losses
67
- # are not included in the output, but are used in
68
- # the transformation.
69
- })
70
-
71
- def calculate_num_totals(row)
72
- row[:num_total] = row[:num_wins] + row[:num_losses]
73
- row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
74
- row
75
- end
76
-
77
- def validates_num_total(row)
78
- return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
79
- end
80
- end
81
- ```
39
+ Visit the [docs page](http://dataducketl.com/docs/overview/welcome) to read the documentation. The docs page is autogenerated from the files in this project's docs directory.
82
40
 
83
41
  ## Contributing
84
42
 
@@ -60,8 +60,7 @@ module DataDuck
60
60
 
61
61
  def self.quickstart
62
62
  puts "Welcome to DataDuck!"
63
- puts "This quickstart wizard will create your application, assuming the source is a Postgres database and the destination is an Amazon Redshift data warehouse."
64
-
63
+ puts "This quickstart wizard will help you set up DataDuck."
65
64
 
66
65
  puts "What kind of database would you like to source from?"
67
66
  db_type = prompt_choices([
@@ -115,7 +114,7 @@ module DataDuck
115
114
  config_obj = {
116
115
  'sources' => {
117
116
  'my_database' => {
118
- 'type' => 'postgresql',
117
+ 'type' => db_type.to_s,
119
118
  'host' => source_host,
120
119
  'database' => source_database,
121
120
  'port' => source_port,
@@ -170,9 +169,11 @@ module DataDuck
170
169
  columns << [property_name.to_s, property_type.to_s, commented_out]
171
170
  end
172
171
 
172
+ columns.sort! { |a, b| a[0] <=> b[0] }
173
+
173
174
  table_name = table_name.to_s.downcase
174
175
  table_name_camelcased = table_name.split('_').collect(&:capitalize).join
175
- namespace = Namespace.new(table_name: table_name_camelcased, columns: columns)
176
+ namespace = Namespace.new(table_name_camelcased: table_name_camelcased, table_name: table_name, columns: columns)
176
177
  template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb", 'r').read
177
178
  result = ERB.new(template).result(namespace.get_binding)
178
179
  DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)
@@ -8,16 +8,8 @@ module DataDuck
8
8
  DataDuck.config['destinations'][name.to_s]
9
9
  end
10
10
 
11
- def load_tables!(tables)
12
- raise Exception.new("Must implement load_tables! in subclass")
13
- end
14
-
15
- def before_all_loads!
16
-
17
- end
18
-
19
- def after_all_loads!
20
- # e.g. cleanup
11
+ def load_table!(table)
12
+ raise Exception.new("Must implement load_table! in subclass")
21
13
  end
22
14
 
23
15
  def self.destination(destination_name)
@@ -31,18 +31,13 @@ module DataDuck
31
31
  def process!
32
32
  puts "Processing ETL..."
33
33
 
34
- table_instances = []
35
34
  @tables.each do |table_class|
36
- table_instance = table_class.new
37
- table_instances << table_instance
38
- table_instance.extract!
39
- table_instance.transform!
40
- end
41
-
42
- self.class.destinations.each do |destination|
43
- destination.before_all_loads!(table_instances)
44
- destination.load_tables!(table_instances)
45
- destination.after_all_loads!(table_instances)
35
+ table_to_etl = table_class.new
36
+ table_to_etl.extract!
37
+ table_to_etl.transform!
38
+ self.class.destinations.each do |destination|
39
+ destination.load_table!(table_to_etl)
40
+ end
46
41
  end
47
42
  end
48
43
  end
@@ -144,24 +144,14 @@ module DataDuck
144
144
  return s3_obj
145
145
  end
146
146
 
147
- def before_all_loads!(tables)
148
-
149
- end
150
-
151
- def load_tables!(tables)
152
- tables.each do |table|
153
- puts "Loading table #{ table.name }..."
154
- s3_object = self.upload_table_to_s3!(table)
155
- self.create_staging_table!(table)
156
- self.create_output_table_on_data_warehouse!(table)
157
- self.run_query(self.copy_query(table, s3_object.s3_path))
158
- self.merge_from_staging!(table)
159
- self.drop_staging_table!(table)
160
- end
161
- end
162
-
163
- def after_all_loads!(tables)
164
-
147
+ def load_table!(table)
148
+ puts "Loading table #{ table.name }..."
149
+ s3_object = self.upload_table_to_s3!(table)
150
+ self.create_staging_table!(table)
151
+ self.create_output_table_on_data_warehouse!(table)
152
+ self.run_query(self.copy_query(table, s3_object.s3_path))
153
+ self.merge_from_staging!(table)
154
+ self.drop_staging_table!(table)
165
155
  end
166
156
 
167
157
  def self.value_to_string(value)
@@ -22,6 +22,9 @@ module DataDuck
22
22
  if source_type == "postgresql"
23
23
  DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
24
24
  return DataDuck.sources[name]
25
+ elsif source_type == "mysql"
26
+ DataDuck.sources[name] = DataDuck::MysqlSource.new(configuration)
27
+ return DataDuck.sources[name]
25
28
  else
26
29
  raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
27
30
  end
@@ -4,10 +4,10 @@ module DataDuck
4
4
  attr_accessor :sources
5
5
  attr_accessor :output_schema
6
6
  attr_accessor :actions
7
- attr_accessor :errors
8
7
  end
9
8
 
10
9
  attr_accessor :data
10
+ attr_accessor :errors
11
11
 
12
12
  def self.transforms(transformation_name)
13
13
  self.actions ||= []
@@ -21,10 +21,20 @@ module DataDuck
21
21
  end
22
22
  singleton_class.send(:alias_method, :validate, :validates)
23
23
 
24
- def self.source(source_name, source_data = [])
25
- self.sources ||= {}
26
- source = DataDuck::Source.source(source_name)
27
- self.sources[source] = source_data
24
+ def self.source(source_name, source_table_or_query = nil, source_columns = nil)
25
+ self.sources ||= []
26
+
27
+ source_spec = {}
28
+ if source_table_or_query.respond_to?(:to_s) && source_table_or_query.to_s.downcase.include?('select ')
29
+ source_spec = {query: source_table_or_query}
30
+ elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
31
+ source_spec = {columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name)}
32
+ else
33
+ source_spec = {columns: source_columns, table_name: source_table_or_query.to_s}
34
+ end
35
+
36
+ source_spec[:source] = DataDuck::Source.source(source_name)
37
+ self.sources << source_spec
28
38
  end
29
39
 
30
40
  def self.output(schema)
@@ -49,19 +59,29 @@ module DataDuck
49
59
 
50
60
  self.errors ||= []
51
61
  self.data = []
52
- self.class.sources.each_pair do |source, source_columns|
53
- import_query = "SELECT \"#{ source_columns.sort.join('","') }\" FROM #{ self.name }"
54
- results = source.query(import_query)
62
+ self.class.sources.each do |source_spec|
63
+ source = source_spec[:source]
64
+ my_query = self.extract_query(source_spec)
65
+ results = source.query(my_query)
55
66
  self.data = results
56
67
  end
57
68
  self.data
58
69
  end
59
70
 
71
+ def extract_query(source_spec)
72
+ if source_spec.has_key?(:query)
73
+ source_spec[:query]
74
+ else
75
+ "SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
76
+ end
77
+ end
78
+
60
79
  def transform!
61
80
  puts "Transforming table #{ self.name }..."
62
81
 
63
82
  self.errors ||= []
64
- self.actions.each do |action|
83
+ self.class.actions ||= []
84
+ self.class.actions.each do |action|
65
85
  action_type = action[0]
66
86
  action_method_name = action[1]
67
87
  if action_type == :transform
@@ -1,6 +1,6 @@
1
1
  module DataDuck
2
2
  VERSION_MAJOR = 0
3
- VERSION_MINOR = 3
3
+ VERSION_MINOR = 4
4
4
  VERSION_PATCH = 0
5
5
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
6
6
  end
@@ -1,5 +1,5 @@
1
- class <%= table_name %> < DataDuck::Table
2
- source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
1
+ class <%= table_name_camelcased %> < DataDuck::Table
2
+ source :my_database, :<%= table_name %>, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
3
3
 
4
4
  output({<% columns.each do |col| %>
5
5
  <%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-11 00:00:00.000000000 Z
11
+ date: 2015-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -134,6 +134,7 @@ files:
134
134
  - ".gitignore"
135
135
  - ".rspec"
136
136
  - ".ruby-version"
137
+ - DEV_README.md
137
138
  - Gemfile
138
139
  - README.md
139
140
  - Rakefile