RubyGems - dataduck - Versions diffs - 0.3.0 → 0.4.0 - Mend

dataduck 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

checksums.yaml +4 -4
data/DEV_README.md +17 -0
data/README.md +1 -43
data/lib/dataduck/commands.rb +5 -4
data/lib/dataduck/destination.rb +2 -10
data/lib/dataduck/etl.rb +6 -11
data/lib/dataduck/redshift_destination.rb +8 -18
data/lib/dataduck/source.rb +3 -0
data/lib/dataduck/table.rb +29 -9
data/lib/dataduck/version.rb +1 -1
data/lib/templates/quickstart/table.rb.erb +2 -2
metadata +3 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dcc9a5407d2bae97ab0ecb754f95d3c872b92cfe
-  data.tar.gz: d32783d694d625367fb5f732602a6c2a997e8241
+  metadata.gz: a4dabe01cff2c6455751ab08c520d4bfaee62139
+  data.tar.gz: d20ef216bc631c445daad0767a51788b42b7f90f
 SHA512:
-  metadata.gz: 184f298a735a3928a78d5b8e85e22e498d4e26b554d89a2b4201afa0e91d8b812872e0c72e4a6046970b6b8489ff451ad4b1fd93d2271314d133484762189470
-  data.tar.gz: d9e4001147d98b51c3a481f894d52129349ba656d1f6b362845f37618e41b2bdf29005389b106df842ffb8ec02d0b84b45c683a561a49bc4674a88ce392fefcc
+  metadata.gz: d2eacaf08c612c25ae8bf9b1b1d46d4a0312fe0024211d0a8306faa5a810b972a5c2aa8386c4b05b04a26d73093bcae5a89d72bcadef98f6ed7e062054d40410
+  data.tar.gz: 2c4c1aec2a0257ad3dcc4e9559436c39de0f747a6ec3fdb816afb7d096678d7c1f608269b6c8dc55d1f1aeac514153bdbec7dbb7d3c082699a89f28e16577b22

data/DEV_README.md ADDED

@@ -0,0 +1,17 @@
+# Helpful things to remember when developing
+## Publishing to Rubygems
+Ensure the version number is updated (lib/dataduck/version.rb).
+rspec
+gem build dataduck.gemspec
+gem push dataduck-VERSION.gem
+## Requiring the local version of the Gem
+Use something like this:
+gem 'dataduck', '0.3.0', path: '/Users/jrp/projects/dataduck'

data/README.md CHANGED

@@ -12,10 +12,6 @@ DataDuck ETL is currently focused on loading to Amazon Redshift (through Amazon
 ## Installation
-##### Example project
-See [https://github.com/DataDuckETL/DataDuck/tree/master/examples/example](https://github.com/DataDuckETL/DataDuck/tree/master/examples/example) for an example project setup.
 ##### Instructions for using DataDuck ETL
 Create a new, empty directory. Inside this directory, create a file named Gemfile, and add the following to it:
@@ -40,45 +36,7 @@ If you'd like to run this regularly, such as every night, it's recommended to us
 ## Documentation
-Tables are defined in their own file under /src/tables. Here's an example table:
-```ruby
-class Decks < DataDuck::Table
-  source :my_database, ["id", "name", "user_id", "cards",
-      "num_wins", "num_losses", "created_at", "updated_at",
-      "is_drafted", "num_draft_wins", "num_draft_losses"]
-  transforms :calculate_num_totals
-  validates :validates_num_total
-  output({
-      :id => :integer,
-      :name => :string,
-      :user_id => :integer,
-      :num_wins => :integer,
-      :num_losses => :integer,
-      :num_total => :integer,
-      :num_draft_total => :integer,
-      :created_at => :datetime,
-      :updated_at => :datetime,
-      :is_drafted => :boolean,
-      # Note that num_draft_wins and num_draft_losses
-      # are not included in the output, but are used in
-      # the transformation.
-  })
-  def calculate_num_totals(row)
-    row[:num_total] = row[:num_wins] + row[:num_losses]
-    row[:num_draft_total] = row[:num_draft_wins] + row[:num_draft_losses]
-    row
-  end
-  def validates_num_total(row)
-    return "Deck id #{ row[:id] } has negative value #{ row[:num_total] } for num_total." if row[:num_total] < 0
-  end
-end
-```
+Visit the [docs page](http://dataducketl.com/docs/overview/welcome) to read the documentation. The docs page is autogenerated from the files in this project's docs directory.
 ## Contributing

data/lib/dataduck/commands.rb CHANGED

@@ -60,8 +60,7 @@ module DataDuck
     def self.quickstart
       puts "Welcome to DataDuck!"
-      puts "This quickstart wizard will create your application, assuming the source is a Postgres database and the destination is an Amazon Redshift data warehouse."
+      puts "This quickstart wizard will help you set up DataDuck."
       puts "What kind of database would you like to source from?"
       db_type = prompt_choices([
@@ -115,7 +114,7 @@ module DataDuck
       config_obj = {
         'sources' => {
           'my_database' => {
-            'type' => 'postgresql',
+            'type' => db_type.to_s,
             'host' => source_host,
             'database' => source_database,
             'port' => source_port,
@@ -170,9 +169,11 @@ module DataDuck
         columns << [property_name.to_s, property_type.to_s, commented_out]
       end
+      columns.sort! { |a, b| a[0] <=> b[0] }
       table_name = table_name.to_s.downcase
       table_name_camelcased = table_name.split('_').collect(&:capitalize).join
-      namespace = Namespace.new(table_name: table_name_camelcased, columns: columns)
+      namespace = Namespace.new(table_name_camelcased: table_name_camelcased, table_name: table_name, columns: columns)
       template = File.open("#{ DataDuck.gem_root }/lib/templates/quickstart/table.rb.erb", 'r').read
       result = ERB.new(template).result(namespace.get_binding)
       DataDuck::Commands.quickstart_save_file("#{ DataDuck.project_root }/src/tables/#{ table_name }.rb", result)

data/lib/dataduck/destination.rb CHANGED

@@ -8,16 +8,8 @@ module DataDuck
       DataDuck.config['destinations'][name.to_s]
     end
-    def load_tables!(tables)
-      raise Exception.new("Must implement load_tables! in subclass")
-    end
-    def before_all_loads!
-    end
-    def after_all_loads!
-      # e.g. cleanup
+    def load_table!(table)
+      raise Exception.new("Must implement load_table! in subclass")
     end
     def self.destination(destination_name)

data/lib/dataduck/etl.rb CHANGED

@@ -31,18 +31,13 @@ module DataDuck
     def process!
       puts "Processing ETL..."
-      table_instances = []
       @tables.each do |table_class|
-        table_instance = table_class.new
-        table_instances << table_instance
-        table_instance.extract!
-        table_instance.transform!
-      end
-      self.class.destinations.each do |destination|
-        destination.before_all_loads!(table_instances)
-        destination.load_tables!(table_instances)
-        destination.after_all_loads!(table_instances)
+        table_to_etl = table_class.new
+        table_to_etl.extract!
+        table_to_etl.transform!
+        self.class.destinations.each do |destination|
+          destination.load_table!(table_to_etl)
+        end
       end
     end
   end

data/lib/dataduck/redshift_destination.rb CHANGED

@@ -144,24 +144,14 @@ module DataDuck
       return s3_obj
     end
-    def before_all_loads!(tables)
-    end
-    def load_tables!(tables)
-      tables.each do |table|
-        puts "Loading table #{ table.name }..."
-        s3_object = self.upload_table_to_s3!(table)
-        self.create_staging_table!(table)
-        self.create_output_table_on_data_warehouse!(table)
-        self.run_query(self.copy_query(table, s3_object.s3_path))
-        self.merge_from_staging!(table)
-        self.drop_staging_table!(table)
-      end
-    end
-    def after_all_loads!(tables)
+    def load_table!(table)
+      puts "Loading table #{ table.name }..."
+      s3_object = self.upload_table_to_s3!(table)
+      self.create_staging_table!(table)
+      self.create_output_table_on_data_warehouse!(table)
+      self.run_query(self.copy_query(table, s3_object.s3_path))
+      self.merge_from_staging!(table)
+      self.drop_staging_table!(table)
     end
     def self.value_to_string(value)

data/lib/dataduck/source.rb CHANGED

@@ -22,6 +22,9 @@ module DataDuck
       if source_type == "postgresql"
         DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
         return DataDuck.sources[name]
+      elsif source_type == "mysql"
+        DataDuck.sources[name] = DataDuck::MysqlSource.new(configuration)
+        return DataDuck.sources[name]
       else
         raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
       end

data/lib/dataduck/table.rb CHANGED

@@ -4,10 +4,10 @@ module DataDuck
       attr_accessor :sources
       attr_accessor :output_schema
       attr_accessor :actions
-      attr_accessor :errors
     end
     attr_accessor :data
+    attr_accessor :errors
     def self.transforms(transformation_name)
       self.actions ||= []
@@ -21,10 +21,20 @@ module DataDuck
     end
     singleton_class.send(:alias_method, :validate, :validates)
-    def self.source(source_name, source_data = [])
-      self.sources ||= {}
-      source = DataDuck::Source.source(source_name)
-      self.sources[source] = source_data
+    def self.source(source_name, source_table_or_query = nil, source_columns = nil)
+      self.sources ||= []
+      source_spec = {}
+      if source_table_or_query.respond_to?(:to_s) && source_table_or_query.to_s.downcase.include?('select ')
+        source_spec = {query: source_table_or_query}
+      elsif source_columns.nil? && source_table_or_query.respond_to?(:each)
+        source_spec = {columns: source_table_or_query, table_name: DataDuck::Util.camelcase_to_underscore(self.name)}
+      else
+        source_spec = {columns: source_columns, table_name: source_table_or_query.to_s}
+      end
+      source_spec[:source] = DataDuck::Source.source(source_name)
+      self.sources << source_spec
     end
     def self.output(schema)
@@ -49,19 +59,29 @@ module DataDuck
       self.errors ||= []
       self.data = []
-      self.class.sources.each_pair do |source, source_columns|
-        import_query = "SELECT \"#{ source_columns.sort.join('","') }\" FROM #{ self.name }"
-        results = source.query(import_query)
+      self.class.sources.each do |source_spec|
+        source = source_spec[:source]
+        my_query = self.extract_query(source_spec)
+        results = source.query(my_query)
         self.data = results
       end
       self.data
     end
+    def extract_query(source_spec)
+      if source_spec.has_key?(:query)
+        query
+      else
+        "SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
+      end
+    end
     def transform!
       puts "Transforming table #{ self.name }..."
       self.errors ||= []
-      self.actions.each do |action|
+      self.class.actions ||= []
+      self.class.actions.each do |action|
         action_type = action[0]
         action_method_name = action[1]
         if action_type == :transform

data/lib/dataduck/version.rb CHANGED

@@ -1,6 +1,6 @@
 module DataDuck
   VERSION_MAJOR = 0
-  VERSION_MINOR = 3
+  VERSION_MINOR = 4
   VERSION_PATCH = 0
   VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
 end

data/lib/templates/quickstart/table.rb.erb CHANGED

@@ -1,5 +1,5 @@
-class <%= table_name %> < DataDuck::Table
-  source :my_database, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
+class <%= table_name_camelcased %> < DataDuck::Table
+  source :my_database, :<%= table_name %>, ["<%= columns.map { |col| col[0] }.join('", "') %>"]
   output({<% columns.each do |col| %>
       <%= '# ' if col[2] %>:<%= col[0] %> => :<%= col[1] %>,<% end %>

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataduck
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Jeff Pickhardt
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-10-11 00:00:00.000000000 Z
+date: 2015-10-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -134,6 +134,7 @@ files:
 - ".gitignore"
 - ".rspec"
 - ".ruby-version"
+- DEV_README.md
 - Gemfile
 - README.md
 - Rakefile