RubyGems - dataduck - Versions diffs - 0.6.5 → 0.6.6 - Mend

dataduck 0.6.5 → 0.6.6

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their public registry.

Files changed (12)

checksums.yaml +4 -4
data/lib/dataduck/etl.rb +3 -3
data/lib/dataduck/redshift_destination.rb +17 -1
data/lib/dataduck/table.rb +6 -0
data/lib/dataduck/version.rb +1 -1
data/lib/integrations/optimizely/experiments.rb +12 -38
data/lib/integrations/optimizely/optimizely_integration.rb +60 -2
data/lib/integrations/optimizely/optimizely_table.rb +4 -0
data/lib/integrations/optimizely/projects.rb +7 -14
data/lib/integrations/optimizely/variations.rb +48 -1
data/lib/integrations/semrush/organic_results.rb +38 -11
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8134b77413b5c8b0ab92de410257aa000ef25605
-  data.tar.gz: c50fea221ac3ebe10c5d2e7e6d648410dc4aee98
+  metadata.gz: 8bd5effb261990d6eccb37fe16de93195c89ef24
+  data.tar.gz: b2c7d31fac424890e6b605d2828423b624625467
 SHA512:
-  metadata.gz: 3df107d634e0ab1950ac6e2ac91658058325e09eb3a38c50097242157f6d98cdbb7d60e123ef426ee33399aec892e517a20f952ef9ce48443bbe095ec4c9322b
-  data.tar.gz: bf512224a1d5065bb3175c12be43884ddedb10d4d2b38e7818cc67fa819352cdf28b78b01d1ef83574f87c0f1f0dfd2ed76d58aeb7ba437f5dad5f793530795b
+  metadata.gz: 61b17cc19b83ad9733b9744de6a65f884af5f3b145ddbc967ce01eff2fa8b552b103c9c69016c59f87c729cd9bc10866ed5ee720f903236c90e1d2b4ec2712c5
+  data.tar.gz: 0217b069e2d3997698c4135a170c65c22d16187cb02d35adf68d4e62c8798073c2c95a9d5f9689329762cc8c6d25a0b5682f542f7b802918a6776f792e302db3

data/lib/dataduck/etl.rb CHANGED Viewed

@@ -25,9 +25,9 @@ module DataDuck
           table_name_underscores = file.split("/").last.gsub(".rb", "")
           table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
           require file
-          table = Object.const_get(table_name_camelized)
-          if table <= DataDuck::Table
-            @tables << table
+          table_class = Object.const_get(table_name_camelized)
+          if table_class <= DataDuck::Table && table_class.new.include_with_all?
+            @tables << table_class
           end
         end
       end

data/lib/dataduck/redshift_destination.rb CHANGED Viewed

@@ -175,10 +175,26 @@ module DataDuck
       end
       # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
+      self.delete_before_inserting!(table)
+      self.insert_from_staging!(table)
+    end
+    def delete_before_inserting!(table)
       staging_name = table.staging_name
       building_name = table.building_name
-      delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ building_name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
+      where_equals_parts = []
+      table.identify_by_columns.each do |attribute|
+        where_equals_parts << "#{ building_name }.#{ attribute } = #{ staging_name }.#{ attribute }"
+      end
+      delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ where_equals_parts.join(' AND ') }"
       self.query(delete_query)
+    end
+    def insert_from_staging!(table)
+      staging_name = table.staging_name
+      building_name = table.building_name
       insert_query = "INSERT INTO #{ building_name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
       self.query(insert_query)
     end

data/lib/dataduck/table.rb CHANGED Viewed

@@ -194,6 +194,12 @@ module DataDuck
       nil
     end
+    def identify_by_columns
+      return ["id"] if self.output_column_names.include?("id")
+      []
+    end
     def should_fully_reload?
       false # Set to true if you want to fully reload a table with each ETL
     end

data/lib/dataduck/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module DataDuck
   if !defined?(DataDuck::VERSION)
     VERSION_MAJOR = 0
     VERSION_MINOR = 6
-    VERSION_PATCH = 5
+    VERSION_PATCH = 6
     VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
   end
 end

data/lib/integrations/optimizely/experiments.rb CHANGED Viewed

@@ -7,50 +7,18 @@ require 'date'
 module DataDuck
   module Optimizely
     class Experiments < DataDuck::Optimizely::OptimizelyTable
       transforms :percentage_included_to_float
-      transforms :parse_datetimes
-      def extract!(destination, options = {})
-        self.data = []
+      transforms :rename_description_to_name
-        projects_response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
-        if projects_response.response_code != 200
-          raise Exception.new("Optimizely API for projects returned error #{ response.response_code} #{ response.body }")
-        end
-        projects = Oj.load(projects_response.body)
-        projects.each do |project|
-          self.extract_for_project!(project["id"])
-        end
+      def initialize(experiments)
+        self.data = experiments
       end
-      def extract_for_project!(project_id)
-        now = DateTime.now
-        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects/#{ project_id }/experiments", headers: {'Token' => self.optimizely_api_token})
-        if response.response_code != 200
-          raise Exception.new("Optimizely API for experiments returned error #{ response.response_code} #{ response.body }")
-        end
-        experiments = Oj.load(response.body)
-        experiments.each do |experiment|
-          experiment[:dataduck_extracted_at] = now
-          experiment[:project_id] = project_id
-        end
-        self.data.concat(experiments)
+      def extract!(*args)
+        # already initialized data
       end
-      def parse_datetimes(row)
-        row["created"] = DateTime.parse(row["created"])
-        row["last_modified"] = DateTime.parse(row["last_modified"])
-        row
-      end
-      def rename_description_to_name
+      def rename_description_to_name(row)
         row[:name] = row['description']
         row
@@ -62,6 +30,10 @@ module DataDuck
         row
       end
+      def should_fully_reload?
+        true
+      end
       def indexes
         ["id", "project_id", "primary_goal_id", "name"]
       end
@@ -76,6 +48,7 @@ module DataDuck
           :primary_goal_id => :integer,
           :details => :bigtext,
           :status => :string,
+          :audience_ids => :bigtext,
           :url_conditions => :bigtext,
           :last_modified => :datetime,
           :is_multivariate => :boolean,
@@ -84,6 +57,7 @@ module DataDuck
           :percentage_included => :float,
           :experiment_type => :string,
           :edit_url => :string,
+          :auto_allocated => :boolean,
           :dataduck_extracted_at => :datetime,
       })
     end

data/lib/integrations/optimizely/optimizely_integration.rb CHANGED Viewed

@@ -1,15 +1,73 @@
+require 'typhoeus'
+require 'oj'
+require 'date'
+require_relative './experiments'
+require_relative './projects'
+require_relative './variations'
 module DataDuck
   module Optimizely
     class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyTable
       def etl!(destinations, options = {})
+        now = DateTime.now
         projects = fetch_data("projects")
-        # TODO alternate way to load Optimizely data
+        experiments = []
+        projects.each do |project|
+          project["created"] = DateTime.parse(project["created"])
+          project["last_modified"] = DateTime.parse(project["last_modified"])
+          project_experiments = fetch_data("projects/#{ project['id'] }/experiments")
+          project_experiments.each do |proj_exp|
+            proj_exp['project_id'] = project['id']
+            proj_exp["created"] = DateTime.parse(proj_exp["created"])
+            proj_exp["last_modified"] = DateTime.parse(proj_exp["last_modified"])
+          end
+          experiments.concat(project_experiments)
+        end
+        variations = []
+        # Experiments started after January 21, 2015 have statistics computed by Optimizely Stats Engine.
+        # Older experiments should use the old results endpoint.
+        date_for_stats_engine = DateTime.parse('Jan 22, 2015')
+        date_too_old_for_api = DateTime.parse('Jan 1, 2013')
+        broken_experiments = []
+        experiments.each do |experiment|
+          if experiment["created"] < date_too_old_for_api
+            next # seems like there's a problem with the API and old experiments
+          end
+          endpoint = experiment["created"] >= date_for_stats_engine ? "experiments/#{ experiment["id"] }/stats" : "experiments/#{ experiment["id"] }/results"
+          experiment_variations = []
+          begin
+            experiment_variations = fetch_data(endpoint)
+          rescue Exception => err
+            broken_experiments << experiment
+          end
+          experiment_variations.each do |exp_var|
+            exp_var["begin_time"] = DateTime.parse(exp_var["begin_time"]) if exp_var["begin_time"]
+            exp_var["end_time"] = DateTime.parse(exp_var["end_time"]) if exp_var["end_time"]
+            exp_var["experiment_id"] = experiment["id"]
+          end
+          variations.concat(experiment_variations)
+        end
+        projects_etl_table = DataDuck::Optimizely::Projects.new(projects)
+        projects_etl_table.etl!(destinations, options)
+        experiments_etl_table = DataDuck::Optimizely::Experiments.new(experiments)
+        experiments_etl_table.etl!(destinations, options)
+        variations_etl_table = DataDuck::Optimizely::Variations.new(variations)
+        variations_etl_table.etl!(destinations, options)
       end
       def fetch_data(api_endpoint)
         now = DateTime.now
-        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => self.optimizely_api_token})
+        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => optimizely_api_token})
         if response.response_code != 200
           raise Exception.new("Optimizely API for #{ api_endpoint } returned error #{ response.response_code} #{ response.body }")
         end

data/lib/integrations/optimizely/optimizely_table.rb CHANGED Viewed

@@ -5,6 +5,10 @@ module DataDuck
         ENV['optimizely_api_token']
       end
+      def prefix
+        "optimizely_"
+      end
       def should_fully_reload?
         true
       end

data/lib/integrations/optimizely/projects.rb CHANGED Viewed

@@ -7,27 +7,20 @@ require 'date'
 module DataDuck
   module Optimizely
     class Projects < DataDuck::Optimizely::OptimizelyTable
-      transforms :parse_datetimes
-      def extract!(destination, options = {})
-        self.data = []
-        now = DateTime.now
-        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
+      def initialize(data)
+        self.data = data
+      end
-        self.data = Oj.load(response.body)
-        self.data.each do |project|
-          project[:dataduck_extracted_at] = now
-        end
+      def extract!(*args)
+        # already initialized data
       end
       def indexes
         ["id", "account_id", "project_name"]
       end
-      def parse_datetimes
-        project["created"] = DateTime.parse(project["created"])
-        project["last_modified"] = DateTime.parse(project["last_modified"])
+      def should_fully_reload?
+        true
       end
       output({

data/lib/integrations/optimizely/variations.rb CHANGED Viewed

@@ -5,7 +5,54 @@ require_relative 'optimizely_table'
 module DataDuck
   module Optimizely
     class Variations < DataDuck::Optimizely::OptimizelyTable
-      # this table should contain experiment variations and either /results or /stats for the result data
+      transforms :fix_fields
+      def initialize(data)
+        self.data = data
+      end
+      def extract!(*args)
+        # already initialized data
+      end
+      def fix_fields(row)
+        row[:id] = row['variation_id'].to_i
+        row[:name] = row['variation_name']
+        row['baseline_id'] = row['baseline_id'].to_i
+        row['improvement'] = row['improvement'].to_f
+        row['confidence'] = row['confidence'].to_f
+        row['conversion_rate'] = row['conversion_rate'].to_f
+        row['difference'] = row['difference'].to_f
+        row
+      end
+      def indexes
+        ["id", "goal_id", "experiment_id", "name"]
+      end
+      def should_fully_reload?
+        true
+      end
+      output({
+          :id => :bigint,
+          :name => :string,
+          :experiment_id => :bigint,
+          :baseline_id => :bigint,
+          :goal_name => :string,
+          :goal_id => :bigint,
+          :visitors => :integer,
+          :conversions => :integer,
+          :begin_time => :datetime,
+          :end_time => :datetime,
+          :improvement => :float,
+          :confidence => :float,
+          :conversion_rate => :float,
+          :difference => :float,
+          :status => :string,
+          :dataduck_extracted_at => :datetime,
+      })
     end
   end
 end

data/lib/integrations/semrush/organic_results.rb CHANGED Viewed

@@ -1,10 +1,12 @@
+require 'date'
 require 'typhoeus'
+require 'uri'
 module DataDuck
   module SEMRush
     class OrganicResults < DataDuck::IntegrationTable
       def display_limit
-        25
+        20
       end
       def key
@@ -24,23 +26,48 @@ module DataDuck
       end
       def extract!(destination, options = {})
-        dates = options[:dates]
-        if dates.nil? || dates.length == 0
-          raise Exception("Must pass at least one date.")
-        end
         self.data = []
         self.phrases.each do |phrase|
-          self.dates.each do |date|
-            self.extract_results_for_keyword_and_date!(phrase, date)
+          self.extract_results_for_keyword_and_date!(phrase)
+        end
+      end
+      def extract_results_for_keyword_and_date!(phrase)
+        date = Date.today
+        phrase.strip!
+        escaped_phrase = URI.escape(phrase)
+        semrush_api_url = "http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ escaped_phrase }&database=#{ self.search_database }"
+        response = Typhoeus.get(semrush_api_url)
+        if response.response_code != 200
+          raise Exception.new("SEMrush API returned error #{ response.response_code} #{ response.body }")
+        end
+        rank = -1
+        response.body.each_line do |line|
+          rank += 1
+          if rank == 0
+            # This is the header line
+            next
           end
+          domain, url = line.split(';')
+          domain.strip!
+          url.strip!
+          self.data << {
+              date: date,
+              phrase: phrase,
+              rank: rank,
+              domain: domain,
+              url: url
+          }
         end
       end
-      def extract_results_for_keyword_and_date!(phrase, date)
-        response = Typhoeus.get("http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ phrase }&database=#{ self.search_database }")
-        # TODO
+      def identify_by_columns
+        ["date", "phrase"]
       end
       def indexes

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataduck
 version: !ruby/object:Gem::Version
-  version: 0.6.5
+  version: 0.6.6
 platform: ruby
 authors:
 - Jeff Pickhardt
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-04 00:00:00.000000000 Z
+date: 2015-11-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler