RubyGems - dataduck - Versions diffs - 0.6.5 → 0.6.6 - Mend

dataduck 0.6.5 → 0.6.6

Files changed (12) hide show

checksums.yaml +4 -4
data/lib/dataduck/etl.rb +3 -3
data/lib/dataduck/redshift_destination.rb +17 -1
data/lib/dataduck/table.rb +6 -0
data/lib/dataduck/version.rb +1 -1
data/lib/integrations/optimizely/experiments.rb +12 -38
data/lib/integrations/optimizely/optimizely_integration.rb +60 -2
data/lib/integrations/optimizely/optimizely_table.rb +4 -0
data/lib/integrations/optimizely/projects.rb +7 -14
data/lib/integrations/optimizely/variations.rb +48 -1
data/lib/integrations/semrush/organic_results.rb +38 -11
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8134b77413b5c8b0ab92de410257aa000ef25605
-  data.tar.gz: c50fea221ac3ebe10c5d2e7e6d648410dc4aee98
+  metadata.gz: 8bd5effb261990d6eccb37fe16de93195c89ef24
+  data.tar.gz: b2c7d31fac424890e6b605d2828423b624625467
 SHA512:
-  metadata.gz: 3df107d634e0ab1950ac6e2ac91658058325e09eb3a38c50097242157f6d98cdbb7d60e123ef426ee33399aec892e517a20f952ef9ce48443bbe095ec4c9322b
-  data.tar.gz: bf512224a1d5065bb3175c12be43884ddedb10d4d2b38e7818cc67fa819352cdf28b78b01d1ef83574f87c0f1f0dfd2ed76d58aeb7ba437f5dad5f793530795b
+  metadata.gz: 61b17cc19b83ad9733b9744de6a65f884af5f3b145ddbc967ce01eff2fa8b552b103c9c69016c59f87c729cd9bc10866ed5ee720f903236c90e1d2b4ec2712c5
+  data.tar.gz: 0217b069e2d3997698c4135a170c65c22d16187cb02d35adf68d4e62c8798073c2c95a9d5f9689329762cc8c6d25a0b5682f542f7b802918a6776f792e302db3

data/lib/dataduck/etl.rb CHANGED Viewed

@@ -25,9 +25,9 @@ module DataDuck
           table_name_underscores = file.split("/").last.gsub(".rb", "")
           table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
           require file
-          table = Object.const_get(table_name_camelized)
-          if table <= DataDuck::Table
-            @tables << table
+          table_class = Object.const_get(table_name_camelized)
+          if table_class <= DataDuck::Table && table_class.new.include_with_all?
+            @tables << table_class
           end
         end
       end

data/lib/dataduck/redshift_destination.rb CHANGED Viewed

@@ -175,10 +175,26 @@ module DataDuck
       end
       # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
+      self.delete_before_inserting!(table)
+      self.insert_from_staging!(table)
+    end
+    def delete_before_inserting!(table)
       staging_name = table.staging_name
       building_name = table.building_name
-      delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ building_name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
+      where_equals_parts = []
+      table.identify_by_columns.each do |attribute|
+        where_equals_parts << "#{ building_name }.#{ attribute } = #{ staging_name }.#{ attribute }"
+      end
+      delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ where_equals_parts.join(' AND ') }"
       self.query(delete_query)
+    end
+    def insert_from_staging!(table)
+      staging_name = table.staging_name
+      building_name = table.building_name
       insert_query = "INSERT INTO #{ building_name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
       self.query(insert_query)
     end

data/lib/dataduck/table.rb CHANGED Viewed

@@ -194,6 +194,12 @@ module DataDuck
       nil
     end
+    def identify_by_columns
+      return ["id"] if self.output_column_names.include?("id")
+      []
+    end
     def should_fully_reload?
       false # Set to true if you want to fully reload a table with each ETL
     end

data/lib/dataduck/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module DataDuck
   if !defined?(DataDuck::VERSION)
     VERSION_MAJOR = 0
     VERSION_MINOR = 6
-    VERSION_PATCH = 5
+    VERSION_PATCH = 6
     VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
   end
 end

data/lib/integrations/optimizely/experiments.rb CHANGED Viewed

@@ -7,50 +7,18 @@ require 'date'
 module DataDuck
   module Optimizely
     class Experiments < DataDuck::Optimizely::OptimizelyTable
       transforms :percentage_included_to_float
-      transforms :parse_datetimes
-      def extract!(destination, options = {})
-        self.data = []
+      transforms :rename_description_to_name
-        projects_response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
-        if projects_response.response_code != 200
-          raise Exception.new("Optimizely API for projects returned error #{ response.response_code} #{ response.body }")
-        end
-        projects = Oj.load(projects_response.body)
-        projects.each do |project|
-          self.extract_for_project!(project["id"])
-        end
+      def initialize(experiments)
+        self.data = experiments
       end
-      def extract_for_project!(project_id)
-        now = DateTime.now
-        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects/#{ project_id }/experiments", headers: {'Token' => self.optimizely_api_token})
-        if response.response_code != 200
-          raise Exception.new("Optimizely API for experiments returned error #{ response.response_code} #{ response.body }")
-        end
-        experiments = Oj.load(response.body)
-        experiments.each do |experiment|
-          experiment[:dataduck_extracted_at] = now
-          experiment[:project_id] = project_id
-        end
-        self.data.concat(experiments)
+      def extract!(*args)
+        # already initialized data
       end
-      def parse_datetimes(row)
-        row["created"] = DateTime.parse(row["created"])
-        row["last_modified"] = DateTime.parse(row["last_modified"])
-        row
-      end
-      def rename_description_to_name
+      def rename_description_to_name(row)
         row[:name] = row['description']
         row
@@ -62,6 +30,10 @@ module DataDuck
         row
       end
+      def should_fully_reload?
+        true
+      end
       def indexes
         ["id", "project_id", "primary_goal_id", "name"]
       end
@@ -76,6 +48,7 @@ module DataDuck
           :primary_goal_id => :integer,
           :details => :bigtext,
           :status => :string,
+          :audience_ids => :bigtext,
           :url_conditions => :bigtext,
           :last_modified => :datetime,
           :is_multivariate => :boolean,
@@ -84,6 +57,7 @@ module DataDuck
           :percentage_included => :float,
           :experiment_type => :string,
           :edit_url => :string,
+          :auto_allocated => :boolean,
           :dataduck_extracted_at => :datetime,
       })
     end

data/lib/integrations/optimizely/optimizely_integration.rb CHANGED Viewed

@@ -1,15 +1,73 @@
+require 'typhoeus'
+require 'oj'
+require 'date'
+require_relative './experiments'
+require_relative './projects'
+require_relative './variations'
 module DataDuck
   module Optimizely
     class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyTable
       def etl!(destinations, options = {})
+        now = DateTime.now
         projects = fetch_data("projects")
-        # TODO alternate way to load Optimizely data
+        experiments = []
+        projects.each do |project|
+          project["created"] = DateTime.parse(project["created"])
+          project["last_modified"] = DateTime.parse(project["last_modified"])
+          project_experiments = fetch_data("projects/#{ project['id'] }/experiments")
+          project_experiments.each do |proj_exp|
+            proj_exp['project_id'] = project['id']
+            proj_exp["created"] = DateTime.parse(proj_exp["created"])
+            proj_exp["last_modified"] = DateTime.parse(proj_exp["last_modified"])
+          end
+          experiments.concat(project_experiments)
+        end
+        variations = []
+        # Experiments started after January 21, 2015 have statistics computed by Optimizely Stats Engine.
+        # Older experiments should use the old results endpoint.
+        date_for_stats_engine = DateTime.parse('Jan 22, 2015')
+        date_too_old_for_api = DateTime.parse('Jan 1, 2013')
+        broken_experiments = []
+        experiments.each do |experiment|
+          if experiment["created"] < date_too_old_for_api
+            next # seems like there's a problem with the API and old experiments
+          end
+          endpoint = experiment["created"] >= date_for_stats_engine ? "experiments/#{ experiment["id"] }/stats" : "experiments/#{ experiment["id"] }/results"
+          experiment_variations = []
+          begin
+            experiment_variations = fetch_data(endpoint)
+          rescue Exception => err
+            broken_experiments << experiment
+          end
+          experiment_variations.each do |exp_var|
+            exp_var["begin_time"] = DateTime.parse(exp_var["begin_time"]) if exp_var["begin_time"]
+            exp_var["end_time"] = DateTime.parse(exp_var["end_time"]) if exp_var["end_time"]
+            exp_var["experiment_id"] = experiment["id"]
+          end
+          variations.concat(experiment_variations)
+        end
+        projects_etl_table = DataDuck::Optimizely::Projects.new(projects)
+        projects_etl_table.etl!(destinations, options)
+        experiments_etl_table = DataDuck::Optimizely::Experiments.new(experiments)
+        experiments_etl_table.etl!(destinations, options)
+        variations_etl_table = DataDuck::Optimizely::Variations.new(variations)
+        variations_etl_table.etl!(destinations, options)
       end
       def fetch_data(api_endpoint)
         now = DateTime.now
-        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => self.optimizely_api_token})
+        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => optimizely_api_token})
         if response.response_code != 200
           raise Exception.new("Optimizely API for #{ api_endpoint } returned error #{ response.response_code} #{ response.body }")
         end

data/lib/integrations/optimizely/optimizely_table.rb CHANGED Viewed

@@ -5,6 +5,10 @@ module DataDuck
         ENV['optimizely_api_token']
       end
+      def prefix
+        "optimizely_"
+      end
       def should_fully_reload?
         true
       end

data/lib/integrations/optimizely/projects.rb CHANGED Viewed

@@ -7,27 +7,20 @@ require 'date'
 module DataDuck
   module Optimizely
     class Projects < DataDuck::Optimizely::OptimizelyTable
-      transforms :parse_datetimes
-      def extract!(destination, options = {})
-        self.data = []
-        now = DateTime.now
-        response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
+      def initialize(data)
+        self.data = data
+      end
-        self.data = Oj.load(response.body)
-        self.data.each do |project|
-          project[:dataduck_extracted_at] = now
-        end
+      def extract!(*args)
+        # already initialized data
       end
       def indexes
         ["id", "account_id", "project_name"]
       end
-      def parse_datetimes
-        project["created"] = DateTime.parse(project["created"])
-        project["last_modified"] = DateTime.parse(project["last_modified"])
+      def should_fully_reload?
+        true
       end
       output({

data/lib/integrations/optimizely/variations.rb CHANGED Viewed

@@ -5,7 +5,54 @@ require_relative 'optimizely_table'
 module DataDuck
   module Optimizely
     class Variations < DataDuck::Optimizely::OptimizelyTable
-      # this table should contain experiment variations and either /results or /stats for the result data
+      transforms :fix_fields
+      def initialize(data)
+        self.data = data
+      end
+      def extract!(*args)
+        # already initialized data
+      end
+      def fix_fields(row)
+        row[:id] = row['variation_id'].to_i
+        row[:name] = row['variation_name']
+        row['baseline_id'] = row['baseline_id'].to_i
+        row['improvement'] = row['improvement'].to_f
+        row['confidence'] = row['confidence'].to_f
+        row['conversion_rate'] = row['conversion_rate'].to_f
+        row['difference'] = row['difference'].to_f
+        row
+      end
+      def indexes
+        ["id", "goal_id", "experiment_id", "name"]
+      end
+      def should_fully_reload?
+        true
+      end
+      output({
+          :id => :bigint,
+          :name => :string,
+          :experiment_id => :bigint,
+          :baseline_id => :bigint,
+          :goal_name => :string,
+          :goal_id => :bigint,
+          :visitors => :integer,
+          :conversions => :integer,
+          :begin_time => :datetime,
+          :end_time => :datetime,
+          :improvement => :float,
+          :confidence => :float,
+          :conversion_rate => :float,
+          :difference => :float,
+          :status => :string,
+          :dataduck_extracted_at => :datetime,
+      })
     end
   end
 end

data/lib/integrations/semrush/organic_results.rb CHANGED Viewed

@@ -1,10 +1,12 @@
+require 'date'
 require 'typhoeus'
+require 'uri'
 module DataDuck
   module SEMRush
     class OrganicResults < DataDuck::IntegrationTable
       def display_limit
-        25
+        20
       end
       def key
@@ -24,23 +26,48 @@ module DataDuck
       end
       def extract!(destination, options = {})
-        dates = options[:dates]
-        if dates.nil? || dates.length == 0
-          raise Exception("Must pass at least one date.")
-        end
         self.data = []
         self.phrases.each do |phrase|
-          self.dates.each do |date|
-            self.extract_results_for_keyword_and_date!(phrase, date)
+          self.extract_results_for_keyword_and_date!(phrase)
+        end
+      end
+      def extract_results_for_keyword_and_date!(phrase)
+        date = Date.today
+        phrase.strip!
+        escaped_phrase = URI.escape(phrase)
+        semrush_api_url = "http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ escaped_phrase }&database=#{ self.search_database }"
+        response = Typhoeus.get(semrush_api_url)
+        if response.response_code != 200
+          raise Exception.new("SEMrush API returned error #{ response.response_code} #{ response.body }")
+        end
+        rank = -1
+        response.body.each_line do |line|
+          rank += 1
+          if rank == 0
+            # This is the header line
+            next
           end
+          domain, url = line.split(';')
+          domain.strip!
+          url.strip!
+          self.data << {
+              date: date,
+              phrase: phrase,
+              rank: rank,
+              domain: domain,
+              url: url
+          }
         end
       end
-      def extract_results_for_keyword_and_date!(phrase, date)
-        response = Typhoeus.get("http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ phrase }&database=#{ self.search_database }")
-        # TODO
+      def identify_by_columns
+        ["date", "phrase"]
       end
       def indexes

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataduck
 version: !ruby/object:Gem::Version
-  version: 0.6.5
+  version: 0.6.6
 platform: ruby
 authors:
 - Jeff Pickhardt
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-04 00:00:00.000000000 Z
+date: 2015-11-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler