dataduck 0.6.5 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8134b77413b5c8b0ab92de410257aa000ef25605
4
- data.tar.gz: c50fea221ac3ebe10c5d2e7e6d648410dc4aee98
3
+ metadata.gz: 8bd5effb261990d6eccb37fe16de93195c89ef24
4
+ data.tar.gz: b2c7d31fac424890e6b605d2828423b624625467
5
5
  SHA512:
6
- metadata.gz: 3df107d634e0ab1950ac6e2ac91658058325e09eb3a38c50097242157f6d98cdbb7d60e123ef426ee33399aec892e517a20f952ef9ce48443bbe095ec4c9322b
7
- data.tar.gz: bf512224a1d5065bb3175c12be43884ddedb10d4d2b38e7818cc67fa819352cdf28b78b01d1ef83574f87c0f1f0dfd2ed76d58aeb7ba437f5dad5f793530795b
6
+ metadata.gz: 61b17cc19b83ad9733b9744de6a65f884af5f3b145ddbc967ce01eff2fa8b552b103c9c69016c59f87c729cd9bc10866ed5ee720f903236c90e1d2b4ec2712c5
7
+ data.tar.gz: 0217b069e2d3997698c4135a170c65c22d16187cb02d35adf68d4e62c8798073c2c95a9d5f9689329762cc8c6d25a0b5682f542f7b802918a6776f792e302db3
data/lib/dataduck/etl.rb CHANGED
@@ -25,9 +25,9 @@ module DataDuck
25
25
  table_name_underscores = file.split("/").last.gsub(".rb", "")
26
26
  table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
27
27
  require file
28
- table = Object.const_get(table_name_camelized)
29
- if table <= DataDuck::Table
30
- @tables << table
28
+ table_class = Object.const_get(table_name_camelized)
29
+ if table_class <= DataDuck::Table && table_class.new.include_with_all?
30
+ @tables << table_class
31
31
  end
32
32
  end
33
33
  end
@@ -175,10 +175,26 @@ module DataDuck
175
175
  end
176
176
 
177
177
  # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
178
+ self.delete_before_inserting!(table)
179
+ self.insert_from_staging!(table)
180
+ end
181
+
182
+ def delete_before_inserting!(table)
178
183
  staging_name = table.staging_name
179
184
  building_name = table.building_name
180
- delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ building_name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
185
+
186
+ where_equals_parts = []
187
+ table.identify_by_columns.each do |attribute|
188
+ where_equals_parts << "#{ building_name }.#{ attribute } = #{ staging_name }.#{ attribute }"
189
+ end
190
+
191
+ delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ where_equals_parts.join(' AND ') }"
181
192
  self.query(delete_query)
193
+ end
194
+
195
+ def insert_from_staging!(table)
196
+ staging_name = table.staging_name
197
+ building_name = table.building_name
182
198
  insert_query = "INSERT INTO #{ building_name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
183
199
  self.query(insert_query)
184
200
  end
@@ -194,6 +194,12 @@ module DataDuck
194
194
  nil
195
195
  end
196
196
 
197
+ def identify_by_columns
198
+ return ["id"] if self.output_column_names.include?("id")
199
+
200
+ []
201
+ end
202
+
197
203
  def should_fully_reload?
198
204
  false # Set to true if you want to fully reload a table with each ETL
199
205
  end
@@ -2,7 +2,7 @@ module DataDuck
2
2
  if !defined?(DataDuck::VERSION)
3
3
  VERSION_MAJOR = 0
4
4
  VERSION_MINOR = 6
5
- VERSION_PATCH = 5
5
+ VERSION_PATCH = 6
6
6
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
7
7
  end
8
8
  end
@@ -7,50 +7,18 @@ require 'date'
7
7
  module DataDuck
8
8
  module Optimizely
9
9
  class Experiments < DataDuck::Optimizely::OptimizelyTable
10
-
11
10
  transforms :percentage_included_to_float
12
- transforms :parse_datetimes
13
-
14
- def extract!(destination, options = {})
15
- self.data = []
11
+ transforms :rename_description_to_name
16
12
 
17
- projects_response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
18
- if projects_response.response_code != 200
19
- raise Exception.new("Optimizely API for projects returned error #{ response.response_code} #{ response.body }")
20
- end
21
- projects = Oj.load(projects_response.body)
22
-
23
- projects.each do |project|
24
- self.extract_for_project!(project["id"])
25
- end
13
+ def initialize(experiments)
14
+ self.data = experiments
26
15
  end
27
16
 
28
- def extract_for_project!(project_id)
29
- now = DateTime.now
30
-
31
- response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects/#{ project_id }/experiments", headers: {'Token' => self.optimizely_api_token})
32
-
33
- if response.response_code != 200
34
- raise Exception.new("Optimizely API for experiments returned error #{ response.response_code} #{ response.body }")
35
- end
36
-
37
- experiments = Oj.load(response.body)
38
- experiments.each do |experiment|
39
- experiment[:dataduck_extracted_at] = now
40
- experiment[:project_id] = project_id
41
- end
42
-
43
- self.data.concat(experiments)
17
+ def extract!(*args)
18
+ # already initialized data
44
19
  end
45
20
 
46
- def parse_datetimes(row)
47
- row["created"] = DateTime.parse(row["created"])
48
- row["last_modified"] = DateTime.parse(row["last_modified"])
49
-
50
- row
51
- end
52
-
53
- def rename_description_to_name
21
+ def rename_description_to_name(row)
54
22
  row[:name] = row['description']
55
23
 
56
24
  row
@@ -62,6 +30,10 @@ module DataDuck
62
30
  row
63
31
  end
64
32
 
33
+ def should_fully_reload?
34
+ true
35
+ end
36
+
65
37
  def indexes
66
38
  ["id", "project_id", "primary_goal_id", "name"]
67
39
  end
@@ -76,6 +48,7 @@ module DataDuck
76
48
  :primary_goal_id => :integer,
77
49
  :details => :bigtext,
78
50
  :status => :string,
51
+ :audience_ids => :bigtext,
79
52
  :url_conditions => :bigtext,
80
53
  :last_modified => :datetime,
81
54
  :is_multivariate => :boolean,
@@ -84,6 +57,7 @@ module DataDuck
84
57
  :percentage_included => :float,
85
58
  :experiment_type => :string,
86
59
  :edit_url => :string,
60
+ :auto_allocated => :boolean,
87
61
  :dataduck_extracted_at => :datetime,
88
62
  })
89
63
  end
@@ -1,15 +1,73 @@
1
+ require 'typhoeus'
2
+ require 'oj'
3
+ require 'date'
4
+
5
+ require_relative './experiments'
6
+ require_relative './projects'
7
+ require_relative './variations'
8
+
1
9
  module DataDuck
2
10
  module Optimizely
3
11
  class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyTable
4
12
  def etl!(destinations, options = {})
13
+ now = DateTime.now
14
+
5
15
  projects = fetch_data("projects")
6
- # TODO alternate way to load Optimizely data
16
+
17
+ experiments = []
18
+ projects.each do |project|
19
+ project["created"] = DateTime.parse(project["created"])
20
+ project["last_modified"] = DateTime.parse(project["last_modified"])
21
+
22
+ project_experiments = fetch_data("projects/#{ project['id'] }/experiments")
23
+ project_experiments.each do |proj_exp|
24
+ proj_exp['project_id'] = project['id']
25
+ proj_exp["created"] = DateTime.parse(proj_exp["created"])
26
+ proj_exp["last_modified"] = DateTime.parse(proj_exp["last_modified"])
27
+ end
28
+ experiments.concat(project_experiments)
29
+ end
30
+
31
+ variations = []
32
+ # Experiments started after January 21, 2015 have statistics computed by Optimizely Stats Engine.
33
+ # Older experiments should use the old results endpoint.
34
+ date_for_stats_engine = DateTime.parse('Jan 22, 2015')
35
+ date_too_old_for_api = DateTime.parse('Jan 1, 2013')
36
+ broken_experiments = []
37
+ experiments.each do |experiment|
38
+ if experiment["created"] < date_too_old_for_api
39
+ next # seems like there's a problem with the API and old experiments
40
+ end
41
+
42
+ endpoint = experiment["created"] >= date_for_stats_engine ? "experiments/#{ experiment["id"] }/stats" : "experiments/#{ experiment["id"] }/results"
43
+ experiment_variations = []
44
+ begin
45
+ experiment_variations = fetch_data(endpoint)
46
+ rescue Exception => err
47
+ broken_experiments << experiment
48
+ end
49
+ experiment_variations.each do |exp_var|
50
+ exp_var["begin_time"] = DateTime.parse(exp_var["begin_time"]) if exp_var["begin_time"]
51
+ exp_var["end_time"] = DateTime.parse(exp_var["end_time"]) if exp_var["end_time"]
52
+ exp_var["experiment_id"] = experiment["id"]
53
+ end
54
+ variations.concat(experiment_variations)
55
+ end
56
+
57
+ projects_etl_table = DataDuck::Optimizely::Projects.new(projects)
58
+ projects_etl_table.etl!(destinations, options)
59
+
60
+ experiments_etl_table = DataDuck::Optimizely::Experiments.new(experiments)
61
+ experiments_etl_table.etl!(destinations, options)
62
+
63
+ variations_etl_table = DataDuck::Optimizely::Variations.new(variations)
64
+ variations_etl_table.etl!(destinations, options)
7
65
  end
8
66
 
9
67
  def fetch_data(api_endpoint)
10
68
  now = DateTime.now
11
69
 
12
- response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => self.optimizely_api_token})
70
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => optimizely_api_token})
13
71
  if response.response_code != 200
14
72
  raise Exception.new("Optimizely API for #{ api_endpoint } returned error #{ response.response_code} #{ response.body }")
15
73
  end
@@ -5,6 +5,10 @@ module DataDuck
5
5
  ENV['optimizely_api_token']
6
6
  end
7
7
 
8
+ def prefix
9
+ "optimizely_"
10
+ end
11
+
8
12
  def should_fully_reload?
9
13
  true
10
14
  end
@@ -7,27 +7,20 @@ require 'date'
7
7
  module DataDuck
8
8
  module Optimizely
9
9
  class Projects < DataDuck::Optimizely::OptimizelyTable
10
- transforms :parse_datetimes
11
-
12
- def extract!(destination, options = {})
13
- self.data = []
14
-
15
- now = DateTime.now
16
- response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
10
+ def initialize(data)
11
+ self.data = data
12
+ end
17
13
 
18
- self.data = Oj.load(response.body)
19
- self.data.each do |project|
20
- project[:dataduck_extracted_at] = now
21
- end
14
+ def extract!(*args)
15
+ # already initialized data
22
16
  end
23
17
 
24
18
  def indexes
25
19
  ["id", "account_id", "project_name"]
26
20
  end
27
21
 
28
- def parse_datetimes
29
- project["created"] = DateTime.parse(project["created"])
30
- project["last_modified"] = DateTime.parse(project["last_modified"])
22
+ def should_fully_reload?
23
+ true
31
24
  end
32
25
 
33
26
  output({
@@ -5,7 +5,54 @@ require_relative 'optimizely_table'
5
5
  module DataDuck
6
6
  module Optimizely
7
7
  class Variations < DataDuck::Optimizely::OptimizelyTable
8
- # this table should contain experiment variations and either /results or /stats for the result data
8
+ transforms :fix_fields
9
+
10
+ def initialize(data)
11
+ self.data = data
12
+ end
13
+
14
+ def extract!(*args)
15
+ # already initialized data
16
+ end
17
+
18
+ def fix_fields(row)
19
+ row[:id] = row['variation_id'].to_i
20
+ row[:name] = row['variation_name']
21
+ row['baseline_id'] = row['baseline_id'].to_i
22
+ row['improvement'] = row['improvement'].to_f
23
+ row['confidence'] = row['confidence'].to_f
24
+ row['conversion_rate'] = row['conversion_rate'].to_f
25
+ row['difference'] = row['difference'].to_f
26
+
27
+ row
28
+ end
29
+
30
+ def indexes
31
+ ["id", "goal_id", "experiment_id", "name"]
32
+ end
33
+
34
+ def should_fully_reload?
35
+ true
36
+ end
37
+
38
+ output({
39
+ :id => :bigint,
40
+ :name => :string,
41
+ :experiment_id => :bigint,
42
+ :baseline_id => :bigint,
43
+ :goal_name => :string,
44
+ :goal_id => :bigint,
45
+ :visitors => :integer,
46
+ :conversions => :integer,
47
+ :begin_time => :datetime,
48
+ :end_time => :datetime,
49
+ :improvement => :float,
50
+ :confidence => :float,
51
+ :conversion_rate => :float,
52
+ :difference => :float,
53
+ :status => :string,
54
+ :dataduck_extracted_at => :datetime,
55
+ })
9
56
  end
10
57
  end
11
58
  end
@@ -1,10 +1,12 @@
1
+ require 'date'
1
2
  require 'typhoeus'
3
+ require 'uri'
2
4
 
3
5
  module DataDuck
4
6
  module SEMRush
5
7
  class OrganicResults < DataDuck::IntegrationTable
6
8
  def display_limit
7
- 25
9
+ 20
8
10
  end
9
11
 
10
12
  def key
@@ -24,23 +26,48 @@ module DataDuck
24
26
  end
25
27
 
26
28
  def extract!(destination, options = {})
27
- dates = options[:dates]
28
- if dates.nil? || dates.length == 0
29
- raise Exception("Must pass at least one date.")
30
- end
31
-
32
29
  self.data = []
33
30
 
34
31
  self.phrases.each do |phrase|
35
- self.dates.each do |date|
36
- self.extract_results_for_keyword_and_date!(phrase, date)
32
+ self.extract_results_for_keyword_and_date!(phrase)
33
+ end
34
+ end
35
+
36
+ def extract_results_for_keyword_and_date!(phrase)
37
+ date = Date.today
38
+ phrase.strip!
39
+ escaped_phrase = URI.escape(phrase)
40
+ semrush_api_url = "http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ escaped_phrase }&database=#{ self.search_database }"
41
+
42
+ response = Typhoeus.get(semrush_api_url)
43
+ if response.response_code != 200
44
+ raise Exception.new("SEMrush API returned error #{ response.response_code} #{ response.body }")
45
+ end
46
+
47
+ rank = -1
48
+ response.body.each_line do |line|
49
+ rank += 1
50
+ if rank == 0
51
+ # This is the header line
52
+ next
37
53
  end
54
+
55
+ domain, url = line.split(';')
56
+ domain.strip!
57
+ url.strip!
58
+
59
+ self.data << {
60
+ date: date,
61
+ phrase: phrase,
62
+ rank: rank,
63
+ domain: domain,
64
+ url: url
65
+ }
38
66
  end
39
67
  end
40
68
 
41
- def extract_results_for_keyword_and_date!(phrase, date)
42
- response = Typhoeus.get("http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ phrase }&database=#{ self.search_database }")
43
- # TODO
69
+ def identify_by_columns
70
+ ["date", "phrase"]
44
71
  end
45
72
 
46
73
  def indexes
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.5
4
+ version: 0.6.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-04 00:00:00.000000000 Z
11
+ date: 2015-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler