dataduck 0.6.5 → 0.6.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8134b77413b5c8b0ab92de410257aa000ef25605
4
- data.tar.gz: c50fea221ac3ebe10c5d2e7e6d648410dc4aee98
3
+ metadata.gz: 8bd5effb261990d6eccb37fe16de93195c89ef24
4
+ data.tar.gz: b2c7d31fac424890e6b605d2828423b624625467
5
5
  SHA512:
6
- metadata.gz: 3df107d634e0ab1950ac6e2ac91658058325e09eb3a38c50097242157f6d98cdbb7d60e123ef426ee33399aec892e517a20f952ef9ce48443bbe095ec4c9322b
7
- data.tar.gz: bf512224a1d5065bb3175c12be43884ddedb10d4d2b38e7818cc67fa819352cdf28b78b01d1ef83574f87c0f1f0dfd2ed76d58aeb7ba437f5dad5f793530795b
6
+ metadata.gz: 61b17cc19b83ad9733b9744de6a65f884af5f3b145ddbc967ce01eff2fa8b552b103c9c69016c59f87c729cd9bc10866ed5ee720f903236c90e1d2b4ec2712c5
7
+ data.tar.gz: 0217b069e2d3997698c4135a170c65c22d16187cb02d35adf68d4e62c8798073c2c95a9d5f9689329762cc8c6d25a0b5682f542f7b802918a6776f792e302db3
data/lib/dataduck/etl.rb CHANGED
@@ -25,9 +25,9 @@ module DataDuck
25
25
  table_name_underscores = file.split("/").last.gsub(".rb", "")
26
26
  table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
27
27
  require file
28
- table = Object.const_get(table_name_camelized)
29
- if table <= DataDuck::Table
30
- @tables << table
28
+ table_class = Object.const_get(table_name_camelized)
29
+ if table_class <= DataDuck::Table && table_class.new.include_with_all?
30
+ @tables << table_class
31
31
  end
32
32
  end
33
33
  end
@@ -175,10 +175,26 @@ module DataDuck
175
175
  end
176
176
 
177
177
  # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
178
+ self.delete_before_inserting!(table)
179
+ self.insert_from_staging!(table)
180
+ end
181
+
182
+ def delete_before_inserting!(table)
178
183
  staging_name = table.staging_name
179
184
  building_name = table.building_name
180
- delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ building_name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
185
+
186
+ where_equals_parts = []
187
+ table.identify_by_columns.each do |attribute|
188
+ where_equals_parts << "#{ building_name }.#{ attribute } = #{ staging_name }.#{ attribute }"
189
+ end
190
+
191
+ delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ where_equals_parts.join(' AND ') }"
181
192
  self.query(delete_query)
193
+ end
194
+
195
+ def insert_from_staging!(table)
196
+ staging_name = table.staging_name
197
+ building_name = table.building_name
182
198
  insert_query = "INSERT INTO #{ building_name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
183
199
  self.query(insert_query)
184
200
  end
@@ -194,6 +194,12 @@ module DataDuck
194
194
  nil
195
195
  end
196
196
 
197
+ def identify_by_columns
198
+ return ["id"] if self.output_column_names.include?("id")
199
+
200
+ []
201
+ end
202
+
197
203
  def should_fully_reload?
198
204
  false # Set to true if you want to fully reload a table with each ETL
199
205
  end
@@ -2,7 +2,7 @@ module DataDuck
2
2
  if !defined?(DataDuck::VERSION)
3
3
  VERSION_MAJOR = 0
4
4
  VERSION_MINOR = 6
5
- VERSION_PATCH = 5
5
+ VERSION_PATCH = 6
6
6
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
7
7
  end
8
8
  end
@@ -7,50 +7,18 @@ require 'date'
7
7
  module DataDuck
8
8
  module Optimizely
9
9
  class Experiments < DataDuck::Optimizely::OptimizelyTable
10
-
11
10
  transforms :percentage_included_to_float
12
- transforms :parse_datetimes
13
-
14
- def extract!(destination, options = {})
15
- self.data = []
11
+ transforms :rename_description_to_name
16
12
 
17
- projects_response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
18
- if projects_response.response_code != 200
19
- raise Exception.new("Optimizely API for projects returned error #{ response.response_code} #{ response.body }")
20
- end
21
- projects = Oj.load(projects_response.body)
22
-
23
- projects.each do |project|
24
- self.extract_for_project!(project["id"])
25
- end
13
+ def initialize(experiments)
14
+ self.data = experiments
26
15
  end
27
16
 
28
- def extract_for_project!(project_id)
29
- now = DateTime.now
30
-
31
- response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects/#{ project_id }/experiments", headers: {'Token' => self.optimizely_api_token})
32
-
33
- if response.response_code != 200
34
- raise Exception.new("Optimizely API for experiments returned error #{ response.response_code} #{ response.body }")
35
- end
36
-
37
- experiments = Oj.load(response.body)
38
- experiments.each do |experiment|
39
- experiment[:dataduck_extracted_at] = now
40
- experiment[:project_id] = project_id
41
- end
42
-
43
- self.data.concat(experiments)
17
+ def extract!(*args)
18
+ # already initialized data
44
19
  end
45
20
 
46
- def parse_datetimes(row)
47
- row["created"] = DateTime.parse(row["created"])
48
- row["last_modified"] = DateTime.parse(row["last_modified"])
49
-
50
- row
51
- end
52
-
53
- def rename_description_to_name
21
+ def rename_description_to_name(row)
54
22
  row[:name] = row['description']
55
23
 
56
24
  row
@@ -62,6 +30,10 @@ module DataDuck
62
30
  row
63
31
  end
64
32
 
33
+ def should_fully_reload?
34
+ true
35
+ end
36
+
65
37
  def indexes
66
38
  ["id", "project_id", "primary_goal_id", "name"]
67
39
  end
@@ -76,6 +48,7 @@ module DataDuck
76
48
  :primary_goal_id => :integer,
77
49
  :details => :bigtext,
78
50
  :status => :string,
51
+ :audience_ids => :bigtext,
79
52
  :url_conditions => :bigtext,
80
53
  :last_modified => :datetime,
81
54
  :is_multivariate => :boolean,
@@ -84,6 +57,7 @@ module DataDuck
84
57
  :percentage_included => :float,
85
58
  :experiment_type => :string,
86
59
  :edit_url => :string,
60
+ :auto_allocated => :boolean,
87
61
  :dataduck_extracted_at => :datetime,
88
62
  })
89
63
  end
@@ -1,15 +1,73 @@
1
+ require 'typhoeus'
2
+ require 'oj'
3
+ require 'date'
4
+
5
+ require_relative './experiments'
6
+ require_relative './projects'
7
+ require_relative './variations'
8
+
1
9
  module DataDuck
2
10
  module Optimizely
3
11
  class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyTable
4
12
  def etl!(destinations, options = {})
13
+ now = DateTime.now
14
+
5
15
  projects = fetch_data("projects")
6
- # TODO alternate way to load Optimizely data
16
+
17
+ experiments = []
18
+ projects.each do |project|
19
+ project["created"] = DateTime.parse(project["created"])
20
+ project["last_modified"] = DateTime.parse(project["last_modified"])
21
+
22
+ project_experiments = fetch_data("projects/#{ project['id'] }/experiments")
23
+ project_experiments.each do |proj_exp|
24
+ proj_exp['project_id'] = project['id']
25
+ proj_exp["created"] = DateTime.parse(proj_exp["created"])
26
+ proj_exp["last_modified"] = DateTime.parse(proj_exp["last_modified"])
27
+ end
28
+ experiments.concat(project_experiments)
29
+ end
30
+
31
+ variations = []
32
+ # Experiments started after January 21, 2015 have statistics computed by Optimizely Stats Engine.
33
+ # Older experiments should use the old results endpoint.
34
+ date_for_stats_engine = DateTime.parse('Jan 22, 2015')
35
+ date_too_old_for_api = DateTime.parse('Jan 1, 2013')
36
+ broken_experiments = []
37
+ experiments.each do |experiment|
38
+ if experiment["created"] < date_too_old_for_api
39
+ next # seems like there's a problem with the API and old experiments
40
+ end
41
+
42
+ endpoint = experiment["created"] >= date_for_stats_engine ? "experiments/#{ experiment["id"] }/stats" : "experiments/#{ experiment["id"] }/results"
43
+ experiment_variations = []
44
+ begin
45
+ experiment_variations = fetch_data(endpoint)
46
+ rescue Exception => err
47
+ broken_experiments << experiment
48
+ end
49
+ experiment_variations.each do |exp_var|
50
+ exp_var["begin_time"] = DateTime.parse(exp_var["begin_time"]) if exp_var["begin_time"]
51
+ exp_var["end_time"] = DateTime.parse(exp_var["end_time"]) if exp_var["end_time"]
52
+ exp_var["experiment_id"] = experiment["id"]
53
+ end
54
+ variations.concat(experiment_variations)
55
+ end
56
+
57
+ projects_etl_table = DataDuck::Optimizely::Projects.new(projects)
58
+ projects_etl_table.etl!(destinations, options)
59
+
60
+ experiments_etl_table = DataDuck::Optimizely::Experiments.new(experiments)
61
+ experiments_etl_table.etl!(destinations, options)
62
+
63
+ variations_etl_table = DataDuck::Optimizely::Variations.new(variations)
64
+ variations_etl_table.etl!(destinations, options)
7
65
  end
8
66
 
9
67
  def fetch_data(api_endpoint)
10
68
  now = DateTime.now
11
69
 
12
- response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => self.optimizely_api_token})
70
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => optimizely_api_token})
13
71
  if response.response_code != 200
14
72
  raise Exception.new("Optimizely API for #{ api_endpoint } returned error #{ response.response_code} #{ response.body }")
15
73
  end
@@ -5,6 +5,10 @@ module DataDuck
5
5
  ENV['optimizely_api_token']
6
6
  end
7
7
 
8
+ def prefix
9
+ "optimizely_"
10
+ end
11
+
8
12
  def should_fully_reload?
9
13
  true
10
14
  end
@@ -7,27 +7,20 @@ require 'date'
7
7
  module DataDuck
8
8
  module Optimizely
9
9
  class Projects < DataDuck::Optimizely::OptimizelyTable
10
- transforms :parse_datetimes
11
-
12
- def extract!(destination, options = {})
13
- self.data = []
14
-
15
- now = DateTime.now
16
- response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
10
+ def initialize(data)
11
+ self.data = data
12
+ end
17
13
 
18
- self.data = Oj.load(response.body)
19
- self.data.each do |project|
20
- project[:dataduck_extracted_at] = now
21
- end
14
+ def extract!(*args)
15
+ # already initialized data
22
16
  end
23
17
 
24
18
  def indexes
25
19
  ["id", "account_id", "project_name"]
26
20
  end
27
21
 
28
- def parse_datetimes
29
- project["created"] = DateTime.parse(project["created"])
30
- project["last_modified"] = DateTime.parse(project["last_modified"])
22
+ def should_fully_reload?
23
+ true
31
24
  end
32
25
 
33
26
  output({
@@ -5,7 +5,54 @@ require_relative 'optimizely_table'
5
5
  module DataDuck
6
6
  module Optimizely
7
7
  class Variations < DataDuck::Optimizely::OptimizelyTable
8
- # this table should contain experiment variations and either /results or /stats for the result data
8
+ transforms :fix_fields
9
+
10
+ def initialize(data)
11
+ self.data = data
12
+ end
13
+
14
+ def extract!(*args)
15
+ # already initialized data
16
+ end
17
+
18
+ def fix_fields(row)
19
+ row[:id] = row['variation_id'].to_i
20
+ row[:name] = row['variation_name']
21
+ row['baseline_id'] = row['baseline_id'].to_i
22
+ row['improvement'] = row['improvement'].to_f
23
+ row['confidence'] = row['confidence'].to_f
24
+ row['conversion_rate'] = row['conversion_rate'].to_f
25
+ row['difference'] = row['difference'].to_f
26
+
27
+ row
28
+ end
29
+
30
+ def indexes
31
+ ["id", "goal_id", "experiment_id", "name"]
32
+ end
33
+
34
+ def should_fully_reload?
35
+ true
36
+ end
37
+
38
+ output({
39
+ :id => :bigint,
40
+ :name => :string,
41
+ :experiment_id => :bigint,
42
+ :baseline_id => :bigint,
43
+ :goal_name => :string,
44
+ :goal_id => :bigint,
45
+ :visitors => :integer,
46
+ :conversions => :integer,
47
+ :begin_time => :datetime,
48
+ :end_time => :datetime,
49
+ :improvement => :float,
50
+ :confidence => :float,
51
+ :conversion_rate => :float,
52
+ :difference => :float,
53
+ :status => :string,
54
+ :dataduck_extracted_at => :datetime,
55
+ })
9
56
  end
10
57
  end
11
58
  end
@@ -1,10 +1,12 @@
1
+ require 'date'
1
2
  require 'typhoeus'
3
+ require 'uri'
2
4
 
3
5
  module DataDuck
4
6
  module SEMRush
5
7
  class OrganicResults < DataDuck::IntegrationTable
6
8
  def display_limit
7
- 25
9
+ 20
8
10
  end
9
11
 
10
12
  def key
@@ -24,23 +26,48 @@ module DataDuck
24
26
  end
25
27
 
26
28
  def extract!(destination, options = {})
27
- dates = options[:dates]
28
- if dates.nil? || dates.length == 0
29
- raise Exception("Must pass at least one date.")
30
- end
31
-
32
29
  self.data = []
33
30
 
34
31
  self.phrases.each do |phrase|
35
- self.dates.each do |date|
36
- self.extract_results_for_keyword_and_date!(phrase, date)
32
+ self.extract_results_for_keyword_and_date!(phrase)
33
+ end
34
+ end
35
+
36
+ def extract_results_for_keyword_and_date!(phrase)
37
+ date = Date.today
38
+ phrase.strip!
39
+ escaped_phrase = URI.escape(phrase)
40
+ semrush_api_url = "http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ escaped_phrase }&database=#{ self.search_database }"
41
+
42
+ response = Typhoeus.get(semrush_api_url)
43
+ if response.response_code != 200
44
+ raise Exception.new("SEMrush API returned error #{ response.response_code} #{ response.body }")
45
+ end
46
+
47
+ rank = -1
48
+ response.body.each_line do |line|
49
+ rank += 1
50
+ if rank == 0
51
+ # This is the header line
52
+ next
37
53
  end
54
+
55
+ domain, url = line.split(';')
56
+ domain.strip!
57
+ url.strip!
58
+
59
+ self.data << {
60
+ date: date,
61
+ phrase: phrase,
62
+ rank: rank,
63
+ domain: domain,
64
+ url: url
65
+ }
38
66
  end
39
67
  end
40
68
 
41
- def extract_results_for_keyword_and_date!(phrase, date)
42
- response = Typhoeus.get("http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ phrase }&database=#{ self.search_database }")
43
- # TODO
69
+ def identify_by_columns
70
+ ["date", "phrase"]
44
71
  end
45
72
 
46
73
  def indexes
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.5
4
+ version: 0.6.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-04 00:00:00.000000000 Z
11
+ date: 2015-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler