dataduck 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fb5bcf70fd0c35ad944220251f360767852eeb80
4
- data.tar.gz: e73dd71f9a0761c56dee3637754e26ae962d210b
3
+ metadata.gz: d72a9c14a46ac5a3377bd79b8952b61919d6a4e7
4
+ data.tar.gz: b527269b93a5b57057553330859ccda872150e36
5
5
  SHA512:
6
- metadata.gz: a607da3c47de0279fa521321555bc4b1218f5375a6ec0aaa2244472e0b42322dca01708a57c813be64d9b34f78ef4588982c38e76c8e7c8ff6c83efb774e0d93
7
- data.tar.gz: f7742ec8eff3e4c8bc36fa5015b42ede4fb852be759cb9be5bae25091a13c74160ae93eb91ba2391e1ba0099a53340f17e0839050ece403b90021d363eec3334
6
+ metadata.gz: c8bade98533f439afd0465f22e12ae98b56bb7a29343a76e714ddc79908b4feb19df928f3fb0ff470b5ce608800c2b43e7380f1a63ce52f8acd8b4697f182484
7
+ data.tar.gz: cde4b3cc18a2140330fc3ba8577efc0c4caecff7de5d2b36223b637421a0e7f1cdd7b25d9f47d6b53292efe015376a1fa6918659ddae2cccc8d646b9e9648421
data/dataduck.gemspec CHANGED
@@ -27,5 +27,6 @@ Gem::Specification.new do |spec|
27
27
  spec.add_runtime_dependency "mysql2", '~> 0.4'
28
28
  spec.add_runtime_dependency "aws-sdk", "~> 2.0"
29
29
  spec.add_runtime_dependency "typhoeus", "~> 0.8"
30
+ spec.add_runtime_dependency "oj", "~> 2.12"
30
31
  spec.add_runtime_dependency "sequel-redshift"
31
32
  end
data/lib/dataduck.rb CHANGED
@@ -11,6 +11,14 @@ Dir[File.dirname(__FILE__) + '/dataduck/*.rb'].each do |file|
11
11
  require file
12
12
  end
13
13
 
14
# Load the integration base classes first, then every per-integration table.
# Each glob is sorted because Dir[] only returns sorted results on Ruby >= 3.0;
# with an unsorted listing a table file could be required before the file that
# defines its superclass.
['/integrations/*.rb', '/integrations/*/*.rb'].each do |pattern|
  Dir[File.dirname(__FILE__) + pattern].sort.each do |file|
    require file
  end
end
21
+
14
22
  module DataDuck
15
23
  extend ModuleVars
16
24
 
@@ -1,6 +1,8 @@
1
1
  require_relative 'destination'
2
2
 
3
3
  module DataDuck
4
+ class RedshiftLoadError < StandardError; end
5
+
4
6
  class RedshiftDestination < DataDuck::Destination
5
7
  attr_accessor :aws_key
6
8
  attr_accessor :aws_secret
@@ -45,7 +47,7 @@ module DataDuck
45
47
  query_fragments << "FROM '#{ s3_path }'"
46
48
  query_fragments << "CREDENTIALS 'aws_access_key_id=#{ self.aws_key };aws_secret_access_key=#{ self.aws_secret }'"
47
49
  query_fragments << "REGION '#{ self.s3_region }'"
48
- query_fragments << "CSV TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
50
+ query_fragments << "CSV IGNOREHEADER 1 TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
49
51
  query_fragments << "DATEFORMAT 'auto'"
50
52
  return query_fragments.join(" ")
51
53
  end
@@ -90,10 +92,17 @@ module DataDuck
90
92
  end
91
93
 
92
94
  def data_as_csv_string(data, property_names)
93
- data_string_components = [] # for performance reasons, join strings this way
95
+ data_string_components = [] # join strings this way for now, could be optimized later
96
+
97
+ data_string_components << property_names.join(',') # header column
98
+ data_string_components << "\n"
99
+
94
100
  data.each do |result|
95
101
  property_names.each_with_index do |property_name, index|
96
102
  value = result[property_name.to_sym]
103
+ if value.nil?
104
+ value = result[property_name.to_s]
105
+ end
97
106
 
98
107
  if index == 0
99
108
  data_string_components << '"'
@@ -176,7 +185,24 @@ module DataDuck
176
185
 
177
186
  def query(sql)
178
187
  Logs.debug("SQL executing on #{ self.name }:\n " + sql)
179
- self.connection[sql].map { |elem| elem }
188
+ begin
189
+ self.connection[sql].map { |elem| elem }
190
+ rescue Exception => err
191
+ if err.to_s.include?("Check 'stl_load_errors' system table for details")
192
+ self.raise_stl_load_error!
193
+ else
194
+ raise err
195
+ end
196
+ end
197
+ end
198
+
199
# Reads the most recent row of the stl_load_errors table and raises it as a
# RedshiftLoadError with a readable message.
#
# Always raises RedshiftLoadError. When the table has no row, a generic
# message is raised instead of failing with NoMethodError on nil.
def raise_stl_load_error!
  load_error_sql = "SELECT filename, line_number, colname, position, err_code, err_reason FROM stl_load_errors ORDER BY starttime DESC LIMIT 1"
  load_error_details = self.connection[load_error_sql].map { |elem| elem }.first

  if load_error_details.nil?
    raise RedshiftLoadError.new("Error loading Redshift, but no details were found in stl_load_errors.")
  end

  # to_s guards against NULL columns before stripping surrounding whitespace.
  err_reason = load_error_details[:err_reason].to_s.strip
  filename = load_error_details[:filename].to_s.strip
  colname = load_error_details[:colname].to_s.strip

  raise RedshiftLoadError.new("Error loading Redshift, '#{ err_reason }' " +
    "(code #{ load_error_details[:err_code] }) with file #{ filename } " +
    "for column '#{ colname }'. The error occurred at line #{ load_error_details[:line_number] }, position #{ load_error_details[:position] }.")
end
181
207
 
182
208
  def table_names
@@ -237,10 +263,16 @@ module DataDuck
237
263
 
238
264
  def self.value_to_string(value)
239
265
  string_value = ''
240
- if value.respond_to? :to_s
266
+
267
+ if value.respond_to?(:strftime)
268
+ from_value = value.respond_to?(:utc) ? value.utc : value
269
+ string_value = from_value.strftime('%Y-%m-%d %H:%M:%S')
270
+ elsif value.respond_to?(:to_s)
241
271
  string_value = value.to_s
242
272
  end
273
+
243
274
  string_value.gsub!('"', '""')
275
+
244
276
  return string_value
245
277
  end
246
278
  end
@@ -45,7 +45,14 @@ module DataDuck
45
45
  end
46
46
 
47
47
  def actions
48
- self.class.actions
48
+ my_actions = []
49
+ for_class = self.class
50
+ while for_class < Table
51
+ my_actions.concat(for_class.actions || [])
52
+ for_class = for_class.superclass
53
+ end
54
+
55
+ my_actions
49
56
  end
50
57
 
51
58
  def check_table_valid!
@@ -63,11 +70,17 @@ module DataDuck
63
70
  end
64
71
  end
65
72
 
66
- def etl!(destinations)
73
+ def etl!(destinations, options = {})
67
74
  if destinations.length != 1
68
75
  raise ArgumentError.new("DataDuck can only etl to one destination at a time for now.")
69
76
  end
77
+
78
+ if options[:dates].nil?
79
+ options[:dates] = [Date.today]
80
+ end
81
+
70
82
  self.check_table_valid!
83
+
71
84
  destination = destinations.first
72
85
 
73
86
  if self.should_fully_reload?
@@ -77,7 +90,7 @@ module DataDuck
77
90
  batch_number = 0
78
91
  while batch_number < 1_000
79
92
  batch_number += 1
80
- self.extract!(destination)
93
+ self.extract!(destination, options)
81
94
  self.transform!
82
95
  self.load!(destination)
83
96
 
@@ -100,7 +113,7 @@ module DataDuck
100
113
  end
101
114
  end
102
115
 
103
- def extract!(destination = nil)
116
+ def extract!(destination = nil, options = {})
104
117
  DataDuck::Logs.info "Extracting table #{ self.name }"
105
118
 
106
119
  self.errors ||= []
@@ -109,7 +122,7 @@ module DataDuck
109
122
  source = source_spec[:source]
110
123
  my_query = self.extract_query(source_spec, destination)
111
124
  results = source.query(my_query)
112
- self.data = results
125
+ self.data.concat(results)
113
126
  end
114
127
  self.data
115
128
  end
@@ -156,6 +169,10 @@ module DataDuck
156
169
  destination.load_table!(self)
157
170
  end
158
171
 
172
+ def include_with_all?
173
+ true
174
+ end
175
+
159
176
  def indexes
160
177
  which_columns = []
161
178
  which_columns << "id" if self.output_column_names.include?("id")
@@ -186,7 +203,7 @@ module DataDuck
186
203
  end
187
204
 
188
205
  def output_schema
189
- self.class.output_schema || {}
206
+ self.class.output_schema || self.class.superclass.output_schema || {}
190
207
  end
191
208
 
192
209
  def output_column_names
@@ -217,8 +234,7 @@ module DataDuck
217
234
  DataDuck::Logs.info "Transforming table #{ self.name }"
218
235
 
219
236
  self.errors ||= []
220
- self.class.actions ||= []
221
- self.class.actions.each do |action|
237
+ self.actions.each do |action|
222
238
  action_type = action[0]
223
239
  action_method_name = action[1]
224
240
  if action_type == :transform
@@ -233,7 +249,16 @@ module DataDuck
233
249
  end
234
250
 
235
251
  def name
236
- DataDuck::Util.camelcase_to_underscore(self.class.name)
252
+ fixed_name = DataDuck::Util.camelcase_to_underscore(self.class.name)
253
+ if fixed_name.start_with?("data_duck/")
254
+ fixed_name = fixed_name.split("/").last
255
+ end
256
+
257
+ self.prefix + fixed_name
258
+ end
259
+
260
+ def prefix
261
+ ""
237
262
  end
238
263
  end
239
264
  end
@@ -2,7 +2,7 @@ module DataDuck
2
2
  if !defined?(DataDuck::VERSION)
3
3
  VERSION_MAJOR = 0
4
4
  VERSION_MINOR = 6
5
- VERSION_PATCH = 2
5
+ VERSION_PATCH = 3
6
6
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
7
7
  end
8
8
  end
@@ -0,0 +1,5 @@
1
+ module DataDuck
2
+ class IntegrationTable < DataDuck::Table
3
+ # nothing for now, but there could be integration-specific stuff here
4
+ end
5
+ end
@@ -0,0 +1,91 @@
1
+ require_relative 'optimizely_table'
2
+
3
+ require 'typhoeus'
4
+ require 'oj'
5
+ require 'date'
6
+
7
+ module DataDuck
8
+ module Optimizely
9
+ class Experiments < DataDuck::Optimizely::OptimizelyTable
10
+
11
+ transforms :percentage_included_to_float
12
+ transforms :parse_datetimes
13
+
14
# Rebuilds +data+ from scratch: fetches the project list from the Optimizely
# API, then calls extract_for_project! for each project's id.
#
# +destination+ and +options+ are accepted but not used by this method.
# Raises Exception when the projects request does not return HTTP 200.
def extract!(destination, options = {})
  self.data = []

  projects_response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
  if projects_response.response_code != 200
    # Report the projects response itself; this method has no local named `response`.
    raise Exception.new("Optimizely API for projects returned error #{ projects_response.response_code} #{ projects_response.body }")
  end
  projects = Oj.load(projects_response.body)

  projects.each do |project|
    self.extract_for_project!(project["id"])
  end
end
27
+
28
+ def extract_for_project!(project_id)
29
+ now = DateTime.now
30
+
31
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects/#{ project_id }/experiments", headers: {'Token' => self.optimizely_api_token})
32
+
33
+ if response.response_code != 200
34
+ raise Exception.new("Optimizely API for experiments returned error #{ response.response_code} #{ response.body }")
35
+ end
36
+
37
+ experiments = Oj.load(response.body)
38
+ experiments.each do |experiment|
39
+ experiment[:dataduck_extracted_at] = now
40
+ experiment[:project_id] = project_id
41
+ end
42
+
43
+ self.data.concat(experiments)
44
+ end
45
+
46
+ def parse_datetimes(row)
47
+ row["created"] = DateTime.parse(row["created"])
48
+ row["last_modified"] = DateTime.parse(row["last_modified"])
49
+
50
+ row
51
+ end
52
+
53
# Transform: copies the row's 'description' value into the :name key and
# returns the same row. Takes the row as its argument, like the other
# transform methods in this class.
def rename_description_to_name(row)
  row[:name] = row['description']

  row
end
58
+
59
+ def percentage_included_to_float(row)
60
+ row['percentage_included'] = row['percentage_included'].to_i / 100.0
61
+
62
+ row
63
+ end
64
+
65
+ def indexes
66
+ ["id", "project_id", "primary_goal_id", "name"]
67
+ end
68
+
69
+ output({
70
+ :id => :bigint,
71
+ :project_id => :bigint, # integers have an overflow error because optimizely numbers get too big
72
+ :name => :string,
73
+ :shareable_results_link => :string,
74
+ :conditional_code => :bigtext,
75
+ :custom_js => :bigtext,
76
+ :primary_goal_id => :integer,
77
+ :details => :bigtext,
78
+ :status => :string,
79
+ :url_conditions => :bigtext,
80
+ :last_modified => :datetime,
81
+ :is_multivariate => :boolean,
82
+ :activation_mode => :string,
83
+ :created => :datetime,
84
+ :percentage_included => :float,
85
+ :experiment_type => :string,
86
+ :edit_url => :string,
87
+ :dataduck_extracted_at => :datetime,
88
+ })
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,26 @@
1
+ module DataDuck
2
+ module Optimizely
3
+ class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyTable
4
+ def etl!(destinations, options = {})
5
+ projects = fetch_data("projects")
6
+ # TODO alternate way to load Optimizely data
7
+ end
8
+
9
+ def fetch_data(api_endpoint)
10
+ now = DateTime.now
11
+
12
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => self.optimizely_api_token})
13
+ if response.response_code != 200
14
+ raise Exception.new("Optimizely API for #{ api_endpoint } returned error #{ response.response_code} #{ response.body }")
15
+ end
16
+
17
+ rows = Oj.load(response.body)
18
+ rows.each do |row|
19
+ row[:dataduck_extracted_at] = now
20
+ end
21
+
22
+ rows
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,13 @@
1
+ module DataDuck
2
+ module Optimizely
3
+ class OptimizelyTable < DataDuck::IntegrationTable
4
+ def optimizely_api_token
5
+ ENV['optimizely_api_token']
6
+ end
7
+
8
+ def should_fully_reload?
9
+ true
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,56 @@
1
+ require_relative 'optimizely_table'
2
+
3
+ require 'typhoeus'
4
+ require 'oj'
5
+ require 'date'
6
+
7
+ module DataDuck
8
+ module Optimizely
9
+ class Projects < DataDuck::Optimizely::OptimizelyTable
10
+ transforms :parse_datetimes
11
+
12
# Replaces +data+ with the project list returned by the Optimizely API and
# stamps every project with the extraction time under :dataduck_extracted_at.
#
# +destination+ and +options+ are accepted but not used by this method.
# Raises Exception when the request does not return HTTP 200, matching the
# other Optimizely extractors, so an error body is never parsed as data.
def extract!(destination, options = {})
  self.data = []

  now = DateTime.now
  response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
  if response.response_code != 200
    raise Exception.new("Optimizely API for projects returned error #{ response.response_code} #{ response.body }")
  end

  self.data = Oj.load(response.body)
  self.data.each do |project|
    project[:dataduck_extracted_at] = now
  end
end
23
+
24
+ def indexes
25
+ ["id", "account_id", "project_name"]
26
+ end
27
+
28
# Transform: parses the row's "created" and "last_modified" strings into
# DateTime objects in place and returns the row.
def parse_datetimes(row)
  row["created"] = DateTime.parse(row["created"])
  row["last_modified"] = DateTime.parse(row["last_modified"])

  row
end
32
+
33
+ output({
34
+ :id => :bigint,
35
+ :account_id => :bigint,
36
+ :code_revision => :integer,
37
+ :project_name => :string,
38
+ :project_status => :string,
39
+ :created => :datetime,
40
+ :last_modified => :datetime,
41
+ :library => :string,
42
+ :include_jquery => :bool,
43
+ :js_file_size => :integer,
44
+ :project_javascript => :bigtext,
45
+ :enable_force_variation => :boolean,
46
+ :exclude_disabled_experiments => :boolean,
47
+ :exclude_names => :boolean,
48
+ :ip_anonymization => :boolean,
49
+ :ip_filter => :string,
50
+ :socket_token => :string,
51
+ :dcp_service_id => :integer,
52
+ :dataduck_extracted_at => :datetime,
53
+ })
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,11 @@
1
+ require 'typhoeus'
2
+
3
+ require_relative 'optimizely_table'
4
+
5
+ module DataDuck
6
+ module Optimizely
7
+ class Variations < DataDuck::Optimizely::OptimizelyTable
8
+ # this table should contain experiment variations and either /results or /stats for the result data
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,59 @@
1
+ require 'typhoeus'
2
+
3
+ module DataDuck
4
+ module SEMRush
5
+ class OrganicResults < DataDuck::IntegrationTable
6
+ def display_limit
7
+ 25
8
+ end
9
+
10
+ def key
11
+ ENV['semrush_api_key']
12
+ end
13
+
14
# Hook for subclasses: override to return an Array of the search phrases to
# look up. Raises NotImplementedError until overridden.
def phrases
  raise NotImplementedError, "Must implement phrases method to be an array of the phrases you want."
end
17
+
18
+ def prefix
19
+ "semrush_"
20
+ end
21
+
22
+ def search_database
23
+ 'us'
24
+ end
25
+
26
# Resets +data+ and calls extract_results_for_keyword_and_date! once for every
# (phrase, date) combination, phrases in the outer loop.
#
# +destination+ is accepted but not used by this method.
# options[:dates] must be a non-empty collection of dates.
# Raises ArgumentError when no dates are given.
def extract!(destination, options = {})
  dates = options[:dates]
  if dates.nil? || dates.length == 0
    raise ArgumentError.new("Must pass at least one date.")
  end

  self.data = []

  self.phrases.each do |phrase|
    # Iterate the local taken from options; the table has no `dates` method.
    dates.each do |date|
      self.extract_results_for_keyword_and_date!(phrase, date)
    end
  end
end
40
+
41
# Requests the organic results for +phrase+ from the SEMrush API and returns
# the Typhoeus response. Parsing the response into +data+ is still TODO, and
# +date+ is not used yet.
#
# The query values are passed through Typhoeus' +params+ option so that
# phrases containing spaces, '&' or other reserved characters are URL-encoded
# instead of being interpolated raw into the query string.
#
# NOTE(review): the API key is sent over plain HTTP — confirm whether the
# endpoint can be called over HTTPS instead.
def extract_results_for_keyword_and_date!(phrase, date)
  response = Typhoeus.get("http://api.semrush.com/", params: {
    type: "phrase_organic",
    key: self.key,
    display_limit: self.display_limit,
    export_columns: "Dn,Ur",
    phrase: phrase,
    database: self.search_database,
  })
  # TODO
end
45
+
46
+ def indexes
47
+ ["date", "phrase", "domain"]
48
+ end
49
+
50
+ output({
51
+ :date => :date,
52
+ :phrase => :string,
53
+ :rank => :integer,
54
+ :domain => :string,
55
+ :url => :string,
56
+ })
57
+ end
58
+ end
59
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-02 00:00:00.000000000 Z
11
+ date: 2015-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,20 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0.8'
139
+ - !ruby/object:Gem::Dependency
140
+ name: oj
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '2.12'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '2.12'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: sequel-redshift
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -205,6 +219,13 @@ files:
205
219
  - lib/dataduck/util.rb
206
220
  - lib/dataduck/version.rb
207
221
  - lib/helpers/module_vars.rb
222
+ - lib/integrations/integration_table.rb
223
+ - lib/integrations/optimizely/experiments.rb
224
+ - lib/integrations/optimizely/optimizely_integration.rb
225
+ - lib/integrations/optimizely/optimizely_table.rb
226
+ - lib/integrations/optimizely/projects.rb
227
+ - lib/integrations/optimizely/variations.rb
228
+ - lib/integrations/semrush/organic_results.rb
208
229
  - lib/templates/quickstart/table.rb.erb
209
230
  - static/logo.png
210
231
  homepage: http://dataducketl.com/