dataduck 0.6.2 → 0.6.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fb5bcf70fd0c35ad944220251f360767852eeb80
4
- data.tar.gz: e73dd71f9a0761c56dee3637754e26ae962d210b
3
+ metadata.gz: d72a9c14a46ac5a3377bd79b8952b61919d6a4e7
4
+ data.tar.gz: b527269b93a5b57057553330859ccda872150e36
5
5
  SHA512:
6
- metadata.gz: a607da3c47de0279fa521321555bc4b1218f5375a6ec0aaa2244472e0b42322dca01708a57c813be64d9b34f78ef4588982c38e76c8e7c8ff6c83efb774e0d93
7
- data.tar.gz: f7742ec8eff3e4c8bc36fa5015b42ede4fb852be759cb9be5bae25091a13c74160ae93eb91ba2391e1ba0099a53340f17e0839050ece403b90021d363eec3334
6
+ metadata.gz: c8bade98533f439afd0465f22e12ae98b56bb7a29343a76e714ddc79908b4feb19df928f3fb0ff470b5ce608800c2b43e7380f1a63ce52f8acd8b4697f182484
7
+ data.tar.gz: cde4b3cc18a2140330fc3ba8577efc0c4caecff7de5d2b36223b637421a0e7f1cdd7b25d9f47d6b53292efe015376a1fa6918659ddae2cccc8d646b9e9648421
data/dataduck.gemspec CHANGED
@@ -27,5 +27,6 @@ Gem::Specification.new do |spec|
27
27
  spec.add_runtime_dependency "mysql2", '~> 0.4'
28
28
  spec.add_runtime_dependency "aws-sdk", "~> 2.0"
29
29
  spec.add_runtime_dependency "typhoeus", "~> 0.8"
30
+ spec.add_runtime_dependency "oj", "~> 2.12"
30
31
  spec.add_runtime_dependency "sequel-redshift"
31
32
  end
data/lib/dataduck.rb CHANGED
@@ -11,6 +11,14 @@ Dir[File.dirname(__FILE__) + '/dataduck/*.rb'].each do |file|
11
11
  require file
12
12
  end
13
13
 
14
+ Dir[File.dirname(__FILE__) + '/integrations/*.rb'].each do |file|
15
+ require file
16
+ end
17
+
18
+ Dir[File.dirname(__FILE__) + '/integrations/*/*.rb'].each do |file|
19
+ require file
20
+ end
21
+
14
22
  module DataDuck
15
23
  extend ModuleVars
16
24
 
@@ -1,6 +1,8 @@
1
1
  require_relative 'destination'
2
2
 
3
3
  module DataDuck
4
+ class RedshiftLoadError < StandardError; end
5
+
4
6
  class RedshiftDestination < DataDuck::Destination
5
7
  attr_accessor :aws_key
6
8
  attr_accessor :aws_secret
@@ -45,7 +47,7 @@ module DataDuck
45
47
  query_fragments << "FROM '#{ s3_path }'"
46
48
  query_fragments << "CREDENTIALS 'aws_access_key_id=#{ self.aws_key };aws_secret_access_key=#{ self.aws_secret }'"
47
49
  query_fragments << "REGION '#{ self.s3_region }'"
48
- query_fragments << "CSV TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
50
+ query_fragments << "CSV IGNOREHEADER 1 TRUNCATECOLUMNS ACCEPTINVCHARS EMPTYASNULL"
49
51
  query_fragments << "DATEFORMAT 'auto'"
50
52
  return query_fragments.join(" ")
51
53
  end
@@ -90,10 +92,17 @@ module DataDuck
90
92
  end
91
93
 
92
94
  def data_as_csv_string(data, property_names)
93
- data_string_components = [] # for performance reasons, join strings this way
95
+ data_string_components = [] # join strings this way for now, could be optimized later
96
+
97
+ data_string_components << property_names.join(',') # header column
98
+ data_string_components << "\n"
99
+
94
100
  data.each do |result|
95
101
  property_names.each_with_index do |property_name, index|
96
102
  value = result[property_name.to_sym]
103
+ if value.nil?
104
+ value = result[property_name.to_s]
105
+ end
97
106
 
98
107
  if index == 0
99
108
  data_string_components << '"'
@@ -176,7 +185,24 @@ module DataDuck
176
185
 
177
186
  def query(sql)
178
187
  Logs.debug("SQL executing on #{ self.name }:\n " + sql)
179
- self.connection[sql].map { |elem| elem }
188
+ begin
189
+ self.connection[sql].map { |elem| elem }
190
+ rescue Exception => err
191
+ if err.to_s.include?("Check 'stl_load_errors' system table for details")
192
+ self.raise_stl_load_error!
193
+ else
194
+ raise err
195
+ end
196
+ end
197
+ end
198
+
199
+ def raise_stl_load_error!
200
+ load_error_sql = "SELECT filename, line_number, colname, position, err_code, err_reason FROM stl_load_errors ORDER BY starttime DESC LIMIT 1"
201
+ load_error_details = self.connection[load_error_sql].map { |elem| elem }.first
202
+
203
+ raise RedshiftLoadError.new("Error loading Redshift, '#{ load_error_details[:err_reason].strip }' " +
204
+ "(code #{ load_error_details[:err_code] }) with file #{ load_error_details[:filename].strip } " +
205
+ "for column '#{ load_error_details[:colname].strip }'. The error occurred at line #{ load_error_details[:line_number] }, position #{ load_error_details[:position] }.")
180
206
  end
181
207
 
182
208
  def table_names
@@ -237,10 +263,16 @@ module DataDuck
237
263
 
238
264
  def self.value_to_string(value)
239
265
  string_value = ''
240
- if value.respond_to? :to_s
266
+
267
+ if value.respond_to?(:strftime)
268
+ from_value = value.respond_to?(:utc) ? value.utc : value
269
+ string_value = from_value.strftime('%Y-%m-%d %H:%M:%S')
270
+ elsif value.respond_to?(:to_s)
241
271
  string_value = value.to_s
242
272
  end
273
+
243
274
  string_value.gsub!('"', '""')
275
+
244
276
  return string_value
245
277
  end
246
278
  end
@@ -45,7 +45,14 @@ module DataDuck
45
45
  end
46
46
 
47
47
  def actions
48
- self.class.actions
48
+ my_actions = []
49
+ for_class = self.class
50
+ while for_class < Table
51
+ my_actions.concat(for_class.actions || [])
52
+ for_class = for_class.superclass
53
+ end
54
+
55
+ my_actions
49
56
  end
50
57
 
51
58
  def check_table_valid!
@@ -63,11 +70,17 @@ module DataDuck
63
70
  end
64
71
  end
65
72
 
66
- def etl!(destinations)
73
+ def etl!(destinations, options = {})
67
74
  if destinations.length != 1
68
75
  raise ArgumentError.new("DataDuck can only etl to one destination at a time for now.")
69
76
  end
77
+
78
+ if options[:dates].nil?
79
+ options[:dates] = [Date.today]
80
+ end
81
+
70
82
  self.check_table_valid!
83
+
71
84
  destination = destinations.first
72
85
 
73
86
  if self.should_fully_reload?
@@ -77,7 +90,7 @@ module DataDuck
77
90
  batch_number = 0
78
91
  while batch_number < 1_000
79
92
  batch_number += 1
80
- self.extract!(destination)
93
+ self.extract!(destination, options)
81
94
  self.transform!
82
95
  self.load!(destination)
83
96
 
@@ -100,7 +113,7 @@ module DataDuck
100
113
  end
101
114
  end
102
115
 
103
- def extract!(destination = nil)
116
+ def extract!(destination = nil, options = {})
104
117
  DataDuck::Logs.info "Extracting table #{ self.name }"
105
118
 
106
119
  self.errors ||= []
@@ -109,7 +122,7 @@ module DataDuck
109
122
  source = source_spec[:source]
110
123
  my_query = self.extract_query(source_spec, destination)
111
124
  results = source.query(my_query)
112
- self.data = results
125
+ self.data.concat(results)
113
126
  end
114
127
  self.data
115
128
  end
@@ -156,6 +169,10 @@ module DataDuck
156
169
  destination.load_table!(self)
157
170
  end
158
171
 
172
+ def include_with_all?
173
+ true
174
+ end
175
+
159
176
  def indexes
160
177
  which_columns = []
161
178
  which_columns << "id" if self.output_column_names.include?("id")
@@ -186,7 +203,7 @@ module DataDuck
186
203
  end
187
204
 
188
205
  def output_schema
189
- self.class.output_schema || {}
206
+ self.class.output_schema || self.class.superclass.output_schema || {}
190
207
  end
191
208
 
192
209
  def output_column_names
@@ -217,8 +234,7 @@ module DataDuck
217
234
  DataDuck::Logs.info "Transforming table #{ self.name }"
218
235
 
219
236
  self.errors ||= []
220
- self.class.actions ||= []
221
- self.class.actions.each do |action|
237
+ self.actions.each do |action|
222
238
  action_type = action[0]
223
239
  action_method_name = action[1]
224
240
  if action_type == :transform
@@ -233,7 +249,16 @@ module DataDuck
233
249
  end
234
250
 
235
251
  def name
236
- DataDuck::Util.camelcase_to_underscore(self.class.name)
252
+ fixed_name = DataDuck::Util.camelcase_to_underscore(self.class.name)
253
+ if fixed_name.start_with?("data_duck/")
254
+ fixed_name = fixed_name.split("/").last
255
+ end
256
+
257
+ self.prefix + fixed_name
258
+ end
259
+
260
+ def prefix
261
+ ""
237
262
  end
238
263
  end
239
264
  end
@@ -2,7 +2,7 @@ module DataDuck
2
2
  if !defined?(DataDuck::VERSION)
3
3
  VERSION_MAJOR = 0
4
4
  VERSION_MINOR = 6
5
- VERSION_PATCH = 2
5
+ VERSION_PATCH = 3
6
6
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
7
7
  end
8
8
  end
@@ -0,0 +1,5 @@
1
+ module DataDuck
2
+ class IntegrationTable < DataDuck::Table
3
+ # nothing for now, but there could be integration-specific stuff here
4
+ end
5
+ end
@@ -0,0 +1,91 @@
1
+ require_relative 'optimizely_table'
2
+
3
+ require 'typhoeus'
4
+ require 'oj'
5
+ require 'date'
6
+
7
+ module DataDuck
8
+ module Optimizely
9
+ class Experiments < DataDuck::Optimizely::OptimizelyTable
10
+
11
+ transforms :percentage_included_to_float
12
+ transforms :parse_datetimes
13
+
14
+ def extract!(destination, options = {})
15
+ self.data = []
16
+
17
+ projects_response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
18
+ if projects_response.response_code != 200
19
+ raise Exception.new("Optimizely API for projects returned error #{ projects_response.response_code } #{ projects_response.body }")
20
+ end
21
+ projects = Oj.load(projects_response.body)
22
+
23
+ projects.each do |project|
24
+ self.extract_for_project!(project["id"])
25
+ end
26
+ end
27
+
28
+ def extract_for_project!(project_id)
29
+ now = DateTime.now
30
+
31
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects/#{ project_id }/experiments", headers: {'Token' => self.optimizely_api_token})
32
+
33
+ if response.response_code != 200
34
+ raise Exception.new("Optimizely API for experiments returned error #{ response.response_code} #{ response.body }")
35
+ end
36
+
37
+ experiments = Oj.load(response.body)
38
+ experiments.each do |experiment|
39
+ experiment[:dataduck_extracted_at] = now
40
+ experiment[:project_id] = project_id
41
+ end
42
+
43
+ self.data.concat(experiments)
44
+ end
45
+
46
+ def parse_datetimes(row)
47
+ row["created"] = DateTime.parse(row["created"])
48
+ row["last_modified"] = DateTime.parse(row["last_modified"])
49
+
50
+ row
51
+ end
52
+
53
+ def rename_description_to_name(row)
54
+ row[:name] = row['description']
55
+
56
+ row
57
+ end
58
+
59
+ def percentage_included_to_float(row)
60
+ row['percentage_included'] = row['percentage_included'].to_i / 100.0
61
+
62
+ row
63
+ end
64
+
65
+ def indexes
66
+ ["id", "project_id", "primary_goal_id", "name"]
67
+ end
68
+
69
+ output({
70
+ :id => :bigint,
71
+ :project_id => :bigint, # integers have an overflow error because optimizely numbers get too big
72
+ :name => :string,
73
+ :shareable_results_link => :string,
74
+ :conditional_code => :bigtext,
75
+ :custom_js => :bigtext,
76
+ :primary_goal_id => :integer,
77
+ :details => :bigtext,
78
+ :status => :string,
79
+ :url_conditions => :bigtext,
80
+ :last_modified => :datetime,
81
+ :is_multivariate => :boolean,
82
+ :activation_mode => :string,
83
+ :created => :datetime,
84
+ :percentage_included => :float,
85
+ :experiment_type => :string,
86
+ :edit_url => :string,
87
+ :dataduck_extracted_at => :datetime,
88
+ })
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,26 @@
1
+ module DataDuck
2
+ module Optimizely
3
+ class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyTable
4
+ def etl!(destinations, options = {})
5
+ projects = fetch_data("projects")
6
+ # TODO alternate way to load Optimizely data
7
+ end
8
+
9
+ def fetch_data(api_endpoint)
10
+ now = DateTime.now
11
+
12
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/#{ api_endpoint }", headers: {'Token' => self.optimizely_api_token})
13
+ if response.response_code != 200
14
+ raise Exception.new("Optimizely API for #{ api_endpoint } returned error #{ response.response_code} #{ response.body }")
15
+ end
16
+
17
+ rows = Oj.load(response.body)
18
+ rows.each do |row|
19
+ row[:dataduck_extracted_at] = now
20
+ end
21
+
22
+ rows
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,13 @@
1
+ module DataDuck
2
+ module Optimizely
3
+ class OptimizelyTable < DataDuck::IntegrationTable
4
+ def optimizely_api_token
5
+ ENV['optimizely_api_token']
6
+ end
7
+
8
+ def should_fully_reload?
9
+ true
10
+ end
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,56 @@
1
+ require_relative 'optimizely_table'
2
+
3
+ require 'typhoeus'
4
+ require 'oj'
5
+ require 'date'
6
+
7
+ module DataDuck
8
+ module Optimizely
9
+ class Projects < DataDuck::Optimizely::OptimizelyTable
10
+ transforms :parse_datetimes
11
+
12
+ def extract!(destination, options = {})
13
+ self.data = []
14
+
15
+ now = DateTime.now
16
+ response = Typhoeus.get("https://www.optimizelyapis.com/experiment/v1/projects", headers: {'Token' => self.optimizely_api_token})
17
+
18
+ self.data = Oj.load(response.body)
19
+ self.data.each do |project|
20
+ project[:dataduck_extracted_at] = now
21
+ end
22
+ end
23
+
24
+ def indexes
25
+ ["id", "account_id", "project_name"]
26
+ end
27
+
28
+ def parse_datetimes(project)
29
+ project["created"] = DateTime.parse(project["created"])
30
+ project["last_modified"] = DateTime.parse(project["last_modified"])
31
+
+ project
+ end
32
+
33
+ output({
34
+ :id => :bigint,
35
+ :account_id => :bigint,
36
+ :code_revision => :integer,
37
+ :project_name => :string,
38
+ :project_status => :string,
39
+ :created => :datetime,
40
+ :last_modified => :datetime,
41
+ :library => :string,
42
+ :include_jquery => :boolean,
43
+ :js_file_size => :integer,
44
+ :project_javascript => :bigtext,
45
+ :enable_force_variation => :boolean,
46
+ :exclude_disabled_experiments => :boolean,
47
+ :exclude_names => :boolean,
48
+ :ip_anonymization => :boolean,
49
+ :ip_filter => :string,
50
+ :socket_token => :string,
51
+ :dcp_service_id => :integer,
52
+ :dataduck_extracted_at => :datetime,
53
+ })
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,11 @@
1
+ require 'typhoeus'
2
+
3
+ require_relative 'optimizely_table'
4
+
5
+ module DataDuck
6
+ module Optimizely
7
+ class Variations < DataDuck::Optimizely::OptimizelyTable
8
+ # this table should contain experiment variations and either /results or /stats for the result data
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,59 @@
1
+ require 'typhoeus'
2
+
3
+ module DataDuck
4
+ module SEMRush
5
+ class OrganicResults < DataDuck::IntegrationTable
6
+ def display_limit
7
+ 25
8
+ end
9
+
10
+ def key
11
+ ENV['semrush_api_key']
12
+ end
13
+
14
+ def phrases
15
+ raise Exception.new("Must implement phrases method to be an array of the phrases you want.")
16
+ end
17
+
18
+ def prefix
19
+ "semrush_"
20
+ end
21
+
22
+ def search_database
23
+ 'us'
24
+ end
25
+
26
+ def extract!(destination, options = {})
27
+ dates = options[:dates]
28
+ if dates.nil? || dates.length == 0
29
+ raise Exception.new("Must pass at least one date.")
30
+ end
31
+
32
+ self.data = []
33
+
34
+ self.phrases.each do |phrase|
35
+ dates.each do |date|
36
+ self.extract_results_for_keyword_and_date!(phrase, date)
37
+ end
38
+ end
39
+ end
40
+
41
+ def extract_results_for_keyword_and_date!(phrase, date)
42
+ response = Typhoeus.get("http://api.semrush.com/?type=phrase_organic&key=#{ self.key }&display_limit=#{ self.display_limit }&export_columns=Dn,Ur&phrase=#{ phrase }&database=#{ self.search_database }")
43
+ # TODO
44
+ end
45
+
46
+ def indexes
47
+ ["date", "phrase", "domain"]
48
+ end
49
+
50
+ output({
51
+ :date => :date,
52
+ :phrase => :string,
53
+ :rank => :integer,
54
+ :domain => :string,
55
+ :url => :string,
56
+ })
57
+ end
58
+ end
59
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-02 00:00:00.000000000 Z
11
+ date: 2015-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -136,6 +136,20 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0.8'
139
+ - !ruby/object:Gem::Dependency
140
+ name: oj
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '2.12'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '2.12'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: sequel-redshift
141
155
  requirement: !ruby/object:Gem::Requirement
@@ -205,6 +219,13 @@ files:
205
219
  - lib/dataduck/util.rb
206
220
  - lib/dataduck/version.rb
207
221
  - lib/helpers/module_vars.rb
222
+ - lib/integrations/integration_table.rb
223
+ - lib/integrations/optimizely/experiments.rb
224
+ - lib/integrations/optimizely/optimizely_integration.rb
225
+ - lib/integrations/optimizely/optimizely_table.rb
226
+ - lib/integrations/optimizely/projects.rb
227
+ - lib/integrations/optimizely/variations.rb
228
+ - lib/integrations/semrush/organic_results.rb
208
229
  - lib/templates/quickstart/table.rb.erb
209
230
  - static/logo.png
210
231
  homepage: http://dataducketl.com/