dataduck 0.6.8 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: e07cb92ca8f7f36c672dfb89e617e41ddb92f302
- data.tar.gz: 40037cb63d58385f830043257883322e7dad1dbb
+ metadata.gz: ef6f3cd5a8054cf855b227324845f2ec365516dd
+ data.tar.gz: 45030532745d7a68988bee5e95a791ed55bae6b0
  SHA512:
- metadata.gz: bfce8a29678d44a96f26b05c3cef1fe047776976ced0d856a028276f9f65c0b8094f0ac64405f1eb8af91c1c2c07205b15776a448e9475d52c573596455409b2
- data.tar.gz: 55a7dc9934972cb5953bac4e416dfddf26d4ae65389022168e8e8e36a6d6ffc74c4d76ce57eca9289bbc7a677061a093ec1f809a398b1838b776f0d0975589c6
+ metadata.gz: 1d167785a5f64fd8ea77546dcb3f7d19107c3c651aac5933afe09e7eee4eb1674c25a9223b1c724a8229334d669c41604288c29fa4da6e35bfa596ad8bde6a90
+ data.tar.gz: 708a7eb4404cee131bcd946314b57febaa6908c716c4ce2928167c88a0a61f084820f34258a3b5b74cbc97a215759e8cec13e9f89ff25a17aba2453a636d4517
data/dataduck.gemspec CHANGED
@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
  spec.add_runtime_dependency "oj", "~> 2.12"
  spec.add_runtime_dependency "sequel-redshift"
  spec.add_runtime_dependency "whenever", "~> 0.9"
+ spec.add_runtime_dependency "sentry-raven", '~>0.15'
  end
data/docs/contents.yml CHANGED
@@ -12,3 +12,8 @@
 
  "Tables":
  "The Table Class": README
+ "Incremental vs Full Loading": incremental_vs_full_loading
+
+ "Integrations":
+ "Optimizely": optimizely
+ "SEMrush": semrush
data/docs/integrations/optimizely.md ADDED
@@ -0,0 +1,21 @@
+ # Optimizely Integration
+
+ Optimizely is a website optimization platform that includes A/B testing and personalization products.
+
+ The Optimizely integration uses Optimizely's API to fetch data for your projects, experiments, and variations, then loads them into
+ three tables in your data warehouse.
+
+ To use the Optimizely integration, first get an API token from [https://app.optimizely.com/tokens](https://app.optimizely.com/tokens). Then add the following to your project's .env file:
+
+ ```
+ optimizely_api_token=YOUR_TOKEN
+ ```
+
+ Finally, add the following file to your project's /src/tables directory, naming it `optimizely_integration.rb`:
+
+ ```ruby
+ class OptimizelyIntegration < DataDuck::Optimizely::OptimizelyIntegration
+ end
+ ```
+
+ Now, running `dataduck etl optimizely_integration` will ETL three tables for you: `optimizely_projects`, `optimizely_experiments`, and `optimizely_variations`. The results data can be found on the variations. Additionally, a `dataduck_extracted_at` datetime column indicates how fresh the data is.
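+
+ For example, once the ETL has run, you could check how fresh each table is by querying the warehouse with Sequel (a minimal sketch; the `REDSHIFT_URL` environment variable and connection details are assumptions, while the table names and the `dataduck_extracted_at` column come from above):
+
+ ```ruby
+ require 'sequel'
+
+ # Hypothetical connection; point this at your own data warehouse.
+ db = Sequel.connect(ENV.fetch('REDSHIFT_URL'))
+
+ [:optimizely_projects, :optimizely_experiments, :optimizely_variations].each do |table|
+   # Most recent extraction time recorded for this table
+   puts "#{ table }: #{ db[table].max(:dataduck_extracted_at) }"
+ end
+ ```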
data/docs/integrations/semrush.md ADDED
@@ -0,0 +1,36 @@
+ # SEMrush Integration
+
+ SEMrush is a powerful and versatile competitive intelligence suite for online marketing, from SEO and PPC to social media and video advertising research.
+
+ The SEMrush integration is currently focused on SEO. It will create a table called `semrush_organic_results` that shows the Google search ranking for specific phrases. By running this regularly, you can see how your website's or your competitors' websites' search rankings change over time.
+
+ To use the SEMrush integration, first add the following to your .env file:
+
+ ```
+ semrush_api_key=YOUR_API_KEY
+ ```
+
+ Then create a table called `organic_results` with the following:
+
+ ```ruby
+ class OrganicResults < DataDuck::SEMRush::OrganicResults
+   def display_limit
+     20 # Default is 20
+   end
+
+   def search_database
+     'us' # Default is 'us'
+   end
+
+   def phrases
+     [
+       'My Phrase 1',
+       'Another Phrase',
+       'Some Other Keywords',
+     ]
+   end
+ end
+ ```
+
+ This table will have five columns: date, phrase, rank, domain, and url.
+
+ The methods `display_limit` and `search_database` are optional and can be overridden to fit your particular use case.
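+
+ For example, to see how a phrase's ranking has moved over time, you could query the resulting table with Sequel (a minimal sketch; the `REDSHIFT_URL` environment variable is an assumption, while the table and column names come from above):
+
+ ```ruby
+ require 'sequel'
+
+ db = Sequel.connect(ENV.fetch('REDSHIFT_URL')) # hypothetical connection URL
+
+ # Ranking history for one tracked phrase, oldest first
+ db[:semrush_organic_results]
+   .where(phrase: 'My Phrase 1')
+   .order(:date, :rank)
+   .select(:date, :rank, :domain, :url)
+   .each { |row| puts row.values_at(:date, :rank, :domain, :url).join("\t") }
+ ```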
data/docs/tables/incremental_vs_full_loading.md ADDED
@@ -0,0 +1,70 @@
+ # Incremental vs Full Loading
+
+ Loading a table can be performed either incrementally or with a full reload each time. An incremental load is generally
+ better, since it takes less time and transfers less data; however, not all tables can be loaded incrementally.
+
+ ## Incremental loading
+
+ If you are running an ETL process regularly, rather than loading an entire table each time, it is more efficient
+ to load just the rows that have changed. This is known as an incremental load. By default, if a table contains
+ a column called `updated_at`, DataDuck ETL will load it incrementally based on that column. If no such column
+ exists, it will load the entire table each time.
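+
+ For example, a table like the following would be loaded incrementally by default, because `updated_at` is among its columns (a minimal sketch in the same style as the example further below):
+
+ ```ruby
+ class Users < DataDuck::Table
+   source :source1, ["id", "name", "updated_at"]
+
+   output({
+     :id => :integer,
+     :name => :string,
+     :updated_at => :datetime,
+   })
+ end
+ ```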
+
+ If rows can be deleted from a table, you should not use incremental loading, since DataDuck ETL won't know which rows
+ have been deleted. Soft deleting a row, by setting a column to 'deleted' (for example), is fine to use with incremental loading.
+
+ Under the hood, before extracting, DataDuck ETL will check the destination for the latest value of the incremental column, then use that value
+ to limit which rows the extract query pulls.
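+
+ As a rough sketch of that strategy (hypothetical code, not DataDuck ETL's internals; the table name, column names, and the return shape of `query` are placeholder assumptions):
+
+ ```ruby
+ # Ask the destination warehouse for the newest value already loaded...
+ latest = destination.query("SELECT MAX(updated_at) AS latest FROM users").first[:latest]
+
+ # ...then only extract rows at or after that value from the source.
+ extract_sql = "SELECT id, name, updated_at FROM users"
+ extract_sql += " WHERE updated_at >= '#{ latest }'" if latest
+ rows = source.query(extract_sql)
+ ```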
+
+ If you would like to base an incremental load on a different column, such as `id` or `created_at` (common in cases where
+ the rows are not expected to change, like an event stream), then you can do so by giving your table an `extract_by_column` method.
+
+ ```ruby
+ class MyTable < DataDuck::Table
+   source :source1, ["id", "created_at", "name"]
+
+   def extract_by_column
+     'created_at'
+   end
+
+   output({
+     :id => :integer,
+     :created_at => :datetime,
+     :name => :string,
+   })
+ end
+ ```
+
+ ## Full reloads
+
+ Fully reloading a table takes longer, so it is only recommended for tables that cannot be loaded incrementally.
+
+ If you would like to fully reload the table each time, you may give your table an `extract_by_column` that returns `nil`.
+ Alternatively, if you want to have an `extract_by_column` but still reload the entire table each time, you may
+ give it a `should_fully_reload?` method that returns `true`. An example of when you might want to do this is if you are
+ reloading an entire table, but doing it in batches.
+
+ ```ruby
+ class MyTableFullyReloaded < DataDuck::Table
+   source :source1, ["id", "created_at", "name"]
+
+   def batch_size
+     1_000_000 # if there is a lot of data and you want to use less memory (for example), batching is a good idea
+   end
+
+   def extract_by_column
+     'created_at'
+   end
+
+   def should_fully_reload?
+     true
+   end
+
+   output({
+     :id => :integer,
+     :created_at => :datetime,
+     :name => :string,
+   })
+ end
+ ```
@@ -47,7 +47,11 @@ module DataDuck
  return DataDuck::Commands.help
  end
 
- DataDuck::Commands.public_send(command, *args[1..-1])
+ begin
+ DataDuck::Commands.public_send(command, *args[1..-1])
+ rescue Exception => err
+ DataDuck::Logs.error(err)
+ end
  end
 
  def self.c
@@ -10,7 +10,7 @@ module DataDuck
  raise Exception.new("Must implement connection in subclass.")
  end
 
- def query
+ def query(sql)
  raise Exception.new("Must implement query in subclass.")
  end
 
@@ -34,6 +34,10 @@ module DataDuck
  raise Exception.new("Must implement load_table! in subclass")
  end
 
+ def postprocess!(table)
+ # e.g. vacuum or build indexes
+ end
+
  def self.destination(name, allow_nil = false)
  name = name.to_s
 
data/lib/dataduck/logs.rb CHANGED
@@ -1,4 +1,5 @@
  require 'logger'
+ require 'raven'
 
  module DataDuck
  module Logs
@@ -43,6 +44,10 @@ module DataDuck
 
  puts "[ERROR] #{ message }"
  @@logger.error(message)
+
+ if ENV['SENTRY_DSN']
+ Raven.capture_exception(err)
+ end
  end
 
  private
@@ -277,6 +277,12 @@ module DataDuck
  self.query("DROP TABLE zz_dataduck_recreating_old_#{ table.name }")
  end
 
+ def postprocess!(table)
+ DataDuck::Logs.info "Vacuuming table #{ table.name }"
+ vacuum_type = table.indexes.length == 0 ? "FULL" : "REINDEX"
+ self.query("VACUUM #{ vacuum_type } #{ table.name }")
+ end
+
  def self.value_to_string(value)
  string_value = ''
 
@@ -111,6 +111,8 @@ module DataDuck
  if self.should_fully_reload?
  destination.finish_fully_reloading_table!(self)
  end
+
+ self.postprocess!(destination, options)
  end
 
  def extract!(destination = nil, options = {})
@@ -220,6 +222,10 @@ module DataDuck
  self.output_schema.keys.sort.map(&:to_s)
  end
 
+ def postprocess!(destination, options = {})
+ destination.postprocess!(self)
+ end
+
  def recreate!(destination)
  destination.recreate_table!(self)
  end
@@ -1,8 +1,8 @@
  module DataDuck
  if !defined?(DataDuck::VERSION)
  VERSION_MAJOR = 0
- VERSION_MINOR = 6
- VERSION_PATCH = 8
+ VERSION_MINOR = 7
+ VERSION_PATCH = 0
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
  end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: dataduck
  version: !ruby/object:Gem::Version
- version: 0.6.8
+ version: 0.7.0
  platform: ruby
  authors:
  - Jeff Pickhardt
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-11-09 00:00:00.000000000 Z
+ date: 2015-12-06 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -178,6 +178,20 @@ dependencies:
  - - "~>"
  - !ruby/object:Gem::Version
  version: '0.9'
+ - !ruby/object:Gem::Dependency
+ name: sentry-raven
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.15'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.15'
  description: A straightforward, effective ETL framework.
  email:
  - pickhardt@gmail.com
@@ -206,9 +220,12 @@ files:
  - docs/commands/recreate.md
  - docs/commands/show.md
  - docs/contents.yml
+ - docs/integrations/optimizely.md
+ - docs/integrations/semrush.md
  - docs/overview/README.md
  - docs/overview/getting_started.md
  - docs/tables/README.md
+ - docs/tables/incremental_vs_full_loading.md
  - examples/example/.gitignore
  - examples/example/.ruby-version
  - examples/example/Gemfile