dataduck 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4c85a45aea48dc00fdef79467a5148e71a32c691
4
- data.tar.gz: 3c6a10310fcde7ca07efee6ed127d5339174b3dd
3
+ metadata.gz: a48e9e513313a27d24f94c23e4f2659ebecf1c1d
4
+ data.tar.gz: a89623ebfe52f3dd24bb239aea5e366aeed0e99f
5
5
  SHA512:
6
- metadata.gz: 14b0a7a521cf17446418dbf7536bbf60cebe32e38e2eae34464b96980d1e6318aa5f21be430cda827098771c63dd607f1b7b3fc21f1b27f2590e9bfe3ea2bd7b
7
- data.tar.gz: fd14a28b61f6e75c4f5af591c39ec72b20abcfaf659de7ef0e63cf80c943f42f02567e61a1c2757e368e6ba322a90c2279f057d070ab68d93d9d0605551499ad
6
+ metadata.gz: 07c9acd3135428eda030cf6fe7270ee0e827448f6227761627c2ef546cc861faef4dd0151a446aa91b2ba5267e1014d45dac5c2b27daae5e58d09d0116b63a0c
7
+ data.tar.gz: 468134eb9bdffbc2996c9a7060449bca8625d5a8597a2dc81ed0ea04c3d1aa26cb7932580f33ea1da440d645534fa54c4a0f4d30720cb75f7ad006fbacc6ac89
@@ -45,9 +45,47 @@ and leave the rest of the process (and the Redshift loading) up to DataDuck.
45
45
  ## The `extract!` method
46
46
 
47
47
  The `extract!` method takes one argument: the destination. It then extracts the data from the source necessary to load
48
- data into the destination. If you are writing your own Table class with some custom third party API, you will probably
48
+ data into the destination. If you are writing your own Table class with some custom third party API, you will probably
49
49
  want to overwrite this method.
50
50
 
51
+ ## Overriding indexes (sortkeys)
52
+
53
+ By sortkey, Redshift means what other databases would generally call indexes. DataDuck ETL will use `id` and `created_at` as sortkeys by default. If you would like to specify your own, simply overwrite the `indexes` method on your table, like this example:
54
+
55
+ ```ruby
56
+ class Decks < DataDuck::Table
57
+ # source info goes here
58
+
59
+ def indexes
60
+ ["id", "user_id"]
61
+ end
62
+
63
+ # output info goes here
64
+ end
65
+ ```
66
+
67
+ ## Overriding distkeys and diststyles
68
+
69
+ For large datasets, Redshift can distribute the data across multiple compute nodes according to your distkey and diststyle. To use these, simply overwrite the `distribution_key` and `distribution_style` methods.
70
+
71
+ ```ruby
72
+ class Decks < DataDuck::Table
73
+ # source info goes here
74
+
75
+ def distribution_key
76
+ "company_id"
77
+ end
78
+
79
+ def distribution_style
80
+ "all"
81
+ end
82
+
83
+ # output info goes here
84
+ end
85
+ ```
86
+
87
+ For more info, read: [Distributing Data](http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html)
88
+
51
89
  ## Example Table
52
90
 
53
91
  The following is an example table.
@@ -240,7 +240,10 @@ module DataDuck
240
240
  puts "Connection successful. Detected #{ table_names.length } tables."
241
241
  puts "Creating scaffolding..."
242
242
  table_names.each do |table_name|
243
- DataDuck::Commands.quickstart_create_table(table_name, db_source)
243
+ begin
244
+ DataDuck::Commands.quickstart_create_table(table_name, db_source)
245
+ rescue
246
+ end
244
247
  end
245
248
 
246
249
  config_obj = {
@@ -72,9 +72,10 @@ module DataDuck
72
72
  props_string = props_array.join(', ')
73
73
 
74
74
  distribution_clause = table.distribution_key ? "DISTKEY(#{ table.distribution_key })" : ""
75
+ distribution_style_clause = table.distribution_style ? "DISTSTYLE #{ distribution_style }" : ""
75
76
  index_clause = table.indexes.length > 0 ? "INTERLEAVED SORTKEY (#{ table.indexes.join(',') })" : ""
76
77
 
77
- "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string }) #{ distribution_clause } #{ index_clause }"
78
+ "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string }) #{ distribution_clause } #{ distribution_style_clause } #{ index_clause }"
78
79
  end
79
80
 
80
81
  def create_output_tables!(table)
@@ -70,6 +70,10 @@ module DataDuck
70
70
  end
71
71
  end
72
72
 
73
+ def distribution_style
74
+ nil
75
+ end
76
+
73
77
  def etl!(destinations, options = {})
74
78
  if destinations.length != 1
75
79
  raise ArgumentError.new("DataDuck can only etl to one destination at a time for now.")
@@ -1,7 +1,7 @@
1
1
  module DataDuck
2
2
  if !defined?(DataDuck::VERSION)
3
3
  VERSION_MAJOR = 1
4
- VERSION_MINOR = 1
4
+ VERSION_MINOR = 2
5
5
  VERSION_PATCH = 0
6
6
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-05 00:00:00.000000000 Z
11
+ date: 2017-03-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -295,7 +295,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
295
295
  version: '0'
296
296
  requirements: []
297
297
  rubyforge_project:
298
- rubygems_version: 2.4.6
298
+ rubygems_version: 2.6.8
299
299
  signing_key:
300
300
  specification_version: 4
301
301
  summary: A straightforward, effective ETL framework.