dataduck 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/docs/tables/README.md +39 -1
- data/lib/dataduck/commands.rb +4 -1
- data/lib/dataduck/redshift_destination.rb +2 -1
- data/lib/dataduck/table.rb +4 -0
- data/lib/dataduck/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a48e9e513313a27d24f94c23e4f2659ebecf1c1d
|
4
|
+
data.tar.gz: a89623ebfe52f3dd24bb239aea5e366aeed0e99f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 07c9acd3135428eda030cf6fe7270ee0e827448f6227761627c2ef546cc861faef4dd0151a446aa91b2ba5267e1014d45dac5c2b27daae5e58d09d0116b63a0c
|
7
|
+
data.tar.gz: 468134eb9bdffbc2996c9a7060449bca8625d5a8597a2dc81ed0ea04c3d1aa26cb7932580f33ea1da440d645534fa54c4a0f4d30720cb75f7ad006fbacc6ac89
|
data/docs/tables/README.md
CHANGED
@@ -45,9 +45,47 @@ and leave the rest of the process (and the Redshift loading) up to DataDuck.
|
|
45
45
|
## The `extract!` method
|
46
46
|
|
47
47
|
The `extract!` method takes one argument: the destination. It then extracts the data from the source necessary to load
|
48
|
-
data into the destination. If you are writing your own Table class with some custom third party API, you will probably
|
48
|
+
data into the destination. If you are writing your own Table class with some custom third party API, you will probably
|
49
49
|
want to overwrite this method.
|
50
50
|
|
51
|
+
## Overriding indexes (sortkeys)
|
52
|
+
|
53
|
+
Redshift uses the term sortkey for what other databases would generally call an index. DataDuck ETL will use `id` and `created_at` as sortkeys by default. If you would like to specify your own, simply override the `indexes` method on your table, as in this example:
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
class Decks < DataDuck::Table
|
57
|
+
# source info goes here
|
58
|
+
|
59
|
+
def indexes
|
60
|
+
["id", "user_id"]
|
61
|
+
end
|
62
|
+
|
63
|
+
# output info goes here
|
64
|
+
end
|
65
|
+
```
|
66
|
+
|
67
|
+
## Overriding distkeys and diststyles
|
68
|
+
|
69
|
+
For large datasets, Redshift can distribute the data across multiple compute nodes according to your distkey and diststyle. To use these, simply override the `distribution_key` and `distribution_style` methods.
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
class Decks < DataDuck::Table
|
73
|
+
# source info goes here
|
74
|
+
|
75
|
+
def distribution_key
|
76
|
+
"company_id"
|
77
|
+
end
|
78
|
+
|
79
|
+
def distribution_style
|
80
|
+
"all"
|
81
|
+
end
|
82
|
+
|
83
|
+
# output info goes here
|
84
|
+
end
|
85
|
+
```
|
86
|
+
|
87
|
+
For more info, read: [Distributing Data](http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html)
|
88
|
+
|
51
89
|
## Example Table
|
52
90
|
|
53
91
|
The following is an example table.
|
data/lib/dataduck/commands.rb
CHANGED
@@ -240,7 +240,10 @@ module DataDuck
|
|
240
240
|
puts "Connection successful. Detected #{ table_names.length } tables."
|
241
241
|
puts "Creating scaffolding..."
|
242
242
|
table_names.each do |table_name|
|
243
|
-
|
243
|
+
begin
|
244
|
+
DataDuck::Commands.quickstart_create_table(table_name, db_source)
|
245
|
+
rescue
|
246
|
+
end
|
244
247
|
end
|
245
248
|
|
246
249
|
config_obj = {
|
@@ -72,9 +72,10 @@ module DataDuck
|
|
72
72
|
props_string = props_array.join(', ')
|
73
73
|
|
74
74
|
distribution_clause = table.distribution_key ? "DISTKEY(#{ table.distribution_key })" : ""
|
75
|
+
distribution_style_clause = table.distribution_style ? "DISTSTYLE #{ distribution_style }" : ""
|
75
76
|
index_clause = table.indexes.length > 0 ? "INTERLEAVED SORTKEY (#{ table.indexes.join(',') })" : ""
|
76
77
|
|
77
|
-
"CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string }) #{ distribution_clause } #{ index_clause }"
|
78
|
+
"CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string }) #{ distribution_clause } #{ distribution_style_clause } #{ index_clause }"
|
78
79
|
end
|
79
80
|
|
80
81
|
def create_output_tables!(table)
|
data/lib/dataduck/table.rb
CHANGED
data/lib/dataduck/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataduck
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeff Pickhardt
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -295,7 +295,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
295
295
|
version: '0'
|
296
296
|
requirements: []
|
297
297
|
rubyforge_project:
|
298
|
-
rubygems_version: 2.
|
298
|
+
rubygems_version: 2.6.8
|
299
299
|
signing_key:
|
300
300
|
specification_version: 4
|
301
301
|
summary: A straightforward, effective ETL framework.
|