dataduck 0.4.0 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: a4dabe01cff2c6455751ab08c520d4bfaee62139
- data.tar.gz: d20ef216bc631c445daad0767a51788b42b7f90f
+ metadata.gz: de529cfe949f8c1fb4a4cb36129188636ffbcb74
+ data.tar.gz: ebbcaa35d0babcbabdaef339f9ef72b061fc54d5
  SHA512:
- metadata.gz: d2eacaf08c612c25ae8bf9b1b1d46d4a0312fe0024211d0a8306faa5a810b972a5c2aa8386c4b05b04a26d73093bcae5a89d72bcadef98f6ed7e062054d40410
- data.tar.gz: 2c4c1aec2a0257ad3dcc4e9559436c39de0f747a6ec3fdb816afb7d096678d7c1f608269b6c8dc55d1f1aeac514153bdbec7dbb7d3c082699a89f28e16577b22
+ metadata.gz: 2958e2909631c314c7104fa340f0a587b47ab172417aa792f2b1a31377b1c455188acb0554c84ab656d057c4c060f9d7110bb6971fe45cedc8d4d3a117339d1e
+ data.tar.gz: 5e62d009d64ebe30b1ade7184c5e7f1041e8c58d3cbf5be39d3d1885e89a3126870197568694ead1673018985b079757ad00117d1aa5423d6413d027a292cd09
data/docs/README.md CHANGED
@@ -1,6 +1,6 @@
  # Documentation

- The documentation directory is viewable at (http://dataducketl.com/docs)[http://dataducketl.com/docs].
+ The documentation directory is viewable at http://dataducketl.com/docs.

  # Autogenerated

@@ -0,0 +1,12 @@
+ # Commands
+
+ Run any command with `dataduck commandname` from the project directory, assuming you've already
+ run `bundle install` to install the DataDuck gem.
+
+ The list of commands is:
+
+ - [console](/docs/commands/console)
+ - [dbconsole](/docs/commands/dbconsole)
+ - [etl](/docs/commands/etl)
+ - [quickstart](/docs/commands/quickstart)
+ - [show](/docs/commands/show)
@@ -0,0 +1,5 @@
+ # The `console` command
+
+ The `console` command places you in a Ruby console with DataDuck loaded, which can be useful for debugging. Run it with:
+
+ `$ dataduck console`
@@ -0,0 +1,16 @@
+ # The `dbconsole` command
+
+ The `dbconsole` command opens a database console connected to one of your databases, using the appropriate client
+ on your system (e.g. `mysql` or `psql`).
+
+ This connects you to the destination (e.g. Redshift):
+
+ `$ dataduck dbconsole`
+
+ You can also use one of these:
+
+ `$ dataduck dbconsole source`
+
+ `$ dataduck dbconsole destination`
+
+ `$ dataduck dbconsole [db_name]`
@@ -0,0 +1,11 @@
+ # The `etl` command
+
+ The `etl` command is the main command for running an ETL process. You can use it to ETL all the tables, or just one table at a time.
+
+ To ETL all tables, use:
+
+ `$ dataduck etl all`
+
+ To ETL just one table, use:
+
+ `$ dataduck etl my_table_name`
@@ -0,0 +1,7 @@
+ # The `quickstart` command
+
+ The `quickstart` command runs a wizard for getting started with DataDuck. Only use it with a brand new DataDuck project.
+
+ It will ask you for the credentials to your database, and then create the basic setup for your project. Once you are completely set up, your project's ETL can be run with `dataduck etl all`.
+
+ If you would like to run the ETL regularly, such as every night, it's recommended to use the [whenever](https://github.com/javan/whenever) gem to manage a cron job that runs the ETL.
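For instance, a minimal `config/schedule.rb` sketch using whenever's DSL (the path and time below are placeholders, not part of DataDuck):

```ruby
# config/schedule.rb -- a sketch; adjust the project path and schedule to taste.
every 1.day, at: '3:00 am' do
  command "cd /path/to/your/project && bundle exec dataduck etl all"
end
```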
@@ -0,0 +1,27 @@
+ # The `show` command
+
+ The `show` command lists the database tables that DataDuck is planning to ETL.
+
+ To show all table names:
+
+ `$ dataduck show`
+
+ To show info for just one table:
+
+ ```bash
+ $ dataduck show users
+ Table users
+
+ Sources from users on my_database
+ created_at
+ updated_at
+ id
+ username
+
+ Outputs
+ created_at datetime
+ updated_at datetime
+ id integer
+ username string
+ ```
data/docs/contents.yml CHANGED
@@ -2,5 +2,12 @@
    "Welcome": README
    "Getting Started": getting_started

+ "Commands":
+   "console": console
+   "dbconsole": dbconsole
+   "etl": etl
+   "quickstart": quickstart
+   "show": show
+
  "Tables":
    "The Table Class": README
@@ -23,6 +23,6 @@ Finally, run the quickstart command:

  $ dataduck quickstart

- It will ask you for the credentials to your database, and then create the basic setup for your project. After the setup, your project's ETL can be run by running `ruby src/main.rb`
+ It will ask you for the credentials to your database, and then create the basic setup for your project. Once you are completely set up, your project's ETL can be run with `dataduck etl all`.

  If you would like to run this regularly, such as every night, it's recommended to use the [whenever](https://github.com/javan/whenever) gem to manage a cron job that runs the ETL.
@@ -5,6 +5,49 @@ Each of these table files inherits from `DataDuck::Table`, the base table class.

  You may also define transformations with the `transforms` method and validations with the `validates` method.

+ ## Types of Loading Methods
+
+ There are a few different methods to load your table. You can load the whole table fresh with each ETL, or you can load
+ just the most recently changed rows (based on a column such as `updated_at`).
+
+ Loading just the rows that have changed is best for most tables, since it significantly reduces the amount of data you
+ transfer as well as the time your ETL process takes. Loading the whole table fresh each time is best if the table is
+ small or if rows may be deleted from the table by your main application. (In the case that rows are deleted, you need to reload
+ the whole table each ETL, since the ETL process wouldn't otherwise know which rows no longer exist.)
+
+ ## The `should_fully_reload?` method
+
+ If `should_fully_reload?` is true, the table will be fully reloaded on each ETL. By default, this is false.
+
+ ## The `extract_by_column` and `batch_size` methods
+
+ The alternative to fully reloading is to use an `extract_by_column`. By default, `extract_by_column` returns `updated_at`
+ if your table has an `updated_at` column. This way, only the rows that have changed need to be ETLed. This can give you
+ significant performance improvements, which is why it is the default.
+
+ If the `batch_size` method is set, the extract query will use a `LIMIT batch_size` clause. This is useful if your table
+ is fairly big and you are running DataDuck on a small EC2 instance or another computer without a lot of memory.
+
+ In order to use `batch_size`, you must also set `extract_by_column`.
+
+ An example of where you might want to override the default `extract_by_column` is if you are tracking visitor events in
+ a table and the visitor events are never modified. In this case, you might not even have an `updated_at` column. Instead,
+ you could use the `created_at` column or the `id` column (if ids are always generated in increasing order).
+
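As a sketch (not from the DataDuck source; the table name and batch size are made up), such an append-only events table might override these methods like this:

```ruby
# Hypothetical table of append-only visitor events -- a sketch, not DataDuck source code.
class VisitorEvents < DataDuck::Table
  # source/output definitions omitted; see the example table below.

  def extract_by_column
    'id' # rows are never modified, and ids only ever increase
  end

  def batch_size
    50_000 # extract at most 50,000 rows per query; requires extract_by_column to be set
  end
end
```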
+ ## The `etl!` method
+
+ The `etl!` method is what gets called when you run the `dataduck etl` command. It first extracts the
+ data from your source via the `extract!` method, transforms the data according to any transformations you've created in
+ the `transform!` method, and then loads the data into your destination with the `destination.load_table!` method.
+ You may override `etl!` if you have a custom ETL process; however, it is usually better to override the `extract!` method
+ and leave the rest of the process (and the Redshift loading) up to DataDuck.
+
+ ## The `extract!` method
+
+ The `extract!` method takes one argument: the destination. It extracts the data from the source that is needed to load
+ the destination. If you are writing your own table class against some custom third-party API, you will probably
+ want to override this method.
+
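For example, here is a rough sketch of a custom `extract!` (the API client below is hypothetical; the only DataDuck contract used is assigning the extracted rows to `self.data`, as the built-in `extract!` does):

```ruby
# Sketch only -- SomeSignupApiClient is a made-up third-party client, not a real library.
class ApiSignups < DataDuck::Table
  def extract!(destination = nil)
    self.errors ||= []
    # Populate self.data with one hash per row to be loaded into the warehouse.
    self.data = SomeSignupApiClient.fetch_recent_signups.map do |signup|
      { id: signup['id'], email: signup['email'], created_at: signup['created_at'] }
    end
  end
end
```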
  ## Example Table
  The following is an example table.
@@ -32,7 +32,7 @@ module DataDuck
32
32
  end
33
33
 
34
34
  def self.acceptable_commands
35
- ['console', 'quickstart']
35
+ ['console', 'dbconsole', 'etl', 'quickstart', 'show']
36
36
  end
37
37
 
38
38
  def self.route_command(args)
@@ -46,16 +46,92 @@ module DataDuck
46
46
  return DataDuck::Commands.help
47
47
  end
48
48
 
49
- DataDuck::Commands.public_send(command)
49
+ DataDuck::Commands.public_send(command, *args[1..-1])
50
50
  end
51
51
 
52
52
  def self.console
53
53
  require "irb"
54
+ ARGV.clear
54
55
  IRB.start
55
56
  end
56
57
 
58
+ def self.dbconsole(where = "destination")
59
+ which_database = nil
60
+ if where == "destination"
61
+ which_database = DataDuck::Destination.only_destination
62
+ elsif where == "source"
63
+ which_database = DataDuck::Source.only_source
64
+ else
65
+ found_source = DataDuck::Source.source(where, true)
66
+ found_destination = DataDuck::Destination.destination(where, true)
67
+ if found_source && found_destination
68
+ raise ArgumentError.new("Ambiguous call to dbconsole for #{ where } since there is both a source and destination named #{ where }.")
69
+ end
70
+
71
+ which_database = found_source if found_source
72
+ which_database = found_destination if found_destination
73
+ end
74
+
75
+ if which_database.nil?
76
+ raise ArgumentError.new("Could not find database '#{ where }'")
77
+ end
78
+
79
+ puts "Connecting to #{ where }..."
80
+ which_database.dbconsole
81
+ end
82
+
83
+ def self.etl(what = nil)
84
+ if what.nil?
85
+ puts "You need to specify a table name or 'all'. Usage: dataduck etl all OR datduck etl my_table_name"
86
+ return
87
+ end
88
+
89
+ only_destination = DataDuck::Destination.only_destination
90
+
91
+ if what == "all"
92
+ etl = ETL.new(destinations: [only_destination], autoload_tables: true)
93
+ etl.process!
94
+ else
95
+ table_name_camelized = DataDuck::Util.underscore_to_camelcase(what)
96
+ require DataDuck.project_root + "/src/tables/#{ what }.rb"
97
+ table_class = Object.const_get(table_name_camelized)
98
+ if !(table_class <= DataDuck::Table)
99
+ raise Exception.new("Table class #{ table_name_camelized } must inherit from DataDuck::Table")
100
+ end
101
+
102
+ table = table_class.new
103
+ etl = ETL.new(destinations: [only_destination], autoload_tables: false, tables: [table])
104
+ etl.process_table!(table)
105
+ end
106
+ end
107
+
57
108
  def self.help
58
109
  puts "Usage: dataduck commandname"
110
+ puts "Commands: #{ acceptable_commands.sort.join(' ') }"
111
+ end
112
+
113
+ def self.show(table_name = nil)
114
+ if table_name.nil?
115
+ Dir[DataDuck.project_root + "/src/tables/*.rb"].each do |file|
116
+ table_name_underscores = file.split("/").last.gsub(".rb", "")
117
+ table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
118
+ require file
119
+ table = Object.const_get(table_name_camelized)
120
+ if table <= DataDuck::Table
121
+ puts table_name_underscores
122
+ end
123
+ end
124
+ else
125
+ table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name)
126
+ require DataDuck.project_root + "/src/tables/#{ table_name }.rb"
127
+ table_class = Object.const_get(table_name_camelized)
128
+ if !(table_class <= DataDuck::Table)
129
+ raise Exception.new("Table class #{ table_name_camelized } must inherit from DataDuck::Table")
130
+ end
131
+
132
+ table = table_class.new
133
+ table.show
134
+ end
59
135
  end
60
136
 
61
137
  def self.quickstart
@@ -0,0 +1,81 @@
1
+ module DataDuck
2
+ class Database
3
+ attr_accessor :name
4
+
5
+ def initialize(name, *args)
6
+ self.name = name
7
+ end
8
+
9
+ def connection
10
+ raise Exception.new("Must implement connection in subclass.")
11
+ end
12
+
13
+ def query
14
+ raise Exception.new("Must implement query in subclass.")
15
+ end
16
+
17
+ def table_names
18
+ raise Exception.new("Must implement query in subclass.")
19
+ end
20
+
21
+ protected
22
+
23
+ def find_command_and_execute(commands, *args)
24
+ # This function was originally sourced from Rails
25
+ # https://github.com/rails/rails
26
+ #
27
+ # Licensed under the MIT license
28
+ # http://opensource.org/licenses/MIT
29
+ #
30
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
31
+ # of this software and associated documentation files (the "Software"), to deal
32
+ # in the Software without restriction, including without limitation the rights
33
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34
+ # copies of the Software, and to permit persons to whom the Software is
35
+ # furnished to do so, subject to the following conditions:
36
+ #
37
+ # The above copyright notice and this permission notice shall be included in
38
+ # all copies or substantial portions of the Software.
39
+ #
40
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
46
+ # THE SOFTWARE.
47
+
48
+ commands = Array(commands)
49
+
50
+ dirs_on_path = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
51
+
52
+ full_path_command = nil
53
+ found = commands.detect do |cmd|
54
+ dirs_on_path.detect do |path|
55
+ full_path_command = File.join(path, cmd)
56
+ File.file?(full_path_command) && File.executable?(full_path_command)
57
+ end
58
+ end
59
+
60
+ if found
61
+ exec full_path_command, *args
62
+ else
63
+ abort("Couldn't find command: #{commands.join(', ')}. Check your $PATH and try again.")
64
+ end
65
+ end
66
+
67
+ def is_mutating_sql?(sql)
68
+ # This method is not exhaustive and should not be relied on as a guarantee; it is a
69
+ # sanity check to help ensure certain SQL is not mutating.
70
+
71
+ return true if sql.downcase.start_with?("drop table")
72
+ return true if sql.downcase.start_with?("create table")
73
+ return true if sql.downcase.start_with?("delete from")
74
+ return true if sql.downcase.start_with?("insert into")
75
+ return true if sql.downcase.start_with?("alter table")
76
+
77
+ false
78
+ end
79
+
80
+ end
81
+ end
@@ -1,5 +1,21 @@
1
1
  module DataDuck
2
- class Destination
2
+ class Destination < DataDuck::Database
3
+ def self.load_config!
4
+ all_config = DataDuck.config['destinations']
5
+ return if all_config.nil?
6
+
7
+ all_config.each_key do |destination_name|
8
+ configuration = all_config[destination_name]
9
+ destination_type = configuration['type']
10
+
11
+ if destination_type == "redshift"
12
+ DataDuck.destinations[destination_name] = DataDuck::RedshiftDestination.new(destination_name, configuration)
13
+ else
14
+ raise ArgumentError.new("Unknown type '#{ destination_type }' for destination #{ destination_name }.")
15
+ end
16
+ end
17
+ end
18
+
3
19
  def self.destination_config(name)
4
20
  if DataDuck.config['destinations'].nil? || DataDuck.config['destinations'][name.to_s].nil?
5
21
  raise Exception.new("Could not find destination #{ name } in destinations configs.")
@@ -12,21 +28,25 @@ module DataDuck
12
28
  raise Exception.new("Must implement load_table! in subclass")
13
29
  end
14
30
 
15
- def self.destination(destination_name)
16
- destination_name = destination_name.to_s
31
+ def self.destination(name, allow_nil = false)
32
+ name = name.to_s
17
33
 
18
- if DataDuck.destinations[destination_name]
19
- return DataDuck.destinations[destination_name]
34
+ if DataDuck.destinations[name]
35
+ return DataDuck.destinations[name]
36
+ elsif allow_nil
37
+ return nil
38
+ else
39
+ raise Exception.new("Could not find destination #{ name } in destination configs.")
20
40
  end
41
+ end
21
42
 
22
- destination_configuration = DataDuck::Destination.destination_config(destination_name)
23
- destination_type = destination_configuration['type']
24
- if destination_type == "redshift"
25
- DataDuck.destinations[destination_name] = DataDuck::RedshiftDestination.new(destination_configuration)
26
- return DataDuck.destinations[destination_name]
27
- else
28
- raise ArgumentError.new("Unknown type '#{ destination_type }' for destination #{ destination_name }.")
43
+ def self.only_destination
44
+ if DataDuck.destinations.keys.length != 1
45
+ raise ArgumentError.new("Must be exactly 1 destination.")
29
46
  end
47
+
48
+ destination_name = DataDuck.destinations.keys[0]
49
+ return DataDuck::Destination.destination(destination_name)
30
50
  end
31
51
  end
32
52
  end
data/lib/dataduck/etl.rb CHANGED
@@ -11,8 +11,13 @@ module DataDuck
11
11
  self.destinations << DataDuck::Destination.destination(destination_name)
12
12
  end
13
13
 
14
+ attr_accessor :destinations
15
+ attr_accessor :tables
16
+
14
17
  def initialize(options = {})
18
+ self.class.destinations ||= []
15
19
  @tables = options[:tables] || []
20
+ @destinations = options[:destinations] || []
16
21
 
17
22
  @autoload_tables = options[:autoload_tables].nil? ? true : options[:autoload_tables]
18
23
  if @autoload_tables
@@ -29,16 +34,28 @@ module DataDuck
29
34
  end
30
35
 
31
36
  def process!
32
- puts "Processing ETL..."
37
+ DataDuck::Logs.info "Processing ETL..."
38
+
39
+ destinations_to_use = []
40
+ destinations_to_use = destinations_to_use.concat(self.class.destinations)
41
+ destinations_to_use = destinations_to_use.concat(self.destinations)
42
+ destinations_to_use.uniq!
33
43
 
34
44
  @tables.each do |table_class|
35
45
  table_to_etl = table_class.new
36
- table_to_etl.extract!
37
- table_to_etl.transform!
38
- self.class.destinations.each do |destination|
39
- destination.load_table!(table_to_etl)
40
- end
46
+ table_to_etl.etl!(destinations_to_use)
41
47
  end
42
48
  end
49
+
50
+ def process_table!(table)
51
+ DataDuck::Logs.info "Processing ETL for table #{ table.name }..."
52
+
53
+ destinations_to_use = []
54
+ destinations_to_use = destinations_to_use.concat(self.class.destinations)
55
+ destinations_to_use = destinations_to_use.concat(self.destinations)
56
+ destinations_to_use.uniq!
57
+
58
+ table.etl!(destinations_to_use)
59
+ end
43
60
  end
44
61
  end
@@ -0,0 +1,34 @@
1
+ require 'logger'
2
+
3
+ module DataDuck
4
+ module Logs
5
+ @@ONE_MB_IN_BYTES = 1048576
6
+
7
+ @@logger = nil
8
+
9
+ def Logs.ensure_logger_exists!
10
+ log_file_path = DataDuck.project_root + '/log/dataduck.log'
11
+ DataDuck::Util.ensure_path_exists!(log_file_path)
12
+ @@logger ||= Logger.new(log_file_path, shift_age = 100, shift_size = 100 * @@ONE_MB_IN_BYTES)
13
+ end
14
+
15
+ def Logs.info(message)
16
+ self.ensure_logger_exists!
17
+ puts "[INFO] #{ message }"
18
+ @@logger.info(message)
19
+ end
20
+
21
+ def Logs.warn(message)
22
+ self.ensure_logger_exists!
23
+ puts "[WARN] #{ message }"
24
+ @@logger.warn(message)
25
+ end
26
+
27
+ def Logs.error(err, message = nil)
28
+ self.ensure_logger_exists!
29
+ message = err.to_s unless message
30
+ puts "[ERROR] #{ message }"
31
+ @@logger.error(message)
32
+ end
33
+ end
34
+ end
@@ -7,5 +7,16 @@ module DataDuck
7
7
  def db_type
8
8
  'mysql'
9
9
  end
10
+
11
+ def dbconsole(options = {})
12
+ args = []
13
+ args << "--host=#{ @host }"
14
+ args << "--user=#{ @username }"
15
+ args << "--database=#{ @database }"
16
+ args << "--port=#{ @port }"
17
+ args << "--password=#{ @password }"
18
+
19
+ self.find_command_and_execute("mysql", *args)
20
+ end
10
21
  end
11
22
  end
@@ -7,5 +7,23 @@ module DataDuck
7
7
  def db_type
8
8
  'postgres'
9
9
  end
10
+
11
+ def dbconsole(options = {})
12
+ args = []
13
+ args << "--host=#{ @host }"
14
+ args << "--username=#{ @username }"
15
+ args << "--dbname=#{ @database }"
16
+ args << "--port=#{ @port }"
17
+
18
+ ENV['PGPASSWORD'] = @password
19
+
20
+ self.find_command_and_execute("psql", *args)
21
+ end
22
+
23
+ def data_size_for_table(table_name)
24
+ size_in_bytes = self.query("SELECT pg_total_relation_size('#{ table_name }') AS size").first[:size].to_i
25
+ size_in_gb = size_in_bytes / 1_000_000_000.0
26
+ size_in_gb
27
+ end
10
28
  end
11
29
  end
@@ -2,7 +2,7 @@ require_relative 'destination.rb'
2
2
 
3
3
  module DataDuck
4
4
  class RedshiftDestination < DataDuck::Destination
5
- def initialize(config)
5
+ def initialize(name, config)
6
6
  @aws_key = config['aws_key']
7
7
  @aws_secret = config['aws_secret']
8
8
  @s3_bucket = config['s3_bucket']
@@ -14,6 +14,8 @@ module DataDuck
14
14
  @username = config['username']
15
15
  @password = config['password']
16
16
  @redshift_connection = nil
17
+
18
+ super
17
19
  end
18
20
 
19
21
  def connection
@@ -27,7 +29,7 @@ module DataDuck
27
29
  def copy_query(table, s3_path)
28
30
  properties_joined_string = "\"#{ table.output_column_names.join('","') }\""
29
31
  query_fragments = []
30
- query_fragments << "COPY #{ self.staging_table_name(table) } (#{ properties_joined_string })"
32
+ query_fragments << "COPY #{ table.staging_name } (#{ properties_joined_string })"
31
33
  query_fragments << "FROM '#{ s3_path }'"
32
34
  query_fragments << "CREDENTIALS 'aws_access_key_id=#{ @aws_key };aws_secret_access_key=#{ @aws_secret }'"
33
35
  query_fragments << "REGION '#{ @s3_region }'"
@@ -37,13 +39,13 @@ module DataDuck
37
39
  end
38
40
 
39
41
  def create_columns_on_data_warehouse!(table)
40
- columns = get_columns_in_data_warehouse(table)
42
+ columns = get_columns_in_data_warehouse(table.building_name)
41
43
  column_names = columns.map { |col| col[:name].to_s }
42
44
  table.output_schema.map do |name, data_type|
43
45
  if !column_names.include?(name.to_s)
44
46
  redshift_data_type = data_type.to_s
45
47
  redshift_data_type = 'varchar(255)' if redshift_data_type == 'string'
46
- self.run_query("ALTER TABLE #{ table.name } ADD #{ name } #{ redshift_data_type }")
48
+ self.query("ALTER TABLE #{ table.building_name } ADD #{ name } #{ redshift_data_type }")
47
49
  end
48
50
  end
49
51
  end
@@ -56,18 +58,21 @@ module DataDuck
56
58
  "\"#{ name }\" #{ redshift_data_type }"
57
59
  end
58
60
  props_string = props_array.join(', ')
59
- "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string })"
61
+
62
+ distribution_clause = table.distribution_key ? "DISTKEY(#{ table.distribution_key })" : ""
63
+ index_clause = table.indexes.length > 0 ? "INTERLEAVED SORTKEY (#{ table.indexes.join(',') })" : ""
64
+
65
+ "CREATE TABLE IF NOT EXISTS #{ table_name } (#{ props_string }) #{ distribution_clause } #{ index_clause }"
60
66
  end
61
67
 
62
- def create_output_table_on_data_warehouse!(table)
63
- self.run_query(self.create_table_query(table))
68
+ def create_output_tables!(table)
69
+ self.query(self.create_table_query(table, table.building_name))
64
70
  self.create_columns_on_data_warehouse!(table)
65
- end
66
71
 
67
- def create_staging_table!(table)
68
- table_name = self.staging_table_name(table)
69
- self.drop_staging_table!(table)
70
- self.run_query(self.create_table_query(table, table_name))
72
+ if table.building_name != table.staging_name
73
+ self.drop_staging_table!(table)
74
+ self.query(self.create_table_query(table, table.staging_name))
75
+ end
71
76
  end
72
77
 
73
78
  def data_as_csv_string(data, property_names)
@@ -94,13 +99,25 @@ module DataDuck
94
99
  return data_string_components.join
95
100
  end
96
101
 
102
+ def dbconsole(options = {})
103
+ args = []
104
+ args << "--host=#{ @host }"
105
+ args << "--username=#{ @username }"
106
+ args << "--dbname=#{ @database }"
107
+ args << "--port=#{ @port }"
108
+
109
+ ENV['PGPASSWORD'] = @password
110
+
111
+ self.find_command_and_execute("psql", *args)
112
+ end
113
+
97
114
  def drop_staging_table!(table)
98
- self.run_query("DROP TABLE IF EXISTS #{ self.staging_table_name(table) }")
115
+ self.query("DROP TABLE IF EXISTS #{ table.staging_name }")
99
116
  end
100
117
 
101
- def get_columns_in_data_warehouse(table)
102
- query = "SELECT pg_table_def.column as name, type as data_type, distkey, sortkey FROM pg_table_def WHERE tablename='#{ table.name }'"
103
- results = self.run_query(query)
118
+ def get_columns_in_data_warehouse(table_name)
119
+ cols_query = "SELECT pg_table_def.column AS name, type AS data_type, distkey, sortkey FROM pg_table_def WHERE tablename='#{ table_name }'"
120
+ results = self.query(cols_query)
104
121
 
105
122
  columns = []
106
123
  results.each do |result|
@@ -108,7 +125,7 @@ module DataDuck
108
125
  name: result[:name],
109
126
  data_type: result[:data_type],
110
127
  distkey: result[:distkey],
111
- sortkey: result[:sortkey]
128
+ sortkey: result[:sortkey],
112
129
  }
113
130
  end
114
131
 
@@ -116,20 +133,25 @@ module DataDuck
116
133
  end
117
134
 
118
135
  def merge_from_staging!(table)
136
+ if table.staging_name == table.building_name
137
+ return
138
+ end
139
+
119
140
  # Following guidelines in http://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html
120
- staging_name = self.staging_table_name(table)
121
- delete_query = "DELETE FROM #{ table.name } USING #{ staging_name } WHERE #{ table.name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
122
- self.run_query(delete_query)
123
- insert_query = "INSERT INTO #{ table.name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
124
- self.run_query(insert_query)
141
+ staging_name = table.staging_name
142
+ building_name = table.building_name
143
+ delete_query = "DELETE FROM #{ building_name } USING #{ staging_name } WHERE #{ building_name }.id = #{ staging_name }.id" # TODO allow custom or multiple keys
144
+ self.query(delete_query)
145
+ insert_query = "INSERT INTO #{ building_name } (\"#{ table.output_column_names.join('","') }\") SELECT \"#{ table.output_column_names.join('","') }\" FROM #{ staging_name }"
146
+ self.query(insert_query)
125
147
  end
126
148
 
127
- def run_query(sql)
149
+ def query(sql)
128
150
  self.connection[sql].map { |elem| elem }
129
151
  end
130
152
 
131
- def staging_table_name(table)
132
- "zz_dataduck_#{ table.name }"
153
+ def table_names
154
+ self.query("SELECT DISTINCT(tablename) AS name FROM pg_table_def WHERE schemaname='public' ORDER BY name").map { |item| item[:name] }
133
155
  end
134
156
 
135
157
  def upload_table_to_s3!(table)
@@ -144,14 +166,28 @@ module DataDuck
144
166
  return s3_obj
145
167
  end
146
168
 
169
+ def finish_fully_reloading_table!(table)
170
+ self.query("DROP TABLE IF EXISTS dataduck_zz_old_#{ table.name }")
171
+
172
+ table_already_exists = self.table_names.include?(table.name)
173
+ if table_already_exists
174
+ self.query("ALTER TABLE #{ table.name } RENAME TO dataduck_zz_old_#{ table.name }")
175
+ end
176
+
177
+ self.query("ALTER TABLE #{ table.staging_name } RENAME TO #{ table.name }")
178
+ self.query("DROP TABLE IF EXISTS dataduck_zz_old_#{ table.name }")
179
+ end
180
+
147
181
  def load_table!(table)
148
- puts "Loading table #{ table.name }..."
182
+ DataDuck::Logs.info "Loading table #{ table.name }..."
149
183
  s3_object = self.upload_table_to_s3!(table)
150
- self.create_staging_table!(table)
151
- self.create_output_table_on_data_warehouse!(table)
152
- self.run_query(self.copy_query(table, s3_object.s3_path))
153
- self.merge_from_staging!(table)
154
- self.drop_staging_table!(table)
184
+ self.create_output_tables!(table)
185
+ self.query(self.copy_query(table, s3_object.s3_path))
186
+
187
+ if table.staging_name != table.building_name
188
+ self.merge_from_staging!(table)
189
+ self.drop_staging_table!(table)
190
+ end
155
191
  end
156
192
 
157
193
  def self.value_to_string(value)
@@ -1,6 +1,23 @@
1
1
  module DataDuck
2
+ class Source < DataDuck::Database
3
+ def self.load_config!
4
+ all_sources = DataDuck.config['sources']
5
+ return if all_sources.nil?
6
+
7
+ all_sources.each_key do |source_name|
8
+ configuration = all_sources[source_name]
9
+ source_type = configuration['type']
10
+
11
+ if source_type == "postgresql"
12
+ DataDuck.sources[source_name] = DataDuck::PostgresqlSource.new(source_name, configuration)
13
+ elsif source_type == "mysql"
14
+ DataDuck.sources[source_name] = DataDuck::MysqlSource.new(source_name, configuration)
15
+ else
16
+ raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ source_name }.")
17
+ end
18
+ end
19
+ end
2
20
 
3
- class Source
4
21
  def self.source_config(name)
5
22
  if DataDuck.config['sources'].nil? || DataDuck.config['sources'][name.to_s].nil?
6
23
  raise Exception.new("Could not find source #{ name } in source configs.")
@@ -9,33 +26,25 @@ module DataDuck
9
26
  DataDuck.config['sources'][name.to_s]
10
27
  end
11
28
 
12
- def self.source(name)
29
+ def self.source(name, allow_nil = false)
13
30
  name = name.to_s
14
31
 
15
32
  if DataDuck.sources[name]
16
33
  return DataDuck.sources[name]
17
- end
18
-
19
- configuration = DataDuck::Source.source_config(name)
20
- source_type = configuration['type']
21
-
22
- if source_type == "postgresql"
23
- DataDuck.sources[name] = DataDuck::PostgresqlSource.new(configuration)
24
- return DataDuck.sources[name]
25
- elsif source_type == "mysql"
26
- DataDuck.sources[name] = DataDuck::MysqlSource.new(configuration)
27
- return DataDuck.sources[name]
34
+ elsif allow_nil
35
+ return nil
28
36
  else
29
- raise ArgumentError.new("Unknown type '#{ source_type }' for source #{ name }.")
37
+ raise Exception.new("Could not find source #{ name } in source configs.")
30
38
  end
31
39
  end
32
40
 
33
- def connection
34
- raise Exception.new("Must implement connection in subclass.")
35
- end
41
+ def self.only_source
42
+ if DataDuck.sources.keys.length != 1
43
+ raise ArgumentError.new("Must be exactly 1 source.")
44
+ end
36
45
 
37
- def query
38
- raise Exception.new("Must implement query in subclass.")
46
+ source_name = DataDuck.sources.keys[0]
47
+ return DataDuck::Source.source(source_name)
39
48
  end
40
49
 
41
50
  def schema(table_name)
@@ -4,13 +4,15 @@ require 'sequel'
4
4
 
5
5
  module DataDuck
6
6
  class SqlDbSource < DataDuck::Source
7
- def initialize(data)
7
+ def initialize(name, data)
8
8
  @host = data['host']
9
9
  @port = data['port']
10
10
  @username = data['username']
11
11
  @password = data['password']
12
12
  @database = data['database']
13
13
  @initialized_db_type = data['db_type']
14
+
15
+ super
14
16
  end
15
17
 
16
18
  def connection
@@ -35,6 +37,10 @@ module DataDuck
35
37
  end
36
38
 
37
39
  def query(sql)
40
+ if self.is_mutating_sql?(sql)
41
+ raise ArgumentError.new("Database #{ self.name } must not run mutating sql: #{ sql }")
42
+ end
43
+
38
44
  self.connection.fetch(sql).all
39
45
  end
40
46
  end
@@ -46,38 +46,150 @@ module DataDuck
46
46
  self.class.actions
47
47
  end
48
48
 
49
- def output_schema
50
- self.class.output_schema
49
+ def check_table_valid!
50
+ if !self.batch_size.nil?
51
+ raise Exception.new("Table #{ self.name }'s batch_size must be > 0") unless self.batch_size > 0
52
+ raise Exception.new("Table #{ self.name } has batch_size defined but no extract_by_column") if self.extract_by_column.nil?
53
+ end
51
54
  end
52
55
 
53
- def output_column_names
54
- self.class.output_schema.keys.sort
56
+ def distribution_key
57
+ if self.output_column_names.include?("id")
58
+ "id"
59
+ else
60
+ nil
61
+ end
55
62
  end
56
63
 
57
- def extract!
58
- puts "Extracting table #{ self.name }..."
64
+ def etl!(destinations)
65
+ if destinations.length != 1
66
+ raise ArgumentError.new("DataDuck can only etl to one destination at a time for now.")
67
+ end
68
+ self.check_table_valid!
69
+ destination = destinations.first
70
+
71
+ if self.should_fully_reload?
72
+ destination.drop_staging_table!(self)
73
+ end
74
+
75
+ batch_number = 0
76
+ while batch_number < 1_000
77
+ batch_number += 1
78
+ self.extract!(destination)
79
+ self.transform!
80
+ destination.load_table!(self)
81
+
82
+ if self.batch_size.nil?
83
+ break
84
+ else
85
+ if self.batch_size == self.data.length
86
+ DataDuck::Logs.info "Finished batch #{ batch_number }, continuing with the next batch"
87
+ else
88
+ DataDuck::Logs.info "Finished batch #{ batch_number } (last batch)"
89
+ break
90
+ end
91
+ end
92
+ end
93
+
94
+ self.data = []
95
+
96
+ if self.should_fully_reload?
97
+ destination.finish_fully_reloading_table!(self)
98
+ end
99
+ end
100
+
101
+ def extract!(destination = nil)
102
+ DataDuck::Logs.info "Extracting table #{ self.name }"
59
103
 
60
104
  self.errors ||= []
61
105
  self.data = []
62
106
  self.class.sources.each do |source_spec|
63
107
  source = source_spec[:source]
64
- my_query = self.extract_query(source_spec)
108
+ my_query = self.extract_query(source_spec, destination)
65
109
  results = source.query(my_query)
66
110
  self.data = results
67
111
  end
68
112
  self.data
69
113
  end
70
114
 
71
- def extract_query(source_spec)
72
- if source_spec.has_key?(:query)
73
- query
74
- else
75
- "SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
115
+ def extract_query(source_spec, destination = nil)
116
+ base_query = source_spec.has_key?(:query) ? source_spec[:query] :
117
+ "SELECT \"#{ source_spec[:columns].sort.join('","') }\" FROM #{ source_spec[:table_name] }"
118
+
119
+ extract_by_clause = ""
120
+ limit_clause = ""
121
+
122
+ if self.extract_by_column
123
+ if destination.table_names.include?(self.building_name)
124
+ extract_by_value = destination.query("SELECT MAX(#{ self.extract_by_column }) AS val FROM #{ self.building_name }").first
125
+ extract_by_value = extract_by_value.nil? ? nil : extract_by_value[:val]
126
+
127
+ if extract_by_value
128
+ extract_by_clause = "WHERE #{ self.extract_by_column } >= '#{ extract_by_value }'"
129
+ end
130
+ end
131
+
132
+ limit_clause = self.batch_size ? "ORDER BY #{ self.extract_by_column } LIMIT #{ self.batch_size }" : ""
133
+ end
134
+
135
+ [base_query, extract_by_clause, limit_clause].join(' ').strip
136
+ end
137
+
138
+ def indexes
139
+ which_columns = []
140
+ which_columns << "id" if self.output_column_names.include?("id")
141
+ which_columns << "created_at" if self.output_column_names.include?("created_at")
142
+ which_columns
143
+ end
144
+
145
+ def batch_size
146
+ nil
147
+ end
148
+
149
+ def extract_by_column
150
+ return 'updated_at' if self.output_column_names.include?("updated_at")
151
+
152
+ nil
153
+ end
154
+
155
+ def should_fully_reload?
156
+ false # Set to true if you want to fully reload a table with each ETL
157
+ end
158
+
159
+ def building_name
160
+ self.should_fully_reload? ? self.staging_name : self.name
161
+ end
162
+
163
+ def staging_name
164
+ "zz_dataduck_#{ self.name }"
165
+ end
166
+
167
+ def output_schema
168
+ self.class.output_schema
169
+ end
170
+
171
+ def output_column_names
172
+ self.class.output_schema.keys.sort.map(&:to_s)
173
+ end
174
+
175
+ def show
176
+ puts "Table #{ self.name }"
177
+ self.class.sources.each do |source_spec|
178
+ puts "\nSources from #{ source_spec[:table_name] || source_spec[:query] } on #{ source_spec[:source].name }"
179
+ source_spec[:columns].each do |col_name|
180
+ puts " #{ col_name }"
181
+ end
182
+ end
183
+
184
+ puts "\nOutputs "
185
+ num_separators = self.output_schema.keys.map { |key| key.length }.max
186
+ self.output_schema.each_pair do |name, datatype|
187
+ puts " #{ name }#{ ' ' * (num_separators + 2 - name.length) }#{ datatype }"
76
188
  end
77
189
  end
78
190
 
79
191
  def transform!
80
- puts "Transforming table #{ self.name }..."
192
+ DataDuck::Logs.info "Transforming table #{ self.name }"
81
193
 
82
194
  self.errors ||= []
83
195
  self.class.actions ||= []
data/lib/dataduck/util.rb CHANGED
@@ -1,10 +1,20 @@
1
+ require 'fileutils'
2
+
1
3
  module DataDuck
2
- class Util
3
- def self.underscore_to_camelcase(str)
4
+ module Util
5
+ def Util.ensure_path_exists!(full_path)
6
+ split_paths = full_path.split('/')
7
+ just_file_path = split_paths.pop
8
+ directory_path = split_paths.join('/')
9
+ FileUtils.mkdir_p(directory_path)
10
+ FileUtils.touch("#{ directory_path }/#{ just_file_path }")
11
+ end
12
+
13
+ def Util.underscore_to_camelcase(str)
4
14
  str.split('_').map{ |chunk| chunk.capitalize }.join
5
15
  end
6
16
 
7
- def self.camelcase_to_underscore(str)
17
+ def Util.camelcase_to_underscore(str)
8
18
  str.gsub(/::/, '/')
9
19
  .gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2')
10
20
  .gsub(/([a-z\d])([A-Z])/,'\1_\2')
@@ -1,6 +1,6 @@
1
1
  module DataDuck
2
2
  VERSION_MAJOR = 0
3
- VERSION_MINOR = 4
3
+ VERSION_MINOR = 5
4
4
  VERSION_PATCH = 0
5
5
  VERSION = [VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH].join('.')
6
6
  end
data/lib/dataduck.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'yaml'
2
+
1
3
  Dir[File.dirname(__FILE__) + '/helpers/*.rb'].each do |file|
2
4
  require file
3
5
  end
@@ -6,13 +8,11 @@ Dir[File.dirname(__FILE__) + '/dataduck/*.rb'].each do |file|
6
8
  require file
7
9
  end
8
10
 
9
- require 'yaml'
10
-
11
11
  module DataDuck
12
12
  extend ModuleVars
13
13
 
14
14
  ENV['DATADUCK_ENV'] ||= "development"
15
- create_module_var("environment", ENV['DATADUCK_ENV'])
15
+ create_module_var("environment", ENV['DATADUCK_ENV'])
16
16
 
17
17
  spec = Gem::Specification.find_by_name("dataduck")
18
18
  create_module_var("gem_root", spec.gem_dir)
@@ -26,4 +26,13 @@ module DataDuck
26
26
 
27
27
  create_module_var("sources", {})
28
28
  create_module_var("destinations", {})
29
+
30
+ DataDuck::Source.load_config!
31
+ DataDuck::Destination.load_config!
32
+
33
+ Dir[DataDuck.project_root + "/src/tables/*.rb"].each do |file|
34
+ table_name_underscores = file.split("/").last.gsub(".rb", "")
35
+ table_name_camelized = DataDuck::Util.underscore_to_camelcase(table_name_underscores)
36
+ require file
37
+ end
29
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataduck
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeff Pickhardt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-14 00:00:00.000000000 Z
11
+ date: 2015-10-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -143,6 +143,12 @@ files:
143
143
  - bin/setup
144
144
  - dataduck.gemspec
145
145
  - docs/README.md
146
+ - docs/commands/README.md
147
+ - docs/commands/console.md
148
+ - docs/commands/dbconsole.md
149
+ - docs/commands/etl.md
150
+ - docs/commands/quickstart.md
151
+ - docs/commands/show.md
146
152
  - docs/contents.yml
147
153
  - docs/overview/README.md
148
154
  - docs/overview/getting_started.md
@@ -157,8 +163,10 @@ files:
157
163
  - examples/example/src/tables/users.rb
158
164
  - lib/dataduck.rb
159
165
  - lib/dataduck/commands.rb
166
+ - lib/dataduck/database.rb
160
167
  - lib/dataduck/destination.rb
161
168
  - lib/dataduck/etl.rb
169
+ - lib/dataduck/logs.rb
162
170
  - lib/dataduck/mysql_source.rb
163
171
  - lib/dataduck/postgresql_source.rb
164
172
  - lib/dataduck/redshift_destination.rb