forklift_etl 1.0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (121) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rbenv-version +1 -0
  4. data/.travis.yml +10 -0
  5. data/Gemfile +10 -0
  6. data/Gemfile.lock +74 -0
  7. data/Rakefile +13 -0
  8. data/bin/forklift +61 -0
  9. data/doc/EmailSuffix.html +228 -0
  10. data/doc/Forklift.html +187 -0
  11. data/doc/Forklift/Base.html +167 -0
  12. data/doc/Forklift/Base/Connection.html +590 -0
  13. data/doc/Forklift/Base/Logger.html +453 -0
  14. data/doc/Forklift/Base/Mailer.html +399 -0
  15. data/doc/Forklift/Base/Mailer/ERBBinding.html +256 -0
  16. data/doc/Forklift/Base/Pid.html +489 -0
  17. data/doc/Forklift/Base/Utils.html +252 -0
  18. data/doc/Forklift/Connection.html +164 -0
  19. data/doc/Forklift/Connection/Elasticsearch.html +419 -0
  20. data/doc/Forklift/Connection/Mysql.html +939 -0
  21. data/doc/Forklift/Patterns.html +164 -0
  22. data/doc/Forklift/Patterns/Elasticsearch.html +169 -0
  23. data/doc/Forklift/Patterns/Mysql.html +402 -0
  24. data/doc/Forklift/Plan.html +704 -0
  25. data/doc/Gemfile.html +132 -0
  26. data/doc/Object.html +326 -0
  27. data/doc/Rakefile.html +138 -0
  28. data/doc/SpecClient.html +291 -0
  29. data/doc/SpecPlan.html +253 -0
  30. data/doc/SpecSeeds.html +303 -0
  31. data/doc/created.rid +35 -0
  32. data/doc/example/Gemfile.html +129 -0
  33. data/doc/images/add.png +0 -0
  34. data/doc/images/brick.png +0 -0
  35. data/doc/images/brick_link.png +0 -0
  36. data/doc/images/bug.png +0 -0
  37. data/doc/images/bullet_black.png +0 -0
  38. data/doc/images/bullet_toggle_minus.png +0 -0
  39. data/doc/images/bullet_toggle_plus.png +0 -0
  40. data/doc/images/date.png +0 -0
  41. data/doc/images/delete.png +0 -0
  42. data/doc/images/find.png +0 -0
  43. data/doc/images/loadingAnimation.gif +0 -0
  44. data/doc/images/macFFBgHack.png +0 -0
  45. data/doc/images/package.png +0 -0
  46. data/doc/images/page_green.png +0 -0
  47. data/doc/images/page_white_text.png +0 -0
  48. data/doc/images/page_white_width.png +0 -0
  49. data/doc/images/plugin.png +0 -0
  50. data/doc/images/ruby.png +0 -0
  51. data/doc/images/tag_blue.png +0 -0
  52. data/doc/images/tag_green.png +0 -0
  53. data/doc/images/transparent.png +0 -0
  54. data/doc/images/wrench.png +0 -0
  55. data/doc/images/wrench_orange.png +0 -0
  56. data/doc/images/zoom.png +0 -0
  57. data/doc/index.html +122 -0
  58. data/doc/js/darkfish.js +155 -0
  59. data/doc/js/jquery.js +18 -0
  60. data/doc/js/navigation.js +142 -0
  61. data/doc/js/search.js +94 -0
  62. data/doc/js/search_index.js +1 -0
  63. data/doc/js/searcher.js +228 -0
  64. data/doc/rdoc.css +543 -0
  65. data/doc/table_of_contents.html +309 -0
  66. data/example/Gemfile +3 -0
  67. data/example/Gemfile.lock +55 -0
  68. data/example/config/connections/elasticsearch/source.yml +1 -0
  69. data/example/config/connections/mysql/destination.yml +6 -0
  70. data/example/config/connections/mysql/source.yml +6 -0
  71. data/example/config/email.yml +18 -0
  72. data/example/plan.rb +87 -0
  73. data/example/template/email.erb +6 -0
  74. data/example/transformations/cleanup.sql +1 -0
  75. data/example/transformations/combined_name.sql +7 -0
  76. data/example/transformations/email_suffix.rb +20 -0
  77. data/forklift.jpg +0 -0
  78. data/forklift_etl.gemspec +28 -0
  79. data/lib/forklift/base/connection.rb +72 -0
  80. data/lib/forklift/base/logger.rb +49 -0
  81. data/lib/forklift/base/mailer.rb +83 -0
  82. data/lib/forklift/base/pid.rb +55 -0
  83. data/lib/forklift/base/utils.rb +23 -0
  84. data/lib/forklift/forklift.rb +19 -0
  85. data/lib/forklift/patterns/elasticsearch_patterns.rb +7 -0
  86. data/lib/forklift/patterns/mysql_patterns.rb +87 -0
  87. data/lib/forklift/plan.rb +138 -0
  88. data/lib/forklift/transports/elasticsearch.rb +75 -0
  89. data/lib/forklift/transports/mysql.rb +241 -0
  90. data/lib/forklift/version.rb +3 -0
  91. data/readme.md +410 -0
  92. data/spec/config/connections/elasticsearch/forklift_test.yml +1 -0
  93. data/spec/config/connections/mysql/forklift_test_destination.yml +6 -0
  94. data/spec/config/connections/mysql/forklift_test_source_a.yml +6 -0
  95. data/spec/config/connections/mysql/forklift_test_source_b.yml +6 -0
  96. data/spec/config/connections/mysql/forklift_test_working.yml +6 -0
  97. data/spec/config/email.yml +4 -0
  98. data/spec/integration/basic_spec.rb +29 -0
  99. data/spec/integration/elasticsearch_patterns_spec.rb +5 -0
  100. data/spec/integration/elasticsearch_spec.rb +95 -0
  101. data/spec/integration/multi_transport_spec.rb +112 -0
  102. data/spec/integration/mysql_patterns_spec.rb +76 -0
  103. data/spec/integration/mysql_spec.rb +138 -0
  104. data/spec/spec_helper.rb +30 -0
  105. data/spec/support/dumps/elasticsearch/forklift_test.json +7 -0
  106. data/spec/support/dumps/mysql/forklift_test_source_a.sql +79 -0
  107. data/spec/support/dumps/mysql/forklift_test_source_b.sql +23 -0
  108. data/spec/support/spec_client.rb +30 -0
  109. data/spec/support/spec_plan.rb +15 -0
  110. data/spec/support/spec_seeds.rb +69 -0
  111. data/spec/template/spec_email_template.erb +4 -0
  112. data/spec/unit/connection/mysql_spec.rb +102 -0
  113. data/spec/unit/misc/email_spec.rb +37 -0
  114. data/spec/unit/misc/pid_spec.rb +25 -0
  115. data/spec/unit/misc/step_spec.rb +53 -0
  116. data/template/destination.yml +6 -0
  117. data/template/email.erb +1 -0
  118. data/template/email.yml +18 -0
  119. data/template/plan.rb +10 -0
  120. data/template/source.yml +6 -0
  121. metadata +289 -0
<%# Example forklift status email template: the plan supplies the two
    instance variables below via Mailer#send_template. %>
<h1>Your forklift email</h1>

<ul>
<li><strong>Total Users</strong>: <%= @total_users_count %></li>
<li><strong>New Users</strong>: <%= @new_users_count %></li>
</ul>
-- Example cleanup transformation: drops the derived `combined_name`
-- column (added by combined_name.sql) from `users`.
ALTER TABLE `users` DROP `combined_name`;
-- Example transformation: add a `combined_name` column to `users`,
-- fill it from first_name + last_name, and index it.
ALTER TABLE `users` ADD `combined_name` VARCHAR(255) NULL DEFAULT NULL AFTER `last_name`;

-- The scalar subquery references the current row's first_name/last_name.
UPDATE `users` SET `combined_name` = (
  select CONCAT(first_name, " ", last_name)
);

CREATE INDEX combined_name ON users (combined_name);
# Example ruby transformation: tallies the domain suffix (the part after
# "@") of every user email and logs any suffix seen more than 5 times.
class EmailSuffix

  # connection - a Forklift connection exposing #read(sql) { |rows| ... }
  # forklift   - the Forklift plan; used here only for its logger
  def do!(connection, forklift)
    forklift.logger.log "collecting email suffixes..."

    suffixes = Hash.new(0) # default of 0 replaces the manual nil-check
    connection.read("select email from users") do |data|
      data.each do |row|
        email = row[:email]
        # why: a NULL email column previously raised NoMethodError on
        # nil.split; malformed addresses without an "@" are skipped too
        next if email.nil? || !email.include?('@')
        suffixes[email.split('@').last] += 1
      end
    end

    suffixes.each do |suffix, count|
      forklift.logger.log " > #{suffix}: #{count}" if count > 5
    end
  end

end
data/forklift.jpg ADDED
Binary file
# -*- encoding: utf-8 -*-
# Gem specification for forklift_etl. File lists are built from
# `git ls-files`, so this spec must be evaluated inside the git checkout.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'forklift/version'

Gem::Specification.new do |s|
  s.name = "forklift_etl"
  s.version = Forklift::VERSION
  s.authors = ["Evan Tahler"]
  s.email = ["evan@taskrabbit.com"]
  s.homepage = "https://github.com/taskrabbit/forklift"
  s.summary = %q{Forklift: Moving big databases around. A ruby ETL tool.}
  s.description = %q{A collection of ETL tools and patterns for mysql and elasticsearch.}
  s.license = "MIT"

  # NOTE(review): rubyforge_project is deprecated in modern RubyGems;
  # harmless here but safe to remove when bumping tooling.
  s.rubyforge_project = "forklift_etl"

  # Package everything git tracks; executables come from bin/.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "activesupport", '~> 4.0', ">= 4.0.0"
  s.add_dependency "mysql2", '~> 0.0', ">= 0.0.1"
  s.add_dependency "elasticsearch", '~> 1.0', ">= 1.0.0"
  s.add_dependency "pony", '~> 1.0', ">= 1.0.0"
  s.add_dependency "lumberjack", '~> 1.0', ">= 1.0.0"
end
module Forklift
  module Base
    # Abstract transport connection. Concrete transports (mysql,
    # elasticsearch) subclass this and implement connect/disconnect/
    # read/write/pipe/exec_script.
    class Connection

      # config - the parsed connection configuration Hash
      # client - the underlying driver client; set by a subclass' #connect
      attr_reader :config, :client

      def initialize(config)
        @config = config
      end

      def connect
        # Will define @client
        raise 'not implemented'
      end

      def disconnect
        raise 'not implemented'
      end

      # Returns an array of data rows for the given query.
      def read(query)
        raise 'not implemented'
      end

      # Writes an array of rows to the named collection (table).
      def write(data, collection)
        raise 'not implemented'
      end

      # Optional fast-path when copying within the same connection.
      def pipe
        raise 'not implemented'
      end

      # Run the script at +path+, logging rather than raising on failure.
      # NOTE(review): `forklift` is not defined on this class — subclasses
      # appear expected to provide it; confirm against the transports.
      def exec(path)
        exec!(path)
      rescue StandardError => e
        # why: was `rescue Exception`, which also swallowed SystemExit,
        # signals and memory errors; StandardError keeps the best-effort
        # behavior without hiding process-level failures.
        forklift.logger.log(e)
      end

      # Run the script at +path+, raising on failure. Ruby files are
      # loaded and their class' #do! invoked; everything else goes to
      # the transport-specific exec_script.
      def exec!(path)
        forklift.logger.log "Running script: #{path}"
        extension = path.split(".").last
        if(extension == "rb" || extension == "ruby")
          exec_ruby(path)
        else
          exec_script(path)
        end
      end

      # Load a ruby transformation file and run #do!(connection, forklift)
      # on an instance of the class named after the file.
      def exec_ruby(path)
        klass = forklift.utils.class_name_from_file(path)
        require path
        # why: const_get resolves the same class names as the previous
        # `eval("#{klass}.new")` without evaluating an arbitrary string.
        model = Object.const_get(klass).new
        model.do!(self, forklift)
      end

      def exec_script(path)
        raise 'not implemented'
      end

    end
  end
end
require 'lumberjack'

module Forklift
  module Base
    # Run logger: writes timestamped messages to stdout (when enabled),
    # to <project_root>/log/forklift.log, and to an in-memory buffer
    # that the mailer can attach to status emails.
    class Logger

      attr_reader :forklift

      def initialize(forklift)
        @forklift = forklift
      end

      # All messages logged so far in this run.
      def messages
        @messages ||= []
      end

      # Lazily-built Lumberjack file logger (unbuffered).
      def logger
        log_dir = "#{forklift.config[:project_root]}/log"
        @logger ||= ::Lumberjack::Logger.new("#{log_dir}/forklift.log", :buffer_size => 0)
      end

      # Log +message+ at +severity+ (a Lumberjack level name, "info" by default).
      def log(message, severity="info")
        timed_message = "[Forklift @ #{Time.now}] #{message}"
        # why: was `puts ... unless config[:logger][:stdout] != true` — a
        # double negative; this is the equivalent positive form.
        puts timed_message if forklift.config[:logger][:stdout] == true
        logger.send(severity.to_sym, message) unless logger.nil?
        messages << timed_message
      end

      # Log only when config[:logger][:debug] is enabled.
      def debug(message)
        log("[debug] #{message}") if forklift.config[:logger][:debug] == true
      end

      # Log a message wrapped in blank lines and asterisks for emphasis.
      def emphatically(message)
        log "" if message.length > 0
        log "*** #{message} ***"
        log ""
      end

      def fatal(message)
        log "!!! #{message} !!!"
      end

    end
  end
end
require 'pony'
require 'erb'
require 'active_support/core_ext/hash/keys'

module Forklift
  module Base
    # Sends run-status emails via Pony, configured by config/email.yml.
    class Mailer

      def initialize(forklift)
        @forklift = forklift
      end

      # Public: Pull out the settings from config/email.yml.
      #
      # Returns a Hash with all symbolized keys.
      def config
        config_file = "#{forklift.config[:project_root]}/config/email.yml"
        @config ||= forklift.utils.load_yml(config_file).deep_symbolize_keys
      end

      def forklift
        @forklift
      end

      # Default :from/:subject/:body values; caller-supplied args win.
      def message_defaults
        {
          :from => "Forklift",
          :subject => "Forklift has moved your database @ #{Time.new}",
          :body => "Forklift has moved your database @ #{Time.new}",
        }
      end

      # Render +template_file+ (ERB) with +variables+ as the mail body,
      # then deliver via #send.
      def send_template(args, template_file, variables, attachment_lines=[])
        renderer = ERB.new(File.read(template_file))
        binder = ERBBinding.new(variables)
        body = renderer.result(binder.get_binding)
        args[:body] = body
        send(args, attachment_lines)
      end

      # Build and deliver a message. attachment_lines, when present, are
      # joined into a "log.txt" attachment.
      # NOTE(review): this overrides Object#send for this class — any
      # internal dynamic dispatch must use __send__ instead.
      def send(args, attachment_lines=[])
        params = message_defaults
        [:to, :from, :subject, :body].each do |i|
          params[i] = args[i] unless args[i].nil?
        end
        if attachment_lines.length > 0
          params[:attachments] = {"log.txt" => attachment_lines.join("\r\n")}
        end
        deliver(params)
      end

      private

      # Private: Actually deliver the message using Pony.
      #
      # Returns the raw email from Pony.
      def deliver(params)
        forklift.logger.log("Sending email via #{config[:via]}")
        # Pony treats :html_body as the HTML part; the plain :body key is
        # removed once promoted so the message isn't sent twice.
        if params[:html_body].nil?
          params[:html_body] = params[:body]
          params.delete(:body)
        end
        params[:via] = config[:via].to_sym
        params[:via_options] = config[:via_options]
        Pony.mail(params)
      end

      # Exposes a binding whose instance variables come from +hash+, used
      # to feed variables into ERB templates.
      class ERBBinding
        def initialize(hash)
          hash.each do |k,v|
            # why: single quotes are blanked from string values —
            # presumably to keep them from breaking template quoting;
            # TODO confirm intent.
            v = v.gsub("'", " ") if v.class == String
            instance_variable_set("@#{k}", v)
          end
        end

        def get_binding
          return binding()
        end
      end

    end
  end
end
module Forklift
  module Base
    # Pidfile management: prevents two forklift runs of the same project
    # from executing concurrently.
    class Pid

      attr_reader :forklift

      def initialize(forklift)
        @forklift = forklift
      end

      # Directory holding the pidfile: <project_root>/pid
      def pid_dir
        "#{forklift.config[:project_root]}/pid"
      end

      def ensure_pid_dir
        # why: FileUtils.mkdir_p instead of shelling out to `mkdir -p` —
        # no subprocess, and consistent with FileUtils.rm in #delete!
        FileUtils.mkdir_p(pid_dir)
      end

      def pidfile
        "#{pid_dir}/pidfile"
      end

      # Write the current process id to the pidfile.
      def store!
        forklift.logger.debug "Creating pidfile @ #{pidfile}"
        ensure_pid_dir
        File.open(pidfile, 'w') {|f| f << Process.pid}
      end

      # Read the previously stored pid, or nil when absent/unreadable.
      def recall
        ensure_pid_dir
        IO.read(pidfile).to_i rescue nil
      end

      def delete!
        forklift.logger.debug "Removing pidfile @ #{pidfile}"
        FileUtils.rm(pidfile) rescue nil
      end

      # Exits the process when another forklift with the stored pid is
      # still running; otherwise clears the stale pidfile.
      # NOTE(review): relies on `ps -p <pid> | wc -l`, so POSIX-only.
      def safe_to_run?
        return if recall.nil?
        count = `ps -p #{recall} | wc -l`.to_i
        if count >= 2
          forklift.logger.fatal "This application is already running (pidfile) #{recall}. Exiting now"
          exit(1)
        else
          forklift.logger.log "Clearing old pidfile from previous process #{recall}"
          delete!
        end
      end

    end
  end
end
require 'yaml'
require 'erb'

module Forklift
  module Base
    # Small stateless helpers shared across forklift.
    class Utils

      # Render the YAML file at +file+ through ERB, then parse it.
      # NOTE(review): uses YAML.load — appropriate only for trusted,
      # project-local config files; never point this at untrusted input.
      def load_yml(file)
        rendered = ERB.new(File.read(file)).result
        YAML.load(rendered)
      end

      # Derive a CamelCase class name from a file path:
      #   "/a/b/email_suffix.rb" => "EmailSuffix"
      def class_name_from_file(file)
        basename = file.split("/").last.split(".").first
        basename.split("_").map(&:capitalize).join
      end

    end
  end
end
require 'rubygems'

# Library entry point: loads the base classes, the bundled transports
# and patterns, any project-local transports/patterns found under the
# current working directory, and finally the Plan.
module Forklift

  lib = File.expand_path(File.dirname(__FILE__))

  require "#{lib}/base/utils.rb"
  require "#{lib}/base/pid.rb"
  require "#{lib}/base/logger.rb"
  require "#{lib}/base/mailer.rb"
  require "#{lib}/base/connection.rb"

  # Bundled transports and patterns.
  Dir["#{lib}/transports/*.rb"].each {|file| require file }
  Dir["#{lib}/patterns/*.rb"].each {|file| require file }
  # Project-local extensions, when the working directory provides them.
  Dir["#{Dir.pwd}/transports/*.rb"].each {|file| require file } if File.directory?("#{Dir.pwd}/transports")
  Dir["#{Dir.pwd}/patterns/*.rb"].each {|file| require file } if File.directory?("#{Dir.pwd}/patterns")

  require "#{lib}/plan.rb"
end
module Forklift
  module Patterns
    # Placeholder for elasticsearch copy patterns; no patterns are
    # implemented yet (see Patterns::Mysql for the mysql equivalents).
    class Elasticsearch

    end
  end
end
module Forklift
  module Patterns
    # SQL-side copy patterns for mysql connections.
    class Mysql

      # Full copy: drop + recreate to_table, then bulk-insert every row.
      # Both databases must be reachable from `source`'s connection.
      def self.pipe(source, from_table, destination, to_table)
        start = Time.new.to_i
        from_db = source.current_database
        to_db = destination.current_database
        source.forklift.logger.log("mysql pipe: `#{from_db}`.`#{from_table}` => `#{to_db}`.`#{to_table}`")
        source.q("drop table if exists `#{to_db}`.`#{to_table}`")
        source.q("create table `#{to_db}`.`#{to_table}` like `#{from_db}`.`#{from_table}`")
        source.q("insert into `#{to_db}`.`#{to_table}` select * from `#{from_db}`.`#{from_table}`")
        delta = Time.new.to_i - start
        source.forklift.logger.log(" ^ moved #{destination.count(to_table, to_db)} rows in #{delta}s")
      end

      # Incremental copy: only rows whose `matcher` column is newer than
      # the newest row already in to_table are copied; previously copied
      # rows that were updated upstream are deleted first and re-copied.
      def self.incremental_pipe(source, from_table, destination, to_table, matcher=source.default_matcher, primary_key='id')
        start = Time.new.to_i
        from_db = source.current_database
        to_db = destination.current_database
        source.forklift.logger.log("mysql incremental_pipe: `#{from_db}`.`#{from_table}` => `#{to_db}`.`#{to_table}`")
        source.q("create table if not exists `#{to_db}`.`#{to_table}` like `#{from_db}`.`#{from_table}`")

        # Count the number of rows in to_table
        original_count = source.count(to_table, to_db)

        # Find the latest/max/newest timestamp from the final table
        # in order to determine the last copied row.
        latest_timestamp = source.max_timestamp(to_table, matcher, to_db)

        # If to_table has existing rows, ensure none of them are "stale."
        # A stale row in to_table means a previously copied row was
        # updated in from_table, so let's delete it from the to_table
        # so we can get a fresh copy of that row.
        if original_count > 0
          # Get the ids of rows in from_table that are newer than the newest row in to_table.
          # Some of these rows could either be a) stale or b) new.
          source.read("select `#{primary_key}` from `#{from_db}`.`#{from_table}` where `#{matcher}` > \"#{latest_timestamp}\" order by `#{matcher}`") do |stale_rows|
            if stale_rows.length > 0
              # Delete these ids from to_table.
              # If the ids are stale, then they'll be deleted. If they're new, they won't exist, and nothing will happen.
              stale_ids = stale_rows.map { |row| row[primary_key.to_sym] }.join(',')
              source.q("delete from `#{to_db}`.`#{to_table}` where `#{primary_key}` in (#{stale_ids})")
              source.forklift.logger.log(" ^ deleted up to #{stale_rows.length} stale rows from `#{to_db}`.`#{to_table}`")
            end
          end
        end

        # Do the insert into to_table
        destination.q("insert into `#{to_db}`.`#{to_table}` select * from `#{from_db}`.`#{from_table}` where `#{matcher}` > \"#{latest_timestamp}\" order by `#{matcher}`")
        delta = Time.new.to_i - start
        new_count = destination.count(to_table, to_db) - original_count
        source.forklift.logger.log(" ^ created #{new_count} new rows in #{delta}s")
      end

      # Incremental pipe when the matcher column exists on from_table,
      # full pipe otherwise.
      def self.optimistic_pipe(source, from_table, destination, to_table, matcher=source.default_matcher, primary_key='id')
        # BUG FIX: previously passed database-name strings (from_db/to_db)
        # where can_incremental_pipe?/incremental_pipe/pipe expect
        # connection objects, raising NoMethodError at runtime. The
        # caller-supplied matcher is now threaded through as well.
        if can_incremental_pipe?(source, from_table, matcher)
          incremental_pipe(source, from_table, destination, to_table, matcher, primary_key)
        else
          pipe(source, from_table, destination, to_table)
        end
      end

      # True when `table` (in conn's current database) has the matcher column.
      def self.can_incremental_pipe?(conn, table, matcher=conn.default_matcher)
        conn.columns(table, conn.current_database).include?(matcher)
      end

      ## When you are copying data to and from mysql
      ## An implementation of "pipe" for remote databases
      def self.mysql_optimistic_import(source, destination)
        #TODO: allow passing in of matcher and primary_key
        source.tables.each do |table|
          if( source.columns(table).include?(source.default_matcher) && destination.tables.include?(table) )
            since = destination.max_timestamp(table)
            source.read_since(table, since){ |data| destination.write(data, table) }
          else
            destination.truncate table
            source.read("select * from #{table}"){ |data| destination.write(data, table) }
          end
        end
      end

    end
  end
end