jetpants 0.8.0 → 0.8.2

This diff compares publicly available package versions as they were released to their respective public registries; it is provided for informational purposes only.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +4 -9
  3. data/bin/jetpants +7 -6
  4. data/doc/capacity_plan.rdoc +77 -0
  5. data/doc/commands.rdoc +1 -1
  6. data/doc/jetpants_collins.rdoc +2 -1
  7. data/doc/online_schema_change.rdoc +45 -0
  8. data/doc/plugins.rdoc +7 -1
  9. data/doc/requirements.rdoc +1 -1
  10. data/doc/upgrade_helper.rdoc +68 -0
  11. data/lib/jetpants/db/client.rb +2 -1
  12. data/lib/jetpants/db/import_export.rb +12 -3
  13. data/lib/jetpants/db/replication.rb +6 -2
  14. data/lib/jetpants/db/schema.rb +40 -0
  15. data/lib/jetpants/db/server.rb +2 -2
  16. data/lib/jetpants/host.rb +12 -1
  17. data/lib/jetpants/pool.rb +41 -0
  18. data/lib/jetpants/shard.rb +201 -124
  19. data/lib/jetpants/table.rb +80 -10
  20. data/plugins/capacity_plan/capacity_plan.rb +353 -0
  21. data/plugins/capacity_plan/commandsuite.rb +19 -0
  22. data/plugins/capacity_plan/monkeypatch.rb +20 -0
  23. data/plugins/jetpants_collins/db.rb +45 -6
  24. data/plugins/jetpants_collins/jetpants_collins.rb +32 -21
  25. data/plugins/jetpants_collins/pool.rb +22 -1
  26. data/plugins/jetpants_collins/shard.rb +9 -2
  27. data/plugins/jetpants_collins/topology.rb +8 -9
  28. data/plugins/online_schema_change/commandsuite.rb +56 -0
  29. data/plugins/online_schema_change/db.rb +33 -0
  30. data/plugins/online_schema_change/online_schema_change.rb +5 -0
  31. data/plugins/online_schema_change/pool.rb +105 -0
  32. data/plugins/online_schema_change/topology.rb +56 -0
  33. data/plugins/simple_tracker/shard.rb +1 -1
  34. data/plugins/upgrade_helper/commandsuite.rb +212 -0
  35. data/plugins/upgrade_helper/db.rb +78 -0
  36. data/plugins/upgrade_helper/host.rb +22 -0
  37. data/plugins/upgrade_helper/pool.rb +259 -0
  38. data/plugins/upgrade_helper/shard.rb +61 -0
  39. data/plugins/upgrade_helper/upgrade_helper.rb +21 -0
  40. data/scripts/global_rowcount.rb +75 -0
  41. metadata +28 -15
@@ -0,0 +1,56 @@
+ module Jetpants
+   class Topology
+
+     # run an alter table on all the sharded pools
+     # if you specify dry run it will run a dry run on all the shards
+     # otherwise it will run on the first shard and ask if you want to
+     # continue on the rest of the shards, 10 shards at a time
+     def alter_table_shards(database, table, alter, dry_run=true)
+       my_shards = shards.dup
+       first_shard = my_shards.shift
+       print "Will run on first shard and prompt for going past the dry run only on the first shard\n\n"
+       print "[#{Time.now.to_s.blue}] #{first_shard.pool.to_s}\n"
+       unless first_shard.alter_table(database, table, alter, dry_run, false)
+         print "First shard had an error, please check output\n"
+         return
+       end
+
+       continue = 'no'
+       continue = ask('First shard complete would you like to continue with the rest of the shards?: (YES/no) - YES has to be in all caps and fully typed')
+       if continue == 'YES'
+         errors = []
+
+         my_shards.limited_concurrent_map(10) do |shard|
+           print "[#{Time.now.to_s.blue}] #{shard.pool.to_s}\n"
+           errors << shard unless shard.alter_table(database, table, alter, dry_run, true)
+         end
+
+         errors.each do |shard|
+           print "check #{shard.name} for errors during online schema change\n"
+         end
+       end
+     end
+
+     # will drop old table from the shards after a alter table
+     # this is because we do not drop the old table in the osc
+     # also I will do the first shard and ask if you want to
+     # continue, after that it will do each table serially
+     def drop_old_alter_table_shards(database, table)
+       my_shards = shards.dup
+       first_shard = my_shards.shift
+       print "Will run on first shard and prompt before going on to the rest\n\n"
+       print "[#{Time.now.to_s.blue}] #{first_shard.pool.to_s}\n"
+       first_shard.drop_old_alter_table(database, table)
+
+       continue = 'no'
+       continue = ask('First shard complete would you like to continue with the rest of the shards?: (YES/no) - YES has to be in all caps and fully typed')
+       if continue == 'YES'
+         my_shards.each do |shard|
+           print "[#{Time.now.to_s.blue}] #{shard.pool.to_s}\n"
+           shard.drop_old_alter_table(database, table)
+         end
+       end
+     end
+
+   end
+ end
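The two Topology methods in this hunk (part of the new online_schema_change plugin, per the file list) are the entry points for running an alter across every sharded pool. A minimal usage sketch from a jetpants console follows; the database, table, and ALTER clause are placeholder values, not part of this diff, and the plugin presumably drives pt-online-schema-change underneath (its comments refer to "the osc").

    # Illustrative only -- database/table/alter values below are made up.
    # Dry run across every shard first:
    Jetpants.topology.alter_table_shards('myapp', 'posts', 'ADD COLUMN deleted_at DATETIME', true)

    # Real run: executes on the first shard, then prompts (the answer must be exactly "YES")
    # before continuing across the remaining shards, 10 at a time.
    Jetpants.topology.alter_table_shards('myapp', 'posts', 'ADD COLUMN deleted_at DATETIME', false)

    # The old copy of the table is intentionally left behind; drop it afterwards,
    # shard by shard, once the change has been verified.
    Jetpants.topology.drop_old_alter_table_shards('myapp', 'posts')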
@@ -62,7 +62,7 @@ module Jetpants
  # read-only shards, and offline shards appropriately.
  return me.merge case state
  when :ready, :needs_cleanup then {'host' => master.ip}
- when :child then {'host_read' => master.ip, 'host_write' => parent.master.ip}
+ when :child then {'host_read' => master.ip, 'host_write' => master.master.ip}
  when :read_only then {'host_read' => master.ip, 'host_write' => false}
  when :offline then {'host' => false}
  end
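This one-line change (likely in jetpants_collins/shard.rb, per the file list) switches the write host reported for a shard in the :child state from parent.master.ip to master.master.ip. A hedged reading, not stated in the diff itself: a child shard's master replicates from the node that is still taking writes, so master.master gives the same answer as parent.master during a normal shard split, but also works for child shards that have no parent set, such as the ones the upgrade_helper plugin below selects with `!s.parent && s.master.master`.

    # Hedged illustration (not part of the diff):
    write_host = shard.master.master.ip   # the node the child's master replicates from, i.e. where writes land
    # old code: shard.parent.master.ip    # raises NoMethodError when the shard has no parent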
@@ -0,0 +1,212 @@
+ # additional commands added by this plugin
+
+ require 'thor'
+
+ module Jetpants
+   class CommandSuite < Thor
+
+     desc 'upgrade_clone_slave', 'clone a standby slave to target node(s) running a newer version of MySQL'
+     method_option :source, :desc => 'IP of node to clone from'
+     method_option :target, :desc => 'IP of node(s) to clone to'
+     def upgrade_clone_slave
+       puts "This task clones the data set of a standby slave to target node(s) that have a"
+       puts "newer version of MySQL already installed."
+       source = ask_node('Please enter IP of node to clone from: ', options[:source])
+       source.master.probe if source.master # fail early if there are any replication issues in this pool
+       describe source
+
+       puts "You may clone to particular IP address(es), or can type \"spare\" to claim a node from the spare pool."
+       target = options[:target] || ask('Please enter comma-separated list of targets (IPs or "spare") to clone to: ')
+       spares_needed = target.split(',').count {|t| t.strip.upcase == 'SPARE'}
+       target = 'spare' if target.strip == '' || target.split(',').length == 0
+       if spares_needed > 0
+         spares_available = Jetpants.topology.count_spares(role: :standby_slave, like: source, version: Plugin::UpgradeHelper.new_version)
+         raise "Not enough upgraded spares with role of standby slave! Requested #{spares_needed} but only have #{spares_available} available." if spares_needed > spares_available
+         claimed_spares = Jetpants.topology.claim_spares(spares_needed, role: :standby_slave, like: source, version: Plugin::UpgradeHelper.new_version)
+       end
+
+       targets = target.split(',').map do |ip|
+         ip.strip!
+         if is_ip? ip
+           ip.to_db
+         elsif ip == '' || ip.upcase == 'SPARE'
+           claimed_spares.shift
+         else
+           error "target (#{ip}) does not appear to be an IP."
+         end
+       end
+
+       source.start_mysql if ! source.running?
+       error "source (#{source}) is not a standby slave" unless source.is_standby?
+
+       targets.each do |t|
+         error "target #{t} already has a master; please clear out node (including in asset tracker) before proceeding" if t.master
+       end
+
+       # Disable fast shutdown on the source
+       source.mysql_root_cmd 'SET GLOBAL innodb_fast_shutdown = 0'
+
+       # Flag the nodes as needing upgrade, which will get triggered when
+       # enslave_siblings restarts them
+       targets.each {|t| t.needs_upgrade = true}
+
+       # Remove ib_lru_dump if present on targets
+       targets.concurrent_each {|t| t.ssh_cmd "rm -rf #{t.mysql_directory}/ib_lru_dump"}
+
+       source.enslave_siblings!(targets)
+       targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master}
+       source.pool.sync_configuration
+
+       puts "Clone-and-upgrade complete."
+       Jetpants.topology.write_config
+     end
+
+
+     desc 'upgrade_promotion', 'demote and destroy a master running an older version of MySQL'
+     method_option :demote, :desc => 'node to demote'
+     def upgrade_promotion
+       demoted = ask_node 'Please enter the IP address of the node to demote:', options[:demote]
+       demoted.probe
+
+       # This task should not be used for emergency promotions (master failures)
+       # since the regular "jetpants promotion" logic is actually fine in that case.
+       error "Unable to connect to node #{demoted} to demote" unless demoted.running?
+
+       # Before running this task, the pool should already have an extra standby slave,
+       # since we're going to be removing the master from the pool.
+       standby_slaves_needed = Jetpants.standby_slaves_per_pool + 1
+       error "Only run this task on a pool with 3 standby slaves!" unless demoted.pool(true).standby_slaves.size >= standby_slaves_needed
+
+       # Verify that all nodes except the master are running the same version, and
+       # are higher version than the master
+       unless demoted.slaves.all? {|s| s.version_cmp(demoted.slaves.first) == 0 && s.version_cmp(demoted) > 0}
+         error "This task can only be used when all slaves are running the same version of MySQL,"
+         error "and the master's version is older than that of all the slaves."
+       end
+
+       puts
+       inform "Summary of affected pool"
+       inform "Binary log positions and slave lag shown below are just a snapshot taken at the current time." if demoted.running?
+       puts
+       demoted.pool(true).summary(true)
+       puts
+
+       promoted = ask_node 'Please enter the IP address of a standby slave to promote: '
+
+       error "Node to promote #{promoted} is not a standby slave of node to demote #{demoted}" unless promoted.master == demoted && promoted.role == :standby_slave
+       error "The chosen node cannot be promoted. Please choose another." unless promoted.promotable_to_master?(false)
+
+       inform "Going to DEMOTE AND DESTROY existing master #{demoted} and PROMOTE new master #{promoted}."
+       error "Aborting." unless agree "Proceed? [yes/no]: "
+
+       # Perform the promotion, but without making the old master become a slave of the new master
+       # We then rely on the built-in call to Pool#sync_configuration or Pool#after_master_promotion!
+       # to remove the old master from the pool in the same way it would handle a failed master (which
+       # is entirely asset-tracker-plugin specific)
+       demoted.pool(true).master_promotion!(promoted, false)
+     end
+     def self.after_upgrade_promotion
+       reminders(
+         'Commit/push the configuration in version control.',
+         'Deploy the configuration to all machines.',
+       )
+     end
+
+
+     desc 'shard_upgrade', 'upgrade a shard via four-step lockless process'
+     method_option :min_id, :desc => 'Minimum ID of shard to upgrade'
+     method_option :max_id, :desc => 'Maximum ID of shard to ugprade'
+     method_option :reads, :desc => 'Move reads to the new master', :type => :boolean
+     method_option :writes, :desc => 'Move writes to new master', :type => :boolean
+     method_option :cleanup, :desc => 'Tear down the old-version nodes', :type => :boolean
+     def shard_upgrade
+       if options[:reads]
+         raise 'The --reads, --writes, and --cleanup options are mutually exclusive' if options[:writes] || options[:cleanup]
+         s = ask_shard_being_upgraded :reads
+         s.branched_upgrade_move_reads
+         Jetpants.topology.write_config
+         self.class.reminders(
+           'Commit/push the configuration in version control.',
+           'Deploy the configuration to all machines.',
+           'Wait for reads to stop on the old shard master.',
+           'Proceed to next step: jetpants shard_upgrade --writes'
+         )
+       elsif options[:writes]
+         raise 'The --reads, --writes, and --cleanup options are mutually exclusive' if options[:reads] || options[:cleanup]
+         s = ask_shard_being_upgraded :writes
+         s.branched_upgrade_move_writes
+         Jetpants.topology.write_config
+         self.class.reminders(
+           'Commit/push the configuration in version control.',
+           'Deploy the configuration to all machines.',
+           'Wait for writes to stop on the old parent master.',
+           'Proceed to next step: jetpants shard_upgrade --cleanup',
+         )
+
+       elsif options[:cleanup]
+         raise 'The --reads, --writes, and --cleanup options are mutually exclusive' if options[:reads] || options[:writes]
+         s = ask_shard_being_upgraded :cleanup
+         s.cleanup!
+
+       else
+         self.class.reminders(
+           'This process may take an hour or two. You probably want to run this from a screen session.',
+           'Be especially careful if you are relying on SSH Agent Forwarding for your root key, since this is not screen-friendly.'
+         )
+         s = ask_shard_being_upgraded :prep
+         s.branched_upgrade_prep
+         self.class.reminders(
+           'Proceed to next step: jetpants shard_upgrade --reads'
+         )
+       end
+     end
+
+
+     desc 'checksum_pool', 'Run pt-table-checksum on a pool to verify data consistency after an upgrade of one slave'
+     method_option :pool, :desc => 'name of pool'
+     def checksum_pool
+       pool_name = options[:pool] || ask('Please enter name of pool to checksum: ')
+       pool = Jetpants.topology.pool(pool_name) or raise "Pool #{pool_name} does not exist"
+       pool.checksum_tables
+     end
+
+
+     desc 'check_pool_queries', 'Runs pt-upgrade on a pool to verify query performance and results between different MySQL versions'
+     method_option :pool, :desc => 'name of pool'
+     method_option :dumptime, :desc => 'number of seconds of tcpdump data to consider'
+     def check_pool_queries
+       pool_name = options[:pool] || ask('Please enter name of pool to checksum: ')
+       dump_time = options[:dumptime].to_i if options[:dumptime]
+       dump_time ||= 30
+
+       pool = Jetpants.topology.pool(pool_name) or raise "Pool #{pool_name} does not exist"
+       pool.collect_and_compare_queries!(dump_time)
+     end
+
+     no_tasks do
+       def ask_shard_being_upgraded(stage=:prep)
+         shards_being_upgraded = Jetpants.shards.select {|s| [:child, :needs_cleanup].include?(s.state) && !s.parent && s.master.master}
+         if stage == :writes || stage == :cleanup
+           if shards_being_upgraded.size == 0
+             raise 'No shards are currently being upgraded. You can only use this task after running "jetpants shard_upgrade".'
+           elsif shards_being_upgraded.size == 1
+             s = shards_being_upgraded.first
+             puts "Detected #{s} as the only shard currently involved in an upgrade operation."
+             error "Aborting." unless agree "Is this the right shard that you want to perform this action on? [yes/no]: "
+             return s
+           else
+             puts "The following shards are already involved in an upgrade operation:"
+             shards_being_upgraded.each {|sbu| puts "* #{sbu}"}
+           end
+         end
+         puts "Which shard would you like to perform this action on?"
+         shard_min = options[:min_id] || ask('Please enter min ID of the shard: ')
+         shard_max = options[:max_id] || ask('Please enter max ID of the shard: ')
+         s = Jetpants.topology.shard shard_min, shard_max
+         raise 'Shard not found' unless s
+         s
+       end
+     end
+
+   end
+ end
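The Thor tasks above form the upgrade_helper plugin's command-line surface. The four-step shard_upgrade flow ultimately calls the Shard methods named in the task body; a rough sketch of the equivalent console calls (the shard ID range is a placeholder) is:

    # Illustrative only -- the shard ID range is a placeholder.
    s = Jetpants.topology.shard(1, 1000000)

    s.branched_upgrade_prep          # step 1: jetpants shard_upgrade (no flags)
    s.branched_upgrade_move_reads    # step 2: jetpants shard_upgrade --reads
    s.branched_upgrade_move_writes   # step 3: jetpants shard_upgrade --writes
    s.cleanup!                       # step 4: jetpants shard_upgrade --cleanup

    # After the --reads and --writes steps the task also regenerates the app config:
    Jetpants.topology.write_config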
@@ -0,0 +1,78 @@
+ module Jetpants
+   class DB
+     attr_accessor :needs_upgrade
+
+     ##### CALLBACKS ############################################################
+
+     # Handle upgrading mysql if needed
+     def before_start_mysql(*options)
+       return unless @needs_upgrade
+
+       @repl_paused = false if @master
+       running = ssh_cmd "netstat -ln | grep #{@port} | wc -l"
+       raise "[#{@ip}] Failed to start MySQL: Something is already listening on port #{@port}" unless running.chomp == '0'
+
+       output "Attempting to start MySQL with --skip-networking --skip-grant-tables in prep for upgrade"
+
+       # Can't use start_mysql here without causing infinite recursion! Also don't need
+       # to do all the same checks here, nor do we need to store these to @options.
+       output service(:start, 'mysql', '--skip-networking --skip-grant-tables')
+
+       output "Attempting to run mysql_upgrade"
+       output ssh_cmd('mysql_upgrade')
+
+       output "Upgrade complete"
+       @needs_upgrade = false
+
+       # Now shut down mysql, so that start_mysql can restart it without the --skip-* options
+       stop_mysql
+     end
+
+     ##### NEW METHODS ##########################################################
+
+     # Creates a temporary user for use of pt-table-checksum, yields to the
+     # supplied block, and then drops the user.
+     # The user will have a randomly-generated 50-character password, and will
+     # have elevated permissions (ALL PRIVILEGES on the application schema, and
+     # a few global privs as well) since these are necessary to run the tools.
+     # The block will be passed the randomly-generated password.
+     def with_pt_checksum_user(username='pt-checksum')
+       password = DB.random_password
+       create_user username, password
+       grant_privileges username, '*', 'PROCESS', 'REPLICATION CLIENT', 'REPLICATION SLAVE'
+       grant_privileges username, app_schema, 'ALL PRIVILEGES'
+       begin
+         yield username, password
+       rescue
+         drop_user username
+         raise
+       end
+       drop_user username
+     end
+
+     # Captures mysql traffic with tcpdump for the specified amount of time, in seconds.
+     # The dumpfile will be saved to #{Jetpants.export_location} with filename
+     # #{hostname}.dumpfile, and the filename portion will be returned by this method.
+     #
+     # Not all traffic will be included -- uses a method by Devananda van der Veen described in
+     # http://www.mysqlperformanceblog.com/2011/04/18/how-to-use-tcpdump-on-very-busy-hosts/
+     # to sample the traffic.
+     #
+     # Requires that tcpdump is available in root's PATH. Also assumes root's shell is bash
+     # or supports equivalent syntax. Currently only works if mysqld running on port 3306.
+     #
+     # Warning: tcpdump can be taxing on the server, and also can generate rather large
+     # amounts of output! Also, will overwrite any previous file at the destination path!
+     def tcpdump!(duration=30, interface=false)
+       interface ||= Jetpants.private_interface
+       output "Using tcpdump to capture sample of MySQL traffic for #{duration} seconds"
+       tcpdump_options = "-i #{interface} -s 65535 -x -n -q -tttt 'port 3306 and tcp[1] & 7 == 2 and tcp[3] & 7 == 2'"
+       outfile = "#{Jetpants.export_location}/#{hostname}.dumpfile"
+       ssh_cmd "tcpdump #{tcpdump_options} > #{outfile} & export DUMP_PID=$! && sleep #{duration} && kill $DUMP_PID"
+       output "Completed capturing traffic sample"
+       "#{hostname}.dumpfile"
+     end
+
+
+   end
+ end
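with_pt_checksum_user and tcpdump! are the two DB helpers that the Pool methods later in this diff build on. A minimal sketch of calling them directly from a console (the IP address is a placeholder):

    # Illustrative only -- the IP address is a placeholder.
    db = '10.42.0.12'.to_db

    # Temporary elevated user, dropped automatically even if the block raises.
    db.with_pt_checksum_user do |user, pass|
      puts "created #{user} with a random password"
    end

    # Sample 30 seconds of port-3306 traffic; returns just the filename portion.
    # The capture itself lands under Jetpants.export_location on that host.
    dumpfile = db.tcpdump!(30)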
@@ -0,0 +1,22 @@
+ module Jetpants
+   class Host
+
+     ##### NEW METHODS ##########################################################
+
+     # Converts tcpdump output into slowlog format using pt-query-digest. Requires that
+     # pt-query-digest is installed and in root's path. Returns the full path to the
+     # slowlog. Does not delete or remove the tcpdump output file.
+     #
+     # This is in Host instead of DB because it may be preferable to run this on
+     # the host running Jetpants, as opposed to the DB where the dumpfile came from,
+     # because pt-query-digest may be taxing to run on the server.
+     def dumpfile_to_slowlog(tcpdump_output_file_path, delete_tcpdumpfile=true)
+       slowlog_file_path = tcpdump_output_file_path.sub('.dumpfile', '') + '.slowlog'
+       ssh_cmd "pt-query-digest #{tcpdump_output_file_path} --type tcpdump --no-report --print >#{slowlog_file_path}"
+       ssh_cmd "rm #{tcpdump_output_file_path}" if delete_tcpdumpfile
+       slowlog_file_path
+     end
+
+
+   end
+ end
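Host#dumpfile_to_slowlog is the bridge between the tcpdump! capture above and the pt-upgrade comparison below, and can be run on the Jetpants host rather than on the database. A hedged sketch continuing the placeholder example from the DB hunk:

    # Illustrative only -- continues the placeholder capture above.
    local = Jetpants::Host.local
    local.ssh_cmd "mkdir -p #{Jetpants.export_location}"

    # Pull the capture over to the Jetpants host, then convert it with pt-query-digest.
    db.fast_copy_chain(Jetpants.export_location, local, files: dumpfile, overwrite: true)
    slowlog = local.dumpfile_to_slowlog("#{Jetpants.export_location}/#{dumpfile}")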
@@ -0,0 +1,259 @@
+ require 'open3'
+
+ module Jetpants
+   class Pool
+     collins_attr_accessor :checksum_running
+
+     # Runs pt-table-checksum on the pool.
+     # Returns true if no problems found, false otherwise.
+     # If problems were found, the 'checksums' table will be
+     # left in the pool - the user must review and manually delete.
+     def checksum_tables
+       schema = master.app_schema
+       success = false
+       output_lines = []
+
+       # check if already running, or a previous run died
+       previous_run = collins_checksum_running
+       previous_run = nil if previous_run == ''
+       if previous_run
+         run_data = JSON.parse(previous_run.downcase) # hash with 'from_host', 'from_pid', 'timestamp'
+         previous_host = run_data['from_host'].to_host
+         previous_pid = run_data['from_pid'] or die 'No previous pid found in previous rundata?'
+         still_running = previous_host.pid_running?(previous_pid, 'pt-table-checksum')
+         raise "Checksum already in progress from #{previous_host}, pid=#{previous_pid}" if still_running
+         output "Previous failed run detected, will use --resume parameter"
+       end
+
+       # Determine what to pass to --max-load
+       master.output "Polling for normal max threads_running, please wait"
+       max_threads_running = master.max_threads_running
+       limit_threads_running = [(max_threads_running * 1.2).ceil, 50].max
+       master.output "Found max threads_running=#{max_threads_running}, will use limit of #{limit_threads_running}"
+
+       # Operate with a temporary user that has elevated permissions
+       master.with_pt_checksum_user do |username, password|
+         # Build command line
+         command_line = ['pt-table-checksum',
+                         '--no-check-replication-filters',
+                         "--databases #{schema}",
+                         "--host #{master.ip}",
+                         "--port #{master.port}",
+                         "--max-load Threads_running:#{limit_threads_running}",
+                         "--replicate #{schema}.checksums",
+                         "--replicate-database #{schema}",
+                         "--user #{username}",
+                         "--password #{password}"
+                        ].join ' '
+         command_line += ' --resume' if previous_run
+
+         # Spawn the process
+         Open3.popen3(command_line) do |stdin, stdout, stderr, wait_thread|
+           exit_code = nil
+           pid = wait_thread.pid
+           puts "Running pt-table-checksum targetting #{master}, pid on Jetpants host is #{pid}"
+
+           self.collins_checksum_running = {
+             'from_host' => Host.local.ip,
+             'from_pid' => pid,
+             'timestamp' => Time.now.to_i,
+           }.to_json
+
+           # Display STDERR output in real-time, via a separate thread
+           Thread.new do
+             begin
+               stderr.each {|line| puts line}
+             rescue IOError, Interrupt
+               nil
+             end
+           end
+
+           # Capture STDOUT and buffer it; since this is the main thread, also
+           # watch out for broken pipe or ctrl-c
+           begin
+             stdout.each {|line| output_lines << line}
+             exit_code = wait_thread.value.to_i
+           rescue IOError, Interrupt => ex
+             puts "Caught exception #{ex.message}"
+             exit_code = 130 # by unix convention, return 128 + SIGINT
+           end
+
+           # Dump out stdout: first anything we buffered on our end, plus anything
+           # that Perl or the OS had buffered on its end
+           puts
+           output_lines.each {|line| puts line}
+           unless stdout.eof?
+             stdout.each {|line| puts line} rescue nil
+           end
+           puts
+
+           puts "Checksum completed with exit code #{exit_code}.\n"
+           success = (exit_code == 0)
+
+           # Run again with --replicate-check-only to display ALL diffs, including ones from
+           # prior runs of the tool.
+           puts 'Verifying all results via --replicate-check-only...'
+           output, diff_success = `#{command_line} --replicate-check-only`, $?.success?
+           if diff_success
+             puts 'No diffs found in any tables.'
+             puts output
+           else
+             puts 'Found diffs:'
+             puts output
+             success = false
+           end
+
+           # Drop the checksums table, but only if there were no diffs
+           if success
+             output "Dropping table #{schema}.checksums..."
+             master.connect(user: username, pass: password)
+             master.query('DROP TABLE checksums')
+             output "Table dropped."
+             master.disconnect
+             self.collins_checksum_running = ''
+           else
+             output 'Keeping checksums table in place for your review.'
+             output 'Please manually drop it when done.'
+           end
+           puts
+         end # popen3
+       end # with_pt_checksum_user
+       success
+     end
+
+
+     # Uses pt-upgrade to compare query performance and resultsets among nodes
+     # in a pool. Supply params:
+     # * a full path to a slowlog file
+     # * a boolean indicating whether or not you want to do an initial silent
+     #   run (results discarded) to populate the buffer pools on the nodes
+     # * Two or more nodes, or no nodes if you want to default to using the
+     #   pool's standby slaves
+     #
+     # Requires that pt-upgrade is in root's PATH on the node running Jetpants.
+     def compare_queries(slowlog_path, silent_run_first, *compare_nodes)
+       if compare_nodes.size == 0
+         compare_nodes = standby_slaves
+       else
+         compare_nodes.flatten!
+         raise "Supplied nodes must all be in this pool" unless compare_nodes.all? {|n| n == master || n.master == master}
+       end
+
+       # We need to create a temporary SUPER user on the nodes to compare
+       # Also attempt to silence warning 1592 about unsafe-for-replication statements if
+       # using Percona Server 5.5.10+ which supports this.
+       username = 'pt-upgrade'
+       password = DB.random_password
+       remove_suppress_1592 = []
+       compare_nodes.each do |node|
+         node.create_user username, password
+         node.grant_privileges username, '*', 'SUPER'
+         node.grant_privileges username, node.app_schema, 'ALL PRIVILEGES'
+
+         # We only want to try this if (a) the node supports log_warnings_suppress,
+         # and (b) the node isn't already suppressing warning 1592
+         if node.global_variables[:log_warnings_suppress] == ''
+           node.mysql_root_cmd "SET GLOBAL log_warnings_suppress = '1592'"
+           remove_suppress_1592 << node
+         end
+       end
+
+       node_text = compare_nodes.map {|s| s.to_s + ' (v' + s.normalized_version(3) + ')'}.join ' vs '
+       dsn_text = compare_nodes.map {|n| "h=#{n.ip},P=#{n.port},u=#{username},p=#{password},D=#{n.app_schema}"}.join ' '
+
+       # Do silent run if requested (to populate buffer pools)
+       if silent_run_first
+         output "Doing a silent run of pt-upgrade with slowlog #{slowlog_path} to populate buffer pool."
+         output "Comparing nodes #{node_text}..."
+         stdout, exit_code = `pt-upgrade --set-vars wait_timeout=10000 #{slowlog_path} #{dsn_text} 2>&1`, $?.to_i
+         output "pt-upgrade silent run completed with exit code #{exit_code}"
+         puts
+         puts
+       end
+
+       # Run pt-upgrade for real. Note that we only compare query times and results, NOT warnings,
+       # due to issues with warning 1592 causing a huge amount of difficult-to-parse output.
+       output "Running pt-upgrade with slowlog #{slowlog_path}"
+       output "Comparing nodes #{node_text}..."
+       stdout, exit_code = `pt-upgrade --set-vars wait_timeout=10000 --compare query_times,results #{slowlog_path} #{dsn_text} 2>&1`, $?.to_i
+       output stdout
+       puts
+       output "pt-upgrade completed with exit code #{exit_code}"
+
+       # Drop the SUPER user and re-enable logging of warning 1592
+       compare_nodes.each {|node| node.drop_user username}
+       remove_suppress_1592.each {|node| node.mysql_root_cmd "SET GLOBAL log_warnings_suppress = ''"}
+     end
+
+
+     # Collects query slowlog on the master (and one active slave, if there are any)
+     # using tcpdump, copies over to the host Jetpants is running on, converts to a
+     # slowlog, and then uses Pool#compare_queries to run pt-upgrade.
+     #
+     # The supplied *compare_nodes should be standby slaves, and you may omit them
+     # to automatically select two standby slaves (of different versions, if available)
+     #
+     # When comparing exactly two nodes, we stop replication on the nodes temporarily
+     # to ensure a consistent dataset for comparing query results. Otherwise, async
+     # replication can naturally result in false-positives.
+     def collect_and_compare_queries!(tcpdump_time=30, *compare_nodes)
+       # Sample traffic and convert to slowlog for master
+       master_dump_filename = master.tcpdump!(tcpdump_time)
+       local = Host.local # node where we're running Jetpants from
+       local.ssh_cmd "mkdir -p #{Jetpants.export_location}"
+       master.fast_copy_chain(Jetpants.export_location, local, files: master_dump_filename, overwrite: true)
+       master.ssh_cmd "rm #{Jetpants.export_location}/#{master_dump_filename}"
+       master_slowlog_path = local.dumpfile_to_slowlog("#{Jetpants.export_location}/#{master_dump_filename}")
+
+       # If we also have an active slave running, grab sampled slowlog from there too
+       active_slowlog_path = nil
+       if active_slaves.size > 0
+         active_slave = active_slaves.first
+         active_dump_filename = active_slave.tcpdump!(tcpdump_time)
+         active_slave.fast_copy_chain(Jetpants.export_location, local, files: active_dump_filename, overwrite: true)
+         active_slave.ssh_cmd "rm #{Jetpants.export_location}/#{active_dump_filename}"
+         active_slowlog_path = local.dumpfile_to_slowlog("#{Jetpants.export_location}/#{active_dump_filename}")
+       end
+
+       # Gather our comparison nodes
+       if compare_nodes.size == 0
+         higher_ver_standby = standby_slaves.select {|s| s.version_cmp(master) > 0}.first
+         same_ver_standby = standby_slaves.select {|s| s.version_cmp(master) == 0}.first
+         if higher_ver_standby && same_ver_standby
+           compare_nodes = [same_ver_standby, higher_ver_standby]
+         else
+           compare_nodes = standby_slaves[0, 2]
+         end
+       end
+
+       # Disable monitoring on our comparison nodes, and then stop replication
+       # at the same position. We only proceed with this if we're comparing
+       # exactly two nodes; this may be improved in a future release.
+       if compare_nodes.size == 2
+         compare_nodes.each {|n| n.disable_monitoring}
+         compare_nodes.first.pause_replication_with(compare_nodes.last)
+       end
+
+       # Run pt-upgrade using the master dumpfile
+       puts
+       output "COMPARISON VIA QUERY LOG FROM MASTER"
+       compare_queries(master_slowlog_path, true, *compare_nodes)
+
+       if active_slowlog_path
+         puts
+         output "COMPARISON VIA QUERY LOG FROM ACTIVE SLAVE"
+         compare_queries(active_slowlog_path, true, *compare_nodes)
+       end
+
+       # If we previously paused replication and disabled monitoring, un-do this
+       if compare_nodes.size == 2
+         compare_nodes.concurrent_each do |n|
+           n.resume_replication
+           n.catch_up_to_master
+           n.enable_monitoring
+         end
+       end
+     end
+
+   end
+ end
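Both pool-level workflows are normally reached through the checksum_pool and check_pool_queries tasks shown earlier, but they can also be driven directly from a console. A hedged sketch (the pool name is a placeholder; the use of collins_attr_accessor suggests the jetpants_collins plugin must also be loaded):

    # Illustrative only -- the pool name is a placeholder.
    pool = Jetpants.topology.pool('users')

    # Verify slave consistency with pt-table-checksum; returns true when no diffs are
    # found, otherwise leaves schema.checksums in place for manual review.
    consistent = pool.checksum_tables

    # Capture 30 seconds of traffic on the master (and one active slave, if any),
    # then replay it with pt-upgrade against two standby slaves.
    pool.collect_and_compare_queries!(30)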