jetpants 0.8.0 → 0.8.2
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- checksums.yaml +7 -0
- data/README.rdoc +4 -9
- data/bin/jetpants +7 -6
- data/doc/capacity_plan.rdoc +77 -0
- data/doc/commands.rdoc +1 -1
- data/doc/jetpants_collins.rdoc +2 -1
- data/doc/online_schema_change.rdoc +45 -0
- data/doc/plugins.rdoc +7 -1
- data/doc/requirements.rdoc +1 -1
- data/doc/upgrade_helper.rdoc +68 -0
- data/lib/jetpants/db/client.rb +2 -1
- data/lib/jetpants/db/import_export.rb +12 -3
- data/lib/jetpants/db/replication.rb +6 -2
- data/lib/jetpants/db/schema.rb +40 -0
- data/lib/jetpants/db/server.rb +2 -2
- data/lib/jetpants/host.rb +12 -1
- data/lib/jetpants/pool.rb +41 -0
- data/lib/jetpants/shard.rb +201 -124
- data/lib/jetpants/table.rb +80 -10
- data/plugins/capacity_plan/capacity_plan.rb +353 -0
- data/plugins/capacity_plan/commandsuite.rb +19 -0
- data/plugins/capacity_plan/monkeypatch.rb +20 -0
- data/plugins/jetpants_collins/db.rb +45 -6
- data/plugins/jetpants_collins/jetpants_collins.rb +32 -21
- data/plugins/jetpants_collins/pool.rb +22 -1
- data/plugins/jetpants_collins/shard.rb +9 -2
- data/plugins/jetpants_collins/topology.rb +8 -9
- data/plugins/online_schema_change/commandsuite.rb +56 -0
- data/plugins/online_schema_change/db.rb +33 -0
- data/plugins/online_schema_change/online_schema_change.rb +5 -0
- data/plugins/online_schema_change/pool.rb +105 -0
- data/plugins/online_schema_change/topology.rb +56 -0
- data/plugins/simple_tracker/shard.rb +1 -1
- data/plugins/upgrade_helper/commandsuite.rb +212 -0
- data/plugins/upgrade_helper/db.rb +78 -0
- data/plugins/upgrade_helper/host.rb +22 -0
- data/plugins/upgrade_helper/pool.rb +259 -0
- data/plugins/upgrade_helper/shard.rb +61 -0
- data/plugins/upgrade_helper/upgrade_helper.rb +21 -0
- data/scripts/global_rowcount.rb +75 -0
- metadata +28 -15
@@ -0,0 +1,56 @@
+module Jetpants
+  class Topology
+
+    # run an alter table on all the sharded pools
+    # if you specify dry run it will run a dry run on all the shards
+    # otherwise it will run on the first shard and ask if you want to
+    # continue on the rest of the shards, 10 shards at a time
+    def alter_table_shards(database, table, alter, dry_run=true)
+      my_shards = shards.dup
+      first_shard = my_shards.shift
+      print "Will run on first shard and prompt for going past the dry run only on the first shard\n\n"
+      print "[#{Time.now.to_s.blue}] #{first_shard.pool.to_s}\n"
+      unless first_shard.alter_table(database, table, alter, dry_run, false)
+        print "First shard had an error, please check output\n"
+        return
+      end
+
+      continue = 'no'
+      continue = ask('First shard complete would you like to continue with the rest of the shards?: (YES/no) - YES has to be in all caps and fully typed')
+      if continue == 'YES'
+        errors = []
+
+        my_shards.limited_concurrent_map(10) do |shard|
+          print "[#{Time.now.to_s.blue}] #{shard.pool.to_s}\n"
+          errors << shard unless shard.alter_table(database, table, alter, dry_run, true)
+        end
+
+        errors.each do |shard|
+          print "check #{shard.name} for errors during online schema change\n"
+        end
+      end
+    end
+
+    # will drop old table from the shards after a alter table
+    # this is because we do not drop the old table in the osc
+    # also I will do the first shard and ask if you want to
+    # continue, after that it will do each table serially
+    def drop_old_alter_table_shards(database, table)
+      my_shards = shards.dup
+      first_shard = my_shards.shift
+      print "Will run on first shard and prompt before going on to the rest\n\n"
+      print "[#{Time.now.to_s.blue}] #{first_shard.pool.to_s}\n"
+      first_shard.drop_old_alter_table(database, table)
+
+      continue = 'no'
+      continue = ask('First shard complete would you like to continue with the rest of the shards?: (YES/no) - YES has to be in all caps and fully typed')
+      if continue == 'YES'
+        my_shards.each do |shard|
+          print "[#{Time.now.to_s.blue}] #{shard.pool.to_s}\n"
+          shard.drop_old_alter_table(database, table)
+        end
+      end
+    end
+
+  end
+end
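Judging by the file list, this 56-line hunk is `data/plugins/online_schema_change/topology.rb`: it wraps the per-shard `alter_table` and `drop_old_alter_table` calls so a schema change can be rolled across every sharded pool. A minimal sketch of driving these methods from a Ruby console, assuming a loaded topology with the online_schema_change plugin enabled (the schema and table names are illustrative, not from the diff):

    require 'jetpants'

    # Dry run across all shards first (dry_run defaults to true).
    Jetpants.topology.alter_table_shards('myapp', 'users', 'ADD COLUMN bio TEXT')

    # Real run: executes on the first shard, then prompts for a literal
    # all-caps YES before continuing on the rest, 10 shards at a time.
    Jetpants.topology.alter_table_shards('myapp', 'users', 'ADD COLUMN bio TEXT', false)

    # The OSC leaves the pre-alter table behind on each shard; drop it
    # once the result has been verified.
    Jetpants.topology.drop_old_alter_table_shards('myapp', 'users')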
@@ -62,7 +62,7 @@ module Jetpants
       # read-only shards, and offline shards appropriately.
       return me.merge case state
       when :ready, :needs_cleanup then {'host' => master.ip}
-      when :child then {'host_read' => master.ip, 'host_write' =>
+      when :child then {'host_read' => master.ip, 'host_write' => master.master.ip}
       when :read_only then {'host_read' => master.ip, 'host_write' => false}
       when :offline then {'host' => false}
       end
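This one-line fix (the +1/-1 delta matches `data/plugins/simple_tracker/shard.rb` in the file list) completes the `:child` case so that a shard still mid-split advertises its own master for reads but the parent pool's master for writes. The hash shapes the `case` now produces, sketched with placeholder IPs:

    # Illustrative output shapes only; IPs are placeholders.
    ready     = {'host' => '10.1.1.10'}
    child     = {'host_read' => '10.1.1.20', 'host_write' => '10.1.1.10'} # writes go to the parent pool's master
    read_only = {'host_read' => '10.1.1.20', 'host_write' => false}
    offline   = {'host' => false}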
@@ -0,0 +1,212 @@
+# additional commands added by this plugin
+
+require 'thor'
+
+module Jetpants
+  class CommandSuite < Thor
+
+    desc 'upgrade_clone_slave', 'clone a standby slave to target node(s) running a newer version of MySQL'
+    method_option :source, :desc => 'IP of node to clone from'
+    method_option :target, :desc => 'IP of node(s) to clone to'
+    def upgrade_clone_slave
+      puts "This task clones the data set of a standby slave to target node(s) that have a"
+      puts "newer version of MySQL already installed."
+      source = ask_node('Please enter IP of node to clone from: ', options[:source])
+      source.master.probe if source.master # fail early if there are any replication issues in this pool
+      describe source
+
+      puts "You may clone to particular IP address(es), or can type \"spare\" to claim a node from the spare pool."
+      target = options[:target] || ask('Please enter comma-separated list of targets (IPs or "spare") to clone to: ')
+      spares_needed = target.split(',').count {|t| t.strip.upcase == 'SPARE'}
+      target = 'spare' if target.strip == '' || target.split(',').length == 0
+      if spares_needed > 0
+        spares_available = Jetpants.topology.count_spares(role: :standby_slave, like: source, version: Plugin::UpgradeHelper.new_version)
+        raise "Not enough upgraded spares with role of standby slave! Requested #{spares_needed} but only have #{spares_available} available." if spares_needed > spares_available
+        claimed_spares = Jetpants.topology.claim_spares(spares_needed, role: :standby_slave, like: source, version: Plugin::UpgradeHelper.new_version)
+      end
+
+      targets = target.split(',').map do |ip|
+        ip.strip!
+        if is_ip? ip
+          ip.to_db
+        elsif ip == '' || ip.upcase == 'SPARE'
+          claimed_spares.shift
+        else
+          error "target (#{ip}) does not appear to be an IP."
+        end
+      end
+
+      source.start_mysql if ! source.running?
+      error "source (#{source}) is not a standby slave" unless source.is_standby?
+
+      targets.each do |t|
+        error "target #{t} already has a master; please clear out node (including in asset tracker) before proceeding" if t.master
+      end
+
+      # Disable fast shutdown on the source
+      source.mysql_root_cmd 'SET GLOBAL innodb_fast_shutdown = 0'
+
+      # Flag the nodes as needing upgrade, which will get triggered when
+      # enslave_siblings restarts them
+      targets.each {|t| t.needs_upgrade = true}
+
+      # Remove ib_lru_dump if present on targets
+      targets.concurrent_each {|t| t.ssh_cmd "rm -rf #{t.mysql_directory}/ib_lru_dump"}
+
+      source.enslave_siblings!(targets)
+      targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master}
+      source.pool.sync_configuration
+
+      puts "Clone-and-upgrade complete."
+      Jetpants.topology.write_config
+    end
+
+
+    desc 'upgrade_promotion', 'demote and destroy a master running an older version of MySQL'
+    method_option :demote, :desc => 'node to demote'
+    def upgrade_promotion
+      demoted = ask_node 'Please enter the IP address of the node to demote:', options[:demote]
+      demoted.probe
+
+      # This task should not be used for emergency promotions (master failures)
+      # since the regular "jetpants promotion" logic is actually fine in that case.
+      error "Unable to connect to node #{demoted} to demote" unless demoted.running?
+
+      # Before running this task, the pool should already have an extra standby slave,
+      # since we're going to be removing the master from the pool.
+      standby_slaves_needed = Jetpants.standby_slaves_per_pool + 1
+      error "Only run this task on a pool with 3 standby slaves!" unless demoted.pool(true).standby_slaves.size >= standby_slaves_needed
+
+      # Verify that all nodes except the master are running the same version, and
+      # are higher version than the master
+      unless demoted.slaves.all? {|s| s.version_cmp(demoted.slaves.first) == 0 && s.version_cmp(demoted) > 0}
+        error "This task can only be used when all slaves are running the same version of MySQL,"
+        error "and the master's version is older than that of all the slaves."
+      end
+
+      puts
+      inform "Summary of affected pool"
+      inform "Binary log positions and slave lag shown below are just a snapshot taken at the current time." if demoted.running?
+      puts
+      demoted.pool(true).summary(true)
+      puts
+
+      promoted = ask_node 'Please enter the IP address of a standby slave to promote: '
+
+      error "Node to promote #{promoted} is not a standby slave of node to demote #{demoted}" unless promoted.master == demoted && promoted.role == :standby_slave
+      error "The chosen node cannot be promoted. Please choose another." unless promoted.promotable_to_master?(false)
+
+      inform "Going to DEMOTE AND DESTROY existing master #{demoted} and PROMOTE new master #{promoted}."
+      error "Aborting." unless agree "Proceed? [yes/no]: "
+
+      # Perform the promotion, but without making the old master become a slave of the new master
+      # We then rely on the built-in call to Pool#sync_configuration or Pool#after_master_promotion!
+      # to remove the old master from the pool in the same way it would handle a failed master (which
+      # is entirely asset-tracker-plugin specific)
+      demoted.pool(true).master_promotion!(promoted, false)
+    end
+    def self.after_upgrade_promotion
+      reminders(
+        'Commit/push the configuration in version control.',
+        'Deploy the configuration to all machines.',
+      )
+    end
+
+
+    desc 'shard_upgrade', 'upgrade a shard via four-step lockless process'
+    method_option :min_id, :desc => 'Minimum ID of shard to upgrade'
+    method_option :max_id, :desc => 'Maximum ID of shard to ugprade'
+    method_option :reads, :desc => 'Move reads to the new master', :type => :boolean
+    method_option :writes, :desc => 'Move writes to new master', :type => :boolean
+    method_option :cleanup, :desc => 'Tear down the old-version nodes', :type => :boolean
+    def shard_upgrade
+      if options[:reads]
+        raise 'The --reads, --writes, and --cleanup options are mutually exclusive' if options[:writes] || options[:cleanup]
+        s = ask_shard_being_upgraded :reads
+        s.branched_upgrade_move_reads
+        Jetpants.topology.write_config
+        self.class.reminders(
+          'Commit/push the configuration in version control.',
+          'Deploy the configuration to all machines.',
+          'Wait for reads to stop on the old shard master.',
+          'Proceed to next step: jetpants shard_upgrade --writes'
+        )
+      elsif options[:writes]
+        raise 'The --reads, --writes, and --cleanup options are mutually exclusive' if options[:reads] || options[:cleanup]
+        s = ask_shard_being_upgraded :writes
+        s.branched_upgrade_move_writes
+        Jetpants.topology.write_config
+        self.class.reminders(
+          'Commit/push the configuration in version control.',
+          'Deploy the configuration to all machines.',
+          'Wait for writes to stop on the old parent master.',
+          'Proceed to next step: jetpants shard_upgrade --cleanup',
+        )
+
+      elsif options[:cleanup]
+        raise 'The --reads, --writes, and --cleanup options are mutually exclusive' if options[:reads] || options[:writes]
+        s = ask_shard_being_upgraded :cleanup
+        s.cleanup!
+
+      else
+        self.class.reminders(
+          'This process may take an hour or two. You probably want to run this from a screen session.',
+          'Be especially careful if you are relying on SSH Agent Forwarding for your root key, since this is not screen-friendly.'
+        )
+        s = ask_shard_being_upgraded :prep
+        s.branched_upgrade_prep
+        self.class.reminders(
+          'Proceed to next step: jetpants shard_upgrade --reads'
+        )
+      end
+    end
+
+
+    desc 'checksum_pool', 'Run pt-table-checksum on a pool to verify data consistency after an upgrade of one slave'
+    method_option :pool, :desc => 'name of pool'
+    def checksum_pool
+      pool_name = options[:pool] || ask('Please enter name of pool to checksum: ')
+      pool = Jetpants.topology.pool(pool_name) or raise "Pool #{pool_name} does not exist"
+      pool.checksum_tables
+    end
+
+
+    desc 'check_pool_queries', 'Runs pt-upgrade on a pool to verify query performance and results between different MySQL versions'
+    method_option :pool, :desc => 'name of pool'
+    method_option :dumptime, :desc => 'number of seconds of tcpdump data to consider'
+    def check_pool_queries
+      pool_name = options[:pool] || ask('Please enter name of pool to checksum: ')
+      dump_time = options[:dumptime].to_i if options[:dumptime]
+      dump_time ||= 30
+
+      pool = Jetpants.topology.pool(pool_name) or raise "Pool #{pool_name} does not exist"
+      pool.collect_and_compare_queries!(dump_time)
+    end
+
+    no_tasks do
+      def ask_shard_being_upgraded(stage=:prep)
+        shards_being_upgraded = Jetpants.shards.select {|s| [:child, :needs_cleanup].include?(s.state) && !s.parent && s.master.master}
+        if stage == :writes || stage == :cleanup
+          if shards_being_upgraded.size == 0
+            raise 'No shards are currently being upgraded. You can only use this task after running "jetpants shard_upgrade".'
+          elsif shards_being_upgraded.size == 1
+            s = shards_being_upgraded.first
+            puts "Detected #{s} as the only shard currently involved in an upgrade operation."
+            error "Aborting." unless agree "Is this the right shard that you want to perform this action on? [yes/no]: "
+            return s
+          else
+            puts "The following shards are already involved in an upgrade operation:"
+            shards_being_upgraded.each {|sbu| puts "* #{sbu}"}
+          end
+        end
+        puts "Which shard would you like to perform this action on?"
+        shard_min = options[:min_id] || ask('Please enter min ID of the shard: ')
+        shard_max = options[:max_id] || ask('Please enter max ID of the shard: ')
+        s = Jetpants.topology.shard shard_min, shard_max
+        raise 'Shard not found' unless s
+        s
+      end
+    end
+
+  end
+end
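This 212-line hunk matches `data/plugins/upgrade_helper/commandsuite.rb` in the file list. The `shard_upgrade` task sequences a four-step process with config deploys and interactive prompts between stages; stripped of the prompting, the underlying calls reduce to roughly the following sketch (`min_id`/`max_id` are stand-ins for the values `ask_shard_being_upgraded` collects):

    min_id, max_id = 1, 1_000_000  # placeholder shard range
    s = Jetpants.topology.shard(min_id, max_id)

    s.branched_upgrade_prep          # step 1: "jetpants shard_upgrade" (build upgraded replacement tree)
    s.branched_upgrade_move_reads    # step 2: "jetpants shard_upgrade --reads"
    Jetpants.topology.write_config
    s.branched_upgrade_move_writes   # step 3: "jetpants shard_upgrade --writes"
    Jetpants.topology.write_config
    s.cleanup!                       # step 4: "jetpants shard_upgrade --cleanup"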
@@ -0,0 +1,78 @@
+module Jetpants
+  class DB
+    attr_accessor :needs_upgrade
+
+    ##### CALLBACKS ############################################################
+
+    # Handle upgrading mysql if needed
+    def before_start_mysql(*options)
+      return unless @needs_upgrade
+
+      @repl_paused = false if @master
+      running = ssh_cmd "netstat -ln | grep #{@port} | wc -l"
+      raise "[#{@ip}] Failed to start MySQL: Something is already listening on port #{@port}" unless running.chomp == '0'
+
+      output "Attempting to start MySQL with --skip-networking --skip-grant-tables in prep for upgrade"
+
+      # Can't use start_mysql here without causing infinite recursion! Also don't need
+      # to do all the same checks here, nor do we need to store these to @options.
+      output service(:start, 'mysql', '--skip-networking --skip-grant-tables')
+
+      output "Attempting to run mysql_upgrade"
+      output ssh_cmd('mysql_upgrade')
+
+      output "Upgrade complete"
+      @needs_upgrade = false
+
+      # Now shut down mysql, so that start_mysql can restart it without the --skip-* options
+      stop_mysql
+    end
+
+    ##### NEW METHODS ##########################################################
+
+    # Creates a temporary user for use of pt-table-checksum, yields to the
+    # supplied block, and then drops the user.
+    # The user will have a randomly-generated 50-character password, and will
+    # have elevated permissions (ALL PRIVILEGES on the application schema, and
+    # a few global privs as well) since these are necessary to run the tools.
+    # The block will be passed the randomly-generated password.
+    def with_pt_checksum_user(username='pt-checksum')
+      password = DB.random_password
+      create_user username, password
+      grant_privileges username, '*', 'PROCESS', 'REPLICATION CLIENT', 'REPLICATION SLAVE'
+      grant_privileges username, app_schema, 'ALL PRIVILEGES'
+      begin
+        yield username, password
+      rescue
+        drop_user username
+        raise
+      end
+      drop_user username
+    end
+
+    # Captures mysql traffic with tcpdump for the specified amount of time, in seconds.
+    # The dumpfile will be saved to #{Jetpants.export_location} with filename
+    # #{hostname}.dumpfile, and the filename portion will be returned by this method.
+    #
+    # Not all traffic will be included -- uses a method by Devananda van der Veen described in
+    # http://www.mysqlperformanceblog.com/2011/04/18/how-to-use-tcpdump-on-very-busy-hosts/
+    # to sample the traffic.
+    #
+    # Requires that tcpdump is available in root's PATH. Also assumes root's shell is bash
+    # or supports equivalent syntax. Currently only works if mysqld running on port 3306.
+    #
+    # Warning: tcpdump can be taxing on the server, and also can generate rather large
+    # amounts of output! Also, will overwrite any previous file at the destination path!
+    def tcpdump!(duration=30, interface=false)
+      interface ||= Jetpants.private_interface
+      output "Using tcpdump to capture sample of MySQL traffic for #{duration} seconds"
+      tcpdump_options = "-i #{interface} -s 65535 -x -n -q -tttt 'port 3306 and tcp[1] & 7 == 2 and tcp[3] & 7 == 2'"
+      outfile = "#{Jetpants.export_location}/#{hostname}.dumpfile"
+      ssh_cmd "tcpdump #{tcpdump_options} > #{outfile} & export DUMP_PID=$! && sleep #{duration} && kill $DUMP_PID"
+      output "Completed capturing traffic sample"
+      "#{hostname}.dumpfile"
+    end
+
+
+  end
+end
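This 78-line hunk matches `data/plugins/upgrade_helper/db.rb`. Both new methods are building blocks for the pool-level tasks further down. A short sketch of calling them directly (the IP is a placeholder; `to_db` behaves as elsewhere in Jetpants):

    db = '10.1.1.20'.to_db

    # The temporary checksum user exists only inside the block, and is
    # dropped even if the block raises.
    db.with_pt_checksum_user do |user, pass|
      puts "pt-table-checksum can now connect as #{user}"
    end

    # Sample 60 seconds of traffic; returns the filename portion, with the
    # dumpfile left under Jetpants.export_location on the remote host.
    filename = db.tcpdump!(60)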
@@ -0,0 +1,22 @@
+module Jetpants
+  class Host
+
+    ##### NEW METHODS ##########################################################
+
+    # Converts tcpdump output into slowlog format using pt-query-digest. Requires that
+    # pt-query-digest is installed and in root's path. Returns the full path to the
+    # slowlog. Does not delete or remove the tcpdump output file.
+    #
+    # This is in Host instead of DB because it may be preferable to run this on
+    # the host running Jetpants, as opposed to the DB where the dumpfile came from,
+    # because pt-query-digest may be taxing to run on the server.
+    def dumpfile_to_slowlog(tcpdump_output_file_path, delete_tcpdumpfile=true)
+      slowlog_file_path = tcpdump_output_file_path.sub('.dumpfile', '') + '.slowlog'
+      ssh_cmd "pt-query-digest #{tcpdump_output_file_path} --type tcpdump --no-report --print >#{slowlog_file_path}"
+      ssh_cmd "rm #{tcpdump_output_file_path}" if delete_tcpdumpfile
+      slowlog_file_path
+    end
+
+
+  end
+end
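This 22-line hunk matches `data/plugins/upgrade_helper/host.rb`. One wrinkle worth noting: the docstring says the tcpdump output file is not removed, but `delete_tcpdumpfile` defaults to true, so by default it is deleted after conversion. A hypothetical local conversion, with a placeholder dumpfile name:

    local = Jetpants::Host.local
    slowlog = local.dumpfile_to_slowlog("#{Jetpants.export_location}/db42.dumpfile")
    # => "#{Jetpants.export_location}/db42.slowlog"; pass false as the
    #    second argument to keep the original dumpfile around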
@@ -0,0 +1,259 @@
+require 'open3'
+
+module Jetpants
+  class Pool
+    collins_attr_accessor :checksum_running
+
+    # Runs pt-table-checksum on the pool.
+    # Returns true if no problems found, false otherwise.
+    # If problems were found, the 'checksums' table will be
+    # left in the pool - the user must review and manually delete.
+    def checksum_tables
+      schema = master.app_schema
+      success = false
+      output_lines = []
+
+      # check if already running, or a previous run died
+      previous_run = collins_checksum_running
+      previous_run = nil if previous_run == ''
+      if previous_run
+        run_data = JSON.parse(previous_run.downcase) # hash with 'from_host', 'from_pid', 'timestamp'
+        previous_host = run_data['from_host'].to_host
+        previous_pid = run_data['from_pid'] or die 'No previous pid found in previous rundata?'
+        still_running = previous_host.pid_running?(previous_pid, 'pt-table-checksum')
+        raise "Checksum already in progress from #{previous_host}, pid=#{previous_pid}" if still_running
+        output "Previous failed run detected, will use --resume parameter"
+      end
+
+      # Determine what to pass to --max-load
+      master.output "Polling for normal max threads_running, please wait"
+      max_threads_running = master.max_threads_running
+      limit_threads_running = [(max_threads_running * 1.2).ceil, 50].max
+      master.output "Found max threads_running=#{max_threads_running}, will use limit of #{limit_threads_running}"
+
+      # Operate with a temporary user that has elevated permissions
+      master.with_pt_checksum_user do |username, password|
+        # Build command line
+        command_line = ['pt-table-checksum',
+                        '--no-check-replication-filters',
+                        "--databases #{schema}",
+                        "--host #{master.ip}",
+                        "--port #{master.port}",
+                        "--max-load Threads_running:#{limit_threads_running}",
+                        "--replicate #{schema}.checksums",
+                        "--replicate-database #{schema}",
+                        "--user #{username}",
+                        "--password #{password}"
+                       ].join ' '
+        command_line += ' --resume' if previous_run
+
+        # Spawn the process
+        Open3.popen3(command_line) do |stdin, stdout, stderr, wait_thread|
+          exit_code = nil
+          pid = wait_thread.pid
+          puts "Running pt-table-checksum targetting #{master}, pid on Jetpants host is #{pid}"
+
+          self.collins_checksum_running = {
+            'from_host' => Host.local.ip,
+            'from_pid'  => pid,
+            'timestamp' => Time.now.to_i,
+          }.to_json
+
+          # Display STDERR output in real-time, via a separate thread
+          Thread.new do
+            begin
+              stderr.each {|line| puts line}
+            rescue IOError, Interrupt
+              nil
+            end
+          end
+
+          # Capture STDOUT and buffer it; since this is the main thread, also
+          # watch out for broken pipe or ctrl-c
+          begin
+            stdout.each {|line| output_lines << line}
+            exit_code = wait_thread.value.to_i
+          rescue IOError, Interrupt => ex
+            puts "Caught exception #{ex.message}"
+            exit_code = 130 # by unix convention, return 128 + SIGINT
+          end
+
+          # Dump out stdout: first anything we buffered on our end, plus anything
+          # that Perl or the OS had buffered on its end
+          puts
+          output_lines.each {|line| puts line}
+          unless stdout.eof?
+            stdout.each {|line| puts line} rescue nil
+          end
+          puts
+
+          puts "Checksum completed with exit code #{exit_code}.\n"
+          success = (exit_code == 0)
+
+          # Run again with --replicate-check-only to display ALL diffs, including ones from
+          # prior runs of the tool.
+          puts 'Verifying all results via --replicate-check-only...'
+          output, diff_success = `#{command_line} --replicate-check-only`, $?.success?
+          if diff_success
+            puts 'No diffs found in any tables.'
+            puts output
+          else
+            puts 'Found diffs:'
+            puts output
+            success = false
+          end
+
+          # Drop the checksums table, but only if there were no diffs
+          if success
+            output "Dropping table #{schema}.checksums..."
+            master.connect(user: username, pass: password)
+            master.query('DROP TABLE checksums')
+            output "Table dropped."
+            master.disconnect
+            self.collins_checksum_running = ''
+          else
+            output 'Keeping checksums table in place for your review.'
+            output 'Please manually drop it when done.'
+          end
+          puts
+        end # popen3
+      end # with_pt_checksum_user
+      success
+    end
+
+
+    # Uses pt-upgrade to compare query performance and resultsets among nodes
+    # in a pool. Supply params:
+    # * a full path to a slowlog file
+    # * a boolean indicating whether or not you want to do an initial silent
+    #   run (results discarded) to populate the buffer pools on the nodes
+    # * Two or more nodes, or no nodes if you want to default to using the
+    #   pool's standby slaves
+    #
+    # Requires that pt-upgrade is in root's PATH on the node running Jetpants.
+    def compare_queries(slowlog_path, silent_run_first, *compare_nodes)
+      if compare_nodes.size == 0
+        compare_nodes = standby_slaves
+      else
+        compare_nodes.flatten!
+        raise "Supplied nodes must all be in this pool" unless compare_nodes.all? {|n| n == master || n.master == master}
+      end
+
+      # We need to create a temporary SUPER user on the nodes to compare
+      # Also attempt to silence warning 1592 about unsafe-for-replication statements if
+      # using Percona Server 5.5.10+ which supports this.
+      username = 'pt-upgrade'
+      password = DB.random_password
+      remove_suppress_1592 = []
+      compare_nodes.each do |node|
+        node.create_user username, password
+        node.grant_privileges username, '*', 'SUPER'
+        node.grant_privileges username, node.app_schema, 'ALL PRIVILEGES'
+
+        # We only want to try this if (a) the node supports log_warnings_suppress,
+        # and (b) the node isn't already suppressing warning 1592
+        if node.global_variables[:log_warnings_suppress] == ''
+          node.mysql_root_cmd "SET GLOBAL log_warnings_suppress = '1592'"
+          remove_suppress_1592 << node
+        end
+      end
+
+      node_text = compare_nodes.map {|s| s.to_s + ' (v' + s.normalized_version(3) + ')'}.join ' vs '
+      dsn_text = compare_nodes.map {|n| "h=#{n.ip},P=#{n.port},u=#{username},p=#{password},D=#{n.app_schema}"}.join ' '
+
+      # Do silent run if requested (to populate buffer pools)
+      if silent_run_first
+        output "Doing a silent run of pt-upgrade with slowlog #{slowlog_path} to populate buffer pool."
+        output "Comparing nodes #{node_text}..."
+        stdout, exit_code = `pt-upgrade --set-vars wait_timeout=10000 #{slowlog_path} #{dsn_text} 2>&1`, $?.to_i
+        output "pt-upgrade silent run completed with exit code #{exit_code}"
+        puts
+        puts
+      end
+
+      # Run pt-upgrade for real. Note that we only compare query times and results, NOT warnings,
+      # due to issues with warning 1592 causing a huge amount of difficult-to-parse output.
+      output "Running pt-upgrade with slowlog #{slowlog_path}"
+      output "Comparing nodes #{node_text}..."
+      stdout, exit_code = `pt-upgrade --set-vars wait_timeout=10000 --compare query_times,results #{slowlog_path} #{dsn_text} 2>&1`, $?.to_i
+      output stdout
+      puts
+      output "pt-upgrade completed with exit code #{exit_code}"
+
+      # Drop the SUPER user and re-enable logging of warning 1592
+      compare_nodes.each {|node| node.drop_user username}
+      remove_suppress_1592.each {|node| node.mysql_root_cmd "SET GLOBAL log_warnings_suppress = ''"}
+    end
+
+
+    # Collects query slowlog on the master (and one active slave, if there are any)
+    # using tcpdump, copies over to the host Jetpants is running on, converts to a
+    # slowlog, and then uses Pool#compare_queries to run pt-upgrade.
+    #
+    # The supplied *compare_nodes should be standby slaves, and you may omit them
+    # to automatically select two standby slaves (of different versions, if available)
+    #
+    # When comparing exactly two nodes, we stop replication on the nodes temporarily
+    # to ensure a consistent dataset for comparing query results. Otherwise, async
+    # replication can naturally result in false-positives.
+    def collect_and_compare_queries!(tcpdump_time=30, *compare_nodes)
+      # Sample traffic and convert to slowlog for master
+      master_dump_filename = master.tcpdump!(tcpdump_time)
+      local = Host.local # node where we're running Jetpants from
+      local.ssh_cmd "mkdir -p #{Jetpants.export_location}"
+      master.fast_copy_chain(Jetpants.export_location, local, files: master_dump_filename, overwrite: true)
+      master.ssh_cmd "rm #{Jetpants.export_location}/#{master_dump_filename}"
+      master_slowlog_path = local.dumpfile_to_slowlog("#{Jetpants.export_location}/#{master_dump_filename}")
+
+      # If we also have an active slave running, grab sampled slowlog from there too
+      active_slowlog_path = nil
+      if active_slaves.size > 0
+        active_slave = active_slaves.first
+        active_dump_filename = active_slave.tcpdump!(tcpdump_time)
+        active_slave.fast_copy_chain(Jetpants.export_location, local, files: active_dump_filename, overwrite: true)
+        active_slave.ssh_cmd "rm #{Jetpants.export_location}/#{active_dump_filename}"
+        active_slowlog_path = local.dumpfile_to_slowlog("#{Jetpants.export_location}/#{active_dump_filename}")
+      end
+
+      # Gather our comparison nodes
+      if compare_nodes.size == 0
+        higher_ver_standby = standby_slaves.select {|s| s.version_cmp(master) > 0}.first
+        same_ver_standby = standby_slaves.select {|s| s.version_cmp(master) == 0}.first
+        if higher_ver_standby && same_ver_standby
+          compare_nodes = [same_ver_standby, higher_ver_standby]
+        else
+          compare_nodes = standby_slaves[0, 2]
+        end
+      end
+
+      # Disable monitoring on our comparison nodes, and then stop replication
+      # at the same position. We only proceed with this if we're comparing
+      # exactly two nodes; this may be improved in a future release.
+      if compare_nodes.size == 2
+        compare_nodes.each {|n| n.disable_monitoring}
+        compare_nodes.first.pause_replication_with(compare_nodes.last)
+      end
+
+      # Run pt-upgrade using the master dumpfile
+      puts
+      output "COMPARISON VIA QUERY LOG FROM MASTER"
+      compare_queries(master_slowlog_path, true, *compare_nodes)
+
+      if active_slowlog_path
+        puts
+        output "COMPARISON VIA QUERY LOG FROM ACTIVE SLAVE"
+        compare_queries(active_slowlog_path, true, *compare_nodes)
+      end
+
+      # If we previously paused replication and disabled monitoring, un-do this
+      if compare_nodes.size == 2
+        compare_nodes.concurrent_each do |n|
+          n.resume_replication
+          n.catch_up_to_master
+          n.enable_monitoring
+        end
+      end
+    end
+
+  end
+end
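This 259-line hunk matches `data/plugins/upgrade_helper/pool.rb`, and its two public entry points back the `checksum_pool` and `check_pool_queries` commands added in the commandsuite hunk above. A sketch of driving them directly against a pool (the pool name is a placeholder):

    pool = Jetpants.topology.pool('user-db')

    # Returns true only if pt-table-checksum exits cleanly AND the
    # --replicate-check-only pass reports no diffs.
    puts 'replicas consistent' if pool.checksum_tables

    # Capture 30s of production traffic on the master (plus an active
    # slave, if present) and replay it via pt-upgrade on two standby
    # slaves, preferring a same-version/higher-version pair.
    pool.collect_and_compare_queries!(30)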