jetpants 0.7.0 → 0.7.2
- data/README.rdoc +6 -0
- data/bin/jetpants +100 -8
- data/doc/commands.rdoc +3 -1
- data/doc/faq.rdoc +117 -0
- data/doc/requirements.rdoc +4 -3
- data/lib/jetpants.rb +14 -14
- data/lib/jetpants/db/privileges.rb +5 -3
- data/lib/jetpants/db/replication.rb +12 -5
- data/lib/jetpants/db/state.rb +30 -4
- data/lib/jetpants/host.rb +2 -0
- data/lib/jetpants/pool.rb +78 -32
- data/lib/jetpants/shard.rb +10 -12
- metadata +23 -33
- data/tasks/promotion.rb +0 -260
data/README.rdoc
CHANGED
@@ -66,6 +66,12 @@ Other recommended uses of plugins include integration with your site's monitorin
 
 For more information on how to write plugins and use the Jetpants::CallbackHandler system, please see doc/plugins.rdoc ({view on GitHub}[https://github.com/tumblr/jetpants/blob/master/doc/plugins.rdoc])
 
+== FREQUENTLY ASKED QUESTIONS:
+
+Please see doc/faq.rdoc ({view on GitHub}[https://github.com/tumblr/jetpants/blob/master/doc/faq.rdoc]) for answers to common questions.
+
+If you have a question that isn't covered here, please feel free to email the authors at the addresses listed in jetpants.gemspec.
+
 == CREDITS:
 
 * <b>Evan Elias</b>: Lead developer. Core class implementations, shard split logic, plugin system
data/bin/jetpants
CHANGED
@@ -1,9 +1,7 @@
 #!/usr/bin/env ruby
 jetpants_base_dir = File.expand_path(File.dirname(__FILE__) + '/..')
 $:.unshift File.join(jetpants_base_dir, 'lib')
-%w[thor pry
-# load tasks
-Dir[File.join jetpants_base_dir, 'tasks', '**'].each {|f| require f}
+%w[thor pry highline/import terminal-table colored].each {|g| require g}
 
 module Jetpants
 
@@ -28,6 +26,7 @@ module Jetpants
       self.send "after_#{task_name}" if self.respond_to? "after_#{task_name}"
     end
 
+
     desc 'console', 'Jetpants interactive console'
     def console
       Jetpants.pry
@@ -42,13 +41,82 @@ module Jetpants
       print "\n#{message}\n\n"
     end
 
-
+
+    desc 'promotion', 'perform a master promotion, changing which node is the master of a pool'
     method_option :demote,  :desc => 'node to demote'
     method_option :promote, :desc => 'node to promote'
     def promotion
-
+      # It's not uncommon for the demoted master to be an offline/unavailable node, so relax Jetpants' normal
+      # checks regarding replication threads being in different states.
+      Jetpants.verify_replication = false
+
+      promoted = options[:promote] ? options[:promote].to_db : nil
+      demoted  = options[:demote]  ? options[:demote].to_db  : nil
+
+      if promoted && !demoted
+        error "Node to promote #{promoted} is not a slave" unless promoted.is_slave?
+        demoted = promoted.master
+        inform "Will demote #{demoted}, the master of specified promoted node #{promoted}."
+      end
+
+      if demoted
+        demoted.probe
+      else
+        demoted = ask_node 'Please enter the IP address of the node to demote:'
+        if demoted.running?
+          error 'Cannot demote a node that has no slaves!' unless demoted.has_slaves?
+        else
+          inform "Unable to connect to node #{demoted} to demote"
+          error "Unable to perform promotion" unless agree "Please confirm that #{demoted} is offline [yes/no]: "
+
+          # An asset-tracker plugin may have populated the slave list anyway
+          if demoted.slaves && demoted.slaves.count > 0
+            demoted.slaves.each {|s| s.probe}
+          else
+            replicas = ask("Please enter a comma-separated list of IP addresses of all current replicas of #{demoted}: ").split /\s*,\s*/
+            error "No replicas entered" unless replicas && replicas.count > 0
+            error "User supplied list of replicas appears to be invalid - #{replicas}" unless replicas.all? {|replica| is_ip? replica}
+            demoted.instance_eval {@slaves = replicas.map &:to_db}
+            demoted.slaves.each do |replica|
+              # Validate that they are really slaves of demoted
+              error "#{replica} does not appear to be a valid replica of #{demoted}" unless replica.master == demoted
+            end
+          end
+        end
+      end
+
+      puts
+      inform "Summary of affected pool"
+      inform "Binary log positions and slave lag shown below are just a snapshot taken at the current time." if demoted.running?
+      puts
+      demoted.pool(true).summary(true)
+      puts
+
+      unless promoted
+        if demoted.running?
+          inform "Recommendation: promote the standby slave with the highest binary log coordinates"
+        else
+          inform "Recommendation: promote the standby slave or active slave with the highest binary log coordinates"
+        end
+        promoted = ask_node 'Please enter the IP address of the node to promote: '
+      end
+
+      error "Unable to determine a node to demote and a node to promote" unless demoted.kind_of?(Jetpants::DB) && promoted.kind_of?(Jetpants::DB)
+      error "Node to promote #{promoted} is not a slave of node to demote #{demoted}" unless promoted.master == demoted
+      error "Cannot promote a backup slave. Please choose another." if promoted.for_backups?
+
+      inform "Going to DEMOTE existing master #{demoted} and PROMOTE new master #{promoted}."
+      error "Aborting." unless agree "Proceed? [yes/no]: "
+      demoted.pool(true).master_promotion! promoted
     end
-
+    def self.after_promotion
+      reminders(
+        'Commit/push the configuration in version control.',
+        'Deploy the configuration to all machines.',
+      )
+    end
+
+
     desc 'show_slaves', 'show the current slaves of a master'
     method_option :node, :desc => 'node to query for slaves'
     def show_slaves
@@ -70,7 +138,8 @@ module Jetpants
         inform "node (#{node}) currently has no slaves."
       end
     end
-
+
+
     desc 'show_master', 'show the current master of a node'
     method_option :node, :desc => 'node to query for master'
     method_option :siblings, :desc => 'show nodes current slave siblings'
@@ -97,7 +166,8 @@ module Jetpants
         inform "node (#{node}) does not appear to be a slave"
       end
     end
-
+
+
     desc 'node_info', 'show information about a given node'
     method_option :node, :desc => 'node to query for information'
    def node_info
@@ -146,11 +216,13 @@ module Jetpants
       end
     end
 
+
     desc 'regen_config', 'regenerate the application configuration'
     def regen_config
       Jetpants.topology.write_config
     end
 
+
     desc 'clone_slave', 'clone a standby slave'
     method_option :source, :desc => 'IP of node to clone from'
     method_option :target, :desc => 'IP of node to clone to'
@@ -180,6 +252,7 @@ module Jetpants
       )
     end
 
+
     desc 'activate_slave', 'turn a standby slave into an active slave'
     method_option :node, :desc => 'IP of standby slave to activate'
     def activate_slave
@@ -195,9 +268,11 @@ module Jetpants
       Jetpants.topology.write_config
     end
 
+
     desc 'weigh_slave', 'change the weight of an active slave'
     alias :weigh_slave :activate_slave
 
+
     desc 'pull_slave', 'turn an active slave into a standby slave'
     method_option :node, :desc => 'IP of active slave to pull'
     def pull_slave
@@ -209,6 +284,7 @@ module Jetpants
       Jetpants.topology.write_config
     end
 
+
     desc 'destroy_slave', 'remove a standby slave from its pool'
     method_option :node, :desc => 'IP of standby slave to remove'
     def destroy_slave
@@ -221,6 +297,7 @@ module Jetpants
       node.pool.remove_slave!(node)
     end
 
+
     desc 'rebuild_slave', 'export and re-import data set on a standby slave'
     method_option :node, :desc => 'IP of standby slave to rebuild'
     def rebuild_slave
@@ -233,6 +310,7 @@ module Jetpants
       node.rebuild!
     end
 
+
     desc 'shard_read_only', 'mark a shard as read-only'
     method_option :min_id, :desc => 'Minimum ID of shard to mark as read-only'
     def shard_read_only
@@ -244,6 +322,7 @@ module Jetpants
       Jetpants.topology.write_config
     end
 
+
     desc 'shard_offline', 'mark a shard as offline (not readable or writable)'
     method_option :min_id, :desc => 'Minimum ID of shard to mark as offline'
     def shard_offline
@@ -255,6 +334,7 @@ module Jetpants
       Jetpants.topology.write_config
     end
 
+
     desc 'shard_online', 'mark a shard as fully online (readable and writable)'
     method_option :min_id, :desc => 'Minimum ID of shard to mark as fully online'
     def shard_online
@@ -266,6 +346,7 @@ module Jetpants
       Jetpants.topology.write_config
     end
 
+
     desc 'shard_split', 'shard split step 1 of 4: spin up child pools with different portions of data set'
     method_option :min_id, :desc => 'Minimum ID of parent shard to split'
     method_option :max_id, :desc => 'Maximum ID of parent shard to split'
@@ -315,6 +396,7 @@ module Jetpants
       )
     end
 
+
     # This step is only really necessary if asset-tracker changes don't immediately reflect in application configuration.
     # (ie, if app configuration is a static file that needs to be deployed to webs.)
     desc 'shard_split_child_reads', 'shard split step 2 of 4: move reads to child shards'
@@ -330,6 +412,7 @@ module Jetpants
       )
     end
 
+
     desc 'shard_split_child_writes', 'shard split step 3 of 4: move writes to child shards'
     method_option :min_id, :desc => 'Minimum ID of parent shard being split'
     method_option :max_id, :desc => 'Maximum ID of parent shard being split'
@@ -351,6 +434,7 @@ module Jetpants
       )
     end
 
+
     desc 'shard_split_cleanup', 'shard split step 4 of 4: clean up data that replicated to wrong shard'
     method_option :min_id, :desc => 'Minimum ID of parent shard being split'
     method_option :max_id, :desc => 'Maximum ID of parent shard being split'
@@ -368,6 +452,7 @@ module Jetpants
       )
     end
 
+
     desc 'shard_cutover', 'truncate the current last shard range, and add a new shard after it'
     method_option :cutover_id, :desc => 'Minimum ID of new last shard being created'
     def shard_cutover
@@ -412,6 +497,7 @@ module Jetpants
       )
     end
 
+
     no_tasks do
       def is_ip? address
        address =~ /(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/
@@ -424,6 +510,12 @@ module Jetpants
       def inform message
         puts message.blue
       end
+
+      def ask_node(prompt)
+        node = ask prompt
+        error "Node (#{node}) does not appear to be an IP address." unless is_ip? node
+        node.to_db
+      end
     end
 
    def self.reminders(*strings)
data/doc/commands.rdoc
CHANGED
@@ -23,7 +23,9 @@ These commands change the type of a slave, or promote a slave to be a master. <
 
 <b><tt>jetpants promotion</tt></b> changes which node in a pool is the master by performing a full MySQL master promotion. This is usable even if the old master is offline or unavailable. All nodes in the pool will now slave off of the new master. If the old master is online/available, it will become a standby slave of the new master.
 
-Please note that the master promotion process enables global READ_ONLY mode on the old master. This is a required step of the
+Please note that the master promotion process enables global READ_ONLY mode on the old master. This is a required step of the most generic MySQL master promotion technique. After doing a promotion in \Jetpants, you'll need to update/deploy your application's configuration as quickly as possible, if a plugin doesn't do it automatically for you.
+
+Be aware that if the old master is offline/unavailable and the pool's slaves have replicated different amounts of data (ie, their relay logs progressed to different points at the exact moment the old master died), <tt>jetpants promotion</tt> may result in minor data inconsistencies (a couple seconds of writes) because these lost transactions are not automatically replayed on slaves that missed them. You can manually replay them using <tt>mysqlbinlog</tt>; this process is difficult to automate, which is why Jetpants and many other promotion tools do not do so. This may be implemented in a future release.
 
 <b><tt>jetpants activate_slave</tt></b> turns a standby slave into an active slave. Use this if you want to generate an app configuration that now sends read queries to a slave that formerly did not receive them.
 
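
As a hedged sketch of the manual <tt>mysqlbinlog</tt> replay mentioned above -- the positions, file name, and slave IP are placeholders you would read from the lagging slave's SHOW SLAVE STATUS and the old master's binlog directory:

  # Decode the old master's binary log from the byte offset the lagging
  # slave last executed, and apply the missed writes to that slave.
  start_pos = 107
  binlog    = '/var/lib/mysql/mysql-bin.000123'
  slave_ip  = '10.42.0.12'

  system("mysqlbinlog --start-position=#{start_pos} #{binlog} | mysql -h #{slave_ip}")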
data/doc/faq.rdoc
ADDED
@@ -0,0 +1,117 @@
+= Frequently Asked Questions
+
+== Is \Jetpants a server? How do I connect to it?
+
+\Jetpants is an automation toolkit, not a server. In this way it differs from most other large-scale MySQL sharding solutions, which tend to be middleware/proxy servers.
+
+The benefit of a toolkit is that you can still leverage standard MySQL replication, still use InnoDB/XtraDB as a robust storage engine choice, etc. \Jetpants largely doesn't interfere with any of that, and instead just provides tools to help you manage a large MySQL topology and support a range-based sharding scheme.
+
+
+== Is \Jetpants still useful if my architecture isn't sharded?
+
+Potentially, since \Jetpants fully supports "global" pools, also known as "functional partitions". You can even use \Jetpants to help manage a standard single-pool MySQL topology (1 master and some number of slaves) for handling common operations like slave cloning and master promotions. That said, there are other tools that may be easier to use if your MySQL footprint is smaller than, say, a dozen machines.
+
+However, \Jetpants is also very useful as a Ruby library for performing arbitrary data migrations. It comes with methods for quickly importing and exporting large amounts of data, so it can be used for this purpose regardless of what your database topology looks like.
+
+
+== What is a sharding key?
+
+A sharding key is a core foreign key column that is present in most of your large tables, which can be used to group your data into shards. For many sites this could be <tt>user_id</tt> or <tt>customer_id</tt>, but it depends entirely on your data model and access patterns.
+
+For example, on a blogging site the sharding key might be <tt>blog_id</tt>. Most tables that contain a <tt>blog_id</tt> column can be sharded, which will mean that all data related to a particular blog (posts, comments on those posts, authors, etc) is found on the same shard. By organizing data this way, you can continue to use relational operations such as JOIN when querying data that lives on the same shard.
+
+Regardless of sharding key, some tables will not be shardable. This includes any "global" table that doesn't contain your sharding key column, as well as any tables that have global lookup patterns. For this reason you might not be able to shard the core table which has your sharding_key as its primary key!
+
+In other words: if your sharding key is <tt>user_id</tt>, you might not actually be able to shard your <tt>users</tt> table because you need to do global lookups (ie, by email address) on this table. Denormalization is a common work-around; you could split your users table into a "global lookup" portion in a global pool and an "extended data" portion that lives on shards.
+
+
+== What is range-based sharding? Why use it, and what are the alternatives?
+
+Range-based sharding groups data based on ranges of your sharding key. For example, with a sharding key of <tt>user_id</tt>, all sharded data for users 1-1000 may be on the first shard, users 1001-3000 on the second shard, and users 3001-infinity on the third and final shard.
+
+The main benefit of range-based sharding is simplicity. You can express the shard ranges in a language-neutral format like YAML or JSON, and the code to route queries to the correct DB can be implemented in a trivially small amount of code. There's no need for a lookup service, so we avoid a single point of failure. It's also easy for a human to look at the ranges and figure out which DB to query when debugging a problem by hand.
+
+Rebalancing range-based shards can be accomplished quickly as long as the primary key of each table begins with the sharding key. InnoDB stores data in order of its primary key, which means it is extremely fast and efficient to dump out a portion of your data set based on a range of your sharding key.
+
+The main downside to the range-based approach is lack of even distribution of "hot" data. If a small handful of users on a given shard are using a disproportionate amount of resources, there's no way to move _only_ those users to a different shard. For this reason, range-based sharding can work best for "long-tail" sites where the majority of activity is created by the majority of common users.
+
+Some alternatives to the range-based approach include:
+
+* <b>Modulus or hash</b>: Apply a function to your sharding key to determine which shard the data lives on.
+
+  This approach helps to distribute data very evenly. Many sites find that their latest users behave differently than their oldest users, so grouping users together by ranges of ID (essentially ranges of account creation date) can be problematic. Using a modulus or hash avoids this problem.
+
+  The main issue with this approach is how to rebalance shards that are too large. A simple modulus can't do this unless you want to simultaneously split all of your shards in half, which leads to painful exponential growth. A hash function can be more versatile but can still lead to great complexity. Worse yet, there's no way to rebalance _quickly_ because data is not stored on disk in sorted order based on the hash function.
+
+* <b>Lookup table</b>: Use a separate service or data store which takes a sharding key value as an input and returns the appropriate shard as an output.
+
+  This scheme allows you to very specifically allocate particular data to shards, and works well for sites that have a lot of "hot" data from celebrity users. However, the lookup service is essentially a single point of failure, which counteracts many of the attractive features of sharded architectures. Rebalancing can also be slow and tricky, since you need a notion of "locking" a sharding key value while its rows are being migrated.
+
+
+== How does \Jetpants perform slave-cloning?
+
+\Jetpants clones slaves by stopping replication, shutting down the MySQL daemon, and then copying the raw files to the destination(s). This is the fastest way to get a consistent clone of a data set in MySQL. After the copy operation is complete, we start MySQL back up on the source and destinations, and then make the destination instances start slaving at the appropriate binlog coordinates.
+
+We perform the copy operation using a combination of tar (for archiving), pigz (for fast compression), and nc (for transferring the data over the network). If there are multiple destinations, we create a serial "copy chain" using tee and a fifo.
+
+Please note that we don't encrypt the data in this process, so we assume you are using it on a private LAN or over a VPN tunnel.
+
+Because this process shuts down MySQL, you can only use it on a standby slave. Never use it on a machine that is actively taking queries from your application. If you need to do that, use a hot-copy solution instead.
+
+
+== What are standby slaves? Why run two of them per pool?
+
+Standby slaves are standard MySQL replicas that your application doesn't send queries to. We recommend maintaining exactly 2 standby slaves in every single pool/shard for high availability reasons:
+
+* If a pool's master fails, you promote one standby slave to be the new master, and use the second standby slave to clone a replacement for the first standby slave.
+
+* If an active slave fails, promote one standby slave to be a new active slave in its place, and use the second standby slave to clone a replacement for the first.
+
+* If a standby slave fails, use the other standby slave to clone a replacement.
+
+In other words: as long as you have two standbys, you can recover from a single failure quickly, without needing to do a hot-copy (which is much slower). Faster recovery time = less time in a degraded state = lower chance that a second failure will occur while the pool is already degraded.
+
+Resist the temptation to send any queries from your application to your standby slaves. If your application's read requirements are high enough to require additional nodes, create more active slaves as needed, but don't repurpose the standbys without replacing them. Otherwise, if a machine fails, you'd no longer have enough capacity to serve normal traffic load or no longer have a way to quickly spin up replacement nodes.
+
+You can, however, use your standby slaves for creating backups, running ad-hoc batch/analytic queries, etc. You can also make _one_ of your standby slaves be a weaker class of hardware if desired, and just take care to only use that node for cloning slaves, never for directly promoting. \Jetpants supports this, and considers this type of slave to be a "backup slave".
+
+
+== When should I split a shard?
+
+Typically when some individual component on the shard's master is getting close to being full/saturated:
+
+* Disk is getting full, in terms of capacity -- 80%+ impacts performance for SSDs and eventually for most filesystems as well
+* Disk utilization (ie, what <tt>iostat</tt> shows you) is reaching 90%+
+* Network utilization is approaching your link's saturation point
+
+Depending on your type of disk and amount of RAM, you may find that the first two happen at roughly the same time. An increasingly large data set usually means your working set will exceed your amount of memory, so InnoDB's cache hit rate starts to drop, and your disk utilization starts creeping upwards.
+
+
+== Why does so much of the command suite functionality require an asset tracker plugin?
+
+For any given operation that requires an asset tracker, the reason is one of two:
+
+* The operation involves juggling a lot of servers. For example, a shard split needs to be able to obtain a minimum of 6 spare MySQL instances, and eventually turns the original shard's 3 MySQL instances into spares. Doing this kind of operation without an automated asset tracker can easily lead to major human error.
+
+* The operation inherently involves generating a new configuration for your application -- for example, setting a shard to read-only or promoting a standby slave to an active slave. These operations are meaningless outside of your application, since MySQL has no notion of "standby slave" or "degraded shard". \Jetpants has a notion of these things, but needs to persist the information somewhere, and it makes more sense to have \Jetpants relay this information to an external hardware management tool rather than maintain a separate (and potentially conflicting) source of truth.
+
+If you have enough servers to be using a sharded architecture, you hopefully already have some sort of hardware management / asset tracker system in place. \Jetpants is designed to be integrated with this system, but since every site runs something different, this requires writing some custom plugin code.
+
+
+== Can I use \Jetpants with PostgreSQL?
+
+The core functionality is currently very MySQL-specific. In theory a plugin could override a bunch of methods to target Postgres, and maybe even Redis or other persistent data stores with replication and import/export functionality. This would be a substantial effort though.
+
+At present, several methods have "mysql" in the name. These may change to more generic names in an upcoming release; in this case the old names will still be available as aliases to the new ones.
+
+
+== In the shard split process, why create the standby slaves AFTER doing the export / re-import?
+
+We do this to avoid replicating the LOAD DATA INFILE statements. Because MySQL replication is single-threaded, these statements won't execute in parallel on slaves, so the import process would be substantially slower. Instead, we create the new shard masters, do the export/import dance on those instances, and THEN clone their final data set to 2 new standby slaves each.
+
+This also allows us to disable binary logging during the import process, which is a very noticeable speed enhancement.
+
+
+== In the cleanup stage of a shard split, why not just remove unwanted data with a single DELETE statement?
+
+Because MySQL replication is single-threaded, it's a bad idea to execute single write queries that impact thousands of rows, since these will cause slaves to lag. Giant transactions are also not ideal in general due to how MVCC and rollbacks work in InnoDB.
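
One claim in the FAQ above is worth making concrete: range-based routing "can be implemented in a trivially small amount of code." The following is an illustrative Ruby sketch, not part of the Jetpants API; hosts and ranges are placeholders:

  # A range-based routing table like the FAQ describes.
  SHARDS = [
    { min_id: 1,    max_id: 1000,            host: '10.42.1.1' },
    { min_id: 1001, max_id: 3000,            host: '10.42.1.2' },
    { min_id: 3001, max_id: Float::INFINITY, host: '10.42.1.3' },
  ]

  # Pick the shard whose [min_id, max_id] range covers the sharding key.
  def shard_for(user_id)
    SHARDS.find {|s| user_id >= s[:min_id] && user_id <= s[:max_id]}
  end

  shard_for(2500)[:host]   # => "10.42.1.2"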
data/doc/requirements.rdoc
CHANGED
@@ -6,10 +6,11 @@ Plugins may freely override these assumptions, and upstream patches are very wel
 
 == Environment
 
-*
-*
+* Ruby 1.9.2 or higher
+* MySQL (or Percona Server), specifically version 5.1 or higher.
+* a RHEL/CentOS distribution of Linux.
   * It should be easy to write a plugin supporting another distribution. The main change might be overriding Jetpants::Host#service, if your distribution doesn't have <tt>/sbin/service</tt>.
-*
+* InnoDB / Percona XtraDB for storage engine. \Jetpants has not been tested with MyISAM, since \Jetpants is geared towards huge tables, and MyISAM is generally a bad fit.
 * All MySQL instances run on port 3306, with only one instance per logical machine.
   * A plugin could override this easily, but would require you to use the --report-host option on all slaves, so that crawling the replication topology is possible. It would also have to override various methods that specify the MySQL init script location, config file location, data directory, etc.
   * Since there's no "standard" layout for multi-instance MySQL, this won't ever be part of the \Jetpants core, but we may include one implementation as a bundled plugin in a future release.
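
On the distribution point above: a plugin for a non-RHEL system would mainly reopen Jetpants::Host and override #service. The sketch below assumes an (operation, name) style signature and the existing Host#ssh_cmd helper; treat both as assumptions for illustration rather than a documented contract:

  # Hypothetical plugin snippet for a systemd-based distribution
  # that lacks /sbin/service.
  module Jetpants
    class Host
      def service(operation, name)
        # run e.g. "systemctl start mysql" over SSH instead of /sbin/service
        ssh_cmd "systemctl #{operation} #{name}"
      end
    end
  end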
data/lib/jetpants.rb
CHANGED
@@ -17,21 +17,21 @@ module Jetpants
   # Establish default configuration values, and then merge in whatever we find globally
   # in /etc/jetpants.yaml and per-user in ~/.jetpants.yaml
   @config = {
-    'max_concurrency'         => 40,
-    'standby_slaves_per_pool' => 2,
-    'mysql_schema'            => 'test',
-    'mysql_app_user'          =>
-    'mysql_app_password'      =>
-    'mysql_repl_user'         =>
-    'mysql_repl_password'     =>
-    'mysql_root_password'     => false,
+    'max_concurrency'         => 40,            # max threads/conns per database
+    'standby_slaves_per_pool' => 2,             # number of standby slaves in every pool
+    'mysql_schema'            => 'test',        # database name
+    'mysql_app_user'          => 'appuser',     # mysql user for application
+    'mysql_app_password'      => '',            # mysql password for application
+    'mysql_repl_user'         => 'repluser',    # mysql user for replication
+    'mysql_repl_password'     => '',            # mysql password for replication
+    'mysql_root_password'     => false,         # mysql root password. omit if specified in /root/.my.cnf instead.
     'mysql_grant_ips'         => ['192.168.%'], # mysql user manipulations are applied to these IPs
-    'mysql_grant_privs'       => ['ALL'],
-    'export_location'         => '/tmp',
-    'verify_replication'      => true,
-    'plugins'                 => {},
-    'ssh_keys'                => nil,
-    'sharded_tables'          => [],
+    'mysql_grant_privs'       => ['ALL'],       # mysql user manipulations grant this set of privileges by default
+    'export_location'         => '/tmp',        # directory to use for data dumping
+    'verify_replication'      => true,          # raise exception if the 2 repl threads are in different states, or if actual repl topology differs from Jetpants' understanding of it
+    'plugins'                 => {},            # hash of plugin name => arbitrary plugin data (usually a nested hash of settings)
+    'ssh_keys'                => nil,           # array of SSH key file locations
+    'sharded_tables'          => [],            # array of name => {sharding_key=>X, chunks=>Y} hashes
   }
   %w(/etc/jetpants.yaml ~/.jetpants.yml ~/.jetpants.yaml).each do |path|
     overrides = YAML.load_file(File.expand_path path) rescue {}
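
Since the defaults above are merged with /etc/jetpants.yaml and ~/.jetpants.yaml, a minimal override file might look like the following; all values are placeholders, not recommendations:

  # /etc/jetpants.yaml -- placeholder values
  max_concurrency: 20
  mysql_schema: myapp_production
  mysql_app_user: myapp
  mysql_app_password: example_secret
  mysql_grant_ips: ['10.42.%']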
data/lib/jetpants/db/privileges.rb
CHANGED
@@ -68,19 +68,21 @@ module Jetpants
   # been split.
   def revoke_all_access!
     user_name = Jetpants.app_credentials[:user]
-
-
+    enable_read_only!
+    output "Revoking access for user #{user_name}."
     output(drop_user(user_name, true)) # drop the user without replicating the drop statement to slaves
   end
 
   # Enables global read-only mode on the database.
-  def
+  def enable_read_only!
+    output "Enabling global read_only mode"
     mysql_root_cmd 'SET GLOBAL read_only = 1' unless read_only?
     read_only?
   end
 
   # Disables global read-only mode on the database.
   def disable_read_only!
+    output "Disabling global read_only mode"
     mysql_root_cmd 'SET GLOBAL read_only = 0' if read_only?
     not read_only?
   end
data/lib/jetpants/db/replication.rb
CHANGED
@@ -37,7 +37,7 @@ module Jetpants
              "MASTER_USER='#{repl_user}', " +
              "MASTER_PASSWORD='#{repl_pass}'"
 
-    output "Changing master to #{new_master} with coordinates (#{logfile}, #{pos})
+    output "Changing master to #{new_master} with coordinates (#{logfile}, #{pos}). #{result}"
     @master.slaves.delete(self) if @master rescue nil
     @master = new_master
     @repl_paused = true
@@ -67,7 +67,7 @@ module Jetpants
   def disable_replication!
     raise "This DB object has no master" unless master
     output "Disabling replication; this db is no longer a slave."
-    output mysql_root_cmd "STOP SLAVE; RESET SLAVE"
+    output mysql_root_cmd "STOP SLAVE; CHANGE MASTER TO master_host=''; RESET SLAVE"
     @master.slaves.delete(self) rescue nil
     @master = nil
     @repl_paused = nil
@@ -84,6 +84,7 @@ module Jetpants
   def enslave!(targets, repl_user=false, repl_pass=false)
     repl_user ||= (Jetpants.replication_credentials[:user] || replication_credentials[:user])
     repl_pass ||= (Jetpants.replication_credentials[:pass] || replication_credentials[:pass])
+    disable_monitoring
     pause_replication if master && ! @repl_paused
     file, pos = binlog_coordinates
     clone_to!(targets)
@@ -95,6 +96,7 @@ module Jetpants
                      password: repl_pass )
     end
     resume_replication if @master # should already have happened from the clone_to! restart anyway, but just to be explicit
+    enable_monitoring
   end
 
   # Wipes out the target instances and turns them into slaves of self's master.
@@ -138,10 +140,10 @@ module Jetpants
   # database. Only useful when called on a master. This is the current
   # instance's own binlog coordinates, NOT the coordinates of replication
   # progress on a slave!
-  def binlog_coordinates
+  def binlog_coordinates(display_info=true)
     hash = mysql_root_cmd('SHOW MASTER STATUS', :parse=>true)
     raise "Cannot obtain binlog coordinates of this master because binary logging is not enabled" unless hash[:file]
-    output "Own binlog coordinates are (#{hash[:file]}, #{hash[:position].to_i})."
+    output "Own binlog coordinates are (#{hash[:file]}, #{hash[:position].to_i})." if display_info
     [hash[:file], hash[:position].to_i]
   end
 
@@ -149,7 +151,8 @@ module Jetpants
   # as reported by SHOW SLAVE STATUS.
   def seconds_behind_master
     raise "This instance is not a slave" unless master
-    slave_status[:seconds_behind_master]
+    lag = slave_status[:seconds_behind_master]
+    lag == 'NULL' ? nil : lag.to_i
   end
 
   # Waits for this instance's SECONDS_BEHIND_MASTER to reach 0 and stay at
@@ -175,6 +178,10 @@ module Jetpants
         return true
       end
       sleep poll_frequency
+    elsif lag.nil?
+      resume_replication
+      sleep 1
+      raise "Unable to restart replication" if seconds_behind_master.nil?
     else
       output "Currently #{lag} seconds behind master."
       times_at_zero = 0
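
The seconds_behind_master change above normalizes MySQL's string 'NULL' (a stopped or broken replication thread) to nil, so callers can branch on it. A usage sketch with a placeholder IP:

  slave = '10.42.0.12'.to_db
  lag = slave.seconds_behind_master
  if lag.nil?
    # SHOW SLAVE STATUS reported NULL: the replication thread isn't running
    slave.resume_replication
  else
    puts "#{slave} is #{lag} seconds behind its master"
  end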
data/lib/jetpants/db/state.rb
CHANGED
@@ -61,6 +61,7 @@ module Jetpants
       probe_running
       probe_master
       probe_slaves
+      self
     end
 
     # Alias for probe(true)
@@ -144,10 +145,34 @@ module Jetpants
     end
 
     # Returns the Jetpants::Pool that this instance belongs to, if any.
-
-
+    # Can optionally create an anonymous pool if no pool was found. This anonymous
+    # pool intentionally has a blank sync_configuration implementation.
+    def pool(create_if_missing=false)
+      result = Jetpants.topology.pool(self) || Jetpants.topology.pool(master)
+      if !result && create_if_missing
+        pool_master = master || self
+        result = Pool.new('anon_pool_' + pool_master.ip.tr('.', ''), pool_master)
+        def result.sync_configuration; end
+      end
+      return result
     end
 
+    # Determines the DB's role in its pool. Returns either :master,
+    # :active_slave, :standby_slave, or :backup_slave.
+    #
+    # Note that we consider a node with no master and no slaves to be
+    # a :master, since we can't determine if it had slaves but they're
+    # just offline/dead, vs it being an orphaned machine.
+    def role
+      p = pool
+      case
+      when !@master then :master
+      when for_backups? then :backup_slave
+      when p && p.active_slave_weights[self] then :active_slave  # if pool in topology, determine based on expected/ideal state
+      when !p && !is_standby? then :active_slave                 # if pool missing from topology, determine based on actual state
+      else :standby_slave
+      end
+    end
 
     ###### Private methods #####################################################
 
@@ -178,8 +203,9 @@ module Jetpants
         raise "#{self}: #{message}" if Jetpants.verify_replication
         output message
         pause_replication
+      else
+        @repl_paused = (status[:slave_io_running].downcase == 'no')
       end
-      @repl_paused = (status[:slave_io_running].downcase == 'no')
     end
   end
 
@@ -202,7 +228,7 @@ module Jetpants
     processes.grep(/Binlog Dump/).concurrent_each do |p|
       tokens = p.split
       ip, dummy = tokens[2].split ':'
-      db =
+      db = ip.to_db
      db.probe
      slaves_mutex.synchronize {@slaves << db if db.master == self}
    end
data/lib/jetpants/host.rb
CHANGED
@@ -25,6 +25,8 @@ module Jetpants
     end
 
     def initialize(ip)
+      # Only supporting ipv4 for now
+      raise "Invalid IP address: #{ip}" unless ip =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
      @ip = ip
      @connection_pool = [] # array of idle Net::SSH::Connection::Session objects
      @lock = Mutex.new
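
Note that the new validation accepts any string containing an IPv4-shaped substring, because the pattern is unanchored. A stricter variant -- an editorial sketch, not what 0.7.2 ships -- would anchor the whole string:

  # "foo1.2.3.4bar" passes the unanchored check above but fails this one.
  def looks_like_ipv4?(ip)
    !!(ip =~ /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/)
  end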
data/lib/jetpants/pool.rb
CHANGED
@@ -94,7 +94,7 @@ module Jetpants
   # returns a flat array of all Jetpants::DB objects in the pool: the master and
   # all slaves of all types.
   def nodes
-    [master, slaves].flatten
+    [master, slaves].flatten.compact
   end
 
   # Informs Jetpants that slave_db is an active slave. Potentially used by
@@ -152,52 +152,96 @@ module Jetpants
   # of returning a string, so that you can invoke something like:
   #   Jetpants.topology.pools.each &:summary
   # to easily display a summary.
-  def summary
+  def summary(extended_info=false)
     probe
-
-
+
+    alias_text = @aliases.count > 0 ? ' (aliases: ' + @aliases.join(', ') + ')' : ''
+    data_size = @master.running? ? "[#{master.data_set_size(true)}GB]" : ''
+    print "#{name}#{alias_text}  #{data_size}\n"
+
+    if extended_info
+      details = {}
+      nodes.concurrent_each do |s|
+        if !s.running?
+          details[s] = {coordinates: ['unknown'], lag: 'N/A'}
+        elsif s == @master
+          details[s] = {coordinates: s.binlog_coordinates(false), lag: 'N/A'}
+        else
+          details[s] = {coordinates: s.repl_binlog_coordinates(false), lag: s.seconds_behind_master.to_s + 's'}
+        end
+      end
     end
-
-
+
+    binlog_pos = extended_info ? details[@master][:coordinates].join(':') : ''
+    print "\tmaster          = %-13s %-30s %s\n" % [@master.ip, @master.hostname, binlog_pos]
+
     [:active, :standby, :backup].each do |type|
       slave_list = slaves(type)
-      slave_list.each_with_index do |s, i|
-
+      slave_list.sort.each_with_index do |s, i|
+        binlog_pos = extended_info ? details[s][:coordinates].join(':') : ''
+        slave_lag = extended_info ? "lag=#{details[s][:lag]}" : ''
+        print "\t%-7s slave #{i + 1} = %-13s %-30s %-26s %s\n" % [type, s.ip, s.hostname, binlog_pos, slave_lag]
       end
     end
     true
   end
 
-  #
-  # as a stand-alone method; there's other necessary logic, such as setting
-  # the old master to read-only mode, and doing a STOP SLAVE on all slaves.
-  # Use the "jetpants promotion" task instead to do an interactive promotion.
-  # (In a future release, this will be refactored to be fully scriptable.)
+  # Demotes the pool's existing master, promoting a slave in its place.
   def master_promotion!(promoted)
     demoted = @master
-    raise "
-
-
-
-
-
+    raise "Demoted node is already the master of this pool!" if demoted == promoted
+    raise "Promoted host is not in the right pool!" unless demoted.slaves.include?(promoted)
+
+    output "Preparing to demote master #{demoted} and promote #{promoted} in its place."
+
+    # If demoted machine is available, confirm it is read-only and binlog isn't moving,
+    # and then wait for slaves to catch up to this position
+    if demoted.running?
+      demoted.enable_read_only! unless demoted.read_only?
+      raise "Unable to enable global read-only mode on demoted machine" unless demoted.read_only?
+      coordinates = demoted.binlog_coordinates
+      raise "Demoted machine still taking writes (from superuser or replication?) despite being read-only" unless coordinates == demoted.binlog_coordinates
+      demoted.slaves.concurrent_each do |s|
+        while true do
+          sleep 1
+          break if s.repl_binlog_coordinates == coordinates
+          output "Still catching up to coordinates of demoted master"
+        end
+      end
+
+    # Demoted machine not available -- wait for slaves' binlogs to stop moving
     else
-
+      demoted.slaves.concurrent_each do |s|
+        progress = s.repl_binlog_coordinates
+        while true do
+          sleep 1
+          break if s.repl_binlog_coordinates == progress
+          s.output "Still catching up on replication"
+        end
+      end
+    end
+
+    # Stop replication on all slaves
+    replicas = demoted.slaves.dup
+    replicas.each do |s|
+      s.pause_replication if s.replicating?
     end
+    raise "Unable to stop replication on all slaves" if replicas.any? {|s| s.replicating?}
+
+    user, password = promoted.replication_credentials.values
+    log, position  = promoted.binlog_coordinates
+
+    # reset slave on promoted, and make sure read_only is disabled
+    promoted.disable_replication!
+    promoted.disable_read_only!
 
     # gather our new replicas
-    replicas
-    replicas << demoted if demoted.
-
+    replicas.delete promoted
+    replicas << demoted if demoted.running?
+
     # perform promotion
-    replicas.each do |
-
-      :user => user,
-      :password => password,
-      :log_file => log,
-      :log_pos => position
+    replicas.each do |r|
+      r.change_master_to promoted, user: user, password: password, log_file: log, log_pos: position
     end
 
     # ensure our replicas are configured correctly by comparing our staged values to current values of replicas
@@ -215,13 +259,15 @@ module Jetpants
     end
 
     # Update the pool
-    # Note: if the demoted machine is
+    # Note: if the demoted machine is not available, plugin may need to implement an
     # after_master_promotion! method which handles this case in configuration tracker
     @active_slave_weights.delete promoted # if promoting an active slave, remove it from read pool
     @master = promoted
     sync_configuration
     Jetpants.topology.write_config
 
+    output "Promotion complete. Pool master is now #{promoted}."
+
     replicas.all? {|r| r.replicating?}
   end
 
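
Usage sketch for the extended summary added above; passing true enables the per-node binlog-coordinate and lag columns (the IP is a placeholder):

  Jetpants.topology.pools.each {|p| p.summary(true)}

  # or for a single pool discovered from a node, as the promotion task does:
  '10.42.0.10'.to_db.pool(true).summary(true)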
data/lib/jetpants/shard.rb
CHANGED
@@ -169,7 +169,7 @@ module Jetpants
 
     init_children(pieces) unless @children.count > 0
 
-    @children.concurrent_each {|c| c.
+    @children.concurrent_each {|c| c.disable_binary_logging}
     clone_to_children!
     @children.concurrent_each {|c| c.rebuild!}
     @children.each {|c| c.sync_configuration}
@@ -225,25 +225,22 @@ module Jetpants
 
   # Exports data that should stay on this shard, drops and re-creates tables,
   # re-imports the data, and then adds slaves to the shard pool as needed.
-
-  # useful if you're running this manually and it failed part-way.
-  def rebuild!(stage=0)
+  def rebuild!
     # Sanity check
     raise "Cannot rebuild a shard that isn't still slaving from another shard" unless @master.is_slave?
     raise "Cannot rebuild an active shard" if in_config?
 
+    stop_query_killer
     tables = Table.from_config 'sharded_tables'
 
-    if
-    raise "Shard is not in the expected initializing or exporting states" unless [:initializing, :exporting].include? @state
+    if [:initializing, :exporting].include? @state
      @state = :exporting
      sync_configuration
      export_schemata tables
      export_data tables, @min_id, @max_id
    end
 
-    if
-    raise "Shard is not in the expected exporting or importing states" unless [:exporting, :importing].include? @state
+    if [:exporting, :importing].include? @state
      @state = :importing
      sync_configuration
      import_schemata!
@@ -252,8 +249,7 @@ module Jetpants
      start_query_killer
    end
 
-    if
-    raise "Shard is not in the expected importing or replicating states" unless [:importing, :replicating].include? @state
+    if [:importing, :replicating].include? @state
      enable_binary_logging
      restart_mysql
      @state = :replicating
@@ -262,6 +258,8 @@ module Jetpants
      enslave!(my_slaves)
      my_slaves.each {|slv| slv.resume_replication}
      [self, my_slaves].flatten.each {|db| db.catch_up_to_master}
+    else
+      raise "Shard not in a state compatible with calling rebuild! (current state=#{@state})"
    end
 
    @state = :child
@@ -298,8 +296,8 @@ module Jetpants
   end
 
   # Displays information about the shard
-  def summary(with_children=true)
-    super()
+  def summary(extended_info=false, with_children=true)
+    super(extended_info)
     if with_children
       children.each {|c| c.summary}
     end
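
The rebuild! refactor above replaces the old stage argument with dispatch on the shard's persisted state, so a failed run can simply be re-invoked. A sketch, assuming the topology exposes a shard lookup by ID range (as the command suite's min_id/max_id options suggest); the IDs are placeholders:

  shard = Jetpants.topology.shard(1001, 3000)
  # state was left at :importing by the failed run, so the export
  # phase is skipped and the import phase is re-entered
  shard.rebuild!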
metadata
CHANGED
@@ -2,7 +2,7 @@
 name: jetpants
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.7.0
+  version: 0.7.2
 platform: ruby
 authors:
 - Evan Elias
@@ -11,7 +11,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2012-06-
+date: 2012-06-18 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mysql2
@@ -47,7 +47,7 @@ dependencies:
   type: :runtime
   version_requirements: *id003
 - !ruby/object:Gem::Dependency
-  name:
+  name: pry
   prerelease: false
   requirement: &id004 !ruby/object:Gem::Requirement
     none: false
@@ -58,7 +58,7 @@ dependencies:
   type: :runtime
   version_requirements: *id004
 - !ruby/object:Gem::Dependency
-  name:
+  name: thor
   prerelease: false
   requirement: &id005 !ruby/object:Gem::Requirement
     none: false
@@ -69,7 +69,7 @@ dependencies:
   type: :runtime
   version_requirements: *id005
 - !ruby/object:Gem::Dependency
-  name:
+  name: highline
   prerelease: false
   requirement: &id006 !ruby/object:Gem::Requirement
     none: false
@@ -80,7 +80,7 @@ dependencies:
   type: :runtime
   version_requirements: *id006
 - !ruby/object:Gem::Dependency
-  name:
+  name: terminal-table
   prerelease: false
   requirement: &id007 !ruby/object:Gem::Requirement
     none: false
@@ -91,7 +91,7 @@ dependencies:
   type: :runtime
   version_requirements: *id007
 - !ruby/object:Gem::Dependency
-  name:
+  name: colored
   prerelease: false
   requirement: &id008 !ruby/object:Gem::Requirement
     none: false
@@ -101,17 +101,6 @@ dependencies:
     version: "0"
   type: :runtime
   version_requirements: *id008
-- !ruby/object:Gem::Dependency
-  name: colored
-  prerelease: false
-  requirement: &id009 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: "0"
-  type: :runtime
-  version_requirements: *id009
 description: Jetpants is an automation toolkit for handling monstrously large MySQL database topologies. It is geared towards common operational tasks like cloning slaves, rebalancing shards, and performing master promotions. It features a command suite for easy use by operations staff, though it's also a full Ruby library for use in developing custom migration scripts and database automation.
 email:
 - me@evanelias.com
@@ -122,39 +111,40 @@ extensions: []
 
 extra_rdoc_files:
 - README.rdoc
-- doc/plugins.rdoc
 - doc/configuration.rdoc
-- doc/
+- doc/faq.rdoc
 - doc/requirements.rdoc
+- doc/commands.rdoc
+- doc/plugins.rdoc
 files:
 - Gemfile
 - README.rdoc
-- doc/plugins.rdoc
 - doc/configuration.rdoc
-- doc/
+- doc/faq.rdoc
 - doc/requirements.rdoc
--
--
-- lib/jetpants/
-- lib/jetpants/db/state.rb
+- doc/commands.rdoc
+- doc/plugins.rdoc
+- lib/jetpants/monkeypatch.rb
 - lib/jetpants/db/import_export.rb
 - lib/jetpants/db/privileges.rb
 - lib/jetpants/db/client.rb
 - lib/jetpants/db/replication.rb
-- lib/jetpants/
+- lib/jetpants/db/server.rb
+- lib/jetpants/db/state.rb
 - lib/jetpants/db.rb
-- lib/jetpants/
+- lib/jetpants/shard.rb
 - lib/jetpants/pool.rb
-- lib/jetpants/monkeypatch.rb
 - lib/jetpants/table.rb
+- lib/jetpants/topology.rb
+- lib/jetpants/callback.rb
 - lib/jetpants/host.rb
 - lib/jetpants.rb
 - bin/jetpants
-- plugins/simple_tracker/topology.rb
-- plugins/simple_tracker/shard.rb
-- plugins/simple_tracker/simple_tracker.rb
 - plugins/simple_tracker/db.rb
+- plugins/simple_tracker/shard.rb
 - plugins/simple_tracker/pool.rb
--
+- plugins/simple_tracker/simple_tracker.rb
+- plugins/simple_tracker/topology.rb
 - etc/jetpants.yaml.sample
 homepage: https://github.com/tumblr/jetpants/
 licenses: []
data/tasks/promotion.rb
DELETED
@@ -1,260 +0,0 @@
-module Jetpants
-  module Tasks
-    class Promotion
-
-      def initialize nodes = {}
-        @demoted = nodes['demote']
-        @promoted = nodes['promote']
-        super
-        Jetpants.verify_replication = false # since master may be offline
-        advise
-        establish_roles
-        prepare
-      end
-
-      def error message
-        abort ['ERROR:'.red, message].join ' '
-      end
-
-      def inform message
-        puts message.blue
-      end
-
-      def is_ip? address
-        address =~ /(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/
-      end
-
-      def establish_roles
-        establish_demoted
-        establish_replicas
-        establish_promoted
-      end
-
-      def establish_demoted
-        # derive demoted from promoted if possible
-        if @promoted and not @demoted
-          error "invalid ip address #{@promoted}" unless is_ip? @promoted
-          @promoted = Jetpants::DB.new @promoted
-
-          # bail if the promoted node isn't a slave or we can't connect
-          unless @promoted.is_slave?
-            error "node (#{@promoted}) does not appear to be a replica of another node"
-          end rescue error("unable to connect to node #{@promoted} to promote")
-
-          # recommend a node to demote
-          agreed = agree [
-            "Would you like to demote the following node?",
-            "address: #{@promoted.master}",
-            "slaves : #{@promoted.master.slaves.join(', ')}",
-            "- yes/no -"
-          ].join "\n"
-          error "unable to promote #{@promoted} unless you demote #{@promoted.master}" unless agreed
-
-          @demoted = @promoted.master.ip
-        end
-
-        # unable to derive demoted, so ask and convert to a DB object
-        unless @demoted.kind_of? Jetpants::DB
-          @demoted = ask 'Please enter the node to demote:' unless @demoted
-          error "Invalid IP address #{@demoted}" unless is_ip? @demoted
-          @demoted = @demoted.to_db
-        end
-
-        # connect and ensure node is a master; handle offline nodes appropriately
-        if @demoted.available?
-          error 'Cannot demote a node that has no slaves!' unless @demoted.has_slaves?
-        else
-          inform "unable to connect to node #{@demoted} to demote"
-          error "unable to perform promotion" unless agree "please confirm that #{@demoted} is offline: yes/no "
-          @replicas = @demoted.slaves # An asset-tracker plugin may have populated the slave list anyway
-          if !@replicas || @replicas.count < 1
-            replicas = ask "please provide a comma separated list of current replicas of #{@demoted}: ", lambda {|replicas| replicas.split /,\s*/}
-            error "user supplied list of replicas appears to be invalid - #{replicas}" unless replicas.all? {|replica| is_ip? replica}
-            @replicas = replicas.collect {|replica| replica.to_db}
-
-            # ensure they were replicas of @demoted
-            @replicas.each do |replica|
-              error "#{replica} does not appear to be a valid replica of #{@demoted}" unless replica.master == @demoted
-            end
-          end
-        end
-
-        error 'unable to establish demoteable node' unless @demoted.kind_of? Jetpants::DB
-      end
-
-      def establish_replicas
-        @replicas ||= @demoted.slaves
-        error 'no replicas to promote' if @replicas.empty?
-        error 'replicas appear to be invalid' unless @replicas.all? {|replica| replica.kind_of? Jetpants::DB}
-        inform "#{@demoted} has the following replicas: #{@replicas.join(', ')}"
-      end
-
-      def establish_promoted
-        # user supplied node to promote
-        if @promoted and not @promoted.kind_of? Jetpants::DB
-          error "invalid ip address #{@promoted}" unless is_ip? @promoted
-          @promoted = Jetpants::DB.new @promoted
-        end
-
-        # user hasn't supplied a valid node to promote
-        unless @replicas.include? @promoted
-          inform "unable to promote node (#{@promoted}) that is not a replica of #{@demoted}" if @promoted
-
-          # recommend a node
-          puts "\nREPLICA LIST:"
-          @replicas.sort_by {|replica| replica.seconds_behind_master}.each do |node|
-            file, pos = node.repl_binlog_coordinates(false)
-            puts "  * %-13s %-30s lag: %2ds  coordinates: (%-13s, %d)" % [node.ip, node.hostname, node.seconds_behind_master, file, pos]
-          end
-          puts
-          recommended = @replicas.sort_by {|replica| replica.seconds_behind_master}.reject {|r| r.for_backups?}.first
-          agreed = agree [
-            "Would you like to promote the following replica?",
-            "#{recommended.ip} (#{recommended.hostname})",
-            "- yes/no -"
-          ].join "\n"
-          @promoted = recommended if agreed
-
-          # choose a new node if they disagreed with our recommendation
-          unless agreed
-            choose do |promote|
-              promote.prompt = 'Please choose a replica to promote:'
-              @replicas.each do |replica|
-                promote.choice "#{replica} - replication lag: #{replica.seconds_behind_master} seconds" do
-                  @promoted = replica
-                end
-              end
-            end
-            raise "You chose a backup slave. These are not suitable for promotion. Please try again." if @promoted.for_backups?
-          end
-        end
-
-        error "unable to establish node to promote" unless @promoted.kind_of? Jetpants::DB
-      end
-
-      def advise
-        @states = {
-          preparing:  "processing promotion requirements",
-          prepared:   "preparing to disable writes on #{@demoted}",
-          read_only:  "writes have been disabled on #{@demoted}, preparing to demote #{@demoted} and promote #{@promoted}",
-          promoted:   "#{@promoted} has been promoted, please prepare database config for deploy.",
-          deployable: "promotion is complete, please commit and deploy.",
-        }
-        inform @states[@state.to_sym]
-      end
-
-      state_machine :initial => :preparing do
-        after_transition any => any, :do => :advise
-
-        event :prepare do
-          transition :preparing => :prepared, :if => :roles_populated?
-        end
-        after_transition :preparing => :prepared, :do => :disable_writes
-
-        event :disable_writes do
-          transition :prepared => :read_only, :if => :read_only!
-        end
-        after_transition :prepared => :read_only, :do => :promote
-
-        event :promote do
-          transition :read_only => :promoted, :if => :execute_promotion
-        end
-        after_transition :read_only => :promoted, :do => :prepare_config
-
-        event :prepare_config do
-          transition :promoted => :deployable, :if => :nodes_consistent?
-        end
-        after_transition :promoted => :deployable, :do => :summarize_promotion
-
-        state :preparing, :prepared do
-          def is_db? node
-            node.kind_of? Jetpants::DB
-          end
-
-          def roles_populated?
-            # ensure our roles are populated with dbs
-            [@demoted, @promoted, @replicas].all? do |role|
-              is_db? role or role.all? do |node|
-                is_db? node
-              end
-            end
-          end
-
-          def read_only!
-            unless @demoted.available?
-              status = @promoted.slave_status
-              @log, @position = status[:master_log_file], status[:exec_master_log_pos].to_i
-              return true
-            end
-
-            # set read_only if needed
-            @demoted.read_only! unless @demoted.read_only?
-            # bail if we're unable to set read_only
-            error "unable to set 'read_only' on #{@demoted}" unless @demoted.read_only?
-            # record the current log position to ensure writes are not taking place later.
-            @log, @position = @demoted.binlog_coordinates
-            error "#{@demoted} is still taking writes, unable to promote #{@promoted}" unless writes_disabled?
-            @demoted.read_only?
-          end
-
-          def writes_disabled?
-            return true unless @demoted.available?
-
-            # ensure no writes have been logged since read_only!
-            [@log, @position] == @demoted.binlog_coordinates
-          end
-
-        end
-
-        state :read_only, :promoted, :promoted, :deployable do
-          def nodes_consistent?
-            return true unless @demoted.available?
-            @replicas.all? {|replica| replica.slave_status[:exec_master_log_pos].to_i == @position}
-          end
-
-          def ensure_nodes_consistent?
-            inform "ensuring replicas are in a consistent state"
-            until nodes_consistent? do
-              print '.'
-              sleep 0.5
-            end
-            nodes_consistent?
-          end
-
-          def promotable?
-            disable_replication if ensure_nodes_consistent? and @promoted.disable_read_only!
-          end
-
-          def execute_promotion
-            error 'nodes are not in a promotable state.' unless promotable?
-            error 'replicas are not in a consistent state' unless nodes_consistent?
-
-            @demoted.pool.master_promotion! @promoted
-          end
-
-          def replicas_replicating? replicas = @replicas
-            replicas.all? {|replica| replica.replicating?}
-          end
-
-          def disable_replication replicas = @replicas
-            replicas.each do |replica|
-              replica.pause_replication if replica.replicating?
-            end
-            not replicas_replicating? replicas
-          end
-
-          def summarize_promotion transition
-            summary = Terminal::Table.new :title => 'Promotion Summary:' do |rows|
-              rows << ['demoted', @demoted]
-              rows << ['promoted', @promoted]
-              rows << ["replicas of #{@promoted}", @promoted.slaves.join(', ')]
-            end
-            puts summary
-            exit
-          end
-        end
-      end
-
-    end
-  end
-end