jetpants 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,146 @@
1
module Jetpants

  # The Table class associates a table name with a column (or list of columns)
  # making up the table's sharding key or primary key. It is geared towards
  # generating SQL for importing/exporting a table, NOT for representing an
  # application data model.
  #
  # None of these methods actually *execute* the SQL they generate, since the
  # Table class isn't tied to a specific DB. This allows us to represent the set
  # of all sharded tables with a single set of Table objects, without having to
  # duplicate those objects for every Shard or DB instance. If you want to run
  # the generated SQL on a database, use one of the DB#query* methods.
  class Table
    include CallbackHandler

    # Name of the table as it exists in your database.
    attr_reader :name

    # Your application's sharding_key is the column used to determine which rows
    # live on which shard. Generally this should be the same logical value for your
    # entire application (example: id column of the User table), although the column
    # name need not be identical across tables (one may call it 'user_id', another
    # could call it 'target_user_id' or 'from_user_id'.) The sharding_keys attribute
    # stores the name of that column for this particular table.
    #
    # For a sharded table, sharding_keys should generally be a single column,
    # represented here as a single string.
    #
    # Jetpants supports mapping-tables with multiple sharding key columns (for
    # instance, if user_id is your app's sharding key, a "following" table mapping
    # one user_id to another). However this makes exports and cleanup extremely
    # inefficient, so its use is not recommended.
    #
    # For a non-sharded table, simply set sharding_keys to the first column of
    # the table's primary key. This is sufficient to make chunked exports/imports
    # work properly.
    attr_reader :sharding_keys

    # Jetpants supports doing import and export operations in parallel "chunks" of
    # the data set. For tables with few rows, this is irrelevant and can be left at
    # the default of 1 (meaning no chunking). For tables with hundreds of millions
    # of rows, you may want to do exports/imports in a few hundred chunks to speed
    # things up and keep the transactions smaller.
    attr_reader :chunks

    # Create a Table. Params should have string keys, not symbols. Possible keys include
    # 'sharding_key' (or equivalently 'primary_key' / 'primary_keys'), 'chunks', and
    # 'order_by'.
    def initialize(name, params={})
      @name = name
      # Accept the synonymous primary-key param names; default to 'user_id' if none given.
      params['sharding_key'] ||= params['primary_keys'] || params['primary_key'] || 'user_id'
      # Normalize to an array so single- and multi-sharding-key tables are handled uniformly.
      @sharding_keys = (params['sharding_key'].is_a?(Array) ? params['sharding_key'] : [params['sharding_key']])
      @chunks = params['chunks'] || 1
      @order_by = params['order_by']
    end

    # Return an array of Table objects based on the contents of Jetpants' config file entry
    # of the given label.
    def Table.from_config(label)
      Jetpants.send(label).map {|name, attributes| Table.new name, attributes}
    end

    def to_s
      return @name
    end

    # Returns the SQL for performing a data export of a given ID range
    def sql_export_range(min_id=false, max_id=false)
      outfile = export_file_path min_id, max_id
      sql = "SELECT * FROM #{@name} "

      if min_id || max_id
        # Each clause string deliberately ends in a trailing space, so that
        # joining with 'OR ' produces correctly-spaced SQL.
        clauses = case
        when min_id && max_id then @sharding_keys.collect {|col| "(#{col} >= #{min_id} AND #{col} <= #{max_id}) "}
        when min_id then @sharding_keys.collect {|col| "#{col} >= #{min_id} "}
        when max_id then @sharding_keys.collect {|col| "#{col} <= #{max_id} "}
        end
        sql << "WHERE " + clauses.join('OR ')
      end

      sql << "ORDER BY #{@order_by} " if @order_by
      sql << "INTO OUTFILE '#{outfile}'"
    end
    alias sql_export_all sql_export_range

    # Returns the SQL necessary to load the table's data.
    # Note that we use an IGNORE on multi-sharding-key tables. This is because
    # we get duplicate rows between export chunk files in this case.
    def sql_import_range(min_id=false, max_id=false)
      outfile = export_file_path min_id, max_id
      ignore = (@sharding_keys.count > 1 && (min_id || max_id) ? ' IGNORE' : '')
      sql = "LOAD DATA INFILE '#{outfile}'#{ignore} INTO TABLE #{@name} CHARACTER SET binary"
    end
    alias sql_import_all sql_import_range

    # Returns the SQL necessary to iterate over a given sharding key by ID -- returns
    # the next ID desired. Useful when performing a cleanup operation over a sparse
    # ID range.
    def sql_cleanup_next_id(sharding_key, id, direction)
      if direction == :asc
        "SELECT MIN(#{sharding_key}) FROM #{@name} WHERE #{sharding_key} > #{id}"
      elsif direction == :desc
        "SELECT MAX(#{sharding_key}) FROM #{@name} WHERE #{sharding_key} < #{id}"
      else
        raise "Unknown direction parameter #{direction}"
      end
    end

    # Returns the SQL necessary to clean rows that shouldn't be on this shard.
    # Pass in a sharding key and the min/max allowed ID on the shard, and get back
    # a SQL DELETE statement. When running that statement, pass in an ID (obtained
    # from sql_cleanup_next_id) as a bind variable.
    def sql_cleanup_delete(sharding_key, min_keep_id, max_keep_id)
      sql = "DELETE FROM #{@name} WHERE #{sharding_key} = ?"

      # if there are multiple sharding cols, we need to be more careful to keep rows
      # where the OTHER sharding col(s) do fall within the shard's range
      @sharding_keys.each do |other_col|
        next if other_col == sharding_key
        sql << " AND NOT (#{other_col} >= #{min_keep_id} AND #{other_col} <= #{max_keep_id})"
      end

      return sql
    end

    # Counts number of rows between the given ID ranges. Warning: will give
    # potentially misleading counts on multi-sharding-key tables.
    def sql_count_rows(min_id, max_id)
      sql = "SELECT COUNT(*) FROM #{@name} WHERE "
      wheres = []
      @sharding_keys.each {|col| wheres << "(#{col} >= #{min_id} AND #{col} <= #{max_id})"}
      sql << wheres.join(" OR ")
    end

    # Returns a file path (as a String) for the export dumpfile of the given ID range.
    def export_file_path(min_id=false, max_id=false)
      case
      when min_id && max_id then "#{Jetpants.export_location}/#{@name}#{min_id}-#{max_id}.out"
      when min_id then "#{Jetpants.export_location}/#{@name}#{min_id}-and-up.out"
      when max_id then "#{Jetpants.export_location}/#{@name}start-#{max_id}.out"
      else "#{Jetpants.export_location}/#{@name}-full.out"
      end
    end

  end
end
@@ -0,0 +1,144 @@
1
module Jetpants

  # Topology maintains a list of all DB pools/shards, and is responsible for
  # reading/writing configurations and manages spare box assignments.
  # Much of this behavior needs to be overridden by a plugin to actually be
  # useful. The implementation here is just a stub.
  class Topology
    attr_reader :pools

    def initialize
      @pools = [] # array of Pool objects
      load_pools
    end

    ###### Class methods #######################################################

    # Metaprogramming hackery to create a "synchronized" method decorator
    @lock = Mutex.new
    @do_sync = false
    @synchronized_methods = {} # symbol => true
    class << self
      # Decorator that causes the next method to be wrapped in a mutex
      # (only affects the next method definition, not ALL subsequent method
      # definitions)
      # If the method is subsequently overridden by a plugin, the new version
      # will be synchronized as well, even if the decorator is omitted.
      def synchronized
        @do_sync = true
      end

      def method_added(name)
        if @do_sync || @synchronized_methods[name]
          lock = @lock
          @do_sync = false
          @synchronized_methods[name] = false # prevent infinite recursion from the following line
          alias_method "#{name}_without_synchronization".to_sym, name
          define_method name do |*args|
            result = nil
            lock.synchronize {result = send "#{name}_without_synchronization".to_sym, *args}
            result
          end
          @synchronized_methods[name] = true # remember it is synchronized, to re-apply wrapper if method overridden by a plugin
        end
      end
    end


    ###### Overrideable methods ################################################
    # Plugins should override these if the behavior is needed. (Note that plugins
    # don't need to repeat the "synchronized" decorator; it automatically
    # applies to overrides.)

    synchronized
    # Plugin should override so that this reads in a configuration and initializes
    # @pools as appropriate.
    def load_pools
      puts "\nNotice: no plugin has overridden Topology#load_pools, so no pools are imported automatically"
    end

    synchronized
    # Plugin should override so that it writes a configuration file or commits a
    # configuration change to a config service.
    def write_config
      puts "\nNotice: no plugin has overridden Topology#write_config, so configuration data is not saved"
    end

    synchronized
    # Plugin should override so that this returns an array of [count] Jetpants::DB
    # objects, or throws an exception if not enough left.
    # Options hash is plugin-specific. The only assumed option used by the rest of
    # Jetpants is :role of 'MASTER' or 'STANDBY_SLAVE', for grabbing hardware
    # suited for a particular purpose. This can be ignored if your hardware is
    # entirely uniform and/or a burn-in process is already performed on all new
    # hardware intakes.
    def claim_spares(count, options={})
      raise "Plugin must override Topology#claim_spares"
    end

    synchronized
    # Plugin should override so that this returns a count of spare machines
    # matching the selected options.
    def count_spares(options={})
      raise "Plugin must override Topology#count_spares"
    end


    ###### Accessors ###########################################################

    # Returns array of this topology's Jetpants::Pool objects of type Jetpants::Shard
    def shards
      @pools.select {|p| p.is_a? Shard}
    end

    # Returns array of this topology's Jetpants::Pool objects that are NOT of type Jetpants::Shard
    def functional_partitions
      @pools.reject {|p| p.is_a? Shard}
    end

    # Finds and returns a single Jetpants::Pool. Target may be a name (string) or master (DB object).
    def pool(target)
      if target.is_a?(DB)
        @pools.select {|p| p.master == target}.first
      else
        @pools.select {|p| p.name == target}.first
      end
    end

    # Finds and returns a single Jetpants::Shard. Pass in one of these:
    # * a min ID and a max ID
    # * just a min ID
    # * a Range object
    def shard(*args)
      if args.count == 2 || args[0].is_a?(Array)
        args.flatten!
        args.map! {|x| x.to_s.upcase == 'INFINITY' ? 'INFINITY' : x.to_i}
        shards.select {|s| s.min_id == args[0] && s.max_id == args[1]}.first
      elsif args[0].is_a?(Range)
        shards.select {|s| s.min_id == args[0].min && s.max_id == args[0].max}.first
      else
        result = shards.select {|s| s.min_id == args[0].to_i}
        raise "Multiple shards found with that min_id!" if result.count > 1
        result.first
      end
    end

    # Returns the Jetpants::Shard that handles the given ID.
    def shard_for_id(id)
      # Use the shards accessor method (there is no @shards ivar -- using one
      # here previously raised NoMethodError on nil).
      shards.select {|s| s.min_id <= id && (s.max_id == 'INFINITY' || s.max_id >= id)}[0]
    end

    # Returns the Jetpants::DB that handles the given ID with the specified
    # mode (either :read or :write)
    def shard_db_for_id(id, mode=:read)
      shard_for_id(id).db(mode)
    end

    # Nicer interface into claim_spares when only one DB is desired -- returns
    # a single Jetpants::DB object instead of an array.
    def claim_spare(options={})
      claim_spares(1, options)[0]
    end

  end
end
@@ -0,0 +1,23 @@
1
module Jetpants
  class DB

    ##### CALLBACKS ############################################################

    # Determine master from asset tracker if machine is unreachable or MySQL isn't running.
    def after_probe_master
      return if @running
      pool, role = Jetpants.topology.tracker.determine_pool_and_role(@ip, @port)
      @master = (role == 'MASTER') ? false : pool.master
    end

    # Determine slaves from asset tracker if machine is unreachable or MySQL isn't running
    def after_probe_slaves
      return if @running
      @slaves = Jetpants.topology.tracker.determine_slaves(@ip, @port)
    end

  end
end
23
+
@@ -0,0 +1,70 @@
1
module Jetpants
  class Pool

    ##### METHOD OVERRIDES #####################################################

    # This actually re-writes ALL the tracker json. With a more dynamic
    # asset tracker (something backed by a database, for example) this
    # wouldn't be necessary - instead Pool#sync_configuration could just
    # update the info for the current pool (self) only.
    def sync_configuration
      Jetpants.topology.update_tracker_data
    end

    # If the pool's master hasn't been probed yet, return active_slaves list
    # based strictly on what we found in the asset tracker. This is a major
    # speed-up at start-up time, especially for tasks that need to iterate
    # over all pools' active slaves only, such as Topology#write_config.
    alias :active_slaves_from_probe :active_slaves
    def active_slaves
      return active_slaves_from_probe if @master.probed?
      @active_slave_weights.keys
    end


    ##### NEW CLASS-LEVEL METHODS ##############################################

    # Converts a hash (from asset tracker json file) into a Pool.
    def self.from_hash(h)
      return nil unless h['master']
      pool = Pool.new(h['name'], h['master'].to_db)
      pool.master_read_weight = h['master_read_weight']
      pool.slave_name = h['slave_name']
      h['aliases'].each {|alias_name| pool.has_alias alias_name}
      h['slaves'].each do |slave_info|
        db = slave_info['host'].to_db
        pool.has_active_slave(db, slave_info['weight']) if slave_info['role'] == 'ACTIVE_SLAVE'
      end
      pool
    end


    ##### NEW METHODS ##########################################################

    # Converts a Pool to a hash, for use in either the internal asset tracker
    # json (for_app_config=false) or for use in the application config file yaml
    # (for_app_config=true)
    def to_hash(for_app_config=false)
      slave_data = if for_app_config
        # The application only cares about read targets and their weights.
        active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight}}
      else
        # The tracker json records every slave along with its role.
        actives  = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight, 'role' => 'ACTIVE_SLAVE'}}
        standbys = standby_slaves.map {|db| {'host' => db.to_s, 'role' => 'STANDBY_SLAVE'}}
        backups  = backup_slaves.map {|db| {'host' => db.to_s, 'role' => 'BACKUP_SLAVE'}}
        actives + standbys + backups
      end

      {
        'name'               => name,
        'aliases'            => aliases,
        'slave_name'         => slave_name,
        'master'             => master.to_s,
        'master_read_weight' => master_read_weight || 0,
        'slaves'             => slave_data
      }
    end

  end
end
@@ -0,0 +1,76 @@
1
module Jetpants
  class Shard < Pool

    ##### CALLBACKS ############################################################

    # After changing the state of a shard, sync config back to the asset tracker json
    def after_state=(value)
      sync_configuration
    end


    ##### NEW CLASS-LEVEL METHODS ##############################################

    # Converts a hash (from asset tracker json file) into a Shard.
    def self.from_hash(h)
      # we just return the shard for now... we have to wait until later to
      # set up children + parents, since it's easier to grab the corresponding
      # objects once all pools have been initialized.
      Shard.new(h['min_id'], h['max_id'], h['master'], h['state'].to_sym)
    end

    # Sets up parent/child relationships for the shard represented by the
    # supplied hash.
    def self.assign_relationships(h, all_shards)
      return unless h['parent']

      # figure out which shard corresponds to hash h
      min_id = h['min_id'].to_i
      max_id = (h['max_id'].to_s.upcase == 'INFINITY' ? 'INFINITY' : h['max_id'].to_i)
      shard = all_shards.select {|s| s.min_id == min_id && s.max_id == max_id}.first

      # now figure out which one is the parent, and assign parent/child relationship
      parent = all_shards.select {|s| s.name == h['parent']}.first
      raise "Cannot find parent shard #{h['parent']}" unless parent
      parent.add_child shard
    end


    ##### NEW METHODS ##########################################################

    # Converts a Shard to a hash, for use in either the internal asset tracker
    # json (for_app_config=false) or for use in the application config file yaml
    # (for_app_config=true)
    def to_hash(for_app_config=false)

      if for_app_config
        # Ignore shards that shouldn't receive queries from the application
        return nil unless in_config?
        me = {'min_id' => min_id.to_i, 'max_id' => max_id == 'INFINITY' ? max_id : max_id.to_i}

        # We need to correctly handle child shards (which still have writes sent
        # to their parent), read-only shards, and offline shards appropriately.
        return me.merge case state
          when :ready, :needs_cleanup then {'host' => master.ip}
          when :child then {'host_read' => master.ip, 'host_write' => parent.master.ip}
          when :read_only then {'host_read' => master.ip, 'host_write' => false}
          when :offline then {'host' => false}
        end
      else
        slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight, 'role' => 'ACTIVE_SLAVE'}} +
          standby_slaves.map {|db| {'host' => db.to_s, 'role' => 'STANDBY_SLAVE'}} +
          backup_slaves.map {|db| {'host' => db.to_s, 'role' => 'BACKUP_SLAVE'}}
        return {
          'min_id' => min_id,
          'max_id' => max_id,
          'parent' => parent ? parent.to_s : nil,
          'state' => state,
          # Serialize master as a string, consistent with Pool#to_hash; the raw
          # DB object would otherwise be dumped via its default representation.
          'master' => master.to_s,
          'slaves' => slave_data,
        }
      end
    end


  end
end