jetpants 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
+ module Jetpants
+
+   # The Table class associates a table name with a column (or list of columns)
+   # making up the table's sharding key or primary key. It is geared towards
+   # generating SQL for importing/exporting a table, NOT for representing an
+   # application data model.
+   #
+   # None of these methods actually *execute* the SQL they generate, since the
+   # Table class isn't tied to a specific DB. This allows us to represent the set
+   # of all sharded tables with a single set of Table objects, without having to
+   # duplicate those objects for every Shard or DB instance. If you want to run
+   # the generated SQL on a database, use one of the DB#query* methods.
+   class Table
+     include CallbackHandler
+
+     # Name of the table as it exists in your database.
+     attr_reader :name
+
+     # Your application's sharding_key is the column used to determine which rows
+     # live on which shard. Generally this should be the same logical value for your
+     # entire application (example: id column of the User table), although the column
+     # name need not be identical across tables (one may call it 'user_id', another
+     # could call it 'target_user_id' or 'from_user_id'.) The sharding_keys attribute
+     # stores the name of that column for this particular table.
+     #
+     # For a sharded table, sharding_keys should generally be a single column,
+     # represented here as a single string.
+     #
+     # Jetpants supports mapping-tables with multiple sharding key columns (for
+     # instance, if user_id is your app's sharding key, a "following" table mapping
+     # one user_id to another). However this makes exports and cleanup extremely
+     # inefficient, so its use is not recommended.
+     #
+     # For a non-sharded table, simply set sharding_keys to the first column of
+     # the table's primary key. This is sufficient to make chunked exports/imports
+     # work properly.
+     attr_reader :sharding_keys
+
+     # Jetpants supports doing import and export operations in parallel "chunks" of
+     # the data set. For tables with few rows, this is irrelevant and can be left at
+     # the default of 1 (meaning no chunking). For tables with hundreds of millions
+     # of rows, you may want to do exports/imports in a few hundred chunks to speed
+     # things up and keep the transactions smaller.
+     attr_reader :chunks
+
+     # Create a Table. Params should have string keys, not symbols. Possible keys include
+     # 'sharding_key' (or equivalently 'primary_key' / 'primary_keys'), 'chunks', and 'order_by'.
+     def initialize(name, params={})
+       @name = name
+       params['sharding_key'] ||= params['primary_keys'] || params['primary_key'] || 'user_id'
+       @sharding_keys = (params['sharding_key'].is_a?(Array) ? params['sharding_key'] : [params['sharding_key']])
+       @chunks = params['chunks'] || 1
+       @order_by = params['order_by']
+     end
+
+     # Return an array of Table objects based on the contents of Jetpants' config file entry
+     # of the given label.
+     def Table.from_config(label)
+       Jetpants.send(label).map {|name, attributes| Table.new name, attributes}
+     end
+
+     def to_s
+       return @name
+     end
+
+     # Returns the SQL for performing a data export of a given ID range
+     def sql_export_range(min_id=false, max_id=false)
+       outfile = export_file_path min_id, max_id
+       sql = "SELECT * FROM #{@name} "
+
+       if min_id || max_id
+         clauses = case
+         when min_id && max_id then @sharding_keys.collect {|col| "(#{col} >= #{min_id} AND #{col} <= #{max_id}) "}
+         when min_id then @sharding_keys.collect {|col| "#{col} >= #{min_id} "}
+         when max_id then @sharding_keys.collect {|col| "#{col} <= #{max_id} "}
+         end
+         sql << "WHERE " + clauses.join('OR ')
+       end
+
+       sql << "ORDER BY #{@order_by} " if @order_by
+       sql << "INTO OUTFILE '#{outfile}'"
+     end
+     alias sql_export_all sql_export_range
+
+     # Returns the SQL necessary to load the table's data.
+     # Note that we use an IGNORE on multi-sharding-key tables. This is because
+     # we get duplicate rows between export chunk files in this case.
+     def sql_import_range(min_id=false, max_id=false)
+       outfile = export_file_path min_id, max_id
+       ignore = (@sharding_keys.count > 1 && (min_id || max_id) ? ' IGNORE' : '')
+       sql = "LOAD DATA INFILE '#{outfile}'#{ignore} INTO TABLE #{@name} CHARACTER SET binary"
+     end
+     alias sql_import_all sql_import_range
+
+     # Returns the SQL necessary to iterate over a given sharding key by ID -- returns
+     # the next ID desired. Useful when performing a cleanup operation over a sparse
+     # ID range.
+     def sql_cleanup_next_id(sharding_key, id, direction)
+       if direction == :asc
+         "SELECT MIN(#{sharding_key}) FROM #{@name} WHERE #{sharding_key} > #{id}"
+       elsif direction == :desc
+         "SELECT MAX(#{sharding_key}) FROM #{@name} WHERE #{sharding_key} < #{id}"
+       else
+         raise "Unknown direction parameter #{direction}"
+       end
+     end
+
+     # Returns the SQL necessary to clean rows that shouldn't be on this shard.
+     # Pass in a sharding key and the min/max allowed ID on the shard, and get back
+     # a SQL DELETE statement. When running that statement, pass in an ID (obtained
+     # from sql_cleanup_next_id) as a bind variable.
+     def sql_cleanup_delete(sharding_key, min_keep_id, max_keep_id)
+       sql = "DELETE FROM #{@name} WHERE #{sharding_key} = ?"
+
+       # if there are multiple sharding cols, we need to be more careful to keep rows
+       # where the OTHER sharding col(s) do fall within the shard's range
+       @sharding_keys.each do |other_col|
+         next if other_col == sharding_key
+         sql << " AND NOT (#{other_col} >= #{min_keep_id} AND #{other_col} <= #{max_keep_id})"
+       end
+
+       return sql
+     end
+
+     # Counts number of rows between the given ID ranges. Warning: will give
+     # potentially misleading counts on multi-sharding-key tables.
+     def sql_count_rows(min_id, max_id)
+       sql = "SELECT COUNT(*) FROM #{@name} WHERE "
+       wheres = []
+       @sharding_keys.each {|col| wheres << "(#{col} >= #{min_id} AND #{col} <= #{max_id})"}
+       sql << wheres.join(" OR ")
+     end
+
+     # Returns a file path (as a String) for the export dumpfile of the given ID range.
+     def export_file_path(min_id=false, max_id=false)
+       case
+       when min_id && max_id then "#{Jetpants.export_location}/#{@name}#{min_id}-#{max_id}.out"
+       when min_id then "#{Jetpants.export_location}/#{@name}#{min_id}-and-up.out"
+       when max_id then "#{Jetpants.export_location}/#{@name}start-#{max_id}.out"
+       else "#{Jetpants.export_location}/#{@name}-full.out"
+       end
+     end
+
+   end
+ end
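
A quick sketch of how the class above is meant to be used (the table name, sharding key, and ID range here are hypothetical; as the class comment notes, nothing is executed, and the generated strings would be run via one of the DB#query* methods):

    table = Jetpants::Table.new('messages', 'sharding_key' => 'user_id', 'chunks' => 4)

    table.sql_export_range(1, 1000)
    # => "SELECT * FROM messages WHERE (user_id >= 1 AND user_id <= 1000) INTO OUTFILE '<export_location>/messages1-1000.out'"

    table.sql_import_range(1, 1000)
    # => "LOAD DATA INFILE '<export_location>/messages1-1000.out' INTO TABLE messages CHARACTER SET binary"

Here <export_location> stands in for whatever Jetpants.export_location returns in your configuration.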
@@ -0,0 +1,144 @@
+ module Jetpants
+
+   # Topology maintains a list of all DB pools/shards, and is responsible for
+   # reading/writing configurations and managing spare box assignments.
+   # Much of this behavior needs to be overridden by a plugin to actually be
+   # useful. The implementation here is just a stub.
+   class Topology
+     attr_reader :pools
+
+     def initialize
+       @pools = [] # array of Pool objects
+       load_pools
+     end
+
+     ###### Class methods #######################################################
+
+     # Metaprogramming hackery to create a "synchronized" method decorator
+     @lock = Mutex.new
+     @do_sync = false
+     @synchronized_methods = {} # symbol => true
+     class << self
+       # Decorator that causes the next method to be wrapped in a mutex
+       # (only affects the next method definition, not ALL subsequent method
+       # definitions)
+       # If the method is subsequently overridden by a plugin, the new version
+       # will be synchronized as well, even if the decorator is omitted.
+       def synchronized
+         @do_sync = true
+       end
+
+       def method_added(name)
+         if @do_sync || @synchronized_methods[name]
+           lock = @lock
+           @do_sync = false
+           @synchronized_methods[name] = false # prevent infinite recursion from the following line
+           alias_method "#{name}_without_synchronization".to_sym, name
+           define_method name do |*args|
+             result = nil
+             lock.synchronize {result = send "#{name}_without_synchronization".to_sym, *args}
+             result
+           end
+           @synchronized_methods[name] = true # remember it is synchronized, to re-apply wrapper if method overridden by a plugin
+         end
+       end
+     end
+
+
+     ###### Overrideable methods ################################################
+     # Plugins should override these if the behavior is needed. (Note that plugins
+     # don't need to repeat the "synchronized" decorator; it automatically
+     # applies to overrides.)
+
+     synchronized
+     # Plugin should override so that this reads in a configuration and initializes
+     # @pools as appropriate.
+     def load_pools
+       puts "\nNotice: no plugin has overridden Topology#load_pools, so no pools are imported automatically"
+     end
+
+     synchronized
+     # Plugin should override so that it writes a configuration file or commits a
+     # configuration change to a config service.
+     def write_config
+       puts "\nNotice: no plugin has overridden Topology#write_config, so configuration data is not saved"
+     end
+
+     synchronized
+     # Plugin should override so that this returns an array of [count] Jetpants::DB
+     # objects, or raises an exception if not enough spares are available.
+     # Options hash is plugin-specific. The only assumed option used by the rest of
+     # Jetpants is :role of 'MASTER' or 'STANDBY_SLAVE', for grabbing hardware
+     # suited for a particular purpose. This can be ignored if your hardware is
+     # entirely uniform and/or a burn-in process is already performed on all new
+     # hardware intakes.
+     def claim_spares(count, options={})
+       raise "Plugin must override Topology#claim_spares"
+     end
+
+     synchronized
+     # Plugin should override so that this returns a count of spare machines
+     # matching the selected options.
+     def count_spares(options={})
+       raise "Plugin must override Topology#count_spares"
+     end
+
+
+     ###### Accessors ###########################################################
+
+     # Returns array of this topology's Jetpants::Pool objects of type Jetpants::Shard
+     def shards
+       @pools.select {|p| p.is_a? Shard}
+     end
+
+     # Returns array of this topology's Jetpants::Pool objects that are NOT of type Jetpants::Shard
+     def functional_partitions
+       @pools.reject {|p| p.is_a? Shard}
+     end
+
+     # Finds and returns a single Jetpants::Pool. Target may be a name (string) or master (DB object).
+     def pool(target)
+       if target.is_a?(DB)
+         @pools.select {|p| p.master == target}.first
+       else
+         @pools.select {|p| p.name == target}.first
+       end
+     end
+
+     # Finds and returns a single Jetpants::Shard. Pass in one of these:
+     # * a min ID and a max ID (either as two args or as a single two-element array)
+     # * just a min ID
+     # * a Range object
+     def shard(*args)
+       if args.count == 2 || args[0].is_a?(Array)
+         args.flatten!
+         args.map! {|x| x.to_s.upcase == 'INFINITY' ? 'INFINITY' : x.to_i}
+         shards.select {|s| s.min_id == args[0] && s.max_id == args[1]}.first
+       elsif args[0].is_a?(Range)
+         shards.select {|s| s.min_id == args[0].min && s.max_id == args[0].max}.first
+       else
+         result = shards.select {|s| s.min_id == args[0].to_i}
+         raise "Multiple shards found with that min_id!" if result.count > 1
+         result.first
+       end
+     end
+
+     # Returns the Jetpants::Shard that handles the given ID.
+     def shard_for_id(id)
+       shards.select {|s| s.min_id <= id && (s.max_id == 'INFINITY' || s.max_id >= id)}[0]
+     end
+
+     # Returns the Jetpants::DB that handles the given ID with the specified
+     # mode (either :read or :write)
+     def shard_db_for_id(id, mode=:read)
+       shard_for_id(id).db(mode)
+     end
+
+     # Nicer interface into claim_spares when only one DB is desired -- returns
+     # a single Jetpants::DB object instead of an array.
+     def claim_spare(options={})
+       claim_spares(1, options)[0]
+     end
+
+   end
+ end
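
For orientation, a brief usage sketch (this assumes a plugin has overridden load_pools to populate @pools; the shard ranges and IDs are hypothetical):

    topo = Jetpants.topology
    topo.shard(1, 1000)                 # shard covering exactly IDs 1-1000
    topo.shard(1001, 'INFINITY')        # the open-ended final shard
    topo.shard_for_id(1500)             # shard whose range contains ID 1500
    topo.shard_db_for_id(1500, :write)  # that shard's master
    topo.write_config                   # runs inside the class-level mutex

Because write_config was defined after the synchronized decorator, concurrent calls serialize on the mutex, and the wrapper is re-applied even if a plugin overrides the method.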
@@ -0,0 +1,23 @@
+ module Jetpants
+   class DB
+
+     ##### CALLBACKS ############################################################
+
+     # Determine master from asset tracker if machine is unreachable or MySQL isn't running.
+     def after_probe_master
+       unless @running
+         my_pool, my_role = Jetpants.topology.tracker.determine_pool_and_role(@ip, @port)
+         @master = (my_role == 'MASTER' ? false : my_pool.master)
+       end
+     end
+
+     # Determine slaves from asset tracker if machine is unreachable or MySQL isn't running
+     def after_probe_slaves
+       unless @running
+         @slaves = Jetpants.topology.tracker.determine_slaves(@ip, @port)
+       end
+     end
+
+   end
+ end
+
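
A sketch of the effect of these callbacks (the IP is hypothetical; String#to_db and DB#probe come from Jetpants core):

    db = '10.42.0.99'.to_db
    db.probe    # if the host is down or mysqld is stopped, @running is false,
                # so the after_probe_* callbacks above fall back to the tracker
    db.master   # master as recorded by the asset tracker (false if this node is itself a master)
    db.slaves   # slaves as recorded by the asset tracker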
@@ -0,0 +1,70 @@
+ module Jetpants
+   class Pool
+
+     ##### METHOD OVERRIDES #####################################################
+
+     # This actually re-writes ALL the tracker json. With a more dynamic
+     # asset tracker (something backed by a database, for example) this
+     # wouldn't be necessary - instead Pool#sync_configuration could just
+     # update the info for the current pool (self) only.
+     def sync_configuration
+       Jetpants.topology.update_tracker_data
+     end
+
+     # If the pool's master hasn't been probed yet, return active_slaves list
+     # based strictly on what we found in the asset tracker. This is a major
+     # speed-up at start-up time, especially for tasks that need to iterate
+     # over all pools' active slaves only, such as Topology#write_config.
+     alias :active_slaves_from_probe :active_slaves
+     def active_slaves
+       if @master.probed?
+         active_slaves_from_probe
+       else
+         @active_slave_weights.keys
+       end
+     end
+
+
+     ##### NEW CLASS-LEVEL METHODS ##############################################
+
+     # Converts a hash (from asset tracker json file) into a Pool.
+     def self.from_hash(h)
+       return nil unless h['master']
+       p = Pool.new(h['name'], h['master'].to_db)
+       p.master_read_weight = h['master_read_weight']
+       p.slave_name = h['slave_name']
+       h['aliases'].each {|a| p.has_alias a}
+       h['slaves'].each do |slave_info|
+         s = slave_info['host'].to_db
+         p.has_active_slave(s, slave_info['weight']) if slave_info['role'] == 'ACTIVE_SLAVE'
+       end
+       p
+     end
+
+
+     ##### NEW METHODS ##########################################################
+
+     # Converts a Pool to a hash, for use in either the internal asset tracker
+     # json (for_app_config=false) or the application config file yaml
+     # (for_app_config=true).
+     def to_hash(for_app_config=false)
+       if for_app_config
+         slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight}}
+       else
+         slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight, 'role' => 'ACTIVE_SLAVE'}} +
+                      standby_slaves.map {|db| {'host' => db.to_s, 'role' => 'STANDBY_SLAVE'}} +
+                      backup_slaves.map {|db| {'host' => db.to_s, 'role' => 'BACKUP_SLAVE'}}
+       end
+
+       {
+         'name' => name,
+         'aliases' => aliases,
+         'slave_name' => slave_name,
+         'master' => master.to_s,
+         'master_read_weight' => master_read_weight || 0,
+         'slaves' => slave_data
+       }
+     end
+
+   end
+ end
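
To illustrate the round trip (the hash contents are hypothetical, shaped like the asset tracker json that from_hash consumes):

    h = {
      'name'               => 'users',
      'master'             => '10.42.0.10',
      'master_read_weight' => 0,
      'slave_name'         => nil,
      'aliases'            => [],
      'slaves'             => [{'host' => '10.42.0.11', 'weight' => 50, 'role' => 'ACTIVE_SLAVE'}]
    }
    pool = Jetpants::Pool.from_hash(h)
    pool.to_hash(true)   # app-config form: active slaves only, as host/weight pairs

Note that from_hash registers only ACTIVE_SLAVE entries; standby and backup slaves are discovered later by probing.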
@@ -0,0 +1,76 @@
+ module Jetpants
+   class Shard < Pool
+
+     ##### CALLBACKS ############################################################
+
+     # After changing the state of a shard, sync config back to the asset tracker json
+     def after_state=(value)
+       sync_configuration
+     end
+
+
+     ##### NEW CLASS-LEVEL METHODS ##############################################
+
+     # Converts a hash (from asset tracker json file) into a Shard.
+     def self.from_hash(h)
+       # we just return the shard for now... we have to wait until later to
+       # set up children + parents, since it's easier to grab the corresponding
+       # objects once all pools have been initialized.
+       Shard.new(h['min_id'], h['max_id'], h['master'], h['state'].to_sym)
+     end
+
+     # Sets up parent/child relationships for the shard represented by the
+     # supplied hash.
+     def self.assign_relationships(h, all_shards)
+       return unless h['parent']
+
+       # figure out which shard corresponds to hash h
+       min_id = h['min_id'].to_i
+       max_id = (h['max_id'].to_s.upcase == 'INFINITY' ? 'INFINITY' : h['max_id'].to_i)
+       shard = all_shards.select {|s| s.min_id == min_id && s.max_id == max_id}.first
+
+       # now figure out which one is the parent, and assign parent/child relationship
+       parent = all_shards.select {|s| s.name == h['parent']}.first
+       raise "Cannot find parent shard #{h['parent']}" unless parent
+       parent.add_child shard
+     end
+
+
+     ##### NEW METHODS ##########################################################
+
+     # Converts a Shard to a hash, for use in either the internal asset tracker
+     # json (for_app_config=false) or the application config file yaml
+     # (for_app_config=true).
+     def to_hash(for_app_config=false)
+
+       if for_app_config
+         # Ignore shards that shouldn't receive queries from the application
+         return nil unless in_config?
+         me = {'min_id' => min_id.to_i, 'max_id' => max_id == 'INFINITY' ? max_id : max_id.to_i}
+
+         # Handle child shards (which still have writes sent to their parent),
+         # read-only shards, and offline shards appropriately.
+         return me.merge case state
+           when :ready, :needs_cleanup then {'host' => master.ip}
+           when :child then {'host_read' => master.ip, 'host_write' => parent.master.ip}
+           when :read_only then {'host_read' => master.ip, 'host_write' => false}
+           when :offline then {'host' => false}
+         end
+       else
+         slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight, 'role' => 'ACTIVE_SLAVE'}} +
+                      standby_slaves.map {|db| {'host' => db.to_s, 'role' => 'STANDBY_SLAVE'}} +
+                      backup_slaves.map {|db| {'host' => db.to_s, 'role' => 'BACKUP_SLAVE'}}
+         return {
+           'min_id' => min_id,
+           'max_id' => max_id,
+           'parent' => parent ? parent.to_s : nil,
+           'state' => state,
+           'master' => master,
+           'slaves' => slave_data,
+         }
+       end
+     end
+
+
+   end
+ end
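
The state-dependent branch in to_hash is easiest to see by example (IDs and IPs hypothetical). A shard mid-split in the :child state keeps reads on its own master while writes still go to the parent's master:

    shard = Jetpants.topology.shard(1, 1000)
    shard.to_hash(true)
    # :ready/:needs_cleanup => {'min_id' => 1, 'max_id' => 1000, 'host' => '10.42.0.20'}
    # :child                => {'min_id' => 1, 'max_id' => 1000, 'host_read' => '10.42.0.21', 'host_write' => '10.42.0.20'}
    # :read_only            => {'min_id' => 1, 'max_id' => 1000, 'host_read' => '10.42.0.20', 'host_write' => false}
    # :offline              => {'min_id' => 1, 'max_id' => 1000, 'host' => false}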