jetpants 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
+ module Jetpants
+
+   # The Table class associates a table name with a column (or list of columns)
+   # making up the table's sharding key or primary key. It is geared towards
+   # generating SQL for importing/exporting a table, NOT for representing an
+   # application data model.
+   #
+   # None of these methods actually *execute* the SQL they generate, since the
+   # Table class isn't tied to a specific DB. This allows us to represent the set
+   # of all sharded tables with a single set of Table objects, without having to
+   # duplicate those objects for every Shard or DB instance. If you want to run
+   # the generated SQL on a database, use one of the DB#query* methods.
+   class Table
+     include CallbackHandler
+
+     # Name of the table as it exists in your database.
+     attr_reader :name
+
+     # Your application's sharding_key is the column used to determine which rows
+     # live on which shard. Generally this should be the same logical value for your
+     # entire application (example: id column of the User table), although the column
+     # name need not be identical across tables (one may call it 'user_id', another
+     # could call it 'target_user_id' or 'from_user_id'.) The sharding_keys attribute
+     # stores the name of that column for this particular table.
+     #
+     # For a sharded table, sharding_keys should generally be a single column,
+     # represented here as a single string.
+     #
+     # Jetpants supports mapping-tables with multiple sharding key columns (for
+     # instance, if user_id is your app's sharding key, a "following" table mapping
+     # one user_id to another). However this makes exports and cleanup extremely
+     # inefficient, so its use is not recommended.
+     #
+     # For a non-sharded table, simply set sharding_keys to the first column of
+     # the table's primary key. This is sufficient to make chunked exports/imports
+     # work properly.
+     attr_reader :sharding_keys
+
+     # Jetpants supports doing import and export operations in parallel "chunks" of
+     # the data set. For tables with few rows, this is irrelevant and can be left at
+     # the default of 1 (meaning no chunking). For tables with hundreds of millions
+     # of rows, you may want to do exports/imports in a few hundred chunks to speed
+     # things up and keep the transactions smaller.
+     attr_reader :chunks
+
+     # Create a Table. Params should have string keys, not symbols. Possible keys include
+     # 'sharding_key' (or equivalently 'primary_key' / 'primary_keys'), 'chunks', and 'order_by'.
+     def initialize(name, params={})
+       @name = name
+       params['sharding_key'] ||= params['primary_keys'] || params['primary_key'] || 'user_id'
+       @sharding_keys = (params['sharding_key'].is_a?(Array) ? params['sharding_key'] : [params['sharding_key']])
+       @chunks = params['chunks'] || 1
+       @order_by = params['order_by']
+     end
+
+     # Return an array of Table objects based on the contents of Jetpants' config file entry
+     # of the given label.
+     def Table.from_config(label)
+       Jetpants.send(label).map {|name, attributes| Table.new name, attributes}
+     end
+
+     def to_s
+       return @name
+     end
+
+     # Returns the SQL for performing a data export of a given ID range
+     def sql_export_range(min_id=false, max_id=false)
+       outfile = export_file_path min_id, max_id
+       sql = "SELECT * FROM #{@name} "
+
+       if min_id || max_id
+         clauses = case
+         when min_id && max_id then @sharding_keys.collect {|col| "(#{col} >= #{min_id} AND #{col} <= #{max_id}) "}
+         when min_id then @sharding_keys.collect {|col| "#{col} >= #{min_id} "}
+         when max_id then @sharding_keys.collect {|col| "#{col} <= #{max_id} "}
+         end
+         sql << "WHERE " + clauses.join('OR ')
+       end
+
+       sql << "ORDER BY #{@order_by} " if @order_by
+       sql << "INTO OUTFILE '#{outfile}'"
+     end
+     alias sql_export_all sql_export_range
+
+     # Returns the SQL necessary to load the table's data.
+     # Note that we use an IGNORE on multi-sharding-key tables. This is because
+     # we get duplicate rows between export chunk files in this case.
+     def sql_import_range(min_id=false, max_id=false)
+       outfile = export_file_path min_id, max_id
+       ignore = (@sharding_keys.count > 1 && (min_id || max_id) ? ' IGNORE' : '')
+       sql = "LOAD DATA INFILE '#{outfile}'#{ignore} INTO TABLE #{@name} CHARACTER SET binary"
+     end
+     alias sql_import_all sql_import_range
+
+     # Returns the SQL necessary to iterate over a given sharding key by ID -- returns
+     # the next ID desired. Useful when performing a cleanup operation over a sparse
+     # ID range.
+     def sql_cleanup_next_id(sharding_key, id, direction)
+       if direction == :asc
+         "SELECT MIN(#{sharding_key}) FROM #{@name} WHERE #{sharding_key} > #{id}"
+       elsif direction == :desc
+         "SELECT MAX(#{sharding_key}) FROM #{@name} WHERE #{sharding_key} < #{id}"
+       else
+         raise "Unknown direction parameter #{direction}"
+       end
+     end
+
+     # Returns the SQL necessary to clean rows that shouldn't be on this shard.
+     # Pass in a sharding key and the min/max allowed ID on the shard, and get back
+     # a SQL DELETE statement. When running that statement, pass in an ID (obtained
+     # from sql_cleanup_next_id) as a bind variable.
+     def sql_cleanup_delete(sharding_key, min_keep_id, max_keep_id)
+       sql = "DELETE FROM #{@name} WHERE #{sharding_key} = ?"
+
+       # if there are multiple sharding cols, we need to be more careful to keep rows
+       # where the OTHER sharding col(s) do fall within the shard's range
+       @sharding_keys.each do |other_col|
+         next if other_col == sharding_key
+         sql << " AND NOT (#{other_col} >= #{min_keep_id} AND #{other_col} <= #{max_keep_id})"
+       end
+
+       return sql
+     end
+
+     # Counts number of rows between the given ID ranges. Warning: will give
+     # potentially misleading counts on multi-sharding-key tables.
+     def sql_count_rows(min_id, max_id)
+       sql = "SELECT COUNT(*) FROM #{@name} WHERE "
+       wheres = []
+       @sharding_keys.each {|col| wheres << "(#{col} >= #{min_id} AND #{col} <= #{max_id})"}
+       sql << wheres.join(" OR ")
+     end
+
+     # Returns a file path (as a String) for the export dumpfile of the given ID range.
+     def export_file_path(min_id=false, max_id=false)
+       case
+       when min_id && max_id then "#{Jetpants.export_location}/#{@name}#{min_id}-#{max_id}.out"
+       when min_id then "#{Jetpants.export_location}/#{@name}#{min_id}-and-up.out"
+       when max_id then "#{Jetpants.export_location}/#{@name}start-#{max_id}.out"
+       else "#{Jetpants.export_location}/#{@name}-full.out"
+       end
+     end
+
+   end
+ end
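
A quick sketch of how the class above is meant to be used (the table name, sharding key, and ID range here are hypothetical; as the class comment notes, nothing is executed, and the generated strings would be run via one of the DB#query* methods):

    table = Jetpants::Table.new('messages', 'sharding_key' => 'user_id', 'chunks' => 4)

    table.sql_export_range(1, 1000)
    # => "SELECT * FROM messages WHERE (user_id >= 1 AND user_id <= 1000) INTO OUTFILE '<export_location>/messages1-1000.out'"

    table.sql_import_range(1, 1000)
    # => "LOAD DATA INFILE '<export_location>/messages1-1000.out' INTO TABLE messages CHARACTER SET binary"

Here <export_location> stands in for whatever Jetpants.export_location returns in your configuration.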
@@ -0,0 +1,144 @@
+ module Jetpants
+
+   # Topology maintains a list of all DB pools/shards, and is responsible for
+   # reading/writing configurations and managing spare box assignments.
+   # Much of this behavior needs to be overridden by a plugin to actually be
+   # useful. The implementation here is just a stub.
+   class Topology
+     attr_reader :pools
+
+     def initialize
+       @pools = [] # array of Pool objects
+       load_pools
+     end
+
+     ###### Class methods #######################################################
+
+     # Metaprogramming hackery to create a "synchronized" method decorator
+     @lock = Mutex.new
+     @do_sync = false
+     @synchronized_methods = {} # symbol => true
+     class << self
+       # Decorator that causes the next method to be wrapped in a mutex
+       # (only affects the next method definition, not ALL subsequent method
+       # definitions)
+       # If the method is subsequently overridden by a plugin, the new version
+       # will be synchronized as well, even if the decorator is omitted.
+       def synchronized
+         @do_sync = true
+       end
+
+       def method_added(name)
+         if @do_sync || @synchronized_methods[name]
+           lock = @lock
+           @do_sync = false
+           @synchronized_methods[name] = false # prevent infinite recursion from the following line
+           alias_method "#{name}_without_synchronization".to_sym, name
+           define_method name do |*args|
+             result = nil
+             lock.synchronize {result = send "#{name}_without_synchronization".to_sym, *args}
+             result
+           end
+           @synchronized_methods[name] = true # remember it is synchronized, to re-apply wrapper if method overridden by a plugin
+         end
+       end
+     end
+
+
+     ###### Overrideable methods ################################################
+     # Plugins should override these if the behavior is needed. (Note that plugins
+     # don't need to repeat the "synchronized" decorator; it automatically
+     # applies to overrides.)
+
+     synchronized
+     # Plugin should override so that this reads in a configuration and initializes
+     # @pools as appropriate.
+     def load_pools
+       puts "\nNotice: no plugin has overridden Topology#load_pools, so no pools are imported automatically"
+     end
+
+     synchronized
+     # Plugin should override so that it writes a configuration file or commits a
+     # configuration change to a config service.
+     def write_config
+       puts "\nNotice: no plugin has overridden Topology#write_config, so configuration data is not saved"
+     end
+
+     synchronized
+     # Plugin should override so that this returns an array of [count] Jetpants::DB
+     # objects, or raises an exception if not enough spares are available.
+     # Options hash is plugin-specific. The only assumed option used by the rest of
+     # Jetpants is :role of 'MASTER' or 'STANDBY_SLAVE', for grabbing hardware
+     # suited for a particular purpose. This can be ignored if your hardware is
+     # entirely uniform and/or a burn-in process is already performed on all new
+     # hardware intakes.
+     def claim_spares(count, options={})
+       raise "Plugin must override Topology#claim_spares"
+     end
+
+     synchronized
+     # Plugin should override so that this returns a count of spare machines
+     # matching the selected options.
+     def count_spares(options={})
+       raise "Plugin must override Topology#count_spares"
+     end
+
+
+     ###### Accessors ###########################################################
+
+     # Returns array of this topology's Jetpants::Pool objects of type Jetpants::Shard
+     def shards
+       @pools.select {|p| p.is_a? Shard}
+     end
+
+     # Returns array of this topology's Jetpants::Pool objects that are NOT of type Jetpants::Shard
+     def functional_partitions
+       @pools.reject {|p| p.is_a? Shard}
+     end
+
+     # Finds and returns a single Jetpants::Pool. Target may be a name (string) or master (DB object).
+     def pool(target)
+       if target.is_a?(DB)
+         @pools.select {|p| p.master == target}.first
+       else
+         @pools.select {|p| p.name == target}.first
+       end
+     end
+
+     # Finds and returns a single Jetpants::Shard. Pass in one of these:
+     # * a min ID and a max ID (either as two args or as a single two-element array)
+     # * just a min ID
+     # * a Range object
+     def shard(*args)
+       if args.count == 2 || args[0].is_a?(Array)
+         args.flatten!
+         args.map! {|x| x.to_s.upcase == 'INFINITY' ? 'INFINITY' : x.to_i}
+         shards.select {|s| s.min_id == args[0] && s.max_id == args[1]}.first
+       elsif args[0].is_a?(Range)
+         shards.select {|s| s.min_id == args[0].min && s.max_id == args[0].max}.first
+       else
+         result = shards.select {|s| s.min_id == args[0].to_i}
+         raise "Multiple shards found with that min_id!" if result.count > 1
+         result.first
+       end
+     end
+
+     # Returns the Jetpants::Shard that handles the given ID.
+     def shard_for_id(id)
+       shards.select {|s| s.min_id <= id && (s.max_id == 'INFINITY' || s.max_id >= id)}[0]
+     end
+
+     # Returns the Jetpants::DB that handles the given ID with the specified
+     # mode (either :read or :write)
+     def shard_db_for_id(id, mode=:read)
+       shard_for_id(id).db(mode)
+     end
+
+     # Nicer interface into claim_spares when only one DB is desired -- returns
+     # a single Jetpants::DB object instead of an array.
+     def claim_spare(options={})
+       claim_spares(1, options)[0]
+     end
+
+   end
+ end
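
For orientation, a brief usage sketch (this assumes a plugin has overridden load_pools to populate @pools; the shard ranges and IDs are hypothetical):

    topo = Jetpants.topology
    topo.shard(1, 1000)                 # shard covering exactly IDs 1-1000
    topo.shard(1001, 'INFINITY')        # the open-ended final shard
    topo.shard_for_id(1500)             # shard whose range contains ID 1500
    topo.shard_db_for_id(1500, :write)  # that shard's master
    topo.write_config                   # runs inside the class-level mutex

Because write_config was defined after the synchronized decorator, concurrent calls serialize on the mutex, and the wrapper is re-applied even if a plugin overrides the method.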
@@ -0,0 +1,23 @@
+ module Jetpants
+   class DB
+
+     ##### CALLBACKS ############################################################
+
+     # Determine master from asset tracker if machine is unreachable or MySQL isn't running.
+     def after_probe_master
+       unless @running
+         my_pool, my_role = Jetpants.topology.tracker.determine_pool_and_role(@ip, @port)
+         @master = (my_role == 'MASTER' ? false : my_pool.master)
+       end
+     end
+
+     # Determine slaves from asset tracker if machine is unreachable or MySQL isn't running
+     def after_probe_slaves
+       unless @running
+         @slaves = Jetpants.topology.tracker.determine_slaves(@ip, @port)
+       end
+     end
+
+   end
+ end
+
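
A sketch of the effect of these callbacks (the IP is hypothetical; String#to_db and DB#probe come from Jetpants core):

    db = '10.42.0.99'.to_db
    db.probe    # if the host is down or mysqld is stopped, @running is false,
                # so the after_probe_* callbacks above fall back to the tracker
    db.master   # master as recorded by the asset tracker (false if this node is itself a master)
    db.slaves   # slaves as recorded by the asset tracker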
@@ -0,0 +1,70 @@
+ module Jetpants
+   class Pool
+
+     ##### METHOD OVERRIDES #####################################################
+
+     # This actually re-writes ALL the tracker json. With a more dynamic
+     # asset tracker (something backed by a database, for example) this
+     # wouldn't be necessary - instead Pool#sync_configuration could just
+     # update the info for the current pool (self) only.
+     def sync_configuration
+       Jetpants.topology.update_tracker_data
+     end
+
+     # If the pool's master hasn't been probed yet, return active_slaves list
+     # based strictly on what we found in the asset tracker. This is a major
+     # speed-up at start-up time, especially for tasks that need to iterate
+     # over all pools' active slaves only, such as Topology#write_config.
+     alias :active_slaves_from_probe :active_slaves
+     def active_slaves
+       if @master.probed?
+         active_slaves_from_probe
+       else
+         @active_slave_weights.keys
+       end
+     end
+
+
+     ##### NEW CLASS-LEVEL METHODS ##############################################
+
+     # Converts a hash (from asset tracker json file) into a Pool.
+     def self.from_hash(h)
+       return nil unless h['master']
+       p = Pool.new(h['name'], h['master'].to_db)
+       p.master_read_weight = h['master_read_weight']
+       p.slave_name = h['slave_name']
+       h['aliases'].each {|a| p.has_alias a}
+       h['slaves'].each do |slave_info|
+         s = slave_info['host'].to_db
+         p.has_active_slave(s, slave_info['weight']) if slave_info['role'] == 'ACTIVE_SLAVE'
+       end
+       p
+     end
+
+
+     ##### NEW METHODS ##########################################################
+
+     # Converts a Pool to a hash, for use in either the internal asset tracker
+     # json (for_app_config=false) or the application config file yaml
+     # (for_app_config=true).
+     def to_hash(for_app_config=false)
+       if for_app_config
+         slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight}}
+       else
+         slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight, 'role' => 'ACTIVE_SLAVE'}} +
+                      standby_slaves.map {|db| {'host' => db.to_s, 'role' => 'STANDBY_SLAVE'}} +
+                      backup_slaves.map {|db| {'host' => db.to_s, 'role' => 'BACKUP_SLAVE'}}
+       end
+
+       {
+         'name' => name,
+         'aliases' => aliases,
+         'slave_name' => slave_name,
+         'master' => master.to_s,
+         'master_read_weight' => master_read_weight || 0,
+         'slaves' => slave_data
+       }
+     end
+
+   end
+ end
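
To illustrate the round trip (the hash contents are hypothetical, shaped like the asset tracker json that from_hash consumes):

    h = {
      'name'               => 'users',
      'master'             => '10.42.0.10',
      'master_read_weight' => 0,
      'slave_name'         => nil,
      'aliases'            => [],
      'slaves'             => [{'host' => '10.42.0.11', 'weight' => 50, 'role' => 'ACTIVE_SLAVE'}]
    }
    pool = Jetpants::Pool.from_hash(h)
    pool.to_hash(true)   # app-config form: active slaves only, as host/weight pairs

Note that from_hash registers only ACTIVE_SLAVE entries; standby and backup slaves are discovered later by probing.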
@@ -0,0 +1,76 @@
+ module Jetpants
+   class Shard < Pool
+
+     ##### CALLBACKS ############################################################
+
+     # After changing the state of a shard, sync config back to the asset tracker json
+     def after_state=(value)
+       sync_configuration
+     end
+
+
+     ##### NEW CLASS-LEVEL METHODS ##############################################
+
+     # Converts a hash (from asset tracker json file) into a Shard.
+     def self.from_hash(h)
+       # we just return the shard for now... we have to wait until later to
+       # set up children + parents, since it's easier to grab the corresponding
+       # objects once all pools have been initialized.
+       Shard.new(h['min_id'], h['max_id'], h['master'], h['state'].to_sym)
+     end
+
+     # Sets up parent/child relationships for the shard represented by the
+     # supplied hash.
+     def self.assign_relationships(h, all_shards)
+       return unless h['parent']
+
+       # figure out which shard corresponds to hash h
+       min_id = h['min_id'].to_i
+       max_id = (h['max_id'].to_s.upcase == 'INFINITY' ? 'INFINITY' : h['max_id'].to_i)
+       shard = all_shards.select {|s| s.min_id == min_id && s.max_id == max_id}.first
+
+       # now figure out which one is the parent, and assign parent/child relationship
+       parent = all_shards.select {|s| s.name == h['parent']}.first
+       raise "Cannot find parent shard #{h['parent']}" unless parent
+       parent.add_child shard
+     end
+
+
+     ##### NEW METHODS ##########################################################
+
+     # Converts a Shard to a hash, for use in either the internal asset tracker
+     # json (for_app_config=false) or the application config file yaml
+     # (for_app_config=true).
+     def to_hash(for_app_config=false)
+
+       if for_app_config
+         # Ignore shards that shouldn't receive queries from the application
+         return nil unless in_config?
+         me = {'min_id' => min_id.to_i, 'max_id' => max_id == 'INFINITY' ? max_id : max_id.to_i}
+
+         # Handle child shards (which still have writes sent to their parent),
+         # read-only shards, and offline shards appropriately.
+         return me.merge case state
+           when :ready, :needs_cleanup then {'host' => master.ip}
+           when :child then {'host_read' => master.ip, 'host_write' => parent.master.ip}
+           when :read_only then {'host_read' => master.ip, 'host_write' => false}
+           when :offline then {'host' => false}
+         end
+       else
+         slave_data = active_slave_weights.map {|db, weight| {'host' => db.to_s, 'weight' => weight, 'role' => 'ACTIVE_SLAVE'}} +
+                      standby_slaves.map {|db| {'host' => db.to_s, 'role' => 'STANDBY_SLAVE'}} +
+                      backup_slaves.map {|db| {'host' => db.to_s, 'role' => 'BACKUP_SLAVE'}}
+         return {
+           'min_id' => min_id,
+           'max_id' => max_id,
+           'parent' => parent ? parent.to_s : nil,
+           'state' => state,
+           'master' => master,
+           'slaves' => slave_data,
+         }
+       end
+     end
+
+
+   end
+ end
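
The state-dependent branch in to_hash is easiest to see by example (IDs and IPs hypothetical). A shard mid-split in the :child state keeps reads on its own master while writes still go to the parent's master:

    shard = Jetpants.topology.shard(1, 1000)
    shard.to_hash(true)
    # :ready/:needs_cleanup => {'min_id' => 1, 'max_id' => 1000, 'host' => '10.42.0.20'}
    # :child                => {'min_id' => 1, 'max_id' => 1000, 'host_read' => '10.42.0.21', 'host_write' => '10.42.0.20'}
    # :read_only            => {'min_id' => 1, 'max_id' => 1000, 'host_read' => '10.42.0.20', 'host_write' => false}
    # :offline              => {'min_id' => 1, 'max_id' => 1000, 'host' => false}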