sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/lib/sq/dbsync/loggers.rb
@@ -0,0 +1,135 @@
+ require 'time'
+ require 'socket'
+
+ # Instrumenting various aspects of the system is critical: imports take
+ # longer and longer as the data sources grow, and it is necessary to know
+ # when the import time is becoming excessive (either replication can't keep
+ # up, or recovery time is too long).
+ module Sq::Dbsync::Loggers
+
+   # Abstract base class for loggers. This is useful because the Composite
+   # logger needs to delegate to a set of loggers, which requires an explicit
+   # interface to communicate with. This class helps define that relationship
+   # and describe the interfaces.
+   class Abstract
+     def measure(label, &block); end
+     def log(str); end
+   end
+
+   # Writes timing information to stdout. Thread-safe in that calls to
+   # measure from separate threads will execute in parallel but synchronize
+   # before writing their output.
+   class Stream < Abstract
+     def initialize(out = $stdout)
+       @mutex = Mutex.new
+       @out   = out
+     end
+
+     def measure(label, &block)
+       start_time = Time.now.utc
+       log_measurement(start_time, :starting, 0, label)
+       ret       = nil
+       exception = nil
+       state     = :finished
+       begin
+         ret = block.call
+       rescue => e
+         state     = :failed
+         exception = e
+         raise
+       ensure
+         end_time = Time.now.utc
+         log_measurement(end_time, state, end_time - start_time, label)
+         log(exception.message) if exception
+       end
+       ret
+     end
+
+     def log_measurement(time, event, duration, object)
+       log([
+         event,
+         "%.3f" % duration,
+         object
+       ].join("\t"), time)
+     end
+
+     def log(str, time = Time.now.utc)
+       # Synchronize to ensure lines are not interwoven.
+       mutex.synchronize { out.puts([time, str].join("\t")) }
+     end
+
+     private
+
+     attr_reader :mutex, :out
+   end
+
+   # Combines multiple loggers together.
+   class Composite < Abstract
+     attr_accessor :loggers
+
+     def initialize(loggers = nil)
+       @loggers = loggers
+     end
+
+     def measure(label, &block)
+       # Babushka doll! Logger inside a logger inside a logger.
+       loggers.inject(block) do |inner, logger|
+         lambda do
+           logger.measure(label) do
+             inner.call
+           end
+         end
+       end.call
+     end
+
+     def log(str)
+       loggers.each { |logger| logger.log(str) }
+     end
+   end
+
+   # Logs metric run time to graphite, using the Carbon plaintext protocol
+   # ("<path> <value> <timestamp>\n").
+   class Graphite < Abstract
+     def initialize(opts)
+       @opts = opts
+     end
+
+     def measure(label, &block)
+       start_time = Time.now.utc
+       block.call
+     ensure
+       end_time = Time.now.utc
+       record_metric(end_time.to_i, label, end_time - start_time)
+     end
+
+     def record_metric(timestamp, name, value)
+       msg = "#{@opts.fetch(:prefix, 'dbsync')}.#{name} #{value} #{timestamp}\n"
+
+       s = TCPSocket.new(@opts[:host], @opts.fetch(:port, 2003))
+       s.send msg, 0
+       s.close
+     end
+   end
+
+   # Used in test environments where instrumentation is not required.
+   class Null < Abstract
+     def measure(label, &block)
+       block.call
+     end
+   end
+
+   # Logging is one of the few outputs of the system, so this class is
+   # provided as a cheap way to allow tests to hook into events. It should
+   # not be used in production.
+   class NullWithCallbacks < Abstract
+     attr_accessor :callbacks
+
+     def initialize(callbacks = nil)
+       @callbacks = callbacks
+     end
+
+     def measure(label, &block)
+       (callbacks || {}).fetch(label, ->{}).call
+       block.call
+     end
+   end
+ end
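
For orientation, here is a minimal usage sketch of the loggers above. The label and measured block are hypothetical; the classes and methods (`Composite`, `Stream#measure`, `Null`) are the ones defined in this file:

    require 'sq/dbsync'
    require 'sq/dbsync/loggers'

    # Fan output out to several loggers at once; Stream writes tab-separated
    # "timestamp  event  duration  label" lines to the given IO.
    logger = Sq::Dbsync::Loggers::Composite.new([
      Sq::Dbsync::Loggers::Stream.new($stdout),
      Sq::Dbsync::Loggers::Null.new, # stand-in; e.g. Graphite in production
    ])

    logger.measure(:users_import) do
      sleep 0.1 # hypothetical work; an exception is logged as :failed and re-raised
    end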
data/lib/sq/dbsync/manager.rb
@@ -0,0 +1,241 @@
+ require 'date'
+
+ require 'sq/dbsync/config'
+ require 'sq/dbsync/batch_load_action'
+ require 'sq/dbsync/incremental_load_action'
+ require 'sq/dbsync/refresh_recent_load_action'
+ require 'sq/dbsync/pipeline'
+ require 'sq/dbsync/table_registry'
+ require 'sq/dbsync/consistency_verifier'
+ require 'sq/dbsync/database/connection'
+ require 'sq/dbsync/error_handler'
+
+ # The manager orchestrates the high-level functions of the sync, such as
+ # keeping the database up-to-date and batch loading.
+ #
+ # This is the main entry point for the application.
+ class Sq::Dbsync::Manager
+   include Sq::Dbsync
+
+   EPOCH       = Date.new(2000, 1, 1).to_time
+   MAX_RETRIES = 10
+
+   def initialize(config, plans)
+     @config        = Sq::Dbsync::Config.make(config)
+     @plans         = plans
+     @error_handler = ErrorHandler.new(config)
+   end
+
+   def batch(tables = :all)
+     error_handler.wrap do
+       batch_nonactive(tables)
+       refresh_recent(tables)
+     end
+   end
+
+   def increment
+     error_handler.wrap do
+       incremental
+     end
+   end
+
+   def batch_nonactive(tables = :all)
+     registry.ensure_storage_exists
+
+     measure(:batch_total) do
+       raise_all_if_pipeline_failure(
+         run_load(BatchLoadAction, Pipeline::ThreadedContext, tables)
+       )
+     end
+   end
+
+   def refresh_recent(tables = :all)
+     registry.ensure_storage_exists
+
+     measure(:refresh_recent_total) do
+       raise_all_if_pipeline_failure(
+         run_load(RefreshRecentLoadAction, Pipeline::ThreadedContext, tables)
+       )
+     end
+   end
+
+   def incremental
+     @running = true
+     counter  = 0
+
+     loop_with_retry_on(->{ @running }, transient_exceptions) do
+       incremental_once
+
+       counter = (counter + 1) % 100
+       if counter == 1
+         # No need to do this every cycle; 100 is chosen to be as good as any
+         # other number. It should run on the very first cycle, however, so
+         # that the specs will cover it.
+         increment_checkpoint
+       end
+     end
+   end
+
+   def incremental_once
+     # In theory, this ensures that any changes to the source IP (such as
+     # from a virtual IP flip) are picked up.
+     sources.each do |_, db|
+       db.disconnect
+     end
+
+     raise_if_pipeline_failure(
+       # ThreadedContext would be ideal here, but it leaks memory in JRuby.
+       # Not sure why yet, but mass creation of threads seems like an obvious
+       # candidate for brokenness.
+       #
+       # TODO: Above comment probably isn't true with 1.7 and ThreadedContext
+       # fixes.
+       run_load(incremental_action, Pipeline::SimpleContext)
+     )
+   end
+
+   # Actions that need to be performed regularly, but not every cycle. Please
+   # do suggest a better name for this method.
+   def increment_checkpoint
+     verifier.check_consistency!(tables_to_load)
+
+     purge_registry
+   end
+
+   def stop!
+     @running = false
+   end
+
+   def target
+     @target ||= Sq::Dbsync::Database::Connection.create(config[:target])
+   end
+
+   def tables_to_load
+     plans_with_sources.map do |plan, source|
+       plan.tables(source).map do |x|
+         x.update(source_db: source)
+       end
+     end.reduce([], :+).uniq {|x| x[:table_name] }
+   end
+
+   def plans_with_sources
+     @plans_with_sources ||= plans.map do |plan, source_name|
+       [plan, sources.fetch(source_name)]
+     end
+   end
+
+   def sources
+     @sources ||= Hash[config[:sources].map do |name, opts|
+       [name, Sq::Dbsync::Database::Connection.create(opts)]
+     end]
+   end
+
+   attr_accessor :config, :plans, :error_handler
+
+   private
+
+   def run_load(action, context, tables = :all)
+     items = tables_to_load.map do |tplan|
+       if tables != :all
+         next unless tables.include?(tplan[:table_name])
+
+         # Force loading of specified tables, otherwise it would be
+         # impossible to batch load tables that were not regularly loaded.
+         tplan[:batch_load] = true
+
+         # Force refresh of tables; this is expected behaviour if you are
+         # calling the refresh-recent script with an explicit table list.
+         tplan[:refresh_recent] = true
+       end
+
+       if tplan[:refresh_recent].is_a?(Symbol)
+         tplan[:aux_timestamp_column] = tplan[:refresh_recent]
+       end
+
+       action.new(db, tplan, registry, logger, config[:clock])
+     end.compact
+
+     Pipeline.new(items, *LoadAction.stages).run(context)
+   end
+
+   # This is necessary so that old tables that are no longer being synced do
+   # not break our lag calculations.
+   def purge_registry
+     registry.purge_except(expected_table_names)
+   end
+
+   def expected_table_names
+     tables_to_load.map {|x| x[:table_name] } +
+       config.fetch(:extra_tables, [])
+   end
+
+   def loop_with_retry_on(guard, transient_exceptions, &block)
+     consecutive_fails = 0
+
+     while guard.call
+       begin
+         block.call
+         consecutive_fails = 0
+       rescue *transient_exceptions
+         consecutive_fails += 1
+         raise if consecutive_fails >= MAX_RETRIES
+       end
+     end
+   end
+
+   def raise_if_pipeline_failure(results)
+     results.each do |result|
+       if result.is_a?(Pipeline::Failure)
+         raise result.wrapped_exception
+       end
+     end
+   end
+
+   def raise_all_if_pipeline_failure(results)
+     failed = false
+     results.each do |result|
+       if result.is_a?(Pipeline::Failure)
+         error_handler.notify_error(result.task.tag, result.wrapped_exception)
+         failed = true
+       end
+     end
+
+     if failed
+       raise Database::ExtractError,
+         "One or more loads failed, see other exceptions for details."
+     end
+   end
+
+   def measure(label, &block)
+     logger.measure(label) do
+       block.call
+     end
+   end
+
+   def registry
+     TableRegistry.new(target)
+   end
+
+   def verifier
+     @verifier ||= ConsistencyVerifier.new(target, registry)
+   end
+
+   def logger
+     config[:logger]
+   end
+
+   def db
+     @db ||= Database::Connection.create(config[:target])
+   end
+
+   def transient_exceptions
+     [
+       Database::ExtractError,
+       Database::TransientError
+     ]
+   end
+
+   def incremental_action
+     config.fetch(:incremental_action, IncrementalLoadAction)
+   end
+ end
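
As a sketch of how this entry point is typically driven: the connection option keys and plan fields below are illustrative only (data/README.md in this gem documents the real configuration format), but `Manager.new(config, plans)`, `#batch`, and `#incremental` are the interface defined above:

    require 'sq/dbsync'

    config = {
      sources: {
        db_a: { brand: 'mysql', host: 'db-a-host', user: 'ro', database: 'app' },
      },
      target:  { brand: 'postgres', host: 'replica-host', user: 'rw', database: 'replica' },
      logger:  Sq::Dbsync::Loggers::Stream.new,
      clock:   ->{ Time.now.utc },
    }

    # Hypothetical plan: keep two columns of `users` in sync from db_a.
    plans = [
      [Sq::Dbsync::StaticTablePlan.new([{
        table_name: :users,
        columns:    [:id, :updated_at],
      }]), :db_a],
    ]

    manager = Sq::Dbsync::Manager.new(config, plans)
    manager.batch       # one-off batch load, plus a refresh of recent data
    manager.incremental # then tail changes until manager.stop! is called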
data/lib/sq/dbsync/pipeline/simple_context.rb
@@ -0,0 +1,15 @@
+ # See lib/sq/dbsync/pipeline.rb
+ class Sq::Dbsync::Pipeline
+
+   # A computational context that passes a number of tasks through a set of
+   # stages in sequence.
+   class SimpleContext
+     def self.call(tasks, stages, process)
+       tasks.map do |task|
+         stages.inject(task) do |result, stage|
+           process.call(stage, result)
+         end
+       end
+     end
+   end
+ end
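
The contract a context must satisfy is visible here: `call` receives the tasks, the stages, and a `process` lambda, and returns one result per task. A quick illustrative run (the values are arbitrary):

    require 'sq/dbsync'
    require 'sq/dbsync/pipeline'

    tasks   = [1, 2, 3]
    stages  = [->(x) { x * x }, ->(x) { x + 1 }]
    process = ->(stage, task) { stage.call(task) }

    Sq::Dbsync::Pipeline::SimpleContext.call(tasks, stages, process)
    # => [2, 5, 10]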
data/lib/sq/dbsync/pipeline/threaded_context.rb
@@ -0,0 +1,95 @@
+ require 'thread'
+
+ # See lib/sq/dbsync/pipeline.rb
+ class Sq::Dbsync::Pipeline
+
+   # A computational context for passing a number of tasks through a set of
+   # stages, where each stage uses resources independent of the other stages.
+   # For example, stage one may be able to execute a maximum of two tasks at
+   # once, and stage two may also have a maximum of two, so optimally a total
+   # of four tasks can be processing at any one time.
+   class ThreadedContext
+
+     # Tracer object to mark the end of a stream of tasks.
+     FINISH = Object.new
+
+     def self.call(*args, &block)
+       new(*args, &block).run
+     end
+
+     def initialize(tasks, stages, process)
+       self.tasks   = tasks
+       self.stages  = stages
+       self.process = process
+       self.threads = []
+     end
+
+     def run
+       initial_queue, final_queue = build_pipeline(stages, tasks.length)
+
+       tasks.each.with_index do |task, i|
+         initial_queue << [i, task]
+       end
+
+       result = ordered((0...tasks.length).map { final_queue.pop })
+       flush_threads(initial_queue)
+       result
+     end
+
+     protected
+
+     attr_accessor :tasks, :stages, :process
+
+     # Floods the queue with enough FINISH markers to guarantee that each
+     # thread will see one and shut itself down.
+     def flush_threads(initial_queue)
+       threads.size.times { initial_queue << FINISH }
+       threads.each(&:join)
+     end
+
+     def ordered(tasks)
+       tasks.
+         sort_by(&:first).
+         map(&:last)
+     end
+
+     def concurrency(stage)
+       2
+     end
+
+     def build_pipeline(stages, number_of_tasks)
+       initial_queue = Queue.new
+       final_queue = stages.inject(initial_queue) do |task_queue, stage|
+         spawn_workers(stage, task_queue, number_of_tasks)
+       end
+
+       [initial_queue, final_queue]
+     end
+
+     def spawn_workers(stage, task_queue, number_of_tasks)
+       next_queue = Queue.new
+
+       self.threads += in_threads(concurrency(stage)) do
+         while true
+           index, task = task_queue.pop
+           if index == FINISH
+             next_queue << FINISH
+             break
+           else
+             next_queue << [index, process.call(stage, task)]
+           end
+         end
+       end
+
+       next_queue
+     end
+
+     def in_threads(n, &block)
+       n.times.map do
+         Thread.new(&block)
+       end
+     end
+
+     attr_accessor :threads
+   end
+ end
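
ThreadedContext honours the same `call(tasks, stages, process)` contract as SimpleContext, but runs each stage on its own pool of worker threads (see `#concurrency`), with `ordered` restoring input order at the end. A sketch, where the sleep is a stand-in for I/O-bound work:

    require 'sq/dbsync'
    require 'sq/dbsync/pipeline'

    tasks   = (1..4).to_a
    stages  = [->(x) { sleep 0.1; x * 2 }, ->(x) { x + 1 }]
    process = ->(stage, task) { stage.call(task) }

    # With two worker threads per stage the sleeps overlap, so the batch
    # should complete in well under the ~0.4s a purely serial run would
    # need, and the results still come back in task order.
    Sq::Dbsync::Pipeline::ThreadedContext.call(tasks, stages, process)
    # => [3, 5, 7, 9]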
data/lib/sq/dbsync/pipeline.rb
@@ -0,0 +1,80 @@
+ require 'sq/dbsync/pipeline/threaded_context'
+ require 'sq/dbsync/pipeline/simple_context'
+
+ module Sq::Dbsync
+
+   # An inject/reduce/fold-like abstraction to pass an array through a set of
+   # operations, where the result of the first operation is passed to the
+   # second operation, second to the third, and so on through the set until
+   # the final result is returned. It gracefully handles any individual
+   # failure and still allows other results to be computed.
+   #
+   # Any unhandled exception will place an instance of `Pipeline::Failure`
+   # into the returned results.
+   #
+   # The order and timing in which stages are run is undefined (for example,
+   # they may be parallelized), so they should be well isolated from each
+   # other.
+   #
+   # Examples
+   #
+   #     Pipeline.new([1, 2, 3],
+   #       ->(x) { x * x },
+   #       ->(x) { x + x }
+   #     ).run
+   #     # => [2, 8, 18]
+   #
+   #     Pipeline.new([1, 2],
+   #       ->(x) { x == 1 ? raise : x },
+   #       ->(x) { x * 10 }
+   #     ).run
+   #     # => [Pipeline::Failure, 20]
+   class Pipeline
+
+     def initialize(tasks, *stages)
+       self.tasks  = tasks
+       self.stages = stages
+     end
+
+     # Run the pipeline and return the computed results.
+     #
+     # context - The computational context in which to run the pipeline. Must
+     #           respond to `#call` and take tasks, stages, and a processing
+     #           lambda as arguments. By default runs the pipeline in
+     #           parallel, but an alternative `SimpleContext` is provided to
+     #           run in a single thread to aid debugging and testing.
+     def run(context = ThreadedContext)
+       context.call(tasks, stages, ->(stage, result) {
+         process(stage, result)
+       })
+     end
+
+     # Used to signal failed operations in a pipeline.
+     class Failure < StandardError
+       # The original exception that caused this failure.
+       attr_reader :wrapped_exception
+
+       # The task that was being processed when this failure occurred.
+       attr_reader :task
+
+       def initialize(wrapped, task)
+         @wrapped_exception = wrapped
+         @task              = task
+       end
+     end
+
+     protected
+
+     def process(stage, task)
+       if task.is_a?(Failure)
+         task
+       else
+         begin
+           stage.call(task)
+         rescue => e
+           Failure.new(e, task)
+         end
+       end
+     end
+
+     attr_accessor :tasks, :stages
+   end
+ end
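
Callers are expected to scan the results for `Failure` instances rather than rescue exceptions, which is exactly what `Manager#raise_all_if_pipeline_failure` does above. A hypothetical consumer:

    require 'sq/dbsync'
    require 'sq/dbsync/pipeline'

    results = Sq::Dbsync::Pipeline.new(
      [1, 2],
      ->(x) { x == 1 ? raise('boom') : x },
      ->(x) { x * 10 }
    ).run(Sq::Dbsync::Pipeline::SimpleContext)

    results.each do |result|
      next unless result.is_a?(Sq::Dbsync::Pipeline::Failure)

      # Both the failed input and the original exception are preserved.
      warn "task #{result.task.inspect} failed: #{result.wrapped_exception.message}"
    end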
data/lib/sq/dbsync/refresh_recent_load_action.rb
@@ -0,0 +1,71 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+
+   # This is a terribly named class that will delete the last X days of data
+   # from a table and reload it. Useful for tables that are nearly
+   # append-only but will sometimes update recent data (for instance, a
+   # failed import). The tables are too big to regularly reload in their
+   # entirety, but reloading only recent data fixes the main issues.
+   class RefreshRecentLoadAction < LoadAction
+     WINDOW = 60 * 60 * 24 * 2 # 2 days
+
+     def operation; 'refresh_recent'; end
+
+     def prepare
+       return false unless plan.refresh_recent
+
+       super
+     end
+
+     def post_load
+     end
+
+     def extract_data
+       @metadata   = registry.get(plan.table_name)
+       @start_time = now.call
+       @since = (
+         @metadata[:last_row_at] ||
+         @metadata[:last_synced_at]
+       ) - WINDOW
+       @file, @last_row_at = measure(:extract) { extract_to_file(@since) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         tname   = plan.table_name
+         columns = plan.columns
+         db.transaction do
+           db.delete_recent(plan, @since)
+           db.load_from_file(tname, columns, @file.path)
+         end
+       end
+       @file.close!
+       self
+     end
+
+     private
+
+     def filter_columns
+       source         = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+       plan.columns   = resolve_columns(plan, source_columns) &
+         (target_columns || source_columns)
+     end
+
+     def target_columns
+       # Because we may create the target table later if necessary, we need
+       # to check whether it *really* exists.
+       if target.table_exists?(plan.table_name)
+         target.hash_schema(plan.table_name).keys
+       end
+     end
+
+     def prefix
+       ''
+     end
+   end
+ end
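
To opt a table into this action, its plan sets `refresh_recent`: `true` enables the windowed delete-and-reload, while a Symbol additionally names the timestamp column to window on (`Manager#run_load` above copies it to `aux_timestamp_column`). A hypothetical plan entry, with column names invented for illustration:

    # Illustrative only; `refresh_recent` and `batch_load` are the fields
    # Manager#run_load inspects.
    PAYMENTS_PLAN = {
      table_name:     :payments,
      columns:        [:id, :created_at, :updated_at, :amount],
      batch_load:     true,
      refresh_recent: :created_at, # window the delete/reload on this column
    }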