sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/lib/sq/dbsync/database/mysql.rb
@@ -0,0 +1,163 @@
+ require 'delegate'
+ require 'csv'
+ require 'sq/dbsync/database/common'
+
+ module Sq::Dbsync::Database
+
+   # Thrown when a known temporary database error is detected.
+   class TransientError < RuntimeError; end
+
+   # Thrown when a command run via a sub-shell rather than Sequel fails.
+   class ExtractError < RuntimeError; end
+
+   # Decorator around a Sequel database object, providing some non-standard
+   # extensions required for effective ETL with MySQL.
+   class Mysql < Delegator
+
+     include Common
+
+     def initialize(db)
+       super
+       @db = db
+     end
+
+     def inspect; "#<Database::Mysql #{opts[:database]}>"; end
+
+     def load_from_file(table_name, columns, file_name)
+       ensure_connection
+       sql = "LOAD DATA INFILE '%s' IGNORE INTO TABLE %s (%s)" % [
+         file_name,
+         table_name,
+         escape_columns(columns)
+       ]
+       db.run sql
+     end
+
+     def set_lock_timeout(seconds)
+       db.run lock_timeout_sql(seconds)
+     end
+
+     def load_incrementally_from_file(table_name, columns, file_name)
+       ensure_connection
+       # Very low lock wait timeout, since we don't want loads to be blocked
+       # waiting for long queries.
+       set_lock_timeout(10)
+       db.run "LOAD DATA INFILE '%s' REPLACE INTO TABLE %s (%s)" % [
+         file_name,
+         table_name,
+         escape_columns(columns)
+       ]
+     rescue Sequel::DatabaseError => e
+       transient_regex =
+         /Lock wait timeout exceeded|Deadlock found when trying to get lock/
+
+       if e.message =~ transient_regex
+         raise TransientError, e.message, e.backtrace
+       else
+         raise
+       end
+     end
+
+     # 2 days is chosen as an arbitrary buffer
+     AUX_TIME_BUFFER = 60 * 60 * 24 * 2 # 2 days
+
+     # Deletes recent rows based on timestamp, but also allows filtering by an
+     # auxiliary timestamp column for the case where the primary one is not
+     # indexed on the target (such as the DFR reports, where imported_at is not
+     # indexed, but reporting date is).
+     def delete_recent(plan, since)
+       ensure_connection
+       query = db[plan.table_name].
+         filter("#{plan.timestamp} > ?", since)
+
+       if plan.aux_timestamp_column
+         query = query.filter(
+           "#{plan.aux_timestamp_column} > ?",
+           since - AUX_TIME_BUFFER
+         )
+       end
+
+       query.delete
+     end
+
+     def consistency_check(table_name, t)
+       ensure_connection
+       db[table_name].
+         filter("created_at BETWEEN ? AND ?", t - 60*60, t).
+         count
+     end
+
+     # Overridden because the Sequel implementation does not work with partial
+     # permissions on a table. See:
+     # https://github.com/jeremyevans/sequel/issues/422
+     def table_exists?(table_name)
+       begin
+         !!db.schema(table_name, reload: true)
+       rescue Sequel::DatabaseError
+         false
+       end
+     end
+
+     def drop_table(table_name)
+       db.drop_table(table_name)
+     end
+
+     def switch_table(to_replace, new_table)
+       ensure_connection
+
+       to_replace = to_replace.to_s
+
+       renames = []
+       drops = []
+
+       if table_exists?(to_replace)
+         renames << [to_replace, 'old_' + to_replace]
+         drops << 'old_' + to_replace
+       end
+       renames << [new_table, to_replace]
+
+       db.run <<-SQL
+         RENAME TABLE #{renames.map {|tables| "%s TO %s" % tables }.join(', ')}
+       SQL
+
+       drops.each { |table| drop_table(table) }
+     end
+
+     protected
+
+     attr_reader :db
+
+     def extract_sql_to_file(sql, file_name)
+       file = sql_to_file(connection_settings + sql)
+       cmd = "set -o pipefail; mysql --skip-column-names"
+       cmd += " -u %s" % opts[:user] if opts[:user]
+       cmd += " -p%s" % opts[:password] if opts[:password]
+       cmd += " -h %s" % opts[:host] if opts[:host]
+       cmd += " -P %i" % opts[:port] if opts[:port]
+       cmd += " %s" % opts.fetch(:database)
+
+       # This option prevents mysql from buffering results in memory before
+       # outputting them, allowing us to stream large tables correctly.
+       cmd += " --quick"
+
+       cmd += " < #{file.path}"
+       cmd += " | sed 's/NULL/\\\\\\N/g'"
+       cmd += " > %s" % file_name
+
+       execute!(cmd)
+     end
+
+     def escape_columns(columns)
+       columns.map {|x| "`#{x}`" }.join(', ')
+     end
+
+     def connection_settings
+       lock_timeout_sql(10)
+     end
+
+     def lock_timeout_sql(seconds)
+       "SET SESSION innodb_lock_wait_timeout = %i;" % seconds
+     end
+
+   end
+ end
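As a rough usage sketch (assumed, not taken from the gem's README or tests), the decorator above wraps an existing Sequel connection; the connection options and table names here are hypothetical:

    require 'sequel'
    require 'sq/dbsync/database/mysql'

    target = Sq::Dbsync::Database::Mysql.new(
      Sequel.connect(
        adapter:  'mysql2',
        host:     'localhost',
        user:     'etl',
        password: 'secret',
        database: 'warehouse'
      )
    )

    target.set_lock_timeout(10)

    # Swap a freshly built users_new table into place: switch_table renames the
    # current `users` out of the way, renames `users_new` over it, then drops
    # the old copy.
    target.switch_table(:users, :users_new)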
data/lib/sq/dbsync/database/postgres.rb
@@ -0,0 +1,77 @@
+ require 'delegate'
+ require 'tempfile'
+
+ require 'sq/dbsync/database/common'
+
+ module Sq::Dbsync::Database
+
+   # Decorator around a Sequel database object, providing some non-standard
+   # extensions required for effective extraction from Postgres.
+   class Postgres < Delegator
+
+     include Sq::Dbsync::Database::Common
+
+     def initialize(db)
+       super
+       @db = db
+     end
+
+     def inspect; "#<Database::Postgres #{opts[:database]}>"; end
+
+     def set_lock_timeout(seconds)
+       # Unimplemented
+     end
+
+     def hash_schema(table_name)
+       ensure_connection
+
+       result = schema(table_name).each do |col, metadata|
+         metadata[:db_type] = psql_to_mysql_conversion(metadata[:db_type])
+       end
+
+       Hash[result]
+     end
+
+     protected
+
+     attr_reader :db
+
+     def psql_to_mysql_conversion(db_type)
+       {
+         "text" => "varchar(255)",
+         "character varying(255)" => "varchar(255)",
+
+         # 255 is an arbitrary choice here. The one example we have
+         # only has data 32 characters long in it.
+         "character varying" => "varchar(255)",
+
+         # Arbitrarily chosen precision. The default numeric type in mysql is
+         # (10, 0), which is perhaps the most useless default I could imagine.
+         "numeric" => "numeric(12,6)",
+
+         "time without time zone" => "time",
+         "timestamp without time zone" => "datetime",
+         "boolean" => "char(1)"
+       }.fetch(db_type, db_type)
+     end
+
+     def extract_sql_to_file(sql, file_name)
+       sql = "COPY (#{sql}) TO STDOUT"
+       file = sql_to_file(sql)
+
+       cmd = "set -o pipefail; "
+       cmd += "psql --no-align --tuples-only -F '\t'"
+       cmd += " -U %s" % opts[:user] if opts[:user]
+       cmd += " -h %s" % opts[:host] if opts[:host]
+       cmd += " -p %i" % opts[:port] if opts[:port]
+       cmd += " %s" % opts.fetch(:database)
+       cmd += " -f %s" % file.path
+
+       cmd += " > %s" % file_name
+
+       execute!(cmd)
+     ensure
+       file.close! if file
+     end
+   end
+ end
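The distinctive piece here is hash_schema's type translation, which reports a Postgres source schema in MySQL-compatible terms so a target table can be created from it. A hypothetical illustration (the connection options and table name are invented):

    source = Sq::Dbsync::Database::Postgres.new(
      Sequel.connect(adapter: 'postgres', host: 'localhost',
                     user: 'etl', database: 'source_db')
    )

    # Column types come back already translated by psql_to_mysql_conversion,
    # e.g. "character varying"           => "varchar(255)",
    #      "timestamp without time zone" => "datetime",
    #      "numeric"                     => "numeric(12,6)".
    source.hash_schema(:payments)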
data/lib/sq/dbsync/error_handler.rb
@@ -0,0 +1,59 @@
+
+ module Sq::Dbsync
+
+   # Handles redacting sensitive information for error messages, and delegating
+   # response to a user-defined handler.
+   class ErrorHandler
+     def initialize(config)
+       @config = config
+       @handler = config.fetch(:error_handler, ->(ex) {})
+     end
+
+     def wrap(&block)
+       begin
+         with_massaged_exception(redact_passwords, &block)
+       rescue => ex
+         handler[ex]
+
+         raise ex
+       end
+     end
+
+     def notify_error(tag, ex)
+       with_massaged_exception(redact_passwords) do
+         raise ex, "[%s] %s" % [tag, ex.message], ex.backtrace
+       end
+     rescue => e
+       handler[e]
+     end
+
+     def redact_passwords
+       lambda do |message|
+         (
+           config[:sources].values + [config[:target]]
+         ).compact.inject(message) do |m, options|
+           if options[:password]
+             m.gsub(options[:password], 'REDACTED')
+           else
+             m
+           end
+         end
+       end
+     end
+
+     def with_massaged_exception(*massagers)
+       yield
+     rescue => ex
+       message = massagers.inject(ex.message) do |a, v|
+         v.call(a)
+       end
+
+       raise ex, message, ex.backtrace
+     end
+
+     private
+
+     attr_reader :config, :handler
+   end
+
+ end
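A minimal sketch of how this class is driven by the config hash (the keys shown are the ones the class reads; the values and handler body are invented):

    config = {
      sources: {
        membership: { host: 'db1.internal', user: 'etl', password: 's3cret' }
      },
      target: { host: 'warehouse.internal', user: 'etl', password: 'hunter2' },
      error_handler: ->(ex) { $stderr.puts("[dbsync] #{ex.message}") }
    }

    handler = Sq::Dbsync::ErrorHandler.new(config)

    # Any exception raised in the block is passed to :error_handler and then
    # re-raised, with both passwords replaced by 'REDACTED' in its message.
    handler.wrap do
      raise "authentication failed for password s3cret"
    end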
data/lib/sq/dbsync/example_record_destroyer.rb
@@ -0,0 +1,77 @@
+ # An example class that can reconstruct deletes from an audit log.
+ # We use the audit table as a proxy, though this is not written to in the same
+ # transaction as the destroy so it may arrive some time later.
+ #
+ # A faux-table is added to the sync times metadata "record_deletes" to make
+ # this process resilient to replication failures in either table.
+ #
+ # This is an example implementation; you will need to modify it to suit your
+ # purposes.
+ class ExampleRecordDestroyer < Struct.new(:db,
+                                           :registry,
+                                           :audit_table,
+                                           :other_table)
+   def self.run(*args)
+     new(*args).run
+   end
+
+   def run
+     max = last_sync_time(audit_table)
+
+     if max
+       user_ids = extract_deletes(unprocessed_audit_logs(max))
+
+       # This conditional should not be required, but MySQL cannot optimize the
+       # impossible where clause correctly and instead scans the table.
+       if user_ids.any?
+         db[other_table].filter(
+           user_id: user_ids
+         ).delete
+       end
+
+       # The last_row_at calculation isn't correct, but we don't use it.
+       registry.set!(meta_table,
+         last_synced_at: max,
+         last_row_at: max,
+         last_batch_synced_at: nil
+       )
+     end
+   end
+
+   def extract_deletes(audit_logs)
+     audit_logs.
+       group_by {|x| x[:target_id] }.
+       select {|_, xs| last_value_set(xs) == 'false' }.
+       keys
+   end
+
+   def unprocessed_audit_logs(max)
+
+     query = db[audit_table].
+       select(:target_id, :new_value, :updated_at).
+       filter('updated_at <= ?', max).
+       filter(action_name: %w(delete))
+
+     min = last_sync_time(meta_table)
+     if min
+       query = query.filter('updated_at > ?', min)
+     end
+
+     query.to_a
+   end
+
+   def last_sync_time(table)
+     record = registry.get(table)
+
+     (record || {}).fetch(:last_synced_at, nil)
+   end
+
+   # updated_at is not distinct, so use the id column as a tie-break.
+   def last_value_set(xs)
+     xs.sort_by {|y| [y[:updated_at], y[:id]] }.last[:new_value]
+   end
+
+   def meta_table
+     :"#{other_table}_deletes"
+   end
+ end
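A hypothetical invocation, replaying deletes recorded in a replicated audit table against a replicated users table (the handles and table names are illustrative):

    ExampleRecordDestroyer.run(
      target_db,    # Sequel-style handle to the target database
      registry,     # sync-time registry responding to #get and #set!
      :audit_logs,  # replicated audit table recording delete actions
      :users        # table whose rows are removed, matched on user_id
    )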
data/lib/sq/dbsync/incremental_load_action.rb
@@ -0,0 +1,95 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+
+   # Load action to incrementally keep a table up-to-date by loading deltas from
+   # the source system. Note that this technique is unable by itself to detect
+   # deletes, but behaviour can be added to delete records based on a separate
+   # audit log. See documentation for more details.
+   class IncrementalLoadAction < LoadAction
+     def operation; 'increment'; end
+
+     def prepare
+       if super
+         if plan.always_sync
+           registry.set(plan.table_name,
+             last_synced_at: EPOCH,
+             last_batch_synced_at: EPOCH,
+             last_row_at: nil
+           )
+         end
+
+         !!registry.get(plan.table_name)
+       else
+         if plan.always_sync
+           registry.delete(plan.table_name)
+           target.drop_table(plan.table_name)
+         end
+         false
+       end
+     end
+
+     def extract_data
+       @metadata = registry.get(plan.table_name)
+       @start_time = now.call
+       since = (
+         @metadata[:last_row_at] ||
+         @metadata[:last_synced_at]
+       ) - overlap
+
+       @file, @last_row_at = measure(:extract) { extract_to_file(since) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         db.transaction do
+           db.load_incrementally_from_file(
+             plan.prefixed_table_name,
+             plan.columns,
+             @file.path
+           )
+
+           process_deletes
+
+           registry.update(plan.table_name, @metadata[:last_batch_synced_at],
+             last_synced_at: @start_time,
+             last_row_at: @last_row_at
+           )
+         end
+         @file.close!
+       end
+       self
+     end
+
+     def post_load
+       self
+     end
+
+     def prefix
+       ''
+     end
+
+     def filter_columns
+       source = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+       plan.columns = resolve_columns(plan, source_columns) &
+         (target_columns || source_columns)
+     end
+
+     def target_columns
+       # Because we may create the target table later if necessary,
+       # we need to check if it *really* exists
+       target_columns = if target.table_exists?(plan.table_name)
+         tname = "#{prefix}#{plan.table_name}"
+         target.hash_schema(tname).keys
+       else
+         nil
+       end
+     end
+
+     def process_deletes
+       # Provided as a hook for subclasses
+     end
+   end
+ end
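Since process_deletes is called inside the load transaction, a subclass can reconstruct deletes at that point. A sketch (the subclass name and audit table are invented) wiring in ExampleRecordDestroyer from above:

    class UserLoadAction < Sq::Dbsync::IncrementalLoadAction
      def process_deletes
        # `db` is the target database and `registry` the sync-time registry,
        # both inherited from LoadAction.
        ExampleRecordDestroyer.run(db, registry, :audit_logs, plan.table_name)
      end
    end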
data/lib/sq/dbsync/load_action.rb
@@ -0,0 +1,156 @@
+ require 'date'
+ require 'ostruct'
+
+ require 'sq/dbsync/schema_maker'
+ require 'sq/dbsync/tempfile_factory'
+
+ module Sq::Dbsync
+   # A stateful action object representing the transfer of data from a source
+   # table to a target. The action can be performed in full using `#call`, but
+   # control can also be inverted using the `.stages` method, which allows the
+   # action to be combined to run efficiently in parallel with other actions.
+   #
+   # This is useful because a single load taxes the source system then the target
+   # system in sequence, so for maximum efficiency a second load should be
+   # interleaved to start taxing the source system as soon as the first finishes
+   # the extract, rather than waiting for it to also finish the load. This is not
+   # possible if the process is fully encapsulated as it is in `#call`.
+   #
+   # This is an abstract base class; see `BatchLoadAction` and
+   # `IncrementalLoadAction` for example subclasses.
+   class LoadAction
+     EPOCH = Date.new(2000, 1, 1).to_time
+
+     # An empty action that is used when a load needs to be noop'ed in a manner
+     # that does not raise an error (i.e. expected conditions).
+     class NullAction
+       def extract_data; self; end
+       def load_data; self; end
+       def post_load; self; end
+     end
+
+     def initialize(target, plan, registry, logger, now = ->{ Time.now.utc })
+       @target = target
+       @plan = OpenStruct.new(plan)
+       @registry = registry
+       @logger = logger
+       @now = now
+     end
+
+     def tag
+       plan.table_name
+     end
+
+     def call
+       self.class.stages.inject(self) {|x, v| v.call(x) }
+     end
+
+     def self.stages
+       [
+         ->(x) { x.do_prepare || NullAction.new },
+         ->(x) { x.extract_data },
+         ->(x) { x.load_data },
+         ->(x) { x.post_load }
+       ]
+     end
+
+     def do_prepare
+       return unless prepare
+
+       ensure_target_exists
+       self
+     end
+
+     protected
+
+     attr_reader :target, :plan, :registry, :logger, :now
+
+     def prepare
+       unless plan.source_db.table_exists?(plan.source_table_name)
+         logger.log("%s does not exist" % plan.source_table_name)
+         return false
+       end
+       add_schema_to_table_plan(plan)
+       plan.prefixed_table_name = (prefix + plan.table_name.to_s).to_sym
+       filter_columns
+       plan.timestamp ||=
+         ([:updated_at, :created_at] & plan.columns)[0]
+     end
+
+     def ensure_target_exists
+       unless target.table_exists?(plan.prefixed_table_name)
+         SchemaMaker.create_table(target, plan)
+       end
+     end
+
+     def add_schema_to_table_plan(x)
+       x.schema ||= x.source_db.hash_schema(x.source_table_name)
+       x
+     end
+
+     def resolve_columns(plan, source_columns)
+       if plan.columns == :all
+         source_columns
+       else
+         source_columns & plan.columns
+       end
+     end
+
+     def extract_to_file(since)
+       plan.source_db.ensure_connection
+       plan.source_db.set_lock_timeout(10)
+
+       last_row_at = timestamp_table(plan).
+         max(plan.timestamp)
+
+       file = make_writeable_tempfile
+
+       plan.source_db.extract_incrementally_to_file(
+         plan,
+         file.path,
+         since,
+         0
+       )
+
+       [file, last_row_at]
+     end
+
+     # This functionality is provided as a workaround for the postgres query
+     # planner failing to use indexes correctly for MAX() on a view that uses
+     # UNION under the covers.
+     #
+     # It is most useful under the assumption that one of the tables being
+     # unioned will always contain the most recent record (true in all current
+     # cases). If this is not true, you must provide a custom view that supports
+     # this query with a sane plan.
+     def timestamp_table(plan)
+       plan.source_db[plan.timestamp_table_name || plan.source_table_name]
+     end
+
+     def db; target; end
+
+     def measure(stage, &block)
+       label = "%s.%s.%s" % [
+         operation,
+         stage,
+         plan.table_name
+       ]
+       logger.measure(label) { block.call }
+     end
+
+     def overlap
+       self.class.overlap
+     end
+
+     # The distance we look back in time (in seconds) prior to the most recent
+     # row we have seen. This needs to be comfortably more than the maximum
+     # expected time for a long-running transaction.
+     def self.overlap
+       120
+     end
+
+     def make_writeable_tempfile
+       TempfileFactory.make_world_writable(plan.table_name.to_s)
+     end
+   end
+ end
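A rough sketch of the inversion of control that `.stages` enables, interleaving two loads so the second extract starts while the first load runs (the two action instances and the threading are illustrative; in the gem itself this scheduling is presumably the job of the Pipeline and Manager classes):

    prepare, extract, load, post = Sq::Dbsync::LoadAction.stages

    a = prepare.call(action_a)
    b = prepare.call(action_b)

    a = extract.call(a)                             # source system busy with A
    first = Thread.new { post.call(load.call(a)) }  # target loads A...
    b = extract.call(b)                             # ...while the source extracts B
    first.join
    post.call(load.call(b))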