sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/HISTORY.md ADDED
@@ -0,0 +1,5 @@
+ # Square Dbsync History
+
+ ## 1.0.0 - 23 February 2013 (c505c0e7)
+
+ * Initial public release.
data/LICENSE ADDED
@@ -0,0 +1,14 @@
+
+ Copyright 2012 Square Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,218 @@
+ Square Dbsync
+ =============
+
+ An extract and load system to shunt data between databases.
+
+ It uses timestamp-based replication, which is fast and easy to keep running,
+ but has some caveats. Most notably, it does not handle deletes well (see
+ documentation below for details).
+
+ This was useful to us at Square because we needed partial (only select
+ columns), continuous replication from both MySQL and PostgreSQL databases to a
+ single target database, with some basic ETL logic along the way. None of the
+ existing solutions were able to do this adequately.
+
+ At some point you will need to bite the bullet and implement a real ETL system,
+ but `sq-dbsync` can tide you over until you get there.
+
+ Usage
+ -----
+
+ ``` ruby
+ include Sq::Dbsync
+
+ # Config will typically differ per environment.
+ config = {
+   sources: {
+     db_a: {
+       database: 'db_a_production',
+       user: 'sqdbsync-ro',
+       password: 'password',
+       host: 'db-a-host',
+       brand: 'mysql',
+       port: 3306,
+     },
+     db_b: {
+       database: 'db_b_production',
+       user: 'sqdbsync-ro',
+       password: 'password',
+       host: 'db-b-host',
+       brand: 'postgresql',
+       port: 5432,
+     }
+   },
+   target: {
+     database: 'replica',
+     user: 'sqdbsync',
+     password: 'password',
+
+     # Only localhost supported, since `LOAD DATA INFILE` is used which
+     # requires a shared temp directory.
+     host: 'localhost',
+     brand: 'mysql',
+     port: 3306,
+   },
+
+   # Optional configuration
+   logger: Loggers::Stream.new,      # A graphite logger is provided, see source.
+   clock: ->{ Time.now.utc },        # In test env it can be useful to fix this.
+   error_handler: ->(e) { puts(e) }  # Notify your exception system
+ }
+
+ # Write plans that specify how data is replicated.
+ DB_A_PLAN = [{
+   table_name: :users,
+   columns: [
+     # You must replicate the primary key.
+     :id,
+
+     # You must replicate a timestamp column, and it should be indexed on the
+     # target system.
+     :updated_at,
+
+     # Then whatever other columns you require.
+     :name,
+     :account_type,
+     :created_at,
+   ],
+   indexes: {
+     # Indexing it on the source system is optional.
+     index_users_on_updated_at: { columns: [:updated_at], unique: false },
+   },
+
+   # Basic schema transformations are supported.
+   db_types: {
+     account_type: [:enum, %w(
+       bronze
+       silver
+       gold
+     )]
+   }
+ }, {
+   table_name: :account_types,
+   source_table_name: :user_account_types,
+   columns: :all
+ }]
+
+ plans = [
+   [StaticTablePlan.new(DB_A_PLAN), :db_a],
+   [AllTablesPlan.new, :db_b]
+ ]
+
+ manager = Manager.new(config, plans)
+
+ # Run a batch load nightly.
+ manager.batch(ALL_TABLES)
+
+ # Run an incremental load continuously.
+ manager.increment
+
+ # You can load a subset of tables if necessary.
+ manager.batch([:users])
+ ```
+
+ Documentation
+ -------------
+
+ ### Plan Options
+
+ * `batch_load` Whether or not to batch load this table in the default batch
+   load. If the table is specifically requested, it will be loaded regardless
+   of this setting. (default: true)
+ * `charset` Charset to use when creating the table. Passed directly through to
+   [Sequel::MySQL::Database#connect](http://sequel.rubyforge.org/rdoc-adapters/classes/Sequel/MySQL/Database.html).
+   MySQL only, ignored for Postgres. (default: 'utf8')
+ * `columns` Either an array of columns to replicate, or `:all` indicating that
+   all columns should be replicated. (required)
+ * `consistency` Perform a basic consistency check on the table regularly during
+   the incremental load by comparing recent counts of the source and target
+   tables. Make sure you have a timestamp index on both tables! This was
+   particularly useful when developing the project, but honestly probably isn't
+   that useful now --- I can't remember the last time I saw an error from this.
+   (default: false)
+ * `db_types` A hash that allows you to modify the target schema from the
+   source. See the example in the usage section above. (default: `{}`)
+ * `indexes` A hash defining desired indexes on the target table. Indexes are
+   *not* copied from source tables. See the example in the usage section above.
+   (default: `{}`)
+ * `refresh_recent` Some tables are too large to batch load regularly, but
+   modifications are known to be recent. This setting will cause the last two
+   days of data to be dropped and recreated as part of the nightly batch load.
+   (default: false)
+ * `source_table_name` Allows the source and target tables to be named
+   differently. (default: the `table_name` configuration option)
+ * `timestamp_table_name` A hack to work around the Postgres query planner
+   failing to use indexes correctly for `MAX()` on a view that uses `UNION`
+   under the covers. If this describes your source view, and one of the
+   underlying tables is guaranteed to contain the latest record, you can set
+   this value to that table and it will be used for all timestamp-related
+   queries. If not, you must provide a custom view that supports a `MAX` query
+   with a sane query plan. (default: nil)
+ * `table_name` The name of the table to be replicated. If `source_table_name`
+   is specified, this option defines the name of the table in the target
+   database only.
+ * `primary_key` Usually the primary key can be inferred from the source schema,
+   but if you are replicating from a view you will need to specify it explicitly
+   with this option. Should be an array of symbols. (default: nil, will
+   auto-detect from source schema)
+ * `timestamp` The column to treat as a timestamp. Must be a member of the
+   `:columns` option. (default: select `updated_at` or `created_at`, in that
+   order)
+
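+ For example, a plan entry for a large, view-backed table might combine several
+ of the options above (a sketch only --- the table and column names here are
+ hypothetical):
+
+ ``` ruby
+ plan_entry = {
+   table_name: :payments,    # hypothetical table sourced from a view
+   columns: [:id, :amount_cents, :updated_at],
+   batch_load: false,        # too large for the default nightly batch load
+   refresh_recent: true,     # but the last two days can be reloaded nightly
+   primary_key: [:id],       # explicit, since views have no inferrable key
+   timestamp: :updated_at,
+   indexes: {
+     index_payments_on_updated_at: { columns: [:updated_at], unique: false }
+   }
+ }
+ ```
+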
+ ### Handling Deletes
+
+ The incremental load has no way of detecting deleted records. The nightly batch
+ load will reload all tables, so there will be at most a one-day turnaround on
+ deletes. However, some tables will be too big to batch load every night, so
+ this is not a great solution in that case.
+
+ If you have an "audit" table that contains enough data for you to reconstruct
+ deletes in other tables, then you can provide a custom subclass to the
+ incremental loader that will run this logic.
+ incremental loader that will be able to run this logic.
+ ``` ruby
+ class IncrementalLoadWithDeletes < Sq::Dbsync::IncrementalLoadAction
+   def process_deletes
+     if plan.table_name == :audit_logs
+       ExampleRecordDestroyer.run(db, registry, :audit_logs, :other_table)
+     end
+   end
+ end
+
+ CONFIG = {
+   # ...
+   incremental_action: IncrementalLoadWithDeletes,
+ }
+ ```
+
+ See `lib/sq/dbsync/example_record_destroyer` for a sample implementation.
+
+ ### Database Settings
+
+ If your target database is MySQL, we recommend that you ensure it is running
+ under the `READ COMMITTED` isolation level. This makes it much harder for an
+ analyst to lock a table and block replication. (Statements like `CREATE TABLE
+ AS SELECT FROM ...` tend to be the culprit.)
+
+ Developing
+ ----------
+
+     bundle
+     bundle exec rake
+
+ Compatibility
+ -------------
+
+ Requires Ruby 1.9. Tested on CRuby 1.9.3 and JRuby.
+
+ ## Support
+
+ Make a [new GitHub issue](https://github.com/square/sq-dbsync/issues/new).
+
+ ## Contributing
+
+ Fork and patch! Before any changes are merged to master, we need you to sign an
+ [Individual Contributor
+ Agreement](https://spreadsheets.google.com/a/squareup.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1)
+ (Google Form).
data/lib/sq/dbsync/all_tables_plan.rb ADDED
@@ -0,0 +1,51 @@
+ module Sq::Dbsync
+   # Fetches all tables from the given source, retrieving tables and columns.
+   # Indexes are currently ignored.
+   class AllTablesPlan
+     def tables(source)
+       source.ensure_connection
+
+       source.tables.map do |t|
+         schema_for_table(source, t)
+       end.compact
+     end
+
+     private
+
+     def schema_for_table(source, t)
+       schema = source.schema(t, reload: true)
+
+       return unless has_primary_key?(schema)
+       return unless has_timestamp?(schema)
+
+       cols = schema.map do |col|
+         col[0]
+       end
+
+       {
+         source_db:         source,
+         source_table_name: t,
+         table_name:        t,
+         columns:           cols,
+         indexes:           {},
+         always_sync:       true
+       }
+     rescue Sequel::DatabaseError
+       # This handles a race condition where the table is deleted between us
+       # selecting the list of tables and fetching the schema.
+       nil
+     end
+
+     def has_primary_key?(schema)
+       schema.any? do |col|
+         col[1][:primary_key]
+       end
+     end
+
+     def has_timestamp?(schema)
+       schema.any? do |col|
+         [:updated_at, :created_at].include?(col[0])
+       end
+     end
+   end
+ end
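The plan hashes this class emits are the same shape as the manual plan entries in the README. A sketch of calling it directly, assuming `source_db` is a wrapper built by `Database::Connection.create`:

``` ruby
plans = Sq::Dbsync::AllTablesPlan.new.tables(source_db)
plans.first # => { source_db: ..., source_table_name: :users, table_name: :users,
            #      columns: [...], indexes: {}, always_sync: true }
```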
data/lib/sq/dbsync/batch_load_action.rb ADDED
@@ -0,0 +1,95 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+   # Load action to reload an entire table in full. The table will be loaded in
+   # parallel to the existing one, then atomically swapped in on completion.
+   class BatchLoadAction < LoadAction
+     MAX_LAG = 60 * 5
+
+     def operation; 'batch'; end
+
+     def prepare
+       return false if plan.batch_load == false
+
+       if super
+         if target.table_exists?(plan.prefixed_table_name)
+           target.drop_table(plan.prefixed_table_name)
+         end
+         true
+       end
+     end
+
+     def extract_data
+       @start_time = now.call
+       @file, @last_row_at = measure(:extract) { extract_to_file(nil) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         TempfileFactory.split(@file, 1_000_000, logger) do |path|
+           db.load_from_file(
+             plan.prefixed_table_name,
+             plan.columns,
+             path
+           )
+         end
+         @file.close!
+       end
+       self
+     end
+
+     def post_load
+       while @start_time <= now.call - MAX_LAG
+         @start_time = now.call
+         catchup
+       end
+
+       switch_tables
+       self
+     end
+
+     private
+
+     def filter_columns
+       source = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+
+       plan.columns = resolve_columns(plan, source_columns)
+     end
+
+     def prefix
+       'new_'
+     end
+
+     def catchup
+       file, @last_row_at = measure(:catchup_extract) {
+         extract_to_file(@last_row_at ? @last_row_at - overlap : nil)
+       }
+       measure(:catchup_load) do
+         db.load_incrementally_from_file(
+           plan.prefixed_table_name,
+           plan.columns,
+           file.path
+         )
+         file.close!
+       end
+     end
+
+     def switch_tables
+       measure(:switch) do
+         registry.delete(plan.table_name)
+         db.switch_table(
+           plan.table_name,
+           plan.prefixed_table_name
+         )
+         registry.set(plan.table_name,
+           last_synced_at:       @start_time,
+           last_batch_synced_at: @start_time,
+           last_row_at:          @last_row_at
+         )
+       end
+     end
+   end
+ end
data/lib/sq/dbsync/config.rb ADDED
@@ -0,0 +1,12 @@
+ require 'sq/dbsync/loggers'
+
+ # Helper class to provide sane defaults to user-supplied config.
+ class Sq::Dbsync::Config
+   def self.make(hash)
+     {
+       clock:         ->{ Time.now.utc },
+       logger:        Sq::Dbsync::Loggers::Stream.new,
+       error_handler: ->(e) { $stderr.puts(e.message, e.backtrace) }
+     }.merge(hash)
+   end
+ end
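Since `Config.make` merges the user hash over the defaults, any caller-supplied key wins. A sketch, assuming the gem is loaded (`sources`/`target` shown empty for brevity):

``` ruby
config = Sq::Dbsync::Config.make(
  sources: {},
  target:  {},
  clock:   ->{ Time.at(0).utc } # e.g. a fixed clock for tests
)
config[:logger]     # => the default Loggers::Stream instance
config[:clock].call # => 1970-01-01 00:00:00 UTC
```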
data/lib/sq/dbsync/consistency_verifier.rb ADDED
@@ -0,0 +1,70 @@
+ require 'sq/dbsync/load_action' # For overlap, not ideal
+
+ module Sq::Dbsync
+
+   # Performs a cheap check to verify that the number of records present for a
+   # recent time slice is the same across source and target tables.
+   #
+   # This checks consistency on the current tables, not the new_ set.
+   class ConsistencyVerifier
+     def initialize(target, registry)
+       @target   = target
+       @registry = registry
+     end
+
+     def check_consistency!(tables)
+       tables.each do |tplan|
+         next unless tplan[:consistency]
+         verify_consistency!(tplan)
+       end
+     end
+
+     def verify_consistency!(tplan)
+       last_row_at = registry.get(tplan[:table_name])[:last_row_at]
+       return unless last_row_at
+
+       now = last_row_at - LoadAction.overlap
+
+       counts = [
+         tplan[:source_db],
+         target
+       ].map do |x|
+         x.consistency_check(tplan[:table_name], now)
+       end
+
+       delta = counts.reduce(:-)
+
+       unless delta == 0
+         raise ConsistencyError.new(
+           tplan[:table_name],
+           delta,
+           "source: #{tplan[:source_db].name} (count: #{counts[0]}), " +
+           "sink: #{target.name} (count: #{counts[1]})"
+         )
+       end
+     end
+
+     attr_reader :target, :registry
+
+     # Used to signal an observed error in the number of records between source
+     # and target tables. There are no current known situations in which this
+     # occurs, though in the past buggy handling of replication lag was normally
+     # the culprit.
+     #
+     # If it does occur, a good first response is to set `last_sync_time` to the
+     # last batch time (usually within 24 hours), which will force the
+     # incremental load to reconsider all recent records.
+     class ConsistencyError < RuntimeError
+       def initialize(table_name, delta, description = "")
+         @table_name  = table_name
+         @delta       = delta
+         @description = description
+       end
+
+       def message
+         output = "%s had a count difference of %i" % [@table_name, @delta]
+         output += "; " + @description unless @description.empty?
+         output
+       end
+     end
+   end
+ end
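Driving the verifier by hand looks roughly like this (a sketch; `target_db`, `registry`, and `plans` are assumed to be the objects the manager builds elsewhere):

``` ruby
verifier = Sq::Dbsync::ConsistencyVerifier.new(target_db, registry)

begin
  verifier.check_consistency!(plans) # skips plans without consistency: true
rescue Sq::Dbsync::ConsistencyVerifier::ConsistencyError => e
  $stderr.puts(e.message) # e.g. "users had a count difference of 3; source: ..."
end
```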
data/lib/sq/dbsync/database/common.rb ADDED
@@ -0,0 +1,91 @@
+ require 'sq/dbsync/tempfile_factory'
+
+ module Sq::Dbsync::Database
+   module Common
+
+     SQD = ::Sq::Dbsync
+
+     def extract_to_file(table_name, columns, file_name)
+       extract_sql_to_file("SELECT %s FROM %s" % [
+         columns.join(', '),
+         table_name
+       ], file_name)
+     end
+
+     def extract_incrementally_to_file(plan, file_name, last_row_at, overlap)
+       table_name = plan.source_table_name.to_sym
+       db_columns = db.schema(table_name).map(&:first)
+
+       query = self[table_name].select(*plan.columns)
+       if last_row_at
+         query = query.filter("#{plan.timestamp} > ?", last_row_at - overlap)
+       end
+
+       extract_sql_to_file(query.sql, file_name)
+     end
+
+     def hash_schema(table_name)
+       ensure_connection
+       Hash[schema(table_name)]
+     end
+
+     def name
+       self['SELECT database()'].first.fetch(:'database()')
+     end
+
+     # Since we go so long without using connections (during a batch load), they
+     # go stale and raise DatabaseDisconnectError when we try to use them. This
+     # method ensures that the connection is fresh even after a long time
+     # between drinks.
+     def ensure_connection
+       db.disconnect
+     end
+
+     def __getobj__
+       db
+     end
+
+     def __setobj__(db)
+       @db = db
+     end
+
+     protected
+
+     def execute!(cmd)
+       # psql doesn't return a non-zero error code when executing commands from
+       # a file. The best way I can come up with is to raise if anything is
+       # present on stderr.
+       errors_file = SQD::TempfileFactory.make('extract_sql_to_file_errors')
+
+       cmd = %{bash -c "#{cmd.gsub(/"/, '\\"')}"}
+
+       result = run_shell(cmd, errors_file)
+
+       unless result.exitstatus == 0 && File.size(errors_file.path) == 0
+         raise(ExtractError, "Command failed: #{cmd}")
+       end
+     ensure
+       errors_file.close! if errors_file
+     end
+
+     def sql_to_file(sql)
+       SQD::TempfileFactory.make_with_content('extract_sql_to_file', sql)
+     end
+
+     private
+
+     def run_shell(cmd, errors_file)
+       if RUBY_PLATFORM == 'java'
+         IO.popen4(cmd) { |_, _, _, stderr|
+           errors_file.write(stderr.read)
+           errors_file.flush
+         }
+         $?
+       else
+         pid = Process.spawn(cmd, STDERR => errors_file.path)
+         Process.waitpid2(pid)[1]
+       end
+     end
+
+   end
+ end
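The `overlap` parameter in `extract_incrementally_to_file` widens the extraction window backwards, so rows committed out of order near `last_row_at` are picked up again (which is presumably why the incremental load path upserts rather than blindly inserting). A worked example of the window arithmetic:

``` ruby
require 'time'

last_row_at = Time.utc(2013, 2, 23, 10, 0, 0)
overlap     = 60 # seconds; the real value is defined on LoadAction

# The generated WHERE clause is `timestamp > last_row_at - overlap`,
# i.e. everything after:
puts last_row_at - overlap # => 2013-02-23 09:59:00 UTC
```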
data/lib/sq/dbsync/database/connection.rb ADDED
@@ -0,0 +1,23 @@
+ require 'sequel/no_core_ext'
+
+ Sequel.default_timezone = :utc
+
+ require 'sq/dbsync/database/mysql'
+ require 'sq/dbsync/database/postgres'
+
+ module Sq::Dbsync::Database
+   # Factory class to abstract selection of a decorator, to facilitate
+   # databases other than MySQL.
+   class Connection
+     def self.create(opts)
+       case opts[:brand]
+       when 'mysql'
+         Sq::Dbsync::Database::Mysql.new(Sequel.connect(opts))
+       when 'postgresql'
+         Sq::Dbsync::Database::Postgres.new(Sequel.connect(opts))
+       else
+         raise "Unsupported database: #{opts.inspect}"
+       end
+     end
+   end
+ end
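The factory keys off the same `brand` option shown in the README config; the full hash is handed through to `Sequel.connect`. A sketch with placeholder credentials:

``` ruby
require 'sq/dbsync/database/connection'

# `brand` selects the decorator; the remaining keys (plus any
# Sequel-specific options your adapter needs) go to Sequel.connect.
source = Sq::Dbsync::Database::Connection.create(
  brand:    'postgresql',
  database: 'db_b_production',
  user:     'sqdbsync-ro',
  password: 'password',
  host:     'db-b-host',
  port:     5432
)
source.ensure_connection # refresh a possibly stale connection (see common.rb)
```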