sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/lib/sq/dbsync/database/mysql.rb
@@ -0,0 +1,163 @@
+ require 'delegate'
+ require 'csv'
+ require 'sq/dbsync/database/common'
+
+ module Sq::Dbsync::Database
+
+   # Thrown when a known temporary database error is detected.
+   class TransientError < RuntimeError; end
+
+   # Thrown when a command run via a sub-shell rather than Sequel fails.
+   class ExtractError < RuntimeError; end
+
+   # Decorator around a Sequel database object, providing some non-standard
+   # extensions required for effective ETL with MySQL.
+   class Mysql < Delegator
+
+     include Common
+
+     def initialize(db)
+       super
+       @db = db
+     end
+
+     def inspect; "#<Database::Mysql #{opts[:database]}>"; end
+
+     def load_from_file(table_name, columns, file_name)
+       ensure_connection
+       sql = "LOAD DATA INFILE '%s' IGNORE INTO TABLE %s (%s)" % [
+         file_name,
+         table_name,
+         escape_columns(columns)
+       ]
+       db.run sql
+     end
+
+     def set_lock_timeout(seconds)
+       db.run lock_timeout_sql(seconds)
+     end
+
+     def load_incrementally_from_file(table_name, columns, file_name)
+       ensure_connection
+       # Very low lock wait timeout, since we don't want loads to be blocked
+       # waiting for long queries.
+       set_lock_timeout(10)
+       db.run "LOAD DATA INFILE '%s' REPLACE INTO TABLE %s (%s)" % [
+         file_name,
+         table_name,
+         escape_columns(columns)
+       ]
+     rescue Sequel::DatabaseError => e
+       transient_regex =
+         /Lock wait timeout exceeded|Deadlock found when trying to get lock/
+
+       if e.message =~ transient_regex
+         raise TransientError, e.message, e.backtrace
+       else
+         raise
+       end
+     end
+
+     # 2 days is chosen as an arbitrary buffer
+     AUX_TIME_BUFFER = 60 * 60 * 24 * 2 # 2 days
+
+     # Deletes recent rows based on timestamp, but also allows filtering by an
+     # auxiliary timestamp column for the case where the primary one is not
+     # indexed on the target (such as the DFR reports, where imported_at is not
+     # indexed, but reporting date is).
+     def delete_recent(plan, since)
+       ensure_connection
+       query = db[plan.table_name].
+         filter("#{plan.timestamp} > ?", since)
+
+       if plan.aux_timestamp_column
+         query = query.filter(
+           "#{plan.aux_timestamp_column} > ?",
+           since - AUX_TIME_BUFFER
+         )
+       end
+
+       query.delete
+     end
+
+     def consistency_check(table_name, t)
+       ensure_connection
+       db[table_name].
+         filter("created_at BETWEEN ? AND ?", t - 60*60, t).
+         count
+     end
+
+     # Overridden because the Sequel implementation does not work with partial
+     # permissions on a table. See:
+     # https://github.com/jeremyevans/sequel/issues/422
+     def table_exists?(table_name)
+       begin
+         !!db.schema(table_name, reload: true)
+       rescue Sequel::DatabaseError
+         false
+       end
+     end
+
+     def drop_table(table_name)
+       db.drop_table(table_name)
+     end
+
+     def switch_table(to_replace, new_table)
+       ensure_connection
+
+       to_replace = to_replace.to_s
+
+       renames = []
+       drops = []
+
+       if table_exists?(to_replace)
+         renames << [to_replace, 'old_' + to_replace]
+         drops << 'old_' + to_replace
+       end
+       renames << [new_table, to_replace]
+
+       db.run <<-SQL
+         RENAME TABLE #{renames.map {|tables| "%s TO %s" % tables }.join(', ')}
+       SQL
+
+       drops.each { |table| drop_table(table) }
+     end
+
+     protected
+
+     attr_reader :db
+
+     def extract_sql_to_file(sql, file_name)
+       file = sql_to_file(connection_settings + sql)
+       cmd = "set -o pipefail; mysql --skip-column-names"
+       cmd += " -u %s" % opts[:user] if opts[:user]
+       cmd += " -p%s" % opts[:password] if opts[:password]
+       cmd += " -h %s" % opts[:host] if opts[:host]
+       cmd += " -P %i" % opts[:port] if opts[:port]
+       cmd += " %s" % opts.fetch(:database)
+
+       # This option prevents mysql from buffering results in memory before
+       # outputting them, allowing us to stream large tables correctly.
+       cmd += " --quick"
+
+       cmd += " < #{file.path}"
+       cmd += " | sed 's/NULL/\\\\\\N/g'"
+       cmd += " > %s" % file_name
+
+       execute!(cmd)
+     end
+
+     def escape_columns(columns)
+       columns.map {|x| "`#{x}`" }.join(', ')
+     end
+
+     def connection_settings
+       lock_timeout_sql(10)
+     end
+
+     def lock_timeout_sql(seconds)
+       "SET SESSION innodb_lock_wait_timeout = %i;" % seconds
+     end
+
+   end
+ end
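As a rough usage sketch (assumed, not taken from the gem's README or tests), the decorator above wraps an existing Sequel connection; the connection options and table names here are hypothetical:

    require 'sequel'
    require 'sq/dbsync/database/mysql'

    target = Sq::Dbsync::Database::Mysql.new(
      Sequel.connect(
        adapter:  'mysql2',
        host:     'localhost',
        user:     'etl',
        password: 'secret',
        database: 'warehouse'
      )
    )

    target.set_lock_timeout(10)

    # Swap a freshly built users_new table into place: switch_table renames the
    # current `users` out of the way, renames `users_new` over it, then drops
    # the old copy.
    target.switch_table(:users, :users_new)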
data/lib/sq/dbsync/database/postgres.rb
@@ -0,0 +1,77 @@
+ require 'delegate'
+ require 'tempfile'
+
+ require 'sq/dbsync/database/common'
+
+ module Sq::Dbsync::Database
+
+   # Decorator around a Sequel database object, providing some non-standard
+   # extensions required for effective extraction from Postgres.
+   class Postgres < Delegator
+
+     include Sq::Dbsync::Database::Common
+
+     def initialize(db)
+       super
+       @db = db
+     end
+
+     def inspect; "#<Database::Postgres #{opts[:database]}>"; end
+
+     def set_lock_timeout(seconds)
+       # Unimplemented
+     end
+
+     def hash_schema(table_name)
+       ensure_connection
+
+       result = schema(table_name).each do |col, metadata|
+         metadata[:db_type] = psql_to_mysql_conversion(metadata[:db_type])
+       end
+
+       Hash[result]
+     end
+
+     protected
+
+     attr_reader :db
+
+     def psql_to_mysql_conversion(db_type)
+       {
+         "text" => "varchar(255)",
+         "character varying(255)" => "varchar(255)",
+
+         # 255 is an arbitrary choice here. The one example we have
+         # only has data 32 characters long in it.
+         "character varying" => "varchar(255)",
+
+         # Arbitrarily chosen precision. The default numeric type in mysql is
+         # (10, 0), which is perhaps the most useless default I could imagine.
+         "numeric" => "numeric(12,6)",
+
+         "time without time zone" => "time",
+         "timestamp without time zone" => "datetime",
+         "boolean" => "char(1)"
+       }.fetch(db_type, db_type)
+     end
+
+     def extract_sql_to_file(sql, file_name)
+       sql = "COPY (#{sql}) TO STDOUT"
+       file = sql_to_file(sql)
+
+       cmd = "set -o pipefail; "
+       cmd += "psql --no-align --tuples-only -F '\t'"
+       cmd += " -U %s" % opts[:user] if opts[:user]
+       cmd += " -h %s" % opts[:host] if opts[:host]
+       cmd += " -p %i" % opts[:port] if opts[:port]
+       cmd += " %s" % opts.fetch(:database)
+       cmd += " -f %s" % file.path
+
+       cmd += " > %s" % file_name
+
+       execute!(cmd)
+     ensure
+       file.close! if file
+     end
+   end
+ end
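The distinctive piece here is hash_schema's type translation, which reports a Postgres source schema in MySQL-compatible terms so a target table can be created from it. A hypothetical illustration (the connection options and table name are invented):

    source = Sq::Dbsync::Database::Postgres.new(
      Sequel.connect(adapter: 'postgres', host: 'localhost',
                     user: 'etl', database: 'source_db')
    )

    # Column types come back already translated by psql_to_mysql_conversion,
    # e.g. "character varying"           => "varchar(255)",
    #      "timestamp without time zone" => "datetime",
    #      "numeric"                     => "numeric(12,6)".
    source.hash_schema(:payments)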
data/lib/sq/dbsync/error_handler.rb
@@ -0,0 +1,59 @@
+
+ module Sq::Dbsync
+
+   # Handles redacting sensitive information for error messages, and delegating
+   # response to a user-defined handler.
+   class ErrorHandler
+     def initialize(config)
+       @config = config
+       @handler = config.fetch(:error_handler, ->(ex) {})
+     end
+
+     def wrap(&block)
+       begin
+         with_massaged_exception(redact_passwords, &block)
+       rescue => ex
+         handler[ex]
+
+         raise ex
+       end
+     end
+
+     def notify_error(tag, ex)
+       with_massaged_exception(redact_passwords) do
+         raise ex, "[%s] %s" % [tag, ex.message], ex.backtrace
+       end
+     rescue => e
+       handler[e]
+     end
+
+     def redact_passwords
+       lambda do |message|
+         (
+           config[:sources].values + [config[:target]]
+         ).compact.inject(message) do |m, options|
+           if options[:password]
+             m.gsub(options[:password], 'REDACTED')
+           else
+             m
+           end
+         end
+       end
+     end
+
+     def with_massaged_exception(*massagers)
+       yield
+     rescue => ex
+       message = massagers.inject(ex.message) do |a, v|
+         v.call(a)
+       end
+
+       raise ex, message, ex.backtrace
+     end
+
+     private
+
+     attr_reader :config, :handler
+   end
+
+ end
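A minimal sketch of how this class is driven by the config hash (the keys shown are the ones the class reads; the values and handler body are invented):

    config = {
      sources: {
        membership: { host: 'db1.internal', user: 'etl', password: 's3cret' }
      },
      target: { host: 'warehouse.internal', user: 'etl', password: 'hunter2' },
      error_handler: ->(ex) { $stderr.puts("[dbsync] #{ex.message}") }
    }

    handler = Sq::Dbsync::ErrorHandler.new(config)

    # Any exception raised in the block is passed to :error_handler and then
    # re-raised, with both passwords replaced by 'REDACTED' in its message.
    handler.wrap do
      raise "authentication failed for password s3cret"
    end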
data/lib/sq/dbsync/example_record_destroyer.rb
@@ -0,0 +1,77 @@
+ # An example class that can reconstruct deletes from an audit log.
+ # We use the audit table as a proxy, though this is not written to in the same
+ # transaction as the destroy so it may arrive some time later.
+ #
+ # A faux-table is added to the sync times metadata "record_deletes" to make
+ # this process resilient to replication failures in either table.
+ #
+ # This is an example implementation; you will need to modify it to suit your
+ # purposes.
+ class ExampleRecordDestroyer < Struct.new(:db,
+                                           :registry,
+                                           :audit_table,
+                                           :other_table)
+   def self.run(*args)
+     new(*args).run
+   end
+
+   def run
+     max = last_sync_time(audit_table)
+
+     if max
+       user_ids = extract_deletes(unprocessed_audit_logs(max))
+
+       # This conditional should not be required, but MySQL cannot optimize the
+       # impossible where clause correctly and instead scans the table.
+       if user_ids.any?
+         db[other_table].filter(
+           user_id: user_ids
+         ).delete
+       end
+
+       # The last_row_at calculation isn't correct, but we don't use it.
+       registry.set!(meta_table,
+         last_synced_at: max,
+         last_row_at: max,
+         last_batch_synced_at: nil
+       )
+     end
+   end
+
+   def extract_deletes(audit_logs)
+     audit_logs.
+       group_by {|x| x[:target_id] }.
+       select {|_, xs| last_value_set(xs) == 'false' }.
+       keys
+   end
+
+   def unprocessed_audit_logs(max)
+
+     query = db[audit_table].
+       select(:target_id, :new_value, :updated_at).
+       filter('updated_at <= ?', max).
+       filter(action_name: %w(delete))
+
+     min = last_sync_time(meta_table)
+     if min
+       query = query.filter('updated_at > ?', min)
+     end
+
+     query.to_a
+   end
+
+   def last_sync_time(table)
+     record = registry.get(table)
+
+     (record || {}).fetch(:last_synced_at, nil)
+   end
+
+   # updated_at is not distinct, so use the id column as a tie-break.
+   def last_value_set(xs)
+     xs.sort_by {|y| [y[:updated_at], y[:id]] }.last[:new_value]
+   end
+
+   def meta_table
+     :"#{other_table}_deletes"
+   end
+ end
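A hypothetical invocation, replaying deletes recorded in a replicated audit table against a replicated users table (the handles and table names are illustrative):

    ExampleRecordDestroyer.run(
      target_db,    # Sequel-style handle to the target database
      registry,     # sync-time registry responding to #get and #set!
      :audit_logs,  # replicated audit table recording delete actions
      :users        # table whose rows are removed, matched on user_id
    )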
data/lib/sq/dbsync/incremental_load_action.rb
@@ -0,0 +1,95 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+
+   # Load action to incrementally keep a table up-to-date by loading deltas from
+   # the source system. Note that this technique is unable by itself to detect
+   # deletes, but behaviour can be added to delete records based on a separate
+   # audit log. See documentation for more details.
+   class IncrementalLoadAction < LoadAction
+     def operation; 'increment'; end
+
+     def prepare
+       if super
+         if plan.always_sync
+           registry.set(plan.table_name,
+             last_synced_at: EPOCH,
+             last_batch_synced_at: EPOCH,
+             last_row_at: nil
+           )
+         end
+
+         !!registry.get(plan.table_name)
+       else
+         if plan.always_sync
+           registry.delete(plan.table_name)
+           target.drop_table(plan.table_name)
+         end
+         false
+       end
+     end
+
+     def extract_data
+       @metadata = registry.get(plan.table_name)
+       @start_time = now.call
+       since = (
+         @metadata[:last_row_at] ||
+         @metadata[:last_synced_at]
+       ) - overlap
+
+       @file, @last_row_at = measure(:extract) { extract_to_file(since) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         db.transaction do
+           db.load_incrementally_from_file(
+             plan.prefixed_table_name,
+             plan.columns,
+             @file.path
+           )
+
+           process_deletes
+
+           registry.update(plan.table_name, @metadata[:last_batch_synced_at],
+             last_synced_at: @start_time,
+             last_row_at: @last_row_at
+           )
+         end
+         @file.close!
+       end
+       self
+     end
+
+     def post_load
+       self
+     end
+
+     def prefix
+       ''
+     end
+
+     def filter_columns
+       source = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+       plan.columns = resolve_columns(plan, source_columns) &
+         (target_columns || source_columns)
+     end
+
+     def target_columns
+       # Because we may create the target table later if necessary,
+       # we need to check if it *really* exists
+       target_columns = if target.table_exists?(plan.table_name)
+         tname = "#{prefix}#{plan.table_name}"
+         target.hash_schema(tname).keys
+       else
+         nil
+       end
+     end
+
+     def process_deletes
+       # Provided as a hook for subclasses
+     end
+   end
+ end
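Since process_deletes is called inside the load transaction, a subclass can reconstruct deletes at that point. A sketch (the subclass name and audit table are invented) wiring in ExampleRecordDestroyer from above:

    class UserLoadAction < Sq::Dbsync::IncrementalLoadAction
      def process_deletes
        # `db` is the target database and `registry` the sync-time registry,
        # both inherited from LoadAction.
        ExampleRecordDestroyer.run(db, registry, :audit_logs, plan.table_name)
      end
    end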
data/lib/sq/dbsync/load_action.rb
@@ -0,0 +1,156 @@
+ require 'date'
+ require 'ostruct'
+
+ require 'sq/dbsync/schema_maker'
+ require 'sq/dbsync/tempfile_factory'
+
+ module Sq::Dbsync
+   # A stateful action object representing the transfer of data from a source
+   # table to a target. The action can be performed in full using `#call`, but
+   # control can also be inverted using the `.stages` method, which allows the
+   # action to be combined to run efficiently in parallel with other actions.
+   #
+   # This is useful because a single load taxes the source system then the target
+   # system in sequence, so for maximum efficiency a second load should be
+   # interleaved to start taxing the source system as soon as the first finishes
+   # the extract, rather than waiting for it to also finish the load. This is not
+   # possible if the process is fully encapsulated as it is in `#call`.
+   #
+   # This is an abstract base class; see `BatchLoadAction` and
+   # `IncrementalLoadAction` for example subclasses.
+   class LoadAction
+     EPOCH = Date.new(2000, 1, 1).to_time
+
+     # An empty action that is used when a load needs to be noop'ed in a manner
+     # that does not raise an error (i.e. expected conditions).
+     class NullAction
+       def extract_data; self; end
+       def load_data; self; end
+       def post_load; self; end
+     end
+
+     def initialize(target, plan, registry, logger, now = ->{ Time.now.utc })
+       @target = target
+       @plan = OpenStruct.new(plan)
+       @registry = registry
+       @logger = logger
+       @now = now
+     end
+
+     def tag
+       plan.table_name
+     end
+
+     def call
+       self.class.stages.inject(self) {|x, v| v.call(x) }
+     end
+
+     def self.stages
+       [
+         ->(x) { x.do_prepare || NullAction.new },
+         ->(x) { x.extract_data },
+         ->(x) { x.load_data },
+         ->(x) { x.post_load }
+       ]
+     end
+
+     def do_prepare
+       return unless prepare
+
+       ensure_target_exists
+       self
+     end
+
+     protected
+
+     attr_reader :target, :plan, :registry, :logger, :now
+
+     def prepare
+       unless plan.source_db.table_exists?(plan.source_table_name)
+         logger.log("%s does not exist" % plan.source_table_name)
+         return false
+       end
+       add_schema_to_table_plan(plan)
+       plan.prefixed_table_name = (prefix + plan.table_name.to_s).to_sym
+       filter_columns
+       plan.timestamp ||=
+         ([:updated_at, :created_at] & plan.columns)[0]
+     end
+
+     def ensure_target_exists
+       unless target.table_exists?(plan.prefixed_table_name)
+         SchemaMaker.create_table(target, plan)
+       end
+     end
+
+     def add_schema_to_table_plan(x)
+       x.schema ||= x.source_db.hash_schema(x.source_table_name)
+       x
+     end
+
+     def resolve_columns(plan, source_columns)
+       if plan.columns == :all
+         source_columns
+       else
+         source_columns & plan.columns
+       end
+     end
+
+     def extract_to_file(since)
+       plan.source_db.ensure_connection
+       plan.source_db.set_lock_timeout(10)
+
+       last_row_at = timestamp_table(plan).
+         max(plan.timestamp)
+
+       file = make_writeable_tempfile
+
+       plan.source_db.extract_incrementally_to_file(
+         plan,
+         file.path,
+         since,
+         0
+       )
+
+       [file, last_row_at]
+     end
+
+     # This functionality is provided as a workaround for the postgres query
+     # planner failing to use indexes correctly for MAX() on a view that uses
+     # UNION under the covers.
+     #
+     # It is most useful under the assumption that one of the tables being
+     # unioned will always contain the most recent record (true in all current
+     # cases). If this is not true, you must provide a custom view that supports
+     # this query with a sane plan.
+     def timestamp_table(plan)
+       plan.source_db[plan.timestamp_table_name || plan.source_table_name]
+     end
+
+     def db; target; end
+
+     def measure(stage, &block)
+       label = "%s.%s.%s" % [
+         operation,
+         stage,
+         plan.table_name
+       ]
+       logger.measure(label) { block.call }
+     end
+
+     def overlap
+       self.class.overlap
+     end
+
+     # The distance we look back in time (in seconds) prior to the most recent
+     # row we have seen. This needs to be comfortably more than the maximum
+     # expected time for a long-running transaction.
+     def self.overlap
+       120
+     end
+
+     def make_writeable_tempfile
+       TempfileFactory.make_world_writable(plan.table_name.to_s)
+     end
+   end
+ end
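A rough sketch of the inversion of control that `.stages` enables, interleaving two loads so the second extract starts while the first load runs (the two action instances and the threading are illustrative; in the gem itself this scheduling is presumably the job of the Pipeline and Manager classes):

    prepare, extract, load, post = Sq::Dbsync::LoadAction.stages

    a = prepare.call(action_a)
    b = prepare.call(action_b)

    a = extract.call(a)                             # source system busy with A
    first = Thread.new { post.call(load.call(a)) }  # target loads A...
    b = extract.call(b)                             # ...while the source extracts B
    first.join
    post.call(load.call(b))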