sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/lib/sq/dbsync/loggers.rb
@@ -0,0 +1,135 @@
+ require 'time'
+ require 'socket'
+
+ # Instrumenting various aspects of the system is critical: imports take
+ # longer and longer as the data sources grow, and it is necessary to know
+ # when the import time is becoming excessive (either replication can't keep
+ # up, or recovery time is too long).
+ module Sq::Dbsync::Loggers
+
+   # Abstract base class for loggers. This is useful because the Composite
+   # logger needs to delegate to a set of loggers, which requires an explicit
+   # interface to communicate with. This class helps define that relationship
+   # and describe the interfaces.
+   class Abstract
+     def measure(label, &block); end
+     def log(str); end
+   end
+
+   # Writes timing information to stdout. Thread-safe in that calls to
+   # measure from separate threads will execute in parallel but synchronize
+   # before writing their output.
+   class Stream < Abstract
+     def initialize(out = $stdout)
+       @mutex = Mutex.new
+       @out   = out
+     end
+
+     def measure(label, &block)
+       start_time = Time.now.utc
+       log_measurement(start_time, :starting, 0, label)
+       ret       = nil
+       exception = nil
+       state     = :finished
+       begin
+         ret = block.call
+       rescue => e
+         state     = :failed
+         exception = e
+         raise
+       ensure
+         end_time = Time.now.utc
+         log_measurement(end_time, state, end_time - start_time, label)
+         log(exception.message) if exception
+       end
+       ret
+     end
+
+     def log_measurement(time, event, duration, object)
+       log([
+         event,
+         "%.3f" % duration,
+         object
+       ].join("\t"), time)
+     end
+
+     def log(str, time = Time.now.utc)
+       # Synchronize to ensure lines are not interwoven.
+       mutex.synchronize { out.puts([time, str].join("\t")) }
+     end
+
+     private
+
+     attr_reader :mutex, :out
+   end
+
+   # Combines multiple loggers together.
+   class Composite < Abstract
+     attr_accessor :loggers
+
+     def initialize(loggers = nil)
+       @loggers = loggers
+     end
+
+     def measure(label, &block)
+       # Babushka doll! Logger inside a logger inside a logger.
+       loggers.inject(block) do |inner, logger|
+         lambda do
+           logger.measure(label) do
+             inner.call
+           end
+         end
+       end.call
+     end
+
+     def log(str)
+       loggers.each { |logger| logger.log(str) }
+     end
+   end
+
+   # Logs metric run time to graphite, using the Carbon plaintext protocol
+   # ("<path> <value> <timestamp>\n").
+   class Graphite < Abstract
+     def initialize(opts)
+       @opts = opts
+     end
+
+     def measure(label, &block)
+       start_time = Time.now.utc
+       block.call
+     ensure
+       end_time = Time.now.utc
+       record_metric(end_time.to_i, label, end_time - start_time)
+     end
+
+     def record_metric(timestamp, name, value)
+       msg = "#{@opts.fetch(:prefix, 'dbsync')}.#{name} #{value} #{timestamp}\n"
+
+       s = TCPSocket.new(@opts[:host], @opts.fetch(:port, 2003))
+       s.send msg, 0
+       s.close
+     end
+   end
+
+   # Used in test environments where instrumentation is not required.
+   class Null < Abstract
+     def measure(label, &block)
+       block.call
+     end
+   end
+
+   # Logging is one of the few outputs of the system, so this class is
+   # provided as a cheap way to allow tests to hook into events. It should
+   # not be used in production.
+   class NullWithCallbacks < Abstract
+     attr_accessor :callbacks
+
+     def initialize(callbacks = nil)
+       @callbacks = callbacks
+     end
+
+     def measure(label, &block)
+       (callbacks || {}).fetch(label, ->{}).call
+       block.call
+     end
+   end
+ end
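
For orientation, here is a minimal usage sketch of the loggers above. The label and measured block are hypothetical; the classes and methods (`Composite`, `Stream#measure`, `Null`) are the ones defined in this file:

    require 'sq/dbsync'
    require 'sq/dbsync/loggers'

    # Fan output out to several loggers at once; Stream writes tab-separated
    # "timestamp  event  duration  label" lines to the given IO.
    logger = Sq::Dbsync::Loggers::Composite.new([
      Sq::Dbsync::Loggers::Stream.new($stdout),
      Sq::Dbsync::Loggers::Null.new, # stand-in; e.g. Graphite in production
    ])

    logger.measure(:users_import) do
      sleep 0.1 # hypothetical work; an exception is logged as :failed and re-raised
    end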
data/lib/sq/dbsync/manager.rb
@@ -0,0 +1,241 @@
+ require 'date'
+
+ require 'sq/dbsync/config'
+ require 'sq/dbsync/batch_load_action'
+ require 'sq/dbsync/incremental_load_action'
+ require 'sq/dbsync/refresh_recent_load_action'
+ require 'sq/dbsync/pipeline'
+ require 'sq/dbsync/table_registry'
+ require 'sq/dbsync/consistency_verifier'
+ require 'sq/dbsync/database/connection'
+ require 'sq/dbsync/error_handler'
+
+ # The manager orchestrates the high-level functions of the sync, such as
+ # keeping the database up-to-date and batch loading.
+ #
+ # This is the main entry point for the application.
+ class Sq::Dbsync::Manager
+   include Sq::Dbsync
+
+   EPOCH       = Date.new(2000, 1, 1).to_time
+   MAX_RETRIES = 10
+
+   def initialize(config, plans)
+     @config        = Sq::Dbsync::Config.make(config)
+     @plans         = plans
+     @error_handler = ErrorHandler.new(config)
+   end
+
+   def batch(tables = :all)
+     error_handler.wrap do
+       batch_nonactive(tables)
+       refresh_recent(tables)
+     end
+   end
+
+   def increment
+     error_handler.wrap do
+       incremental
+     end
+   end
+
+   def batch_nonactive(tables = :all)
+     registry.ensure_storage_exists
+
+     measure(:batch_total) do
+       raise_all_if_pipeline_failure(
+         run_load(BatchLoadAction, Pipeline::ThreadedContext, tables)
+       )
+     end
+   end
+
+   def refresh_recent(tables = :all)
+     registry.ensure_storage_exists
+
+     measure(:refresh_recent_total) do
+       raise_all_if_pipeline_failure(
+         run_load(RefreshRecentLoadAction, Pipeline::ThreadedContext, tables)
+       )
+     end
+   end
+
+   def incremental
+     @running = true
+     counter  = 0
+
+     loop_with_retry_on(->{ @running }, transient_exceptions) do
+       incremental_once
+
+       counter = (counter + 1) % 100
+       if counter == 1
+         # No need to do this every cycle; 100 is chosen to be as good as any
+         # other number. It should run on the very first cycle, however, so
+         # that the specs will cover it.
+         increment_checkpoint
+       end
+     end
+   end
+
+   def incremental_once
+     # In theory, this ensures that any changes to the source IP (such as
+     # from a virtual IP flip) are picked up.
+     sources.each do |_, db|
+       db.disconnect
+     end
+
+     raise_if_pipeline_failure(
+       # ThreadedContext would be ideal here, but it leaks memory in JRuby.
+       # Not sure why yet, but mass creation of threads seems like an obvious
+       # candidate for brokenness.
+       #
+       # TODO: Above comment probably isn't true with 1.7 and ThreadedContext
+       # fixes.
+       run_load(incremental_action, Pipeline::SimpleContext)
+     )
+   end
+
+   # Actions that need to be performed regularly, but not every cycle. Please
+   # do suggest a better name for this method.
+   def increment_checkpoint
+     verifier.check_consistency!(tables_to_load)
+
+     purge_registry
+   end
+
+   def stop!
+     @running = false
+   end
+
+   def target
+     @target ||= Sq::Dbsync::Database::Connection.create(config[:target])
+   end
+
+   def tables_to_load
+     plans_with_sources.map do |plan, source|
+       plan.tables(source).map do |x|
+         x.update(source_db: source)
+       end
+     end.reduce([], :+).uniq {|x| x[:table_name] }
+   end
+
+   def plans_with_sources
+     @plans_with_sources ||= plans.map do |plan, source_name|
+       [plan, sources.fetch(source_name)]
+     end
+   end
+
+   def sources
+     @sources ||= Hash[config[:sources].map do |name, opts|
+       [name, Sq::Dbsync::Database::Connection.create(opts)]
+     end]
+   end
+
+   attr_accessor :config, :plans, :error_handler
+
+   private
+
+   def run_load(action, context, tables = :all)
+     items = tables_to_load.map do |tplan|
+       if tables != :all
+         next unless tables.include?(tplan[:table_name])
+
+         # Force loading of specified tables, otherwise it would be
+         # impossible to batch load tables that were not regularly loaded.
+         tplan[:batch_load] = true
+
+         # Force refresh of tables; this is expected behaviour if you are
+         # calling the refresh-recent script with an explicit table list.
+         tplan[:refresh_recent] = true
+       end
+
+       if tplan[:refresh_recent].is_a?(Symbol)
+         tplan[:aux_timestamp_column] = tplan[:refresh_recent]
+       end
+
+       action.new(db, tplan, registry, logger, config[:clock])
+     end.compact
+
+     Pipeline.new(items, *LoadAction.stages).run(context)
+   end
+
+   # This is necessary so that old tables that are no longer being synced do
+   # not break our lag calculations.
+   def purge_registry
+     registry.purge_except(expected_table_names)
+   end
+
+   def expected_table_names
+     tables_to_load.map {|x| x[:table_name] } +
+       config.fetch(:extra_tables, [])
+   end
+
+   def loop_with_retry_on(guard, transient_exceptions, &block)
+     consecutive_fails = 0
+
+     while guard.call
+       begin
+         block.call
+         consecutive_fails = 0
+       rescue *transient_exceptions
+         consecutive_fails += 1
+         raise if consecutive_fails >= MAX_RETRIES
+       end
+     end
+   end
+
+   def raise_if_pipeline_failure(results)
+     results.each do |result|
+       if result.is_a?(Pipeline::Failure)
+         raise result.wrapped_exception
+       end
+     end
+   end
+
+   def raise_all_if_pipeline_failure(results)
+     failed = false
+     results.each do |result|
+       if result.is_a?(Pipeline::Failure)
+         error_handler.notify_error(result.task.tag, result.wrapped_exception)
+         failed = true
+       end
+     end
+
+     if failed
+       raise Database::ExtractError,
+         "One or more loads failed, see other exceptions for details."
+     end
+   end
+
+   def measure(label, &block)
+     logger.measure(label) do
+       block.call
+     end
+   end
+
+   def registry
+     TableRegistry.new(target)
+   end
+
+   def verifier
+     @verifier ||= ConsistencyVerifier.new(target, registry)
+   end
+
+   def logger
+     config[:logger]
+   end
+
+   def db
+     @db ||= Database::Connection.create(config[:target])
+   end
+
+   def transient_exceptions
+     [
+       Database::ExtractError,
+       Database::TransientError
+     ]
+   end
+
+   def incremental_action
+     config.fetch(:incremental_action, IncrementalLoadAction)
+   end
+ end
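
As a sketch of how this entry point is typically driven: the connection option keys and plan fields below are illustrative only (data/README.md in this gem documents the real configuration format), but `Manager.new(config, plans)`, `#batch`, and `#incremental` are the interface defined above:

    require 'sq/dbsync'

    config = {
      sources: {
        db_a: { brand: 'mysql', host: 'db-a-host', user: 'ro', database: 'app' },
      },
      target:  { brand: 'postgres', host: 'replica-host', user: 'rw', database: 'replica' },
      logger:  Sq::Dbsync::Loggers::Stream.new,
      clock:   ->{ Time.now.utc },
    }

    # Hypothetical plan: keep two columns of `users` in sync from db_a.
    plans = [
      [Sq::Dbsync::StaticTablePlan.new([{
        table_name: :users,
        columns:    [:id, :updated_at],
      }]), :db_a],
    ]

    manager = Sq::Dbsync::Manager.new(config, plans)
    manager.batch       # one-off batch load, plus a refresh of recent data
    manager.incremental # then tail changes until manager.stop! is called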
data/lib/sq/dbsync/pipeline/simple_context.rb
@@ -0,0 +1,15 @@
+ # See lib/sq/dbsync/pipeline.rb
+ class Sq::Dbsync::Pipeline
+
+   # A computational context that passes a number of tasks through a set of
+   # stages in sequence.
+   class SimpleContext
+     def self.call(tasks, stages, process)
+       tasks.map do |task|
+         stages.inject(task) do |result, stage|
+           process.call(stage, result)
+         end
+       end
+     end
+   end
+ end
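
The contract a context must satisfy is visible here: `call` receives the tasks, the stages, and a `process` lambda, and returns one result per task. A quick illustrative run (the values are arbitrary):

    require 'sq/dbsync'
    require 'sq/dbsync/pipeline'

    tasks   = [1, 2, 3]
    stages  = [->(x) { x * x }, ->(x) { x + 1 }]
    process = ->(stage, task) { stage.call(task) }

    Sq::Dbsync::Pipeline::SimpleContext.call(tasks, stages, process)
    # => [2, 5, 10]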
data/lib/sq/dbsync/pipeline/threaded_context.rb
@@ -0,0 +1,95 @@
+ require 'thread'
+
+ # See lib/sq/dbsync/pipeline.rb
+ class Sq::Dbsync::Pipeline
+
+   # A computational context for passing a number of tasks through a set of
+   # stages, where each stage uses resources independent of the other stages.
+   # For example, stage one may be able to execute a maximum of two tasks at
+   # once, and stage two may also have a maximum of two, so optimally a total
+   # of four tasks can be processing at any one time.
+   class ThreadedContext
+
+     # Tracer object to mark the end of a stream of tasks.
+     FINISH = Object.new
+
+     def self.call(*args, &block)
+       new(*args, &block).run
+     end
+
+     def initialize(tasks, stages, process)
+       self.tasks   = tasks
+       self.stages  = stages
+       self.process = process
+       self.threads = []
+     end
+
+     def run
+       initial_queue, final_queue = build_pipeline(stages, tasks.length)
+
+       tasks.each.with_index do |task, i|
+         initial_queue << [i, task]
+       end
+
+       result = ordered((0...tasks.length).map { final_queue.pop })
+       flush_threads(initial_queue)
+       result
+     end
+
+     protected
+
+     attr_accessor :tasks, :stages, :process
+
+     # Floods the queue with enough FINISH markers to guarantee that each
+     # thread will see one and shut itself down.
+     def flush_threads(initial_queue)
+       threads.size.times { initial_queue << FINISH }
+       threads.each(&:join)
+     end
+
+     def ordered(tasks)
+       tasks.
+         sort_by(&:first).
+         map(&:last)
+     end
+
+     def concurrency(stage)
+       2
+     end
+
+     def build_pipeline(stages, number_of_tasks)
+       initial_queue = Queue.new
+       final_queue = stages.inject(initial_queue) do |task_queue, stage|
+         spawn_workers(stage, task_queue, number_of_tasks)
+       end
+
+       [initial_queue, final_queue]
+     end
+
+     def spawn_workers(stage, task_queue, number_of_tasks)
+       next_queue = Queue.new
+
+       self.threads += in_threads(concurrency(stage)) do
+         while true
+           index, task = task_queue.pop
+           if index == FINISH
+             next_queue << FINISH
+             break
+           else
+             next_queue << [index, process.call(stage, task)]
+           end
+         end
+       end
+
+       next_queue
+     end
+
+     def in_threads(n, &block)
+       n.times.map do
+         Thread.new(&block)
+       end
+     end
+
+     attr_accessor :threads
+   end
+ end
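
ThreadedContext honours the same `call(tasks, stages, process)` contract as SimpleContext, but runs each stage on its own pool of worker threads (see `#concurrency`), with `ordered` restoring input order at the end. A sketch, where the sleep is a stand-in for I/O-bound work:

    require 'sq/dbsync'
    require 'sq/dbsync/pipeline'

    tasks   = (1..4).to_a
    stages  = [->(x) { sleep 0.1; x * 2 }, ->(x) { x + 1 }]
    process = ->(stage, task) { stage.call(task) }

    # With two worker threads per stage the sleeps overlap, so the batch
    # should complete in well under the ~0.4s a purely serial run would
    # need, and the results still come back in task order.
    Sq::Dbsync::Pipeline::ThreadedContext.call(tasks, stages, process)
    # => [3, 5, 7, 9]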
data/lib/sq/dbsync/pipeline.rb
@@ -0,0 +1,80 @@
+ require 'sq/dbsync/pipeline/threaded_context'
+ require 'sq/dbsync/pipeline/simple_context'
+
+ module Sq::Dbsync
+
+   # An inject/reduce/fold-like abstraction to pass an array through a set of
+   # operations, where the result of the first operation is passed to the
+   # second operation, second to the third, and so on through the set until
+   # the final result is returned. It gracefully handles any individual
+   # failure and still allows other results to be computed.
+   #
+   # Any unhandled exception will place an instance of `Pipeline::Failure`
+   # into the returned results.
+   #
+   # The order and timing in which stages are run is undefined (for example,
+   # they may be parallelized), so they should be well isolated from each
+   # other.
+   #
+   # Examples
+   #
+   #     Pipeline.new([1, 2, 3],
+   #       ->(x) { x * x },
+   #       ->(x) { x + x }
+   #     ).run
+   #     # => [2, 8, 18]
+   #
+   #     Pipeline.new([1, 2],
+   #       ->(x) { x == 1 ? raise : x },
+   #       ->(x) { x * 10 }
+   #     ).run
+   #     # => [Pipeline::Failure, 20]
+   class Pipeline
+
+     def initialize(tasks, *stages)
+       self.tasks  = tasks
+       self.stages = stages
+     end
+
+     # Run the pipeline and return the computed results.
+     #
+     # context - The computational context in which to run the pipeline. Must
+     #           respond to `#call` and take tasks, stages, and a processing
+     #           lambda as arguments. By default runs the pipeline in
+     #           parallel, but an alternative `SimpleContext` is provided to
+     #           run in a single thread to aid debugging and testing.
+     def run(context = ThreadedContext)
+       context.call(tasks, stages, ->(stage, result) {
+         process(stage, result)
+       })
+     end
+
+     # Used to signal failed operations in a pipeline.
+     class Failure < StandardError
+       # The original exception that caused this failure.
+       attr_reader :wrapped_exception
+
+       # The task that was being processed when this failure occurred.
+       attr_reader :task
+
+       def initialize(wrapped, task)
+         @wrapped_exception = wrapped
+         @task              = task
+       end
+     end
+
+     protected
+
+     def process(stage, task)
+       if task.is_a?(Failure)
+         task
+       else
+         begin
+           stage.call(task)
+         rescue => e
+           Failure.new(e, task)
+         end
+       end
+     end
+
+     attr_accessor :tasks, :stages
+   end
+ end
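
Callers are expected to scan the results for `Failure` instances rather than rescue exceptions, which is exactly what `Manager#raise_all_if_pipeline_failure` does above. A hypothetical consumer:

    require 'sq/dbsync'
    require 'sq/dbsync/pipeline'

    results = Sq::Dbsync::Pipeline.new(
      [1, 2],
      ->(x) { x == 1 ? raise('boom') : x },
      ->(x) { x * 10 }
    ).run(Sq::Dbsync::Pipeline::SimpleContext)

    results.each do |result|
      next unless result.is_a?(Sq::Dbsync::Pipeline::Failure)

      # Both the failed input and the original exception are preserved.
      warn "task #{result.task.inspect} failed: #{result.wrapped_exception.message}"
    end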
data/lib/sq/dbsync/refresh_recent_load_action.rb
@@ -0,0 +1,71 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+
+   # This is a terribly named class that will delete the last X days of data
+   # from a table and reload it. Useful for tables that are nearly
+   # append-only but will sometimes update recent data (for instance, a
+   # failed import). The tables are too big to regularly reload in their
+   # entirety, but reloading only recent data fixes the main issues.
+   class RefreshRecentLoadAction < LoadAction
+     WINDOW = 60 * 60 * 24 * 2 # 2 days
+
+     def operation; 'refresh_recent'; end
+
+     def prepare
+       return false unless plan.refresh_recent
+
+       super
+     end
+
+     def post_load
+     end
+
+     def extract_data
+       @metadata   = registry.get(plan.table_name)
+       @start_time = now.call
+       @since = (
+         @metadata[:last_row_at] ||
+         @metadata[:last_synced_at]
+       ) - WINDOW
+       @file, @last_row_at = measure(:extract) { extract_to_file(@since) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         tname   = plan.table_name
+         columns = plan.columns
+         db.transaction do
+           db.delete_recent(plan, @since)
+           db.load_from_file(tname, columns, @file.path)
+         end
+       end
+       @file.close!
+       self
+     end
+
+     private
+
+     def filter_columns
+       source         = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+       plan.columns   = resolve_columns(plan, source_columns) &
+         (target_columns || source_columns)
+     end
+
+     def target_columns
+       # Because we may create the target table later if necessary, we need
+       # to check whether it *really* exists.
+       if target.table_exists?(plan.table_name)
+         target.hash_schema(plan.table_name).keys
+       end
+     end
+
+     def prefix
+       ''
+     end
+   end
+ end
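
To opt a table into this action, its plan sets `refresh_recent`: `true` enables the windowed delete-and-reload, while a Symbol additionally names the timestamp column to window on (`Manager#run_load` above copies it to `aux_timestamp_column`). A hypothetical plan entry, with column names invented for illustration:

    # Illustrative only; `refresh_recent` and `batch_load` are the fields
    # Manager#run_load inspects.
    PAYMENTS_PLAN = {
      table_name:     :payments,
      columns:        [:id, :created_at, :updated_at, :amount],
      batch_load:     true,
      refresh_recent: :created_at, # window the delete/reload on this column
    }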