sq-dbsync 1.0.0

Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/lib/sq/dbsync/loggers.rb
@@ -0,0 +1,135 @@
+ require 'time'
+ require 'socket'
+
+ # Instrumenting various aspects of the system is critical since it will take
+ # longer and longer as the data sources grow and it is necessary to know when
+ # this import time is taking too long (either replication can't keep up, or
+ # recovery time is too long).
+ module Sq::Dbsync::Loggers
+
+   # Abstract base class for loggers. This is useful because the CompositeLogger
+   # needs to delegate to a set of loggers, which requires an explicit interface
+   # to communicate with. This class helps define that relationship and describe
+   # the interfaces.
+   class Abstract
+     def measure(label, &block); end
+     def log(str); end
+   end
+
+   # Writes timing information to stdout. Thread-safe in that calls to measure
+   # from separate threads will execute in parallel but synchronize before
+   # writing their output.
+   class Stream < Abstract
+     def initialize(out = $stdout)
+       @mutex = Mutex.new
+       @out   = out
+     end
+
+     def measure(label, &block)
+       start_time = Time.now.utc
+       log_measurement(start_time, :starting, 0, label)
+       ret       = nil
+       exception = nil
+       state     = :finished
+       begin
+         ret = block.call
+       rescue => e
+         state     = :failed
+         exception = e
+         raise
+       ensure
+         end_time = Time.now.utc
+         log_measurement(end_time, state, end_time - start_time, label)
+         log(exception.message) if exception
+       end
+       ret
+     end
+
+     def log_measurement(time, event, duration, object)
+       log([
+         event,
+         "%.3f" % duration,
+         object
+       ].join("\t"), time)
+     end
+
+     def log(str, time = Time.now.utc)
+       # Synchronize to ensure lines are not interwoven.
+       mutex.synchronize { out.puts([time, str].join("\t")) }
+     end
+
+     private
+
+     attr_reader :mutex, :out
+   end
+
+   # Combines multiple loggers together.
+   class Composite < Abstract
+     attr_accessor :loggers
+
+     def initialize(loggers = nil)
+       @loggers = loggers
+     end
+
+     def measure(label, &block)
+       # Babushka doll! Logger inside a logger inside a logger.
+       loggers.inject(block) do |block, logger|
+         lambda do
+           logger.measure(label) do
+             block.call
+           end
+         end
+       end.call
+     end
+
+     def log(str)
+       loggers.each { |logger| logger.log(str) }
+     end
+   end
+
+   # Logs metric run time to graphite.
+   class Graphite < Abstract
+     def initialize(opts)
+       @opts = opts
+     end
+
+     def measure(label, &block)
+       start_time = Time.now.utc
+       block.call
+     ensure
+       end_time = Time.now.utc
+       record_metric(end_time.to_i, label, end_time - start_time)
+     end
+
+     def record_metric(timestamp, name, value)
+       msg = "#{@opts.fetch(:prefix, 'dbsync')}.#{name} #{value} #{timestamp}\n"
+
+       s = TCPSocket.new(@opts[:host], @opts.fetch(:port, 2003))
+       s.send msg, 0
+       s.close
+     end
+   end
+
+   # Used in test environments where instrumentation is not required.
+   class Null < Abstract
+     def measure(label, &block)
+       block.call
+     end
+   end
+
+   # Logging is one of the few outputs of the system, this class is provided
+   # as a cheap way to allow tests to hook into events. It should not be used
+   # in production.
+   class NullWithCallbacks < Abstract
+     attr_accessor :callbacks
+
+     def initialize(callbacks = nil)
+       @callbacks = callbacks
+     end
+
+     def measure(label, &block)
+       (callbacks || {}).fetch(label, ->{}).call
+       block.call
+     end
+   end
+ end
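
The Composite logger's inject turns the measured block into nested `measure` calls, so one timed section reaches every backend at once. A minimal sketch of wiring the classes above together — the Graphite host is a placeholder, and a reachable Carbon endpoint is assumed:

    require 'sq/dbsync/loggers'

    logger = Sq::Dbsync::Loggers::Composite.new([
      Sq::Dbsync::Loggers::Stream.new($stdout),
      Sq::Dbsync::Loggers::Graphite.new(host: 'graphite.example.com')
    ])

    # Both loggers time the same block; Stream prints tab-separated
    # `starting` and `finished` lines, Graphite records the duration.
    logger.measure(:batch_total) { sleep 0.1 }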
data/lib/sq/dbsync/manager.rb
@@ -0,0 +1,241 @@
+ require 'sq/dbsync/config'
+ require 'sq/dbsync/batch_load_action'
+ require 'sq/dbsync/incremental_load_action'
+ require 'sq/dbsync/refresh_recent_load_action'
+ require 'sq/dbsync/pipeline'
+ require 'sq/dbsync/table_registry'
+ require 'sq/dbsync/consistency_verifier'
+ require 'sq/dbsync/database/connection'
+ require 'sq/dbsync/error_handler'
+
+ # The manager orchestrates the high level functions of the sync, such as
+ # keeping the database up-to-date and batch loading.
+ #
+ # This is the main entry point for the application.
+ class Sq::Dbsync::Manager
+   include Sq::Dbsync
+
+   EPOCH = Date.new(2000, 1, 1).to_time
+   MAX_RETRIES = 10
+
+   def initialize(config, plans)
+     @config        = Sq::Dbsync::Config.make(config)
+     @plans         = plans
+     @error_handler = ErrorHandler.new(config)
+   end
+
+   def batch(tables = :all)
+     error_handler.wrap do
+       batch_nonactive(tables)
+       refresh_recent(tables)
+     end
+   end
+
+   def increment
+     error_handler.wrap do
+       incremental
+     end
+   end
+
+   def batch_nonactive(tables = :all)
+     registry.ensure_storage_exists
+
+     measure(:batch_total) do
+       raise_all_if_pipeline_failure(
+         run_load(BatchLoadAction, Pipeline::ThreadedContext, tables)
+       )
+     end
+   end
+
+   def refresh_recent(tables = :all)
+     registry.ensure_storage_exists
+
+     measure(:refresh_recent_total) do
+       raise_all_if_pipeline_failure(
+         run_load(RefreshRecentLoadAction, Pipeline::ThreadedContext, tables)
+       )
+     end
+   end
+
+   def incremental
+     @running = true
+     counter = 0
+
+     loop_with_retry_on(->{ @running }, transient_exceptions) do
+       incremental_once
+
+       counter = (counter + 1) % 100
+       if counter == 1
+         # No need to do this every cycle, 100 is chosen to be as good as any
+         # other number. It should run on the very first cycle however so that
+         # the specs will cover it.
+         increment_checkpoint
+       end
+     end
+   end
+
+   def incremental_once
+     # In theory, this ensures that any changes to the source IP (such as from
+     # a virtual IP flip) are picked up.
+     sources.each do |_, db|
+       db.disconnect
+     end
+
+     raise_if_pipeline_failure(
+       # ThreadedContext would be ideal here, but it leaks memory in JRuby.
+       # Not sure why yet, but mass creation of threads seems like an obvious
+       # candidate for brokenness.
+       #
+       # TODO: Above comment probably isn't true with 1.7 and ThreadedContext
+       # fixes.
+       run_load(incremental_action, Pipeline::SimpleContext)
+     )
+   end
+
+   # Actions that need to be performed regularly, but not every cycle. Please
+   # do suggest a better name for this method.
+   def increment_checkpoint
+     verifier.check_consistency!(tables_to_load)
+
+     purge_registry
+   end
+
+   def stop!
+     @running = false
+   end
+
+   def target
+     @target ||= Sq::Dbsync::Database::Connection.create(config[:target])
+   end
+
+   def tables_to_load
+     plans_with_sources.map do |plan, source|
+       plan.tables(source).map do |x|
+         x.update(source_db: source)
+       end
+     end.reduce([], :+).uniq { |x| x[:table_name] }
+   end
+
+   def plans_with_sources
+     @plans_with_sources ||= plans.map do |plan, source_name|
+       [plan, sources.fetch(source_name)]
+     end
+   end
+
+   def sources
+     @sources ||= Hash[config[:sources].map do |name, opts|
+       [name, Sq::Dbsync::Database::Connection.create(opts)]
+     end]
+   end
+
+   attr_accessor :config, :plans, :error_handler
+
+   private
+
+   def run_load(action, context, tables = :all)
+     items = tables_to_load.map do |tplan|
+       if tables != :all
+         next unless tables.include?(tplan[:table_name])
+
+         # Force loading of specified tables, otherwise it would be impossible
+         # to batch load tables that were not regularly loaded.
+         tplan[:batch_load] = true
+
+         # Force refresh of tables, this is expected behaviour if you are
+         # calling the refresh-recent script with an explicit table list.
+         tplan[:refresh_recent] = true
+       end
+
+       if tplan[:refresh_recent].is_a?(Symbol)
+         tplan[:aux_timestamp_column] = tplan[:refresh_recent]
+       end
+
+       action.new(db, tplan, registry, logger, config[:clock])
+     end.compact
+     Pipeline.new(items, *LoadAction.stages).run(context)
+   end
+
+   # This is necessary so that old tables that are no longer being synced do
+   # not break our lag calculations.
+   def purge_registry
+     registry.purge_except(expected_table_names)
+   end
+
+   def expected_table_names
+     tables_to_load.map {|x| x[:table_name] } + config.fetch(:extra_tables, [])
+   end
+
+   def loop_with_retry_on(guard, transient_exceptions, &block)
+     consecutive_fails = 0
+
+     while guard.call
+       begin
+         block.call
+         consecutive_fails = 0
+       rescue *transient_exceptions
+         consecutive_fails += 1
+         raise if consecutive_fails >= MAX_RETRIES
+       end
+     end
+   end
+
+   def raise_if_pipeline_failure(results)
+     results.each do |result|
+       if result.is_a?(Pipeline::Failure)
+         raise result.wrapped_exception
+       end
+     end
+   end
+
+   def raise_all_if_pipeline_failure(results)
+     failed = false
+     results.each do |result|
+       if result.is_a?(Pipeline::Failure)
+         error_handler.notify_error(result.task.tag, result.wrapped_exception)
+         failed = true
+       end
+     end
+
+     if failed
+       raise Database::ExtractError,
+         "One or more loads failed, see other exceptions for details."
+     end
+   end
+
+   def measure(label, &block)
+     logger.measure(label) do
+       block.call
+     end
+   end
+
+   def registry
+     TableRegistry.new(target)
+   end
+
+   def verifier
+     @verifier ||= ConsistencyVerifier.new(target, registry)
+   end
+
+   def logger
+     config[:logger]
+   end
+
+   def db
+     @db ||= Database::Connection.create(config[:target])
+   end
+
+   def transient_exceptions
+     [
+       Database::ExtractError,
+       Database::TransientError
+     ]
+   end
+
+   def incremental_action
+     config.fetch(:incremental_action, IncrementalLoadAction)
+   end
+
+ end
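
A hedged sketch of driving the Manager from the classes above. The connection option hashes and the StaticTablePlan table spec are illustrative guesses, not the gem's documented API; only the config keys (`:sources`, `:target`, `:logger`, `:clock`) and the [plan, source_name] pair shape are taken from the code:

    require 'sq/dbsync/manager'
    require 'sq/dbsync/static_table_plan'
    require 'sq/dbsync/loggers'

    config = {
      sources: { db_a: { adapter: 'mysql2', host: 'db-a', database: 'app' } },
      target:  { adapter: 'postgres', host: 'warehouse', database: 'replica' },
      logger:  Sq::Dbsync::Loggers::Stream.new,
      clock:   ->{ Time.now.utc } # assumed shape; passed through to load actions
    }

    # Plans are [plan, source_name] pairs, matching plans_with_sources above.
    plans = [
      [Sq::Dbsync::StaticTablePlan.new([{ table_name: :users }]), :db_a]
    ]

    manager = Sq::Dbsync::Manager.new(config, plans)
    manager.batch       # batch_nonactive then refresh_recent, via ThreadedContext
    # manager.increment # long-running incremental loop; stop with manager.stop!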
data/lib/sq/dbsync/pipeline/simple_context.rb
@@ -0,0 +1,15 @@
+ # See lib/sq/dbsync/pipeline.rb
+ class Sq::Dbsync::Pipeline
+
+   # A computational context that passes a number of tasks through a set of
+   # stages in sequence.
+   class SimpleContext
+     def self.call(tasks, stages, process)
+       tasks.map do |task|
+         stages.inject(task) do |result, stage|
+           process.call(stage, result)
+         end
+       end
+     end
+   end
+ end
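
Assuming the gem is loaded, SimpleContext is directly exercisable — each task is folded through every stage in order, one task at a time:

    stages  = [->(x) { x * x }, ->(x) { x + 1 }]
    process = ->(stage, task) { stage.call(task) }

    Sq::Dbsync::Pipeline::SimpleContext.call([1, 2, 3], stages, process)
    # => [2, 5, 10]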
data/lib/sq/dbsync/pipeline/threaded_context.rb
@@ -0,0 +1,95 @@
+ require 'thread'
+
+ # See lib/sq/dbsync/pipeline.rb
+ class Sq::Dbsync::Pipeline
+
+   # A computational context for passing a number of tasks through a set of
+   # stages, where each stage uses resources independent of the other stages.
+   # For example, stage one may be able to execute a maximum of two tasks at
+   # once, and stage two may also have a maximum of two, but it is optimal
+   # for a total of four tasks to be processing at any one time.
+   class ThreadedContext
+
+     # Tracer object to mark the end of a stream of tasks.
+     FINISH = Object.new
+
+     def self.call(*args, &block)
+       new(*args, &block).run
+     end
+
+     def initialize(tasks, stages, process)
+       self.tasks   = tasks
+       self.stages  = stages
+       self.process = process
+       self.threads = []
+     end
+
+     def run
+       initial_queue, final_queue = build_pipeline(stages, tasks.length)
+
+       tasks.each.with_index do |task, i|
+         initial_queue << [i, task]
+       end
+
+       result = ordered((0...tasks.length).map { final_queue.pop })
+       flush_threads(initial_queue)
+       result
+     end
+
+     protected
+
+     attr_accessor :tasks, :stages, :process
+
+     # Floods the queue with enough FINISH markers to guarantee that each
+     # thread will see one and shut itself down.
+     def flush_threads(initial_queue)
+       threads.size.times { initial_queue << FINISH }
+       threads.each(&:join)
+     end
+
+     def ordered(tasks)
+       tasks.
+         sort_by(&:first).
+         map(&:last)
+     end
+
+     def concurrency(stage)
+       2
+     end
+
+     def build_pipeline(stages, number_of_tasks)
+       initial_queue = Queue.new
+       final_queue = stages.inject(initial_queue) do |task_queue, stage|
+         spawn_workers(stage, task_queue, number_of_tasks)
+       end
+
+       [initial_queue, final_queue]
+     end
+
+     def spawn_workers(stage, task_queue, number_of_tasks)
+       next_queue = Queue.new
+
+       self.threads += in_threads(concurrency(stage)) do
+         while true
+           index, task = task_queue.pop
+           if index == FINISH
+             next_queue << FINISH
+             break
+           else
+             next_queue << [index, process.call(stage, task)]
+           end
+         end
+       end
+
+       next_queue
+     end
+
+     def in_threads(n, &block)
+       n.times.map do
+         Thread.new(&block)
+       end
+     end
+
+     attr_accessor :threads
+   end
+ end
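
ThreadedContext answers to the same interface, so the SimpleContext example runs unchanged: with two worker threads per stage, up to four tasks are in flight at once, and `ordered` restores the original task order from the (index, result) pairs drained off the final queue:

    stages  = [->(x) { x * x }, ->(x) { x + 1 }]
    process = ->(stage, task) { stage.call(task) }

    Sq::Dbsync::Pipeline::ThreadedContext.call([1, 2, 3], stages, process)
    # => [2, 5, 10], same result as SimpleContext, computed concurrently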
data/lib/sq/dbsync/pipeline.rb
@@ -0,0 +1,80 @@
+ require 'sq/dbsync/pipeline/threaded_context'
+ require 'sq/dbsync/pipeline/simple_context'
+
+ module Sq::Dbsync
+
+   # An inject/reduce/fold-like abstraction to pass an array through a set of
+   # operations, where the result of the first operation is passed to the
+   # second operation, second to the third, and so on through the set until
+   # the final result is returned. It gracefully handles any individual
+   # failure and still allows other results to be computed.
+   #
+   # Any unhandled exception will place an instance of `Pipeline::Failure`
+   # into the returned results.
+   #
+   # The order and timing of when stages are run is undefined (for example,
+   # they may be parallelized), so they should be well isolated from each
+   # other.
+   #
+   # Examples
+   #
+   #     Pipeline.new([1, 2, 3],
+   #       ->(x) { x * x },
+   #       ->(x) { x + x }
+   #     ).run
+   #     # => [2, 8, 18]
+   #
+   #     Pipeline.new([1, 2],
+   #       ->(x) { x == 1 ? raise : x },
+   #       ->(x) { x * 10 }
+   #     ).run
+   #     # => [Pipeline::Failure, 20]
+   class Pipeline
+
+     def initialize(tasks, *stages)
+       self.tasks  = tasks
+       self.stages = stages
+     end
+
+     # Run the pipeline and return the computed results.
+     #
+     # context - The computational context in which to run the pipeline. Must
+     #           respond to `#call` and take tasks, stages, and a processing
+     #           lambda as arguments. By default runs the pipeline in
+     #           parallel, but an alternative `SimpleContext` is provided to
+     #           run in a single thread to aid debugging and testing.
+     def run(context = ThreadedContext)
+       context.call(tasks, stages, ->(stage, result) {
+         process(stage, result)
+       })
+     end
+
+     # Used to signal failed operations in a pipeline.
+     class Failure < StandardError
+       # The original exception that caused this failure.
+       attr_reader :wrapped_exception
+
+       # The task that was being processed when this failure occurred.
+       attr_reader :task
+
+       def initialize(wrapped, task)
+         @wrapped_exception = wrapped
+         @task = task
+       end
+     end
+
+     protected
+
+     def process(stage, task)
+       if task.is_a?(Failure)
+         task
+       else
+         begin
+           stage.call(task)
+         rescue => e
+           Failure.new(e, task)
+         end
+       end
+     end
+
+     attr_accessor :tasks, :stages
+   end
+ end
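
Callers are expected to scan the results for failures, as the Manager's raise_if_pipeline_failure and raise_all_if_pipeline_failure do above; a small sketch using SimpleContext for determinism:

    results = Sq::Dbsync::Pipeline.new([1, 2],
      ->(x) { x == 1 ? raise('boom') : x },
      ->(x) { x * 10 }
    ).run(Sq::Dbsync::Pipeline::SimpleContext)

    results.each do |result|
      if result.is_a?(Sq::Dbsync::Pipeline::Failure)
        puts result.wrapped_exception.message # => boom
      end
    end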
data/lib/sq/dbsync/refresh_recent_load_action.rb
@@ -0,0 +1,71 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+
+   # This is a terribly named class that will delete the last X days of data
+   # from a table and reload it. Useful for tables that are nearly append only
+   # but sometimes will update recent data (for instance, a failed import).
+   # The tables are too big to regularly reload in their entirety, but
+   # reloading only recent data fixes the main issues.
+   class RefreshRecentLoadAction < LoadAction
+     WINDOW = 60 * 60 * 24 * 2 # 2 days
+
+     def operation; 'refresh_recent'; end
+
+     def prepare
+       return false unless plan.refresh_recent
+
+       super
+     end
+
+     def post_load
+     end
+
+     def extract_data
+       @metadata   = registry.get(plan.table_name)
+       @start_time = now.call
+       @since = (
+         @metadata[:last_row_at] ||
+         @metadata[:last_synced_at]
+       ) - WINDOW
+       @file, @last_row_at = measure(:extract) { extract_to_file(@since) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         tname   = plan.table_name
+         columns = plan.columns
+         db.transaction do
+           db.delete_recent(plan, @since)
+           db.load_from_file(tname, columns, @file.path)
+         end
+       end
+       @file.close!
+       self
+     end
+
+     private
+
+     def filter_columns
+       source = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+       plan.columns = resolve_columns(plan, source_columns) &
+         (target_columns || source_columns)
+     end
+
+     def target_columns
+       # Because we may create the target table later if necessary, we need
+       # to check if it *really* exists.
+       target_columns = if target.table_exists?(plan.table_name)
+         target.hash_schema(plan.table_name).keys
+       else
+         nil
+       end
+     end
+
+     def prefix
+       ''
+     end
+   end
+ end
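
For reference, a hypothetical table plan entry that would exercise this action. The key names are taken from the code above (the Manager's run_load and this class), but the values are illustrative:

    {
      table_name:        :payments,
      source_table_name: :payments,
      batch_load:        true,
      # true opts the table in; a symbol additionally names the timestamp
      # column the Manager copies into :aux_timestamp_column.
      refresh_recent:    :updated_at
    }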