sq-dbsync 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/HISTORY.md +5 -0
- data/LICENSE +14 -0
- data/README.md +218 -0
- data/lib/sq/dbsync/all_tables_plan.rb +51 -0
- data/lib/sq/dbsync/batch_load_action.rb +95 -0
- data/lib/sq/dbsync/config.rb +12 -0
- data/lib/sq/dbsync/consistency_verifier.rb +70 -0
- data/lib/sq/dbsync/database/common.rb +91 -0
- data/lib/sq/dbsync/database/connection.rb +23 -0
- data/lib/sq/dbsync/database/mysql.rb +163 -0
- data/lib/sq/dbsync/database/postgres.rb +77 -0
- data/lib/sq/dbsync/error_handler.rb +59 -0
- data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
- data/lib/sq/dbsync/incremental_load_action.rb +95 -0
- data/lib/sq/dbsync/load_action.rb +156 -0
- data/lib/sq/dbsync/loggers.rb +135 -0
- data/lib/sq/dbsync/manager.rb +241 -0
- data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
- data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
- data/lib/sq/dbsync/pipeline.rb +80 -0
- data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
- data/lib/sq/dbsync/schema_maker.rb +87 -0
- data/lib/sq/dbsync/static_table_plan.rb +42 -0
- data/lib/sq/dbsync/table_registry.rb +75 -0
- data/lib/sq/dbsync/tempfile_factory.rb +41 -0
- data/lib/sq/dbsync/version.rb +5 -0
- data/lib/sq/dbsync.rb +9 -0
- data/spec/acceptance/loading_spec.rb +237 -0
- data/spec/acceptance_helper.rb +2 -0
- data/spec/database_helper.rb +86 -0
- data/spec/integration/all_tables_plan_spec.rb +36 -0
- data/spec/integration/batch_load_action_spec.rb +229 -0
- data/spec/integration/consistency_verifier_spec.rb +54 -0
- data/spec/integration/database_connection_spec.rb +61 -0
- data/spec/integration/incremental_load_action_spec.rb +196 -0
- data/spec/integration/manager_spec.rb +109 -0
- data/spec/integration/schema_maker_spec.rb +119 -0
- data/spec/integration_helper.rb +43 -0
- data/spec/spec_helper.rb +27 -0
- data/spec/unit/config_spec.rb +18 -0
- data/spec/unit/error_handler_spec.rb +52 -0
- data/spec/unit/pipeline_spec.rb +42 -0
- data/spec/unit/stream_logger_spec.rb +33 -0
- data/spec/unit_helper.rb +1 -0
- data/sq-dbsync.gemspec +32 -0
- metadata +188 -0
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'time'
|
2
|
+
require 'socket'
|
3
|
+
|
4
|
+
# Instrumenting various aspects of the system is critical since it will take
|
5
|
+
# longer and longer as the data sources grow and it is necessary to know when
|
6
|
+
# this import time is taking too long (either replication can't keep up, or
|
7
|
+
# recovery time is too long).
|
8
|
+
module Sq::Dbsync::Loggers
|
9
|
+
|
10
|
+
# Base class that spells out the interface shared by every logger. Composite
# forwards calls to a set of loggers, so it needs an explicit contract to talk
# to; this class documents that contract and provides do-nothing defaults.
class Abstract
  # Hook for timing the given block under +label+.
  #
  # This default implementation is a no-op: the block is not invoked and nil
  # is returned.
  def measure(label, &block)
  end

  # Hook for recording the message +str+. No-op by default; returns nil.
  def log(str)
  end
end
|
18
|
+
|
19
|
+
# Writes timing information to an IO-like object (stdout unless told
# otherwise). Calls to measure from separate threads run in parallel; only the
# actual write is serialised, so that output lines are never interleaved.
class Stream < Abstract
  # out - Object responding to #puts that receives every log line.
  def initialize(out = $stdout)
    @out   = out
    @mutex = Mutex.new
  end

  # Runs the block, logging a :starting line before it and a :finished or
  # :failed line (with the elapsed seconds) after it.
  #
  # Returns whatever the block returns. A StandardError raised by the block is
  # logged as :failed together with its message and then re-raised. Any other
  # kind of exception passes straight through the ensure clause and is
  # therefore still reported as :finished.
  def measure(label, &block)
    started_at = Time.now.utc
    log_measurement(started_at, :starting, 0, label)

    outcome = :finished
    failure = nil

    begin
      block.call
    rescue => error
      outcome = :failed
      failure = error
      raise
    ensure
      finished_at = Time.now.utc
      log_measurement(finished_at, outcome, finished_at - started_at, label)
      log(failure.message) if failure
    end
  end

  # Logs a single tab-separated measurement line made of the event name, the
  # duration formatted to three decimal places, and the measured object.
  def log_measurement(time, event, duration, object)
    formatted_duration = "%.3f" % duration
    fields = [event, formatted_duration, object]

    log(fields.join("\t"), time)
  end

  # Writes +str+ prefixed with +time+ (now, in UTC, by default) and a tab.
  def log(str, time = Time.now.utc)
    line = [time, str].join("\t")

    # Hold the lock while writing so concurrent lines are not interwoven.
    mutex.synchronize { out.puts(line) }
  end

  private

  attr_reader :mutex, :out
end
|
65
|
+
|
66
|
+
# Combines multiple loggers together, forwarding every call to each of them.
class Composite < Abstract
  attr_accessor :loggers

  # loggers - Collection of logger objects to delegate to. May be nil (the
  #           default), which is treated the same as an empty collection.
  def initialize(loggers = nil)
    @loggers = loggers
  end

  # Runs the block once, wrapped in the #measure of every configured logger.
  #
  # Babushka doll! Logger inside a logger inside a logger: the first logger
  # wraps the block directly and each subsequent logger wraps the previous
  # one, so the last logger in the collection is the outermost.
  #
  # With no loggers configured the block is simply called. Returns the value
  # produced by the outermost logger's #measure (or by the block itself when
  # there are no loggers).
  def measure(label, &block)
    # Array() turns the nil default into an empty list instead of raising
    # NoMethodError. The accumulator gets its own name so it no longer
    # shadows the method's block parameter.
    Array(loggers).inject(block) do |inner, logger|
      lambda do
        logger.measure(label) do
          inner.call
        end
      end
    end.call
  end

  # Sends +str+ to every configured logger, in order.
  def log(str)
    Array(loggers).each { |logger| logger.log(str) }
  end
end
|
89
|
+
|
90
|
+
# Logs metric run time to graphite.
class Graphite < Abstract
  # opts - Hash of connection settings read by #record_metric:
  #        :host   - host to connect to.
  #        :port   - port to connect to (default: 2003).
  #        :prefix - string prepended to every metric name
  #                  (default: 'dbsync').
  def initialize(opts)
    @opts = opts
  end

  # Runs the block and reports how long it took, in seconds, under +label+.
  # The metric is recorded from an ensure clause, so it is sent whether the
  # block returns normally or raises.
  #
  # Returns the value of the block.
  def measure(label, &block)
    start_time = Time.now.utc
    block.call
  ensure
    end_time = Time.now.utc
    record_metric(end_time.to_i, label, end_time - start_time)
  end

  # Sends one "<prefix>.<name> <value> <timestamp>\n" line over a fresh TCP
  # connection.
  #
  # timestamp - Integer seconds since the epoch.
  # name      - Metric name, appended to the configured prefix.
  # value     - Metric value.
  #
  # Returns nil. Connection and send errors are not rescued here.
  def record_metric(timestamp, name, value)
    msg = "#{@opts.fetch(:prefix, 'dbsync')}.#{name} #{value} #{timestamp}\n"

    socket = TCPSocket.new(@opts[:host], @opts.fetch(:port, 2003))
    begin
      socket.send msg, 0
    ensure
      # Close even when the send fails so the descriptor is not leaked.
      socket.close
    end

    nil
  end
end
|
112
|
+
|
113
|
+
# Logger for test environments where instrumentation is not required.
class Null < Abstract
  # Calls the block without recording anything and returns its value. The
  # label is ignored.
  def measure(label, &block)
    block.call
  end
end
|
119
|
+
|
120
|
+
# Logging is one of the few outputs of the system, so this class is offered
# as a cheap way for tests to hook into events. It should not be used in
# production.
class NullWithCallbacks < Abstract
  attr_accessor :callbacks

  # callbacks - Optional Hash mapping a measure label to a callable that is
  #             invoked just before the measured block. nil means none.
  def initialize(callbacks = nil)
    @callbacks = callbacks
  end

  # Invokes the callback registered for +label+ (if any), then calls the
  # block and returns its value.
  def measure(label, &block)
    registered = callbacks || {}
    hook = registered.fetch(label, ->{})

    hook.call
    block.call
  end
end
|
135
|
+
end
|
@@ -0,0 +1,241 @@
|
|
1
|
+
require 'sq/dbsync/config'
|
2
|
+
require 'sq/dbsync/batch_load_action'
|
3
|
+
require 'sq/dbsync/incremental_load_action'
|
4
|
+
require 'sq/dbsync/refresh_recent_load_action'
|
5
|
+
require 'sq/dbsync/pipeline'
|
6
|
+
require 'sq/dbsync/table_registry'
|
7
|
+
require 'sq/dbsync/consistency_verifier'
|
8
|
+
require 'sq/dbsync/database/connection'
|
9
|
+
require 'sq/dbsync/error_handler'
|
10
|
+
|
11
|
+
# Orchestrates the high level functions of the sync: batch loading, refreshing
# recent data, and keeping the target database up-to-date incrementally.
#
# This is the main entry point for the application.
class Sq::Dbsync::Manager
  include Sq::Dbsync

  EPOCH = Date.new(2000, 1, 1).to_time

  # Number of consecutive transient failures the incremental loop tolerates
  # before re-raising the exception.
  MAX_RETRIES = 10

  attr_accessor :config, :plans, :error_handler

  # config - Configuration, passed through Config.make before being stored.
  # plans  - Collection of [plan, source_name] pairs; each source_name must
  #          be a key of config[:sources].
  def initialize(config, plans)
    @config = Sq::Dbsync::Config.make(config)
    @plans  = plans

    # NOTE(review): the error handler is given the raw +config+ argument,
    # not the result of Config.make stored above - confirm this is intended.
    @error_handler = ErrorHandler.new(config)
  end

  # Batch loads and then refreshes recent data for +tables+ (:all, or a
  # collection of table names), inside the error handler.
  def batch(tables = :all)
    error_handler.wrap do
      batch_nonactive(tables)
      refresh_recent(tables)
    end
  end

  # Runs the incremental loop inside the error handler.
  def increment
    error_handler.wrap { incremental }
  end

  # Runs BatchLoadAction for +tables+ in a threaded pipeline, measured as
  # :batch_total. Every failed load is reported to the error handler before
  # Database::ExtractError is raised.
  def batch_nonactive(tables = :all)
    registry.ensure_storage_exists

    measure(:batch_total) do
      results = run_load(BatchLoadAction, Pipeline::ThreadedContext, tables)
      raise_all_if_pipeline_failure(results)
    end
  end

  # Runs RefreshRecentLoadAction for +tables+ in a threaded pipeline,
  # measured as :refresh_recent_total. Failures are handled as in
  # #batch_nonactive.
  def refresh_recent(tables = :all)
    registry.ensure_storage_exists

    measure(:refresh_recent_total) do
      results = run_load(
        RefreshRecentLoadAction, Pipeline::ThreadedContext, tables
      )
      raise_all_if_pipeline_failure(results)
    end
  end

  # Repeats incremental loads until #stop! is called, retrying on transient
  # exceptions (see #loop_with_retry_on).
  def incremental
    @running = true
    cycle    = 0

    loop_with_retry_on(->{ @running }, transient_exceptions) do
      incremental_once

      # The checkpoint does not need to run every cycle; 100 is chosen to be
      # as good as any other number. It should run on the very first cycle
      # however so that the specs will cover it.
      cycle = (cycle + 1) % 100
      increment_checkpoint if cycle == 1
    end
  end

  # Performs one incremental load of every table, raising the wrapped
  # exception of the first failed load.
  def incremental_once
    # In theory, this ensures that any changes to the source IP (such as from
    # a virtual IP flip) are picked up.
    sources.each_value(&:disconnect)

    # ThreadedContext would be ideal here, but it leaks memory in JRuby. Not
    # sure why yet, but mass creation of threads seems like an obvious
    # candidate for brokenness.
    #
    # TODO: Above comment probably isn't true with 1.7 and ThreadedContext
    # fixes.
    results = run_load(incremental_action, Pipeline::SimpleContext)
    raise_if_pipeline_failure(results)
  end

  # Actions that need to be performed regularly, but not every cycle: checks
  # consistency of the loaded tables and purges stale registry entries.
  # Please do suggest a better name for this method.
  def increment_checkpoint
    verifier.check_consistency!(tables_to_load)

    purge_registry
  end

  # Asks the incremental loop to exit; the flag is read before each cycle.
  def stop!
    @running = false
  end

  # Memoized connection built from config[:target].
  def target
    @target ||= Sq::Dbsync::Database::Connection.create(config[:target])
  end

  # Table plans from every plan, each tagged with its source connection under
  # :source_db. When several plans yield the same :table_name only the first
  # occurrence is kept.
  def tables_to_load
    table_plans = plans_with_sources.flat_map do |plan, source|
      plan.tables(source).map { |table| table.update(source_db: source) }
    end

    table_plans.uniq { |table| table[:table_name] }
  end

  # Memoized list of [plan, source connection] pairs. Raises KeyError (via
  # fetch) when a plan names a source that is not configured.
  def plans_with_sources
    @plans_with_sources ||= plans.map do |plan, source_name|
      [plan, sources.fetch(source_name)]
    end
  end

  # Memoized Hash of source name => connection, one per config[:sources]
  # entry.
  def sources
    @sources ||= config[:sources].each_with_object({}) do |(name, opts), map|
      map[name] = Sq::Dbsync::Database::Connection.create(opts)
    end
  end

  private

  # Builds one +action+ per table plan and runs them through the LoadAction
  # stages in +context+.
  #
  # tables - :all, or a collection of table names; plans for tables outside
  #          the collection are skipped.
  #
  # Returns the pipeline results.
  def run_load(action, context, tables = :all)
    explicit_selection = tables != :all

    actions = tables_to_load.map do |table_plan|
      if explicit_selection
        next unless tables.include?(table_plan[:table_name])

        # Force loading of specified tables, otherwise it would be impossible
        # to batch load tables that were not regularly loaded.
        table_plan[:batch_load] = true

        # Force refresh of tables, this is expected behaviour if you are
        # calling the refresh-recent script with an explicit table list.
        #
        # NOTE(review): this overwrites a Symbol :refresh_recent value, so
        # the Symbol branch below never fires for explicitly listed tables -
        # confirm that is intended.
        table_plan[:refresh_recent] = true
      end

      refresh_setting = table_plan[:refresh_recent]
      if refresh_setting.is_a?(Symbol)
        table_plan[:aux_timestamp_column] = refresh_setting
      end

      action.new(db, table_plan, registry, logger, config[:clock])
    end

    Pipeline.new(actions.compact, *LoadAction.stages).run(context)
  end

  # This is necessary so that old tables that are no longer being synced do
  # not break our lag calculations.
  def purge_registry
    registry.purge_except(expected_table_names)
  end

  # Names of all tables being loaded plus config[:extra_tables], if any.
  def expected_table_names
    loaded_names = tables_to_load.map { |table| table[:table_name] }

    loaded_names + config.fetch(:extra_tables, [])
  end

  # Calls the block repeatedly for as long as +guard+ returns a truthy value.
  # Exceptions matching +transient_exceptions+ are swallowed until
  # MAX_RETRIES of them occur in a row, at which point the latest one is
  # re-raised. A successful cycle resets the count.
  def loop_with_retry_on(guard, transient_exceptions, &block)
    failures_in_a_row = 0

    while guard.call
      begin
        block.call
        failures_in_a_row = 0
      rescue *transient_exceptions
        failures_in_a_row += 1
        raise if failures_in_a_row >= MAX_RETRIES
      end
    end
  end

  # Raises the wrapped exception of the first Pipeline::Failure in +results+.
  # Returns +results+ when there is none.
  def raise_if_pipeline_failure(results)
    first_failure = results.find { |result| result.is_a?(Pipeline::Failure) }
    raise first_failure.wrapped_exception if first_failure

    results
  end

  # Reports every Pipeline::Failure in +results+ to the error handler, then
  # raises Database::ExtractError if there was at least one.
  def raise_all_if_pipeline_failure(results)
    failures = results.select { |result| result.is_a?(Pipeline::Failure) }

    failures.each do |failure|
      error_handler.notify_error(failure.task.tag, failure.wrapped_exception)
    end

    return if failures.empty?

    raise Database::ExtractError,
      "One or more loads failed, see other exceptions for details."
  end

  # Times the block under +label+ using the configured logger.
  def measure(label, &block)
    logger.measure(label) { block.call }
  end

  # A new TableRegistry over the target connection (not memoized).
  def registry
    TableRegistry.new(target)
  end

  def verifier
    @verifier ||= ConsistencyVerifier.new(target, registry)
  end

  def logger
    config[:logger]
  end

  # Memoized connection built from config[:target], handed to load actions.
  # It is memoized separately from #target.
  def db
    @db ||= Database::Connection.create(config[:target])
  end

  # Exception classes the incremental loop retries on.
  def transient_exceptions
    [
      Database::ExtractError,
      Database::TransientError
    ]
  end

  # Action class for incremental loads; overridable through
  # config[:incremental_action].
  def incremental_action
    config.fetch(:incremental_action, IncrementalLoadAction)
  end

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# See lib/sq/dbsync/pipeline.rb
|
2
|
+
class Sq::Dbsync::Pipeline
|
3
|
+
|
4
|
+
# A computational context that runs each task through every stage in
# sequence, one task at a time, on the calling thread.
class SimpleContext
  # tasks   - Collection of initial values.
  # stages  - Ordered collection of stages.
  # process - Callable taking (stage, value) and returning the next value.
  #
  # Returns an Array holding the final value of each task, in task order.
  def self.call(tasks, stages, process)
    tasks.map do |task|
      value = task
      stages.each { |stage| value = process.call(stage, value) }
      value
    end
  end
end
|
15
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
# See lib/sq/dbsync/pipeline.rb
|
4
|
+
class Sq::Dbsync::Pipeline
|
5
|
+
|
6
|
+
# A computational context for passing a number of tasks through a set of
# stages, where each stage uses resources independent of the other stages.
# For example, stage one may be able to execute a maximum of two tasks at
# once, and stage two may also have a maximum of two, but it is optimal for a
# total of four tasks to be processing at any one time.
#
# Every stage gets its own worker threads. Workers read from the queue fed by
# the previous stage and write to the queue read by the next one.
class ThreadedContext

  # Tracer object to mark the end of a stream of tasks.
  FINISH = Object.new

  # Builds a context from the given arguments and runs it.
  def self.call(*args, &block)
    new(*args, &block).run
  end

  # tasks   - Collection of initial values.
  # stages  - Ordered collection of stages.
  # process - Callable taking (stage, value) and returning the next value.
  def initialize(tasks, stages, process)
    @tasks   = tasks
    @stages  = stages
    @process = process
    @threads = []
  end

  # Pushes every task through the pipeline and waits for all of them.
  #
  # Returns an Array of results in the same order as the tasks.
  def run
    first_queue, last_queue = build_pipeline(stages, tasks.length)

    # Tag each task with its position so results can be re-ordered later.
    tasks.each_with_index do |task, position|
      first_queue << [position, task]
    end

    collected = Array.new(tasks.length) { last_queue.pop }
    results   = ordered(collected)

    flush_threads(first_queue)
    results
  end

  protected

  attr_accessor :tasks, :stages, :process, :threads

  # Floods the queue with enough FINISH markers to guarantee that each thread
  # will see one and shut itself down, then waits for every thread to exit.
  def flush_threads(initial_queue)
    threads.size.times { initial_queue << FINISH }
    threads.each(&:join)
  end

  # Sorts [position, value] pairs by position and returns just the values.
  def ordered(tasks)
    by_position = tasks.sort_by { |position, _| position }
    by_position.map { |_, value| value }
  end

  # Number of worker threads started per stage.
  def concurrency(stage)
    2
  end

  # Chains one queue and one set of workers per stage.
  #
  # Returns the pair [queue feeding the first stage, queue filled by the last
  # stage]. With no stages both are the same queue.
  def build_pipeline(stages, number_of_tasks)
    first_queue   = Queue.new
    current_queue = first_queue

    stages.each do |stage|
      current_queue = spawn_workers(stage, current_queue, number_of_tasks)
    end

    [first_queue, current_queue]
  end

  # Starts the workers for +stage+. Each worker moves items from +task_queue+
  # through the stage into a new queue until it sees FINISH, which it passes
  # on before exiting.
  #
  # Returns the new output queue.
  def spawn_workers(stage, task_queue, number_of_tasks)
    next_queue = Queue.new

    workers = in_threads(concurrency(stage)) do
      until FINISH.equal?(item = task_queue.pop)
        index, task = item
        next_queue << [index, process.call(stage, task)]
      end

      # Pass the marker along so a worker of the following stage stops too.
      next_queue << FINISH
    end

    self.threads = threads + workers
    next_queue
  end

  # Starts +n+ threads that each run the block; returns them as an Array.
  def in_threads(n, &block)
    Array.new(n) { Thread.new(&block) }
  end
end
|
95
|
+
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'sq/dbsync/pipeline/threaded_context'
|
2
|
+
require 'sq/dbsync/pipeline/simple_context'
|
3
|
+
|
4
|
+
module Sq::Dbsync
|
5
|
+
|
6
|
+
# An inject/reduce/fold-like abstraction to pass an array through a set of
# operations, where the result of the first operation is passed to the second
# operation, second to the third, and so on through the set until the final
# result is returned. It gracefully handles any individual failure and still
# allows other results to be computed.
#
# Any unhandled exception will place an instance of `Pipeline::Failure` into
# the returned results.
#
# The order and timing of when stages are run is undefined (for example, they
# may be parallelized), so they should be well isolated from each other.
#
# Examples
#
#   Pipeline.new([1, 2, 3],
#     ->(x) { x * x },
#     ->(x) { x + x }
#   ).run
#   # => [2, 8, 18]
#
#   Pipeline.new([1, 2],
#     ->(x) { x == 1 ? raise : x },
#     ->(x) { x * 10 }
#   ).run
#   # => [Pipeline::Failure, 20]
class Pipeline

  # tasks  - Collection of initial values.
  # stages - Callables applied to each value in the order given.
  def initialize(tasks, *stages)
    @tasks  = tasks
    @stages = stages
  end

  # Run the pipeline and return the computed results.
  #
  # context - The computational context in which to run the pipeline. Must
  #           respond to `#call` and take tasks, stages, and a processing
  #           lambda as arguments. By default runs the pipeline in parallel,
  #           but an alternative `SimpleContext` is provided to run in a
  #           single thread to aid debugging and testing.
  def run(context = ThreadedContext)
    step = lambda { |stage, result| process(stage, result) }

    context.call(tasks, stages, step)
  end

  # Used to signal failed operations in a pipeline.
  class Failure < StandardError
    # The original exception that caused this failure.
    attr_reader :wrapped_exception

    # The task that was being processed when this failure occurred.
    attr_reader :task

    def initialize(wrapped, task)
      @wrapped_exception = wrapped
      @task              = task
    end
  end

  protected

  # Applies +stage+ to +task+. A task that is already a Failure is passed
  # through untouched, so later stages never run for it. A StandardError
  # raised by the stage is captured in a new Failure instead of propagating.
  def process(stage, task)
    return task if task.is_a?(Failure)

    begin
      stage.call(task)
    rescue => error
      Failure.new(error, task)
    end
  end

  attr_accessor :tasks, :stages
end
|
80
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'sq/dbsync/load_action'
|
2
|
+
|
3
|
+
module Sq::Dbsync
|
4
|
+
|
5
|
+
# This is a terribly named class that will delete the last X days of data
# from a table and reload it. Useful for tables that are nearly append only
# but sometimes will update recent data (for instance, a failed import). The
# tables are too big to regularly reload in their entirety, but reloading
# only recent data fixes the main issues.
class RefreshRecentLoadAction < LoadAction
  # Size of the refresh window, in seconds.
  WINDOW = 60 * 60 * 24 * 2 # 2 days

  def operation
    'refresh_recent'
  end

  # Returns false without doing anything unless the plan asks for a refresh;
  # otherwise defers to the parent implementation.
  def prepare
    return false unless plan.refresh_recent

    super
  end

  # Intentionally empty.
  def post_load
  end

  # Extracts every row newer than WINDOW before the table's last known row
  # (falling back to its last sync time) into a file.
  #
  # Returns self.
  def extract_data
    @metadata   = registry.get(plan.table_name)
    @start_time = now.call
    @since      = (
      @metadata[:last_row_at] ||
      @metadata[:last_synced_at]
    ) - WINDOW
    @file, @last_row_at = measure(:extract) { extract_to_file(@since) }
    self
  end

  # Replaces the recent rows in the target table with the extracted file, in
  # a single transaction. The delete and the load share the same @since
  # cut-off.
  #
  # Returns self.
  def load_data
    begin
      measure(:load) do
        tname   = plan.table_name
        columns = plan.columns
        db.transaction do
          db.delete_recent(plan, @since)
          db.load_from_file(tname, columns, @file.path)
        end
      end
    ensure
      # Release the extract file even when the load fails, so it is not left
      # open. (Presumably a Tempfile, given #close! - confirm against
      # extract_to_file.)
      @file.close! if @file
    end
    self
  end

  private

  # Restricts plan.columns to the resolved columns that also exist in the
  # target table. When the target table does not exist yet, the source
  # columns are used for the intersection instead.
  def filter_columns
    source         = plan.source_db
    source_columns = source.hash_schema(plan.source_table_name).keys
    plan.columns   = resolve_columns(plan, source_columns) &
      (target_columns || source_columns)
  end

  # Column names of the target table, or nil when it does not exist.
  def target_columns
    # Because we may create the target table later if necessary,
    # we need to check if it *really* exists
    return nil unless target.table_exists?(plan.table_name)

    target.hash_schema(plan.table_name).keys
  end

  def prefix
    ''
  end
end
|
71
|
+
end
|