sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
@@ -0,0 +1,87 @@
1
+ module Sq::Dbsync
2
+ # Service class for mapping table plans to DDL.
3
+ class SchemaMaker
4
+
5
+ # Creates a table in the database based on the given plan. If the table
6
+ # already exists, it will be recreated and any data will be lost.
7
+ def self.create_table(db, table_plan)
8
+ new(db, table_plan).run
9
+ end
10
+
11
+ def run
12
+ table_plan = @table_plan
13
+
14
+ db.create_table!(table_name,
15
+ engine: 'InnoDB',
16
+ charset: table_plan.charset || 'utf8'
17
+ ) do
18
+ extend Helpers
19
+
20
+ add_columns!(table_plan)
21
+ add_indexes!(table_plan)
22
+ add_primary_key!(table_plan)
23
+ end
24
+ end
25
+
26
+ protected
27
+
28
+ def initialize(db, table_plan)
29
+ @db = db
30
+ @table_plan = table_plan
31
+ end
32
+
33
+ attr_reader :db, :table_plan
34
+
35
+ def table_name
36
+ table_plan.prefixed_table_name
37
+ end
38
+
39
+ module Helpers
40
+ def add_columns!(table_plan)
41
+ columns = table_plan.columns
42
+ db_types = table_plan.db_types || {}
43
+ schema = table_plan.schema
44
+ columns.each do |column_name|
45
+ add_column(column_name, db_types, schema)
46
+ end
47
+ end
48
+
49
+ def add_indexes!(table_plan)
50
+ indexes = table_plan.indexes || []
51
+
52
+ indexes.each do |index_name, index_metadata|
53
+ index_columns = index_metadata[:columns]
54
+ unique = index_metadata[:unique] || false
55
+ send(:index, index_columns, name: index_name, unique: unique)
56
+ end
57
+ end
58
+
59
+ def add_primary_key!(table_plan)
60
+ columns = if table_plan.primary_key
61
+ table_plan.primary_key
62
+ else
63
+ table_plan.schema.select {|col, schema|
64
+ schema[:primary_key]
65
+ }.map(&:first)
66
+ end
67
+
68
+ columns = [:id] if columns.empty?
69
+
70
+ primary_key(columns)
71
+ end
72
+
73
+
74
+ def add_column(column_name, db_types, schema)
75
+ db_type = db_types[column_name] || [schema[column_name][:db_type]]
76
+
77
+ extra = if db_type[0] == :enum
78
+ { elements: db_type[1] }
79
+ else
80
+ {}
81
+ end
82
+
83
+ send db_type[0], column_name, extra
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,42 @@
1
+ # Generates a plan given a static "spec" of tables, columns, optional data
2
+ # types, and indexes. This is useful for partial replication of tables, for
3
+ # instance when some columns contain sensitive or large information that is not
4
+ # to be replicated.
5
+ #
6
+ # Simple Example:
7
+ #
8
+ # spec = [{
9
+ # table_name: :users,
10
+ # columns: [:id, :updated_at],
11
+ # indexes: {
12
+ # index_users_on_updated_at: {columns: [:updated_at]}
13
+ # }
14
+ # }]
15
+ class Sq::Dbsync::StaticTablePlan
16
+ def initialize(spec)
17
+ @spec = format_spec(spec)
18
+ end
19
+
20
+ def tables(source)
21
+ deep_clone(@spec).map do |tplan|
22
+ tplan.update(source_db: source)
23
+ end
24
+ end
25
+
26
+ def deep_clone(object)
27
+ Marshal.load(Marshal.dump(object))
28
+ end
29
+
30
+ def format_spec(spec)
31
+ # Support copying from different relations of a Postgres DB, but to the
32
+ # same target database in MySQL.
33
+ spec.map do |table_def|
34
+ unless table_def[:source_table_name]
35
+ table_def[:source_table_name] = table_def[:table_name]
36
+ table_def[:table_name] = table_def[:source_table_name].
37
+ to_s.gsub('__', '_').to_sym
38
+ end
39
+ table_def
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,75 @@
1
+ module Sq::Dbsync
2
+
3
+ # A key-value abstraction that is used to store metadata about loads on a
4
+ # per-table basis.
5
+ class TableRegistry
6
+ def initialize(db)
7
+ @db = db
8
+ @table = db[table_name]
9
+ end
10
+
11
+ def delete(key)
12
+ table.filter(table_name: key.to_s).delete
13
+ end
14
+
15
+ # Set a value if an existing value does not already exist.
16
+ def set(key, values)
17
+ unless exists?(key)
18
+ table.insert(values.merge(table_name: key.to_s))
19
+ end
20
+ end
21
+
22
+ # Set a value, overriding any existing.
23
+ def set!(key, values)
24
+ db.transaction do
25
+ delete(key)
26
+ set(key, values)
27
+ end
28
+ end
29
+
30
+ def update(key, lock, values)
31
+ table.
32
+ filter(
33
+ table_name: key.to_s,
34
+ last_batch_synced_at: lock
35
+ ).
36
+ update(values)
37
+ end
38
+
39
+ def get(key)
40
+ table.
41
+ select(:last_synced_at, :last_row_at, :last_batch_synced_at).
42
+ filter(table_name: key.to_s).
43
+ first
44
+ end
45
+
46
+ def purge_except(keys)
47
+ query = table
48
+ if keys.any?
49
+ query = query.where('table_name NOT IN ?', keys.map(&:to_s))
50
+ end
51
+ query.delete
52
+ end
53
+
54
+ def ensure_storage_exists
55
+ db.create_table?(table_name, charset: 'utf8') do
56
+ String :table_name, primary_key: true
57
+ DateTime :last_synced_at
58
+ DateTime :last_batch_synced_at
59
+ DateTime :last_row_at
60
+ end
61
+ end
62
+
63
+ private
64
+
65
+ attr_reader :table, :db
66
+
67
+ def table_name
68
+ :meta_last_sync_times
69
+ end
70
+
71
+ def exists?(key)
72
+ table.filter(table_name: key.to_s).count > 0
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,41 @@
1
+ require 'tempfile'
2
+
3
+ module Sq::Dbsync
4
+ # Provide extra functionality on top of the standard tempfile API.
5
+ class TempfileFactory
6
+
7
+ # ENV['TMPDIR'] is explicitly referenced here, since a change to JRuby in
8
+ # 1.7.0 makes `Dir.tmpdir` preference non-world writable directories first,
9
+ # of which `.` is a member. This makes it impossible to configure a world
10
+ # writable directory solely via the environment.
11
+ def self.make(name)
12
+ Tempfile.new(name, ENV['TMPDIR'] || Dir.tmpdir)
13
+ end
14
+
15
+ def self.make_with_content(name, content)
16
+ file = make(name)
17
+ file.write(content)
18
+ file.flush
19
+ file
20
+ end
21
+
22
+ # A world writable file is necessary if it is being used as a communication
23
+ # mechanism with other processes (such as MySQL `LOAD DATA INFILE`).
24
+ def self.make_world_writable(name)
25
+ file = make(name)
26
+ file.chmod(0666)
27
+ file
28
+ end
29
+
30
+ def self.split(file, n, logger, &block)
31
+ `split -l #{n} #{file.path} #{file.path}.`
32
+ files = Dir[file.path + '.*']
33
+ files.each_with_index do |tempfile, i|
34
+ logger.log("Loading chunk #{i+1}/#{files.length}")
35
+ block.call(tempfile)
36
+ FileUtils.rm(tempfile)
37
+ end
38
+ end
39
+
40
+ end
41
+ end
@@ -0,0 +1,5 @@
1
+ module Sq
2
+ module Dbsync
3
+ VERSION = '1.0.0'
4
+ end
5
+ end
data/lib/sq/dbsync.rb ADDED
@@ -0,0 +1,9 @@
1
+ module Sq
2
+ module Dbsync
3
+ end
4
+ end
5
+
6
+ require 'sq/dbsync/manager'
7
+ require 'sq/dbsync/static_table_plan'
8
+ require 'sq/dbsync/all_tables_plan'
9
+ require 'sq/dbsync/loggers'
@@ -0,0 +1,237 @@
1
+ require 'acceptance_helper'
2
+
3
+ require 'sq/dbsync'
4
+
5
+ describe 'Syncing source databases to a target' do
6
+ let(:config) {{
7
+ sources: TEST_SOURCES,
8
+ target: TEST_TARGET,
9
+ logger: SQD::Loggers::Composite.new([logger]),
10
+ clock: ->{ @now }
11
+ }}
12
+ let(:logger) { SQD::Loggers::NullWithCallbacks.new }
13
+ let(:manager) {
14
+ SQD::Manager.new(config, [[SQD::StaticTablePlan.new(plan), :source]])
15
+ }
16
+ let(:source) { manager.sources.fetch(:source) }
17
+ let(:alt_source) { manager.sources.fetch(:alt_source) }
18
+ let(:target) { manager.target }
19
+ let(:plan) {[{
20
+ table_name: :test_table,
21
+ source_table_name: :test_table,
22
+ refresh_recent: true,
23
+ columns: [:id, :updated_at]
24
+ }] }
25
+
26
+ MINUTE = 60
27
+ WEEK = MINUTE * 60 * 24 * 7
28
+
29
+ before do
30
+ @now = Time.now.utc
31
+
32
+ setup_source_table
33
+ end
34
+
35
+ context 'batch loads' do
36
+ it 'batch loads the nonactive database and switches it to active' do
37
+ row = source[:test_table].insert(updated_at: @now)
38
+
39
+ manager.batch_nonactive
40
+
41
+ target[:test_table].map {|x| x[:id] }.should include(row)
42
+ target[:meta_last_sync_times].count.should == 1
43
+ end
44
+
45
+ it 'catches up missed rows with an incremental update' do
46
+ new_row_id = nil
47
+
48
+ logger.callbacks = {
49
+ 'batch.load.test_table' => ->{
50
+ @now += SQD::BatchLoadAction::MAX_LAG
51
+ new_row_id = source[:test_table].insert(updated_at: @now - 1)
52
+ }
53
+ }
54
+
55
+ row = source[:test_table].insert(updated_at: @now)
56
+
57
+ manager.batch_nonactive
58
+
59
+ target[:test_table].map {|x| x[:id] }.should include(row)
60
+ target[:test_table].map {|x| x[:id] }.should include(new_row_id)
61
+ end
62
+
63
+ it 'loads from two distinct sources' do
64
+ manager = SQD::Manager.new(config, [
65
+ [SQD::StaticTablePlan.new(plan), :source],
66
+ [SQD::AllTablesPlan.new, :alt_source]
67
+ ])
68
+
69
+ row = source[:test_table ].insert(updated_at: @now)
70
+ alt_row = alt_source[:alt_test_table].insert(updated_at: @now)
71
+
72
+ manager.batch_nonactive
73
+
74
+ target[:test_table ].map {|x| x[:id] }.should include(row)
75
+ target[:alt_test_table ].map {|x| x[:id] }.should include(alt_row)
76
+ end
77
+ end
78
+
79
+ context 'refresh recent load' do
80
+ before do
81
+ manager.batch_nonactive
82
+ end
83
+
84
+ it 'reloads all recent data' do
85
+ deleted = 1
86
+ to_keep = 2
87
+ new_row = 3
88
+
89
+ target[:test_table].insert(id: deleted, updated_at: @now)
90
+ target[:test_table].insert(id: to_keep, updated_at: @now - WEEK)
91
+ source[:test_table ].insert(id: new_row, updated_at: @now - MINUTE)
92
+
93
+ x = target[:test_table].map {|x| x[:id] }
94
+
95
+ manager.refresh_recent
96
+
97
+ target[:test_table].map {|x| x[:id] }.should include(new_row)
98
+ target[:test_table].map {|x| x[:id] }.should include(to_keep)
99
+ target[:test_table].map {|x| x[:id] }.should_not include(deleted)
100
+ end
101
+
102
+ describe 'when a column is provided' do
103
+ let(:plan) {[{
104
+ table_name: :test_table,
105
+ refresh_recent: :reporting_date,
106
+ source_table_name: :test_table,
107
+ columns: [:id, :updated_at, :reporting_date]
108
+ }] }
109
+
110
+ it 'adds an extra filter when a column is provided' do
111
+ deleted = 1
112
+ to_keep = 2
113
+ new_row = 3
114
+ to_keep2 = 4
115
+
116
+ target[:test_table].insert(
117
+ id: deleted,
118
+ updated_at: @now,
119
+ reporting_date: @now
120
+ )
121
+ target[:test_table].insert(
122
+ id: to_keep,
123
+ updated_at: @now - WEEK,
124
+ reporting_date: @now
125
+ )
126
+ target[:test_table].insert(
127
+ id: to_keep2,
128
+ updated_at: @now,
129
+ reporting_date: @now - WEEK
130
+ )
131
+ source[:test_table].insert(
132
+ id: new_row,
133
+ updated_at: @now - MINUTE
134
+ )
135
+
136
+ x = target[:test_table].map {|x| x[:id] }
137
+ manager.refresh_recent
138
+
139
+ target[:test_table].map {|x| x[:id] }.should include(new_row)
140
+ target[:test_table].map {|x| x[:id] }.should include(to_keep)
141
+ target[:test_table].map {|x| x[:id] }.should include(to_keep2)
142
+ target[:test_table].map {|x| x[:id] }.should_not include(deleted)
143
+ end
144
+ end
145
+ end
146
+
147
+ context 'incremental loads' do
148
+ let(:worker) { @worker }
149
+ before do
150
+ manager.batch_nonactive
151
+ @worker = background do
152
+ manager.incremental
153
+ end
154
+ end
155
+
156
+ after do
157
+ manager.stop!
158
+ worker.wait_until_finished!
159
+ end
160
+
161
+ def background(&block)
162
+ worker = Thread.new(&block)
163
+ worker.instance_eval do
164
+ def wait_until_finished!; join rescue nil; end
165
+ end
166
+ worker
167
+ end
168
+
169
+ it 'updates the active database' do
170
+ 2.times do |t|
171
+ insert_and_verify_row
172
+ end
173
+ end
174
+
175
+ it 'does not continually retry consistent failures' do
176
+ manager.
177
+ stub!(:incremental_once).
178
+ and_raise(SQD::Database::ExtractError.new)
179
+
180
+ ->{
181
+ worker.value
182
+ }.should raise_error(SQD::Database::ExtractError)
183
+ end
184
+
185
+ context 'with an all tables plan' do
186
+ let(:manager) { SQD::Manager.new(config, [[
187
+ SQD::AllTablesPlan.new, :source
188
+ ]]) }
189
+
190
+ it 'adds a database that is newly in the source but not in the target' do
191
+ source.create_table! :new_table do
192
+ primary_key :id
193
+ DateTime :updated_at
194
+ end
195
+
196
+ insert_and_verify_row(:new_table)
197
+ end
198
+ end
199
+ end
200
+
201
+ def insert_and_verify_row(table = :test_table)
202
+ row = source[table].insert(updated_at: Time.now.utc)
203
+ spin_for(1) do
204
+ target.table_exists?(table) &&
205
+ target[table].map {|x| x[:id] }.include?(row)
206
+ end
207
+ end
208
+
209
+ def setup_source_table
210
+ source.create_table! :test_table do
211
+ primary_key :id
212
+ DateTime :reporting_date
213
+ DateTime :updated_at
214
+ end
215
+ source.create_table! :test_table_2 do
216
+ primary_key :id
217
+ DateTime :updated_at
218
+ end
219
+ alt_source.create_table! :alt_test_table do
220
+ primary_key :id
221
+ DateTime :updated_at
222
+ end
223
+ end
224
+
225
+ def spin_for(timeout = 1)
226
+ result = false
227
+ start_time = Time.now
228
+
229
+ while start_time + timeout > Time.now
230
+ result = yield
231
+ break if result
232
+ sleep 0.001
233
+ end
234
+
235
+ raise "timed out" unless result
236
+ end
237
+ end
@@ -0,0 +1,2 @@
1
+ require 'spec_helper'
2
+ require 'database_helper'
@@ -0,0 +1,86 @@
1
+ require 'sq/dbsync/database/connection'
2
+
3
+ def db_options(opts)
4
+ opts = {
5
+ user: 'root',
6
+ host: 'localhost',
7
+ brand: 'mysql',
8
+ port: opts[:brand] == 'postgresql' ? 5432 : 3306
9
+ }.merge(opts)
10
+
11
+ if RUBY_PLATFORM == 'java'
12
+ opts.merge(
13
+ adapter: "jdbc",
14
+ uri: begin
15
+ base = 'jdbc:%s://%s:%i/%s?user=%s' % [
16
+ opts.fetch(:brand),
17
+ opts.fetch(:host),
18
+ opts.fetch(:port),
19
+ opts.fetch(:database),
20
+ opts.fetch(:user)
21
+ ]
22
+ if opts[:password]
23
+ base += '&password=%s' % opts[:password]
24
+ end
25
+ base
26
+ end
27
+ )
28
+ else
29
+ {
30
+ adapter: opts[:brand] == 'postgresql' ? 'postgres' : 'mysql2',
31
+ }.merge(opts)
32
+ end
33
+ end
34
+
35
# Connection option sets for the test source and target databases.
TEST_SOURCES = {
  source: db_options(database: 'sq_dbsync_test_source'),
  alt_source: db_options(database: 'sq_dbsync_test_source_alt'),
  postgres: db_options(
    user: `whoami`.chomp,
    brand: 'postgresql',
    host: 'localhost',
    database: 'sq_dbsync_pg_test_source'
  )
}
TEST_TARGET = db_options(database: 'sq_dbsync_test_target')

# Memoized connection to the target database.
$target = nil
def test_target
  $target ||= SQD::Database::Connection.create(TEST_TARGET)
end

# Memoized connections to the named source databases.
$sources = {}
def test_source(name)
  $sources[name] ||= SQD::Database::Connection.create(TEST_SOURCES.fetch(name))
end

RSpec.configure do |config|
  # Only specs under spec/integration or spec/acceptance touch databases.
  db_specs_only = {
    example_group: {file_path: /spec\/(integration|acceptance)/}
  }

  # Recreate every test database once per suite run.
  config.before(:suite, db_specs_only) do
    (TEST_SOURCES.values + [TEST_TARGET]).each do |opts|
      db = opts.fetch(:database)

      case opts.fetch(:brand)
      when 'mysql'
        `mysql -u root -e "drop database if exists #{db}"`
        `mysql -u root -e "create database #{db}"`
      when 'postgresql'
        `dropdb #{db}`
        `createdb #{db}`
      else
        raise "Unknown database: #{opts.inspect}"
      end
    end
  end

  # Drop all tables before each example so specs start from a clean slate.
  config.before(:each, db_specs_only) do
    (TEST_SOURCES.keys.map {|x| test_source(x) } + [test_target]).each do |db|
      db.tables.each do |name|
        db.drop_table(name)
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
require 'integration_helper'

require 'sq/dbsync/all_tables_plan'
require 'sq/dbsync/database/connection'

# AllTablesPlan must only offer tables that are actually syncable: they need a
# primary key and a timestamp column, and must survive schema inspection.
describe SQD::AllTablesPlan do
  let(:source) { test_source(:source) }

  it 'does not return tables with no PK' do
    source.create_table :test_table do
      Integer :col1
      DateTime :updated_at
    end

    SQD::AllTablesPlan.new.tables(source).should == []
  end

  it 'does not return tables with no timestamps' do
    source.create_table :test_table do
      primary_key :id
    end

    SQD::AllTablesPlan.new.tables(source).should == []
  end

  it 'handles table dropped after select' do
    source.create_table :test_table do
      primary_key :id
      DateTime :updated_at
    end

    # Simulate a race where the table disappears between listing and
    # schema inspection.
    source.should_receive(:schema).and_raise(Sequel::DatabaseError)

    SQD::AllTablesPlan.new.tables(source).should == []
  end
end