sq-dbsync 1.0.0

Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/HISTORY.md ADDED
@@ -0,0 +1,5 @@
+ # Square Dbsync History
+
+ ## 1.0.0 - 23 February 2013 (c505c0e7)
+
+ * Initial public release.
data/LICENSE ADDED
@@ -0,0 +1,14 @@
+
+ Copyright 2012 Square Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,218 @@
+ Square Dbsync
+ =============
+
+ An extract and load system to shunt data between databases.
+
+ It uses timestamp-based replication, which is fast and easy to keep running,
+ but has some caveats. Most notably, it does not handle deletes well (see
+ documentation below for details).
+
+ This was useful to us at Square because we needed partial (only select
+ columns), continuous replication from both MySQL and PostgreSQL databases to
+ a single target database with some basic ETL logic along the way. None of
+ the existing solutions were able to do this adequately.
+
+ At some point you will need to bite the bullet and implement a real ETL
+ system, but `sq-dbsync` can tide you over until you get there.
+
+ Usage
+ -----
+
+ ``` ruby
+ include Sq::Dbsync
+
+ # Config will typically differ per environment.
+ config = {
+   sources: {
+     db_a: {
+       database: 'db_a_production',
+       user:     'sqdbsync-ro',
+       password: 'password',
+       host:     'db-a-host',
+       brand:    'mysql',
+       port:     3306,
+     },
+     db_b: {
+       database: 'db_b_production',
+       user:     'sqdbsync-ro',
+       password: 'password',
+       host:     'db-b-host',
+       brand:    'postgresql',
+       port:     5432,
+     }
+   },
+   target: {
+     database: 'replica',
+     user:     'sqdbsync',
+     password: 'password',
+
+     # Only localhost supported, since `LOAD DATA INFILE` is used which
+     # requires a shared temp directory.
+     host:  'localhost',
+     brand: 'mysql',
+     port:  3306,
+   },
+
+   # Optional configuration
+   logger: Loggers::Stream.new,     # A graphite logger is provided, see source.
+   clock:  ->{ Time.now.utc },      # In test env it can be useful to fix this.
+   error_handler: ->(e) { puts(e) } # Notify your exception system
+ }
+
+ # Write plans that specify how data is replicated.
+ DB_A_PLAN = [{
+   table_name: :users,
+   columns: [
+     # You must replicate the primary key.
+     :id,
+
+     # You must replicate a timestamp column, and it should be indexed on the
+     # target system.
+     :updated_at,
+
+     # Then whatever other columns you require.
+     :name,
+     :account_type,
+     :created_at,
+   ],
+   indexes: {
+     # Indexing it on the source system is optional.
+     index_users_on_updated_at: { columns: [:updated_at], unique: false },
+   },
+
+   # Basic schema transformations are supported.
+   db_types: {
+     account_type: [:enum, %w(
+       bronze
+       silver
+       gold
+     )]
+   }
+ }, {
+   table_name:        :account_types,
+   source_table_name: :user_account_types,
+   columns:           :all
+ }]
+
+ plans = [
+   [StaticTablePlan.new(DB_A_PLAN), :db_a],
+   [AllTablesPlan.new, :db_b]
+ ]
+
+ manager = Manager.new(config, plans)
+
+ # Run a batch load nightly
+ manager.batch(ALL_TABLES)
+
+ # Run an incremental load continuously
+ manager.increment
+
+ # You can load a subset of tables if necessary
+ manager.batch([:users])
+ ```
+
+ Documentation
+ -------------
+
+ ### Plan Options
+
+ * `batch_load` whether or not to batch load this table in the default batch
+   load. If the table is specifically requested, it will be loaded regardless
+   of this setting. (default: true)
+ * `charset` charset to use when creating the table. Passed directly through
+   to [Sequel::MySQL::Database#connect](http://sequel.rubyforge.org/rdoc-adapters/classes/Sequel/MySQL/Database.html).
+   MySQL only, ignored for Postgres. (default: 'utf8')
+ * `columns` Either an array of columns to replicate, or `:all` indicating
+   that all columns should be replicated. (required)
+ * `consistency` Perform a basic consistency check on the table regularly
+   during the incremental load by comparing recent counts of the source and
+   target tables. Make sure you have a timestamp index on both tables! This
+   was particularly useful when developing the project, but honestly probably
+   isn't that useful now --- I can't remember the last time I saw an error
+   from this. (default: false)
+ * `db_types` A hash that allows you to modify the target schema from the
+   source. See the example in the usage section above. (default: `{}`)
+ * `indexes` A hash defining desired indexes on the target table. Indexes are
+   *not* copied from source tables. See the example in the usage section
+   above. (default: `{}`)
+ * `refresh_recent` Some tables are too large to batch load regularly, but
+   modifications are known to be recent. This setting will cause the last two
+   days of data to be dropped and recreated as part of the nightly batch
+   load. (default: false)
+ * `source_table_name` Allows the source and target tables to be named
+   differently. (default: the `table_name` configuration option)
+ * `timestamp_table_name` A hack to work around the Postgres query planner
+   failing to use indexes correctly for `MAX()` on a view that uses `UNION`
+   under the covers. If this describes your source view, and one of the
+   underlying tables is guaranteed to contain the latest record, you can set
+   this value to that table and it will be used for all timestamp-related
+   queries. If not, you must provide a custom view that supports a `MAX`
+   query with a sane query plan. (default: nil)
+ * `table_name` The name of the table to be replicated. If
+   `source_table_name` is specified, this option defines the name of the
+   table in the target database only.
+ * `primary_key` Usually the primary key can be inferred from the source
+   schema, but if you are replicating from a view you will need to specify it
+   explicitly with this option. Should be an array of symbols. (default: nil,
+   will auto-detect from source schema)
+ * `timestamp` The column to treat as a timestamp. Must be a member of the
+   `:columns` option. (default: select `updated_at` or `created_at`, in that
+   order)
+
+ ### Handling Deletes
+
+ The incremental load has no way of detecting deleted records. The nightly
+ batch load will reload all tables, so there will be at most a one day
+ turn-around on deletes. Some tables will be too big to batch load every
+ night, however, so this is not a great solution in that case.
+
+ If you have an "audit" table that contains enough data for you to
+ reconstruct deletes in other tables, then you can provide a custom subclass
+ to the incremental loader that will be able to run this logic.
+
+ ``` ruby
+ class IncrementalLoadWithDeletes < Sq::Dbsync::IncrementalLoadAction
+   def process_deletes
+     if plan.table_name == :audit_logs
+       ExampleRecordDestroyer.run(db, registry, :audit_logs, :other_table)
+     end
+   end
+ end
+
+ CONFIG = {
+   # ...
+   incremental_action: IncrementalLoadWithDeletes,
+ }
+ ```
+
+ See `lib/sq/dbsync/example_record_destroyer` for a sample implementation.
+
+ ### Database Settings
+
+ If your target database is MySQL, we recommend that you ensure it is running
+ under the `READ COMMITTED` isolation level. This makes it much harder for an
+ analyst to lock a table and block replication. (Statements like `CREATE
+ TABLE AS SELECT FROM ...` tend to be the culprit.)
+
+ Developing
+ ----------
+
+     bundle
+     bundle exec rake
+
+ Compatibility
+ -------------
+
+ Requires Ruby 1.9. Tested on CRuby 1.9.3 and JRuby.
+
+ ## Support
+
+ Open a [new GitHub issue](https://github.com/square/sq-dbsync/issues/new).
+
+ ## Contributing
+
+ Fork and patch! Before any changes are merged to master, we need you to sign
+ an [Individual Contributor
+ Agreement](https://spreadsheets.google.com/a/squareup.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1)
+ (Google Form).
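
For readers skimming the Plan Options list above, a worked example may help. The following is a sketch only: the `events` table, its columns, and every option value are hypothetical, chosen to show how the documented options combine.

``` ruby
# Hypothetical plan entry combining the options documented above. The
# table and column names are invented for illustration.
EVENTS_PLAN = [{
  table_name:        :events,       # name in the target database
  source_table_name: :raw_events,   # name in the source database
  columns:           [:id, :occurred_at, :kind],
  timestamp:         :occurred_at,  # no updated_at/created_at to auto-detect
  primary_key:       [:id],         # explicit, as when sourcing from a view
  batch_load:        false,         # too large for the nightly batch load
  refresh_recent:    true,          # ...but reload the last two days nightly
  consistency:       true,          # periodic source/target count comparison
  indexes: {
    index_events_on_occurred_at: { columns: [:occurred_at], unique: false }
  }
}]
```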
data/lib/sq/dbsync/all_tables_plan.rb ADDED
@@ -0,0 +1,51 @@
+ module Sq::Dbsync
+   # Fetches all tables from the given source, retrieving tables and columns.
+   # Indexes are currently ignored.
+   class AllTablesPlan
+     def tables(source)
+       source.ensure_connection
+
+       source.tables.map do |t|
+         schema_for_table(source, t)
+       end.compact
+     end
+
+     private
+
+     def schema_for_table(source, t)
+       schema = source.schema(t, reload: true)
+
+       return unless has_primary_key?(schema)
+       return unless has_timestamp?(schema)
+
+       cols = schema.map do |col|
+         col[0]
+       end
+
+       {
+         source_db: source,
+         source_table_name: t,
+         table_name: t,
+         columns: cols,
+         indexes: {},
+         always_sync: true
+       }
+     rescue Sequel::DatabaseError
+       # This handles a race condition where the table is deleted between us
+       # selecting the list of tables and fetching the schema.
+       nil
+     end
+
+     def has_primary_key?(schema)
+       schema.any? do |table|
+         table[1][:primary_key]
+       end
+     end
+
+     def has_timestamp?(schema)
+       schema.any? do |table|
+         [:updated_at, :created_at].include?(table[0])
+       end
+     end
+   end
+ end
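
A usage sketch for the class above. It assumes `source` is a connected source database wrapper (of the kind `Sq::Dbsync::Database::Connection.create` returns, shown later in this changeset); the example output shape follows the hash built in `schema_for_table`.

``` ruby
# `source` is assumed to be a source database wrapper built by
# Sq::Dbsync::Database::Connection.create (see below).
plans = Sq::Dbsync::AllTablesPlan.new.tables(source)

plans.each do |plan|
  # Each entry is a hash shaped like:
  #   { source_db: source, source_table_name: :users, table_name: :users,
  #     columns: [:id, :updated_at, :name], indexes: {}, always_sync: true }
  puts plan[:table_name]
end
```

Tables without a primary key or without an `updated_at`/`created_at` column are silently skipped, which is why the `map` result is `compact`ed.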
data/lib/sq/dbsync/batch_load_action.rb ADDED
@@ -0,0 +1,95 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+   # Load action to reload an entire table in full. The table will be loaded
+   # in parallel to the existing one, then atomically swapped in on
+   # completion.
+   class BatchLoadAction < LoadAction
+     MAX_LAG = 60 * 5
+
+     def operation; 'batch'; end
+
+     def prepare
+       return false if plan.batch_load == false
+
+       if super
+         if target.table_exists?(plan.prefixed_table_name)
+           target.drop_table(plan.prefixed_table_name)
+         end
+         true
+       end
+     end
+
+     def extract_data
+       @start_time = now.call
+       @file, @last_row_at = measure(:extract) { extract_to_file(nil) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         TempfileFactory.split(@file, 1_000_000, logger) do |path|
+           db.load_from_file(
+             plan.prefixed_table_name,
+             plan.columns,
+             path
+           )
+         end
+         @file.close!
+       end
+       self
+     end
+
+     def post_load
+       while @start_time <= now.call - MAX_LAG
+         @start_time = now.call
+         catchup
+       end
+
+       switch_tables
+       self
+     end
+
+     private
+
+     def filter_columns
+       source = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+
+       plan.columns = resolve_columns(plan, source_columns)
+     end
+
+     def prefix
+       'new_'
+     end
+
+     def catchup
+       file, @last_row_at = measure(:catchup_extract) {
+         extract_to_file(@last_row_at ? @last_row_at - overlap : nil)
+       }
+       measure(:catchup_load) do
+         db.load_incrementally_from_file(
+           plan.prefixed_table_name,
+           plan.columns,
+           file.path
+         )
+         file.close!
+       end
+     end
+
+     def switch_tables
+       measure(:switch) do
+         registry.delete(plan.table_name)
+         db.switch_table(
+           plan.table_name,
+           plan.prefixed_table_name
+         )
+         registry.set(plan.table_name,
+           last_synced_at: @start_time,
+           last_batch_synced_at: @start_time,
+           last_row_at: @last_row_at
+         )
+       end
+     end
+
+   end
+ end
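
The `post_load` loop above is the subtle part: rows keep changing at the source while the long batch extract runs, so the action replays incremental extracts until a full pass completes within `MAX_LAG` (five minutes) before swapping tables. A distilled sketch of that control flow, with a hypothetical `replay_recent_changes` helper standing in for the private `catchup` method:

``` ruby
# Distilled control flow of BatchLoadAction#post_load; not the real
# implementation. replay_recent_changes is a hypothetical stand-in for
# #catchup, which re-extracts rows seen since the last pass began.
MAX_LAG = 60 * 5 # seconds

start_time = Time.now.utc # when the last full extract pass began

# If the previous pass took longer than MAX_LAG, rows written during it
# are missing; replay them, and repeat until a pass finishes quickly.
while start_time <= Time.now.utc - MAX_LAG
  start_time = Time.now.utc
  replay_recent_changes
end
# Only now is the freshly built `new_`-prefixed table swapped in.
```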
data/lib/sq/dbsync/config.rb ADDED
@@ -0,0 +1,12 @@
+ require 'sq/dbsync/loggers'
+
+ # Helper class to provide sane defaults to user-supplied config.
+ class Sq::Dbsync::Config
+   def self.make(hash)
+     {
+       clock: ->{ Time.now.utc },
+       logger: Sq::Dbsync::Loggers::Stream.new,
+       error_handler: ->(e) { $stderr.puts(e.message, e.backtrace) }
+     }.merge(hash)
+   end
+ end
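
A sketch of how these defaults behave; the `clock` override mirrors the README's note that fixing time is useful in tests, and the empty `sources`/`target` hashes are abbreviations of the README example.

``` ruby
require 'sq/dbsync/config'

# Caller-supplied keys win; anything omitted falls back to a default.
config = Sq::Dbsync::Config.make(
  sources: {},                  # abbreviated; see the README example
  target:  {},
  clock:   ->{ Time.at(0).utc } # override the default wall clock for tests
)

config[:clock].call    # => 1970-01-01 00:00:00 UTC (our override)
config[:logger].class  # => Sq::Dbsync::Loggers::Stream (the default)
config[:error_handler] # => default lambda writing to $stderr
```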
data/lib/sq/dbsync/consistency_verifier.rb ADDED
@@ -0,0 +1,70 @@
+ require 'sq/dbsync/load_action' # For overlap, not ideal
+
+ module Sq::Dbsync
+
+   # Performs a cheap check to verify that the number of records present for
+   # a recent time slice are the same across source and target tables.
+   #
+   # This checks consistency on the current tables, not the new_ set.
+   class ConsistencyVerifier
+     def initialize(target, registry)
+       @target = target
+       @registry = registry
+     end
+
+     def check_consistency!(tables)
+       tables.each do |tplan|
+         next unless tplan[:consistency]
+         verify_consistency!(tplan)
+       end
+     end
+
+     def verify_consistency!(tplan)
+       last_row_at = registry.get(tplan[:table_name])[:last_row_at]
+       return unless last_row_at
+
+       now = last_row_at - LoadAction.overlap
+
+       counts = [
+         tplan[:source_db],
+         target
+       ].map do |x|
+         x.consistency_check(tplan[:table_name], now)
+       end
+
+       delta = counts.reduce(:-)
+
+       unless delta == 0
+         raise ConsistencyError.new(
+           tplan[:table_name],
+           delta,
+           "source: #{tplan[:source_db].name} (count: #{counts[0]}), " +
+           "sink: #{target.name} (count: #{counts[1]})"
+         )
+       end
+     end
+
+     attr_reader :target, :registry
+
+     # Used to signal an observed error in the number of records between
+     # source and target tables. There are no current known situations in
+     # which this occurs, though in the past buggy handling of replication
+     # lag was normally the culprit.
+     #
+     # If it does occur, a good first response is to set `last_sync_time` to
+     # the last batch time (usually within 24 hours) which will force the
+     # incremental load to reconsider all recent records.
+     class ConsistencyError < RuntimeError
+       def initialize(table_name, delta, description="")
+         @table_name = table_name
+         @delta = delta
+         @description = description
+       end
+
+       def message
+         output = "%s had a count difference of %i" % [@table_name, @delta]
+         output += "; " + @description unless @description.empty?
+         output
+       end
+     end
+   end
+ end
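
Wiring the verifier up by hand might look like the sketch below; `target`, `registry`, and `plans` are assumed to be the same objects the `Manager` (later in this changeset) constructs internally, and the sample message is illustrative.

``` ruby
# `target` and `registry` are assumed to already exist, as constructed by
# the Manager; `plans` is an array of plan hashes.
verifier = Sq::Dbsync::ConsistencyVerifier.new(target, registry)

begin
  verifier.check_consistency!(plans) # skips plans without consistency: true
rescue Sq::Dbsync::ConsistencyVerifier::ConsistencyError => e
  # e.g. "users had a count difference of -3; source: db_a_production
  # (count: 100), sink: replica (count: 103)"
  $stderr.puts(e.message)
end
```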
data/lib/sq/dbsync/database/common.rb ADDED
@@ -0,0 +1,91 @@
+ require 'sq/dbsync/tempfile_factory'
+
+ module Sq::Dbsync::Database
+   module Common
+
+     SQD = ::Sq::Dbsync
+
+     def extract_to_file(table_name, columns, file_name)
+       extract_sql_to_file("SELECT %s FROM %s" % [
+         columns.join(', '),
+         table_name
+       ], file_name)
+     end
+
+     def extract_incrementally_to_file(plan, file_name, last_row_at, overlap)
+       table_name = plan.source_table_name.to_sym
+       db_columns = db.schema(table_name).map(&:first)
+
+       query = self[table_name].select(*plan.columns)
+       if last_row_at
+         query = query.filter("#{plan.timestamp} > ?", last_row_at - overlap)
+       end
+
+       extract_sql_to_file(query.sql, file_name)
+     end
+
+     def hash_schema(table_name)
+       ensure_connection
+       Hash[schema(table_name)]
+     end
+
+     def name
+       self['SELECT database()'].first.fetch(:'database()')
+     end
+
+     # Since we go so long without using connections (during a batch load),
+     # they go stale and raise DatabaseDisconnectError when we try to use
+     # them. This method ensures that the connection is fresh even after a
+     # long time between drinks.
+     def ensure_connection
+       db.disconnect
+     end
+
+     def __getobj__
+       db
+     end
+
+     def __setobj__(db)
+       @db = db
+     end
+
+     protected
+
+     def execute!(cmd)
+       # psql doesn't return a non-zero error code when executing commands
+       # from a file. The best way I can come up with is to raise if anything
+       # is present on stderr.
+       errors_file = SQD::TempfileFactory.make('extract_sql_to_file_errors')
+
+       cmd = %{bash -c "#{cmd.gsub(/"/, '\\"')}"}
+
+       result = run_shell(cmd, errors_file)
+
+       unless result.exitstatus == 0 && File.size(errors_file.path) == 0
+         raise(ExtractError, "Command failed: #{cmd}")
+       end
+     ensure
+       errors_file.close! if errors_file
+     end
+
+     def sql_to_file(sql)
+       SQD::TempfileFactory.make_with_content('extract_sql_to_file', sql)
+     end
+
+     private
+
+     def run_shell(cmd, errors_file)
+       if RUBY_PLATFORM == 'java'
+         IO.popen4(cmd) {|_, _, _, stderr|
+           errors_file.write(stderr.read)
+           errors_file.flush
+         }
+         $?
+       else
+         pid = Process.spawn(cmd, STDERR => errors_file.path)
+         Process.waitpid2(pid)[1]
+       end
+     end
+
+   end
+ end
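
The overlap handling in `extract_incrementally_to_file` above is worth spelling out: rows can commit with timestamps slightly out of order, so the extract window is widened by `overlap` seconds before the last seen row rather than starting exactly at it. A sketch of the Sequel query it builds, assuming a `users` plan with timestamp `updated_at` and illustrative values (identifier quoting in the generated SQL varies by adapter):

``` ruby
# `db` is assumed to be a connected Sequel database handle.
last_row_at = Time.utc(2013, 2, 23, 10, 0, 0)
overlap     = 120 # seconds; illustrative value

dataset = db[:users]
  .select(:id, :updated_at, :name)
  .filter("updated_at > ?", last_row_at - overlap)

dataset.sql
# => roughly: SELECT id, updated_at, name FROM users
#             WHERE (updated_at > '2013-02-23 09:58:00')
```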
data/lib/sq/dbsync/database/connection.rb ADDED
@@ -0,0 +1,23 @@
+ require 'sequel/no_core_ext'
+
+ Sequel.default_timezone = :utc
+
+ require 'sq/dbsync/database/mysql'
+ require 'sq/dbsync/database/postgres'
+
+ module Sq::Dbsync::Database
+   # Factory class to abstract selection of a decorator, facilitating support
+   # for databases other than MySQL.
+   class Connection
+     def self.create(opts)
+       case opts[:brand]
+       when 'mysql'
+         Sq::Dbsync::Database::Mysql.new(Sequel.connect(opts))
+       when 'postgresql'
+         Sq::Dbsync::Database::Postgres.new(Sequel.connect(opts))
+       else
+         raise "Unsupported database: #{opts.inspect}"
+       end
+     end
+   end
+ end
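
A usage sketch for the factory above, reusing the target settings from the README. The `:adapter` key is our addition: `Sequel.connect` needs one when given an options hash, and the README example does not show it, so presumably it is normalized elsewhere.

``` ruby
# Sketch: building the target connection from the README's target config.
target = Sq::Dbsync::Database::Connection.create(
  adapter:  'mysql2',     # our assumption; not shown in the README config
  brand:    'mysql',      # selects the Mysql decorator
  database: 'replica',
  user:     'sqdbsync',
  password: 'password',
  host:     'localhost',
  port:     3306
)

target.ensure_connection # refresh a connection that may have gone stale
```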