sq-dbsync 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/HISTORY.md +5 -0
  2. data/LICENSE +14 -0
  3. data/README.md +218 -0
  4. data/lib/sq/dbsync/all_tables_plan.rb +51 -0
  5. data/lib/sq/dbsync/batch_load_action.rb +95 -0
  6. data/lib/sq/dbsync/config.rb +12 -0
  7. data/lib/sq/dbsync/consistency_verifier.rb +70 -0
  8. data/lib/sq/dbsync/database/common.rb +91 -0
  9. data/lib/sq/dbsync/database/connection.rb +23 -0
  10. data/lib/sq/dbsync/database/mysql.rb +163 -0
  11. data/lib/sq/dbsync/database/postgres.rb +77 -0
  12. data/lib/sq/dbsync/error_handler.rb +59 -0
  13. data/lib/sq/dbsync/example_record_destroyer.rb +77 -0
  14. data/lib/sq/dbsync/incremental_load_action.rb +95 -0
  15. data/lib/sq/dbsync/load_action.rb +156 -0
  16. data/lib/sq/dbsync/loggers.rb +135 -0
  17. data/lib/sq/dbsync/manager.rb +241 -0
  18. data/lib/sq/dbsync/pipeline/simple_context.rb +15 -0
  19. data/lib/sq/dbsync/pipeline/threaded_context.rb +95 -0
  20. data/lib/sq/dbsync/pipeline.rb +80 -0
  21. data/lib/sq/dbsync/refresh_recent_load_action.rb +71 -0
  22. data/lib/sq/dbsync/schema_maker.rb +87 -0
  23. data/lib/sq/dbsync/static_table_plan.rb +42 -0
  24. data/lib/sq/dbsync/table_registry.rb +75 -0
  25. data/lib/sq/dbsync/tempfile_factory.rb +41 -0
  26. data/lib/sq/dbsync/version.rb +5 -0
  27. data/lib/sq/dbsync.rb +9 -0
  28. data/spec/acceptance/loading_spec.rb +237 -0
  29. data/spec/acceptance_helper.rb +2 -0
  30. data/spec/database_helper.rb +86 -0
  31. data/spec/integration/all_tables_plan_spec.rb +36 -0
  32. data/spec/integration/batch_load_action_spec.rb +229 -0
  33. data/spec/integration/consistency_verifier_spec.rb +54 -0
  34. data/spec/integration/database_connection_spec.rb +61 -0
  35. data/spec/integration/incremental_load_action_spec.rb +196 -0
  36. data/spec/integration/manager_spec.rb +109 -0
  37. data/spec/integration/schema_maker_spec.rb +119 -0
  38. data/spec/integration_helper.rb +43 -0
  39. data/spec/spec_helper.rb +27 -0
  40. data/spec/unit/config_spec.rb +18 -0
  41. data/spec/unit/error_handler_spec.rb +52 -0
  42. data/spec/unit/pipeline_spec.rb +42 -0
  43. data/spec/unit/stream_logger_spec.rb +33 -0
  44. data/spec/unit_helper.rb +1 -0
  45. data/sq-dbsync.gemspec +32 -0
  46. metadata +188 -0
data/HISTORY.md ADDED
@@ -0,0 +1,5 @@
+ # Square Dbsync History
+
+ ## 1.0.0 - 23 February 2013 (c505c0e7)
+
+ * Initial public release.
data/LICENSE ADDED
@@ -0,0 +1,14 @@
+
+ Copyright 2012 Square Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,218 @@
+ Square Dbsync
+ =============
+
+ An extract and load system to shunt data between databases.
+
+ It uses timestamp-based replication, which is fast and easy to keep running,
+ but has some caveats. Most notably, it does not handle deletes well (see
+ documentation below for details).
+
+ This was useful to us at Square because we needed partial (only select
+ columns), continuous replication from both MySQL and PostgreSQL databases to a
+ single target database, with some basic ETL logic along the way. None of the
+ existing solutions were able to do this adequately.
+
+ At some point you will need to bite the bullet and implement a real ETL system,
+ but `sq-dbsync` can tide you over until you get there.
+
+ Usage
+ -----
+
+ ``` ruby
+ include Sq::Dbsync
+
+ # Config will typically differ per environment.
+ config = {
+   sources: {
+     db_a: {
+       database: 'db_a_production',
+       user: 'sqdbsync-ro',
+       password: 'password',
+       host: 'db-a-host',
+       brand: 'mysql',
+       port: 3306,
+     },
+     db_b: {
+       database: 'db_b_production',
+       user: 'sqdbsync-ro',
+       password: 'password',
+       host: 'db-b-host',
+       brand: 'postgresql',
+       port: 5432,
+     }
+   },
+   target: {
+     database: 'replica',
+     user: 'sqdbsync',
+     password: 'password',
+
+     # Only localhost supported, since `LOAD DATA INFILE` is used which
+     # requires a shared temp directory.
+     host: 'localhost',
+     brand: 'mysql',
+     port: 3306,
+   },
+
+   # Optional configuration
+   logger: Loggers::Stream.new,      # A graphite logger is provided, see source.
+   clock: ->{ Time.now.utc },        # In test env it can be useful to fix this.
+   error_handler: ->(e) { puts(e) }  # Notify your exception system
+ }
+
+ # Write plans that specify how data is replicated.
+ DB_A_PLAN = [{
+   table_name: :users,
+   columns: [
+     # You must replicate the primary key.
+     :id,
+
+     # You must replicate a timestamp column, and it should be indexed on the
+     # target system.
+     :updated_at,
+
+     # Then whatever other columns you require.
+     :name,
+     :account_type,
+     :created_at,
+   ],
+   indexes: {
+     # Indexing it on the source system is optional.
+     index_users_on_updated_at: { columns: [:updated_at], unique: false },
+   },
+
+   # Basic schema transformations are supported.
+   db_types: {
+     account_type: [:enum, %w(
+       bronze
+       silver
+       gold
+     )]
+   }
+ }, {
+   table_name: :account_types,
+   source_table_name: :user_account_types,
+   columns: :all
+ }]
+
+ plans = [
+   [StaticTablePlan.new(DB_A_PLAN), :db_a],
+   [AllTablesPlan.new, :db_b]
+ ]
+
+ manager = Manager.new(config, plans)
+
+ # Run a batch load nightly.
+ manager.batch(ALL_TABLES)
+
+ # Run an incremental load continuously.
+ manager.increment
+
+ # You can load a subset of tables if necessary.
+ manager.batch([:users])
+ ```
+
+ Documentation
+ -------------
+
+ ### Plan Options
+
+ * `batch_load` Whether or not to batch load this table in the default batch
+   load. If the table is specifically requested, it will be loaded regardless
+   of this setting. (default: true)
+ * `charset` Charset to use when creating the table. Passed directly through to
+   [Sequel::MySQL::Database#connect](http://sequel.rubyforge.org/rdoc-adapters/classes/Sequel/MySQL/Database.html).
+   MySQL only, ignored for Postgres. (default: 'utf8')
+ * `columns` Either an array of columns to replicate, or `:all` indicating that
+   all columns should be replicated. (required)
+ * `consistency` Perform a basic consistency check on the table regularly during
+   the incremental load by comparing recent counts of the source and target
+   tables. Make sure you have a timestamp index on both tables! This was
+   particularly useful when developing the project, but honestly probably isn't
+   that useful now --- I can't remember the last time I saw an error from this.
+   (default: false)
+ * `db_types` A hash that allows you to modify the target schema from the
+   source. See the example in the usage section above. (default: `{}`)
+ * `indexes` A hash defining desired indexes on the target table. Indexes are
+   *not* copied from source tables. See the example in the usage section above.
+   (default: `{}`)
+ * `refresh_recent` Some tables are too large to batch load regularly, but
+   modifications are known to be recent. This setting will cause the last two
+   days of data to be dropped and recreated as part of the nightly batch load.
+   (default: false)
+ * `source_table_name` Allows the source and target tables to be named
+   differently. (default: the `table_name` configuration option)
+ * `timestamp_table_name` A hack to work around the Postgres query planner
+   failing to use indexes correctly for `MAX()` on a view that uses `UNION`
+   under the covers. If this describes your source view, and one of the
+   underlying tables is guaranteed to contain the latest record, you can set
+   this value to that table and it will be used for all timestamp-related
+   queries. If not, you must provide a custom view that supports a `MAX` query
+   with a sane query plan. (default: nil)
+ * `table_name` The name of the table to be replicated. If `source_table_name`
+   is specified, this option defines the name of the table in the target
+   database only.
+ * `primary_key` Usually the primary key can be inferred from the source schema,
+   but if you are replicating from a view you will need to specify it explicitly
+   with this option. Should be an array of symbols. (default: nil, will
+   auto-detect from source schema)
+ * `timestamp` The column to treat as a timestamp. Must be a member of the
+   `:columns` option. (default: select `updated_at` or `created_at`, in that
+   order)
+
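+ For example, a plan entry for a large, view-backed table might combine several
+ of the options above (a sketch only --- the table and column names here are
+ hypothetical):
+
+ ``` ruby
+ plan_entry = {
+   table_name: :payments,    # hypothetical table sourced from a view
+   columns: [:id, :amount_cents, :updated_at],
+   batch_load: false,        # too large for the default nightly batch load
+   refresh_recent: true,     # but the last two days can be reloaded nightly
+   primary_key: [:id],       # explicit, since views have no inferrable key
+   timestamp: :updated_at,
+   indexes: {
+     index_payments_on_updated_at: { columns: [:updated_at], unique: false }
+   }
+ }
+ ```
+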
+ ### Handling Deletes
+
+ The incremental load has no way of detecting deleted records. The nightly batch
+ load will reload all tables, so there will be at most a one-day turnaround on
+ deletes. However, some tables will be too big to batch load every night, so
+ this is not a great solution in that case.
+
+ If you have an "audit" table that contains enough data for you to reconstruct
+ deletes in other tables, then you can provide a custom subclass to the
+ incremental loader that will run this logic.
+ incremental loader that will be able to run this logic.
+ ``` ruby
+ class IncrementalLoadWithDeletes < Sq::Dbsync::IncrementalLoadAction
+   def process_deletes
+     if plan.table_name == :audit_logs
+       ExampleRecordDestroyer.run(db, registry, :audit_logs, :other_table)
+     end
+   end
+ end
+
+ CONFIG = {
+   # ...
+   incremental_action: IncrementalLoadWithDeletes,
+ }
+ ```
+
+ See `lib/sq/dbsync/example_record_destroyer` for a sample implementation.
+
+ ### Database Settings
+
+ If your target database is MySQL, we recommend that you ensure it is running
+ under the `READ COMMITTED` isolation level. This makes it much harder for an
+ analyst to lock a table and block replication. (Statements like `CREATE TABLE
+ AS SELECT FROM ...` tend to be the culprit.)
+
+ Developing
+ ----------
+
+     bundle
+     bundle exec rake
+
+ Compatibility
+ -------------
+
+ Requires Ruby 1.9. Tested on CRuby 1.9.3 and JRuby.
+
+ ## Support
+
+ Make a [new GitHub issue](https://github.com/square/sq-dbsync/issues/new).
+
+ ## Contributing
+
+ Fork and patch! Before any changes are merged to master, we need you to sign an
+ [Individual Contributor
+ Agreement](https://spreadsheets.google.com/a/squareup.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1)
+ (Google Form).
data/lib/sq/dbsync/all_tables_plan.rb ADDED
@@ -0,0 +1,51 @@
+ module Sq::Dbsync
+   # Fetches all tables from the given source, retrieving tables and columns.
+   # Indexes are currently ignored.
+   class AllTablesPlan
+     def tables(source)
+       source.ensure_connection
+
+       source.tables.map do |t|
+         schema_for_table(source, t)
+       end.compact
+     end
+
+     private
+
+     def schema_for_table(source, t)
+       schema = source.schema(t, reload: true)
+
+       return unless has_primary_key?(schema)
+       return unless has_timestamp?(schema)
+
+       cols = schema.map do |col|
+         col[0]
+       end
+
+       {
+         source_db:         source,
+         source_table_name: t,
+         table_name:        t,
+         columns:           cols,
+         indexes:           {},
+         always_sync:       true
+       }
+     rescue Sequel::DatabaseError
+       # This handles a race condition where the table is deleted between us
+       # selecting the list of tables and fetching the schema.
+       nil
+     end
+
+     def has_primary_key?(schema)
+       schema.any? do |col|
+         col[1][:primary_key]
+       end
+     end
+
+     def has_timestamp?(schema)
+       schema.any? do |col|
+         [:updated_at, :created_at].include?(col[0])
+       end
+     end
+   end
+ end
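The plan hashes this class emits are the same shape as the manual plan entries in the README. A sketch of calling it directly, assuming `source_db` is a wrapper built by `Database::Connection.create`:

``` ruby
plans = Sq::Dbsync::AllTablesPlan.new.tables(source_db)
plans.first # => { source_db: ..., source_table_name: :users, table_name: :users,
            #      columns: [...], indexes: {}, always_sync: true }
```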
data/lib/sq/dbsync/batch_load_action.rb ADDED
@@ -0,0 +1,95 @@
+ require 'sq/dbsync/load_action'
+
+ module Sq::Dbsync
+   # Load action to reload an entire table in full. The table will be loaded in
+   # parallel to the existing one, then atomically swapped in on completion.
+   class BatchLoadAction < LoadAction
+     MAX_LAG = 60 * 5
+
+     def operation; 'batch'; end
+
+     def prepare
+       return false if plan.batch_load == false
+
+       if super
+         if target.table_exists?(plan.prefixed_table_name)
+           target.drop_table(plan.prefixed_table_name)
+         end
+         true
+       end
+     end
+
+     def extract_data
+       @start_time = now.call
+       @file, @last_row_at = measure(:extract) { extract_to_file(nil) }
+       self
+     end
+
+     def load_data
+       measure(:load) do
+         TempfileFactory.split(@file, 1_000_000, logger) do |path|
+           db.load_from_file(
+             plan.prefixed_table_name,
+             plan.columns,
+             path
+           )
+         end
+         @file.close!
+       end
+       self
+     end
+
+     def post_load
+       while @start_time <= now.call - MAX_LAG
+         @start_time = now.call
+         catchup
+       end
+
+       switch_tables
+       self
+     end
+
+     private
+
+     def filter_columns
+       source = plan.source_db
+       source_columns = source.hash_schema(plan.source_table_name).keys
+
+       plan.columns = resolve_columns(plan, source_columns)
+     end
+
+     def prefix
+       'new_'
+     end
+
+     def catchup
+       file, @last_row_at = measure(:catchup_extract) {
+         extract_to_file(@last_row_at ? @last_row_at - overlap : nil)
+       }
+       measure(:catchup_load) do
+         db.load_incrementally_from_file(
+           plan.prefixed_table_name,
+           plan.columns,
+           file.path
+         )
+         file.close!
+       end
+     end
+
+     def switch_tables
+       measure(:switch) do
+         registry.delete(plan.table_name)
+         db.switch_table(
+           plan.table_name,
+           plan.prefixed_table_name
+         )
+         registry.set(plan.table_name,
+           last_synced_at:       @start_time,
+           last_batch_synced_at: @start_time,
+           last_row_at:          @last_row_at
+         )
+       end
+     end
+   end
+ end
data/lib/sq/dbsync/config.rb ADDED
@@ -0,0 +1,12 @@
+ require 'sq/dbsync/loggers'
+
+ # Helper class to provide sane defaults to user-supplied config.
+ class Sq::Dbsync::Config
+   def self.make(hash)
+     {
+       clock:         ->{ Time.now.utc },
+       logger:        Sq::Dbsync::Loggers::Stream.new,
+       error_handler: ->(e) { $stderr.puts(e.message, e.backtrace) }
+     }.merge(hash)
+   end
+ end
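Since `Config.make` merges the user hash over the defaults, any caller-supplied key wins. A sketch, assuming the gem is loaded (`sources`/`target` shown empty for brevity):

``` ruby
config = Sq::Dbsync::Config.make(
  sources: {},
  target:  {},
  clock:   ->{ Time.at(0).utc } # e.g. a fixed clock for tests
)
config[:logger]     # => the default Loggers::Stream instance
config[:clock].call # => 1970-01-01 00:00:00 UTC
```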
data/lib/sq/dbsync/consistency_verifier.rb ADDED
@@ -0,0 +1,70 @@
+ require 'sq/dbsync/load_action' # For overlap, not ideal
+
+ module Sq::Dbsync
+
+   # Performs a cheap check to verify that the number of records present for a
+   # recent time slice is the same across source and target tables.
+   #
+   # This checks consistency on the current tables, not the new_ set.
+   class ConsistencyVerifier
+     def initialize(target, registry)
+       @target   = target
+       @registry = registry
+     end
+
+     def check_consistency!(tables)
+       tables.each do |tplan|
+         next unless tplan[:consistency]
+         verify_consistency!(tplan)
+       end
+     end
+
+     def verify_consistency!(tplan)
+       last_row_at = registry.get(tplan[:table_name])[:last_row_at]
+       return unless last_row_at
+
+       now = last_row_at - LoadAction.overlap
+
+       counts = [
+         tplan[:source_db],
+         target
+       ].map do |x|
+         x.consistency_check(tplan[:table_name], now)
+       end
+
+       delta = counts.reduce(:-)
+
+       unless delta == 0
+         raise ConsistencyError.new(
+           tplan[:table_name],
+           delta,
+           "source: #{tplan[:source_db].name} (count: #{counts[0]}), " +
+           "sink: #{target.name} (count: #{counts[1]})"
+         )
+       end
+     end
+
+     attr_reader :target, :registry
+
+     # Used to signal an observed error in the number of records between source
+     # and target tables. There are no current known situations in which this
+     # occurs, though in the past buggy handling of replication lag was normally
+     # the culprit.
+     #
+     # If it does occur, a good first response is to set `last_sync_time` to the
+     # last batch time (usually within 24 hours), which will force the
+     # incremental load to reconsider all recent records.
+     class ConsistencyError < RuntimeError
+       def initialize(table_name, delta, description = "")
+         @table_name  = table_name
+         @delta       = delta
+         @description = description
+       end
+
+       def message
+         output = "%s had a count difference of %i" % [@table_name, @delta]
+         output += "; " + @description unless @description.empty?
+         output
+       end
+     end
+   end
+ end
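Driving the verifier by hand looks roughly like this (a sketch; `target_db`, `registry`, and `plans` are assumed to be the objects the manager builds elsewhere):

``` ruby
verifier = Sq::Dbsync::ConsistencyVerifier.new(target_db, registry)

begin
  verifier.check_consistency!(plans) # skips plans without consistency: true
rescue Sq::Dbsync::ConsistencyVerifier::ConsistencyError => e
  $stderr.puts(e.message) # e.g. "users had a count difference of 3; source: ..."
end
```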
data/lib/sq/dbsync/database/common.rb ADDED
@@ -0,0 +1,91 @@
+ require 'sq/dbsync/tempfile_factory'
+
+ module Sq::Dbsync::Database
+   module Common
+
+     SQD = ::Sq::Dbsync
+
+     def extract_to_file(table_name, columns, file_name)
+       extract_sql_to_file("SELECT %s FROM %s" % [
+         columns.join(', '),
+         table_name
+       ], file_name)
+     end
+
+     def extract_incrementally_to_file(plan, file_name, last_row_at, overlap)
+       table_name = plan.source_table_name.to_sym
+       db_columns = db.schema(table_name).map(&:first)
+
+       query = self[table_name].select(*plan.columns)
+       if last_row_at
+         query = query.filter("#{plan.timestamp} > ?", last_row_at - overlap)
+       end
+
+       extract_sql_to_file(query.sql, file_name)
+     end
+
+     def hash_schema(table_name)
+       ensure_connection
+       Hash[schema(table_name)]
+     end
+
+     def name
+       self['SELECT database()'].first.fetch(:'database()')
+     end
+
+     # Since we go so long without using connections (during a batch load), they
+     # go stale and raise DatabaseDisconnectError when we try to use them. This
+     # method ensures that the connection is fresh even after a long time
+     # between drinks.
+     def ensure_connection
+       db.disconnect
+     end
+
+     def __getobj__
+       db
+     end
+
+     def __setobj__(db)
+       @db = db
+     end
+
+     protected
+
+     def execute!(cmd)
+       # psql doesn't return a non-zero error code when executing commands from
+       # a file. The best way I can come up with is to raise if anything is
+       # present on stderr.
+       errors_file = SQD::TempfileFactory.make('extract_sql_to_file_errors')
+
+       cmd = %{bash -c "#{cmd.gsub(/"/, '\\"')}"}
+
+       result = run_shell(cmd, errors_file)
+
+       unless result.exitstatus == 0 && File.size(errors_file.path) == 0
+         raise(ExtractError, "Command failed: #{cmd}")
+       end
+     ensure
+       errors_file.close! if errors_file
+     end
+
+     def sql_to_file(sql)
+       SQD::TempfileFactory.make_with_content('extract_sql_to_file', sql)
+     end
+
+     private
+
+     def run_shell(cmd, errors_file)
+       if RUBY_PLATFORM == 'java'
+         IO.popen4(cmd) { |_, _, _, stderr|
+           errors_file.write(stderr.read)
+           errors_file.flush
+         }
+         $?
+       else
+         pid = Process.spawn(cmd, STDERR => errors_file.path)
+         Process.waitpid2(pid)[1]
+       end
+     end
+
+   end
+ end
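The `overlap` parameter in `extract_incrementally_to_file` widens the extraction window backwards, so rows committed out of order near `last_row_at` are picked up again (which is presumably why the incremental load path upserts rather than blindly inserting). A worked example of the window arithmetic:

``` ruby
require 'time'

last_row_at = Time.utc(2013, 2, 23, 10, 0, 0)
overlap     = 60 # seconds; the real value is defined on LoadAction

# The generated WHERE clause is `timestamp > last_row_at - overlap`,
# i.e. everything after:
puts last_row_at - overlap # => 2013-02-23 09:59:00 UTC
```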
data/lib/sq/dbsync/database/connection.rb ADDED
@@ -0,0 +1,23 @@
+ require 'sequel/no_core_ext'
+
+ Sequel.default_timezone = :utc
+
+ require 'sq/dbsync/database/mysql'
+ require 'sq/dbsync/database/postgres'
+
+ module Sq::Dbsync::Database
+   # Factory class to abstract selection of a decorator, to facilitate
+   # databases other than MySQL.
+   class Connection
+     def self.create(opts)
+       case opts[:brand]
+       when 'mysql'
+         Sq::Dbsync::Database::Mysql.new(Sequel.connect(opts))
+       when 'postgresql'
+         Sq::Dbsync::Database::Postgres.new(Sequel.connect(opts))
+       else
+         raise "Unsupported database: #{opts.inspect}"
+       end
+     end
+   end
+ end
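The factory keys off the same `brand` option shown in the README config; the full hash is handed through to `Sequel.connect`. A sketch with placeholder credentials:

``` ruby
require 'sq/dbsync/database/connection'

# `brand` selects the decorator; the remaining keys (plus any
# Sequel-specific options your adapter needs) go to Sequel.connect.
source = Sq::Dbsync::Database::Connection.create(
  brand:    'postgresql',
  database: 'db_b_production',
  user:     'sqdbsync-ro',
  password: 'password',
  host:     'db-b-host',
  port:     5432
)
source.ensure_connection # refresh a possibly stale connection (see common.rb)
```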