cleansweep 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +201 -0
- data/Rakefile +3 -0
- data/cleansweep.gemspec +34 -0
- data/lib/clean_sweep/purge_runner/logging.rb +38 -0
- data/lib/clean_sweep/purge_runner/mysql_status.rb +82 -0
- data/lib/clean_sweep/purge_runner.rb +211 -0
- data/lib/clean_sweep/purge_stopped.rb +9 -0
- data/lib/clean_sweep/table_schema/column_schema.rb +23 -0
- data/lib/clean_sweep/table_schema/index_schema.rb +72 -0
- data/lib/clean_sweep/table_schema.rb +112 -0
- data/lib/clean_sweep/version.rb +3 -0
- data/lib/clean_sweep.rb +11 -0
- data/lib/cleansweep.rb +1 -0
- data/spec/factories/books.rb +36 -0
- data/spec/factories/comments.rb +26 -0
- data/spec/purge_runner_spec.rb +222 -0
- data/spec/spec_helper.rb +36 -0
- data/spec/table_schema_spec.rb +111 -0
- metadata +199 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 03e179a7416960bd102d09f4dd50f1c7f204fbc4
|
4
|
+
data.tar.gz: 559252c905e8b3302bf2eafe2d9774c574d322aa
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7f9019b2cf6b6fb339a5ac4ed32eb8146cb8b7742282da6cb7924e4c319414fb4307c0ca093b2bfcd8a26f58c768221183fd5cb85a576b40ab022b560802d28a
|
7
|
+
data.tar.gz: ae47587361ffe0ac2cfdca18f99095c259c0838442b22a6dccbaacc3224cccafe37f07ed99ed1bd3eb461f46261b187d52bdedd2bf03b3f0b3d4816e12b4a321
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bill Kayser
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
Cleansweep is a utility for scripting purges using ruby in an efficient, low-impact manner on
|
2
|
+
mysql innodb tables. Based on the Percona `pt-archiver` utility.
|
3
|
+
|
4
|
+
## Installation
|
5
|
+
|
6
|
+
Add this line to your application's Gemfile:
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
gem 'cleansweep'
|
10
|
+
```
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install cleansweep
|
19
|
+
|
20
|
+
## How it works
|
21
|
+
|
22
|
+
Consider the table:
|
23
|
+
```sql
|
24
|
+
create table comments (
|
25
|
+
`id` int(11) primary key auto_increment,
|
26
|
+
`timestamp` datetime,
|
27
|
+
`account` int(11),
|
28
|
+
`liked` boolean,
|
29
|
+
key comments_on_account_timestamp(account, timestamp)
|
30
|
+
)
|
31
|
+
```
|
32
|
+
Assume there is an active record model for it:
|
33
|
+
|
34
|
+
class Comment < ActiveRecord::Base ; end
|
35
|
+
|
36
|
+
### Purging by traversing an index
|
37
|
+
|
38
|
+
The most efficient way to work through a table is by scanning through an index one chunk
|
39
|
+
at a time.
|
40
|
+
|
41
|
+
Let's assume we want to purge Comments older than 1 month. We can
|
42
|
+
scan the primary key index or the `account`,`timestamp` index. In this case the latter will
|
43
|
+
probably work better since we are evaluating the timestamp for the purge.
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
r = CleanSweep::PurgeRunner.new model: Comment,
|
47
|
+
index: 'comments_on_account_timestamp' do | scope |
|
48
|
+
scope.where('timestamp < ?', 1.month.ago)
|
49
|
+
end
|
50
|
+
```
|
51
|
+
|
52
|
+
To execute the purge, do:
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
count = r.execute_in_batches
|
56
|
+
puts "Deleted #{count} rows"
|
57
|
+
```
|
58
|
+
|
59
|
+
Check what it will do:
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
r.print_queries($stdout)
|
63
|
+
```
|
64
|
+
|
65
|
+
This will show you what it will do by printing out the three different statements used:
|
66
|
+
|
67
|
+
```sql
|
68
|
+
Initial Query:
|
69
|
+
SELECT `id`,`account`,`timestamp`
|
70
|
+
FROM `comments` FORCE INDEX(comments_on_account_timestamp)
|
71
|
+
WHERE (timestamp < '2014-11-25 21:47:43')
|
72
|
+
ORDER BY `account` ASC,`timestamp` ASC
|
73
|
+
LIMIT 500
|
74
|
+
Chunk Query:
|
75
|
+
SELECT `id`,`account`,`timestamp`
|
76
|
+
FROM `comments` FORCE INDEX(comments_on_account_timestamp)
|
77
|
+
WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` > 0 OR (`account` = 0 AND `timestamp` > '2014-11-18 21:47:43'))\n ORDER BY `account` ASC,`timestamp` ASC
|
78
|
+
LIMIT 500
|
79
|
+
Delete Statement:
|
80
|
+
DELETE
|
81
|
+
FROM `comments`
|
82
|
+
WHERE (`id` = 2)
|
83
|
+
```
|
84
|
+
|
85
|
+
It does the initial statement once to get the first chunk of rows. Then it does subsequent queries
|
86
|
+
starting at the index where the last chunk left off, thereby avoiding a complete index scan. This works
|
87
|
+
fine as long as you don't have rows with duplicate account id and timestamps. If you do, you'll possibly
|
88
|
+
miss rows between chunks.
|
89
|
+
|
90
|
+
To avoid missing duplicates, you can traverse the index using only the first column with an inclusive comparator
|
91
|
+
like `>=` instead of `>`. Here's what that would look like:
|
92
|
+
|
93
|
+
```ruby
|
94
|
+
r = CleanSweep::PurgeRunner.new model:Comment,
|
95
|
+
index: 'comments_on_account_timestamp',
|
96
|
+
first_only: true do | scope |
|
97
|
+
scope.where('timestamp < ?', 1.month.ago)
|
98
|
+
end
|
99
|
+
```
|
100
|
+
|
101
|
+
The chunk query looks like:
|
102
|
+
|
103
|
+
```sql
|
104
|
+
SELECT `id`,`account`,`timestamp`
|
105
|
+
FROM `comments` FORCE INDEX(comments_on_account_timestamp)
|
106
|
+
WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` >= 0)
|
107
|
+
LIMIT 500
|
108
|
+
```
|
109
|
+
|
110
|
+
You can scan the index in either direction. To specify descending order, use the `reverse: true` option.
|
111
|
+
|
112
|
+
### Copying rows from one table to another
|
113
|
+
|
114
|
+
You can use the same technique to copy rows from one table to another. Support in CleanSweep is pretty
|
115
|
+
minimal. It won't _move_ rows, only copy them, although it would be easy to fix this.
|
116
|
+
I used this to copy ids into a temporary table which I then
|
117
|
+
used to delete later.
|
118
|
+
|
119
|
+
Here's an example that copies rows from the `Comment` model to the `ExpiredComment` model (`expired_comments`).
|
120
|
+
Comments older than one week are copied.
|
121
|
+
|
122
|
+
```ruby
|
123
|
+
copier = CleanSweep::PurgeRunner.new model: Comment,
|
124
|
+
index: 'comments_on_account_timestamp',
|
125
|
+
dest_model: ExpiredComment,
|
126
|
+
copy_columns: %w[liked] do | model |
|
127
|
+
model.where('last_used_at < ?', 1.week.ago)
|
128
|
+
end
|
129
|
+
```
|
130
|
+
|
131
|
+
The `copy_columns` option specifies additional columns to be inserted into the `expired_comments` table.
|
132
|
+
|
133
|
+
### Watching the history list and replication lag
|
134
|
+
|
135
|
+
You can enter thresholds for the history list size and replication lag that will be used to pause the
|
136
|
+
purge if either of those values get into an unsafe territory. The script will pause for 5 minutes and
|
137
|
+
only start once the corresponding metric goes back down to 90% of the specified threshold.
|
138
|
+
|
139
|
+
### Logging and monitoring progress
|
140
|
+
|
141
|
+
You pass in a standard log instance to capture all running output. By default it will log to your
|
142
|
+
`ActiveRecord::Base` logger, or stdout if that's not set up.
|
143
|
+
|
144
|
+
If you specify a reporting interval
|
145
|
+
with the `report` option it will print the status of the purge at that interval. This is useful to track
|
146
|
+
progress and assess the rate of deletion.
|
147
|
+
|
148
|
+
### Joins and subqueries
|
149
|
+
|
150
|
+
You can add subqueries and joins to your query in the scope block, but be careful. The index and order
|
151
|
+
clause may work against you if the table you are joining with doesn't have good parity with the indexes
|
152
|
+
in your target table.
|
153
|
+
|
154
|
+
### Limitations
|
155
|
+
|
156
|
+
* Only works for mysql (as far as I know). I have only used it against 5.5.
|
157
|
+
* Should work with ActiveRecord 3.* - 4.*.
|
158
|
+
* Using a non-unique index risks missing duplicate rows unless you use the `first_only` option.
|
159
|
+
* Using the `first_only` option risks rescanning many rows if you have many more duplicates than your
|
160
|
+
chunk size
|
161
|
+
* An index is required but you should be able to run a purge without one. It just means you're not
|
162
|
+
scanning the index in chunks. This might be okay if you are deleting everything as you go along because
|
163
|
+
then you're not rescanning the rows. It wouldn't require much to modify CleanSweep to support this
|
164
|
+
mode.
|
165
|
+
|
166
|
+
### Other options
|
167
|
+
|
168
|
+
There are a number of other options you can use to tune the script. For details look at the
|
169
|
+
[API on the `PurgeRunner` class](http://bkayser.github.io/cleansweep/rdoc/CleanSweep/PurgeRunner.html)
|
170
|
+
|
171
|
+
### NewRelic integration
|
172
|
+
|
173
|
+
The script requires the [New Relic](http://github.com/newrelic/rpm) gem. It won't impact anything if you
|
174
|
+
don't have a New Relic account to report to, but if you do use New Relic it is configured to show you
|
175
|
+
detailed metrics. I recommend turning off transaction traces for long purge jobs to reduce your memory
|
176
|
+
footprint.
|
177
|
+
|
178
|
+
## Testing
|
179
|
+
|
180
|
+
To run the specs, start a local mysql instance. The default user is root with an empty password.
|
181
|
+
Override the user/password with environment variables `DB_USER` and `DB_PASSWORD`. The test
|
182
|
+
creates a db called 'cstest'.
|
183
|
+
|
184
|
+
## Contributing
|
185
|
+
|
186
|
+
1. Fork it ( https://github.com/bkayser/cleansweep/fork )
|
187
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
188
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
189
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
190
|
+
5. Create a new Pull Request
|
191
|
+
|
192
|
+
## License and Copyright
|
193
|
+
|
194
|
+
Copyright 2014 New Relic, Inc., and Bill Kayser
|
195
|
+
|
196
|
+
Covered by the MIT [LICENSE](LICENSE.txt).
|
197
|
+
|
198
|
+
### Credits
|
199
|
+
|
200
|
+
This was all inspired and informed by [Percona's `pt-archiver` script](http://www.percona.com/doc/percona-toolkit/2.1/pt-archiver.html)
|
201
|
+
written by Baron Schwartz.
|
data/Rakefile
ADDED
data/cleansweep.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for cleansweep: a low-impact purge/archive utility for
# mysql innodb tables.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'clean_sweep/version'

Gem::Specification.new do |spec|
  spec.name          = "cleansweep"
  spec.version       = CleanSweep::VERSION
  spec.authors       = ["Bill Kayser"]
  spec.email         = ["bkayser@newrelic.com"]
  spec.summary       = %q{Utility to purge or archive rows in mysql tables}
  # BUGFIX: the Percona tool this gem is modeled on is `pt-archiver`,
  # not `pt-archive` (see the README credits link).
  spec.description   = <<-EOF
    Purge data from mysql innodb tables efficiently with low overhead and impact.
    Based on the Percona pt-archiver utility.
  EOF
  spec.homepage      = "http://github.com/bkayser/cleansweep"
  spec.license       = "MIT"

  # Package every git-tracked file; executables come from bin/, tests from spec/.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^spec/})
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency 'activerecord', '~>3'
  spec.add_runtime_dependency 'newrelic_rpm', '~>3'
  spec.add_runtime_dependency 'mysql2', '~> 0.3.17'

  spec.add_development_dependency 'pry', '~>0'
  spec.add_development_dependency 'bundler', '~> 1.7'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec', '~> 3.1'
  spec.add_development_dependency 'factory_girl', '~> 4.4'
  spec.add_development_dependency 'awesome_print', '~>1.2'
end
@@ -0,0 +1,38 @@
|
|
1
|
+
# Progress reporting and logging helpers mixed into CleanSweep::PurgeRunner.
# Relies on host ivars: @report_interval, @report_interval_start, @start,
# @total_deleted, @dry_run, @model, @logger, and the host's copy_mode?.
module CleanSweep::PurgeRunner::Logging

  # Emit a progress report when the reporting interval has elapsed, or
  # unconditionally when force is true.
  def report(force=false)
    interval_elapsed = Time.now - @report_interval_start
    return unless force || interval_elapsed >= @report_interval

    # Advance the window start past any intervals we already consumed so the
    # next report waits a full interval.
    @report_interval_start += @report_interval while @report_interval_start < Time.now - @report_interval

    elapsed = [1, (Time.now - @start).to_i].max
    per_second = (@total_deleted / elapsed).to_i
    rate = "#{per_second > 0 ? '%12i' % per_second : ('%12s' %'< 1')} records/second"

    # Same string either way except the verb; dry runs only query.
    verb = @dry_run ? 'queried' : (copy_mode? ? 'copied' : 'deleted')

    lines = []
    lines << "report:"
    lines << "  #{verb}: #{'%12i' % @total_deleted} #{@model.table_name} records"
    lines << "  elapsed: #{'%12s' % format(elapsed)}"
    lines << "  rate: #{rate}"
    log :info, lines.join("\n")
  end

  # Send msg to the logger at the given level, prefixing every line so purge
  # output stands out ("***" for debug, "**" otherwise).
  def log level, msg
    prefix = level == :debug ? " *** " : " ** "
    decorated = msg.split("\n").map { |line| prefix + line }.join("\n")
    @logger.send level, decorated
  end

  # Render an elapsed duration in seconds as HH:MM:SS, switching to a
  # "days, HH:MM" form past 24 hours.
  # NOTE(review): shadows Kernel#format inside this module.
  def format(time)
    pattern = time.to_i > (24 * 60 * 60) ? "%d days, %H:%M" : "%H:%M:%S"
    Time.at(time).strftime(pattern)
  end
end
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'newrelic_rpm'

# Watches mysql server health — InnoDB history list length and replication
# lag — and pauses the purge while either exceeds its configured limit.
class CleanSweep::PurgeRunner::MysqlStatus

  # Options:
  # logger, model, max_history, max_repl_lag, check_period
  def initialize(options={})
    @logger              = options[:logger] || ActiveRecord::Base.logger
    @model               = options[:model]
    @max_history         = options[:max_history]
    @max_replication_lag = options[:max_repl_lag]
    @check_period        = options[:check_period] || 2.minutes
    # Backdate so the very first check! actually runs.
    @last_check          = @check_period.ago
  end

  # If the check period has elapsed, evaluate the thresholds and sleep in
  # 5-minute intervals until every violation clears.
  def check!
    return if Time.now - @check_period < @last_check
    loop do
      violations = get_violations
      break unless violations.any?
      unless paused?
        details = violations.to_a.map { |key, value| "#{key} = #{value}" }.join(", ")
        @logger.warn("pausing 5 minutes (#{details})")
      end
      @paused = true
      pause 5.minutes
    end
    @logger.info("violations clear") if paused?
    all_clear!
  end

  # Map of metric label => current reading for each metric over its threshold.
  def get_violations
    violations = {}
    if @max_history
      reading = get_history_length
      violations["history length"] = reading if threshold(@max_history) < reading
    end
    if @max_replication_lag
      reading = get_replication_lag
      violations["replication lag"] = reading if threshold(@max_replication_lag) < reading
    end
    violations
  end

  # Return the threshold to use in the check. If we are already failing, don't
  # start up again until we've recovered at least 10%
  def threshold(value)
    paused? ? 0.90 * value : value
  end

  def pause time
    Kernel.sleep time
  end
  # NOTE(review): assumes NewRelic's MethodTracer is available on this class
  # (presumably mixed in elsewhere in the gem) — confirm.
  add_method_tracer :pause

  def paused?
    @paused
  end

  # Reset the pause flag and restart the check-period clock.
  def all_clear!
    @last_check = Time.now
    @paused = nil
  end

  # Seconds-behind-master taken from column 32 of SHOW SLAVE STATUS; 0 when
  # the server is not a replica.
  # NOTE(review): that column can be NULL when replication is stopped — the
  # comparison in get_violations would then raise; confirm against usage.
  def get_replication_lag
    rows = @model.connection.select_rows 'SHOW SLAVE STATUS'
    return 0 if rows.nil? || rows.empty?
    rows[0][32]
  end


  # Scrape the history list length out of the SHOW ENGINE INNODB STATUS text.
  def get_history_length
    rows = @model.connection.select_rows <<-EOF
        show engine innodb status
    EOF
    status_string = rows.first[2]
    /History list length ([0-9]+)/.match(status_string)[1].to_i
  end

end
@@ -0,0 +1,211 @@
|
|
1
|
+
require 'stringio'

# This is a utility built to mimic some of the features of the pt_archiver
# script to make really big purges go faster with no production impact.
#
# It uses a strategy of descending an index, querying for purgeable ids and
# then deleting them in batches.
#
# === Required Options
#
# [:model]
#   Required: the Active Record model for the table being purged or copied from
#
# === Optional Options
#
# [:chunk_size]
#   The number of rows to copy in each block. Defaults to 500.
# [:index]
#   The index to traverse in ascending order doing the purge. Rows are read in
#   the order of the index, which must be a btree index. If not specified,
#   <tt>PRIMARY</tt> is assumed.
# [:reverse]
#   Traverse the index in reverse order.
# [:first_only]
#   Traverse only the first column of the index, inclusively (<tt>>=</tt>
#   instead of <tt>></tt>). Important for non-unique indexes with many
#   duplicates; not allowed in copy mode because it would insert duplicates.
# [:dry_run]
#   Print out the queries that are going to be used. You should run explain
#   on these.
# [:stop_after]
#   The operation will end after copying this many rows.
# [:report]
#   Interval in seconds between status messages being printed out.
# [:logger]
#   The log instance to use. Defaults to <tt>ActiveRecord::Base.logger</tt>
#   if not nil, otherwise it uses _$stdout_
# [:dest_model]
#   When present nothing is deleted; instead rows are copied to the table for
#   this model, which must have identically named columns. By default only
#   columns in the named index and primary key are copied; augment with
#   <tt>copy_columns</tt>.
# [:copy_columns]
#   Extra columns to add when copying to a dest model.
#
# === Safety thresholds
# [:sleep]
#   Time in seconds to sleep between each chunk.
# [:max_history]
#   Pause when the innodb history list exceeds this size, until it drops below
#   90% of this value.
# [:max_repl_lag]
#   Pause when replication lag exceeds this many seconds, until it drops below
#   90% of this value.

class CleanSweep::PurgeRunner

  require 'clean_sweep/purge_runner/logging'
  require 'clean_sweep/purge_runner/mysql_status'

  include CleanSweep::PurgeRunner::Logging

  # This helps us track the state of replication and history list and pause
  # if necessary
  attr_reader :mysql_status

  def initialize(options={})
    @model = options[:model] or raise "source model class required"
    @limit = options[:chunk_size] || 500

    @target_model    = options[:dest_model]
    @stop_after      = options[:stop_after]
    @report_interval = options[:report] || 10.seconds
    @logger          = options[:logger] || ActiveRecord::Base.logger || Logger.new($stdout)
    @dry_run         = options[:dry_run]
    @sleep           = options[:sleep]

    @max_history  = options[:max_history]
    @max_repl_lag = options[:max_repl_lag]

    @table_schema = CleanSweep::TableSchema.new @model,
                                               key_name: options[:index],
                                               ascending: !options[:reverse],
                                               extra_columns: options[:copy_columns],
                                               first_only: options[:first_only]

    if (@max_history || @max_repl_lag)
      @mysql_status = CleanSweep::PurgeRunner::MysqlStatus.new model: @model,
                                                              max_history: @max_history,
                                                              max_repl_lag: @max_repl_lag,
                                                              check_period: options[:check_period],
                                                              logger: @logger
    end

    raise "You can't copy rows from a table into itself" if copy_mode? && @model == @target_model
    raise "An index is required in copy mode" if copy_mode? && @table_schema.traversing_key.nil?
    raise "first_only option not allowed in copy mode" if copy_mode? && @table_schema.first_only?

    @report_interval_start = Time.now

    @query = @table_schema.initial_scope.limit(@limit)

    # Let the caller narrow the scope (e.g. add the purge condition).
    @query = yield(@query) if block_given?
  end


  def copy_mode?
    @target_model.present?
  end

  # Execute the purge in chunks according to the parameters given on instance creation.
  # Will raise <tt>CleanSweep::PurgeStopped</tt> if a <tt>stop_after</tt> option was provided and
  # that limit is hit.
  #
  # Returns the number of rows copied or deleted.
  #
  def execute_in_batches

    if @dry_run
      # BUGFIX: this was `print_queries($stdout) and return 0 if @dry_run`.
      # print_queries ends with io.puts (nil), so `and return 0` never fired
      # and a dry run fell through and executed the real purge.
      print_queries($stdout)
      return 0
    end

    @start = Time.now
    verb = copy_mode? ? "copying" : "purging"

    msg = "starting: #{verb} #{@table_schema.name} records in batches of #@limit"
    msg << " to #{@target_model.table_name}" if copy_mode?
    # BUGFIX: msg was built but never emitted; log it to match the
    # "completed after ..." message at the end.
    log :info, msg

    log :info, "sleeping #{@sleep} seconds between purging" if @sleep && !copy_mode?
    @total_deleted = 0

    # Iterate through the rows in limit chunks
    log :debug, "find rows: #{@query.to_sql}" if @logger.level == Logger::DEBUG

    @mysql_status.check! if @mysql_status

    rows = NewRelic::Agent.with_database_metric_name(@model.name, 'SELECT') do
      @model.connection.select_rows @query.to_sql
    end
    while rows.any? && (!@stop_after || @total_deleted < @stop_after) do
      log :debug, "#{verb} #{rows.size} records between #{rows.first.inspect} and #{rows.last.inspect}" if @logger.level == Logger::DEBUG
      stopped = @stop_after && rows.size + @total_deleted > @stop_after

      # Trim the final chunk so we process exactly stop_after rows.
      rows = rows.first(@stop_after - @total_deleted) if stopped
      last_row = rows.last
      if copy_mode?
        metric_op_name = 'INSERT'
        statement = @table_schema.insert_statement(@target_model, rows)
      else
        metric_op_name = 'DELETE'
        statement = @table_schema.delete_statement(rows)
      end
      log :debug, statement if @logger.level == Logger::DEBUG
      # NOTE(review): passes the model class here (nil in purge mode) where the
      # SELECT above passes @model.name — confirm what NewRelic expects.
      chunk_deleted = NewRelic::Agent.with_database_metric_name(@target_model, metric_op_name) do
        @model.connection.update statement
      end

      @total_deleted += chunk_deleted
      raise CleanSweep::PurgeStopped.new("stopped after #{verb} #{@total_deleted} #{@model} records", @total_deleted) if stopped
      q = @table_schema.scope_to_next_chunk(@query, last_row).to_sql
      log :debug, "find rows: #{q}" if @logger.level == Logger::DEBUG

      sleep @sleep if @sleep && !copy_mode?
      @mysql_status.check! if @mysql_status

      rows = NewRelic::Agent.with_database_metric_name(@model, 'SELECT') do
        @model.connection.select_rows(q)
      end
      report
    end
    report(true)
    if copy_mode?
      log :info, "completed after #{verb} #{@total_deleted} #{@table_schema.name} records to #{@target_model.table_name}"
    else
      log :info, "completed after #{verb} #{@total_deleted} #{@table_schema.name} records"
    end

    return @total_deleted
  end

  # Instance-level wrapper so the sleep can be traced by New Relic.
  def sleep duration
    Kernel.sleep duration
  end

  add_method_tracer :sleep
  add_method_tracer :execute_in_batches

  # Print the initial query, the chunk query, and the insert or delete
  # statement to the given io stream, nicely indented.
  def print_queries(io)
    io.puts 'Initial Query:'
    io.puts format_query('    ', @query.to_sql)
    rows = @model.connection.select_rows @query.limit(1).to_sql
    io.puts "Chunk Query:"
    io.puts format_query('    ', @table_schema.scope_to_next_chunk(@query, rows.first).to_sql)
    if copy_mode?
      io.puts "Insert Statement:"
      io.puts format_query('    ', @table_schema.insert_statement(@target_model, rows))
    else
      io.puts "Delete Statement:"
      io.puts format_query('    ', @table_schema.delete_statement(rows))
    end
  end

  private

  # Break a SQL statement at major clause keywords and indent each line.
  def format_query indentation, query
    lines = query.split(/ (?=values|from|where|order|limit)/i)
    lines.map {|line| indentation + line.strip }.join("\n")
  end
end
@@ -0,0 +1,23 @@
|
|
1
|
+
# Wraps a single column of a model's table: knows its position in the chunk
# SELECT list and how to quote its name and values for SQL.
class CleanSweep::TableSchema::ColumnSchema

  # Column name as a symbol.
  attr_reader :name
  # Index of this column within the SELECT list; assigned externally.
  attr_accessor :select_position

  def initialize(name, model)
    @name = name.to_sym
    position = model.column_names.index(name.to_s)
    raise "Can't find #{name} in #{model.name}" unless position
    @model = model
    @column = model.columns[position]
  end

  # Backtick-quoted column name for embedding in SQL.
  def quoted_name
    "`#{name}`"
  end

  # Raw value for this column pulled out of a result row.
  def value(row)
    row[select_position]
  end

  # SQL-quoted value for this column pulled out of a result row.
  def quoted_value(row)
    @model.quote_value(value(row), @column)
  end
end