cleansweep 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +201 -0
- data/Rakefile +3 -0
- data/cleansweep.gemspec +34 -0
- data/lib/clean_sweep/purge_runner/logging.rb +38 -0
- data/lib/clean_sweep/purge_runner/mysql_status.rb +82 -0
- data/lib/clean_sweep/purge_runner.rb +211 -0
- data/lib/clean_sweep/purge_stopped.rb +9 -0
- data/lib/clean_sweep/table_schema/column_schema.rb +23 -0
- data/lib/clean_sweep/table_schema/index_schema.rb +72 -0
- data/lib/clean_sweep/table_schema.rb +112 -0
- data/lib/clean_sweep/version.rb +3 -0
- data/lib/clean_sweep.rb +11 -0
- data/lib/cleansweep.rb +1 -0
- data/spec/factories/books.rb +36 -0
- data/spec/factories/comments.rb +26 -0
- data/spec/purge_runner_spec.rb +222 -0
- data/spec/spec_helper.rb +36 -0
- data/spec/table_schema_spec.rb +111 -0
- metadata +199 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 03e179a7416960bd102d09f4dd50f1c7f204fbc4
|
4
|
+
data.tar.gz: 559252c905e8b3302bf2eafe2d9774c574d322aa
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7f9019b2cf6b6fb339a5ac4ed32eb8146cb8b7742282da6cb7924e4c319414fb4307c0ca093b2bfcd8a26f58c768221183fd5cb85a576b40ab022b560802d28a
|
7
|
+
data.tar.gz: ae47587361ffe0ac2cfdca18f99095c259c0838442b22a6dccbaacc3224cccafe37f07ed99ed1bd3eb461f46261b187d52bdedd2bf03b3f0b3d4816e12b4a321
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bill Kayser
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
Cleansweep is a utility for scripting purges using ruby in an efficient, low-impact manner on
|
2
|
+
mysql innodb tables. Based on the Percona `pt-archiver` utility.
|
3
|
+
|
4
|
+
## Installation
|
5
|
+
|
6
|
+
Add this line to your application's Gemfile:
|
7
|
+
|
8
|
+
```ruby
|
9
|
+
gem 'cleansweep'
|
10
|
+
```
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install cleansweep
|
19
|
+
|
20
|
+
## How it works
|
21
|
+
|
22
|
+
Consider the table:
|
23
|
+
```sql
|
24
|
+
create table comments (
|
25
|
+
`id` int(11) primary key auto_increment,
|
26
|
+
`timestamp` datetime,
|
27
|
+
`account` int(11),
|
28
|
+
`liked` boolean,
|
29
|
+
key comments_on_account_timestamp(account, timestamp)
|
30
|
+
)
|
31
|
+
```
|
32
|
+
Assume there is an active record model for it:
|
33
|
+
|
34
|
+
class Comment < ActiveRecord::Base ; end
|
35
|
+
|
36
|
+
### Purging by traversing an index
|
37
|
+
|
38
|
+
The most efficient way to work through a table is by scanning through an index one chunk
|
39
|
+
at a time.
|
40
|
+
|
41
|
+
Let's assume we want to purge Comments older than 1 month. We can
|
42
|
+
scan the primary key index or the `account`,`timestamp` index. In this case the latter will
|
43
|
+
probably work better since we are evaluating the timestamp for the purge.
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
r = CleanSweep::PurgeRunner.new model: Comment,
|
47
|
+
index: 'comments_on_account_timestamp' do | scope |
|
48
|
+
scope.where('timestamp < ?', 1.month.ago)
|
49
|
+
end
|
50
|
+
```
|
51
|
+
|
52
|
+
To execute the purge, do:
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
count = r.execute_in_batches
|
56
|
+
puts "Deleted #{count} rows"
|
57
|
+
```
|
58
|
+
|
59
|
+
Check what it will do:
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
r.print_queries($stdout)
|
63
|
+
```
|
64
|
+
|
65
|
+
This will show you what it will do by printing out the three different statements used:
|
66
|
+
|
67
|
+
```sql
|
68
|
+
Initial Query:
|
69
|
+
SELECT `id`,`account`,`timestamp`
|
70
|
+
FROM `comments` FORCE INDEX(comments_on_account_timestamp)
|
71
|
+
WHERE (timestamp < '2014-11-25 21:47:43')
|
72
|
+
ORDER BY `account` ASC,`timestamp` ASC
|
73
|
+
LIMIT 500
|
74
|
+
Chunk Query:
|
75
|
+
SELECT `id`,`account`,`timestamp`
|
76
|
+
FROM `comments` FORCE INDEX(comments_on_account_timestamp)
|
77
|
+
WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` > 0 OR (`account` = 0 AND `timestamp` > '2014-11-18 21:47:43'))\n ORDER BY `account` ASC,`timestamp` ASC
|
78
|
+
LIMIT 500
|
79
|
+
Delete Statement:
|
80
|
+
DELETE
|
81
|
+
FROM `comments`
|
82
|
+
WHERE (`id` = 2)
|
83
|
+
```
|
84
|
+
|
85
|
+
It does the initial statement once to get the first chunk of rows. Then it does subsequent queries
|
86
|
+
starting at the index where the last chunk left off, thereby avoiding a complete index scan. This works
|
87
|
+
fine as long as you don't have rows with duplicate account id and timestamps. If you do, you'll possibly
|
88
|
+
miss rows between chunks.
|
89
|
+
|
90
|
+
To avoid missing duplicates, you can traverse the index using only the first column with an inclusive comparator
|
91
|
+
like `>=` instead of `>`. Here's what that would look like:
|
92
|
+
|
93
|
+
```ruby
|
94
|
+
r = CleanSweep::PurgeRunner.new model:Comment,
|
95
|
+
index: 'comments_on_account_timestamp',
|
96
|
+
first_only: true do | scope |
|
97
|
+
scope.where('timestamp < ?', 1.month.ago)
|
98
|
+
end
|
99
|
+
```
|
100
|
+
|
101
|
+
The chunk query looks like:
|
102
|
+
|
103
|
+
```sql
|
104
|
+
SELECT `id`,`account`,`timestamp`
|
105
|
+
FROM `comments` FORCE INDEX(comments_on_account_timestamp)
|
106
|
+
WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` >= 0)
|
107
|
+
LIMIT 500
|
108
|
+
```
|
109
|
+
|
110
|
+
You can scan the index in either direction. To specify descending order, use the `reverse: true` option.
|
111
|
+
|
112
|
+
### Copying rows from one table to another
|
113
|
+
|
114
|
+
You can use the same technique to copy rows from one table to another. Support in CleanSweep is pretty
|
115
|
+
minimal. It won't _move_ rows, only copy them, although it would be easy to fix this.
|
116
|
+
I used this to copy ids into a temporary table which I then
|
117
|
+
used to delete later.
|
118
|
+
|
119
|
+
Here's an example that copies rows from the `Comment` model to the `ExpiredComment` model (`expired_comments`).
|
120
|
+
Comments older than one week are copied.
|
121
|
+
|
122
|
+
```ruby
|
123
|
+
copier = CleanSweep::PurgeRunner.new model: Comment,
|
124
|
+
index: 'comments_on_account_timestamp',
|
125
|
+
dest_model: ExpiredComment,
|
126
|
+
copy_columns: %w[liked] do | model |
|
127
|
+
model.where('last_used_at < ?', 1.week.ago)
|
128
|
+
end
|
129
|
+
```
|
130
|
+
|
131
|
+
The `copy_columns` option specifies additional columns to be inserted into the `expired_comments` table.
|
132
|
+
|
133
|
+
### Watching the history list and replication lag
|
134
|
+
|
135
|
+
You can enter thresholds for the history list size and replication lag that will be used to pause the
|
136
|
+
purge if either of those values get into an unsafe territory. The script will pause for 5 minutes and
|
137
|
+
only start once the corresponding metric goes back down to 90% of the specified threshold.
|
138
|
+
|
139
|
+
### Logging and monitoring progress
|
140
|
+
|
141
|
+
You pass in a standard log instance to capture all running output. By default it will log to your
|
142
|
+
`ActiveRecord::Base` logger, or stdout if that's not set up.
|
143
|
+
|
144
|
+
If you specify a reporting interval
|
145
|
+
with the `report` option it will print the status of the purge at that interval. This is useful to track
|
146
|
+
progress and assess the rate of deletion.
|
147
|
+
|
148
|
+
### Joins and subqueries
|
149
|
+
|
150
|
+
You can add subqueries and joins to your query in the scope block, but be careful. The index and order
|
151
|
+
clause may work against you if the table you are joining with doesn't have good parity with the indexes
|
152
|
+
in your target table.
|
153
|
+
|
154
|
+
### Limitations
|
155
|
+
|
156
|
+
* Only works for mysql (as far as I know). I have only used it against 5.5.
|
157
|
+
* Should work with ActiveRecord 3.* - 4.*.
|
158
|
+
* Using a non-unique index risks missing duplicate rows unless you use the `first_only` option.
|
159
|
+
* Using the `first_only` option risks rescanning many rows if you have many more duplicates than your
|
160
|
+
chunk size
|
161
|
+
* An index is required but you should be able to run a purge without one. It just means you're not
|
162
|
+
scanning the index in chunks. This might be okay if you are deleting everything as you go along because
|
163
|
+
then you're not rescanning the rows. It wouldn't require much to modify CleanSweep to support this
|
164
|
+
mode.
|
165
|
+
|
166
|
+
### Other options
|
167
|
+
|
168
|
+
There are a number of other options you can use to tune the script. For details look at the
|
169
|
+
[API on the `PurgeRunner` class](http://bkayser.github.io/cleansweep/rdoc/CleanSweep/PurgeRunner.html)
|
170
|
+
|
171
|
+
### NewRelic integration
|
172
|
+
|
173
|
+
The script requires the [New Relic](http://github.com/newrelic/rpm) gem. It won't impact anything if you
|
174
|
+
don't have a New Relic account to report to, but if you do use New Relic it is configured to show you
|
175
|
+
detailed metrics. I recommend turning off transaction traces for long purge jobs to reduce your memory
|
176
|
+
footprint.
|
177
|
+
|
178
|
+
## Testing
|
179
|
+
|
180
|
+
To run the specs, start a local mysql instance. The default user is root with an empty password.
|
181
|
+
Override the user/password with environment variables `DB_USER` and `DB_PASSWORD`. The test
|
182
|
+
creates a db called 'cstest'.
|
183
|
+
|
184
|
+
## Contributing
|
185
|
+
|
186
|
+
1. Fork it ( https://github.com/bkayser/cleansweep/fork )
|
187
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
188
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
189
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
190
|
+
5. Create a new Pull Request
|
191
|
+
|
192
|
+
## License and Copyright
|
193
|
+
|
194
|
+
Copyright 2014 New Relic, Inc., and Bill Kayser
|
195
|
+
|
196
|
+
Covered by the MIT [LICENSE](LICENSE.txt).
|
197
|
+
|
198
|
+
### Credits
|
199
|
+
|
200
|
+
This was all inspired and informed by [Percona's `pt-archiver` script](http://www.percona.com/doc/percona-toolkit/2.1/pt-archiver.html)
|
201
|
+
written by Baron Schwartz.
|
data/Rakefile
ADDED
data/cleansweep.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'clean_sweep/version'

# Packaging metadata for the cleansweep gem.
Gem::Specification.new do |s|
  s.name          = "cleansweep"
  s.version       = CleanSweep::VERSION
  s.authors       = ["Bill Kayser"]
  s.email         = ["bkayser@newrelic.com"]
  s.summary       = %q{Utility to purge or archive rows in mysql tables}
  s.description   = <<-EOF
    Purge data from mysql innodb tables efficiently with low overhead and impact.
    Based on the Percona pt-archive utility.
  EOF
  s.homepage      = "http://github.com/bkayser/cleansweep"
  s.license       = "MIT"

  # Everything tracked by git ships in the gem.
  s.files         = `git ls-files -z`.split("\x0")
  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.test_files    = s.files.grep(%r{^spec/})
  s.require_paths = ["lib"]

  # Runtime dependencies.
  s.add_runtime_dependency 'activerecord', '~>3'
  s.add_runtime_dependency 'newrelic_rpm', '~>3'
  s.add_runtime_dependency 'mysql2', '~> 0.3.17'

  # Development-only dependencies.
  s.add_development_dependency 'pry', '~>0'
  s.add_development_dependency 'bundler', '~> 1.7'
  s.add_development_dependency 'rake', '~> 10.0'
  s.add_development_dependency 'rspec', '~> 3.1'
  s.add_development_dependency 'factory_girl', '~> 4.4'
  s.add_development_dependency 'awesome_print', '~>1.2'
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module CleanSweep::PurgeRunner::Logging

  # Print a progress report to the log if a full report interval has
  # elapsed since the last report (or immediately when +force+ is true).
  # Shows rows queried/copied/deleted so far, elapsed time, and rate.
  def report(force=false)
    report_duration = Time.now - @report_interval_start
    if (force || report_duration >= @report_interval)
      # Advance the interval start past any intervals we fell behind on so
      # the next report isn't emitted immediately.
      while (@report_interval_start < Time.now - @report_interval) do
        @report_interval_start += @report_interval
      end
      report = []
      # Clamp to 1 second to avoid a divide-by-zero on the first report.
      elapsed = [1, (Time.now - @start).to_i].max
      rate = (@total_deleted / elapsed).to_i
      rate = "#{rate > 0 ? '%12i' % rate : ('%12s' %'< 1')} records/second"
      report << "report:"
      if copy_mode?
        report << " #{@dry_run ? 'queried' : 'copied'}: #{'%12i' % @total_deleted} #{@model.table_name} records"
      else
        report << " #{@dry_run ? 'queried' : 'deleted'}: #{'%12i' % @total_deleted} #{@model.table_name} records"
      end
      report << " elapsed: #{'%12s' % format(elapsed)}"
      report << " rate: #{rate}"
      log :info, report.join("\n")
    end
  end

  # Send +msg+ to the logger at +level+, prefixing every line so the purge
  # output stands out in a shared application log.
  def log level, msg
    prefix = level == :debug ? " *** " : "  ** "
    out = msg.split("\n").map {|line| prefix + line}.join("\n")
    @logger.send level, out
  end

  # Format an elapsed duration (seconds) as "HH:MM:SS", or "D days, HH:MM"
  # when it exceeds one day.
  #
  # Fixed: the original used Time.at(time).strftime, which rendered the
  # epoch's day-of-month for the day count (25 hours printed "02 days",
  # off by one) and was also dependent on the local time zone.  The fields
  # are now computed arithmetically, which has neither problem.
  def format(time)
    secs = time.to_i
    if (secs > (24 * 60 * 60))
      days, secs = secs.divmod(24 * 60 * 60)
      '%d days, %02d:%02d' % [days, secs / 3600, (secs % 3600) / 60]
    else
      '%02d:%02d:%02d' % [secs / 3600, (secs % 3600) / 60, secs % 60]
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'newrelic_rpm'

# Monitors the health of the mysql server while a purge runs.  When the
# innodb history list length or the slave replication lag climbs past a
# configured threshold, check! blocks (sleeping in 5 minute increments)
# until the offending metric recovers.
class CleanSweep::PurgeRunner::MysqlStatus

  # Options:
  #    logger, model, max_history, max_repl_lag, check_period
  def initialize(options={})
    @logger = options[:logger] || ActiveRecord::Base.logger
    @model = options[:model]
    @max_history = options[:max_history]
    @max_replication_lag = options[:max_repl_lag]
    @check_period = options[:check_period] || 2.minutes
    # Backdate the last check so the very first check! actually runs.
    @last_check = @check_period.ago
  end

  # Check the thresholds and pause until they clear.  Throttled: does
  # nothing if called again within the check period.
  def check!
    return if Time.now - @check_period < @last_check
    while (v = get_violations).any? do
      @logger.warn("pausing 5 minutes (#{v.to_a.map{ |key, value| "#{key} = #{value}"}.join(", ")})") if !paused?
      @paused = true
      pause 5.minutes
    end
    @logger.info("violations clear") if paused?
    all_clear!
  end

  # Returns a hash of metric name => current value for each metric that is
  # over its (possibly reduced, see #threshold) limit.  Empty when healthy.
  def get_violations
    violations = {}
    if @max_history
      current = get_history_length
      violations["history length"] = current if threshold(@max_history) < current
    end
    if @max_replication_lag
      current = get_replication_lag
      violations["replication lag"] = current if threshold(@max_replication_lag) < current
    end
    return violations
  end

  # Return the threshold to use in the check. If we are already failing, don't
  # start up again until we've recovered at least 10%
  #
  # Fixed: the original assigned the reduced value to a local in the paused
  # branch (`value = 0.90 * value`) but never used the assignment; a plain
  # conditional expression is equivalent and clearer.
  def threshold(value)
    paused? ? 0.90 * value : value
  end

  # Sleep for the given duration; isolated so specs can stub it and so
  # New Relic can time it.
  def pause time
    Kernel.sleep time
  end
  add_method_tracer :pause

  def paused?
    @paused
  end

  # Reset the pause flag and restart the throttle clock.
  def all_clear!
    @last_check = Time.now
    @paused = nil
  end

  # Seconds the slave is behind the master, or 0 when this server is not a
  # replica.
  def get_replication_lag
    rows = @model.connection.select_rows 'SHOW SLAVE STATUS'
    if rows.nil? || rows.empty?
      return 0
    else
      # Column 32 is presumably Seconds_Behind_Master on mysql 5.5; it can
      # be NULL when replication is stopped -- TODO confirm for other
      # server versions.
      return rows[0][32]
    end
  end


  # Parse the innodb history list length out of the engine status text.
  def get_history_length
    rows = @model.connection.select_rows <<-EOF
        show engine innodb status
    EOF
    status_string = rows.first[2]
    return /History list length ([0-9]+)/.match(status_string)[1].to_i
  end

end
|
@@ -0,0 +1,211 @@
|
|
1
|
+
require 'stringio'
|
2
|
+
# This is a utility built to mimic some of the features of the pt-archiver script
|
3
|
+
# to make really big purges go faster with no production impact.
|
4
|
+
#
|
5
|
+
# It uses a strategy of descending an index, querying for purgeable ids and then
|
6
|
+
# deleting them in batches.
|
7
|
+
#
|
8
|
+
# === Required Options
|
9
|
+
#
|
10
|
+
# [:model]
|
11
|
+
# Required: the Active Record model for the table being purged or copied from
|
12
|
+
#
|
13
|
+
# === Optional Options
|
14
|
+
#
|
15
|
+
# [:chunk_size]
|
16
|
+
# The number of rows to copy in each block. Defaults to 500.
|
17
|
+
# [:index]
|
18
|
+
# The index to traverse in ascending order doing the purge. Rows are read in the order of
|
19
|
+
# the index, which must be a btree index. If not specified, <tt>PRIMARY</tt> is assumed.
|
20
|
+
# [:reverse]
|
21
|
+
# Traverse the index in reverse order. For example, if your index is on <tt>account_id</tt>,
|
22
|
+
# <tt>timestamp</tt>, this option will move through the rows starting at the highest account
|
23
|
+
# number, then move through timestamps starting with the most recent.
|
24
|
+
# [:first_only]
|
25
|
+
# Traverse only the first column of the index, and do so inclusively using the <tt>>=</tt> operator
|
26
|
+
# instead of the strict <tt>></tt> operator. This is important if the index is not unique and there
|
27
|
+
# are a lot of duplicates. Otherwise the delete could miss rows. Not allowed in copy mode because you'd
|
28
|
+
# be inserting duplicate rows.
|
29
|
+
# [:dry_run]
|
30
|
+
# Print out the queries that are going to be used. You should run explain on these.
|
31
|
+
# [:stop_after]
|
32
|
+
# The operation will end after copying this many rows.
|
33
|
+
# [:report]
|
34
|
+
# Specify an interval in seconds between status messages being printed out.
|
35
|
+
# [:logger]
|
36
|
+
# The log instance to use. Defaults to the <tt>ActiveRecord::Base.logger</tt>
|
37
|
+
# if not nil, otherwise it uses _$stdout_
|
38
|
+
# [:dest_model]
|
39
|
+
# When this option is present nothing is deleted, and instead rows are copied to
|
40
|
+
# the table for this model. This model must
|
41
|
+
# have identically named columns as the source model. By default, only columns in the
|
42
|
+
# named index and primary key are copied but these can be augmented with columns in the
|
43
|
+
# <tt>copy_columns</tt> option.
|
44
|
+
# [:copy_columns]
|
45
|
+
# Extra columns to add when copying to a dest model.
|
46
|
+
#
|
47
|
+
# === Safety thresholds
|
48
|
+
# [:sleep]
|
49
|
+
# Time in seconds to sleep between each chunk.
|
50
|
+
# [:max_history]
|
51
|
+
# The history list size (if available) is checked every 5 minutes and if it exceeds this size
|
52
|
+
# the purge will pause until the history list is below 90% of this value.
|
53
|
+
# [:max_repl_lag]
|
54
|
+
# The maximum length of the replication lag. Checked every 5 minutes and if exceeded the purge
|
55
|
+
# pauses until the replication lag is below 90% of this value.
|
56
|
+
|
57
|
+
class CleanSweep::PurgeRunner

  require 'clean_sweep/purge_runner/logging'
  require 'clean_sweep/purge_runner/mysql_status'

  include CleanSweep::PurgeRunner::Logging

  # This helps us track the state of replication and history list and pause
  # if necessary
  attr_reader :mysql_status

  def initialize(options={})
    @model = options[:model] or raise "source model class required"
    @limit = options[:chunk_size] || 500

    @target_model = options[:dest_model]
    @stop_after = options[:stop_after]
    @report_interval = options[:report] || 10.seconds
    @logger = options[:logger] || ActiveRecord::Base.logger || Logger.new($stdout)
    @dry_run = options[:dry_run]
    @sleep = options[:sleep]

    @max_history = options[:max_history]
    @max_repl_lag = options[:max_repl_lag]

    # The schema drives column selection, chunking and statement building.
    @table_schema = CleanSweep::TableSchema.new @model,
                                                key_name: options[:index],
                                                ascending: !options[:reverse],
                                                extra_columns: options[:copy_columns],
                                                first_only: options[:first_only]

    if (@max_history || @max_repl_lag)
      @mysql_status = CleanSweep::PurgeRunner::MysqlStatus.new model: @model,
                                                               max_history: @max_history,
                                                               max_repl_lag: @max_repl_lag,
                                                               check_period: options[:check_period],
                                                               logger: @logger
    end

    # Catch unusable configurations up front:
    raise "You can't copy rows from a table into itself" if copy_mode? && @model == @target_model
    raise "An index is required in copy mode" if copy_mode? && @table_schema.traversing_key.nil?
    raise "first_only option not allowed in copy mode" if copy_mode? && @table_schema.first_only?

    @report_interval_start = Time.now

    @query = @table_schema.initial_scope.limit(@limit)

    # The caller's block can narrow the purge scope (add where clauses etc).
    @query = yield(@query) if block_given?
  end


  def copy_mode?
    @target_model.present?
  end

  # Execute the purge in chunks according to the parameters given on instance creation.
  # Will raise <tt>CleanSweep::PurgeStopped</tt> if a <tt>stop_after</tt> option was provided and
  # that limit is hit.
  #
  # Returns the number of rows copied or deleted.
  #
  def execute_in_batches

    # Fixed: the original read `print_queries($stdout) and return 0 if
    # @dry_run`, but print_queries returns nil (the result of io.puts), so
    # the `and return` never fired and a dry run went on to execute the
    # purge for real.
    if @dry_run
      print_queries($stdout)
      return 0
    end

    @start = Time.now
    verb = copy_mode? ? "copying" : "purging"

    msg = "starting: #{verb} #{@table_schema.name} records in batches of #@limit"
    msg << " to #{@target_model.table_name}" if copy_mode?
    # Fixed: the original built this message but never logged it.
    log :info, msg

    log :info, "sleeping #{@sleep} seconds between purging" if @sleep && !copy_mode?
    @total_deleted = 0

    # Iterate through the rows in limit chunks
    log :debug, "find rows: #{@query.to_sql}" if @logger.level == Logger::DEBUG

    @mysql_status.check! if @mysql_status

    rows = NewRelic::Agent.with_database_metric_name(@model.name, 'SELECT') do
      @model.connection.select_rows @query.to_sql
    end
    while rows.any? && (!@stop_after || @total_deleted < @stop_after) do
      log :debug, "#{verb} #{rows.size} records between #{rows.first.inspect} and #{rows.last.inspect}" if @logger.level == Logger::DEBUG
      # Trim the final chunk so we never exceed stop_after.
      stopped = @stop_after && rows.size + @total_deleted > @stop_after

      rows = rows.first(@stop_after - @total_deleted) if stopped
      last_row = rows.last
      if copy_mode?
        metric_op_name = 'INSERT'
        statement = @table_schema.insert_statement(@target_model, rows)
      else
        metric_op_name = 'DELETE'
        statement = @table_schema.delete_statement(rows)
      end
      log :debug, statement if @logger.level == Logger::DEBUG
      chunk_deleted = NewRelic::Agent.with_database_metric_name(@target_model, metric_op_name) do
        @model.connection.update statement
      end

      @total_deleted += chunk_deleted
      raise CleanSweep::PurgeStopped.new("stopped after #{verb} #{@total_deleted} #{@model} records", @total_deleted) if stopped
      # Resume the index scan where the last chunk left off instead of
      # rescanning from the beginning.
      q = @table_schema.scope_to_next_chunk(@query, last_row).to_sql
      log :debug, "find rows: #{q}" if @logger.level == Logger::DEBUG

      sleep @sleep if @sleep && !copy_mode?
      @mysql_status.check! if @mysql_status

      rows = NewRelic::Agent.with_database_metric_name(@model, 'SELECT') do
        @model.connection.select_rows(q)
      end
      report
    end
    report(true)
    if copy_mode?
      log :info, "completed after #{verb} #{@total_deleted} #{@table_schema.name} records to #{@target_model.table_name}"
    else
      log :info, "completed after #{verb} #{@total_deleted} #{@table_schema.name} records"
    end

    return @total_deleted
  end

  # Isolated so specs can stub it and New Relic can time it.
  def sleep duration
    Kernel.sleep duration
  end

  add_method_tracer :sleep
  add_method_tracer :execute_in_batches

  # Print the three statements the purge will run (initial select, chunk
  # select, and insert or delete) so they can be reviewed/EXPLAINed.
  def print_queries(io)
    io.puts 'Initial Query:'
    io.puts format_query('    ', @query.to_sql)
    rows = @model.connection.select_rows @query.limit(1).to_sql
    io.puts "Chunk Query:"
    io.puts format_query('    ', @table_schema.scope_to_next_chunk(@query, rows.first).to_sql)
    if copy_mode?
      io.puts "Insert Statement:"
      io.puts format_query('    ', @table_schema.insert_statement(@target_model, rows))
    else
      io.puts "Delete Statement:"
      io.puts format_query('    ', @table_schema.delete_statement(rows))
    end
  end

  private

  # Break a one-line SQL statement at major clause boundaries and indent
  # each resulting line for readability.
  def format_query indentation, query
    lines = query.split(/ (?=values|from|where|order|limit)/i)
    lines.map {|line| indentation + line.strip }.join("\n")
  end
end
|
211
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Wraps a single column of an ActiveRecord model, capturing where the
# column's value sits in a selected row and how to quote it for SQL.
class CleanSweep::TableSchema::ColumnSchema

  attr_reader :name
  # Index of this column within the select list; assigned by the schema.
  attr_accessor :select_position

  def initialize(name, model)
    @name = name.to_sym
    position = model.column_names.index(name.to_s)
    raise "Can't find #{name} in #{model.name}" unless position
    @model = model
    @column = model.columns[position]
  end

  # Backtick-quoted identifier for use in SQL statements.
  def quoted_name
    "`#{name}`"
  end

  # Pull this column's raw value out of a selected row.
  def value(row)
    row[select_position]
  end

  # This column's value from +row+, quoted appropriately for its type.
  def quoted_value(row)
    @model.quote_value(value(row), @column)
  end
end
|
23
|
+
|