cleansweep 1.0.0

@@ -0,0 +1,72 @@
+ class CleanSweep::TableSchema::IndexSchema < Struct.new :name, :model, :ascending
+
+   attr_accessor :columns, :name, :model, :ascending, :first_only
+
+   def initialize name, model
+     @model   = model
+     @columns = []
+     @name    = name
+   end
+
+   # Add a column
+   def << col_name
+     @columns << CleanSweep::TableSchema::ColumnSchema.new(col_name, model)
+   end
+
+   # Take the columns referenced by this index and add them to the select list if
+   # they are not already present. Record each column's position in the list, because
+   # that position is where its value will be found in the rows passed later to
+   # #scope_to_next_chunk.
+   def add_columns_to select_columns
+     @columns.each do | column |
+       pos = select_columns.index column.name
+       if pos.nil?
+         select_columns << column.name
+         pos = select_columns.size - 1
+       end
+       column.select_position = pos
+     end
+   end
+
+   def order(scope)
+     direction = ascending ? 'ASC' : 'DESC'
+     if @first_only
+       scope.order("#{columns.first.quoted_name} #{direction}")
+     else
+       scope.order(columns.map { |col| "#{col.quoted_name} #{direction}" }.join(","))
+     end
+   end
+
+   def scope_to_next_chunk(scope, last_row)
+     query_args = {}
+     if @first_only
+       query_args[columns.first.name] = columns.first.value(last_row)
+     else
+       columns.each do |column|
+         query_args[column.name] = column.value(last_row)
+       end
+     end
+     scope.where(chunk_clause, query_args)
+   end
+
+   private
+
+   def chunk_clause
+     @chunk_clause ||=
+       if @first_only
+         # If we're only using the first column, we have to do an inclusive comparison
+         "#{columns.first.quoted_name} #{ascending ? ">=" : "<="} :#{columns.first.name}"
+       else
+         # If we're using all the columns of the index, build the expression recursively
+         add_term(columns.dup)
+       end
+   end
+
+   def add_term(columns)
+     column = columns.shift
+     clause = "#{column.quoted_name} #{ascending ? ">" : "<"} :#{column.name}"
+     if columns.any?
+       clause << " OR (#{column.quoted_name} = :#{column.name} AND #{add_term columns})"
+     end
+     return clause
+   end
+ end
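
The recursive clause built by add_term is easiest to see with concrete column names. Below is a minimal standalone sketch of the same recursion (plain string column names instead of ColumnSchema objects, quoting simplified); the account/timestamp names mirror the comments index used in the specs later in this diff.

    # Standalone sketch of the recursion in IndexSchema#add_term, using plain
    # strings for the column names and inlining the back-quoting.
    def add_term(columns, ascending: true)
      column = columns.shift
      clause = "`#{column}` #{ascending ? '>' : '<'} :#{column}"
      if columns.any?
        clause << " OR (`#{column}` = :#{column} AND #{add_term(columns, ascending: ascending)})"
      end
      clause
    end

    puts add_term(%w[account timestamp])
    # => `account` > :account OR (`account` = :account AND `timestamp` > :timestamp)

With first_only set, chunk_clause skips this recursion and emits a single inclusive comparison on the first column (`account` >= :account), which is why duplicates at a chunk boundary are not skipped in that mode.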
@@ -0,0 +1,112 @@
+
+ class CleanSweep::TableSchema
+
+   # The list of columns used when selecting: the union of the primary key and
+   # traversing key columns
+   attr_reader :select_columns
+
+   # The schema for the primary key
+   attr_reader :primary_key
+
+   # The schema for the traversing key, or nil
+   attr_reader :traversing_key
+
+   attr_reader :name
+
+   def initialize(model, options={})
+
+     traversing_key_name = options[:key_name]
+     ascending           = options.include?(:ascending) ? options[:ascending] : true
+     first_only          = options[:first_only]
+     @model              = model
+     @name               = @model.table_name
+     @select_columns     = (options[:extra_columns] && options[:extra_columns].map(&:to_sym)) || []
+
+     key_schemas = build_indexes
+
+     # Only a primary key is supported, but we could probably get around this by
+     # adding all columns as 'primary key columns'
+     raise "Table #{model.table_name} must have a primary key" unless key_schemas.include? 'primary'
+
+     @primary_key = key_schemas['primary']
+     @primary_key.add_columns_to @select_columns
+     if traversing_key_name
+       traversing_key_name.downcase!
+       raise "BTREE Index #{traversing_key_name} not found" unless key_schemas.include? traversing_key_name
+       @traversing_key = key_schemas[traversing_key_name]
+       @traversing_key.add_columns_to @select_columns
+       @traversing_key.ascending  = ascending
+       @traversing_key.first_only = first_only
+     end
+
+   end
+
+   def insert_statement(target_model, rows)
+     "insert into #{target_model.quoted_table_name} (#{quoted_column_names}) values #{quoted_row_values(rows)}"
+   end
+
+   def delete_statement(rows)
+     rec_criteria = rows.map do | row |
+       row_compares = []
+       @primary_key.columns.each do |column|
+         row_compares << "#{column.quoted_name} = #{column.quoted_value(row)}"
+       end
+       "(" + row_compares.join(" AND ") + ")"
+     end
+     "DELETE FROM #{@model.quoted_table_name} WHERE #{rec_criteria.join(" OR ")}"
+   end
+
+   def initial_scope
+     scope = @model.all.select(quoted_column_names).from(from_clause)
+     scope = @traversing_key.order(scope) if @traversing_key
+     return scope
+   end
+
+   def scope_to_next_chunk scope, last_row
+     if @traversing_key.blank?
+       scope
+     else
+       @traversing_key.scope_to_next_chunk(scope, last_row)
+     end
+   end
+
+   def first_only?
+     @traversing_key && @traversing_key.first_only
+   end
+
+   private
+
+   def from_clause
+     table_name = @model.quoted_table_name
+     table_name += " FORCE INDEX(#{@traversing_key.name})" if @traversing_key
+     return table_name
+   end
+
+   def quoted_column_names
+     select_columns.map { |c| "`#{c}`" }.join(",")
+   end
+
+   def quoted_row_values(rows)
+     rows.map do |vec|
+       quoted_column_values = vec.map do |col_value|
+         @model.connection.quote(col_value)
+       end.join(",")
+       "(#{quoted_column_values})"
+     end.join(",")
+   end
+
+   def build_indexes
+     indexes = {}
+     column_details = @model.connection.select_rows "show indexes from #{@model.quoted_table_name}"
+     column_details.each do | col |
+       key_name = col[2].downcase
+       col_name = col[4].downcase
+       type     = col[10]
+       next if key_name != 'PRIMARY' && type != 'BTREE' # Only BTREE indexes supported for traversing
+       indexes[key_name] ||= IndexSchema.new key_name, @model
+       indexes[key_name] << col_name
+     end
+     return indexes
+   end
+
+ end
+
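
As an orientation for the table_schema_spec.rb examples later in this diff, here is a hedged usage sketch of this class; it assumes the Comment model and the comments_on_account_timestamp index defined in the spec fixtures below.

    # Sketch only; Comment and its indexes come from the spec fixtures in this gem.
    schema = CleanSweep::TableSchema.new Comment,
                                         key_name: 'comments_on_account_timestamp'

    schema.select_columns          # => [:id, :account, :timestamp]
    scope = schema.initial_scope   # SELECT ... FROM `comments` FORCE INDEX(comments_on_account_timestamp)
                                   #   ORDER BY `account` ASC,`timestamp` ASC

    # The last row of one chunk seeds the WHERE clause of the next chunk:
    last_row = [1005, 5, Time.now]
    schema.scope_to_next_chunk(scope, last_row).to_sql
    # includes a clause like (`account` > 5 OR (`account` = 5 AND `timestamp` > '...'))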
@@ -0,0 +1,3 @@
+ module CleanSweep
+   VERSION = "1.0.0"
+ end
@@ -0,0 +1,11 @@
+ require "clean_sweep/version"
+
+ module CleanSweep
+
+ end
+
+ require 'clean_sweep/purge_stopped'
+ require 'clean_sweep/table_schema'
+ require 'clean_sweep/table_schema/column_schema'
+ require 'clean_sweep/table_schema/index_schema'
+ require 'clean_sweep/purge_runner'
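
CleanSweep::PurgeRunner, loaded by the last require above, is exercised in purge_runner_spec.rb later in this diff. A hedged sketch of a typical call, with the model, index name, and options taken from those specs:

    # Sketch based on the calls made in purge_runner_spec.rb; Comment is the
    # test model defined in the spec fixtures.
    purger = CleanSweep::PurgeRunner.new model: Comment,
                                         index: 'comments_on_timestamp',
                                         chunk_size: 500,
                                         max_history: 100 do |scope|
      scope.where('timestamp < ?', 1.week.ago)
    end

    purger.print_queries($stdout)            # dry run: print the SQL it would execute
    rows_purged = purger.execute_in_batches  # returns the number of rows processed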
data/lib/cleansweep.rb ADDED
@@ -0,0 +1 @@
+ require 'clean_sweep'
@@ -0,0 +1,36 @@
+ class Book < ActiveRecord::Base
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists
+         books (
+           `id` int(11) auto_increment,
+           `bin` int(11),
+           `publisher` varchar(64),
+           `title` varchar(64),
+           primary key (id),
+           key book_index_by_bin(bin, id)
+         )
+     EOF
+   end
+
+ end
+
+ FactoryGirl.define do
+   factory :book do | book |
+     book.publisher "Random House"
+     book.sequence(:bin)   { | n | (n % 3) * 1000 }
+     book.sequence(:title) { | n | "Jaws, Part #{n}" }
+   end
+ end
+
+ class BookTemp < ActiveRecord::Base
+
+   self.table_name = 'book_vault'
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists book_vault like books
+     EOF
+   end
+ end
@@ -0,0 +1,26 @@
+ class Comment < ActiveRecord::Base
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists
+         comments (
+           `id` int(11) primary key auto_increment,
+           `timestamp` datetime,
+           `account` int(11),
+           `seen` boolean,
+           key comments_on_account_timestamp(account, timestamp),
+           key comments_on_timestamp(timestamp desc)
+         )
+     EOF
+     connection.execute 'truncate table comments'
+   end
+
+ end
+
+ FactoryGirl.define do
+   factory :comment do | comment |
+     comment.timestamp Time.now
+     comment.seen false
+     comment.sequence(:account) { | n | (n % 3) * 100 }
+   end
+ end
@@ -0,0 +1,222 @@
+ require 'spec_helper'
+
+ require 'active_support/testing/time_helpers'
+ describe CleanSweep::PurgeRunner do
+
+   context 'PurgeRunner' do
+     include ActiveSupport::Testing::TimeHelpers
+     before do
+       travel_to Time.parse("2014-12-02 13:47:43 -0800")
+     end
+     after do
+       travel_back
+     end
+
+     context "using comments" do
+       before do
+         Comment.create_table
+       end
+       context "with duplicate rows" do
+
+         # This test case demonstrates a weakness in the index traversal:
+         # if you aren't using a unique index or the first_only option,
+         # you can miss rows.
+         #
+         # In this case we have some duplicate rows, but because the chunk_size is
+         # set low we don't get all the duplicates in one chunk, and they are missed
+         # by the next chunk because we are looking for values strictly greater than
+         # the columns in the current chunk.
+         #
+         # With the first_only option the where clause is built using only the
+         # first column of the index, with the >=, <= operators instead of >, <,
+         # so it picks up all the rows.
+         #
+
+         before do
+           10.times { create(:comment, timestamp: 2.weeks.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.weeks.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+         end
+
+         it "can miss some rows" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                chunk_size: 7 do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           expect( -> {
+             purger.execute_in_batches
+           }).to change(Comment, :count).from(50).to(43) # if it deleted all the dups this would be 30, not 43
+         end
+         it "won't miss rows using first_only option" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                first_only: true,
+                                                chunk_size: 7 do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           expect( -> {
+             purger.execute_in_batches
+           }).to change(Comment, :count).from(50).to(30) # with first_only it deletes all the dups, leaving 30
+
+         end
+
+         it 'prints out the queries in a dry run' do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_account_timestamp' do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           output = StringIO.new
+           purger.print_queries(output)
+           expect(output.string).to eq <<EOF
+ Initial Query:
+ SELECT `id`,`account`,`timestamp`
+ FROM `comments` FORCE INDEX(comments_on_account_timestamp)
+ WHERE (timestamp < '2014-11-25 21:47:43')
+ ORDER BY `account` ASC,`timestamp` ASC
+ LIMIT 500
+ Chunk Query:
+ SELECT `id`,`account`,`timestamp`
+ FROM `comments` FORCE INDEX(comments_on_account_timestamp)
+ WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` > 0 OR (`account` = 0 AND `timestamp` > '2014-11-18 21:47:43'))\n ORDER BY `account` ASC,`timestamp` ASC
+ LIMIT 500
+ Delete Statement:
+ DELETE
+ FROM `comments`
+ WHERE (`id` = 2)
+ EOF
+         end
+       end
+       context "with unique rows" do
+         before do
+           # Create 10 comments going back 0..9 days...
+           10.times { |i| create(:comment, timestamp: i.days.ago) }
+         end
+
+         it "ascends the index" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                stop_after: 5
+           begin
+             purger.execute_in_batches
+           rescue CleanSweep::PurgeStopped
+           end
+           expect(Comment.count).to eq(5)
+           # Only old comments deleted before stopping
+           expect(Comment.where('timestamp >= ?', 4.days.ago).count).to eq(5)
+         end
+         it "descends the index" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                reverse: true,
+                                                stop_after: 5
+           begin
+             purger.execute_in_batches
+           rescue CleanSweep::PurgeStopped
+           end
+           # Delete from the most recent comments, so only old ones are left.
+           expect(Comment.count).to eq(5)
+           expect(Comment.where('timestamp <= ?', 4.days.ago).count).to eq(5)
+
+         end
+       end
+     end
+
+
+     context "using books" do
+
+       before do
+         @total_book_size = 50
+         Book.create_table
+         @total_book_size.times { create(:book) }
+       end
+
+       after do
+         Book.delete_all
+       end
+
+       it 'waits for history' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              max_history: 100,
+                                              chunk_size: 10
+         mysql_status = purger.mysql_status
+         expect(mysql_status).to receive(:check!).exactly(6).times
+
+         purger.execute_in_batches
+
+       end
+
+       it 'should not check when there are no limits' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              chunk_size: 4
+
+         expect(purger.mysql_status).to be_nil
+       end
+
+       it 'purges books' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              chunk_size: 4
+
+         count = purger.execute_in_batches
+         expect(count).to be(@total_book_size)
+         expect(Book.count).to be 0
+       end
+
+       it 'copies books' do
+         BookTemp.create_table
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              dest_model: BookTemp,
+                                              chunk_size: 4,
+                                              index: 'book_index_by_bin'
+
+         count = purger.execute_in_batches
+         expect(count).to be(@total_book_size)
+         expect(BookTemp.count).to eq(@total_book_size)
+       end
+
+     end
+   end
+ end
+
+ describe CleanSweep::PurgeRunner::MysqlStatus do
+
+   context "mysql status check tool" do
+
+     let(:mysql_status) do
+       CleanSweep::PurgeRunner::MysqlStatus.new model: Book, max_history: 100, max_repl_lag: 100
+     end
+
+     before do
+       Book.create_table
+     end
+
+     it "fetches innodb status" do
+       mysql_status.get_replication_lag
+     end
+     it "checks history and pauses" do
+       allow(mysql_status).to receive(:get_history_length).and_return(101, 95, 89)
+       expect(mysql_status).to receive(:pause).twice
+       mysql_status.check!
+     end
+     it "checks replication and pauses" do
+       allow(mysql_status).to receive(:get_replication_lag).and_return(101, 95, 89)
+       expect(mysql_status).to receive(:pause).twice
+       mysql_status.check!
+     end
+
+     it "checks and continues" do
+       allow(mysql_status).to receive(:get_history_length).and_return(80)
+       expect(mysql_status).not_to receive(:pause)
+       mysql_status.check!
+     end
+
+     it "fetches slave status" do
+       mysql_status.get_history_length
+     end
+   end
+
+ end
+
+
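
The MysqlStatus examples above stub out the metric getters; a hedged sketch of driving the same throttle directly, using only the constructor arguments and methods exercised in those specs:

    # Sketch only; the thresholds and method names are the ones used in the specs above.
    status = CleanSweep::PurgeRunner::MysqlStatus.new model: Book,
                                                      max_history: 100,
                                                      max_repl_lag: 100

    status.get_history_length     # current InnoDB history list length
    status.get_replication_lag    # current replication lag from the slave status
    status.check!                 # pauses until the metrics drop back to a safe level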
@@ -0,0 +1,36 @@
+ ENV['RACK_ENV'] = 'test'
+
+ require 'clean_sweep'
+ require 'factory_girl'
+ require 'fileutils'
+ require 'active_record'
+ require 'mysql2'
+ RSpec.configure do |config|
+   config.include FactoryGirl::Syntax::Methods
+   config.formatter = :progress
+   # config.order = 'random'
+
+   config.before(:suite) do
+     FactoryGirl.find_definitions
+   end
+
+ end
+
+ logdir = File.expand_path "../../log", __FILE__
+ FileUtils.mkdir_p logdir
+ logfile = File.open(File.join(logdir, "test.log"), "w+")
+ ActiveRecord::Base.logger = Logger.new(logfile)
+
+ database = {
+   encoding: 'utf8',
+   adapter: 'mysql2',
+   username: ENV['DB_USERNAME'] || 'root',
+   host: 'localhost',
+   password: ENV['DB_PASSWORD'],
+ }
+ db_name = ENV['DB_SCHEMA'] || 'cstest'
+ connection = Mysql2::Client.new(database)
+ connection.query "CREATE DATABASE IF NOT EXISTS #{db_name}"
+ database[:database] = db_name
+
+ ActiveRecord::Base.establish_connection(database)
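
The helper reads its MySQL settings from the environment; a minimal sketch of overriding them before the suite boots (the variable names are the ones read above, the values are hypothetical):

    # Hypothetical values; DB_USERNAME, DB_PASSWORD, and DB_SCHEMA are the
    # variables read by the spec helper above.
    ENV['DB_USERNAME'] ||= 'root'
    ENV['DB_PASSWORD'] ||= 'secret'
    ENV['DB_SCHEMA']   ||= 'cstest'

    require 'spec_helper'  # creates the test database if needed and connects ActiveRecord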
@@ -0,0 +1,111 @@
+ require 'spec_helper'
+
+ describe CleanSweep::TableSchema do
+
+   before do
+     Comment.create_table
+   end
+
+   context "using ascending account, timestamp index" do
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', ascending: true }
+
+     it 'should read comments' do
+       expect(schema.primary_key.columns.map(&:name)).to eq([:id])
+       expect(schema.traversing_key.columns.map(&:name)).to eq([:account, :timestamp])
+     end
+
+     it 'should produce an ascending chunk clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("(`account` > 5 OR (`account` = 5 AND `timestamp` > '2014-12-01 23:13:25'))")
+     end
+
+     it 'should produce all select columns' do
+       expect(schema.select_columns).to eq([:id, :account, :timestamp])
+     end
+
+     it 'should produce the ascending order clause' do
+       expect(schema.initial_scope.to_sql).to include('`account` ASC,`timestamp` ASC')
+     end
+
+
+     it 'should produce an insert statement' do
+       schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp'
+       rows = account_and_timestamp_rows
+       expect(schema.insert_statement(Comment, rows)).to eq("insert into `comments` (`id`,`account`,`timestamp`) values (1001,5,'2014-12-02 01:13:25'),(1002,2,'2014-12-02 00:13:25'),(1005,5,'2014-12-01 23:13:25')")
+     end
+   end
+
+   context "using descending account, timestamp index" do
+
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', ascending: false }
+
+     it 'should produce a descending where clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("(`account` < 5 OR (`account` = 5 AND `timestamp` < '2014-12-01 23:13:25'))")
+     end
+
+
+     it 'should produce the descending order clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("`account` DESC,`timestamp` DESC")
+     end
+
+   end
+
+   context "using account, timestamp index first column only" do
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', first_only: true }
+
+     it 'should select all the rows' do
+       expect(schema.select_columns).to eq([:id, :account, :timestamp])
+     end
+
+     it 'should only query using the first column of the index' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include(" (`account` >= 5) ")
+
+     end
+
+   end
+
+   it 'should not care about case' do
+     CleanSweep::TableSchema.new Comment, key_name: 'primary'
+   end
+
+   it 'should work without a traversing index' do
+     schema = CleanSweep::TableSchema.new Comment
+     expect(schema.primary_key.columns.map(&:name)).to eq([:id])
+     expect(schema.traversing_key).to be_nil
+   end
+
+   it 'should produce minimal select columns' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'PRIMARY'
+     expect(schema.select_columns).to eq([:id])
+   end
+
+   it 'should produce the from clause with an index' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp'
+     expect(schema.initial_scope.to_sql).to include("`comments` FORCE INDEX(comments_on_timestamp)")
+   end
+
+   it 'should include additional columns' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', extra_columns: %w[seen id]
+     expect(schema.select_columns).to eq([:seen, :id, :account, :timestamp])
+     rows = account_and_timestamp_rows
+     rows.map! { |row| row.unshift 1 } # Insert the 'seen' value at the beginning of each row
+     expect(schema.insert_statement(Comment, rows)).to eq("insert into `comments` (`seen`,`id`,`account`,`timestamp`) values (1,1001,5,'2014-12-02 01:13:25'),(1,1002,2,'2014-12-02 00:13:25'),(1,1005,5,'2014-12-01 23:13:25')")
+
+   end
+
+
+   def account_and_timestamp_rows
+     rows = []
+     t = Time.parse '2014-12-01 17:13:25'
+     rows << [1001, 5, t]
+     rows << [1002, 2, t - 1.hour]
+     rows << [1005, 5, t - 2.hours]
+   end
+ end