cleansweep 1.0.0

This diff shows the contents of a publicly available package version as released to its public registry. It is provided for informational purposes only and reflects the package as it appears in that registry.
@@ -0,0 +1,72 @@
+ class CleanSweep::TableSchema::IndexSchema < Struct.new :name, :model, :ascending
+
+   attr_accessor :columns, :name, :model, :ascending, :first_only
+
+   def initialize name, model
+     @model = model
+     @columns = []
+     @name = name
+   end
+
+   # Add a column
+   def << col_name
+     @columns << CleanSweep::TableSchema::ColumnSchema.new(col_name, model)
+   end
+
+   # Take columns referenced by this index and add them to the list if they
+   # are not present. Record their position in the list because the position will
+   # be where they are located in a row of values passed in later to #scope_to_next_chunk
+   def add_columns_to select_columns
+     @columns.each do | column |
+       pos = select_columns.index column.name
+       if pos.nil?
+         select_columns << column.name
+         pos = select_columns.size - 1
+       end
+       column.select_position = pos
+     end
+   end
+
+   def order(scope)
+     direction = ascending ? 'ASC' : 'DESC'
+     if @first_only
+       scope.order("#{columns.first.quoted_name} #{direction}")
+     else
+       scope.order(columns.map { |col| "#{col.quoted_name} #{direction}"}.join(","))
+     end
+   end
+
+   def scope_to_next_chunk(scope, last_row)
+     query_args = {}
+     if @first_only
+       query_args[columns.first.name] = columns.first.value(last_row)
+     else
+       columns.each do |column|
+         query_args[column.name] = column.value(last_row)
+       end
+     end
+     scope.where(chunk_clause, query_args)
+   end
+
+   private
+
+   def chunk_clause
+     @chunk_clause ||=
+       if @first_only
+         # If we're only using the first column, you have to do an inclusive comparison
+         "#{columns.first.quoted_name} #{ascending ? ">=" : "<="} :#{columns.first.name}"
+       else
+         # If you are using all columns of the index, build the expression recursively
+         add_term(columns.dup)
+       end
+   end
+
+   def add_term(columns)
+     column = columns.shift
+     clause = "#{column.quoted_name} #{ascending ? ">" : "<"} :#{column.name}"
+     if columns.any?
+       clause << " OR (#{column.quoted_name} = :#{column.name} AND #{add_term columns})"
+     end
+     return clause
+   end
+ end
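
The clause built by #chunk_clause above is easiest to see against a concrete index. The sketch below is illustrative only and is not part of the gem: it assumes the Comment model defined in the specs further down, and it assumes ColumnSchema quotes names with backticks the way the spec expectations show. The private method is reached with send.

    # Hypothetical (account, timestamp) index, ascending: the full-index clause
    # nests strict comparisons so traversal resumes strictly after the last row seen.
    index = CleanSweep::TableSchema::IndexSchema.new 'comments_on_account_timestamp', Comment
    index << 'account'
    index << 'timestamp'
    index.ascending = true
    index.send(:chunk_clause)
    # => "`account` > :account OR (`account` = :account AND `timestamp` > :timestamp)"

    # With first_only, only the leading column is compared, and inclusively, so rows
    # that share the last chunk's leading value are revisited rather than skipped.
    first_only = CleanSweep::TableSchema::IndexSchema.new 'comments_on_account_timestamp', Comment
    first_only << 'account'
    first_only << 'timestamp'
    first_only.ascending  = true
    first_only.first_only = true
    first_only.send(:chunk_clause)
    # => "`account` >= :account"
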
@@ -0,0 +1,112 @@
+
+ class CleanSweep::TableSchema
+
+   # The list of columns used when selecting, the union of pk and traversing key columns
+   attr_reader :select_columns
+
+   # The schema for the primary key
+   attr_reader :primary_key
+
+   # The schema for the traversing key, or nil
+   attr_reader :traversing_key
+
+   attr_reader :name
+
+   def initialize(model, options={})
+
+     traversing_key_name = options[:key_name]
+     ascending = options.include?(:ascending) ? options[:ascending] : true
+     first_only = options[:first_only]
+     @model = model
+     @name = @model.table_name
+     @select_columns = (options[:extra_columns] && options[:extra_columns].map(&:to_sym)) || []
+
+     key_schemas = build_indexes
+
+     # Primary key only supported, but we could probably get around this by adding
+     # all columns as 'primary key columns'
+     raise "Table #{model.table_name} must have a primary key" unless key_schemas.include? 'primary'
+
+     @primary_key = key_schemas['primary']
+     @primary_key.add_columns_to @select_columns
+     if traversing_key_name
+       traversing_key_name.downcase!
+       raise "BTREE Index #{traversing_key_name} not found" unless key_schemas.include? traversing_key_name
+       @traversing_key = key_schemas[traversing_key_name]
+       @traversing_key.add_columns_to @select_columns
+       @traversing_key.ascending = ascending
+       @traversing_key.first_only = first_only
+     end
+
+   end
+
+   def insert_statement(target_model, rows)
+     "insert into #{target_model.quoted_table_name} (#{quoted_column_names}) values #{quoted_row_values(rows)}"
+   end
+
+   def delete_statement(rows)
+     rec_criteria = rows.map do | row |
+       row_compares = []
+       @primary_key.columns.each do |column|
+         row_compares << "#{column.quoted_name} = #{column.quoted_value(row)}"
+       end
+       "(" + row_compares.join(" AND ") + ")"
+     end
+     "DELETE FROM #{@model.quoted_table_name} WHERE #{rec_criteria.join(" OR ")}"
+   end
+
+   def initial_scope
+     scope = @model.all.select(quoted_column_names).from(from_clause)
+     scope = @traversing_key.order(scope) if @traversing_key
+     return scope
+   end
+
+   def scope_to_next_chunk scope, last_row
+     if @traversing_key.blank?
+       scope
+     else
+       @traversing_key.scope_to_next_chunk(scope, last_row)
+     end
+   end
+
+   def first_only?
+     @traversing_key && @traversing_key.first_only
+   end
+
+   private
+
+   def from_clause
+     table_name = @model.quoted_table_name
+     table_name += " FORCE INDEX(#{@traversing_key.name})" if @traversing_key
+     return table_name
+   end
+
+   def quoted_column_names
+     select_columns.map{|c| "`#{c}`"}.join(",")
+   end
+
+   def quoted_row_values(rows)
+     rows.map do |vec|
+       quoted_column_values = vec.map do |col_value|
+         @model.connection.quote(col_value)
+       end.join(",")
+       "(#{quoted_column_values})"
+     end.join(",")
+   end
+
+   def build_indexes
+     indexes = {}
+     column_details = @model.connection.select_rows "show indexes from #{@model.quoted_table_name}"
+     column_details.each do | col |
+       key_name = col[2].downcase    # SHOW INDEXES: Key_name
+       col_name = col[4].downcase    # SHOW INDEXES: Column_name
+       type = col[10]                # SHOW INDEXES: Index_type
+       next if key_name != 'primary' && type != 'BTREE' # Only BTREE indexes supported for traversing
+       indexes[key_name] ||= IndexSchema.new key_name, @model
+       indexes[key_name] << col_name
+     end
+     return indexes
+   end
+
+ end
+
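
For orientation, the methods above compose into a chunked purge loop roughly like the one sketched here. This is illustrative only, not code shipped in the gem: it assumes the Comment model and table defined in the specs below, and it rebuilds each chunk scope from initial_scope, which is what the runner's printed queries further down suggest.

    schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp'

    # First chunk: primary-key plus traversing-key columns, ordered along the index.
    rows = Comment.connection.select_rows schema.initial_scope.limit(500).to_sql
    until rows.empty?
      # Delete this chunk by primary key...
      Comment.connection.execute schema.delete_statement(rows)
      # ...then scope the next chunk to start just past the last row we saw.
      next_scope = schema.scope_to_next_chunk(schema.initial_scope.limit(500), rows.last)
      rows = Comment.connection.select_rows next_scope.to_sql
    end
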
@@ -0,0 +1,3 @@
+ module CleanSweep
+   VERSION = "1.0.0"
+ end
@@ -0,0 +1,11 @@
+ require "clean_sweep/version"
+
+ module CleanSweep
+
+ end
+
+ require 'clean_sweep/purge_stopped'
+ require 'clean_sweep/table_schema'
+ require 'clean_sweep/table_schema/column_schema'
+ require 'clean_sweep/table_schema/index_schema'
+ require 'clean_sweep/purge_runner'
data/lib/cleansweep.rb ADDED
@@ -0,0 +1 @@
+ require 'clean_sweep'
@@ -0,0 +1,36 @@
+ class Book < ActiveRecord::Base
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists
+       books (
+         `id` int(11) auto_increment,
+         `bin` int(11),
+         `publisher` varchar(64),
+         `title` varchar(64),
+         primary key (id),
+         key book_index_by_bin(bin, id)
+       )
+     EOF
+   end
+
+ end
+
+ FactoryGirl.define do
+   factory :book do | book |
+     book.publisher "Random House"
+     book.sequence(:bin) { | n | (n % 3) * 1000 }
+     book.sequence(:title) { |n| "Jaws, Part #{n}"}
+   end
+ end
+
+ class BookTemp < ActiveRecord::Base
+
+   self.table_name = 'book_vault'
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists book_vault like books
+     EOF
+   end
+ end
@@ -0,0 +1,26 @@
+ class Comment < ActiveRecord::Base
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists
+       comments (
+         `id` int(11) primary key auto_increment,
+         `timestamp` datetime,
+         `account` int(11),
+         `seen` boolean,
+         key comments_on_account_timestamp(account, timestamp),
+         key comments_on_timestamp(timestamp desc)
+       )
+     EOF
+     connection.execute 'truncate table comments'
+   end
+
+ end
+
+ FactoryGirl.define do
+   factory :comment do | comment |
+     comment.timestamp Time.now
+     comment.seen false
+     comment.sequence(:account) { | n | (n % 3) * 100 }
+   end
+ end
@@ -0,0 +1,222 @@
+ require 'spec_helper'
+
+ require 'active_support/testing/time_helpers'
+ describe CleanSweep::PurgeRunner do
+
+   context 'PurgeRunner' do
+     include ActiveSupport::Testing::TimeHelpers
+     before do
+       travel_to Time.parse("2014-12-02 13:47:43 -0800")
+     end
+     after do
+       travel_back
+     end
+
+     context "using comments" do
+       before do
+         Comment.create_table
+       end
+       context "with duplicate rows" do
+
+         # This testcase demonstrates a weakness in the index traversal
+         # which is that if you aren't using a unique index or the first_only option,
+         # you can miss rows.
+         #
+         # In this case we have some duplicate rows but because the chunk_size is
+         # set low, we don't get all the duplicates in one chunk. And they miss
+         # the next chunk because we are looking for values greater than the
+         # columns in the current chunk.
+         #
+         # If you use the first_only option it means it builds the where clause using only
+         # the first column of the index, and it also uses the >=, <= operators instead
+         # of >, <. So it picks up all the rows.
+         #
+
+         before do
+           10.times { create(:comment, timestamp: 2.weeks.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.weeks.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+         end
+
+         it "can miss some rows" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                chunk_size: 7 do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           expect( -> {
+             purger.execute_in_batches
+           }).to change(Comment, :count).from(50).to(43) # if it deleted all dups this would be 30, not 43
+         end
+         it "won't miss rows using first_only option" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                first_only: true,
+                                                chunk_size: 7 do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           expect( -> {
+             purger.execute_in_batches
+           }).to change(Comment, :count).from(50).to(30) # first_only picks up all the duplicates, so the count drops to 30
+
+         end
+
+         it 'prints out the queries in a dry run' do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_account_timestamp' do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           output = StringIO.new
+           purger.print_queries(output)
+           expect(output.string).to eq <<EOF
+ Initial Query:
+ SELECT `id`,`account`,`timestamp`
+ FROM `comments` FORCE INDEX(comments_on_account_timestamp)
+ WHERE (timestamp < '2014-11-25 21:47:43')
+ ORDER BY `account` ASC,`timestamp` ASC
+ LIMIT 500
+ Chunk Query:
+ SELECT `id`,`account`,`timestamp`
+ FROM `comments` FORCE INDEX(comments_on_account_timestamp)
+ WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` > 0 OR (`account` = 0 AND `timestamp` > '2014-11-18 21:47:43'))\n ORDER BY `account` ASC,`timestamp` ASC
+ LIMIT 500
+ Delete Statement:
+ DELETE
+ FROM `comments`
+ WHERE (`id` = 2)
+ EOF
+         end
+       end
+       context "with unique rows" do
+         before do
+           # Create 10 comments going back 0..9 days...
+           10.times { |i| create(:comment, timestamp: i.days.ago) }
+         end
+
+         it "ascends the index" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                stop_after: 5
+           begin
+             purger.execute_in_batches
+           rescue CleanSweep::PurgeStopped
+           end
+           expect(Comment.count).to eq(5)
+           # Only old comments deleted before stopping
+           expect(Comment.where('timestamp >= ?', 4.days.ago).count).to eq(5)
+         end
+         it "descends the index" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                reverse: true,
+                                                stop_after: 5
+           begin
+             purger.execute_in_batches
+           rescue CleanSweep::PurgeStopped
+           end
+           # Delete from the most recent comments, so only old ones are left.
+           expect(Comment.count).to eq(5)
+           expect(Comment.where('timestamp <= ?', 4.days.ago).count).to eq(5)
+
+         end
+       end
+     end
+
+
+     context "using books" do
+
+       before do
+         @total_book_size = 50
+         Book.create_table
+         @total_book_size.times { create(:book) }
+       end
+
+       after do
+         Book.delete_all
+       end
+
+       it 'waits for history' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              max_history: 100,
+                                              chunk_size: 10
+         mysql_status = purger.mysql_status
+         expect(mysql_status).to receive(:check!).exactly(6).times
+
+         purger.execute_in_batches
+
+       end
+
+       it 'should not check when there are no limits' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              chunk_size: 4
+
+         expect(purger.mysql_status).to be_nil
+       end
+
+       it 'purges books' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              chunk_size: 4
+
+         count = purger.execute_in_batches
+         expect(count).to be(@total_book_size)
+         expect(Book.count).to be 0
+       end
+
+       it 'copies books' do
+         BookTemp.create_table
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              dest_model: BookTemp,
+                                              chunk_size: 4,
+                                              index: 'book_index_by_bin'
+
+         count = purger.execute_in_batches
+         expect(count).to be(@total_book_size)
+         expect(BookTemp.count).to eq(@total_book_size)
+       end
+
+     end
+   end
+ end
+
+ describe CleanSweep::PurgeRunner::MysqlStatus do
+
+   context "mysql status check tool" do
+
+     let(:mysql_status) do
+       CleanSweep::PurgeRunner::MysqlStatus.new model: Book, max_history: 100, max_repl_lag: 100
+     end
+
+     before do
+       Book.create_table
+     end
+
+     it "fetches slave status" do
+       mysql_status.get_replication_lag
+     end
+     it "checks history and pauses" do
+       allow(mysql_status).to receive(:get_history_length).and_return(101, 95, 89)
+       expect(mysql_status).to receive(:pause).twice
+       mysql_status.check!
+     end
+     it "checks replication and pauses" do
+       allow(mysql_status).to receive(:get_replication_lag).and_return(101, 95, 89)
+       expect(mysql_status).to receive(:pause).twice
+       mysql_status.check!
+     end
+
+     it "checks and continues" do
+       allow(mysql_status).to receive(:get_history_length).and_return(80)
+       expect(mysql_status).not_to receive(:pause)
+       mysql_status.check!
+     end
+
+     it "fetches innodb status" do
+       mysql_status.get_history_length
+     end
+   end
+
+ end
+
+
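
To make the duplicate-row caveat documented above concrete, the sketch below shows the two chunk clauses involved. It is illustrative only (the row values are hypothetical, and it assumes the comments table from the specs exists) and uses the TableSchema class shown earlier.

    schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp'
    last_row = [7, Time.parse('2014-11-18 13:47:43')]   # [id, timestamp] of the last row in a chunk
    schema.scope_to_next_chunk(schema.initial_scope, last_row).to_sql
    # => "... WHERE (`timestamp` > '2014-11-18 13:47:43') ..."
    # The strict '>' skips any remaining rows that share this timestamp, which is
    # why the first test above stops at 43 rows instead of 30.

    loose = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp', first_only: true
    loose.scope_to_next_chunk(loose.initial_scope, last_row).to_sql
    # => "... WHERE (`timestamp` >= '2014-11-18 13:47:43') ..."
    # The inclusive '>=' revisits rows with that timestamp; rows already deleted are
    # gone, so nothing is processed twice and all of the old rows get purged.
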
@@ -0,0 +1,36 @@
+ ENV['RACK_ENV'] = 'test'
+
+ require 'clean_sweep'
+ require 'factory_girl'
+ require 'fileutils'
+ require 'active_record'
+ require 'mysql2'
+ RSpec.configure do |config|
+   config.include FactoryGirl::Syntax::Methods
+   config.formatter = :progress
+   #config.order = 'random'
+
+   config.before(:suite) do
+     FactoryGirl.find_definitions
+   end
+
+ end
+
+ logdir = File.expand_path "../../log", __FILE__
+ FileUtils.mkdir_p logdir
+ logfile = File.open(File.join(logdir, "test.log"), "w+")
+ ActiveRecord::Base.logger = Logger.new(logfile)
+
+ database = {
+   encoding: 'utf8',
+   adapter: 'mysql2',
+   username: ENV['DB_USERNAME'] || 'root',
+   host: 'localhost',
+   password: ENV['DB_PASSWORD'],
+ }
+ db_name = ENV['DB_SCHEMA'] || 'cstest'
+ connection = Mysql2::Client.new(database)
+ connection.query "CREATE DATABASE IF NOT EXISTS #{db_name}"
+ database[:database] = db_name
+
+ ActiveRecord::Base.establish_connection(database)
@@ -0,0 +1,111 @@
+ require 'spec_helper'
+
+ describe CleanSweep::TableSchema do
+
+   before do
+     Comment.create_table
+   end
+
+   context "using ascending account, timestamp index" do
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', ascending: true }
+
+     it 'should read comments' do
+       expect(schema.primary_key.columns.map(&:name)).to eq([:id])
+       expect(schema.traversing_key.columns.map(&:name)).to eq([:account, :timestamp])
+     end
+
+     it 'should produce an ascending chunk clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("(`account` > 5 OR (`account` = 5 AND `timestamp` > '2014-12-01 23:13:25'))")
+     end
+
+     it 'should produce all select columns' do
+       expect(schema.select_columns).to eq([:id, :account, :timestamp])
+     end
+
+     it 'should produce the ascending order clause' do
+       expect(schema.initial_scope.to_sql).to include('`account` ASC,`timestamp` ASC')
+     end
+
+
+     it 'should produce an insert statement' do
+       schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp'
+       rows = account_and_timestamp_rows
+       expect(schema.insert_statement(Comment, rows)).to eq("insert into `comments` (`id`,`account`,`timestamp`) values (1001,5,'2014-12-02 01:13:25'),(1002,2,'2014-12-02 00:13:25'),(1005,5,'2014-12-01 23:13:25')")
+     end
+   end
+
+   context "using descending account, timestamp index" do
+
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', ascending: false }
+
+     it 'should produce a descending where clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("(`account` < 5 OR (`account` = 5 AND `timestamp` < '2014-12-01 23:13:25'))")
+     end
+
+
+     it 'should produce the descending order clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("`account` DESC,`timestamp` DESC")
+     end
+
+   end
+
+   context "using account, timestamp index first column only" do
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', first_only: true }
+
+     it 'should select all the rows' do
+       expect(schema.select_columns).to eq([:id, :account, :timestamp])
+     end
+
+     it 'should only query using the first column of the index' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include(" (`account` >= 5) ")
+
+     end
+
+   end
+
+   it 'should not care about case' do
+     CleanSweep::TableSchema.new Comment, key_name: 'primary'
+   end
+
+   it 'should work without a descending index' do
+     schema = CleanSweep::TableSchema.new Comment
+     expect(schema.primary_key.columns.map(&:name)).to eq([:id])
+     expect(schema.traversing_key).to be_nil
+   end
+
+   it 'should produce minimal select columns' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'PRIMARY'
+     expect(schema.select_columns).to eq([:id])
+   end
+
+   it 'should produce the from clause with an index' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp'
+     expect(schema.initial_scope.to_sql).to include("`comments` FORCE INDEX(comments_on_timestamp)")
+   end
+
+   it 'should include additional columns' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', extra_columns: %w[seen id]
+     expect(schema.select_columns).to eq([:seen, :id, :account, :timestamp])
+     rows = account_and_timestamp_rows
+     rows.map! { |row| row.unshift 1 } # Insert 'seen' value to beginning of row
+     expect(schema.insert_statement(Comment, rows)).to eq("insert into `comments` (`seen`,`id`,`account`,`timestamp`) values (1,1001,5,'2014-12-02 01:13:25'),(1,1002,2,'2014-12-02 00:13:25'),(1,1005,5,'2014-12-01 23:13:25')")
+
+   end
+
+
+   def account_and_timestamp_rows
+     rows = []
+     t = Time.parse '2014-12-01 17:13:25'
+     rows << [1001, 5, t]
+     rows << [1002, 2, t - 1.hour]
+     rows << [1005, 5, t - 2.hours]
+   end
+ end