cleansweep 1.0.0

This diff shows the contents of a publicly available package version as released to its public registry. It is provided for informational purposes only and reflects the package as it appears in that registry.
@@ -0,0 +1,72 @@
+ class CleanSweep::TableSchema::IndexSchema < Struct.new :name, :model, :ascending
+
+   attr_accessor :columns, :name, :model, :ascending, :first_only
+
+   def initialize name, model
+     @model = model
+     @columns = []
+     @name = name
+   end
+
+   # Add a column
+   def << col_name
+     @columns << CleanSweep::TableSchema::ColumnSchema.new(col_name, model)
+   end
+
+   # Take columns referenced by this index and add them to the list if they
+   # are not present. Record their position in the list because the position will
+   # be where they are located in a row of values passed in later to #scope_to_next_chunk
+   def add_columns_to select_columns
+     @columns.each do | column |
+       pos = select_columns.index column.name
+       if pos.nil?
+         select_columns << column.name
+         pos = select_columns.size - 1
+       end
+       column.select_position = pos
+     end
+   end
+
+   def order(scope)
+     direction = ascending ? 'ASC' : 'DESC'
+     if @first_only
+       scope.order("#{columns.first.quoted_name} #{direction}")
+     else
+       scope.order(columns.map { |col| "#{col.quoted_name} #{direction}"}.join(","))
+     end
+   end
+
+   def scope_to_next_chunk(scope, last_row)
+     query_args = {}
+     if @first_only
+       query_args[columns.first.name] = columns.first.value(last_row)
+     else
+       columns.each do |column|
+         query_args[column.name] = column.value(last_row)
+       end
+     end
+     scope.where(chunk_clause, query_args)
+   end
+
+   private
+
+   def chunk_clause
+     @chunk_clause ||=
+       if @first_only
+         # If we're only using the first column, you have to do an inclusive comparison
+         "#{columns.first.quoted_name} #{ascending ? ">=" : "<="} :#{columns.first.name}"
+       else
+         # If you are using all columns of the index, build the expression recursively
+         add_term(columns.dup)
+       end
+   end
+
+   def add_term(columns)
+     column = columns.shift
+     clause = "#{column.quoted_name} #{ascending ? ">" : "<"} :#{column.name}"
+     if columns.any?
+       clause << " OR (#{column.quoted_name} = :#{column.name} AND #{add_term columns})"
+     end
+     return clause
+   end
+ end
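
The clause built by #chunk_clause above is easiest to see against a concrete index. The sketch below is illustrative only and is not part of the gem: it assumes the Comment model defined in the specs further down, and it assumes ColumnSchema quotes names with backticks the way the spec expectations show. The private method is reached with send.

    # Hypothetical (account, timestamp) index, ascending: the full-index clause
    # nests strict comparisons so traversal resumes strictly after the last row seen.
    index = CleanSweep::TableSchema::IndexSchema.new 'comments_on_account_timestamp', Comment
    index << 'account'
    index << 'timestamp'
    index.ascending = true
    index.send(:chunk_clause)
    # => "`account` > :account OR (`account` = :account AND `timestamp` > :timestamp)"

    # With first_only, only the leading column is compared, and inclusively, so rows
    # that share the last chunk's leading value are revisited rather than skipped.
    first_only = CleanSweep::TableSchema::IndexSchema.new 'comments_on_account_timestamp', Comment
    first_only << 'account'
    first_only << 'timestamp'
    first_only.ascending  = true
    first_only.first_only = true
    first_only.send(:chunk_clause)
    # => "`account` >= :account"
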
@@ -0,0 +1,112 @@
+
+ class CleanSweep::TableSchema
+
+   # The list of columns used when selecting, the union of pk and traversing key columns
+   attr_reader :select_columns
+
+   # The schema for the primary key
+   attr_reader :primary_key
+
+   # The schema for the traversing key, or nil
+   attr_reader :traversing_key
+
+   attr_reader :name
+
+   def initialize(model, options={})
+
+     traversing_key_name = options[:key_name]
+     ascending = options.include?(:ascending) ? options[:ascending] : true
+     first_only = options[:first_only]
+     @model = model
+     @name = @model.table_name
+     @select_columns = (options[:extra_columns] && options[:extra_columns].map(&:to_sym)) || []
+
+     key_schemas = build_indexes
+
+     # Primary key only supported, but we could probably get around this by adding
+     # all columns as 'primary key columns'
+     raise "Table #{model.table_name} must have a primary key" unless key_schemas.include? 'primary'
+
+     @primary_key = key_schemas['primary']
+     @primary_key.add_columns_to @select_columns
+     if traversing_key_name
+       traversing_key_name.downcase!
+       raise "BTREE Index #{traversing_key_name} not found" unless key_schemas.include? traversing_key_name
+       @traversing_key = key_schemas[traversing_key_name]
+       @traversing_key.add_columns_to @select_columns
+       @traversing_key.ascending = ascending
+       @traversing_key.first_only = first_only
+     end
+
+   end
+
+   def insert_statement(target_model, rows)
+     "insert into #{target_model.quoted_table_name} (#{quoted_column_names}) values #{quoted_row_values(rows)}"
+   end
+
+   def delete_statement(rows)
+     rec_criteria = rows.map do | row |
+       row_compares = []
+       @primary_key.columns.each do |column|
+         row_compares << "#{column.quoted_name} = #{column.quoted_value(row)}"
+       end
+       "(" + row_compares.join(" AND ") + ")"
+     end
+     "DELETE FROM #{@model.quoted_table_name} WHERE #{rec_criteria.join(" OR ")}"
+   end
+
+   def initial_scope
+     scope = @model.all.select(quoted_column_names).from(from_clause)
+     scope = @traversing_key.order(scope) if @traversing_key
+     return scope
+   end
+
+   def scope_to_next_chunk scope, last_row
+     if @traversing_key.blank?
+       scope
+     else
+       @traversing_key.scope_to_next_chunk(scope, last_row)
+     end
+   end
+
+   def first_only?
+     @traversing_key && @traversing_key.first_only
+   end
+
+   private
+
+   def from_clause
+     table_name = @model.quoted_table_name
+     table_name += " FORCE INDEX(#{@traversing_key.name})" if @traversing_key
+     return table_name
+   end
+
+   def quoted_column_names
+     select_columns.map{|c| "`#{c}`"}.join(",")
+   end
+
+   def quoted_row_values(rows)
+     rows.map do |vec|
+       quoted_column_values = vec.map do |col_value|
+         @model.connection.quote(col_value)
+       end.join(",")
+       "(#{quoted_column_values})"
+     end.join(",")
+   end
+
+   def build_indexes
+     indexes = {}
+     column_details = @model.connection.select_rows "show indexes from #{@model.quoted_table_name}"
+     column_details.each do | col |
+       key_name = col[2].downcase    # SHOW INDEXES: Key_name
+       col_name = col[4].downcase    # SHOW INDEXES: Column_name
+       type = col[10]                # SHOW INDEXES: Index_type
+       next if key_name != 'primary' && type != 'BTREE' # Only BTREE indexes supported for traversing
+       indexes[key_name] ||= IndexSchema.new key_name, @model
+       indexes[key_name] << col_name
+     end
+     return indexes
+   end
+
+ end
+
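
For orientation, the methods above compose into a chunked purge loop roughly like the one sketched here. This is illustrative only, not code shipped in the gem: it assumes the Comment model and table defined in the specs below, and it rebuilds each chunk scope from initial_scope, which is what the runner's printed queries further down suggest.

    schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp'

    # First chunk: primary-key plus traversing-key columns, ordered along the index.
    rows = Comment.connection.select_rows schema.initial_scope.limit(500).to_sql
    until rows.empty?
      # Delete this chunk by primary key...
      Comment.connection.execute schema.delete_statement(rows)
      # ...then scope the next chunk to start just past the last row we saw.
      next_scope = schema.scope_to_next_chunk(schema.initial_scope.limit(500), rows.last)
      rows = Comment.connection.select_rows next_scope.to_sql
    end
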
@@ -0,0 +1,3 @@
+ module CleanSweep
+   VERSION = "1.0.0"
+ end
@@ -0,0 +1,11 @@
+ require "clean_sweep/version"
+
+ module CleanSweep
+
+ end
+
+ require 'clean_sweep/purge_stopped'
+ require 'clean_sweep/table_schema'
+ require 'clean_sweep/table_schema/column_schema'
+ require 'clean_sweep/table_schema/index_schema'
+ require 'clean_sweep/purge_runner'
data/lib/cleansweep.rb ADDED
@@ -0,0 +1 @@
+ require 'clean_sweep'
@@ -0,0 +1,36 @@
+ class Book < ActiveRecord::Base
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists
+       books (
+         `id` int(11) auto_increment,
+         `bin` int(11),
+         `publisher` varchar(64),
+         `title` varchar(64),
+         primary key (id),
+         key book_index_by_bin(bin, id)
+       )
+     EOF
+   end
+
+ end
+
+ FactoryGirl.define do
+   factory :book do | book |
+     book.publisher "Random House"
+     book.sequence(:bin) { | n | (n % 3) * 1000 }
+     book.sequence(:title) { |n| "Jaws, Part #{n}"}
+   end
+ end
+
+ class BookTemp < ActiveRecord::Base
+
+   self.table_name = 'book_vault'
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists book_vault like books
+     EOF
+   end
+ end
@@ -0,0 +1,26 @@
+ class Comment < ActiveRecord::Base
+
+   def self.create_table
+     connection.execute <<-EOF
+       create temporary table if not exists
+       comments (
+         `id` int(11) primary key auto_increment,
+         `timestamp` datetime,
+         `account` int(11),
+         `seen` boolean,
+         key comments_on_account_timestamp(account, timestamp),
+         key comments_on_timestamp(timestamp desc)
+       )
+     EOF
+     connection.execute 'truncate table comments'
+   end
+
+ end
+
+ FactoryGirl.define do
+   factory :comment do | comment |
+     comment.timestamp Time.now
+     comment.seen false
+     comment.sequence(:account) { | n | (n % 3) * 100 }
+   end
+ end
@@ -0,0 +1,222 @@
+ require 'spec_helper'
+
+ require 'active_support/testing/time_helpers'
+ describe CleanSweep::PurgeRunner do
+
+   context 'PurgeRunner' do
+     include ActiveSupport::Testing::TimeHelpers
+     before do
+       travel_to Time.parse("2014-12-02 13:47:43 -0800")
+     end
+     after do
+       travel_back
+     end
+
+     context "using comments" do
+       before do
+         Comment.create_table
+       end
+       context "with duplicate rows" do
+
+         # This testcase demonstrates a weakness in the index traversal
+         # which is that if you aren't using a unique index or the first_only option,
+         # you can miss rows.
+         #
+         # In this case we have some duplicate rows but because the chunk_size is
+         # set low, we don't get all the duplicates in one chunk. And they miss
+         # the next chunk because we are looking for values greater than the
+         # columns in the current chunk.
+         #
+         # If you use the first_only option it means it builds the where clause using only
+         # the first column of the index, and it also uses the >=, <= operators instead
+         # of >, <. So it picks up all the rows.
+         #
+
+         before do
+           10.times { create(:comment, timestamp: 2.weeks.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.weeks.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+           10.times { create(:comment, timestamp: 2.days.ago, seen: false) }
+         end
+
+         it "can miss some rows" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                chunk_size: 7 do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           expect( -> {
+             purger.execute_in_batches
+           }).to change(Comment, :count).from(50).to(43) # if it deleted all dups this would be 30, not 43
+         end
+         it "won't miss rows using first_only option" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                first_only: true,
+                                                chunk_size: 7 do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           expect( -> {
+             purger.execute_in_batches
+           }).to change(Comment, :count).from(50).to(30) # first_only picks up all the duplicates, so the count drops to 30
+
+         end
+
+         it 'prints out the queries in a dry run' do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_account_timestamp' do | scope |
+             scope.where('timestamp < ?', 1.week.ago)
+           end
+           output = StringIO.new
+           purger.print_queries(output)
+           expect(output.string).to eq <<EOF
+ Initial Query:
+ SELECT `id`,`account`,`timestamp`
+ FROM `comments` FORCE INDEX(comments_on_account_timestamp)
+ WHERE (timestamp < '2014-11-25 21:47:43')
+ ORDER BY `account` ASC,`timestamp` ASC
+ LIMIT 500
+ Chunk Query:
+ SELECT `id`,`account`,`timestamp`
+ FROM `comments` FORCE INDEX(comments_on_account_timestamp)
+ WHERE (timestamp < '2014-11-25 21:47:43') AND (`account` > 0 OR (`account` = 0 AND `timestamp` > '2014-11-18 21:47:43'))\n ORDER BY `account` ASC,`timestamp` ASC
+ LIMIT 500
+ Delete Statement:
+ DELETE
+ FROM `comments`
+ WHERE (`id` = 2)
+ EOF
+         end
+       end
+       context "with unique rows" do
+         before do
+           # Create 10 comments going back 0..9 days...
+           10.times { |i| create(:comment, timestamp: i.days.ago) }
+         end
+
+         it "ascends the index" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                stop_after: 5
+           begin
+             purger.execute_in_batches
+           rescue CleanSweep::PurgeStopped
+           end
+           expect(Comment.count).to eq(5)
+           # Only old comments deleted before stopping
+           expect(Comment.where('timestamp >= ?', 4.days.ago).count).to eq(5)
+         end
+         it "descends the index" do
+           purger = CleanSweep::PurgeRunner.new model: Comment,
+                                                index: 'comments_on_timestamp',
+                                                reverse: true,
+                                                stop_after: 5
+           begin
+             purger.execute_in_batches
+           rescue CleanSweep::PurgeStopped
+           end
+           # Delete from the most recent comments, so only old ones are left.
+           expect(Comment.count).to eq(5)
+           expect(Comment.where('timestamp <= ?', 4.days.ago).count).to eq(5)
+
+         end
+       end
+     end
+
+
+     context "using books" do
+
+       before do
+         @total_book_size = 50
+         Book.create_table
+         @total_book_size.times { create(:book) }
+       end
+
+       after do
+         Book.delete_all
+       end
+
+       it 'waits for history' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              max_history: 100,
+                                              chunk_size: 10
+         mysql_status = purger.mysql_status
+         expect(mysql_status).to receive(:check!).exactly(6).times
+
+         purger.execute_in_batches
+
+       end
+
+       it 'should not check when there are no limits' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              chunk_size: 4
+
+         expect(purger.mysql_status).to be_nil
+       end
+
+       it 'purges books' do
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              chunk_size: 4
+
+         count = purger.execute_in_batches
+         expect(count).to be(@total_book_size)
+         expect(Book.count).to be 0
+       end
+
+       it 'copies books' do
+         BookTemp.create_table
+         purger = CleanSweep::PurgeRunner.new model: Book,
+                                              dest_model: BookTemp,
+                                              chunk_size: 4,
+                                              index: 'book_index_by_bin'
+
+         count = purger.execute_in_batches
+         expect(count).to be(@total_book_size)
+         expect(BookTemp.count).to eq(@total_book_size)
+       end
+
+     end
+   end
+ end
+
+ describe CleanSweep::PurgeRunner::MysqlStatus do
+
+   context "mysql status check tool" do
+
+     let(:mysql_status) do
+       CleanSweep::PurgeRunner::MysqlStatus.new model: Book, max_history: 100, max_repl_lag: 100
+     end
+
+     before do
+       Book.create_table
+     end
+
+     it "fetches slave status" do
+       mysql_status.get_replication_lag
+     end
+     it "checks history and pauses" do
+       allow(mysql_status).to receive(:get_history_length).and_return(101, 95, 89)
+       expect(mysql_status).to receive(:pause).twice
+       mysql_status.check!
+     end
+     it "checks replication and pauses" do
+       allow(mysql_status).to receive(:get_replication_lag).and_return(101, 95, 89)
+       expect(mysql_status).to receive(:pause).twice
+       mysql_status.check!
+     end
+
+     it "checks and continues" do
+       allow(mysql_status).to receive(:get_history_length).and_return(80)
+       expect(mysql_status).not_to receive(:pause)
+       mysql_status.check!
+     end
+
+     it "fetches innodb status" do
+       mysql_status.get_history_length
+     end
+   end
+
+ end
+
+
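
To make the duplicate-row caveat documented above concrete, the sketch below shows the two chunk clauses involved. It is illustrative only (the row values are hypothetical, and it assumes the comments table from the specs exists) and uses the TableSchema class shown earlier.

    schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp'
    last_row = [7, Time.parse('2014-11-18 13:47:43')]   # [id, timestamp] of the last row in a chunk
    schema.scope_to_next_chunk(schema.initial_scope, last_row).to_sql
    # => "... WHERE (`timestamp` > '2014-11-18 13:47:43') ..."
    # The strict '>' skips any remaining rows that share this timestamp, which is
    # why the first test above stops at 43 rows instead of 30.

    loose = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp', first_only: true
    loose.scope_to_next_chunk(loose.initial_scope, last_row).to_sql
    # => "... WHERE (`timestamp` >= '2014-11-18 13:47:43') ..."
    # The inclusive '>=' revisits rows with that timestamp; rows already deleted are
    # gone, so nothing is processed twice and all of the old rows get purged.
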
@@ -0,0 +1,36 @@
+ ENV['RACK_ENV'] = 'test'
+
+ require 'clean_sweep'
+ require 'factory_girl'
+ require 'fileutils'
+ require 'active_record'
+ require 'mysql2'
+ RSpec.configure do |config|
+   config.include FactoryGirl::Syntax::Methods
+   config.formatter = :progress
+   #config.order = 'random'
+
+   config.before(:suite) do
+     FactoryGirl.find_definitions
+   end
+
+ end
+
+ logdir = File.expand_path "../../log", __FILE__
+ FileUtils.mkdir_p logdir
+ logfile = File.open(File.join(logdir, "test.log"), "w+")
+ ActiveRecord::Base.logger = Logger.new(logfile)
+
+ database = {
+   encoding: 'utf8',
+   adapter: 'mysql2',
+   username: ENV['DB_USERNAME'] || 'root',
+   host: 'localhost',
+   password: ENV['DB_PASSWORD'],
+ }
+ db_name = ENV['DB_SCHEMA'] || 'cstest'
+ connection = Mysql2::Client.new(database)
+ connection.query "CREATE DATABASE IF NOT EXISTS #{db_name}"
+ database[:database] = db_name
+
+ ActiveRecord::Base.establish_connection(database)
@@ -0,0 +1,111 @@
+ require 'spec_helper'
+
+ describe CleanSweep::TableSchema do
+
+   before do
+     Comment.create_table
+   end
+
+   context "using ascending account, timestamp index" do
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', ascending: true }
+
+     it 'should read comments' do
+       expect(schema.primary_key.columns.map(&:name)).to eq([:id])
+       expect(schema.traversing_key.columns.map(&:name)).to eq([:account, :timestamp])
+     end
+
+     it 'should produce an ascending chunk clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("(`account` > 5 OR (`account` = 5 AND `timestamp` > '2014-12-01 23:13:25'))")
+     end
+
+     it 'should produce all select columns' do
+       expect(schema.select_columns).to eq([:id, :account, :timestamp])
+     end
+
+     it 'should produce the ascending order clause' do
+       expect(schema.initial_scope.to_sql).to include('`account` ASC,`timestamp` ASC')
+     end
+
+
+     it 'should produce an insert statement' do
+       schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp'
+       rows = account_and_timestamp_rows
+       expect(schema.insert_statement(Comment, rows)).to eq("insert into `comments` (`id`,`account`,`timestamp`) values (1001,5,'2014-12-02 01:13:25'),(1002,2,'2014-12-02 00:13:25'),(1005,5,'2014-12-01 23:13:25')")
+     end
+   end
+
+   context "using descending account, timestamp index" do
+
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', ascending: false }
+
+     it 'should produce a descending where clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("(`account` < 5 OR (`account` = 5 AND `timestamp` < '2014-12-01 23:13:25'))")
+     end
+
+
+     it 'should produce the descending order clause' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include("`account` DESC,`timestamp` DESC")
+     end
+
+   end
+
+   context "using account, timestamp index first column only" do
+     let(:schema) { CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', first_only: true }
+
+     it 'should select all the rows' do
+       expect(schema.select_columns).to eq([:id, :account, :timestamp])
+     end
+
+     it 'should only query using the first column of the index' do
+       rows = account_and_timestamp_rows
+       expect(schema.scope_to_next_chunk(schema.initial_scope, rows.last).to_sql)
+         .to include(" (`account` >= 5) ")
+
+     end
+
+   end
+
+   it 'should not care about case' do
+     CleanSweep::TableSchema.new Comment, key_name: 'primary'
+   end
+
+   it 'should work without a descending index' do
+     schema = CleanSweep::TableSchema.new Comment
+     expect(schema.primary_key.columns.map(&:name)).to eq([:id])
+     expect(schema.traversing_key).to be_nil
+   end
+
+   it 'should produce minimal select columns' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'PRIMARY'
+     expect(schema.select_columns).to eq([:id])
+   end
+
+   it 'should produce the from clause with an index' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_timestamp'
+     expect(schema.initial_scope.to_sql).to include("`comments` FORCE INDEX(comments_on_timestamp)")
+   end
+
+   it 'should include additional columns' do
+     schema = CleanSweep::TableSchema.new Comment, key_name: 'comments_on_account_timestamp', extra_columns: %w[seen id]
+     expect(schema.select_columns).to eq([:seen, :id, :account, :timestamp])
+     rows = account_and_timestamp_rows
+     rows.map! { |row| row.unshift 1 } # Insert 'seen' value to beginning of row
+     expect(schema.insert_statement(Comment, rows)).to eq("insert into `comments` (`seen`,`id`,`account`,`timestamp`) values (1,1001,5,'2014-12-02 01:13:25'),(1,1002,2,'2014-12-02 00:13:25'),(1,1005,5,'2014-12-01 23:13:25')")
+
+   end
+
+
+   def account_and_timestamp_rows
+     rows = []
+     t = Time.parse '2014-12-01 17:13:25'
+     rows << [1001, 5, t]
+     rows << [1002, 2, t - 1.hour]
+     rows << [1005, 5, t - 2.hours]
+   end
+ end