schema_transformer 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -62,6 +62,25 @@ Thank you. Have a very nice day.
62
62
  tung@walle $
63
63
  </pre>
64
64
 
65
+ It is strongly recommended that the tables that you are altering have updated_at timestamp columns
66
+ with indexes on them. If the tables do not have updated_at columns at all, then only the last 100,000 rows
67
+ get updated in the final sync in the "schema_transformer switch ..." command. If the updated_at column
68
+ is available then the final sync will use it to update all the data. However, because it uses the
69
+ updated_at column, it is extremely important that the updated_at column is indexed or the final
70
+ "schema_transformer switch ..." command possibly could be slow. Because of this, you should analyze your
71
+ database schema for missing updated_at columns and indexes with the command "schema_transformer analyze".
72
+
73
+ Example:
74
+ <pre>
75
+ tung@walle $ schema_transformer analyze
76
+ Analyzing your database schema...
77
+ There are no tables without the updated_at timestamp. GOOD
78
+ These tables do have an updated_at timestamp, but no index:
79
+ users
80
+ tung@walle $
81
+ </pre>
82
+
83
+
65
84
  FAQ
66
85
  -------
67
86
 
data/TODO CHANGED
@@ -8,4 +8,3 @@
8
8
  * TODO:
9
9
  * add logging again: schema_transformer.log
10
10
  * updated_at if its available and use a real time vs some guess
11
- * clean up spec: use real mocks, get rid of $testing_books
@@ -7,4 +7,6 @@ require 'fileutils'
7
7
  require File.expand_path('../schema_transformer/version', __FILE__)
8
8
  require File.expand_path('../schema_transformer/help', __FILE__)
9
9
  require File.expand_path('../schema_transformer/base', __FILE__)
10
- require File.expand_path('../schema_transformer/cli', __FILE__)
10
+ require File.expand_path('../schema_transformer/transform', __FILE__)
11
+ require File.expand_path('../schema_transformer/cli', __FILE__)
12
+ require File.expand_path('../schema_transformer/analyze', __FILE__)
@@ -0,0 +1,52 @@
1
+ module SchemaTransformer
2
+ class Analyze < Base
3
+ def self.run(options)
4
+ @analyze = Analyze.new(options[:base] || Dir.pwd, options)
5
+ puts "Analyzing your database schema..."
6
+ if @analyze.no_timestamps.empty?
7
+ puts "There are no tables without the updated_at timestamp. GOOD"
8
+ else
9
+ puts "These tables do not have updated_at timestamps: "
10
+ puts " #{@analyze.no_timestamps.join("\n ")}"
11
+ end
12
+ if @analyze.no_indexes.empty?
13
+ puts "There are no tables with updated_at timestamp but no indexes. GOOD"
14
+ else
15
+ puts "These tables do have an updated_at timestamp, but no index: "
16
+ puts " #{@analyze.no_indexes.join("\n ")}"
17
+ end
18
+ if @analyze.no_timestamps.empty? and @analyze.no_indexes.empty?
19
+ puts "Everything looks GOOD!"
20
+ else
21
+ puts "You should add the missing columns or indexes."
22
+ end
23
+ end
24
+
25
+ # tells which tables are missing updated_at and index on updated_at
26
+ def no_timestamps
27
+ @conn.tables - timestamps
28
+ end
29
+
30
+ def timestamps
31
+ tables = []
32
+ @conn.tables.each do |table|
33
+ has_updated_at = @conn.columns(table).detect {|col| col.name == "updated_at" }
34
+ tables << table if has_updated_at
35
+ end
36
+ tables
37
+ end
38
+
39
+ def indexes
40
+ tables = []
41
+ timestamps.each do |table|
42
+ has_index = @conn.indexes(table).detect {|col| col.columns == ["updated_at"] }
43
+ tables << table if has_index
44
+ end
45
+ tables
46
+ end
47
+
48
+ def no_indexes
49
+ timestamps - indexes
50
+ end
51
+ end
52
+ end
@@ -1,16 +1,6 @@
1
1
  module SchemaTransformer
2
- class UsageError < RuntimeError; end
3
-
4
2
  class Base
5
- include Help
6
- @@stagger = 0
7
- def self.run(options)
8
- @@stagger = options[:stagger] || 0
9
- @transformer = SchemaTransformer::Base.new(options[:base] || Dir.pwd)
10
- @transformer.run(options)
11
- end
12
-
13
- attr_reader :options, :temp_table, :table
3
+ attr_reader :options
14
4
  def initialize(base = File.expand_path("..", __FILE__), options = {})
15
5
  @base = base
16
6
  @db, @log, @mail = ActiveWrapper.setup(
@@ -20,241 +10,6 @@ module SchemaTransformer
20
10
  )
21
11
  @db.establish_connection
22
12
  @conn = ActiveRecord::Base.connection
23
-
24
- @batch_size = options[:batch_size] || 10_000
25
- end
26
-
27
- def run(options)
28
- @action = options[:action].first
29
- case @action
30
- when "generate"
31
- self.generate
32
- help(:generate)
33
- when "sync"
34
- help(:sync_progress)
35
- table = options[:action][1]
36
- self.gather_info(table)
37
- self.create
38
- self.sync
39
- help(:sync)
40
- when "switch"
41
- table = options[:action][1]
42
- self.gather_info(table)
43
- self.switch
44
- self.cleanup
45
- help(:switch)
46
- else
47
- raise UsageError, "Invalid action #{@action}"
48
- end
49
- end
50
-
51
- def generate
52
- data = {}
53
- ask "What is the name of the table you want to alter?"
54
- data[:table] = gets(:table)
55
- ask <<-TXT
56
- What is the modification to the table?
57
- Examples 1:
58
- ADD COLUMN smart tinyint(1) DEFAULT '0'
59
- Examples 2:
60
- ADD INDEX idx_name (name)
61
- Examples 3:
62
- ADD COLUMN smart tinyint(1) DEFAULT '0', DROP COLUMN full_name
63
- TXT
64
- data[:mod] = gets(:mod)
65
- path = transform_file(data[:table])
66
- FileUtils.mkdir(File.dirname(path)) unless File.exist?(File.dirname(path))
67
- File.open(path,"w") { |f| f << data.to_json }
68
- @table = data[:table]
69
- data
70
- end
71
-
72
- def gather_info(table)
73
- if table.nil?
74
- raise UsageError, "You need to specific the table name: schema_transformer #{@action} <table_name>"
75
- end
76
- data = JSON.parse(IO.read(transform_file(table)))
77
- @table = data["table"]
78
- @mod = data["mod"]
79
- # variables need for rest of the program
80
- @temp_table = "#{@table}_st_temp"
81
- @trash_table = "#{@table}_st_trash"
82
- @model = define_model(@table)
83
- end
84
-
85
- def create
86
- if self.temp_table_exists?
87
- @temp_model = define_model(@temp_table)
88
- else
89
- sql_create = %{CREATE TABLE #{@temp_table} LIKE #{@table}}
90
- sql_mod = %{ALTER TABLE #{@temp_table} #{@mod}}
91
- @conn.execute(sql_create)
92
- @conn.execute(sql_mod)
93
- @temp_model = define_model(@temp_table)
94
- end
95
- reset_column_info
96
- end
97
-
98
- def sync
99
- res = @conn.execute("SELECT max(id) AS max_id FROM `#{@temp_table}`")
100
- start = res.fetch_row[0].to_i + 1 # nil case is okay: [nil][0].to_i => 0
101
- find_in_batches(@table, :start => start, :batch_size => @batch_size) do |batch|
102
- # puts "batch #{batch.inspect}"
103
- lower = batch.first
104
- upper = batch.last
105
-
106
- columns = insert_columns_sql
107
- sql = %Q{
108
- INSERT INTO #{@temp_table} (
109
- SELECT #{columns}
110
- FROM #{@table} WHERE id >= #{lower} AND id <= #{upper}
111
- )
112
- }
113
- # puts sql
114
- @conn.execute(sql)
115
-
116
- if @@stagger > 0
117
- log("Staggering: delaying for #{@@stagger} seconds before next batch insert")
118
- sleep(@@stagger)
119
- end
120
- end
121
- end
122
-
123
- def final_sync
124
- @temp_model = define_model(@temp_table)
125
- reset_column_info
126
-
127
- sync
128
- columns = subset_columns.collect{|x| "#{@temp_table}.`#{x}` = #{@table}.`#{x}`" }.join(", ")
129
- # need to limit the final sync, if we do the entire table it takes a long time
130
- limit_cond = get_limit_cond
131
- sql = %{
132
- UPDATE #{@temp_table} INNER JOIN #{@table}
133
- ON #{@temp_table}.id = #{@table}.id
134
- SET #{columns}
135
- WHERE #{limit_cond}
136
- }
137
- # puts sql
138
- @conn.execute(sql)
139
- end
140
-
141
- def switch
142
- final_sync
143
- to_trash = %Q{RENAME TABLE #{@table} TO #{@trash_table}}
144
- from_temp = %Q{RENAME TABLE #{@temp_table} TO #{@table}}
145
- @conn.execute(to_trash)
146
- @conn.execute(from_temp)
147
- end
148
-
149
- def cleanup
150
- sql = %Q{DROP TABLE #{@trash_table}}
151
- @conn.execute(sql)
152
- end
153
-
154
- def get_limit_cond
155
- if @model.column_names.include?("updated_at")
156
- "#{@table}.updated_at >= '#{1.day.ago.strftime("%Y-%m-%d")}'"
157
- else
158
- sql = "select id from #{@table} order by id desc limit 100000"
159
- resp = @conn.execute(sql)
160
- bound = 0
161
- while row = resp.fetch_row do
162
- bound = row[0].to_i
163
- end
164
- "#{@table}.id >= #{bound}"
165
- end
166
- end
167
-
168
- # the parameter is only for testing
169
- def gets(name = nil)
170
- STDIN.gets.strip
171
- end
172
-
173
- def subset_columns
174
- removed = @model.column_names - @temp_model.column_names
175
- subset = @model.column_names - removed
176
- end
177
-
178
- def insert_columns_sql
179
- # existing subset
180
- subset = subset_columns
181
-
182
- # added
183
- added_s = @temp_model.column_names - @model.column_names
184
- added = @temp_model.columns.
185
- select{|c| added_s.include?(c.name) }.
186
- collect{|c| "#{extract_default(c)} AS `#{c.name}`" }
187
-
188
- # combine both
189
- columns = subset.collect{|x| "`#{x}`"} + added
190
- sql = columns.join(", ")
191
- end
192
-
193
- # returns Array of record ids
194
- def find(table, cond)
195
- sql = "SELECT id FROM #{table} WHERE #{cond}"
196
- response = @conn.execute(sql)
197
- results = []
198
- while row = response.fetch_row do
199
- results << row[0].to_i
200
- end
201
- results
202
- end
203
-
204
- # lower memory heavy version of ActiveRecord's find in batches
205
- def find_in_batches(table, options = {})
206
- raise "You can't specify an order, it's forced to be #{batch_order}" if options[:order]
207
- raise "You can't specify a limit, it's forced to be the batch_size" if options[:limit]
208
-
209
- start = options.delete(:start).to_i
210
- batch_size = options.delete(:batch_size) || 1000
211
- order_limit = "ORDER BY id LIMIT #{batch_size}"
212
-
213
- records = find(table, "id >= #{start} #{order_limit}")
214
- while records.any?
215
- yield records
216
-
217
- break if records.size < batch_size
218
- records = find(table, "id > #{records.last} #{order_limit}")
219
- end
220
- end
221
-
222
- def define_model(table)
223
- # Object.const_set(table.classify, Class.new(ActiveRecord::Base))
224
- Object.class_eval(<<-code)
225
- class #{table.classify} < ActiveRecord::Base
226
- set_table_name "#{table}"
227
- end
228
- code
229
- table.classify.constantize # returns the constant
230
- end
231
-
232
- def transform_file(table)
233
- @base+"/config/schema_transformations/#{table}.json"
234
- end
235
-
236
- def temp_table_exists?
237
- @conn.table_exists?(@temp_table)
238
- end
239
-
240
- def reset_column_info
241
- @model.reset_column_information
242
- @temp_model.reset_column_information
243
- end
244
-
245
- def log(msg)
246
- @log.info(msg)
247
- end
248
-
249
- private
250
- def ask(msg)
251
- puts msg
252
- print "> "
253
- end
254
-
255
- def extract_default(col)
256
- @conn.quote(col.default)
257
13
  end
258
-
259
14
  end
260
- end
15
+ end
@@ -1,8 +1,3 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'rubygems'
4
- require 'active_wrapper'
5
-
6
1
  module SchemaTransformer
7
2
  class CLI
8
3
 
@@ -80,18 +75,23 @@ module SchemaTransformer
80
75
  end
81
76
 
82
77
  def run
83
- begin
84
- SchemaTransformer::Base.run(options)
85
- rescue UsageError => e
86
- puts "Usage Error: #{e.message}"
87
- puts help_message
88
- puts option_parser
78
+ @action = options[:action].first
79
+ if @action == "analyze"
80
+ SchemaTransformer::Analyze.run(options)
81
+ else
82
+ begin
83
+ SchemaTransformer::Transform.run(options)
84
+ rescue UsageError => e
85
+ puts "Usage Error: #{e.message}"
86
+ puts help_message
87
+ puts option_parser
88
+ end
89
89
  end
90
90
  end
91
91
 
92
92
  private
93
93
  def help_message
94
- "Available actions: generate, sync, switch"
94
+ "Available actions: analyze, generate, sync, switch"
95
95
  end
96
96
  end
97
97
 
@@ -0,0 +1,252 @@
1
+ module SchemaTransformer
2
+ class UsageError < RuntimeError; end
3
+
4
+ class Transform < Base
5
+ include Help
6
+ @@stagger = 0
7
+ def self.run(options)
8
+ @@stagger = options[:stagger] || 0
9
+ @transformer = SchemaTransformer::Transform.new(options[:base] || Dir.pwd)
10
+ @transformer.run(options)
11
+ end
12
+
13
+ attr_reader :temp_table, :table
14
+ def initialize(base = File.expand_path("..", __FILE__), options = {})
15
+ super
16
+ @batch_size = options[:batch_size] || 10_000
17
+ end
18
+
19
+ def run(options)
20
+ @action = options[:action].first
21
+ case @action
22
+ when "generate"
23
+ self.generate
24
+ help(:generate)
25
+ when "sync"
26
+ help(:sync_progress)
27
+ table = options[:action][1]
28
+ self.gather_info(table)
29
+ self.create
30
+ self.sync
31
+ help(:sync)
32
+ when "switch"
33
+ table = options[:action][1]
34
+ self.gather_info(table)
35
+ self.switch
36
+ self.cleanup
37
+ help(:switch)
38
+ else
39
+ raise UsageError, "Invalid action #{@action}"
40
+ end
41
+ end
42
+
43
+ def generate
44
+ data = {}
45
+ ask "What is the name of the table you want to alter?"
46
+ data[:table] = gets(:table)
47
+ ask <<-TXT
48
+ What is the modification to the table?
49
+ Examples 1:
50
+ ADD COLUMN smart tinyint(1) DEFAULT '0'
51
+ Examples 2:
52
+ ADD INDEX idx_name (name)
53
+ Examples 3:
54
+ ADD COLUMN smart tinyint(1) DEFAULT '0', DROP COLUMN full_name
55
+ TXT
56
+ data[:mod] = gets(:mod)
57
+ path = transform_file(data[:table])
58
+ FileUtils.mkdir(File.dirname(path)) unless File.exist?(File.dirname(path))
59
+ File.open(path,"w") { |f| f << data.to_json }
60
+ @table = data[:table]
61
+ data
62
+ end
63
+
64
+ def gather_info(table)
65
+ if table.nil?
66
+ raise UsageError, "You need to specify the table name: schema_transformer #{@action} <table_name>"
67
+ end
68
+ data = JSON.parse(IO.read(transform_file(table)))
69
+ @table = data["table"]
70
+ @mod = data["mod"]
71
+ # variables needed for rest of the program
72
+ @temp_table = "#{@table}_st_temp"
73
+ @trash_table = "#{@table}_st_trash"
74
+ @model = define_model(@table)
75
+ end
76
+
77
+ def create
78
+ if self.temp_table_exists?
79
+ @temp_model = define_model(@temp_table)
80
+ else
81
+ sql_create = %{CREATE TABLE #{@temp_table} LIKE #{@table}}
82
+ sql_mod = %{ALTER TABLE #{@temp_table} #{@mod}}
83
+ @conn.execute(sql_create)
84
+ @conn.execute(sql_mod)
85
+ @temp_model = define_model(@temp_table)
86
+ end
87
+ reset_column_info
88
+ end
89
+
90
+ def sync
91
+ res = @conn.execute("SELECT max(id) AS max_id FROM `#{@temp_table}`")
92
+ start = res.fetch_row[0].to_i + 1 # nil case is okay: [nil][0].to_i => 0
93
+ find_in_batches(@table, :start => start, :batch_size => @batch_size) do |batch|
94
+ # puts "batch #{batch.inspect}"
95
+ lower = batch.first
96
+ upper = batch.last
97
+
98
+ columns = insert_columns_sql
99
+ sql = %Q{
100
+ INSERT INTO #{@temp_table} (
101
+ SELECT #{columns}
102
+ FROM #{@table} WHERE id >= #{lower} AND id <= #{upper}
103
+ )
104
+ }
105
+ # puts sql
106
+ @conn.execute(sql)
107
+
108
+ if @@stagger > 0
109
+ log("Staggering: delaying for #{@@stagger} seconds before next batch insert")
110
+ sleep(@@stagger)
111
+ end
112
+ end
113
+ end
114
+
115
+ def final_sync
116
+ @temp_model = define_model(@temp_table)
117
+ reset_column_info
118
+
119
+ sync
120
+ columns = subset_columns.collect{|x| "#{@temp_table}.`#{x}` = #{@table}.`#{x}`" }.join(", ")
121
+ # need to limit the final sync, if we do the entire table it takes a long time
122
+ limit_cond = get_limit_cond
123
+ sql = %{
124
+ UPDATE #{@temp_table} INNER JOIN #{@table}
125
+ ON #{@temp_table}.id = #{@table}.id
126
+ SET #{columns}
127
+ WHERE #{limit_cond}
128
+ }
129
+ # puts sql
130
+ @conn.execute(sql)
131
+ end
132
+
133
+ def switch
134
+ final_sync
135
+ to_trash = %Q{RENAME TABLE #{@table} TO #{@trash_table}}
136
+ from_temp = %Q{RENAME TABLE #{@temp_table} TO #{@table}}
137
+ @conn.execute(to_trash)
138
+ @conn.execute(from_temp)
139
+ end
140
+
141
+ def cleanup
142
+ sql = %Q{DROP TABLE #{@trash_table}}
143
+ @conn.execute(sql)
144
+ end
145
+
146
+ def get_limit_cond
147
+ if @model.column_names.include?("updated_at")
148
+ "#{@table}.updated_at >= '#{1.day.ago.strftime("%Y-%m-%d")}'"
149
+ else
150
+ sql = "select id from #{@table} order by id desc limit 100000"
151
+ resp = @conn.execute(sql)
152
+ bound = 0
153
+ while row = resp.fetch_row do
154
+ bound = row[0].to_i
155
+ end
156
+ "#{@table}.id >= #{bound}"
157
+ end
158
+ end
159
+
160
+ # the parameter is only for testing
161
+ def gets(name = nil)
162
+ STDIN.gets.strip
163
+ end
164
+
165
+ def subset_columns
166
+ removed = @model.column_names - @temp_model.column_names
167
+ subset = @model.column_names - removed
168
+ end
169
+
170
+ def insert_columns_sql
171
+ # existing subset
172
+ subset = subset_columns
173
+
174
+ # added
175
+ added_s = @temp_model.column_names - @model.column_names
176
+ added = @temp_model.columns.
177
+ select{|c| added_s.include?(c.name) }.
178
+ collect{|c| "#{extract_default(c)} AS `#{c.name}`" }
179
+
180
+ # combine both
181
+ columns = subset.collect{|x| "`#{x}`"} + added
182
+ sql = columns.join(", ")
183
+ end
184
+
185
+ # returns Array of record ids
186
+ def find(table, cond)
187
+ sql = "SELECT id FROM #{table} WHERE #{cond}"
188
+ response = @conn.execute(sql)
189
+ results = []
190
+ while row = response.fetch_row do
191
+ results << row[0].to_i
192
+ end
193
+ results
194
+ end
195
+
196
+ # lower memory heavy version of ActiveRecord's find in batches
197
+ def find_in_batches(table, options = {})
198
+ raise "You can't specify an order, it's forced to be #{batch_order}" if options[:order]
199
+ raise "You can't specify a limit, it's forced to be the batch_size" if options[:limit]
200
+
201
+ start = options.delete(:start).to_i
202
+ batch_size = options.delete(:batch_size) || 1000
203
+ order_limit = "ORDER BY id LIMIT #{batch_size}"
204
+
205
+ records = find(table, "id >= #{start} #{order_limit}")
206
+ while records.any?
207
+ yield records
208
+
209
+ break if records.size < batch_size
210
+ records = find(table, "id > #{records.last} #{order_limit}")
211
+ end
212
+ end
213
+
214
+ def define_model(table)
215
+ # Object.const_set(table.classify, Class.new(ActiveRecord::Base))
216
+ Object.class_eval(<<-code)
217
+ class #{table.classify} < ActiveRecord::Base
218
+ set_table_name "#{table}"
219
+ end
220
+ code
221
+ table.classify.constantize # returns the constant
222
+ end
223
+
224
+ def transform_file(table)
225
+ @base+"/config/schema_transformations/#{table}.json"
226
+ end
227
+
228
+ def temp_table_exists?
229
+ @conn.table_exists?(@temp_table)
230
+ end
231
+
232
+ def reset_column_info
233
+ @model.reset_column_information
234
+ @temp_model.reset_column_information
235
+ end
236
+
237
+ def log(msg)
238
+ @log.info(msg)
239
+ end
240
+
241
+ private
242
+ def ask(msg)
243
+ puts msg
244
+ print "> "
245
+ end
246
+
247
+ def extract_default(col)
248
+ @conn.quote(col.default)
249
+ end
250
+
251
+ end
252
+ end