RubyGems - shiba - Versions diffs - 0.1.2 → 0.2.0 - Mend

shiba 0.1.2 → 0.2.0

Files changed (36) hide show

data/lib/shiba/fuzzer.rb ADDED Viewed

@@ -0,0 +1,77 @@
+require 'shiba/index_stats'
+module Shiba
+  # Produces synthetic ("fuzzed") index statistics for the connected database:
+  # every table is given a made-up row count and every indexed column a fixed
+  # rows-per-value figure, so estimates can be made without real data.
+  class Fuzzer
+    # Fake row counts handed to tables ranked as large or small.
+    # (Kept above `private`, which has no effect on constants anyway.)
+    BIG_FUZZ_SIZE   = 5_000
+    SMALL_FUZZ_SIZE = 100
+    attr_reader :connection
+    # connection - object whose #query(sql) returns the result rows as hashes
+    #              (rows are read with String keys below).
+    def initialize(connection)
+      @connection = connection
+      @index_stats = IndexStats.new
+    end
+    # Loads the index definitions, then overwrites each table's row count with
+    # a guessed size and each column's rows_per with 1 (unique index) or
+    # 2 (non-unique index).
+    #
+    # Returns the populated IndexStats.
+    def fuzz!
+      fetch_index!
+      table_sizes = guess_table_sizes
+      @index_stats.tables.each do |table_name, table|
+        table.count = table_sizes[table_name]
+        # each_value: the index name is not needed here, and the former inner
+        # block parameter shadowed the table name.
+        table.indexes.each_value do |index|
+          rows_per = index.unique ? 1 : 2
+          index.columns.each { |column| column.rows_per = rows_per }
+        end
+      end
+      @index_stats
+    end
+    private
+    # Reads information_schema.statistics for the current schema and records
+    # every index column in @index_stats.
+    def fetch_index!
+      records = connection.query("select * from information_schema.statistics where table_schema = DATABASE()")
+      records.each do |row|
+        # Normalise column names to lower case without mutating the driver's row.
+        h = row.each_with_object({}) { |(key, value), acc| acc[key.downcase] = value }
+        # NON_UNIQUE may arrive as the String "0" or, presumably when the driver
+        # casts column types, as the Integer 0; to_i accepts both. A plain
+        # comparison with "0" would mark every index as non-unique in the
+        # Integer case.
+        is_unique = h['non_unique'].to_i == 0
+        @index_stats.add_index_column(h['table_name'], h['index_name'], h['column_name'], h['cardinality'].to_i, is_unique)
+      end
+    end
+    # Create fake table sizes based on the table's index count.
+    # The more indexes, the bigger the table. Seems to rank tables fairly well.
+    #
+    # Returns a Hash of table name => BIG_FUZZ_SIZE or SMALL_FUZZ_SIZE; empty
+    # when the schema has no matching indexes.
+    def guess_table_sizes
+      index_count_query = "select TABLE_NAME as table_name, count(*) as index_count
+        from information_schema.statistics where table_schema = DATABASE()
+        and seq_in_index = 1 and index_name not like 'fk_rails%'
+        group by table_name order by index_count"
+      index_counts = connection.query(index_count_query).to_a
+      # Nothing to rank; indexing into an empty result would yield nil below.
+      return {} if index_counts.empty?
+      # 90th table percentile based on number of indexes
+      # round down so we don't blow up on small tables
+      large_table_idx = (index_counts.size * 0.9).floor
+      large_table_index_count = index_counts[large_table_idx]["index_count"].to_f
+      index_counts.each_with_object({}) do |row, sizes|
+        is_big = row["index_count"] >= large_table_index_count
+        sizes[row["table_name"]] = is_big ? BIG_FUZZ_SIZE : SMALL_FUZZ_SIZE
+      end
+    end
+  end
+end

data/lib/shiba/index.rb CHANGED Viewed

@@ -1,6 +1,9 @@
-module Shiba
-  module Index
+require 'yaml'
+require 'pp'
+require 'shiba/index_stats'
+module Shiba
+  class Index
     # Given the path to the information_schema.statistics output, returns index statistics keyed by table name.
     # Examples:
     # Exploring the schema:
@@ -12,140 +15,16 @@ module Shiba
     # => {:table_schema=>"blog_test", :table_name=>"users", :non_unique=>"0", :column_name=>"id", :cardinality=>"2", :is_visible=>"YES", :"expression\n"=>"NULL\n"}
     #
     def self.parse(path)
+      stats = IndexStats.new
       tables = {}
       records = read(path)
       headers = records.shift.map { |header| header.downcase }
       records.each do |r|
         h = Hash[headers.zip(r)]
         h["cardinality"] = h["cardinality"].to_i
-        table = tables[h['table_name']] ||= []
-        table.push(h)
-      end
-      tables
-    end
-    # Getting a row count for a table:
-    #
-    # schema_stats = Index.parse("./shiba/schema_stats.tsv")
-    # users_count = Index.count(:users, schema_stats)
-    # => 2
-    def self.count(table, schema)
-      return nil unless schema[table]
-      primary = schema[table].detect { |index| index['index_name'] == "PRIMARY" }
-      if primary.nil?
-        # find the highest cardinality of a unique index, if it exists
-        schema[table].map do |index|
-          if index['non_unique'].to_i == 0
-            index['cardinality']
-          else
-            nil
-          end
-        end.compact.max
-      else
-        primary['cardinality'].to_i
-      end
-    end
-    def self.fuzzed?(table, schema)
-      return nil unless schema[table]
-      schema[table].first['fuzzed']
-    end
-    def self.estimate_key(table, key, parts, schema)
-      table_count = count(table, schema)
-      return nil unless table_count
-      key_stat = schema[table].detect do |i|
-        i["index_name"] == key && i["column_name"] == parts.last
-      end
-      return nil unless key_stat
-      return 0 if key_stat['cardinality'] == 0
-      table_count / key_stat['cardinality']
-    end
-    def self.query(connection)
-      records = connection.query("select * from information_schema.statistics where table_schema = DATABASE()")
-      tables = {}
-      records.each do |h|
-        h.keys.each { |k| h[k.downcase] = h.delete(k) }
-        h["cardinality"] = h["cardinality"].to_i
-        table = tables[h['table_name']] ||= []
-        table.push(h)
-      end
-      tables
-    end
-    # Up the cardinality on our indexes.
-    # Non uniques have a little less cardinality.
-    def self.fuzz!(stats)
-      db = stats.values.first.first['table_schema']
-      table_sizes = self.guess_table_sizes(db)
-      stats.each do |table,indexes|
-        indexes.each do |idx|
-          idx['cardinality'] = table_sizes[table]
-          if idx['non_unique'] == 1
-            idx['cardinality'] = (idx['cardinality'] * 0.7).round
-          end
-          idx['fuzzed'] = true
-        end
-      end
-    end
-    MINIMUM_TABLE_SIZE = 500
-    # Approximate median size of the tables is less than 500.
-    def self.insufficient_stats?(stats)
-      if stats.length == 0
-        return true
-      end
-      # Calculate a rough median.
-      primary_keys = stats.map do |_,indexes|
-        indexes.detect { |idx| idx['index_name'] == 'PRIMARY' } || {}
+        stats.add_index_column(h['table_name'], h['index_name'], h['column_name'], h['cardinality'], h['non_unique'] == "0")
       end
-      table_counts = primary_keys.map { |pk| pk['cardinality'].to_i }
-      median = table_counts[table_counts.size/2]
-      return median < MINIMUM_TABLE_SIZE
-    end
-    STANDARD_FUZZ_SIZE = 5_000
-    # Create fake table sizes based on the table's index count.
-    # The more indexes, the bigger the table. Seems to rank tables fairly well.
-    def self.guess_table_sizes(db)
-      db = Shiba.connection.escape(db)
-      index_count_query = "select TABLE_NAME as table_name, count(*) as index_count
-        from information_schema.statistics where table_schema = '#{db}'
-        and seq_in_index = 1 and index_name not like 'fk_rails%'
-        group by table_name order by index_count"
-        index_counts = Shiba.connection.query(index_count_query).to_a
-        # 80th table percentile based on number of indexes
-        large_table_idx = (index_counts.size * 0.8).round
-        large_table = index_counts[large_table_idx]
-        sizes = Hash[index_counts.map(&:values)]
-        sizes.each do |table_name, index_count|
-          if index_count == 0
-            index_count = 1
-          end
-          sizes[table_name] = STANDARD_FUZZ_SIZE * (index_count / large_table['index_count'].to_f)
-        end
-        sizes
+      stats
     end
     protected

data/lib/shiba/index_stats.rb ADDED Viewed

@@ -0,0 +1,210 @@
+require 'yaml'
+require 'active_support/core_ext/hash/keys'
+module Shiba
+  # In-memory model of index statistics: tables hold a row count and a set of
+  # named indexes, and every index holds an ordered list of columns that know
+  # how many rows match a single value (rows_per). Can be built incrementally
+  # (add_index_column) or from a previously dumped hash, and dumped via to_yaml.
+  class IndexStats
+    # tables - optional Hash keyed by table name in the layout read by
+    #          build_from_hash! ('count' plus an 'indexes' hash).
+    def initialize(tables = {})
+      @tables = tables
+      build_from_hash!
+    end
+    # True when at least one table has been recorded.
+    def any?
+      @tables.any?
+    end
+    # A table: its name, its row count (nil until known) and a Hash of
+    # index name => Index.
+    Table = Struct.new(:name, :count, :indexes) do
+      # YAML serialisation hook: emits the struct members except 'name'.
+      def encode_with(coder)
+        if count.nil?
+          # No unique key supplied a row count, so take the largest known column
+          # cardinality as a best guess. This must happen before the snapshot
+          # below, otherwise the guess never reaches the output. compact: some
+          # columns carry no cardinality at all.
+          self.count = indexes.values.flat_map { |index| index.columns.map(&:raw_cardinality) }.compact.max
+        end
+        coder.map = to_h.stringify_keys
+        coder.map.delete('name')
+        coder.tag = nil
+      end
+      # Returns the index called index_name, creating it on first use.
+      def build_index(index_name, is_unique)
+        indexes[index_name] ||= Index.new(self, index_name, [], is_unique)
+      end
+      # Appends a column to the named index. The first unique index column seen
+      # also provides the table's row count when none is set yet.
+      def add_index_column(index_name, column_name, rows_per, cardinality, is_unique)
+        index = build_index(index_name, is_unique)
+        index.columns << Column.new(column_name, index, rows_per, cardinality)
+        if is_unique && !count
+          # set row count from unique index
+          self.count = cardinality
+        end
+      end
+    end
+    # An index: owning table, name, ordered columns and uniqueness flag.
+    Index = Struct.new(:table, :name, :columns, :unique) do
+      # Appends a column for which only the cardinality is known.
+      def add_column(column_name, cardinality)
+        # Column.new takes (column, index, rows_per, cardinality); the previous
+        # three-argument call in a different order always raised ArgumentError.
+        columns << Column.new(column_name, self, nil, cardinality)
+      end
+      # YAML serialisation hook: emits the struct members except the
+      # back-reference to the table.
+      def encode_with(coder)
+        coder.map = to_h.stringify_keys
+        coder.map.delete('table')
+        coder.tag = nil
+      end
+    end
+    # One column of an index.
+    class Column
+      # column      - column name
+      # index       - owning Index (used to reach the table's row count)
+      # rows_per    - Integer row count, String percentage (e.g. "10%") or nil
+      # cardinality - number of distinct values, or nil when unknown
+      def initialize(column, index, rows_per, cardinality)
+        @column = column
+        @index = index
+        @rows_per = rows_per
+        @cardinality = cardinality
+      end
+      attr_reader :column
+      attr_writer :rows_per
+      # Row count of the owning table (may be nil).
+      def table_count
+        @index.table.count
+      end
+      # Cardinality exactly as supplied to the constructor.
+      def raw_cardinality
+        @cardinality
+      end
+      # Estimated number of rows matching one value of this column.
+      #
+      # An Integer is returned as stored; a String percentage is resolved
+      # against the table count; otherwise the value is derived from
+      # table count / cardinality. Returns nil when it cannot be determined.
+      def rows_per
+        return @rows_per if @rows_per.is_a?(Integer)
+        return nil if table_count.nil?
+        if @rows_per.nil?
+          if table_count == 0
+            @rows_per = 1
+          elsif @cardinality.nil?
+            # Nothing to derive an estimate from.
+            return nil
+          elsif @cardinality == 0
+            # Avoid dividing by zero; a zero cardinality is reported as 0 rows.
+            @rows_per = 0
+          else
+            @rows_per = (table_count / @cardinality).round
+          end
+        elsif @rows_per.is_a?(String)
+          @rows_per = ((@rows_per.to_f / 100.0) * table_count.to_f).round
+        end
+        @rows_per
+      end
+      # YAML serialisation hook: emits the column name and rows_per, the latter
+      # as a percentage string once it exceeds a threshold share of the table.
+      def encode_with(coder)
+        coder.map = {'column' => @column}
+        coder.tag = nil
+        per_value = rows_per
+        count = table_count
+        if per_value.nil? || count.nil?
+          # No ratio can be formed; emit whatever is known.
+          coder.map['rows_per'] = per_value
+          return
+        end
+        count = 1 if count == 0
+        ratio_per_item = per_value / count.to_f
+        # The bigger the table, the smaller the share that is still shown as a
+        # plain row count.
+        ratio_threshold =
+          if count <= 10
+            10_000_000 # always show a number
+          elsif count <= 1000
+            0.1
+          elsif count <= 1_000_000
+            0.01
+          else
+            # Also covers tables above 1_000_000_000 rows, which previously
+            # left the threshold nil and made the comparison below raise.
+            0.001
+          end
+        if ratio_per_item > ratio_threshold
+          coder.map['rows_per'] = (ratio_per_item * 100).round.to_s + "%"
+        else
+          coder.map['rows_per'] = per_value
+        end
+      end
+    end
+    # Replaces the raw hash in @tables (table name => {'count', 'indexes'})
+    # with Table/Index/Column objects. Each index hash supplies 'unique' and a
+    # 'columns' list of {'column', 'rows_per'} hashes.
+    def build_from_hash!
+      @tables = @tables.map do |tbl_name, tbl_hash|
+        t = Table.new(tbl_name, tbl_hash['count'], {})
+        tbl_hash['indexes'].each do |idx_name, idx_hash|
+          idx_hash['columns'].each do |col_hash|
+            t.add_index_column(idx_name, col_hash['column'], col_hash['rows_per'], nil, idx_hash['unique'])
+          end
+        end
+        [tbl_name, t]
+      end.to_h
+    end
+    attr_reader :tables
+    # Row count of the given table, or nil when the table is unknown.
+    def table_count(table)
+      tbl = @tables[table]
+      tbl.count if tbl
+    end
+    # The Index called name on the given table, or nil when either is unknown.
+    def fetch_index(table, name)
+      tbl = @tables[table]
+      return nil unless tbl
+      tbl.indexes[name]
+    end
+    # Returns the Table called name, creating an empty one on first use.
+    def build_table(name)
+      @tables[name] ||= Table.new(name, nil, {})
+    end
+    # Records one index column, creating the table and index as needed.
+    def add_index_column(table, index_name, column_name, cardinality, is_unique)
+      table = build_table(table)
+      table.add_index_column(index_name, column_name, nil, cardinality, is_unique)
+    end
+    # Rows expected per value when index `key` on table_name is used up to the
+    # last entry of parts. Returns nil when the index or column is unknown.
+    def estimate_key(table_name, key, parts)
+      index = fetch_index(table_name, key)
+      return nil unless index
+      index_part = index.columns.find { |p| p.column == parts.last }
+      return nil unless index_part
+      index_part.rows_per
+    end
+    # NOTE(review): this method calls column.delete and table.rows, neither of
+    # which is defined on the Column class or Table struct in this file, and
+    # rows_per_item is computed but never used. It looks like a leftover from a
+    # hash-based representation -- confirm and remove. Left unchanged here.
+    def convert_rows_per_to_output!
+      each_index_column do |table, column|
+        cardinality = column.delete('cardinality')
+        if table.rows == 0
+          column['rows_per'] = 1
+          next
+        end
+        # the bigger the table, the more likely we should be
+        # to show percentages for larger counts.
+        #
+        # small table, show row count up to 10% ish
+        # 100_000 - show rows up to 1000, 1%
+        # large table, 1_000_000.  show rows up to 0.1% ( 1000 )
+        # how many rows does each index value contain?
+        if cardinality
+          rows_per_item = (table.rows.to_f / cardinality.to_f)
+        else
+          rows_per_item = column.rows_per
+        end
+      end
+    end
+    # YAML dump of all tables (uses the encode_with hooks above).
+    def to_yaml
+      @tables.to_yaml
+    end
+    private
+    # Yields (table, column) for every column of every index of every table.
+    def each_index_column(&block)
+      @tables.each_value do |table|
+        table.indexes.each_value do |index|
+          index.columns.each do |column|
+            yield(table, column)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/shiba/output.rb CHANGED Viewed

@@ -1,15 +1,14 @@
 require 'yaml'
 require 'json'
 require 'fileutils'
+require 'tmpdir'
 require 'erb'
 module Shiba
   class Output
-    OUTPUT_PATH = "/tmp/shiba_results"
-    WEB_PATH = File.dirname(__FILE__) + "/../../web"
+    WEB_PATH = File.join(File.dirname(__FILE__), "..", "..", "web")
     def self.tags
-      @tags ||= YAML.load_file(File.dirname(__FILE__) + "/output/tags.yaml")
+      @tags ||= YAML.load_file(File.join(File.dirname(__FILE__), "output", "tags.yaml"))
     end
     def initialize(queries, options = {})
@@ -17,10 +16,22 @@ module Shiba
       @options = options
     end
+    def default_filename
+      @default_filename ||= "shiba_results-#{Time.now.to_i}.html"
+    end
+    def logdir
+      File.join(Dir.pwd, "log")
+    end
     def output_path
-      path ||= File.join(@options['output'], "shiba_results") if @options['output']
-      path ||= Dir.pwd + "/log/shiba_results" if File.exist?(Dir.pwd + "/log")
-      path ||= OUTPUT_PATH
+      return @options['output'] if @options['output']
+      if File.exist?(logdir)
+        FileUtils.mkdir_p(File.join(logdir, "shiba_results"))
+        File.join(Dir.pwd, "log", "shiba_results", default_filename)
+      else
+        File.join(Dir.tmpdir, default_filename)
+      end
     end
     def js_path
@@ -38,28 +49,23 @@ module Shiba
     end
     def make_web!
-      FileUtils.mkdir_p(js_path)
-      js = Dir.glob(WEB_PATH + "/dist/*.js").map { |f| File.basename(f) }
-      js.each do |f|
-        system("cp #{WEB_PATH}/dist/#{f} #{js_path}")
-      end
+      js  = Dir.glob(File.join(WEB_PATH, "dist", "*.js"))
+      css = Dir.glob(File.join(WEB_PATH, "*.css"))
       data = {
         js: js,
+        css: css,
         queries: @queries,
         tags: self.class.tags,
         url: remote_url
       }
-      system("cp #{WEB_PATH}/*.css #{output_path}")
-      erb = ERB.new(File.read(WEB_PATH + "/../web/results.html.erb"))
-      File.open(output_path + "/results.html", "w+") do |f|
+      erb = ERB.new(File.read(File.join(WEB_PATH, "..", "web", "results.html.erb")))
+      File.open(output_path, "w+") do |f|
         f.write(erb.result(binding))
       end
-      puts "done, results are in " + File.join(output_path, "results.html")
+      output_path
     end
   end
 end