shiba 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ require 'shiba/index_stats'
2
+
3
module Shiba
  # Produces synthetic ("fuzzed") index statistics for the current database.
  #
  # The index layout is read from information_schema.statistics through the
  # supplied connection, but the real cardinalities are replaced with made-up
  # values: each table gets a fake row count based on how many indexes it has,
  # and each index column gets a fixed rows-per-value estimate.
  class Fuzzer
    # Fake row count given to tables at or above the 90th percentile by index count.
    BIG_FUZZ_SIZE = 5_000
    # Fake row count given to every other table.
    SMALL_FUZZ_SIZE = 100

    attr_reader :connection

    # connection - object responding to #query(sql) and returning an
    #              enumerable of row hashes keyed by column name
    #              (NOTE(review): presumably a Mysql2::Client - confirm).
    def initialize(connection)
      @connection = connection
      @index_stats = IndexStats.new
    end

    # Loads the index layout and overwrites its statistics with fuzzed values.
    #
    # Returns the populated IndexStats.
    def fuzz!
      fetch_index!
      table_sizes = guess_table_sizes

      @index_stats.tables.each do |table_name, table|
        # Tables absent from the size guess (e.g. only fk_rails% indexes)
        # keep a nil count, exactly as before.
        table.count = table_sizes[table_name]

        table.indexes.each_value do |index|
          # A unique index matches one row per value; anything else is
          # assumed to match two.
          rows_per = index.unique ? 1 : 2
          index.columns.each { |column| column.rows_per = rows_per }
        end
      end

      @index_stats
    end

    private

    # Reads every index column of the current schema into @index_stats.
    def fetch_index!
      records = connection.query("select * from information_schema.statistics where table_schema = DATABASE()")

      records.each do |record|
        # Column names may come back upper-cased; normalise them
        # without mutating the driver's row.
        row = {}
        record.each { |key, value| row[key.to_s.downcase] = value }

        # NON_UNIQUE may arrive as the Integer 0 or the String "0" depending on
        # the driver's casting; comparing the string form accepts both.
        is_unique = row['non_unique'].to_s == "0"

        @index_stats.add_index_column(row['table_name'], row['index_name'], row['column_name'], row['cardinality'].to_i, is_unique)
      end
    end

    # Create fake table sizes based on the table's index count.
    # The more indexes, the bigger the table. Seems to rank tables fairly well.
    #
    # Returns a Hash of table name => fake row count. The Hash is empty when
    # the schema has no (non fk_rails) indexes at all.
    def guess_table_sizes
      index_count_query = "select TABLE_NAME as table_name, count(*) as index_count
        from information_schema.statistics where table_schema = DATABASE()
        and seq_in_index = 1 and index_name not like 'fk_rails%'
        group by table_name order by index_count"

      index_counts = connection.query(index_count_query).to_a
      return {} if index_counts.empty?

      # 90th table percentile based on number of indexes
      # round down so we don't blow up on small tables
      large_table_idx = (index_counts.size * 0.9).floor
      large_table_index_count = index_counts[large_table_idx]["index_count"].to_f

      index_counts.each_with_object({}) do |row, sizes|
        is_big = row["index_count"].to_f >= large_table_index_count
        sizes[row["table_name"]] = is_big ? BIG_FUZZ_SIZE : SMALL_FUZZ_SIZE
      end
    end
  end
end
data/lib/shiba/index.rb CHANGED
@@ -1,6 +1,9 @@
1
- module Shiba
2
- module Index
1
+ require 'yaml'
2
+ require 'pp'
3
+ require 'shiba/index_stats'
3
4
 
5
+ module Shiba
6
+ class Index
4
7
  # Given the path to the information_schema.statistics output, returns index statistics keyed by table name.
5
8
  # Examples:
6
9
  # Exploring the schema:
@@ -12,140 +15,16 @@ module Shiba
12
15
  # => {:table_schema=>"blog_test", :table_name=>"users", :non_unique=>"0", :column_name=>"id", :cardinality=>"2", :is_visible=>"YES", :"expression\n"=>"NULL\n"}
13
16
  #
14
17
  def self.parse(path)
18
+ stats = IndexStats.new
15
19
  tables = {}
16
20
  records = read(path)
17
21
  headers = records.shift.map { |header| header.downcase }
18
22
  records.each do |r|
19
23
  h = Hash[headers.zip(r)]
20
24
  h["cardinality"] = h["cardinality"].to_i
21
- table = tables[h['table_name']] ||= []
22
- table.push(h)
23
- end
24
- tables
25
- end
26
-
27
- # Getting a row count for a table:
28
- #
29
- # schema_stats = Index.parse("./shiba/schema_stats.tsv")
30
- # users_count = Index.count(:users, schema_stats)
31
- # => 2
32
- def self.count(table, schema)
33
- return nil unless schema[table]
34
- primary = schema[table].detect { |index| index['index_name'] == "PRIMARY" }
35
- if primary.nil?
36
- # find the highest cardinality of a unique index, if it exists
37
- schema[table].map do |index|
38
- if index['non_unique'].to_i == 0
39
- index['cardinality']
40
- else
41
- nil
42
- end
43
- end.compact.max
44
- else
45
- primary['cardinality'].to_i
46
- end
47
- end
48
-
49
- def self.fuzzed?(table, schema)
50
- return nil unless schema[table]
51
- schema[table].first['fuzzed']
52
- end
53
-
54
- def self.estimate_key(table, key, parts, schema)
55
- table_count = count(table, schema)
56
- return nil unless table_count
57
-
58
- key_stat = schema[table].detect do |i|
59
- i["index_name"] == key && i["column_name"] == parts.last
60
- end
61
-
62
- return nil unless key_stat
63
-
64
- return 0 if key_stat['cardinality'] == 0
65
- table_count / key_stat['cardinality']
66
- end
67
-
68
- def self.query(connection)
69
- records = connection.query("select * from information_schema.statistics where table_schema = DATABASE()")
70
- tables = {}
71
- records.each do |h|
72
- h.keys.each { |k| h[k.downcase] = h.delete(k) }
73
- h["cardinality"] = h["cardinality"].to_i
74
- table = tables[h['table_name']] ||= []
75
- table.push(h)
76
- end
77
- tables
78
- end
79
-
80
-
81
- # Up the cardinality on our indexes.
82
- # Non uniques have a little less cardinality.
83
- def self.fuzz!(stats)
84
- db = stats.values.first.first['table_schema']
85
- table_sizes = self.guess_table_sizes(db)
86
-
87
-
88
-
89
- stats.each do |table,indexes|
90
- indexes.each do |idx|
91
- idx['cardinality'] = table_sizes[table]
92
-
93
- if idx['non_unique'] == 1
94
- idx['cardinality'] = (idx['cardinality'] * 0.7).round
95
- end
96
-
97
- idx['fuzzed'] = true
98
- end
99
- end
100
- end
101
-
102
- MINIMUM_TABLE_SIZE = 500
103
-
104
- # Approximate median size of the tables is less than 500.
105
- def self.insufficient_stats?(stats)
106
- if stats.length == 0
107
- return true
108
- end
109
-
110
- # Calculate a rough median.
111
- primary_keys = stats.map do |_,indexes|
112
- indexes.detect { |idx| idx['index_name'] == 'PRIMARY' } || {}
25
+ stats.add_index_column(h['table_name'], h['index_name'], h['column_name'], h['cardinality'], h['non_unique'] == "0")
113
26
  end
114
-
115
- table_counts = primary_keys.map { |pk| pk['cardinality'].to_i }
116
- median = table_counts[table_counts.size/2]
117
-
118
- return median < MINIMUM_TABLE_SIZE
119
- end
120
-
121
- STANDARD_FUZZ_SIZE = 5_000
122
-
123
- # Create fake table sizes based on the table's index count.
124
- # The more indexes, the bigger the table. Seems to rank tables fairly well.
125
- def self.guess_table_sizes(db)
126
- db = Shiba.connection.escape(db)
127
- index_count_query = "select TABLE_NAME as table_name, count(*) as index_count
128
- from information_schema.statistics where table_schema = '#{db}'
129
- and seq_in_index = 1 and index_name not like 'fk_rails%'
130
- group by table_name order by index_count"
131
-
132
- index_counts = Shiba.connection.query(index_count_query).to_a
133
-
134
- # 80th table percentile based on number of indexes
135
- large_table_idx = (index_counts.size * 0.8).round
136
- large_table = index_counts[large_table_idx]
137
-
138
- sizes = Hash[index_counts.map(&:values)]
139
-
140
- sizes.each do |table_name, index_count|
141
- if index_count == 0
142
- index_count = 1
143
- end
144
-
145
- sizes[table_name] = STANDARD_FUZZ_SIZE * (index_count / large_table['index_count'].to_f)
146
- end
147
-
148
- sizes
27
+ stats
149
28
  end
150
29
 
151
30
  protected
@@ -0,0 +1,210 @@
1
+ require 'yaml'
2
+ require 'active_support/core_ext/hash/keys'
3
+
4
module Shiba
  # In-memory model of index statistics: tables, their indexes, and for each
  # indexed column an estimate of how many rows share one index value.
  #
  # Instances are either built incrementally with #add_index_column or
  # rebuilt from the plain Hash structure that #to_yaml emits.
  class IndexStats

    # tables - Hash of table name => { 'count' => Integer or nil,
    #          'indexes' => { index name => { 'unique' => Boolean,
    #          'columns' => [{ 'column' => String, 'rows_per' => Integer,
    #          percentage String or nil }] } } }.
    def initialize(tables = {})
      @tables = tables
      build_from_hash!
    end

    # True when at least one table is known.
    def any?
      @tables.any?
    end

    # A table: its name, its row count (may be nil) and a Hash of
    # index name => Index.
    Table = Struct.new(:name, :count, :indexes) do
      # YAML serialisation hook: emits count and indexes as a plain mapping.
      def encode_with(coder)
        if count.nil?
          # uuuugly. No unique keys. We'll take our best guess: the largest
          # known cardinality. This must happen before the map is built,
          # otherwise the guess never reaches the output.
          cardinalities = indexes.values.flat_map { |index| index.columns.map(&:raw_cardinality) }
          self.count = cardinalities.compact.max
        end

        coder.map = to_h.stringify_keys
        # The name is already the key this table is stored under.
        coder.map.delete('name')
        coder.tag = nil
      end

      # Returns the Index called index_name, creating it on first use.
      def build_index(index_name, is_unique)
        indexes[index_name] ||= Index.new(self, index_name, [], is_unique)
      end

      # Appends a column to the named index, creating the index if needed.
      #
      # rows_per    - Integer, percentage String ("5%") or nil to derive it
      #               from cardinality
      # cardinality - number of distinct values reported for the column, or nil
      def add_index_column(index_name, column_name, rows_per, cardinality, is_unique)
        index = build_index(index_name, is_unique)
        index.columns << Column.new(column_name, index, rows_per, cardinality)

        return unless is_unique

        # Set the row count from unique indexes. Keep the largest cardinality
        # seen: for a composite unique index only the last column's value
        # (presumably the cardinality of the whole key) approximates the row
        # count; the leading columns report fewer distinct values.
        self.count = [count, cardinality].compact.max
      end
    end

    # An index: owning Table, name, ordered Array of Column and a unique flag.
    Index = Struct.new(:table, :name, :columns, :unique) do
      # Appends a column whose rows-per-value will be derived from cardinality.
      def add_column(column_name, cardinality)
        columns << Column.new(column_name, self, nil, cardinality)
      end

      # YAML serialisation hook: emits name, columns and unique as a mapping.
      def encode_with(coder)
        coder.map = to_h.stringify_keys
        # Drop the back-reference to the table.
        coder.map.delete('table')
        coder.tag = nil
      end
    end

    # One column of an index together with its selectivity estimate.
    class Column
      # column      - column name
      # index       - owning Index (used to reach the table's row count)
      # rows_per    - Integer, percentage String or nil
      # cardinality - distinct value count, or nil when unknown
      def initialize(column, index, rows_per, cardinality)
        @column = column
        @index = index
        @rows_per = rows_per
        @cardinality = cardinality
      end

      attr_reader :column
      attr_writer :rows_per

      # Row count of the table this column belongs to (may be nil).
      def table_count
        @index.table.count
      end

      # The cardinality this column was created with, untouched.
      def raw_cardinality
        @cardinality
      end

      # Estimated number of rows matching a single value of this column.
      #
      # An Integer estimate is returned as is. A percentage String is
      # converted against the table's row count, and a missing estimate is
      # derived from the cardinality; both results are memoised.
      #
      # Returns nil when the table's row count is unknown, or when there is
      # neither an estimate nor a cardinality to derive one from.
      def rows_per
        return @rows_per if @rows_per.is_a?(Integer)

        total = table_count
        return nil if total.nil?

        if @rows_per.nil?
          @rows_per = estimate_rows_per(total)
        elsif @rows_per.is_a?(String)
          @rows_per = ((@rows_per.to_f / 100.0) * total.to_f).round
        end
        @rows_per
      end

      # YAML serialisation hook: emits the column name and rows_per, the
      # latter as a percentage String when it is a large share of the table.
      def encode_with(coder)
        coder.map = { 'column' => @column, 'rows_per' => encoded_rows_per }
        coder.tag = nil
      end

      private

      # Derives rows-per-value from the cardinality for a table of total rows.
      def estimate_rows_per(total)
        return 1 if total == 0
        return nil if @cardinality.nil?

        # A cardinality of 0 on a non-empty table is treated as a single
        # distinct value (worst case) instead of dividing by zero.
        distinct = [@cardinality, 1].max
        # Float division so that rounding is meaningful; every existing value
        # matches at least one row.
        [(total.to_f / distinct).round, 1].max
      end

      # rows_per in output form: the plain value, or "N%" when it exceeds the
      # threshold for the table's size.
      def encoded_rows_per
        rows = rows_per
        count = table_count
        # Without both numbers no ratio can be computed; emit what we have.
        return rows if rows.nil? || count.nil?

        count = 1 if count == 0
        ratio_per_item = rows / count.to_f

        if ratio_per_item > ratio_threshold(count)
          (ratio_per_item * 100).round.to_s + "%"
        else
          rows
        end
      end

      # The bigger the table, the smaller the share of it that is still
      # reported as a plain row count.
      def ratio_threshold(count)
        if count <= 10
          Float::INFINITY # always show a number
        elsif count <= 1000
          0.1
        elsif count <= 1_000_000
          0.01
        else
          0.001
        end
      end
    end

    # Replaces the plain Hash passed to #initialize with Table/Index/Column
    # objects.
    def build_from_hash!
      @tables = @tables.collect do |tbl_name, tbl_hash|
        t = Table.new(tbl_name, tbl_hash['count'], {})
        (tbl_hash['indexes'] || {}).each do |idx_name, idx_hash|
          (idx_hash['columns'] || []).each do |col_hash|
            # No cardinality is stored in the hash form, only rows_per.
            t.add_index_column(idx_name, col_hash['column'], col_hash['rows_per'], nil, idx_hash['unique'])
          end
        end
        [tbl_name, t]
      end.to_h
    end

    attr_reader :tables

    # Row count of the named table, or nil when the table is unknown.
    def table_count(table)
      return @tables[table].count if @tables[table]
    end

    # The Index called name on the given table, or nil when either is unknown.
    def fetch_index(table, name)
      tbl = @tables[table]
      return nil unless tbl

      tbl.indexes[name]
    end

    # Returns the Table called name, creating an empty one on first use.
    def build_table(name)
      @tables[name] ||= Table.new(name, nil, {})
    end

    # Records one index column, creating the table and index as needed.
    def add_index_column(table, index_name, column_name, cardinality, is_unique)
      table = build_table(table)
      table.add_index_column(index_name, column_name, nil, cardinality, is_unique)
    end

    # Estimated rows matched per value when index `key` of `table_name` is
    # used up to and including the column named by parts.last.
    #
    # Returns nil when the table, index or column is unknown.
    def estimate_key(table_name, key, parts)
      index = fetch_index(table_name, key)
      return nil unless index

      index_part = index.columns.detect { |part| part.column == parts.last }
      return nil unless index_part

      index_part.rows_per
    end

    # Resolves and memoises rows_per for every index column.
    #
    # NOTE(review): this method previously used Hash-style calls that Column
    # and Table do not support and raised for any non-empty stats; the
    # percentage formatting it sketched is done by Column#encode_with.
    def convert_rows_per_to_output!
      each_index_column { |_table, column| column.rows_per }
    end

    # YAML text for all tables.
    def to_yaml
      @tables.to_yaml
    end

    private

    # Yields (table, column) for every column of every index.
    def each_index_column
      @tables.each_value do |table|
        table.indexes.each_value do |index|
          index.columns.each { |column| yield(table, column) }
        end
      end
    end
  end
end
data/lib/shiba/output.rb CHANGED
@@ -1,15 +1,14 @@
1
1
  require 'yaml'
2
2
  require 'json'
3
3
  require 'fileutils'
4
+ require 'tmpdir'
4
5
  require 'erb'
5
6
 
6
7
  module Shiba
7
8
  class Output
8
- OUTPUT_PATH = "/tmp/shiba_results"
9
-
10
- WEB_PATH = File.dirname(__FILE__) + "/../../web"
9
+ WEB_PATH = File.join(File.dirname(__FILE__), "..", "..", "web")
11
10
  def self.tags
12
- @tags ||= YAML.load_file(File.dirname(__FILE__) + "/output/tags.yaml")
11
+ @tags ||= YAML.load_file(File.join(File.dirname(__FILE__), "output", "tags.yaml"))
13
12
  end
14
13
 
15
14
  def initialize(queries, options = {})
@@ -17,10 +16,22 @@ module Shiba
17
16
  @options = options
18
17
  end
19
18
 
19
# File name used when no explicit output location was given. It carries the
# current Unix timestamp and is computed once, so repeated calls agree.
def default_filename
  return @default_filename if @default_filename

  @default_filename = "shiba_results-#{Time.now.to_i}.html"
end
22
+
23
# Path of the "log" entry inside the current working directory.
def logdir
  working_dir = Dir.pwd
  File.join(working_dir, "log")
end
26
+
20
27
  def output_path
21
- path ||= File.join(@options['output'], "shiba_results") if @options['output']
22
- path ||= Dir.pwd + "/log/shiba_results" if File.exist?(Dir.pwd + "/log")
23
- path ||= OUTPUT_PATH
28
+ return @options['output'] if @options['output']
29
+ if File.exist?(logdir)
30
+ FileUtils.mkdir_p(File.join(logdir, "shiba_results"))
31
+ File.join(Dir.pwd, "log", "shiba_results", default_filename)
32
+ else
33
+ File.join(Dir.tmpdir, default_filename)
34
+ end
24
35
  end
25
36
 
26
37
  def js_path
@@ -38,28 +49,23 @@ module Shiba
38
49
  end
39
50
 
40
51
  def make_web!
41
- FileUtils.mkdir_p(js_path)
42
-
43
- js = Dir.glob(WEB_PATH + "/dist/*.js").map { |f| File.basename(f) }
44
- js.each do |f|
45
- system("cp #{WEB_PATH}/dist/#{f} #{js_path}")
46
- end
52
+ js = Dir.glob(File.join(WEB_PATH, "dist", "*.js"))
53
+ css = Dir.glob(File.join(WEB_PATH, "*.css"))
47
54
 
48
55
  data = {
49
56
  js: js,
57
+ css: css,
50
58
  queries: @queries,
51
59
  tags: self.class.tags,
52
60
  url: remote_url
53
61
  }
54
62
 
55
- system("cp #{WEB_PATH}/*.css #{output_path}")
56
-
57
- erb = ERB.new(File.read(WEB_PATH + "/../web/results.html.erb"))
58
- File.open(output_path + "/results.html", "w+") do |f|
63
+ erb = ERB.new(File.read(File.join(WEB_PATH, "..", "web", "results.html.erb")))
64
+ File.open(output_path, "w+") do |f|
59
65
  f.write(erb.result(binding))
60
66
  end
61
67
 
62
- puts "done, results are in " + File.join(output_path, "results.html")
68
+ output_path
63
69
  end
64
70
  end
65
71
  end