shiba 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
+ require 'shiba'
+ require 'shiba/query'
+ require 'json'
+ require 'logger'
+
+ module Shiba
+   class Analyzer
+
+     def self.analyze(file, output, stats, options)
+       new(file, output, stats, options).analyze
+     end
+
+     def initialize(file, output, stats, options)
+       @file = file
+       @output = output
+       @stats = stats
+       @options = options
+       @fingerprints = {}
+     end
+
+     def analyze
+       idx = 0
+       queries = []
+       while line = @file.gets
+         # strip out colors
+         begin
+           line.gsub!(/\e\[?.*?[\@-~]/, '')
+         rescue ArgumentError => e
+           next
+         end
+
+         if line =~ /(select.*from.*)/i
+           sql = $1
+         else
+           next
+         end
+
+         if @options['limit']
+           return queries if idx == @options['limit']
+         end
+
+         if @options['index']
+           next unless idx == @options['index']
+         end
+
+         sql.chomp!
+         query = Shiba::Query.new(sql, @stats)
+
+         if !@fingerprints[query.fingerprint]
+           if sql.downcase.start_with?("select")
+             if @options['debug']
+               require 'byebug'
+               debugger
+             end
+
+             explain = analyze_query(query)
+             if explain
+               idx += 1
+               queries << explain
+             end
+           end
+         end
+
+         @fingerprints[query.fingerprint] = true
+       end
+       queries
+     end
+
+     protected
+
+     def dump_error(e, query)
+       $stderr.puts "got exception trying to explain: #{e.message}"
+       $stderr.puts "query: #{query.sql} (index #{query.index})"
+       $stderr.puts e.backtrace.join("\n")
+     end
+
+     def analyze_query(query)
+       explain = nil
+       begin
+         explain = query.explain
+       rescue Mysql2::Error => e
+         # we're picking up text on the command line that isn't valid SQL. ignore it.
+         if !(e.message =~ /You have an error in your SQL syntax/)
+           dump_error(e, query)
+         end
+       rescue StandardError => e
+         dump_error(e, query)
+       end
+       return nil unless explain
+
+       json = JSON.dump(explain.as_json)
+       write(json)
+       explain.as_json
+     end
+
+     def write(line)
+       @output.puts(line)
+     end
+   end
+ end
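A minimal usage sketch (not part of the package source above): it assumes a readable log of SQL statements, an open Shiba.connection behind the query.explain call, index stats from Shiba::Index.query, and the string option keys the analyzer checks; the file names and require path are illustrative.

    require 'shiba'
    require 'shiba/analyzer'   # assumed load path for the file above
    require 'shiba/index'

    stats = Shiba::Index.query(Shiba.connection)     # index stats keyed by table name
    File.open("query.log") do |log|                  # hypothetical log of SELECT statements
      File.open("explains.json.log", "w") do |out|
        # returns an array of EXPLAIN hashes; each is also written to `out` as a JSON line
        Shiba::Analyzer.analyze(log, out, stats, 'limit' => 100)
      end
    end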
@@ -0,0 +1,31 @@
+ require 'pathname'
+
+ module Shiba
+   module Configure
+
+     # avoiding a Rails dependency in the CLI tools for now.
+     # yanked from https://github.com/rails/rails/blob/v5.0.5/railties/lib/rails/application/configuration.rb
+     def self.activerecord_configuration
+       yaml = Pathname.new("config/database.yml")
+
+       config = if yaml && yaml.exist?
+         require "yaml"
+         require "erb"
+         YAML.load(ERB.new(yaml.read).result) || {}
+       elsif ENV['DATABASE_URL']
+         # Value from ENV['DATABASE_URL'] is set to default database connection
+         # by Active Record.
+         {}
+       end
+
+       config
+     rescue Psych::SyntaxError => e
+       raise "YAML syntax error occurred while parsing #{yaml.to_s}. " \
+             "Please note that YAML must be consistently indented using spaces. Tabs are not allowed. " \
+             "Error: #{e.message}"
+     rescue => e
+       raise e, "Cannot load `#{yaml}`:\n#{e.message}", e.backtrace
+     end
+
+   end
+ end
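For context (illustrative, not from the diff): called from an application root, the helper above returns the parsed config/database.yml, which in a conventional Rails layout is keyed by environment.

    config = Shiba::Configure.activerecord_configuration
    config && config["test"]   # e.g. {"adapter"=>"mysql2", "database"=>"blog_test", ...}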
@@ -0,0 +1,234 @@
+ require 'json'
+ require 'shiba/index'
+
+ module Shiba
+   class Explain
+     def initialize(sql, stats, options = {})
+       @sql = sql
+
+       @sql, _, @backtrace = @sql.partition(" /*shiba")
+       if options[:force_key]
+         @sql = @sql.sub(/(FROM\s*\S+)/i, '\1' + " FORCE INDEX(`#{options[:force_key]}`)")
+       end
+
+       @options = options
+       ex = Shiba.connection.query("EXPLAIN FORMAT=JSON #{@sql}").to_a
+       json = JSON.parse(ex.first['EXPLAIN'])
+       @rows = self.class.transform_json(json['query_block'])
+       @stats = stats
+       run_checks!
+     end
+
+     def as_json
+       @backtrace.chomp!("*/")
+
+       {
+         sql: @sql,
+         table: get_table,
+         key: first_key,
+         tags: messages,
+         cost: @cost,
+         used_key_parts: first['used_key_parts'],
+         possible_keys: first['possible_keys'],
+         backtrace: JSON.parse(@backtrace)
+       }
+     end
+
+     def get_table
+       @sql =~ /\s+from\s*([^\s,]+)/i
+       table = $1
+       return nil unless table
+
+       table = table.downcase
+       table.gsub!('`', '')
+       table.gsub!(/.*\.(.*)/, '\1')
+       table
+     end
+
+     def self.transform_table(table)
+       t = table
+       res = {}
+       res['table'] = t['table_name']
+       res['access_type'] = t['access_type']
+       res['key'] = t['key']
+       res['used_key_parts'] = t['used_key_parts'] if t['used_key_parts']
+       res['rows'] = t['rows_examined_per_scan']
+       res['filtered'] = t['filtered']
+
+       if t['possible_keys'] && t['possible_keys'] != [res['key']]
+         res['possible_keys'] = t['possible_keys']
+       end
+       res['using_index'] = t['using_index'] if t['using_index']
+       res
+     end
+
+     def self.transform_json(json, res = [])
+       rows = []
+
+       if json['ordering_operation']
+         return transform_json(json['ordering_operation'])
+       elsif json['duplicates_removal']
+         return transform_json(json['duplicates_removal'])
+       elsif !json['nested_loop'] && !json['table']
+         return [{'Extra' => json['message']}]
+       elsif json['nested_loop']
+         json['nested_loop'].map do |nested|
+           transform_json(nested, res)
+         end
+       elsif json['table']
+         res << transform_table(json['table'])
+       end
+       res
+     end
+
+     # [{"id"=>1, "select_type"=>"SIMPLE", "table"=>"interwiki", "partitions"=>nil, "type"=>"const", "possible_keys"=>"PRIMARY", "key"=>"PRIMARY", "key_len"=>"34", "ref"=>"const", "rows"=>1, "filtered"=>100.0, "Extra"=>nil}]
+     attr_reader :cost
+
+     def first
+       @rows.first
+     end
+
+     def first_table
+       first["table"]
+     end
+
+     def first_key
+       first["key"]
+     end
+
+     def first_extra
+       first["Extra"]
+     end
+
+     def messages
+       @messages ||= []
+     end
+
+     # shiba: {"possible_keys"=>nil, "key"=>nil, "key_len"=>nil, "ref"=>nil, "rows"=>6, "filtered"=>16.67, "Extra"=>"Using where"}
+     def to_log
+       plan = first.symbolize_keys
+       "possible: #{plan[:possible_keys]}, rows: #{plan[:rows]}, filtered: #{plan[:filtered]}, cost: #{self.cost}, access: #{plan[:access_type]}"
+     end
+
+     def to_h
+       first.merge(cost: cost, messages: messages)
+     end
+
+     IGNORE_PATTERNS = [
+       /No tables used/,
+       /Impossible WHERE/,
+       /Select tables optimized away/,
+       /No matching min\/max row/
+     ]
+
+     def table_size
+       Shiba::Index.count(first["table"], @stats)
+     end
+
+     def no_matching_row_in_const_table?
+       first_extra && first_extra =~ /no matching row in const table/
+     end
+
+     def ignore_explain?
+       first_extra && IGNORE_PATTERNS.any? { |p| first_extra =~ p }
+     end
+
+     def derived?
+       first['table'] =~ /<derived.*?>/
+     end
+
+     # TODO: need to parse SQL here I think
+     def simple_table_scan?
+       @rows.size == 1 && (@sql !~ /where/i || @sql =~ /where\s*1=1/i) && (@sql !~ /order by/i)
+     end
+
+     def limit
+       if @sql =~ /limit\s*(\d+)\s*(offset \d+)?$/i
+         $1.to_i
+       else
+         nil
+       end
+     end
+
+     def tag_query_type
+       access_type = first['access_type']
+
+       return unless access_type
+       access_type = 'tablescan' if access_type == 'ALL'
+       messages << "access_type_" + access_type
+     end
+
+     def estimate_row_count
+       if no_matching_row_in_const_table?
+         messages << "access_type_const"
+         first['key'] = 'PRIMARY'
+         return 0
+       end
+
+       return 0 if ignore_explain?
+
+       messages << "fuzzed_data" if Shiba::Index.fuzzed?(first_table, @stats)
+
+       if simple_table_scan?
+         if limit
+           messages << 'limited_tablescan'
+         else
+           messages << 'access_type_tablescan'
+         end
+
+         return limit || table_size
+       end
+
+       if derived?
+         # select count(*) from ( select 1 from foo where blah )
+         @rows.shift
+         return estimate_row_count
+       end
+
+       tag_query_type
+
+       # TODO: if possible_keys but mysql chooses NULL, this could be a test-data issue,
+       # pick the best key from the list of possibilities.
+       #
+       if first_key
+         Shiba::Index.estimate_key(first_table, first_key, first['used_key_parts'], @stats)
+       else
+         if first['possible_keys'].nil?
+           # if there are no possible keys we're table scanning; use PRIMARY to indicate that cost.
+           # note that this can be wildly inaccurate because of WHERE + LIMIT interactions.
+           Shiba::Index.count(first_table, @stats)
+         else
+           if @options[:force_key]
+             # we were asked to force a key, but mysql still refused to use it
+             # (no index used).
+             #
+             # there seem to be cases where mysql lists `possible_key` values
+             # that it then cannot use; seen in OR queries.
+             return Shiba::Index.count(first_table, @stats)
+           end
+
+           possibilities = [Shiba::Index.count(first_table, @stats)]
+           possibilities += first['possible_keys'].map do |key|
+             estimate_row_count_with_key(key)
+           end
+           possibilities.compact.min
+         end
+       end
+     end
+
+     def estimate_row_count_with_key(key)
+       Explain.new(@sql, @stats, force_key: key).estimate_row_count
+     rescue Mysql2::Error => e
+       if /Key .+? doesn't exist in table/ =~ e.message
+         return nil
+       end
+
+       raise e
+     end
+
+     def run_checks!
+       @cost = estimate_row_count
+     end
+   end
+ end
+
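A hedged sketch of driving Explain directly, assuming Shiba.connection is an open Mysql2 client on the target schema and stats comes from Shiba::Index.query; the query text is illustrative. (as_json additionally expects the trailing " /*shiba ... */" comment that the query layer appends, so only cost, messages and to_h are shown here.)

    stats   = Shiba::Index.query(Shiba.connection)
    explain = Shiba::Explain.new("select * from users where email = 'a@example.com'", stats)

    explain.cost       # estimated rows examined for the chosen plan
    explain.messages   # tags such as "access_type_ref" or "access_type_tablescan"
    explain.to_h       # first EXPLAIN row merged with cost and messages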
@@ -0,0 +1,159 @@
+ module Shiba
+   module Index
+
+     # Given the path to the information_schema.statistics output, returns index statistics keyed by table name.
+     # Examples:
+     # Exploring the schema:
+     #
+     # schema_stats = Index.parse("./shiba/schema_stats.tsv")
+     # schema_stats.keys
+     # => ["users", "posts", "comments"]
+     # schema_stats["users"]
+     # => [{"table_schema"=>"blog_test", "table_name"=>"users", "non_unique"=>"0", "column_name"=>"id", "cardinality"=>2, "is_visible"=>"YES", "expression"=>"NULL"}]
+     #
+     def self.parse(path)
+       tables = {}
+       records = read(path)
+       headers = records.shift.map { |header| header.downcase }
+       records.each do |r|
+         h = Hash[headers.zip(r)]
+         h["cardinality"] = h["cardinality"].to_i
+         table = tables[h['table_name']] ||= []
+         table.push(h)
+       end
+       tables
+     end
+
+     # Getting a row count for a table:
+     #
+     # schema_stats = Index.parse("./shiba/schema_stats.tsv")
+     # users_count = Index.count("users", schema_stats)
+     # => 2
+     def self.count(table, schema)
+       return nil unless schema[table]
+       primary = schema[table].detect { |index| index['index_name'] == "PRIMARY" }
+       if primary.nil?
+         # find the highest cardinality of a unique index, if it exists
+         schema[table].map do |index|
+           if index['non_unique'].to_i == 0
+             index['cardinality']
+           else
+             nil
+           end
+         end.compact.max
+       else
+         primary['cardinality'].to_i
+       end
+     end
+
+     def self.fuzzed?(table, schema)
+       return nil unless schema[table]
+       schema[table].first['fuzzed']
+     end
+
+     def self.estimate_key(table, key, parts, schema)
+       table_count = count(table, schema)
+       return nil unless table_count
+
+       key_stat = schema[table].detect do |i|
+         i["index_name"] == key && i["column_name"] == parts.last
+       end
+
+       return nil unless key_stat
+
+       return 0 if key_stat['cardinality'] == 0
+       table_count / key_stat['cardinality']
+     end
+
+     def self.query(connection)
+       records = connection.query("select * from information_schema.statistics where table_schema = DATABASE()")
+       tables = {}
+       records.each do |h|
+         h.keys.each { |k| h[k.downcase] = h.delete(k) }
+         h["cardinality"] = h["cardinality"].to_i
+         table = tables[h['table_name']] ||= []
+         table.push(h)
+       end
+       tables
+     end
+
+
+     # Up the cardinality on our indexes.
+     # Non-unique indexes get a little less cardinality.
+     def self.fuzz!(stats)
+       db = stats.values.first.first['table_schema']
+       table_sizes = self.guess_table_sizes(db)
+
+
+
+       stats.each do |table,indexes|
+         indexes.each do |idx|
+           idx['cardinality'] = table_sizes[table]
+
+           if idx['non_unique'] == 1
+             idx['cardinality'] = (idx['cardinality'] * 0.7).round
+           end
+
+           idx['fuzzed'] = true
+         end
+       end
+     end
+
+     MINIMUM_TABLE_SIZE = 500
+
+     # True if the approximate median table size is below MINIMUM_TABLE_SIZE.
+     def self.insufficient_stats?(stats)
+       if stats.length == 0
+         return true
+       end
+
+       # Calculate a rough median.
+       primary_keys = stats.map do |_,indexes|
+         indexes.detect { |idx| idx['index_name'] == 'PRIMARY' } || {}
+       end
+
+       table_counts = primary_keys.map { |pk| pk['cardinality'].to_i }
+       median = table_counts.sort[table_counts.size/2]
+
+       return median < MINIMUM_TABLE_SIZE
+     end
+
+     STANDARD_FUZZ_SIZE = 5_000
+
+     # Create fake table sizes based on the table's index count.
+     # The more indexes, the bigger the table. Seems to rank tables fairly well.
+     def self.guess_table_sizes(db)
+       db = Shiba.connection.escape(db)
+       index_count_query = "select TABLE_NAME as table_name, count(*) as index_count
+         from information_schema.statistics where table_schema = '#{db}'
+         and seq_in_index = 1 and index_name not like 'fk_rails%'
+         group by table_name order by index_count"
+
+       index_counts = Shiba.connection.query(index_count_query).to_a
+
+       # 80th table percentile based on number of indexes
+       large_table_idx = [(index_counts.size * 0.8).round, index_counts.size - 1].min
+       large_table = index_counts[large_table_idx]
+
+       sizes = Hash[index_counts.map(&:values)]
+
+       sizes.each do |table_name, index_count|
+         if index_count == 0
+           index_count = 1
+         end
+
+         sizes[table_name] = STANDARD_FUZZ_SIZE * (index_count / large_table['index_count'].to_f)
+       end
+
+       sizes
+     end
+
+     protected
+
+     def self.read(path)
+       # strip the trailing newline so the last column isn't read as "NULL\n"
+       IO.foreach(path).map { |l| l.chomp.split("\t") }
+     end
+
+   end
+ end
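Illustrative usage of the stats helpers above, assuming "shiba/schema_stats.tsv" is a tab-separated dump of information_schema.statistics with a header row; the table and index names below are hypothetical.

    stats = Shiba::Index.parse("shiba/schema_stats.tsv")

    Shiba::Index.count("users", stats)        # primary key cardinality, e.g. 2
    Shiba::Index.estimate_key("users", "index_users_on_email", ["email"], stats)
    # => rough rows per lookup: table count divided by the index column's cardinality

    # pad out cardinalities when the sampled schema is too small to be meaningful
    Shiba::Index.fuzz!(stats) if Shiba::Index.insufficient_stats?(stats)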