shiba 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,100 @@
1
+ require 'shiba'
2
+ require 'shiba/query'
3
+ require 'json'
4
+ require 'logger'
5
+
6
module Shiba
  class Analyzer

    # Convenience wrapper: build an Analyzer and run it in one call.
    def self.analyze(file, output, stats, options)
      new(file, output, stats, options).analyze
    end

    # file:    IO-like object read line-by-line via #gets (e.g. a query log)
    # output:  IO-like object that receives one JSON document per explained query
    # stats:   index statistics (see Shiba::Index) used to cost queries
    # options: hash with optional 'limit', 'index' and 'debug' string keys
    def initialize(file, output, stats, options)
      @file = file
      @output = output
      @stats = stats
      @options = options
      # fingerprint => true; used to explain each distinct query shape only once
      @fingerprints = {}
    end

    # Scans @file for SELECT statements, EXPLAINs each distinct query once,
    # writes each explain result to @output as JSON, and returns the array
    # of explain hashes.
    def analyze
      idx = 0
      queries = []
      while line = @file.gets
        # strip out ANSI color escape sequences
        begin
          line.gsub!(/\e\[?.*?[\@-~]/, '')
        rescue ArgumentError => e
          # line contains invalid byte sequences; ignore it
          next
        end

        if line =~ /(select.*from.*)/i
          sql = $1
        else
          next
        end

        if @options['limit']
          # BUG FIX: previously a bare `return`, which returned nil and
          # discarded every query collected so far once the limit was hit.
          return queries if idx == @options['limit']
        end

        # when 'index' is given, only analyze the query at that position
        if @options['index']
          next unless idx == @options['index']
        end

        sql.chomp!
        query = Shiba::Query.new(sql, @stats)

        if !@fingerprints[query.fingerprint]
          if sql.downcase.start_with?("select")
            if @options['debug']
              require 'byebug'
              debugger
            end

            explain = analyze_query(query)
            if explain
              idx += 1
              queries << explain
            end
          end
        end

        @fingerprints[query.fingerprint] = true
      end
      queries
    end

    protected

    # Report an unexpected explain failure on stderr without aborting the run.
    def dump_error(e, query)
      $stderr.puts "got exception trying to explain: #{e.message}"
      $stderr.puts "query: #{query.sql} (index #{query.index})"
      $stderr.puts e.backtrace.join("\n")
    end

    # EXPLAINs a single query. Returns the explain hash, or nil when the
    # query could not be explained (bad SQL scraped from the log, etc).
    def analyze_query(query)
      explain = nil
      begin
        explain = query.explain
      rescue Mysql2::Error => e
        # we're picking up crap on the command-line that's not good SQL. ignore it.
        if !(e.message =~ /You have an error in your SQL syntax/)
          dump_error(e, query)
        end
      rescue StandardError => e
        dump_error(e, query)
      end
      return nil unless explain

      json = JSON.dump(explain.as_json)
      write(json)
      explain.as_json
    end

    def write(line)
      @output.puts(line)
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'pathname'
2
+
3
module Shiba
  module Configure

    # avoiding Rails dependency on the cli tools for now.
    # yanked from https://github.com/rails/rails/blob/v5.0.5/railties/lib/rails/application/configuration.rb
    #
    # Returns the parsed config/database.yml hash (relative to the current
    # working directory), {} when only ENV['DATABASE_URL'] is present, or
    # nil when neither exists.
    def self.activerecord_configuration
      yaml = Pathname.new("config/database.yml")

      config = if yaml && yaml.exist?
        require "yaml"
        require "erb"
        # ERB first, then YAML — database.yml commonly embeds <%= %> tags.
        YAML.load(ERB.new(yaml.read).result) || {}
      elsif ENV['DATABASE_URL']
        # Value from ENV['DATABASE_URL'] is set to default database connection
        # by Active Record.
        {}
      end

      config
    rescue Psych::SyntaxError => e
      raise "YAML syntax error occurred while parsing #{yaml.to_s}. " \
            "Please note that YAML must be consistently indented using spaces. Tabs are not allowed. " \
            "Error: #{e.message}"
    rescue => e
      # BUG FIX: was `#{path}`, an undefined local variable — any error here
      # raised NameError instead of the intended diagnostic message.
      raise e, "Cannot load `#{yaml}`:\n#{e.message}", e.backtrace
    end

  end
end
@@ -0,0 +1,234 @@
1
+ require 'json'
2
+ require 'shiba/index'
3
+
4
module Shiba
  # Wraps MySQL's `EXPLAIN FORMAT=JSON` output for a single SELECT and
  # estimates its row cost from pre-collected index statistics, so queries
  # can be costed against production-like cardinalities.
  class Explain
    # sql:     the query; may end with a " /*shiba ... */" marker comment whose
    #          body is a JSON backtrace (split off below)
    # stats:   index statistics in the Shiba::Index format
    # options: :force_key — rewrite the query with FORCE INDEX(<key>) before
    #          explaining (used by estimate_row_count_with_key)
    def initialize(sql, stats, options = {})
      @sql = sql

      # Split off the shiba marker comment; everything after " /*shiba" is
      # treated as the backtrace payload (still ending in "*/").
      @sql, _, @backtrace = @sql.partition(" /*shiba")
      if options[:force_key]
        @sql = @sql.sub(/(FROM\s*\S+)/i, '\1' + " FORCE INDEX(`#{options[:force_key]}`)")
      end

      @options = options
      ex = Shiba.connection.query("EXPLAIN FORMAT=JSON #{@sql}").to_a
      json = JSON.parse(ex.first['EXPLAIN'])
      @rows = self.class.transform_json(json['query_block'])
      @stats = stats
      run_checks!
    end

    # Serializable summary of the explain result.
    # NOTE(review): mutates @backtrace (chomp!) and assumes it is valid JSON
    # once the trailing "*/" is removed — verify against the producer of the
    # /*shiba ... */ comment.
    def as_json
      @backtrace.chomp!("*/")

      {
        sql: @sql,
        table: get_table,
        key: first_key,
        tags: messages,
        cost: @cost,
        used_key_parts: first['used_key_parts'],
        possible_keys: first['possible_keys'],
        backtrace: JSON.parse(@backtrace)
      }
    end

    # Best-effort regex extraction of the first table named in the FROM
    # clause, lowercased, with backticks and any schema qualifier stripped.
    def get_table
      @sql =~ /\s+from\s*([^\s,]+)/i
      table = $1
      return nil unless table

      table = table.downcase
      table.gsub!('`', '')
      # drop a "schema." prefix, keeping only the table name
      table.gsub!(/.*\.(.*)/, '\1')
      table
    end

    # Flattens one `table` node of EXPLAIN FORMAT=JSON into the compact
    # row-hash shape the rest of this class consumes.
    def self.transform_table(table)
      t = table
      res = {}
      res['table'] = t['table_name']
      res['access_type'] = t['access_type']
      res['key'] = t['key']
      res['used_key_parts'] = t['used_key_parts'] if t['used_key_parts']
      res['rows'] = t['rows_examined_per_scan']
      res['filtered'] = t['filtered']

      # only keep possible_keys when it adds information beyond the chosen key
      if t['possible_keys'] && t['possible_keys'] != [res['key']]
        res['possible_keys'] = t['possible_keys']
      end
      res['using_index'] = t['using_index'] if t['using_index']
      res
    end

    # Recursively walks the EXPLAIN JSON query_block, unwrapping
    # ordering/duplicate-removal wrappers and flattening nested_loop joins
    # into an array of per-table row hashes (in join order).
    # NOTE(review): the ordering_operation/duplicates_removal branches restart
    # with a fresh `res` rather than passing the accumulator through — appears
    # fine for top-level wrappers, but verify for wrappers nested inside joins.
    def self.transform_json(json, res = [])
      rows = []

      if json['ordering_operation']
        return transform_json(json['ordering_operation'])
      elsif json['duplicates_removal']
        return transform_json(json['duplicates_removal'])
      elsif !json['nested_loop'] && !json['table']
        # no table access at all; MySQL explains why in 'message'
        return [{'Extra' => json['message']}]
      elsif json['nested_loop']
        json['nested_loop'].map do |nested|
          transform_json(nested, res)
        end
      elsif json['table']
        res << transform_table(json['table'])
      end
      res
    end

    # Example row shape:
    # [{"id"=>1, "select_type"=>"SIMPLE", "table"=>"interwiki", "partitions"=>nil, "type"=>"const", "possible_keys"=>"PRIMARY", "key"=>"PRIMARY", "key_len"=>"34", "ref"=>"const", "rows"=>1, "filtered"=>100.0, "Extra"=>nil}]

    # Estimated row cost, computed once by run_checks!.
    attr_reader :cost

    # The first (driving) table of the plan.
    def first
      @rows.first
    end

    def first_table
      first["table"]
    end

    def first_key
      first["key"]
    end

    def first_extra
      first["Extra"]
    end

    # Accumulated tag strings describing the plan (access type, tablescan, …).
    def messages
      @messages ||= []
    end

    # One-line log summary, e.g.:
    # shiba: {"possible_keys"=>nil, "key"=>nil, "key_len"=>nil, "ref"=>nil, "rows"=>6, "filtered"=>16.67, "Extra"=>"Using where"}
    # NOTE(review): symbolize_keys is an ActiveSupport extension, not core
    # Ruby Hash — this method assumes ActiveSupport is loaded.
    def to_log
      plan = first.symbolize_keys
      "possible: #{plan[:possible_keys]}, rows: #{plan[:rows]}, filtered: #{plan[:filtered]}, cost: #{self.cost}, access: #{plan[:access_type]}"
    end

    def to_h
      first.merge(cost: cost, messages: messages)
    end

    # 'Extra' messages that mean the query is trivially cheap and can be
    # costed at zero.
    IGNORE_PATTERNS = [
      /No tables used/,
      /Impossible WHERE/,
      /Select tables optimized away/,
      /No matching min\/max row/
    ]

    def table_size
      Shiba::Index.count(first["table"], @stats)
    end

    def no_matching_row_in_const_table?
      first_extra && first_extra =~ /no matching row in const table/
    end

    def ignore_explain?
      first_extra && IGNORE_PATTERNS.any? { |p| first_extra =~ p }
    end

    # True when the driving "table" is a derived subquery (<derivedN>).
    def derived?
      first['table'] =~ /<derived.*?>/
    end

    # TODO: need to parse SQL here I think
    # Heuristic: single-table query with no WHERE (or WHERE 1=1) and no
    # ORDER BY — i.e. a plain scan whose cost is bounded by LIMIT/table size.
    def simple_table_scan?
      @rows.size == 1 && (@sql !~ /where/i || @sql =~ /where\s*1=1/i) && (@sql !~ /order by/i)
    end

    # The trailing LIMIT value, or nil when the query has none.
    def limit
      if @sql =~ /limit\s*(\d+)\s*(offset \d+)?$/i
        $1.to_i
      else
        nil
      end
    end

    # Tags the plan with its access type ('ALL' is renamed 'tablescan').
    def tag_query_type
      access_type = first['access_type']

      return unless access_type
      access_type = 'tablescan' if access_type == 'ALL'
      messages << "access_type_" + access_type
    end

    # Estimates how many rows this plan examines, tagging `messages` along
    # the way. Uses the stats tables (Shiba::Index) rather than MySQL's own
    # row estimates so dev-sized databases still produce useful costs.
    def estimate_row_count
      if no_matching_row_in_const_table?
        messages << "access_type_const"
        first['key'] = 'PRIMARY'
        return 0
      end

      return 0 if ignore_explain?

      messages << "fuzzed_data" if Shiba::Index.fuzzed?(first_table, @stats)

      if simple_table_scan?
        if limit
          messages << 'limited_tablescan'
        else
          messages << 'access_type_tablescan'
        end

        return limit || table_size
      end

      if derived?
        # select count(*) from ( select 1 from foo where blah )
        # cost the inner query instead of the derived wrapper
        @rows.shift
        return estimate_row_count
      end

      tag_query_type

      # TODO: if possible_keys but mysql chooses NULL, this could be a test-data issue,
      # pick the best key from the list of possibilities.
      #
      if first_key
        Shiba::Index.estimate_key(first_table, first_key, first['used_key_parts'], @stats)
      else
        if first['possible_keys'].nil?
          # if no possibile we're table scanning, use PRIMARY to indicate that cost.
          # note that this can be wildly inaccurate bcs of WHERE + LIMIT stuff.
          Shiba::Index.count(first_table, @stats)
        else
          if @options[:force_key]
            # we were asked to force a key, but mysql still refused to use it
            # (no index used).
            #
            # there seems to be cases where mysql lists `possible_key` values
            # that it then cannot use, seen this in OR queries.
            return Shiba::Index.count(first_table, @stats)
          end

          # try each possible key and take the cheapest estimate, with a
          # full-table count as the upper bound
          possibilities = [Shiba::Index.count(first_table, @stats)]
          possibilities += first['possible_keys'].map do |key|
            estimate_row_count_with_key(key)
          end
          possibilities.compact.min
        end
      end
    end

    # Re-explains the query with FORCE INDEX(key); nil when the key does not
    # actually exist on the table.
    def estimate_row_count_with_key(key)
      Explain.new(@sql, @stats, force_key: key).estimate_row_count
    rescue Mysql2::Error => e
      if /Key .+? doesn't exist in table/ =~ e.message
        return nil
      end

      raise e
    end

    def run_checks!
      @cost = estimate_row_count
    end
  end
end
234
+
@@ -0,0 +1,159 @@
1
module Shiba
  module Index

    # Given the path to the information_schema.statistics output, returns index statistics keyed by table name.
    # Examples:
    # Exploring the schema:
    #
    # schema_stats = Index.parse("./shiba/schema_stats.tsv")
    # schema_stats.keys
    # => :users, :posts, :comments
    # schema_stats[:users]
    # => {:table_schema=>"blog_test", :table_name=>"users", :non_unique=>"0", :column_name=>"id", :cardinality=>"2", :is_visible=>"YES", :"expression\n"=>"NULL\n"}
    #
    def self.parse(path)
      tables = {}
      records = read(path)
      headers = records.shift.map { |header| header.downcase }
      records.each do |r|
        h = Hash[headers.zip(r)]
        h["cardinality"] = h["cardinality"].to_i
        table = tables[h['table_name']] ||= []
        table.push(h)
      end
      tables
    end

    # Getting a row count for a table:
    #
    # schema_stats = Index.parse("./shiba/schema_stats.tsv")
    # users_count = Index.count(:users, schema_stats)
    # => 2
    # Returns nil when the table is unknown.
    def self.count(table, schema)
      return nil unless schema[table]
      primary = schema[table].detect { |index| index['index_name'] == "PRIMARY" }
      if primary.nil?
        # find the highest cardinality of a unique index, if it exists
        schema[table].map do |index|
          if index['non_unique'].to_i == 0
            index['cardinality']
          else
            nil
          end
        end.compact.max
      else
        primary['cardinality'].to_i
      end
    end

    # True-ish when the stats for this table were generated by fuzz!
    # rather than read from a real database.
    def self.fuzzed?(table, schema)
      return nil unless schema[table]
      schema[table].first['fuzzed']
    end

    # Rough rows-per-key estimate for a lookup on `key` using `parts`
    # (the used_key_parts from EXPLAIN): table count divided by the
    # cardinality of the deepest used column. nil when unknown.
    def self.estimate_key(table, key, parts, schema)
      table_count = count(table, schema)
      return nil unless table_count

      key_stat = schema[table].detect do |i|
        i["index_name"] == key && i["column_name"] == parts.last
      end

      return nil unless key_stat

      return 0 if key_stat['cardinality'] == 0
      table_count / key_stat['cardinality']
    end

    # Builds the same table_name => [index rows] structure as parse, but
    # straight from a live connection's information_schema.
    def self.query(connection)
      records = connection.query("select * from information_schema.statistics where table_schema = DATABASE()")
      tables = {}
      records.each do |h|
        h.keys.each { |k| h[k.downcase] = h.delete(k) }
        h["cardinality"] = h["cardinality"].to_i
        table = tables[h['table_name']] ||= []
        table.push(h)
      end
      tables
    end


    # Up the cardinality on our indexes.
    # Non uniques have a little less cardinality.
    def self.fuzz!(stats)
      db = stats.values.first.first['table_schema']
      table_sizes = self.guess_table_sizes(db)

      stats.each do |table, indexes|
        indexes.each do |idx|
          idx['cardinality'] = table_sizes[table]

          # BUG FIX: non_unique is a String ("1") when stats came from
          # parse(), an Integer when from query(); `== 1` only matched the
          # Integer form. to_i handles both.
          if idx['non_unique'].to_i == 1
            idx['cardinality'] = (idx['cardinality'] * 0.7).round
          end

          idx['fuzzed'] = true
        end
      end
    end

    MINIMUM_TABLE_SIZE = 500

    # Approximate median size of the tables is less than 500.
    def self.insufficient_stats?(stats)
      if stats.length == 0
        return true
      end

      # Calculate a rough median.
      primary_keys = stats.map do |_, indexes|
        indexes.detect { |idx| idx['index_name'] == 'PRIMARY' } || {}
      end

      # BUG FIX: the counts must be sorted before picking the middle element,
      # otherwise this is an arbitrary table's size, not a median.
      table_counts = primary_keys.map { |pk| pk['cardinality'].to_i }.sort
      median = table_counts[table_counts.size / 2]

      return median < MINIMUM_TABLE_SIZE
    end

    STANDARD_FUZZ_SIZE = 5_000

    # Create fake table sizes based on the table's index count.
    # The more indexes, the bigger the table. Seems to rank tables fairly well.
    def self.guess_table_sizes(db)
      db = Shiba.connection.escape(db)
      index_count_query = "select TABLE_NAME as table_name, count(*) as index_count
      from information_schema.statistics where table_schema = '#{db}'
      and seq_in_index = 1 and index_name not like 'fk_rails%'
      group by table_name order by index_count"

      index_counts = Shiba.connection.query(index_count_query).to_a

      # 80th table percentile based on number of indexes
      large_table_idx = (index_counts.size * 0.8).round
      large_table = index_counts[large_table_idx]

      sizes = Hash[index_counts.map(&:values)]

      sizes.each do |table_name, index_count|
        if index_count == 0
          index_count = 1
        end

        sizes[table_name] = STANDARD_FUZZ_SIZE * (index_count / large_table['index_count'].to_f)
      end

      sizes
    end

    protected

    # Reads the TSV into an array of row arrays.
    # BUG FIX: the old `l.gsub!("\n", "")` returns nil when a line has no
    # trailing newline (the last line of many files), crashing on .split.
    # chomp is nil-safe and equivalent for trailing-newline removal.
    def self.read(path)
      IO.foreach(path).map { |l| l.chomp.split("\t") }
    end

  end
end