active_record-sql_analyzer 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +15 -0
  5. data/CONTRIBUTING.md +9 -0
  6. data/Gemfile +12 -0
  7. data/LICENSE.md +202 -0
  8. data/README.md +119 -0
  9. data/Rakefile +6 -0
  10. data/active_record-sql_analyzer.gemspec +21 -0
  11. data/bin/ar-log-analyzer +10 -0
  12. data/lib/active_record/sql_analyzer.rb +40 -0
  13. data/lib/active_record/sql_analyzer/analyzer.rb +45 -0
  14. data/lib/active_record/sql_analyzer/background_processor.rb +54 -0
  15. data/lib/active_record/sql_analyzer/backtrace_filter.rb +38 -0
  16. data/lib/active_record/sql_analyzer/cli.rb +74 -0
  17. data/lib/active_record/sql_analyzer/cli_processor.rb +122 -0
  18. data/lib/active_record/sql_analyzer/compact_logger.rb +31 -0
  19. data/lib/active_record/sql_analyzer/configuration.rb +174 -0
  20. data/lib/active_record/sql_analyzer/logger.rb +29 -0
  21. data/lib/active_record/sql_analyzer/monkeypatches/query.rb +35 -0
  22. data/lib/active_record/sql_analyzer/monkeypatches/tagger.rb +24 -0
  23. data/lib/active_record/sql_analyzer/redacted_logger.rb +22 -0
  24. data/lib/active_record/sql_analyzer/redactor.rb +5 -0
  25. data/lib/active_record/sql_analyzer/version.rb +5 -0
  26. data/spec/active_record/sql_analyzer/analyzer_spec.rb +33 -0
  27. data/spec/active_record/sql_analyzer/background_processor_spec.rb +44 -0
  28. data/spec/active_record/sql_analyzer/backtrace_filter_spec.rb +28 -0
  29. data/spec/active_record/sql_analyzer/cli_processor_spec.rb +120 -0
  30. data/spec/active_record/sql_analyzer/cli_spec.rb +66 -0
  31. data/spec/active_record/sql_analyzer/end_to_end_spec.rb +121 -0
  32. data/spec/active_record/sql_analyzer/redacted_logger_spec.rb +67 -0
  33. data/spec/spec_helper.rb +34 -0
  34. data/spec/support/db_connection.rb +65 -0
  35. data/spec/support/stub_logger.rb +9 -0
  36. data/spec/support/stub_rails.rb +9 -0
  37. data/spec/support/wait_for_pop.rb +13 -0
  38. metadata +129 -0
@@ -0,0 +1,45 @@
1
module ActiveRecord
  module SqlAnalyzer
    # Holds the settings for one analyzer: the tables it watches, the logger
    # class it records with and the name it is tagged with. Settings are kept
    # in the +options+ hash and can be read with +analyzer[:key]+.
    class Analyzer
      attr_reader :options

      def initialize
        @options = {}
      end

      # Reads a single option (e.g. +:table_regex+, +:logger+, +:name+).
      def [](key)
        @options[key]
      end

      # Tables to watch for this analyzer.
      #
      # Builds a case-insensitive regex that matches SELECT/DELETE ... FROM/JOIN,
      # INSERT INTO and UPDATE statements targeting one of the given tables.
      # The names are interpolated into the pattern verbatim (they are not
      # escaped), so regex metacharacters in a name are treated as regex syntax.
      #
      # Raises ArgumentError unless +names+ is an Array.
      def tables(names)
        unless names.is_a?(Array)
          raise ArgumentError, "Names of tables must be an array"
        end

        @options[:table_regex] = /\A\s*((SELECT|DELETE).*(FROM|JOIN)|(INSERT\s+INTO|UPDATE))\s+`?(#{names.join('|')})`?/i
      end

      # Logger class to use for recording data.
      def logger(klass)
        @options[:logger] = klass
      end

      # How to tag the data.
      #
      # The name must be a String or Symbol made up solely of [a-z0-9A-Z_]
      # characters. Both conditions are required: a value of the wrong type OR
      # one containing other characters raises ArgumentError.
      def name(name)
        textual = name.is_a?(String) || name.is_a?(Symbol)

        unless textual && name =~ /\A([a-z0-9A-Z_]+)\z/
          raise ArgumentError, "Name for this analyzer can only contain [a-z0-9A-Z_] characters"
        end

        @options[:name] = name
      end

      # Instantiates the logger once and memoizes it under +:logger_instance+.
      # Falls back to RedactedLogger when no logger class was configured; the
      # logger is constructed with the configured root path and this
      # analyzer's name.
      def setup
        @options[:logger_instance] ||= (@options[:logger] || RedactedLogger).new(
          SqlAnalyzer.config[:logger_root_path],
          @options[:name]
        )
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require "set"
2
+
3
module ActiveRecord
  module SqlAnalyzer
    # Accepts events through +<<+ and hands them to a worker thread that
    # filters the backtrace, runs the complex SQL redactor and passes the
    # result to the event's logger.
    class BackgroundProcessor
      def initialize
        @queue = Queue.new
      end

      # Makes sure a worker thread is running, then enqueues the event.
      def <<(event)
        processor_thread
        @queue << event
      end

      private

      MUTEX = Mutex.new

      # Pops one event (blocking until one is available), rewrites its
      # :caller and :sql entries through the configured procs, removes the
      # :logger entry and gives the remaining event to that logger.
      def process_queue
        event = @queue.pop

        backtrace_filter = SqlAnalyzer.config[:backtrace_filter_proc]
        event[:caller] = backtrace_filter.call(event[:caller])

        sql_redactor = SqlAnalyzer.config[:sql_redactor_complex_proc]
        event[:sql] = sql_redactor.call(event[:sql].dup)

        target = event.delete(:logger)
        target.filter_event(event)
        target.log(event)
      end

      # Starts the worker thread unless one is already alive.
      def processor_thread
        # Avoid grabbing a mutex unless we really need to
        return if worker_alive?

        MUTEX.synchronize do
          # Double check to avoid a race condition
          @thread = start_worker unless worker_alive?
        end
      end

      # Truthy when a worker thread exists and has not terminated.
      def worker_alive?
        @thread && @thread.alive?
      end

      # Spawns the thread that drains the queue. Any StandardError raised
      # while processing is logged as a warning and ends the thread.
      def start_worker
        Thread.new do
          Rails.logger.info "[SQL-Analyzer] Starting background query thread id #{Thread.current.object_id} in pid #{Process.pid}"

          begin
            loop { process_queue }
          rescue => error
            Rails.logger.warn "[SQL-Analyzer] Exception in thread #{Thread.current.object_id}: #{error.class}, #{error.message}"
            Rails.logger.warn "[SQL-Analyzer] #{error.backtrace.join(", ")}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ require "pathname"
2
+
3
+ # This is a bit complex but can't be avoided since otherwise we have to log 5000000 backtrace lines
4
module ActiveRecord
  module SqlAnalyzer
    # Builds the default proc used to trim backtraces: lines that mention a
    # library path are dropped and lines under the Rails root are rewritten
    # relative to it.
    class BacktraceFilter
      # Substrings that mark a backtrace line as library code: every
      # Gem.path entry, the resolved (symlink-free) form of those entries,
      # "(eval):" and Ruby's libdir. Memoized.
      def self.library_paths
        @library_paths ||= begin
          gem_paths = Gem.path

          # File.realpath raises Errno::ENOENT when the path does not exist,
          # and Gem.path may list directories that are not present on disk,
          # so only resolve the ones that are.
          resolved = gem_paths.select { |dir| File.exist?(dir) }.map { |dir| File.realpath(dir) }

          paths = (gem_paths + resolved).uniq
          paths << "(eval):"
          paths << RbConfig::CONFIG.fetch('libdir')
          paths
        end
      end

      # Regex matching strings that begin with the Rails root. Memoized.
      def self.rails_root_regex
        @rails_root ||= %r{\A#{Regexp.escape(Rails.root.to_s)}}
      end

      # Returns the (memoized) filtering proc. It takes an array of backtrace
      # lines and returns a new array without library lines, with lines under
      # the Rails root made relative to it and all other lines untouched.
      def self.proc
        @proc ||= Proc.new do |lines|
          lines.each_with_object([]) do |line, filtered|
            next if library_paths.any? { |path| line.include?(path) }

            filtered <<
              if line =~ rails_root_regex
                Pathname.new(line).relative_path_from(Rails.root).to_s
              else
                line
              end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
module ActiveRecord
  module SqlAnalyzer
    # Command line front end: parses the options, locates the definition and
    # usage logs in the log directory and drives a CLIProcessor over them.
    class CLI
      attr_reader :options

      def initialize
        @options = {
          concurrency: 6
        }
      end

      # Lazily builds the processor using the configured concurrency.
      def processor
        @processor ||= ActiveRecord::SqlAnalyzer::CLIProcessor.new(options[:concurrency])
      end

      # Processes every "<prefix>_definitions.log*" file found in the log
      # directory, then the matching "<prefix>.log*" usage files, and finally
      # asks the processor to dump the result into the destination directory.
      #
      # Raises ArgumentError when either directory option is missing or when
      # no definition logs are found.
      def run
        log_dir = options[:log_dir]
        dest_dir = options[:dest_dir]

        # Without these checks a nil directory would silently turn the globs
        # and the dump path into paths rooted at "/".
        raise ArgumentError, "A log directory must be given with --log-dir" if log_dir.nil?
        raise ArgumentError, "A destination directory must be given with --dest-dir" if dest_dir.nil?

        definition_logs = Dir["#{log_dir}/*_definitions.log*"].map do |path|
          [File.basename(path).sub(/_definitions\.log.*/, ""), path]
        end

        if definition_logs.empty?
          raise ArgumentError, "Cannot find any log files in '#{log_dir}'"
        end

        # Process the definition logs
        processor.run_definition(definition_logs)

        # Process the usage logs
        prefixes = definition_logs.map(&:first).uniq
        usage_logs = Dir["#{log_dir}/{#{prefixes.join(",")}}.log*"].map do |path|
          [File.basename(path).split(".", 2).first, path]
        end

        processor.run_usage(usage_logs)

        processor.dump(dest_dir)
      end

      # Parses +args+ into +options+ and returns the non-option arguments.
      #
      # Each value-taking switch declares a mandatory argument so that the
      # handler always receives a converted value (never nil).
      # Raises ArgumentError for a directory that does not exist or a
      # concurrency that is not positive.
      def parse_options(args)
        opts = OptionParser.new

        opts.on("--log-dir DIR", String, "Directory that logs are in") do |val|
          unless Dir.exist?(val)
            raise ArgumentError, "log directory '#{val}' does not exist"
          end

          options[:log_dir] = val
        end

        opts.on("--dest-dir DIR", String, "Directory to dump logs to") do |val|
          unless Dir.exist?(val)
            raise ArgumentError, "dest directory '#{val}' does not exist"
          end

          options[:dest_dir] = val
        end

        opts.on("-c", "--concurrency N", Integer, "How many threads to use for processing log files") do |val|
          if val <= 0
            raise ArgumentError, "Concurrency must be >0"
          end

          options[:concurrency] = val
        end

        opts.on_tail("-h", "--help") do
          puts opts
          exit
        end

        opts.parse(args)
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
module ActiveRecord
  module SqlAnalyzer
    # Aggregates definition and usage log files into one hash per log prefix
    # (+definitions+), reading the files on +concurrency+ threads.
    class CLIProcessor
      attr_reader :concurrency, :definitions

      def initialize(concurrency)
        @concurrency = concurrency
        @definitions = {}
      end

      # Drains +queue+ of [prefix, path] pairs. For every line of every file
      # the given block is called with the hash collected so far for that
      # prefix and the stripped line. Returns { prefix => hash }.
      #
      # Errors are printed and re-raised.
      def self.process_queue(queue)
        local_data = {}

        while (entry = next_entry(queue))
          prefix, path = entry
          bucket = (local_data[prefix] ||= {})

          File.foreach(path) do |line|
            yield bucket, line.strip
          end
        end

        local_data

      rescue => ex
        puts "#{ex.class}: #{ex.message}"
        puts ex.backtrace
        raise
      end

      # Takes the next entry without blocking; nil once the queue is empty.
      # Checking +empty?+ and then doing a blocking +pop+ lets another worker
      # grab the last entry in between, leaving this one waiting forever.
      def self.next_entry(queue)
        queue.pop(true)
      rescue ThreadError
        nil
      end
      private_class_method :next_entry

      # Reads definition logs (lines of "sha|json") and stores the parsed JSON
      # under definitions[prefix][sha].
      def run_definition(logs)
        # Create a local copy of each definitions then merge them in
        threads = process_in_threads(logs) do |local_definitions, line|
          next if line.empty?

          sha, event = line.split("|", 2)
          local_definitions[sha] = JSON.parse(event)
        end

        # Merge everything
        threads.each do |thread|
          thread.value.each do |prefix, data|
            definitions[prefix] ||= {}
            definitions[prefix].merge!(data)
          end
        end
      end

      # Reads usage logs (lines of "epoch_seconds|sha") and adds a "count" and
      # the most recent "last_called" (UTC Time) to each known definition.
      # SHAs without a definition are reported on stdout and skipped.
      def run_usage(logs)
        # Create a local copy of the usage for each SHA then merge it in at the end
        threads = process_in_threads(logs) do |local_usage, line|
          next if line.empty?

          last_called, sha = line.split("|", 2)
          last_called = Time.at(last_called.to_i).utc

          usage = (local_usage[sha] ||= {"count" => 0})
          usage["count"] += 1

          if !usage["last_called"] || usage["last_called"] < last_called
            usage["last_called"] = last_called
          end
        end

        # Merge everything
        threads.each do |thread|
          thread.value.each do |prefix, data|
            definitions[prefix] ||= {}

            data.each do |sha, usage|
              definition = definitions[prefix][sha]
              unless definition
                puts "Undefined event '#{sha}'"
                next
              end

              definition["count"] ||= 0
              definition["count"] += usage["count"]

              if !definition["last_called"] || definition["last_called"] < usage["last_called"]
                definition["last_called"] = usage["last_called"]
              end
            end
          end
        end
      end

      # Writes each prefix's data as JSON to
      # "<dest_dir>/<prefix>_<YYYY-MM-DD>.log", replacing any existing file.
      def dump(dest_dir)
        definitions.each do |prefix, data|
          path = "#{dest_dir}/#{prefix}_#{Time.now.strftime("%Y-%m-%d")}.log"
          puts "Writing logs to '#{path}' (#{data.length} queries)"

          File.write(path, data.to_json)
        end
      end

      private

      # Queues every log and spins up +concurrency+ threads that drain the
      # queue through +process_queue+ with the given line handler.
      # Returns the threads; each thread's value is its local result hash.
      def process_in_threads(logs, &handler)
        queue = Queue.new
        logs.each { |log| queue << log }

        concurrency.times.map do
          Thread.new(queue) do |t_queue|
            CLIProcessor.process_queue(t_queue, &handler)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require "digest"
2
+
3
module ActiveRecord
  module SqlAnalyzer
    # Logger that writes each distinct event once to a "_definitions" log,
    # keyed by an MD5 digest, and writes a "timestamp|digest" line to the
    # regular log file for every call.
    # NOTE(review): log_root, log_prefix and log_file are presumably provided
    # by the parent Logger class - confirm there.
    class CompactLogger < Logger
      attr_reader :logged_shas, :definition_log_file

      def initialize(*)
        super

        @logged_shas = Set.new

        definitions_path = "#{log_root}/#{log_prefix}_definitions.log"
        @definition_log_file = File.open(definitions_path, "a+")
      end

      # Records one occurrence of +event+. The definition line is only
      # written the first time this instance sees the event's digest.
      def log(event)
        digest = Digest::MD5.hexdigest(event.to_s)
        write_definition(digest, event) unless logged_shas.include?(digest)

        log_file.puts("#{Time.now.to_i}|#{digest}")
      end

      # Closes the definitions file, ignoring any StandardError it raises,
      # then lets the parent class close whatever it owns.
      def close
        begin
          @definition_log_file.close
        rescue StandardError
          nil
        end

        super
      end

      private

      # Appends "digest|json" to the definitions file and remembers the digest.
      def write_definition(digest, event)
        definition_log_file.puts("#{digest}|#{event.to_json}")
        logged_shas << digest
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
module ActiveRecord
  module SqlAnalyzer
    # Holds every tunable of the SQL analyzer in the +options+ hash. Each
    # public method validates its argument and stores it; +config[:key]+
    # reads a value back.
    class Configuration
      attr_reader :options

      def initialize
        @options = {}
        setup_defaults
      end

      # Setup a custom proc that filters out lines before passing them to the loggers.
      # By default, this attempts to filter out all non-app code lines.
      def backtrace_filter_proc(proc)
        check_proc(proc, 1, "the backtrace lines")
        @options[:backtrace_filter_proc] = proc
      end

      # Setup a new analyzer for monitoring tables
      #
      # add_analyzer(
      #   name: 'users',
      #   tables: %w(users permissions),
      #   logger: ActiveRecord::SqlAnalyzer::RedactedLogger
      # )
      #
      # Will setup an analyzer that looks at the tables users and permissions
      # when it finds relevant data, it passes it through the `RedactedLogger` class.
      # When calling the proc passed to `log_sample_proc`, it will use the name
      # `users` to help identify it, as well as when logging to disk.
      #
      # Returns the new analyzer.
      def add_analyzer(result)
        analyzer = Analyzer.new
        analyzer.name(result[:name])
        analyzer.tables(result[:tables])
        analyzer.logger(result[:logger])
        analyzer.setup

        @options[:analyzers] << analyzer
        analyzer
      end

      # Root path where all logs go.
      # Defaults to `Rails.root.join('log')`
      #
      # Raises ArgumentError when the path is not an existing directory.
      def logger_root_path(path)
        unless Dir.exist?(path)
          raise ArgumentError, "Path '#{path}' is not a directory"
        end

        @options[:logger_root_path] = path
      end

      # Set a proc that determines whether or not to log a single event.
      # This must be set to log anything, and controls how many SQL queries you look at.
      #
      # Proc.new { |name| true }
      #
      # Will log everything no matter what
      #
      # Proc.new do |name|
      #   rand(1..100) <= 50
      # end
      #
      # Will only log 50% of queries.
      #
      # You can hook this into something like Redis to allow dynamic control of the ratio
      # without having to redeploy/restart your application.
      #
      def log_sample_proc(proc)
        check_proc(proc, 1, "the analyzer name")
        @options[:should_log_sample_proc] = proc
      end

      # For hooking in more complicated redactions beyond a simple find/replace.
      def complex_sql_redactor_proc(proc)
        check_proc(proc, 1, "the SQL statement")
        @options[:sql_redactor_complex_proc] = proc
      end

      # Additional redactors to filter out data in SQL you don't want logged.
      # +list+ is an array of [Regexp, String] pairs.
      def add_sql_redactors(list)
        @options[:sql_redactors].concat(create_redactors(list))
      end

      # Backtrace redactors filter out data in the backtrace
      # useful if you want to get rid of lines numbers.
      # +list+ is an array of [Regexp, String] pairs.
      def add_backtrace_redactors(list)
        @options[:backtrace_redactors].concat(create_redactors(list))
      end

      # If the first line in the backtrace matches the regex given, we switch to
      # ambiguous tracing mode for that call where we log more of the backtrace.
      #
      # As an example, if you find you're only getting middleware, you could use:
      #
      # %r{\Aapp/middleware/query_string_sanitizer\.rb:\d+:in `call'\z}
      #
      # Which would log up to ambiguous_backtrace_lines (default 3) total lines,
      # rather than the default 1.
      def add_ambiguous_tracers(list)
        list.each do |row|
          unless row.is_a?(Regexp)
            raise ArgumentError, "Tracing filters must be a Regexp to match on"
          end
        end

        @options[:ambiguous_tracers].concat(list)
      end

      # How many total lines to log when the caller is ambiguous.
      # Must be an integer greater than 1.
      def ambiguous_backtrace_lines(lines)
        # Integer rather than Fixnum: Fixnum was merged into Integer and the
        # constant no longer exists on current Rubies. The message text is
        # kept as it was for anything matching on it.
        if !lines.is_a?(Integer)
          raise ArgumentError, "Lines must be a Fixnum"
        elsif lines <= 1
          raise ArgumentError, "Lines cannot be <= 1"
        end

        @options[:ambiguous_backtrace_lines] = lines
      end

      # Reads a single option.
      def [](key)
        @options[key]
      end

      private

      # Raises ArgumentError unless +proc+ is a Proc whose arity equals
      # +arity+. +msg+ names what the argument represents in the error text.
      def check_proc(proc, arity, msg)
        if !proc.is_a?(Proc)
          raise ArgumentError, "You must pass a proc"
        elsif proc.arity != arity
          raise ArgumentError, "Proc must accept #{arity} argument for #{msg}"
        end
      end

      # Turns [Regexp, String] pairs into Redactor instances, raising
      # ArgumentError for a row of the wrong length or with wrong types.
      def create_redactors(list)
        list.map do |redact|
          if redact.length != 2
            raise ArgumentError, "Redactor row should only have two entries"
          elsif !redact.first.is_a?(Regexp)
            raise ArgumentError, "First value in pair must be a Regexp to match on"
          elsif !redact.last.is_a?(String)
            raise ArgumentError, "Last value in pair must be a String to replace with"
          end

          Redactor.new(*redact)
        end
      end

      # Populates every option with its default value. By default nothing is
      # sampled (the sample proc returns false) and the complex redactor
      # returns the SQL unchanged.
      def setup_defaults
        @options[:sql_redactors] = [
          Redactor.new(/\n/, " "),
          Redactor.new(/\s+/, " "),
          Redactor.new(/(\s|\b|`)(=|!=|>=|>|<=|<) ?(BINARY )?-?\d+(\.\d+)?/, " = [REDACTED]"),
          Redactor.new(/(\s|\b|`)(=|!=|>=|>|<=|<) ?(BINARY )?x?'[^']*'/, " = '[REDACTED]'"),
          Redactor.new(/VALUES \(.+\)$/, "VALUES ([REDACTED])"),
          Redactor.new(/IN \([^)]+\)/, "IN ([REDACTED])"),
          Redactor.new(/BETWEEN '[^']*' AND '[^']*'/, "BETWEEN '[REDACTED]' AND '[REDACTED]'"),
          Redactor.new(/LIKE '[^'\\]*(?:\\.[^'\\]*)*'/, "LIKE '[REDACTED]'"),
          Redactor.new(/ LIMIT \d+/, ""),
          Redactor.new(/ OFFSET \d+/, ""),
          Redactor.new(/INSERT INTO (`?\w+`?) \([^)]+\)/, 'INSERT INTO \1 ([COLUMNS])'),
        ]

        @options[:should_log_sample_proc] = Proc.new { |_name| false }
        @options[:sql_redactor_complex_proc] = Proc.new { |sql| sql }
        @options[:backtrace_redactors] = []
        @options[:ambiguous_tracers] = []
        @options[:ambiguous_backtrace_lines] = 3
        @options[:analyzers] = []
        @options[:logger_root_path] = Rails.root.join('log')
        @options[:backtrace_filter_proc] = BacktraceFilter.proc
      end
    end
  end
end