scout_apm 2.0.0.pre → 2.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.markdown +22 -5
  4. data/Rakefile +5 -0
  5. data/lib/scout_apm.rb +4 -0
  6. data/lib/scout_apm/agent.rb +22 -8
  7. data/lib/scout_apm/agent/reporting.rb +8 -3
  8. data/lib/scout_apm/attribute_arranger.rb +4 -0
  9. data/lib/scout_apm/bucket_name_splitter.rb +3 -3
  10. data/lib/scout_apm/config.rb +5 -2
  11. data/lib/scout_apm/histogram.rb +20 -0
  12. data/lib/scout_apm/instant_reporting.rb +40 -0
  13. data/lib/scout_apm/instruments/action_controller_rails_3_rails4.rb +11 -1
  14. data/lib/scout_apm/instruments/percentile_sampler.rb +38 -0
  15. data/lib/scout_apm/layaway.rb +1 -4
  16. data/lib/scout_apm/layaway_file.rb +26 -2
  17. data/lib/scout_apm/layer.rb +1 -1
  18. data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
  19. data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
  20. data/lib/scout_apm/layer_converters/slow_request_converter.rb +37 -24
  21. data/lib/scout_apm/metric_meta.rb +5 -1
  22. data/lib/scout_apm/metric_set.rb +15 -6
  23. data/lib/scout_apm/reporter.rb +9 -3
  24. data/lib/scout_apm/request_histograms.rb +46 -0
  25. data/lib/scout_apm/scored_item_set.rb +79 -0
  26. data/lib/scout_apm/serializers/payload_serializer_to_json.rb +2 -0
  27. data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
  28. data/lib/scout_apm/slow_job_policy.rb +89 -19
  29. data/lib/scout_apm/slow_job_record.rb +18 -1
  30. data/lib/scout_apm/slow_request_policy.rb +80 -12
  31. data/lib/scout_apm/slow_transaction.rb +22 -3
  32. data/lib/scout_apm/store.rb +35 -13
  33. data/lib/scout_apm/tracked_request.rb +63 -11
  34. data/lib/scout_apm/utils/backtrace_parser.rb +4 -4
  35. data/lib/scout_apm/utils/sql_sanitizer.rb +1 -1
  36. data/lib/scout_apm/utils/sql_sanitizer_regex.rb +2 -2
  37. data/lib/scout_apm/utils/sql_sanitizer_regex_1_8_7.rb +2 -2
  38. data/lib/scout_apm/version.rb +1 -1
  39. data/scout_apm.gemspec +1 -0
  40. data/test/test_helper.rb +4 -3
  41. data/test/unit/layaway_test.rb +5 -8
  42. data/test/unit/metric_set_test.rb +101 -0
  43. data/test/unit/scored_item_set_test.rb +65 -0
  44. data/test/unit/serializers/payload_serializer_test.rb +2 -1
  45. data/test/unit/slow_item_set_test.rb +2 -1
  46. data/test/unit/slow_request_policy_test.rb +42 -0
  47. data/test/unit/sql_sanitizer_test.rb +6 -0
  48. metadata +28 -3
@@ -19,10 +19,12 @@ module ScoutApm
19
19
  # render :update
20
20
  # end
21
21
  def scope_layer
22
- @scope_layer ||= walker.walk do |layer|
23
- if layer.type == "Controller"
24
- break layer
25
- end
22
+ @scope_layer ||= find_first_layer_of_type("Controller") || find_first_layer_of_type("Job")
23
+ end
24
+
25
+ def find_first_layer_of_type(layer_type)
26
+ walker.walk do |layer|
27
+ return layer if layer.type == layer_type
26
28
  end
27
29
  end
28
30
  end
@@ -4,15 +4,29 @@ module ScoutApm
4
4
  def initialize(*)
5
5
  @backtraces = []
6
6
  super
7
+
8
+ # After call to super, so @request is populated
9
+ @points = if request.job?
10
+ ScoutApm::Agent.instance.slow_job_policy.score(request)
11
+ else
12
+ -1
13
+ end
7
14
  end
8
15
 
9
- def call
10
- return unless request.job?
16
+ def name
17
+ request.unique_name
18
+ end
19
+
20
+ def score
21
+ @points
22
+ end
11
23
 
12
- job_name = [queue_layer.name, job_layer.name]
24
+ def call
25
+ return nil unless request.job?
26
+ return nil unless queue_layer
27
+ return nil unless job_layer
13
28
 
14
- slow_enough = ScoutApm::Agent.instance.slow_job_policy.slow?(job_name, root_layer.total_call_time)
15
- return unless slow_enough
29
+ ScoutApm::Agent.instance.slow_job_policy.stored!(request)
16
30
 
17
31
  # record the change in memory usage
18
32
  mem_delta = ScoutApm::Instruments::Process::ProcessMemory.rss_to_mb(request.capture_mem_delta!)
@@ -32,8 +46,8 @@ module ScoutApm
32
46
  timing_metrics,
33
47
  allocation_metrics,
34
48
  mem_delta,
35
- job_layer.total_allocations
36
- )
49
+ job_layer.total_allocations,
50
+ score)
37
51
  end
38
52
 
39
53
  def queue_layer
@@ -44,12 +58,6 @@ module ScoutApm
44
58
  @job_layer ||= find_first_layer_of_type("Job")
45
59
  end
46
60
 
47
- def find_first_layer_of_type(layer_type)
48
- walker.walk do |layer|
49
- return layer if layer.type == layer_type
50
- end
51
- end
52
-
53
61
  def create_metrics
54
62
  metric_hash = Hash.new
55
63
  allocation_metric_hash = Hash.new
@@ -4,25 +4,34 @@ module ScoutApm
4
4
  def initialize(*)
5
5
  @backtraces = [] # An Array of MetricMetas that have a backtrace
6
6
  super
7
+
8
+ # After call to super, so @request is populated
9
+ @points = if request.web?
10
+ ScoutApm::Agent.instance.slow_request_policy.score(request)
11
+ else
12
+ -1
13
+ end
14
+ end
15
+
16
+ def name
17
+ request.unique_name
18
+ end
19
+
20
+ def score
21
+ @points
7
22
  end
8
23
 
24
+ # Unconditionally attempts to convert this into a SlowTransaction object.
25
+ # Can return nil if the request didn't have any scope_layer.
9
26
  def call
10
27
  scope = scope_layer
11
- return [nil, {}] unless scope
28
+ return nil unless scope
12
29
 
13
- policy = ScoutApm::Agent.instance.slow_request_policy.capture_type(root_layer.total_call_time)
14
- if policy == ScoutApm::SlowRequestPolicy::CAPTURE_NONE
15
- return [nil, {}]
16
- end
30
+ ScoutApm::Agent.instance.slow_request_policy.stored!(request)
17
31
 
18
32
  # record the change in memory usage
19
33
  mem_delta = ScoutApm::Instruments::Process::ProcessMemory.rss_to_mb(@request.capture_mem_delta!)
20
34
 
21
- # increment the slow transaction count if this is a slow transaction.
22
- meta = MetricMeta.new("SlowTransaction/#{scope.legacy_metric_name}")
23
- stat = MetricStats.new
24
- stat.update!(1)
25
-
26
35
  uri = request.annotations[:uri] || ""
27
36
 
28
37
  timing_metrics, allocation_metrics = create_metrics
@@ -30,23 +39,27 @@ module ScoutApm
30
39
  allocation_metrics = {}
31
40
  end
32
41
 
42
+ ScoutApm::Agent.instance.config.value("ignore_traces").each do |pattern|
43
+ if /#{pattern}/ =~ uri
44
+ ScoutApm::Agent.instance.logger.debug("Skipped recording a trace for #{uri} due to `ignore_traces` pattern: #{pattern}")
45
+ return nil
46
+ end
47
+ end
48
+
33
49
  # Disable stackprof output for now
34
50
  stackprof = [] # request.stackprof
35
51
 
36
- [
37
- SlowTransaction.new(uri,
38
- scope.legacy_metric_name,
39
- root_layer.total_call_time,
40
- timing_metrics,
41
- allocation_metrics,
42
- request.context,
43
- root_layer.stop_time,
44
- stackprof,
45
- mem_delta,
46
- root_layer.total_allocations
47
- ),
48
- { meta => stat }
49
- ]
52
+ SlowTransaction.new(uri,
53
+ scope.legacy_metric_name,
54
+ root_layer.total_call_time,
55
+ timing_metrics,
56
+ allocation_metrics,
57
+ request.context,
58
+ root_layer.stop_time,
59
+ stackprof,
60
+ mem_delta,
61
+ root_layer.total_allocations,
62
+ @points)
50
63
  end
51
64
 
52
65
  # Iterates over the TrackedRequest's MetricMetas that have backtraces and attaches each to correct MetricMeta in the Metric Hash.
@@ -17,7 +17,11 @@ class MetricMeta
17
17
 
18
18
  # Unsure if type or bucket is a better name.
19
19
  def type
20
- bucket
20
+ bucket_type
21
+ end
22
+
23
+ def name
24
+ bucket_name
21
25
  end
22
26
 
23
27
  # A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
@@ -2,7 +2,7 @@ module ScoutApm
2
2
  class MetricSet
3
3
  # We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
4
4
  # TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
5
- PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
5
+ PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction", "Percentile", "Job"]
6
6
 
7
7
  attr_reader :metrics
8
8
 
@@ -23,11 +23,15 @@ module ScoutApm
23
23
  @metrics[meta].combine!(stat)
24
24
 
25
25
  elsif meta.type == "Errors" # Sadly special cased, we want both raw and aggregate values
26
- @metrics[meta] ||= MetricStats.new
27
- @metrics[meta].combine!(stat)
28
- agg_meta = MetricMeta.new("Errors/Request", :scope => meta.scope)
29
- @metrics[agg_meta] ||= MetricStats.new
30
- @metrics[agg_meta].combine!(stat)
26
+ # When combining MetricSets between different sources, avoid double-counting the aggregate Errors/Request metric (see combine! below)
27
+ @metrics[meta] ||= MetricStats.new
28
+ @metrics[meta].combine!(stat)
29
+
30
+ if !@combine_in_progress
31
+ agg_meta = MetricMeta.new("Errors/Request", :scope => meta.scope)
32
+ @metrics[agg_meta] ||= MetricStats.new
33
+ @metrics[agg_meta].combine!(stat)
34
+ end
31
35
 
32
36
  else # Combine down to a single /all key
33
37
  agg_meta = MetricMeta.new("#{meta.type}/all", :scope => meta.scope)
@@ -36,8 +40,13 @@ module ScoutApm
36
40
  end
37
41
  end
38
42
 
43
+ # Sets a combine_in_progress flag to prevent double-counting Error metrics.
44
+ # Without it, the Errors/Request number would be increasingly off as
45
+ # metric_sets get merged in.
39
46
  def combine!(other)
47
+ @combine_in_progress = true
40
48
  absorb_all(other.metrics)
49
+ @combine_in_progress = false
41
50
  self
42
51
  end
43
52
  end
@@ -8,17 +8,21 @@ module ScoutApm
8
8
  attr_reader :config
9
9
  attr_reader :logger
10
10
  attr_reader :type
11
+ attr_reader :instant_key
11
12
 
12
- def initialize(type = :checkin, config=Agent.instance.config, logger=Agent.instance.logger)
13
+ def initialize(type = :checkin, config=Agent.instance.config, logger=Agent.instance.logger, instant_key=nil)
13
14
  @config = config
14
15
  @logger = logger
15
16
  @type = type
17
+ @instant_key = instant_key
16
18
  end
17
19
 
18
20
  # TODO: Parse & return a real response object, not the HTTP Response object
19
21
  def report(payload, headers = {})
20
- Array(config.value('host')).each do |host|
22
+ # Some posts (typically ones under development) bypass the ingestion pipeline and go directly to the webserver. They use direct_host instead of host.
23
+ hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
21
24
 
25
+ Array(hosts).each do |host|
22
26
  full_uri = uri(host)
23
27
  response = post(full_uri, payload, headers)
24
28
  unless response && response.is_a?(Net::HTTPSuccess)
@@ -34,7 +38,9 @@ module ScoutApm
34
38
  when :app_server_load
35
39
  URI.parse("#{host}/apps/app_server_load.scout?key=#{config.value('key')}&name=#{CGI.escape(Environment.instance.application_name)}")
36
40
  when :deploy_hook
37
- URI.parse("https://apm.scoutapp.com/apps/deploy.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}")
41
+ URI.parse("#{host}/apps/deploy.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}")
42
+ when :instant_trace
43
+ URI.parse("#{host}/apps/instant_trace.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}&instant_key=#{instant_key}")
38
44
  end.tap{|u| logger.debug("Posting to #{u.to_s}")}
39
45
  end
40
46
 
@@ -0,0 +1,46 @@
1
+ module ScoutApm
2
+ class RequestHistograms
3
+ DEFAULT_HISTOGRAM_SIZE = 50
4
+
5
+ # Private Accessor:
6
+ # A hash of Endpoint Name to an approximate histogram
7
+ #
8
+ # Each time a new request is requested to see if it's slow or not, we
9
+ # should insert it into the histogram, and get the approximate percentile
10
+ # of that time
11
+ attr_reader :histograms
12
+ private :histograms
13
+
14
+ attr_reader :histogram_size
15
+
16
+ def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
17
+ @histogram_size = histogram_size
18
+ initialize_histograms_hash
19
+ end
20
+
21
+ def each_name
22
+ @histograms.keys.each { |n| yield n }
23
+ end
24
+
25
+ def add(item, value)
26
+ @histograms[item].add(value)
27
+ end
28
+
29
+ def approximate_quantile_of_value(item, value)
30
+ @histograms[item].approximate_quantile_of_value(value)
31
+ end
32
+
33
+ def quantile(item, q)
34
+ @histograms[item].quantile(q)
35
+ end
36
+
37
+ # Wipes all histograms, setting them back to empty
38
+ def reset_all!
39
+ initialize_histograms_hash
40
+ end
41
+
42
+ def initialize_histograms_hash
43
+ @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,79 @@
1
+ # Attempts to keep the highest score.
2
+ #
3
+ # Each item must respond to:
4
+ # #call to get the storable item
5
+ # #name to get a unique identifier of the storable
6
+ # #score to get a numeric score, where higher is better
7
+ module ScoutApm
8
+ class ScoredItemSet
9
+ include Enumerable
10
+
11
+ # A number larger than any score we will actually get.
12
+ ARBITRARILY_LARGE = 100000000
13
+
14
+ # Without otherwise saying, default the size to this
15
+ DEFAULT_MAX_SIZE = 10
16
+
17
+ attr_reader :max_size
18
+ attr_reader :items
19
+
20
+ def initialize(max_size = DEFAULT_MAX_SIZE)
21
+ @items = {}
22
+ @max_size = max_size
23
+ end
24
+
25
+ def each
26
+ items.each do |(_, (_, item))|
27
+ yield item
28
+ end
29
+ end
30
+
31
+ # This function is a large if statement, with a few branches. See inline comments for each branch.
32
+ def <<(new_item)
33
+ return if new_item.name == :unknown
34
+
35
+ # If we have this item in the hash already, compare the new & old ones, and store
36
+ # the new one only if it's higher score.
37
+ if items.has_key?(new_item.name)
38
+ if new_item.score > items[new_item.name].first
39
+ store!(new_item)
40
+ end
41
+
42
+
43
+ # If the set is full, then we have to see if we evict anything to store
44
+ # this one
45
+ elsif full?
46
+ smallest_name, smallest_score = items.inject([nil, ARBITRARILY_LARGE]) do |(memo_name, memo_score), (name, (stored_score, _))|
47
+ if stored_score < memo_score
48
+ [name, stored_score]
49
+ else
50
+ [memo_name, memo_score]
51
+ end
52
+ end
53
+
54
+ if smallest_score < new_item.score
55
+ items.delete(smallest_name)
56
+ store!(new_item)
57
+ end
58
+
59
+
60
+ # Set isn't full, and we've not seen this new_item, so go ahead and store it.
61
+ else
62
+ store!(new_item)
63
+ end
64
+ end
65
+
66
+
67
+ private
68
+
69
+ def full?
70
+ items.size >= max_size
71
+ end
72
+
73
+ def store!(new_item)
74
+ if !new_item.name.nil? # Never store a nil name.
75
+ items[new_item.name] = [new_item.score, new_item.call]
76
+ end
77
+ end
78
+ end
79
+ end
@@ -59,6 +59,8 @@ module ScoutApm
59
59
  "[#{all_the_elements.join(",")}]"
60
60
  when Numeric
61
61
  formatee
62
+ when Time
63
+ %Q["#{formatee.iso8601}"]
62
64
  when nil
63
65
  "null"
64
66
  else # strings and everything
@@ -24,6 +24,8 @@ module ScoutApm
24
24
  "metrics" => MetricsToJsonSerializer.new(job.metrics).as_json, # New style of metrics
25
25
  "allocation_metrics" => MetricsToJsonSerializer.new(job.allocation_metrics).as_json, # New style of metrics
26
26
  "context" => job.context.to_hash,
27
+
28
+ "score" => job.score,
27
29
  }
28
30
  end
29
31
  end
@@ -1,29 +1,99 @@
1
- # Create one of these at startup time, and ask it if a certain worker's
2
- # processing time is slow enough for us to collect a slow trace.
3
- #
4
- # Keeps track of a histogram of times for each worker class (spearately), and
5
- # uses a percentile of normal to mark individual runs as "slow".
6
- #
7
- # This assumes that all worker calls will be requested once to `slow?`, so that
8
- # the data can be stored
1
+ # Long running class that determines if, and in how much detail a potentially
2
+ # slow job should be recorded
3
+
9
4
  module ScoutApm
10
5
  class SlowJobPolicy
11
- DEFAULT_HISTOGRAM_SIZE = 50
6
+ CAPTURE_TYPES = [
7
+ CAPTURE_DETAIL = "capture_detail",
8
+ CAPTURE_NONE = "capture_none",
9
+ ]
10
+
11
+ # Adjust speed points. See the speed_points method below for how this multiplier is applied.
12
+ POINT_MULTIPLIER_SPEED = 0.25
13
+
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Job Names to the last time we stored a slow trace for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
12
26
 
13
- QUANTILE = 95
14
27
 
15
- def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
16
- @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
17
31
  end
18
32
 
19
- # worker: just the worker class name. "PasswordResetJob" or similar
20
- # total_time: runtime of the job in seconds
21
- # returns true if this request should be stored in higher trace detail, false otherwise
22
- def slow?(worker, total_time)
23
- @histograms[worker].add(total_time)
24
- return false if @histograms[worker].total == 1 # First call is never slow
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this job trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
49
+ end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
60
+ end
61
+
62
+ private
63
+
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
25
94
 
26
- total_time >= @histograms[worker].quantile(QUANTILE)
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
27
97
  end
28
98
  end
29
99
  end