scout_apm 1.5.5 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.markdown +8 -0
  3. data/lib/scout_apm.rb +3 -0
  4. data/lib/scout_apm/agent.rb +23 -25
  5. data/lib/scout_apm/agent/reporting.rb +8 -3
  6. data/lib/scout_apm/attribute_arranger.rb +4 -0
  7. data/lib/scout_apm/bucket_name_splitter.rb +3 -3
  8. data/lib/scout_apm/config.rb +4 -2
  9. data/lib/scout_apm/histogram.rb +20 -0
  10. data/lib/scout_apm/instruments/percentile_sampler.rb +37 -0
  11. data/lib/scout_apm/instruments/process/process_cpu.rb +12 -0
  12. data/lib/scout_apm/instruments/process/process_memory.rb +12 -0
  13. data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
  14. data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
  15. data/lib/scout_apm/layer_converters/slow_request_converter.rb +28 -22
  16. data/lib/scout_apm/metric_meta.rb +5 -1
  17. data/lib/scout_apm/metric_set.rb +1 -1
  18. data/lib/scout_apm/reporter.rb +3 -1
  19. data/lib/scout_apm/request_histograms.rb +46 -0
  20. data/lib/scout_apm/scored_item_set.rb +79 -0
  21. data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
  22. data/lib/scout_apm/slow_job_policy.rb +89 -19
  23. data/lib/scout_apm/slow_job_record.rb +20 -1
  24. data/lib/scout_apm/slow_request_policy.rb +80 -12
  25. data/lib/scout_apm/slow_transaction.rb +19 -2
  26. data/lib/scout_apm/store.rb +45 -15
  27. data/lib/scout_apm/tracked_request.rb +33 -10
  28. data/lib/scout_apm/version.rb +1 -1
  29. data/test/test_helper.rb +4 -3
  30. data/test/unit/layaway_test.rb +5 -8
  31. data/test/unit/scored_item_set_test.rb +65 -0
  32. data/test/unit/serializers/payload_serializer_test.rb +2 -1
  33. data/test/unit/slow_item_set_test.rb +2 -1
  34. data/test/unit/slow_request_policy_test.rb +42 -0
  35. metadata +9 -2
@@ -17,7 +17,11 @@ class MetricMeta
17
17
 
18
18
  # Unsure if type or bucket is a better name.
19
19
  def type
20
- bucket
20
+ bucket_type
21
+ end
22
+
23
+ def name
24
+ bucket_name
21
25
  end
22
26
 
23
27
  # A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
@@ -2,7 +2,7 @@ module ScoutApm
2
2
  class MetricSet
3
3
  # We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
4
4
  # TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
5
- PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
5
+ PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction", "Percentile", "Job"]
6
6
 
7
7
  attr_reader :metrics
8
8
 
@@ -17,8 +17,10 @@ module ScoutApm
17
17
 
18
18
  # TODO: Parse & return a real response object, not the HTTP Response object
19
19
  def report(payload, headers = {})
20
- Array(config.value('host')).each do |host|
20
+ # Some posts (typically ones under development) bypass the ingestion pipeline and go directly to the webserver. They use direct_host instead of host
21
+ hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
21
22
 
23
+ Array(hosts).each do |host|
22
24
  full_uri = uri(host)
23
25
  response = post(full_uri, payload, headers)
24
26
  unless response && response.is_a?(Net::HTTPSuccess)
@@ -0,0 +1,46 @@
1
+ module ScoutApm
2
+ class RequestHistograms
3
+ DEFAULT_HISTOGRAM_SIZE = 50
4
+
5
+ # Private Accessor:
6
+ # A hash of Endpoint Name to an approximate histogram
7
+ #
8
+ # Each time a new request is requested to see if it's slow or not, we
9
+ # should insert it into the histogram, and get the approximate percentile
10
+ # of that time
11
+ attr_reader :histograms
12
+ private :histograms
13
+
14
+ attr_reader :histogram_size
15
+
16
+ def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
17
+ @histogram_size = histogram_size
18
+ initialize_histograms_hash
19
+ end
20
+
21
+ def each_name
22
+ @histograms.keys.each { |n| yield n }
23
+ end
24
+
25
+ def add(item, value)
26
+ @histograms[item].add(value)
27
+ end
28
+
29
+ def approximate_quantile_of_value(item, value)
30
+ @histograms[item].approximate_quantile_of_value(value)
31
+ end
32
+
33
+ def quantile(item, q)
34
+ @histograms[item].quantile(q)
35
+ end
36
+
37
+ # Wipes all histograms, setting them back to empty
38
+ def reset_all!
39
+ initialize_histograms_hash
40
+ end
41
+
42
+ def initialize_histograms_hash
43
+ @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,79 @@
1
+ # Attempts to keep the highest score.
2
+ #
3
+ # Each item must respond to:
4
+ # #call to get the storable item
5
+ # #name to get a unique identifier of the storable
6
+ # #score to get a numeric score, where higher is better
7
+ module ScoutApm
8
+ class ScoredItemSet
9
+ include Enumerable
10
+
11
+ # A number larger than any score we will actually get.
12
+ ARBITRARILY_LARGE = 100000000
13
+
14
+ # Without otherwise saying, default the size to this
15
+ DEFAULT_MAX_SIZE = 10
16
+
17
+ attr_reader :max_size
18
+ attr_reader :items
19
+
20
+ def initialize(max_size = DEFAULT_MAX_SIZE)
21
+ @items = {}
22
+ @max_size = max_size
23
+ end
24
+
25
+ def each
26
+ items.each do |(_, (_, item))|
27
+ yield item
28
+ end
29
+ end
30
+
31
+ # This function is a large if statement, with a few branches. See inline comments for each branch.
32
+ def <<(new_item)
33
+ return if new_item.name == :unknown
34
+
35
+ # If we have this item in the hash already, compare the new & old ones, and store
36
+ # the new one only if it's higher score.
37
+ if items.has_key?(new_item.name)
38
+ if new_item.score > items[new_item.name].first
39
+ store!(new_item)
40
+ end
41
+
42
+
43
+ # If the set is full, then we have to see if we evict anything to store
44
+ # this one
45
+ elsif full?
46
+ smallest_name, smallest_score = items.inject([nil, ARBITRARILY_LARGE]) do |(memo_name, memo_score), (name, (stored_score, _))|
47
+ if stored_score < memo_score
48
+ [name, stored_score]
49
+ else
50
+ [memo_name, memo_score]
51
+ end
52
+ end
53
+
54
+ if smallest_score < new_item.score
55
+ items.delete(smallest_name)
56
+ store!(new_item)
57
+ end
58
+
59
+
60
+ # Set isn't full, and we've not seen this new_item, so go ahead and store it.
61
+ else
62
+ store!(new_item)
63
+ end
64
+ end
65
+
66
+
67
+ private
68
+
69
+ def full?
70
+ items.size >= max_size
71
+ end
72
+
73
+ def store!(new_item)
74
+ if !new_item.name.nil? # Never store a nil name.
75
+ items[new_item.name] = [new_item.score, new_item.call]
76
+ end
77
+ end
78
+ end
79
+ end
@@ -20,6 +20,8 @@ module ScoutApm
20
20
 
21
21
  "metrics" => MetricsToJsonSerializer.new(job.metrics).as_json, # New style of metrics
22
22
  "context" => job.context.to_hash,
23
+
24
+ "score" => job.score,
23
25
  }
24
26
  end
25
27
  end
@@ -1,29 +1,99 @@
1
- # Create one of these at startup time, and ask it if a certain worker's
2
- # processing time is slow enough for us to collect a slow trace.
3
- #
4
- # Keeps track of a histogram of times for each worker class (separately), and
5
- # uses a percentile of normal to mark individual runs as "slow".
6
- #
7
- # This assumes that all worker calls will be requested once to `slow?`, so that
8
- # the data can be stored
1
+ # Long running class that determines if, and in how much detail a potentially
2
+ # slow job should be recorded
3
+
9
4
  module ScoutApm
10
5
  class SlowJobPolicy
11
- DEFAULT_HISTOGRAM_SIZE = 50
6
+ CAPTURE_TYPES = [
7
+ CAPTURE_DETAIL = "capture_detail",
8
+ CAPTURE_NONE = "capture_none",
9
+ ]
10
+
11
+ # Adjust speed points. See the function
12
+ POINT_MULTIPLIER_SPEED = 0.25
13
+
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Job Names to the last time we stored a slow trace for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
12
26
 
13
- QUANTILE = 95
14
27
 
15
- def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
16
- @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
17
31
  end
18
32
 
19
- # worker: just the worker class name. "PasswordResetJob" or similar
20
- # total_time: runtime of the job in seconds
21
- # returns true if this request should be stored in higher trace detail, false otherwise
22
- def slow?(worker, total_time)
23
- @histograms[worker].add(total_time)
24
- return false if @histograms[worker].total == 1 # First call is never slow
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this job trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
49
+ end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
60
+ end
61
+
62
+ private
63
+
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
25
94
 
26
- total_time >= @histograms[worker].quantile(QUANTILE)
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
27
97
  end
28
98
  end
29
99
  end
@@ -15,7 +15,9 @@ module ScoutApm
15
15
 
16
16
  attr_reader :metrics
17
17
 
18
- def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics)
18
+ attr_reader :score
19
+
20
+ def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, score)
19
21
  @queue_name = queue_name
20
22
  @job_name = job_name
21
23
  @time = time
@@ -23,11 +25,28 @@ module ScoutApm
23
25
  @exclusive_time = exclusive_time
24
26
  @context = context
25
27
  @metrics = metrics
28
+ @score = score
26
29
  end
27
30
 
28
31
  def metric_name
29
32
  "Job/#{queue_name}/#{job_name}"
30
33
  end
31
34
 
35
+ ########################
36
+ # Scorable interface
37
+ #
38
+ # Needed so we can merge ScoredItemSet instances
39
+ def call
40
+ self
41
+ end
42
+
43
+ def name
44
+ metric_name
45
+ end
46
+
47
+ def score
48
+ @score
49
+ end
50
+
32
51
  end
33
52
  end
@@ -1,8 +1,5 @@
1
1
  # Long running class that determines if, and in how much detail a potentially
2
2
 # slow transaction should be recorded
3
- #
4
- # Rules:
5
- # - Runtime must be slower than a threshold
6
3
 
7
4
  module ScoutApm
8
5
  class SlowRequestPolicy
@@ -11,21 +8,92 @@ module ScoutApm
11
8
  CAPTURE_NONE = "capture_none",
12
9
  ]
13
10
 
14
- # It's not slow unless it's at least this slow
15
- SLOW_REQUEST_TIME_THRESHOLD = 2.0 # seconds
11
+ # Adjust speed points. See the function
12
+ POINT_MULTIPLIER_SPEED = 0.25
16
13
 
17
- def capture_type(time)
18
- if !slow_enough?(time)
19
- CAPTURE_NONE
20
- else
21
- CAPTURE_DETAIL
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Endpoint Name to the last time we stored a slow transaction for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
26
+
27
+
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
31
+ end
32
+
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this request trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
22
49
  end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
23
60
  end
24
61
 
25
62
  private
26
63
 
27
- def slow_enough?(time)
28
- time > SLOW_REQUEST_TIME_THRESHOLD
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
94
+
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
29
97
  end
30
98
  end
31
99
  end
@@ -12,7 +12,7 @@ module ScoutApm
12
12
  attr_reader :prof
13
13
  attr_reader :raw_prof
14
14
 
15
- def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof)
15
+ def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof, score)
16
16
  @uri = uri
17
17
  @metric_name = metric_name
18
18
  @total_call_time = total_call_time
@@ -21,6 +21,7 @@ module ScoutApm
21
21
  @time = time
22
22
  @prof = ScoutApm::StackprofTreeCollapser.new(raw_stackprof).call
23
23
  @raw_prof = raw_stackprof # Send whole data up to server
24
+ @score = score
24
25
  end
25
26
 
26
27
  # Used to remove metrics when the payload will be too large.
@@ -34,12 +35,28 @@ module ScoutApm
34
35
  end
35
36
 
36
37
  def as_json
37
- json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof]
38
+ json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof, :score]
38
39
  ScoutApm::AttributeArranger.call(self, json_attributes)
39
40
  end
40
41
 
41
42
  def context_hash
42
43
  context.to_hash
43
44
  end
45
+
46
+ ########################
47
+ # Scorable interface
48
+ #
49
+ # Needed so we can merge ScoredItemSet instances
50
+ def call
51
+ self
52
+ end
53
+
54
+ def name
55
+ metric_name
56
+ end
57
+
58
+ def score
59
+ @score
60
+ end
44
61
  end
45
62
  end