scout_apm 1.5.5 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.markdown +8 -0
  3. data/lib/scout_apm.rb +3 -0
  4. data/lib/scout_apm/agent.rb +23 -25
  5. data/lib/scout_apm/agent/reporting.rb +8 -3
  6. data/lib/scout_apm/attribute_arranger.rb +4 -0
  7. data/lib/scout_apm/bucket_name_splitter.rb +3 -3
  8. data/lib/scout_apm/config.rb +4 -2
  9. data/lib/scout_apm/histogram.rb +20 -0
  10. data/lib/scout_apm/instruments/percentile_sampler.rb +37 -0
  11. data/lib/scout_apm/instruments/process/process_cpu.rb +12 -0
  12. data/lib/scout_apm/instruments/process/process_memory.rb +12 -0
  13. data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
  14. data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
  15. data/lib/scout_apm/layer_converters/slow_request_converter.rb +28 -22
  16. data/lib/scout_apm/metric_meta.rb +5 -1
  17. data/lib/scout_apm/metric_set.rb +1 -1
  18. data/lib/scout_apm/reporter.rb +3 -1
  19. data/lib/scout_apm/request_histograms.rb +46 -0
  20. data/lib/scout_apm/scored_item_set.rb +79 -0
  21. data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
  22. data/lib/scout_apm/slow_job_policy.rb +89 -19
  23. data/lib/scout_apm/slow_job_record.rb +20 -1
  24. data/lib/scout_apm/slow_request_policy.rb +80 -12
  25. data/lib/scout_apm/slow_transaction.rb +19 -2
  26. data/lib/scout_apm/store.rb +45 -15
  27. data/lib/scout_apm/tracked_request.rb +33 -10
  28. data/lib/scout_apm/version.rb +1 -1
  29. data/test/test_helper.rb +4 -3
  30. data/test/unit/layaway_test.rb +5 -8
  31. data/test/unit/scored_item_set_test.rb +65 -0
  32. data/test/unit/serializers/payload_serializer_test.rb +2 -1
  33. data/test/unit/slow_item_set_test.rb +2 -1
  34. data/test/unit/slow_request_policy_test.rb +42 -0
  35. metadata +9 -2
@@ -17,7 +17,11 @@ class MetricMeta
17
17
 
18
18
  # Unsure if type or bucket is a better name.
19
19
  def type
20
- bucket
20
+ bucket_type
21
+ end
22
+
23
+ def name
24
+ bucket_name
21
25
  end
22
26
 
23
27
  # A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
@@ -2,7 +2,7 @@ module ScoutApm
2
2
  class MetricSet
3
3
  # We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
4
4
  # TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
5
- PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
5
+ PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction", "Percentile", "Job"]
6
6
 
7
7
  attr_reader :metrics
8
8
 
@@ -17,8 +17,10 @@ module ScoutApm
17
17
 
18
18
  # TODO: Parse & return a real response object, not the HTTP Response object
19
19
  def report(payload, headers = {})
20
- Array(config.value('host')).each do |host|
20
+ # Some posts (typically ones under development) bypass the ingestion pipeline and go directly to the webserver. They use direct_host instead of host
21
+ hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
21
22
 
23
+ Array(hosts).each do |host|
22
24
  full_uri = uri(host)
23
25
  response = post(full_uri, payload, headers)
24
26
  unless response && response.is_a?(Net::HTTPSuccess)
@@ -0,0 +1,46 @@
1
+ module ScoutApm
2
+ class RequestHistograms
3
+ DEFAULT_HISTOGRAM_SIZE = 50
4
+
5
+ # Private Accessor:
6
+ # A hash of Endpoint Name to an approximate histogram
7
+ #
8
+ # Each time a new request is requested to see if it's slow or not, we
9
+ # should insert it into the histogram, and get the approximate percentile
10
+ # of that time
11
+ attr_reader :histograms
12
+ private :histograms
13
+
14
+ attr_reader :histogram_size
15
+
16
+ def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
17
+ @histogram_size = histogram_size
18
+ initialize_histograms_hash
19
+ end
20
+
21
+ def each_name
22
+ @histograms.keys.each { |n| yield n }
23
+ end
24
+
25
+ def add(item, value)
26
+ @histograms[item].add(value)
27
+ end
28
+
29
+ def approximate_quantile_of_value(item, value)
30
+ @histograms[item].approximate_quantile_of_value(value)
31
+ end
32
+
33
+ def quantile(item, q)
34
+ @histograms[item].quantile(q)
35
+ end
36
+
37
+ # Wipes all histograms, setting them back to empty
38
+ def reset_all!
39
+ initialize_histograms_hash
40
+ end
41
+
42
+ def initialize_histograms_hash
43
+ @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,79 @@
1
+ # Attempts to keep the highest score.
2
+ #
3
+ # Each item must respond to:
4
+ # #call to get the storable item
5
+ # #name to get a unique identifier of the storable
6
+ # #score to get a numeric score, where higher is better
7
+ module ScoutApm
8
+ class ScoredItemSet
9
+ include Enumerable
10
+
11
+ # A number larger than any score we will actually get.
12
+ ARBITRARILY_LARGE = 100000000
13
+
14
+ # Without otherwise saying, default the size to this
15
+ DEFAULT_MAX_SIZE = 10
16
+
17
+ attr_reader :max_size
18
+ attr_reader :items
19
+
20
+ def initialize(max_size = DEFAULT_MAX_SIZE)
21
+ @items = {}
22
+ @max_size = max_size
23
+ end
24
+
25
+ def each
26
+ items.each do |(_, (_, item))|
27
+ yield item
28
+ end
29
+ end
30
+
31
+ # This function is a large if statement, with a few branches. See inline comments for each branch.
32
+ def <<(new_item)
33
+ return if new_item.name == :unknown
34
+
35
+ # If we have this item in the hash already, compare the new & old ones, and store
36
+ # the new one only if it's higher score.
37
+ if items.has_key?(new_item.name)
38
+ if new_item.score > items[new_item.name].first
39
+ store!(new_item)
40
+ end
41
+
42
+
43
+ # If the set is full, then we have to see if we evict anything to store
44
+ # this one
45
+ elsif full?
46
+ smallest_name, smallest_score = items.inject([nil, ARBITRARILY_LARGE]) do |(memo_name, memo_score), (name, (stored_score, _))|
47
+ if stored_score < memo_score
48
+ [name, stored_score]
49
+ else
50
+ [memo_name, memo_score]
51
+ end
52
+ end
53
+
54
+ if smallest_score < new_item.score
55
+ items.delete(smallest_name)
56
+ store!(new_item)
57
+ end
58
+
59
+
60
+ # Set isn't full, and we've not seen this new_item, so go ahead and store it.
61
+ else
62
+ store!(new_item)
63
+ end
64
+ end
65
+
66
+
67
+ private
68
+
69
+ def full?
70
+ items.size >= max_size
71
+ end
72
+
73
+ def store!(new_item)
74
+ if !new_item.name.nil? # Never store a nil name.
75
+ items[new_item.name] = [new_item.score, new_item.call]
76
+ end
77
+ end
78
+ end
79
+ end
@@ -20,6 +20,8 @@ module ScoutApm
20
20
 
21
21
  "metrics" => MetricsToJsonSerializer.new(job.metrics).as_json, # New style of metrics
22
22
  "context" => job.context.to_hash,
23
+
24
+ "score" => job.score,
23
25
  }
24
26
  end
25
27
  end
@@ -1,29 +1,99 @@
1
- # Create one of these at startup time, and ask it if a certain worker's
2
- # processing time is slow enough for us to collect a slow trace.
3
- #
4
- # Keeps track of a histogram of times for each worker class (separately), and
5
- # uses a percentile of normal to mark individual runs as "slow".
6
- #
7
- # This assumes that all worker calls will be requested once to `slow?`, so that
8
- # the data can be stored
1
+ # Long running class that determines if, and in how much detail a potentially
2
+ # slow job should be recorded
3
+
9
4
  module ScoutApm
10
5
  class SlowJobPolicy
11
- DEFAULT_HISTOGRAM_SIZE = 50
6
+ CAPTURE_TYPES = [
7
+ CAPTURE_DETAIL = "capture_detail",
8
+ CAPTURE_NONE = "capture_none",
9
+ ]
10
+
11
+ # Adjust speed points. See the speed_points function below.
12
+ POINT_MULTIPLIER_SPEED = 0.25
13
+
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Job Names to the last time we stored a slow trace for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
12
26
 
13
- QUANTILE = 95
14
27
 
15
- def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
16
- @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
17
31
  end
18
32
 
19
- # worker: just the worker class name. "PasswordResetJob" or similar
20
- # total_time: runtime of the job in seconds
21
- # returns true if this request should be stored in higher trace detail, false otherwise
22
- def slow?(worker, total_time)
23
- @histograms[worker].add(total_time)
24
- return false if @histograms[worker].total == 1 # First call is never slow
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this job trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
49
+ end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
60
+ end
61
+
62
+ private
63
+
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
25
94
 
26
- total_time >= @histograms[worker].quantile(QUANTILE)
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
27
97
  end
28
98
  end
29
99
  end
@@ -15,7 +15,9 @@ module ScoutApm
15
15
 
16
16
  attr_reader :metrics
17
17
 
18
- def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics)
18
+ attr_reader :score
19
+
20
+ def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, score)
19
21
  @queue_name = queue_name
20
22
  @job_name = job_name
21
23
  @time = time
@@ -23,11 +25,28 @@ module ScoutApm
23
25
  @exclusive_time = exclusive_time
24
26
  @context = context
25
27
  @metrics = metrics
28
+ @score = score
26
29
  end
27
30
 
28
31
  def metric_name
29
32
  "Job/#{queue_name}/#{job_name}"
30
33
  end
31
34
 
35
+ ########################
36
+ # Scorable interface
37
+ #
38
+ # Needed so we can merge ScoredItemSet instances
39
+ def call
40
+ self
41
+ end
42
+
43
+ def name
44
+ metric_name
45
+ end
46
+
47
+ def score
48
+ @score
49
+ end
50
+
32
51
  end
33
52
  end
@@ -1,8 +1,5 @@
1
1
  # Long running class that determines if, and in how much detail a potentially
2
2
  # slow transaction should be recorded
3
- #
4
- # Rules:
5
- # - Runtime must be slower than a threshold
6
3
 
7
4
  module ScoutApm
8
5
  class SlowRequestPolicy
@@ -11,21 +8,92 @@ module ScoutApm
11
8
  CAPTURE_NONE = "capture_none",
12
9
  ]
13
10
 
14
- # It's not slow unless it's at least this slow
15
- SLOW_REQUEST_TIME_THRESHOLD = 2.0 # seconds
11
+ # Adjust speed points. See the function
12
+ POINT_MULTIPLIER_SPEED = 0.25
16
13
 
17
- def capture_type(time)
18
- if !slow_enough?(time)
19
- CAPTURE_NONE
20
- else
21
- CAPTURE_DETAIL
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Endpoint Name to the last time we stored a slow transaction for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
26
+
27
+
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
31
+ end
32
+
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this request trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
22
49
  end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
23
60
  end
24
61
 
25
62
  private
26
63
 
27
- def slow_enough?(time)
28
- time > SLOW_REQUEST_TIME_THRESHOLD
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
94
+
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
29
97
  end
30
98
  end
31
99
  end
@@ -12,7 +12,7 @@ module ScoutApm
12
12
  attr_reader :prof
13
13
  attr_reader :raw_prof
14
14
 
15
- def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof)
15
+ def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof, score)
16
16
  @uri = uri
17
17
  @metric_name = metric_name
18
18
  @total_call_time = total_call_time
@@ -21,6 +21,7 @@ module ScoutApm
21
21
  @time = time
22
22
  @prof = ScoutApm::StackprofTreeCollapser.new(raw_stackprof).call
23
23
  @raw_prof = raw_stackprof # Send whole data up to server
24
+ @score = score
24
25
  end
25
26
 
26
27
  # Used to remove metrics when the payload will be too large.
@@ -34,12 +35,28 @@ module ScoutApm
34
35
  end
35
36
 
36
37
  def as_json
37
- json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof]
38
+ json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof, :score]
38
39
  ScoutApm::AttributeArranger.call(self, json_attributes)
39
40
  end
40
41
 
41
42
  def context_hash
42
43
  context.to_hash
43
44
  end
45
+
46
+ ########################
47
+ # Scorable interface
48
+ #
49
+ # Needed so we can merge ScoredItemSet instances
50
+ def call
51
+ self
52
+ end
53
+
54
+ def name
55
+ metric_name
56
+ end
57
+
58
+ def score
59
+ @score
60
+ end
44
61
  end
45
62
  end