scout_apm 1.5.5 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.markdown +8 -0
- data/lib/scout_apm.rb +3 -0
- data/lib/scout_apm/agent.rb +23 -25
- data/lib/scout_apm/agent/reporting.rb +8 -3
- data/lib/scout_apm/attribute_arranger.rb +4 -0
- data/lib/scout_apm/bucket_name_splitter.rb +3 -3
- data/lib/scout_apm/config.rb +4 -2
- data/lib/scout_apm/histogram.rb +20 -0
- data/lib/scout_apm/instruments/percentile_sampler.rb +37 -0
- data/lib/scout_apm/instruments/process/process_cpu.rb +12 -0
- data/lib/scout_apm/instruments/process/process_memory.rb +12 -0
- data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
- data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
- data/lib/scout_apm/layer_converters/slow_request_converter.rb +28 -22
- data/lib/scout_apm/metric_meta.rb +5 -1
- data/lib/scout_apm/metric_set.rb +1 -1
- data/lib/scout_apm/reporter.rb +3 -1
- data/lib/scout_apm/request_histograms.rb +46 -0
- data/lib/scout_apm/scored_item_set.rb +79 -0
- data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
- data/lib/scout_apm/slow_job_policy.rb +89 -19
- data/lib/scout_apm/slow_job_record.rb +20 -1
- data/lib/scout_apm/slow_request_policy.rb +80 -12
- data/lib/scout_apm/slow_transaction.rb +19 -2
- data/lib/scout_apm/store.rb +45 -15
- data/lib/scout_apm/tracked_request.rb +33 -10
- data/lib/scout_apm/version.rb +1 -1
- data/test/test_helper.rb +4 -3
- data/test/unit/layaway_test.rb +5 -8
- data/test/unit/scored_item_set_test.rb +65 -0
- data/test/unit/serializers/payload_serializer_test.rb +2 -1
- data/test/unit/slow_item_set_test.rb +2 -1
- data/test/unit/slow_request_policy_test.rb +42 -0
- metadata +9 -2
@@ -17,7 +17,11 @@ class MetricMeta
|
|
17
17
|
|
18
18
|
# Unsure if type or bucket is a better name.
|
19
19
|
def type
|
20
|
-
|
20
|
+
bucket_type
|
21
|
+
end
|
22
|
+
|
23
|
+
def name
|
24
|
+
bucket_name
|
21
25
|
end
|
22
26
|
|
23
27
|
# A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
|
data/lib/scout_apm/metric_set.rb
CHANGED
@@ -2,7 +2,7 @@ module ScoutApm
|
|
2
2
|
class MetricSet
|
3
3
|
# We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
|
4
4
|
# TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
|
5
|
-
PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
|
5
|
+
PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction", "Percentile", "Job"]
|
6
6
|
|
7
7
|
attr_reader :metrics
|
8
8
|
|
data/lib/scout_apm/reporter.rb
CHANGED
@@ -17,8 +17,10 @@ module ScoutApm
|
|
17
17
|
|
18
18
|
# TODO: Parse & return a real response object, not the HTTP Response object
|
19
19
|
def report(payload, headers = {})
|
20
|
-
|
20
|
+
# Some posts (typically ones under development) bypass the ingestion pipeline and go directly to the webserver. They use direct_host instead of host
|
21
|
+
hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
|
21
22
|
|
23
|
+
Array(hosts).each do |host|
|
22
24
|
full_uri = uri(host)
|
23
25
|
response = post(full_uri, payload, headers)
|
24
26
|
unless response && response.is_a?(Net::HTTPSuccess)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module ScoutApm
  # Registry of approximate histograms, one per item name (e.g. an endpoint).
  #
  # Histograms are created lazily: the first time a name is looked up, a fresh
  # NumericHistogram is built with +histogram_size+ and remembered. Note that
  # this also applies to the read-style queries below, so asking about a name
  # that was never added leaves an empty histogram registered for it.
  class RequestHistograms
    DEFAULT_HISTOGRAM_SIZE = 50

    # Private Accessor:
    # A hash of item name to its approximate histogram.
    #
    # Each time a request is checked for slowness, its time should be inserted
    # into the histogram for its name, and the approximate percentile of that
    # time read back.
    attr_reader :histograms
    private :histograms

    # The size argument handed to every NumericHistogram this registry builds.
    attr_reader :histogram_size

    # histogram_size - value passed to NumericHistogram.new for each new name
    #                  (presumably the maximum bin count; confirm against
    #                  NumericHistogram).
    def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
      @histogram_size = histogram_size
      initialize_histograms_hash
    end

    # Yields every name that currently has a histogram. Iterates over a
    # snapshot of the keys, so the block may safely touch the registry.
    #
    # Returns an Enumerator when called without a block (previously this
    # raised LocalJumpError).
    def each_name
      return enum_for(:each_name) unless block_given?

      @histograms.keys.each { |n| yield n }
    end

    # Records +value+ in the histogram belonging to +item+.
    def add(item, value)
      @histograms[item].add(value)
    end

    # Delegates to the histogram for +item+ to estimate which quantile
    # +value+ falls at.
    def approximate_quantile_of_value(item, value)
      @histograms[item].approximate_quantile_of_value(value)
    end

    # Delegates to the histogram for +item+ to fetch the value at quantile +q+.
    def quantile(item, q)
      @histograms[item].quantile(q)
    end

    # Wipes all histograms, setting them back to empty
    def reset_all!
      initialize_histograms_hash
    end

    # Replaces the registry with an empty hash whose default block builds (and
    # stores) a new NumericHistogram for any name on first access.
    def initialize_histograms_hash
      @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# Bounded set that keeps, for each item name, the highest-scoring item offered
# to it, and at most max_size names overall.
#
# Each item must respond to:
#   #call  to get the storable item
#   #name  to get a unique identifier of the storable
#   #score to get a numeric score, where higher is better
module ScoutApm
  class ScoredItemSet
    include Enumerable

    # Historical sentinel that used to seed the lowest-score search. The
    # search no longer relies on it, so scores at or above this value are
    # handled correctly. Kept only so existing references keep resolving.
    ARBITRARILY_LARGE = 100000000

    # Without otherwise saying, default the size to this
    DEFAULT_MAX_SIZE = 10

    # Maximum number of distinct names the set will hold.
    attr_reader :max_size

    # Hash of name => [score, stored_item], where stored_item is item.call.
    attr_reader :items

    def initialize(max_size = DEFAULT_MAX_SIZE)
      @items = {}
      @max_size = max_size
    end

    # Yields each stored item (not its name or score).
    # Returns an Enumerator when no block is given.
    def each
      return enum_for(:each) unless block_given?

      items.each do |(_, (_, item))|
        yield item
      end
    end

    # Offers new_item to the set. Three cases:
    #   * the name is already present: replace only if the new score is higher
    #   * the set is full: replace the lowest-scoring entry only if the new
    #     score is strictly higher than it
    #   * otherwise: store it
    #
    # Items named nil or :unknown are ignored up front, so they can never
    # evict a stored entry.
    def <<(new_item)
      name = new_item.name
      return if name.nil? || name == :unknown

      if items.has_key?(name)
        store!(new_item) if new_item.score > items[name].first
      elsif full?
        replace_lowest_with(new_item)
      else
        store!(new_item)
      end
    end

    private

    def full?
      items.size >= max_size
    end

    # Evicts the lowest-scoring entry in favor of new_item, but only when
    # new_item strictly outscores it. Ties keep the existing entry.
    def replace_lowest_with(new_item)
      lowest_name, lowest_entry = items.min_by { |(_, entry)| entry.first }
      return if lowest_entry.nil? # nothing stored (max_size <= 0): nothing to evict, no room to add
      return unless lowest_entry.first < new_item.score

      items.delete(lowest_name)
      store!(new_item)
    end

    def store!(new_item)
      if !new_item.name.nil? # Never store a nil name.
        items[new_item.name] = [new_item.score, new_item.call]
      end
    end
  end
end
|
@@ -1,29 +1,99 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
|
4
|
-
# Keeps track of a histogram of times for each worker class (spearately), and
|
5
|
-
# uses a percentile of normal to mark individual runs as "slow".
|
6
|
-
#
|
7
|
-
# This assumes that all worker calls will be requested once to `slow?`, so that
|
8
|
-
# the data can be stored
|
1
|
+
# Long running class that determines if, and in how much detail a potentially
# slow job should be recorded in

module ScoutApm
  class SlowJobPolicy
    CAPTURE_TYPES = [
      CAPTURE_DETAIL = "capture_detail",
      CAPTURE_NONE   = "capture_none",
    ]

    # Adjust speed points. See the function
    POINT_MULTIPLIER_SPEED = 0.25

    # For each minute we haven't seen an endpoint
    POINT_MULTIPLIER_AGE = 0.25

    # Outliers are worth up to "1000ms" of weight
    # NOTE(review): not referenced anywhere in this class; percentile_points
    # returns fixed weights. Confirm whether it was meant to scale them.
    POINT_MULTIPLIER_PERCENTILE = 1.0

    # A hash of Job Names to the last time we stored a slow trace for it.
    #
    # Defaults to a start time that is pretty close to application boot time.
    # So the "age" of an endpoint we've never seen is the time the application
    # has been running.
    attr_reader :last_seen

    def initialize
      zero_time = Time.now
      @last_seen = Hash.new { |h, k| h[k] = zero_time }
    end

    # Records that a trace for this request was just stored, which resets its
    # age for later scoring.
    #
    # Keyed by request.unique_name, the same key #score reads. Previously the
    # key came from unique_name_for, so the write and the read were not
    # guaranteed to agree.
    def stored!(request)
      last_seen[request.unique_name] = Time.now
    end

    # Determine if this job trace should be fully analyzed by scoring it
    # across several metrics, and then determining if that's good enough to
    # make it into this minute's payload.
    #
    # Due to the combining nature of the agent & layaway file, there's no
    # guarantee that a high scoring local champion will still be a winner when
    # they go up to "regionals" and are compared against the other processes
    # running on a node.
    #
    # Returns -1 for requests named :unknown, otherwise the sum of the speed,
    # percentile and age points.
    def score(request)
      unique_name = request.unique_name
      if unique_name == :unknown
        return -1 # A negative score, should never be good enough to store.
      end

      total_time = request.root_layer.total_call_time

      # How long has it been since we've seen this?
      age = Time.now - last_seen[unique_name]

      # What approximate percentile was this request?
      percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)

      return speed_points(total_time) + percentile_points(percentile) + age_points(age)
    end

    private

    # Derives a name from the request's scope layer, or :unknown when there is
    # none. No longer used by #stored!; retained for compatibility.
    def unique_name_for(request)
      scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
      if scope_layer
        scope_layer.legacy_metric_name
      else
        :unknown
      end
    end

    # Time in seconds
    # Logarithm keeps huge times from swamping the other metrics.
    # 1+ is necessary to keep the log function in positive territory.
    def speed_points(time)
      Math.log(1 + time) * POINT_MULTIPLIER_SPEED
    end

    # Maps an approximate percentile to a fixed weight. The thresholds suggest
    # a 0..100 scale (confirm against the histogram implementation).
    #
    # NaN compares false against every threshold, so it is handled explicitly
    # and earns no points rather than leaking NaN into the total score.
    def percentile_points(percentile)
      return 0.0 if percentile.respond_to?(:nan?) && percentile.nan?

      if percentile < 40
        0.4 # Don't put much emphasis on capturing low percentiles.
      elsif percentile < 60
        1.4 # Highest here to get mean traces
      elsif percentile < 90
        0.7 # Between 60 & 90% is fine.
      else
        1.4 # Highest here to get 90+%ile traces
      end
    end

    # age is in seconds; each full minute is worth POINT_MULTIPLIER_AGE points.
    def age_points(age)
      age / 60.0 * POINT_MULTIPLIER_AGE
    end
  end
end
|
@@ -15,7 +15,9 @@ module ScoutApm
|
|
15
15
|
|
16
16
|
attr_reader :metrics
|
17
17
|
|
18
|
-
|
18
|
+
attr_reader :score
|
19
|
+
|
20
|
+
def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, score)
|
19
21
|
@queue_name = queue_name
|
20
22
|
@job_name = job_name
|
21
23
|
@time = time
|
@@ -23,11 +25,28 @@ module ScoutApm
|
|
23
25
|
@exclusive_time = exclusive_time
|
24
26
|
@context = context
|
25
27
|
@metrics = metrics
|
28
|
+
@score = score
|
26
29
|
end
|
27
30
|
|
28
31
|
def metric_name
|
29
32
|
"Job/#{queue_name}/#{job_name}"
|
30
33
|
end
|
31
34
|
|
35
|
+
########################
|
36
|
+
# Scorable interface
|
37
|
+
#
|
38
|
+
# Needed so we can merge ScoredItemSet instances
|
39
|
+
def call
|
40
|
+
self
|
41
|
+
end
|
42
|
+
|
43
|
+
def name
|
44
|
+
metric_name
|
45
|
+
end
|
46
|
+
|
47
|
+
def score
|
48
|
+
@score
|
49
|
+
end
|
50
|
+
|
32
51
|
end
|
33
52
|
end
|
@@ -1,8 +1,5 @@
|
|
1
1
|
# Long running class that determines if, and in how much detail a potentially
|
2
2
|
# slow transaction should be recorded in
|
3
|
-
#
|
4
|
-
# Rules:
|
5
|
-
# - Runtime must be slower than a threshold
|
6
3
|
|
7
4
|
module ScoutApm
|
8
5
|
class SlowRequestPolicy
|
@@ -11,21 +8,92 @@ module ScoutApm
|
|
11
8
|
CAPTURE_NONE = "capture_none",
|
12
9
|
]
|
13
10
|
|
14
|
-
#
|
15
|
-
|
11
|
+
# Adjust speed points. See the function
|
12
|
+
POINT_MULTIPLIER_SPEED = 0.25
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
# For each minute we haven't seen an endpoint
|
15
|
+
POINT_MULTIPLIER_AGE = 0.25
|
16
|
+
|
17
|
+
# Outliers are worth up to "1000ms" of weight
|
18
|
+
POINT_MULTIPLIER_PERCENTILE = 1.0
|
19
|
+
|
20
|
+
# A hash of Endpoint Name to the last time we stored a slow transaction for it.
|
21
|
+
#
|
22
|
+
# Defaults to a start time that is pretty close to application boot time.
|
23
|
+
# So the "age" of an endpoint we've never seen is the time the application
|
24
|
+
# has been running.
|
25
|
+
attr_reader :last_seen
|
26
|
+
|
27
|
+
|
28
|
+
def initialize
|
29
|
+
zero_time = Time.now
|
30
|
+
@last_seen = Hash.new { |h, k| h[k] = zero_time }
|
31
|
+
end
|
32
|
+
|
33
|
+
# Records that a slow transaction for this request was just stored, which
# resets its age for later scoring.
#
# Keyed by request.unique_name, the same key #score uses when it reads
# last_seen. Previously the key came from unique_name_for, so the write and
# the read were not guaranteed to agree.
def stored!(request)
  last_seen[request.unique_name] = Time.now
end
|
36
|
+
|
37
|
+
# Determine if this request trace should be fully analyzed by scoring it
|
38
|
+
# across several metrics, and then determining if that's good enough to
|
39
|
+
# make it into this minute's payload.
|
40
|
+
#
|
41
|
+
# Due to the combining nature of the agent & layaway file, there's no
|
42
|
+
# guarantee that a high scoring local champion will still be a winner when
|
43
|
+
# they go up to "regionals" and are compared against the other processes
|
44
|
+
# running on a node.
|
45
|
+
def score(request)
|
46
|
+
unique_name = request.unique_name
|
47
|
+
if unique_name == :unknown
|
48
|
+
return -1 # A negative score, should never be good enough to store.
|
22
49
|
end
|
50
|
+
|
51
|
+
total_time = request.root_layer.total_call_time
|
52
|
+
|
53
|
+
# How long has it been since we've seen this?
|
54
|
+
age = Time.now - last_seen[unique_name]
|
55
|
+
|
56
|
+
# What approximate percentile was this request?
|
57
|
+
percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
|
58
|
+
|
59
|
+
return speed_points(total_time) + percentile_points(percentile) + age_points(age)
|
23
60
|
end
|
24
61
|
|
25
62
|
private
|
26
63
|
|
27
|
-
def
|
28
|
-
|
64
|
+
def unique_name_for(request)
|
65
|
+
scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
|
66
|
+
if scope_layer
|
67
|
+
scope_layer.legacy_metric_name
|
68
|
+
else
|
69
|
+
:unknown
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Time in seconds
|
74
|
+
# Logarithm keeps huge times from swamping the other metrics.
|
75
|
+
# 1+ is necessary to keep the log function in positive territory.
|
76
|
+
def speed_points(time)
|
77
|
+
Math.log(1 + time) * POINT_MULTIPLIER_SPEED
|
78
|
+
end
|
79
|
+
|
80
|
+
# Maps an approximate percentile to a fixed weight. The thresholds suggest a
# 0..100 scale (confirm against the histogram implementation).
#
# NaN compares false against every threshold, so it is handled explicitly and
# earns no points rather than leaking NaN into the total score.
def percentile_points(percentile)
  return 0.0 if percentile.respond_to?(:nan?) && percentile.nan?

  if percentile < 40
    0.4 # Don't put much emphasis on capturing low percentiles.
  elsif percentile < 60
    1.4 # Highest here to get mean traces
  elsif percentile < 90
    0.7 # Between 60 & 90% is fine.
  else
    1.4 # Highest here to get 90+%ile traces
  end
end
|
94
|
+
|
95
|
+
def age_points(age)
|
96
|
+
age / 60.0 * POINT_MULTIPLIER_AGE
|
29
97
|
end
|
30
98
|
end
|
31
99
|
end
|
@@ -12,7 +12,7 @@ module ScoutApm
|
|
12
12
|
attr_reader :prof
|
13
13
|
attr_reader :raw_prof
|
14
14
|
|
15
|
-
def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof)
|
15
|
+
def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof, score)
|
16
16
|
@uri = uri
|
17
17
|
@metric_name = metric_name
|
18
18
|
@total_call_time = total_call_time
|
@@ -21,6 +21,7 @@ module ScoutApm
|
|
21
21
|
@time = time
|
22
22
|
@prof = ScoutApm::StackprofTreeCollapser.new(raw_stackprof).call
|
23
23
|
@raw_prof = raw_stackprof # Send whole data up to server
|
24
|
+
@score = score
|
24
25
|
end
|
25
26
|
|
26
27
|
# Used to remove metrics when the payload will be too large.
|
@@ -34,12 +35,28 @@ module ScoutApm
|
|
34
35
|
end
|
35
36
|
|
36
37
|
def as_json
|
37
|
-
json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof]
|
38
|
+
json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof, :score]
|
38
39
|
ScoutApm::AttributeArranger.call(self, json_attributes)
|
39
40
|
end
|
40
41
|
|
41
42
|
def context_hash
|
42
43
|
context.to_hash
|
43
44
|
end
|
45
|
+
|
46
|
+
########################
|
47
|
+
# Scorable interface
|
48
|
+
#
|
49
|
+
# Needed so we can merge ScoredItemSet instances
|
50
|
+
def call
|
51
|
+
self
|
52
|
+
end
|
53
|
+
|
54
|
+
def name
|
55
|
+
metric_name
|
56
|
+
end
|
57
|
+
|
58
|
+
def score
|
59
|
+
@score
|
60
|
+
end
|
44
61
|
end
|
45
62
|
end
|