scout_apm 1.5.5 → 1.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.markdown +8 -0
- data/lib/scout_apm.rb +3 -0
- data/lib/scout_apm/agent.rb +23 -25
- data/lib/scout_apm/agent/reporting.rb +8 -3
- data/lib/scout_apm/attribute_arranger.rb +4 -0
- data/lib/scout_apm/bucket_name_splitter.rb +3 -3
- data/lib/scout_apm/config.rb +4 -2
- data/lib/scout_apm/histogram.rb +20 -0
- data/lib/scout_apm/instruments/percentile_sampler.rb +37 -0
- data/lib/scout_apm/instruments/process/process_cpu.rb +12 -0
- data/lib/scout_apm/instruments/process/process_memory.rb +12 -0
- data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
- data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
- data/lib/scout_apm/layer_converters/slow_request_converter.rb +28 -22
- data/lib/scout_apm/metric_meta.rb +5 -1
- data/lib/scout_apm/metric_set.rb +1 -1
- data/lib/scout_apm/reporter.rb +3 -1
- data/lib/scout_apm/request_histograms.rb +46 -0
- data/lib/scout_apm/scored_item_set.rb +79 -0
- data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
- data/lib/scout_apm/slow_job_policy.rb +89 -19
- data/lib/scout_apm/slow_job_record.rb +20 -1
- data/lib/scout_apm/slow_request_policy.rb +80 -12
- data/lib/scout_apm/slow_transaction.rb +19 -2
- data/lib/scout_apm/store.rb +45 -15
- data/lib/scout_apm/tracked_request.rb +33 -10
- data/lib/scout_apm/version.rb +1 -1
- data/test/test_helper.rb +4 -3
- data/test/unit/layaway_test.rb +5 -8
- data/test/unit/scored_item_set_test.rb +65 -0
- data/test/unit/serializers/payload_serializer_test.rb +2 -1
- data/test/unit/slow_item_set_test.rb +2 -1
- data/test/unit/slow_request_policy_test.rb +42 -0
- metadata +9 -2
data/lib/scout_apm/metric_meta.rb
CHANGED
@@ -17,7 +17,11 @@ class MetricMeta
|
|
17
17
|
|
18
18
|
# Unsure if type or bucket is a better name.
|
19
19
|
def type
|
20
|
-
|
20
|
+
bucket_type
|
21
|
+
end
|
22
|
+
|
23
|
+
def name
|
24
|
+
bucket_name
|
21
25
|
end
|
22
26
|
|
23
27
|
# A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
|
data/lib/scout_apm/metric_set.rb
CHANGED
@@ -2,7 +2,7 @@ module ScoutApm
|
|
2
2
|
class MetricSet
|
3
3
|
# We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
|
4
4
|
# TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
|
5
|
-
PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
|
5
|
+
PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction", "Percentile", "Job"]
|
6
6
|
|
7
7
|
attr_reader :metrics
|
8
8
|
|
data/lib/scout_apm/reporter.rb
CHANGED
@@ -17,8 +17,10 @@ module ScoutApm
|
|
17
17
|
|
18
18
|
# TODO: Parse & return a real response object, not the HTTP Response object
|
19
19
|
def report(payload, headers = {})
|
20
|
-
|
20
|
+
# Some posts (typically ones under development) bypass the ingestion pipeline and go directly to the webserver. They use direct_host instead of host
|
21
|
+
hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
|
21
22
|
|
23
|
+
Array(hosts).each do |host|
|
22
24
|
full_uri = uri(host)
|
23
25
|
response = post(full_uri, payload, headers)
|
24
26
|
unless response && response.is_a?(Net::HTTPSuccess)
|
data/lib/scout_apm/request_histograms.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
module ScoutApm
|
2
|
+
class RequestHistograms
|
3
|
+
DEFAULT_HISTOGRAM_SIZE = 50
|
4
|
+
|
5
|
+
# Private Accessor:
|
6
|
+
# A hash of Endpoint Name to an approximate histogram
|
7
|
+
#
|
8
|
+
# Each time a new request is requested to see if it's slow or not, we
|
9
|
+
# should insert it into the histogram, and get the approximate percentile
|
10
|
+
# of that time
|
11
|
+
attr_reader :histograms
|
12
|
+
private :histograms
|
13
|
+
|
14
|
+
attr_reader :histogram_size
|
15
|
+
|
16
|
+
def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
|
17
|
+
@histogram_size = histogram_size
|
18
|
+
initialize_histograms_hash
|
19
|
+
end
|
20
|
+
|
21
|
+
def each_name
|
22
|
+
@histograms.keys.each { |n| yield n }
|
23
|
+
end
|
24
|
+
|
25
|
+
def add(item, value)
|
26
|
+
@histograms[item].add(value)
|
27
|
+
end
|
28
|
+
|
29
|
+
def approximate_quantile_of_value(item, value)
|
30
|
+
@histograms[item].approximate_quantile_of_value(value)
|
31
|
+
end
|
32
|
+
|
33
|
+
def quantile(item, q)
|
34
|
+
@histograms[item].quantile(q)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Wipes all histograms, setting them back to empty
|
38
|
+
def reset_all!
|
39
|
+
initialize_histograms_hash
|
40
|
+
end
|
41
|
+
|
42
|
+
def initialize_histograms_hash
|
43
|
+
@histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/scout_apm/scored_item_set.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
# Attempts to keep the highest score.
|
2
|
+
#
|
3
|
+
# Each item must respond to:
|
4
|
+
# #call to get the storable item
|
5
|
+
# #name to get a unique identifier of the storable
|
6
|
+
# #score to get a numeric score, where higher is better
|
7
|
+
module ScoutApm
|
8
|
+
class ScoredItemSet
|
9
|
+
include Enumerable
|
10
|
+
|
11
|
+
# A number larger than any score we will actually get.
|
12
|
+
ARBITRARILY_LARGE = 100000000
|
13
|
+
|
14
|
+
# Without otherwise saying, default the size to this
|
15
|
+
DEFAULT_MAX_SIZE = 10
|
16
|
+
|
17
|
+
attr_reader :max_size
|
18
|
+
attr_reader :items
|
19
|
+
|
20
|
+
def initialize(max_size = DEFAULT_MAX_SIZE)
|
21
|
+
@items = {}
|
22
|
+
@max_size = max_size
|
23
|
+
end
|
24
|
+
|
25
|
+
def each
|
26
|
+
items.each do |(_, (_, item))|
|
27
|
+
yield item
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# This function is a large if statement, with a few branches. See inline comments for each branch.
|
32
|
+
def <<(new_item)
|
33
|
+
return if new_item.name == :unknown
|
34
|
+
|
35
|
+
# If we have this item in the hash already, compare the new & old ones, and store
|
36
|
+
# the new one only if it's higher score.
|
37
|
+
if items.has_key?(new_item.name)
|
38
|
+
if new_item.score > items[new_item.name].first
|
39
|
+
store!(new_item)
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
# If the set is full, then we have to see if we evict anything to store
|
44
|
+
# this one
|
45
|
+
elsif full?
|
46
|
+
smallest_name, smallest_score = items.inject([nil, ARBITRARILY_LARGE]) do |(memo_name, memo_score), (name, (stored_score, _))|
|
47
|
+
if stored_score < memo_score
|
48
|
+
[name, stored_score]
|
49
|
+
else
|
50
|
+
[memo_name, memo_score]
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
if smallest_score < new_item.score
|
55
|
+
items.delete(smallest_name)
|
56
|
+
store!(new_item)
|
57
|
+
end
|
58
|
+
|
59
|
+
|
60
|
+
# Set isn't full, and we've not seen this new_item, so go ahead and store it.
|
61
|
+
else
|
62
|
+
store!(new_item)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def full?
|
70
|
+
items.size >= max_size
|
71
|
+
end
|
72
|
+
|
73
|
+
def store!(new_item)
|
74
|
+
if !new_item.name.nil? # Never store a nil name.
|
75
|
+
items[new_item.name] = [new_item.score, new_item.call]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
data/lib/scout_apm/slow_job_policy.rb
CHANGED
@@ -1,29 +1,99 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
|
4
|
-
# Keeps track of a histogram of times for each worker class (spearately), and
|
5
|
-
# uses a percentile of normal to mark individual runs as "slow".
|
6
|
-
#
|
7
|
-
# This assumes that all worker calls will be requested once to `slow?`, so that
|
8
|
-
# the data can be stored
|
1
|
+
# Long running class that determines if, and in how much detail a potentially
|
2
|
+
# slow job should be recorded in
|
3
|
+
|
9
4
|
module ScoutApm
|
10
5
|
class SlowJobPolicy
|
11
|
-
|
6
|
+
CAPTURE_TYPES = [
|
7
|
+
CAPTURE_DETAIL = "capture_detail",
|
8
|
+
CAPTURE_NONE = "capture_none",
|
9
|
+
]
|
10
|
+
|
11
|
+
# Adjust speed points. See the function
|
12
|
+
POINT_MULTIPLIER_SPEED = 0.25
|
13
|
+
|
14
|
+
# For each minute we haven't seen an endpoint
|
15
|
+
POINT_MULTIPLIER_AGE = 0.25
|
16
|
+
|
17
|
+
# Outliers are worth up to "1000ms" of weight
|
18
|
+
POINT_MULTIPLIER_PERCENTILE = 1.0
|
19
|
+
|
20
|
+
# A hash of Job Names to the last time we stored a slow trace for it.
|
21
|
+
#
|
22
|
+
# Defaults to a start time that is pretty close to application boot time.
|
23
|
+
# So the "age" of an endpoint we've never seen is the time the application
|
24
|
+
# has been running.
|
25
|
+
attr_reader :last_seen
|
12
26
|
|
13
|
-
QUANTILE = 95
|
14
27
|
|
15
|
-
def initialize
|
16
|
-
|
28
|
+
def initialize
|
29
|
+
zero_time = Time.now
|
30
|
+
@last_seen = Hash.new { |h, k| h[k] = zero_time }
|
17
31
|
end
|
18
32
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
33
|
+
def stored!(request)
|
34
|
+
last_seen[unique_name_for(request)] = Time.now
|
35
|
+
end
|
36
|
+
|
37
|
+
# Determine if this job trace should be fully analyzed by scoring it
|
38
|
+
# across several metrics, and then determining if that's good enough to
|
39
|
+
# make it into this minute's payload.
|
40
|
+
#
|
41
|
+
# Due to the combining nature of the agent & layaway file, there's no
|
42
|
+
# guarantee that a high scoring local champion will still be a winner when
|
43
|
+
# they go up to "regionals" and are compared against the other processes
|
44
|
+
# running on a node.
|
45
|
+
def score(request)
|
46
|
+
unique_name = request.unique_name
|
47
|
+
if unique_name == :unknown
|
48
|
+
return -1 # A negative score, should never be good enough to store.
|
49
|
+
end
|
50
|
+
|
51
|
+
total_time = request.root_layer.total_call_time
|
52
|
+
|
53
|
+
# How long has it been since we've seen this?
|
54
|
+
age = Time.now - last_seen[unique_name]
|
55
|
+
|
56
|
+
# What approximate percentile was this request?
|
57
|
+
percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
|
58
|
+
|
59
|
+
return speed_points(total_time) + percentile_points(percentile) + age_points(age)
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
def unique_name_for(request)
|
65
|
+
scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
|
66
|
+
if scope_layer
|
67
|
+
scope_layer.legacy_metric_name
|
68
|
+
else
|
69
|
+
:unknown
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Time in seconds
|
74
|
+
# Logarithm keeps huge times from swamping the other metrics.
|
75
|
+
# 1+ is necessary to keep the log function in positive territory.
|
76
|
+
def speed_points(time)
|
77
|
+
Math.log(1 + time) * POINT_MULTIPLIER_SPEED
|
78
|
+
end
|
79
|
+
|
80
|
+
def percentile_points(percentile)
|
81
|
+
if percentile < 40
|
82
|
+
0.4 # Don't put much emphasis on capturing low percentiles.
|
83
|
+
elsif percentile < 60
|
84
|
+
1.4 # Highest here to get mean traces
|
85
|
+
elsif percentile < 90
|
86
|
+
0.7 # Between 60 & 90% is fine.
|
87
|
+
elsif percentile >= 90
|
88
|
+
1.4 # Highest here to get 90+%ile traces
|
89
|
+
else
|
90
|
+
# impossible.
|
91
|
+
percentile
|
92
|
+
end
|
93
|
+
end
|
25
94
|
|
26
|
-
|
95
|
+
def age_points(age)
|
96
|
+
age / 60.0 * POINT_MULTIPLIER_AGE
|
27
97
|
end
|
28
98
|
end
|
29
99
|
end
|
data/lib/scout_apm/slow_job_record.rb
CHANGED
@@ -15,7 +15,9 @@ module ScoutApm
|
|
15
15
|
|
16
16
|
attr_reader :metrics
|
17
17
|
|
18
|
-
|
18
|
+
attr_reader :score
|
19
|
+
|
20
|
+
def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, score)
|
19
21
|
@queue_name = queue_name
|
20
22
|
@job_name = job_name
|
21
23
|
@time = time
|
@@ -23,11 +25,28 @@ module ScoutApm
|
|
23
25
|
@exclusive_time = exclusive_time
|
24
26
|
@context = context
|
25
27
|
@metrics = metrics
|
28
|
+
@score = score
|
26
29
|
end
|
27
30
|
|
28
31
|
def metric_name
|
29
32
|
"Job/#{queue_name}/#{job_name}"
|
30
33
|
end
|
31
34
|
|
35
|
+
########################
|
36
|
+
# Scorable interface
|
37
|
+
#
|
38
|
+
# Needed so we can merge ScoredItemSet instances
|
39
|
+
def call
|
40
|
+
self
|
41
|
+
end
|
42
|
+
|
43
|
+
def name
|
44
|
+
metric_name
|
45
|
+
end
|
46
|
+
|
47
|
+
def score
|
48
|
+
@score
|
49
|
+
end
|
50
|
+
|
32
51
|
end
|
33
52
|
end
|
data/lib/scout_apm/slow_request_policy.rb
CHANGED
@@ -1,8 +1,5 @@
|
|
1
1
|
# Long running class that determines if, and in how much detail a potentially
|
2
2
|
# slow transaction should be recorded in
|
3
|
-
#
|
4
|
-
# Rules:
|
5
|
-
# - Runtime must be slower than a threshold
|
6
3
|
|
7
4
|
module ScoutApm
|
8
5
|
class SlowRequestPolicy
|
@@ -11,21 +8,92 @@ module ScoutApm
|
|
11
8
|
CAPTURE_NONE = "capture_none",
|
12
9
|
]
|
13
10
|
|
14
|
-
#
|
15
|
-
|
11
|
+
# Adjust speed points. See the function
|
12
|
+
POINT_MULTIPLIER_SPEED = 0.25
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
# For each minute we haven't seen an endpoint
|
15
|
+
POINT_MULTIPLIER_AGE = 0.25
|
16
|
+
|
17
|
+
# Outliers are worth up to "1000ms" of weight
|
18
|
+
POINT_MULTIPLIER_PERCENTILE = 1.0
|
19
|
+
|
20
|
+
# A hash of Endpoint Name to the last time we stored a slow transaction for it.
|
21
|
+
#
|
22
|
+
# Defaults to a start time that is pretty close to application boot time.
|
23
|
+
# So the "age" of an endpoint we've never seen is the time the application
|
24
|
+
# has been running.
|
25
|
+
attr_reader :last_seen
|
26
|
+
|
27
|
+
|
28
|
+
def initialize
|
29
|
+
zero_time = Time.now
|
30
|
+
@last_seen = Hash.new { |h, k| h[k] = zero_time }
|
31
|
+
end
|
32
|
+
|
33
|
+
def stored!(request)
|
34
|
+
last_seen[unique_name_for(request)] = Time.now
|
35
|
+
end
|
36
|
+
|
37
|
+
# Determine if this request trace should be fully analyzed by scoring it
|
38
|
+
# across several metrics, and then determining if that's good enough to
|
39
|
+
# make it into this minute's payload.
|
40
|
+
#
|
41
|
+
# Due to the combining nature of the agent & layaway file, there's no
|
42
|
+
# guarantee that a high scoring local champion will still be a winner when
|
43
|
+
# they go up to "regionals" and are compared against the other processes
|
44
|
+
# running on a node.
|
45
|
+
def score(request)
|
46
|
+
unique_name = request.unique_name
|
47
|
+
if unique_name == :unknown
|
48
|
+
return -1 # A negative score, should never be good enough to store.
|
22
49
|
end
|
50
|
+
|
51
|
+
total_time = request.root_layer.total_call_time
|
52
|
+
|
53
|
+
# How long has it been since we've seen this?
|
54
|
+
age = Time.now - last_seen[unique_name]
|
55
|
+
|
56
|
+
# What approximate percentile was this request?
|
57
|
+
percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
|
58
|
+
|
59
|
+
return speed_points(total_time) + percentile_points(percentile) + age_points(age)
|
23
60
|
end
|
24
61
|
|
25
62
|
private
|
26
63
|
|
27
|
-
def
|
28
|
-
|
64
|
+
def unique_name_for(request)
|
65
|
+
scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
|
66
|
+
if scope_layer
|
67
|
+
scope_layer.legacy_metric_name
|
68
|
+
else
|
69
|
+
:unknown
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Time in seconds
|
74
|
+
# Logarithm keeps huge times from swamping the other metrics.
|
75
|
+
# 1+ is necessary to keep the log function in positive territory.
|
76
|
+
def speed_points(time)
|
77
|
+
Math.log(1 + time) * POINT_MULTIPLIER_SPEED
|
78
|
+
end
|
79
|
+
|
80
|
+
def percentile_points(percentile)
|
81
|
+
if percentile < 40
|
82
|
+
0.4 # Don't put much emphasis on capturing low percentiles.
|
83
|
+
elsif percentile < 60
|
84
|
+
1.4 # Highest here to get mean traces
|
85
|
+
elsif percentile < 90
|
86
|
+
0.7 # Between 60 & 90% is fine.
|
87
|
+
elsif percentile >= 90
|
88
|
+
1.4 # Highest here to get 90+%ile traces
|
89
|
+
else
|
90
|
+
# impossible.
|
91
|
+
percentile
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def age_points(age)
|
96
|
+
age / 60.0 * POINT_MULTIPLIER_AGE
|
29
97
|
end
|
30
98
|
end
|
31
99
|
end
|
data/lib/scout_apm/slow_transaction.rb
CHANGED
@@ -12,7 +12,7 @@ module ScoutApm
|
|
12
12
|
attr_reader :prof
|
13
13
|
attr_reader :raw_prof
|
14
14
|
|
15
|
-
def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof)
|
15
|
+
def initialize(uri, metric_name, total_call_time, metrics, context, time, raw_stackprof, score)
|
16
16
|
@uri = uri
|
17
17
|
@metric_name = metric_name
|
18
18
|
@total_call_time = total_call_time
|
@@ -21,6 +21,7 @@ module ScoutApm
|
|
21
21
|
@time = time
|
22
22
|
@prof = ScoutApm::StackprofTreeCollapser.new(raw_stackprof).call
|
23
23
|
@raw_prof = raw_stackprof # Send whole data up to server
|
24
|
+
@score = score
|
24
25
|
end
|
25
26
|
|
26
27
|
# Used to remove metrics when the payload will be too large.
|
@@ -34,12 +35,28 @@ module ScoutApm
|
|
34
35
|
end
|
35
36
|
|
36
37
|
def as_json
|
37
|
-
json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof]
|
38
|
+
json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof, :score]
|
38
39
|
ScoutApm::AttributeArranger.call(self, json_attributes)
|
39
40
|
end
|
40
41
|
|
41
42
|
def context_hash
|
42
43
|
context.to_hash
|
43
44
|
end
|
45
|
+
|
46
|
+
########################
|
47
|
+
# Scorable interface
|
48
|
+
#
|
49
|
+
# Needed so we can merge ScoredItemSet instances
|
50
|
+
def call
|
51
|
+
self
|
52
|
+
end
|
53
|
+
|
54
|
+
def name
|
55
|
+
metric_name
|
56
|
+
end
|
57
|
+
|
58
|
+
def score
|
59
|
+
@score
|
60
|
+
end
|
44
61
|
end
|
45
62
|
end
|