scout_apm 1.6.8 → 2.0.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +8 -1
- data/CHANGELOG.markdown +7 -57
- data/ext/allocations/allocations.c +84 -0
- data/ext/allocations/extconf.rb +3 -0
- data/lib/scout_apm/agent/reporting.rb +9 -32
- data/lib/scout_apm/agent.rb +45 -31
- data/lib/scout_apm/app_server_load.rb +1 -2
- data/lib/scout_apm/attribute_arranger.rb +0 -4
- data/lib/scout_apm/background_worker.rb +6 -9
- data/lib/scout_apm/bucket_name_splitter.rb +3 -3
- data/lib/scout_apm/call_set.rb +1 -0
- data/lib/scout_apm/config.rb +110 -66
- data/lib/scout_apm/environment.rb +16 -10
- data/lib/scout_apm/framework_integrations/rails_2.rb +12 -14
- data/lib/scout_apm/framework_integrations/rails_3_or_4.rb +5 -17
- data/lib/scout_apm/framework_integrations/ruby.rb +0 -4
- data/lib/scout_apm/framework_integrations/sinatra.rb +0 -4
- data/lib/scout_apm/histogram.rb +0 -20
- data/lib/scout_apm/instruments/action_controller_rails_3_rails4.rb +1 -4
- data/lib/scout_apm/instruments/active_record.rb +149 -8
- data/lib/scout_apm/instruments/mongoid.rb +5 -78
- data/lib/scout_apm/instruments/process/process_cpu.rb +0 -12
- data/lib/scout_apm/instruments/process/process_memory.rb +14 -43
- data/lib/scout_apm/layaway.rb +34 -134
- data/lib/scout_apm/layaway_file.rb +50 -27
- data/lib/scout_apm/layer.rb +45 -1
- data/lib/scout_apm/layer_converters/allocation_metric_converter.rb +17 -0
- data/lib/scout_apm/layer_converters/converter_base.rb +4 -6
- data/lib/scout_apm/layer_converters/job_converter.rb +1 -0
- data/lib/scout_apm/layer_converters/metric_converter.rb +2 -1
- data/lib/scout_apm/layer_converters/slow_job_converter.rb +42 -21
- data/lib/scout_apm/layer_converters/slow_request_converter.rb +58 -37
- data/lib/scout_apm/metric_meta.rb +1 -5
- data/lib/scout_apm/metric_set.rb +6 -15
- data/lib/scout_apm/reporter.rb +4 -6
- data/lib/scout_apm/serializers/metrics_to_json_serializer.rb +5 -1
- data/lib/scout_apm/serializers/payload_serializer_to_json.rb +1 -3
- data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +5 -3
- data/lib/scout_apm/slow_job_policy.rb +19 -89
- data/lib/scout_apm/slow_job_record.rb +12 -20
- data/lib/scout_apm/slow_request_policy.rb +12 -80
- data/lib/scout_apm/slow_transaction.rb +16 -20
- data/lib/scout_apm/stackprof_tree_collapser.rb +103 -0
- data/lib/scout_apm/store.rb +16 -78
- data/lib/scout_apm/tracked_request.rb +53 -36
- data/lib/scout_apm/utils/active_record_metric_name.rb +2 -0
- data/lib/scout_apm/utils/fake_stack_prof.rb +40 -0
- data/lib/scout_apm/utils/klass_helper.rb +26 -0
- data/lib/scout_apm/utils/sql_sanitizer.rb +1 -1
- data/lib/scout_apm/utils/sql_sanitizer_regex.rb +2 -2
- data/lib/scout_apm/utils/sql_sanitizer_regex_1_8_7.rb +2 -2
- data/lib/scout_apm/version.rb +1 -1
- data/lib/scout_apm.rb +13 -7
- data/scout_apm.gemspec +3 -1
- data/test/test_helper.rb +3 -4
- data/test/unit/layaway_test.rb +8 -5
- data/test/unit/serializers/payload_serializer_test.rb +2 -2
- data/test/unit/slow_item_set_test.rb +1 -2
- data/test/unit/sql_sanitizer_test.rb +0 -6
- metadata +28 -20
- data/LICENSE.md +0 -27
- data/lib/scout_apm/instruments/grape.rb +0 -69
- data/lib/scout_apm/instruments/percentile_sampler.rb +0 -37
- data/lib/scout_apm/request_histograms.rb +0 -46
- data/lib/scout_apm/scored_item_set.rb +0 -79
- data/test/unit/metric_set_test.rb +0 -101
- data/test/unit/scored_item_set_test.rb +0 -65
- data/test/unit/slow_request_policy_test.rb +0 -42
@@ -4,29 +4,23 @@ module ScoutApm
|
|
4
4
|
def initialize(*)
|
5
5
|
@backtraces = []
|
6
6
|
super
|
7
|
-
|
8
|
-
# After call to super, so @request is populated
|
9
|
-
@points = if request.job?
|
10
|
-
ScoutApm::Agent.instance.slow_job_policy.score(request)
|
11
|
-
else
|
12
|
-
-1
|
13
|
-
end
|
14
7
|
end
|
15
8
|
|
16
|
-
def
|
17
|
-
request.
|
18
|
-
end
|
9
|
+
def call
|
10
|
+
return unless request.job?
|
19
11
|
|
20
|
-
|
21
|
-
@points
|
22
|
-
end
|
12
|
+
job_name = [queue_layer.name, job_layer.name]
|
23
13
|
|
24
|
-
|
25
|
-
return
|
26
|
-
|
27
|
-
|
14
|
+
slow_enough = ScoutApm::Agent.instance.slow_job_policy.slow?(job_name, root_layer.total_call_time)
|
15
|
+
return unless slow_enough
|
16
|
+
|
17
|
+
# record the change in memory usage
|
18
|
+
mem_delta = ScoutApm::Instruments::Process::ProcessMemory.rss_to_mb(request.capture_mem_delta!)
|
28
19
|
|
29
|
-
|
20
|
+
timing_metrics, allocation_metrics = create_metrics
|
21
|
+
unless ScoutApm::Instruments::Allocations::ENABLED
|
22
|
+
allocation_metrics = {}
|
23
|
+
end
|
30
24
|
|
31
25
|
SlowJobRecord.new(
|
32
26
|
queue_layer.name,
|
@@ -35,8 +29,11 @@ module ScoutApm
|
|
35
29
|
job_layer.total_call_time,
|
36
30
|
job_layer.total_exclusive_time,
|
37
31
|
request.context,
|
38
|
-
|
39
|
-
|
32
|
+
timing_metrics,
|
33
|
+
allocation_metrics,
|
34
|
+
mem_delta,
|
35
|
+
job_layer.total_allocations
|
36
|
+
)
|
40
37
|
end
|
41
38
|
|
42
39
|
def queue_layer
|
@@ -47,8 +44,15 @@ module ScoutApm
|
|
47
44
|
@job_layer ||= find_first_layer_of_type("Job")
|
48
45
|
end
|
49
46
|
|
47
|
+
def find_first_layer_of_type(layer_type)
|
48
|
+
walker.walk do |layer|
|
49
|
+
return layer if layer.type == layer_type
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
50
53
|
def create_metrics
|
51
54
|
metric_hash = Hash.new
|
55
|
+
allocation_metric_hash = Hash.new
|
52
56
|
|
53
57
|
# Keep a list of subscopes, but only ever use the front one. The rest
|
54
58
|
# get pushed/popped in cases when we have many levels of subscopable
|
@@ -68,6 +72,14 @@ module ScoutApm
|
|
68
72
|
end
|
69
73
|
|
70
74
|
walker.walk do |layer|
|
75
|
+
# Sometimes we start capturing a layer without knowing if we really
|
76
|
+
# want to make an entry for it. See ActiveRecord instrumentation for
|
77
|
+
# an example. We start capturing before we know if a query is cached
|
78
|
+
# or not, and want to skip any cached queries.
|
79
|
+
next if layer.annotations[:ignorable]
|
80
|
+
|
81
|
+
# The queue_layer is useful to capture for other reasons, but doesn't
|
82
|
+
# create a MetricMeta/Stat of its own
|
71
83
|
next if layer == queue_layer
|
72
84
|
|
73
85
|
meta_options = if subscope_layers.first && layer != subscope_layers.first # Don't scope under ourself.
|
@@ -82,6 +94,8 @@ module ScoutApm
|
|
82
94
|
# Specific Metric
|
83
95
|
meta_options.merge!(:desc => layer.desc.to_s) if layer.desc
|
84
96
|
meta = MetricMeta.new(layer.legacy_metric_name, meta_options)
|
97
|
+
meta.extra.merge!(layer.annotations)
|
98
|
+
|
85
99
|
if layer.backtrace
|
86
100
|
bt = ScoutApm::Utils::BacktraceParser.new(layer.backtrace).call
|
87
101
|
if bt.any? # we could walk thru the call stack and not find in-app code
|
@@ -95,19 +109,26 @@ module ScoutApm
|
|
95
109
|
end
|
96
110
|
|
97
111
|
metric_hash[meta] ||= MetricStats.new( meta_options.has_key?(:scope) )
|
112
|
+
allocation_metric_hash[meta] ||= MetricStats.new( meta_options.has_key?(:scope) )
|
98
113
|
stat = metric_hash[meta]
|
99
114
|
stat.update!(layer.total_call_time, layer.total_exclusive_time)
|
115
|
+
stat = allocation_metric_hash[meta]
|
116
|
+
stat.update!(layer.total_allocations, layer.total_exclusive_allocations)
|
100
117
|
|
101
118
|
# Merged Metric (no specifics, just sum up by type)
|
102
119
|
meta = MetricMeta.new("#{layer.type}/all")
|
103
120
|
metric_hash[meta] ||= MetricStats.new(false)
|
121
|
+
allocation_metric_hash[meta] ||= MetricStats.new(false)
|
104
122
|
stat = metric_hash[meta]
|
105
123
|
stat.update!(layer.total_call_time, layer.total_exclusive_time)
|
124
|
+
stat = allocation_metric_hash[meta]
|
125
|
+
stat.update!(layer.total_allocations, layer.total_exclusive_allocations)
|
106
126
|
end
|
107
127
|
|
108
128
|
metric_hash = attach_backtraces(metric_hash)
|
129
|
+
allocation_metric_hash = attach_backtraces(allocation_metric_hash)
|
109
130
|
|
110
|
-
metric_hash
|
131
|
+
[metric_hash,allocation_metric_hash]
|
111
132
|
end
|
112
133
|
|
113
134
|
def attach_backtraces(metric_hash)
|
@@ -4,50 +4,49 @@ module ScoutApm
|
|
4
4
|
def initialize(*)
|
5
5
|
@backtraces = [] # An Array of MetricMetas that have a backtrace
|
6
6
|
super
|
7
|
-
|
8
|
-
# After call to super, so @request is populated
|
9
|
-
@points = if request.web?
|
10
|
-
ScoutApm::Agent.instance.slow_request_policy.score(request)
|
11
|
-
else
|
12
|
-
-1
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
def name
|
17
|
-
request.unique_name
|
18
7
|
end
|
19
8
|
|
20
|
-
def score
|
21
|
-
@points
|
22
|
-
end
|
23
|
-
|
24
|
-
# Unconditionally attempts to convert this into a SlowTransaction object.
|
25
|
-
# Can return nil if the request didn't have any scope_layer.
|
26
9
|
def call
|
27
10
|
scope = scope_layer
|
28
|
-
return nil unless scope
|
11
|
+
return [nil, {}] unless scope
|
12
|
+
|
13
|
+
policy = ScoutApm::Agent.instance.slow_request_policy.capture_type(root_layer.total_call_time)
|
14
|
+
if policy == ScoutApm::SlowRequestPolicy::CAPTURE_NONE
|
15
|
+
return [nil, {}]
|
16
|
+
end
|
29
17
|
|
30
|
-
|
18
|
+
# record the change in memory usage
|
19
|
+
mem_delta = ScoutApm::Instruments::Process::ProcessMemory.rss_to_mb(@request.capture_mem_delta!)
|
20
|
+
|
21
|
+
# increment the slow transaction count if this is a slow transaction.
|
22
|
+
meta = MetricMeta.new("SlowTransaction/#{scope.legacy_metric_name}")
|
23
|
+
stat = MetricStats.new
|
24
|
+
stat.update!(1)
|
31
25
|
|
32
26
|
uri = request.annotations[:uri] || ""
|
33
27
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
return nil
|
38
|
-
end
|
28
|
+
timing_metrics, allocation_metrics = create_metrics
|
29
|
+
unless ScoutApm::Instruments::Allocations::ENABLED
|
30
|
+
allocation_metrics = {}
|
39
31
|
end
|
40
32
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
33
|
+
# Disable stackprof output for now
|
34
|
+
stackprof = [] # request.stackprof
|
35
|
+
|
36
|
+
[
|
37
|
+
SlowTransaction.new(uri,
|
38
|
+
scope.legacy_metric_name,
|
39
|
+
root_layer.total_call_time,
|
40
|
+
timing_metrics,
|
41
|
+
allocation_metrics,
|
42
|
+
request.context,
|
43
|
+
root_layer.stop_time,
|
44
|
+
stackprof,
|
45
|
+
mem_delta,
|
46
|
+
root_layer.total_allocations
|
47
|
+
),
|
48
|
+
{ meta => stat }
|
49
|
+
]
|
51
50
|
end
|
52
51
|
|
53
52
|
# Iterates over the TrackedRequest's MetricMetas that have backtraces and attaches each to correct MetricMeta in the Metric Hash.
|
@@ -58,12 +57,14 @@ module ScoutApm
|
|
58
57
|
metric_hash
|
59
58
|
end
|
60
59
|
|
61
|
-
# Full metrics from this request. These get
|
62
|
-
# overview metrics, or stored permanently in a SlowTransaction
|
60
|
+
# Full metrics from this request. These get stored permanently in a SlowTransaction.
|
63
61
|
# Some merging of metrics will happen here, so if a request calls the same
|
64
62
|
# ActiveRecord or View repeatedly, it'll get merged.
|
63
|
+
#
|
64
|
+
# This returns a 2-element array of Metric Hashes (the first element is timing metrics, the second element is allocation metrics)
|
65
65
|
def create_metrics
|
66
66
|
metric_hash = Hash.new
|
67
|
+
allocation_metric_hash = Hash.new
|
67
68
|
|
68
69
|
# Keep a list of subscopes, but only ever use the front one. The rest
|
69
70
|
# get pushed/popped in cases when we have many levels of subscopable
|
@@ -83,6 +84,14 @@ module ScoutApm
|
|
83
84
|
end
|
84
85
|
|
85
86
|
walker.walk do |layer|
|
87
|
+
# Sometimes we start capturing a layer without knowing if we really
|
88
|
+
# want to make an entry for it. See ActiveRecord instrumentation for
|
89
|
+
# an example. We start capturing before we know if a query is cached
|
90
|
+
# or not, and want to skip any cached queries.
|
91
|
+
if layer.annotations[:ignorable]
|
92
|
+
next
|
93
|
+
end
|
94
|
+
|
86
95
|
meta_options = if subscope_layers.first && layer != subscope_layers.first # Don't scope under ourself.
|
87
96
|
subscope_name = subscope_layers.first.legacy_metric_name
|
88
97
|
{:scope => subscope_name}
|
@@ -95,6 +104,7 @@ module ScoutApm
|
|
95
104
|
# Specific Metric
|
96
105
|
meta_options.merge!(:desc => layer.desc.to_s) if layer.desc
|
97
106
|
meta = MetricMeta.new(layer.legacy_metric_name, meta_options)
|
107
|
+
meta.extra.merge!(layer.annotations)
|
98
108
|
if layer.backtrace
|
99
109
|
bt = ScoutApm::Utils::BacktraceParser.new(layer.backtrace).call
|
100
110
|
if bt.any? # we could walk thru the call stack and not find in-app code
|
@@ -109,19 +119,30 @@ module ScoutApm
|
|
109
119
|
end
|
110
120
|
end
|
111
121
|
metric_hash[meta] ||= MetricStats.new( meta_options.has_key?(:scope) )
|
122
|
+
allocation_metric_hash[meta] ||= MetricStats.new( meta_options.has_key?(:scope) )
|
123
|
+
# timing
|
112
124
|
stat = metric_hash[meta]
|
113
125
|
stat.update!(layer.total_call_time, layer.total_exclusive_time)
|
126
|
+
# allocations
|
127
|
+
stat = allocation_metric_hash[meta]
|
128
|
+
stat.update!(layer.total_allocations, layer.total_exclusive_allocations)
|
114
129
|
|
115
130
|
# Merged Metric (no specifics, just sum up by type)
|
116
131
|
meta = MetricMeta.new("#{layer.type}/all")
|
117
132
|
metric_hash[meta] ||= MetricStats.new(false)
|
133
|
+
allocation_metric_hash[meta] ||= MetricStats.new(false)
|
134
|
+
# timing
|
118
135
|
stat = metric_hash[meta]
|
119
136
|
stat.update!(layer.total_call_time, layer.total_exclusive_time)
|
137
|
+
# allocations
|
138
|
+
stat = allocation_metric_hash[meta]
|
139
|
+
stat.update!(layer.total_allocations, layer.total_exclusive_allocations)
|
120
140
|
end
|
121
141
|
|
122
142
|
metric_hash = attach_backtraces(metric_hash)
|
143
|
+
allocation_metric_hash = attach_backtraces(allocation_metric_hash)
|
123
144
|
|
124
|
-
metric_hash
|
145
|
+
[metric_hash,allocation_metric_hash]
|
125
146
|
end
|
126
147
|
end
|
127
148
|
end
|
@@ -17,11 +17,7 @@ class MetricMeta
|
|
17
17
|
|
18
18
|
# Unsure if type or bucket is a better name.
|
19
19
|
def type
|
20
|
-
|
21
|
-
end
|
22
|
-
|
23
|
-
def name
|
24
|
-
bucket_name
|
20
|
+
bucket
|
25
21
|
end
|
26
22
|
|
27
23
|
# A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
|
data/lib/scout_apm/metric_set.rb
CHANGED
@@ -2,7 +2,7 @@ module ScoutApm
|
|
2
2
|
class MetricSet
|
3
3
|
# We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
|
4
4
|
# TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
|
5
|
-
PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"
|
5
|
+
PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
|
6
6
|
|
7
7
|
attr_reader :metrics
|
8
8
|
|
@@ -23,15 +23,11 @@ module ScoutApm
|
|
23
23
|
@metrics[meta].combine!(stat)
|
24
24
|
|
25
25
|
elsif meta.type == "Errors" # Sadly special cased, we want both raw and aggregate values
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
agg_meta = MetricMeta.new("Errors/Request", :scope => meta.scope)
|
32
|
-
@metrics[agg_meta] ||= MetricStats.new
|
33
|
-
@metrics[agg_meta].combine!(stat)
|
34
|
-
end
|
26
|
+
@metrics[meta] ||= MetricStats.new
|
27
|
+
@metrics[meta].combine!(stat)
|
28
|
+
agg_meta = MetricMeta.new("Errors/Request", :scope => meta.scope)
|
29
|
+
@metrics[agg_meta] ||= MetricStats.new
|
30
|
+
@metrics[agg_meta].combine!(stat)
|
35
31
|
|
36
32
|
else # Combine down to a single /all key
|
37
33
|
agg_meta = MetricMeta.new("#{meta.type}/all", :scope => meta.scope)
|
@@ -40,13 +36,8 @@ module ScoutApm
|
|
40
36
|
end
|
41
37
|
end
|
42
38
|
|
43
|
-
# Sets a combine_in_progress flag to prevent double-counting Error metrics.
|
44
|
-
# Without it, the Errors/Request number would be increasingly off as
|
45
|
-
# metric_sets get merged in.
|
46
39
|
def combine!(other)
|
47
|
-
@combine_in_progress = true
|
48
40
|
absorb_all(other.metrics)
|
49
|
-
@combine_in_progress = false
|
50
41
|
self
|
51
42
|
end
|
52
43
|
end
|
data/lib/scout_apm/reporter.rb
CHANGED
@@ -17,10 +17,8 @@ module ScoutApm
|
|
17
17
|
|
18
18
|
# TODO: Parse & return a real response object, not the HTTP Response object
|
19
19
|
def report(payload, headers = {})
|
20
|
-
|
21
|
-
hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
|
20
|
+
Array(config.value('host')).each do |host|
|
22
21
|
|
23
|
-
Array(hosts).each do |host|
|
24
22
|
full_uri = uri(host)
|
25
23
|
response = post(full_uri, payload, headers)
|
26
24
|
unless response && response.is_a?(Net::HTTPSuccess)
|
@@ -36,7 +34,7 @@ module ScoutApm
|
|
36
34
|
when :app_server_load
|
37
35
|
URI.parse("#{host}/apps/app_server_load.scout?key=#{config.value('key')}&name=#{CGI.escape(Environment.instance.application_name)}")
|
38
36
|
when :deploy_hook
|
39
|
-
URI.parse("
|
37
|
+
URI.parse("https://apm.scoutapp.com/apps/deploy.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}")
|
40
38
|
end.tap{|u| logger.debug("Posting to #{u.to_s}")}
|
41
39
|
end
|
42
40
|
|
@@ -58,7 +56,7 @@ module ScoutApm
|
|
58
56
|
private
|
59
57
|
|
60
58
|
def post(uri, body, headers = Hash.new)
|
61
|
-
response =
|
59
|
+
response = :connection_failed
|
62
60
|
request(uri) do |connection|
|
63
61
|
post = Net::HTTP::Post.new( uri.path +
|
64
62
|
(uri.query ? ('?' + uri.query) : ''),
|
@@ -84,7 +82,7 @@ module ScoutApm
|
|
84
82
|
logger.debug "/#{type} FAILED: #{response.inspect}"
|
85
83
|
end
|
86
84
|
rescue Exception
|
87
|
-
logger.
|
85
|
+
logger.info "Exception sending request to server: \n#{$!.message}\n\t#{$!.backtrace.join("\n\t")}"
|
88
86
|
ensure
|
89
87
|
response
|
90
88
|
end
|
@@ -9,7 +9,11 @@ module ScoutApm
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def as_json
|
12
|
-
metrics
|
12
|
+
if metrics
|
13
|
+
metrics.map{|meta, stat| metric_as_json(meta, stat) }
|
14
|
+
else
|
15
|
+
nil
|
16
|
+
end
|
13
17
|
end
|
14
18
|
|
15
19
|
# Children metrics is a hash of meta=>stat pairs. Leave empty for no children.
|
@@ -22,7 +22,7 @@ module ScoutApm
|
|
22
22
|
|
23
23
|
def rearrange_the_slow_transactions(slow_transactions)
|
24
24
|
slow_transactions.to_a.map do |slow_t|
|
25
|
-
slow_t.as_json.merge(:metrics => rearrange_the_metrics(slow_t.metrics))
|
25
|
+
slow_t.as_json.merge(:metrics => rearrange_the_metrics(slow_t.metrics), :allocation_metrics => rearrange_the_metrics(slow_t.allocation_metrics))
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
@@ -59,8 +59,6 @@ module ScoutApm
|
|
59
59
|
"[#{all_the_elements.join(",")}]"
|
60
60
|
when Numeric
|
61
61
|
formatee
|
62
|
-
when Time
|
63
|
-
%Q["#{formatee.iso8601}"]
|
64
62
|
when nil
|
65
63
|
"null"
|
66
64
|
else # strings and everything
|
@@ -17,11 +17,13 @@ module ScoutApm
|
|
17
17
|
"time" => job.time,
|
18
18
|
"total_time" => job.total_time,
|
19
19
|
"exclusive_time" => job.exclusive_time,
|
20
|
-
|
20
|
+
"mem_delta" => job.mem_delta,
|
21
|
+
"allocations" => job.allocations,
|
22
|
+
"seconds_since_startup" => job.seconds_since_startup,
|
23
|
+
"hostname" => job.hostname,
|
21
24
|
"metrics" => MetricsToJsonSerializer.new(job.metrics).as_json, # New style of metrics
|
25
|
+
"allocation_metrics" => MetricsToJsonSerializer.new(job.allocation_metrics).as_json, # New style of metrics
|
22
26
|
"context" => job.context.to_hash,
|
23
|
-
|
24
|
-
"score" => job.score,
|
25
27
|
}
|
26
28
|
end
|
27
29
|
end
|
@@ -1,99 +1,29 @@
|
|
1
|
-
#
|
2
|
-
# slow
|
3
|
-
|
1
|
+
# Create one of these at startup time, and ask it if a certain worker's
|
2
|
+
# processing time is slow enough for us to collect a slow trace.
|
3
|
+
#
|
4
|
+
# Keeps track of a histogram of times for each worker class (separately), and
|
5
|
+
# uses a percentile of normal to mark individual runs as "slow".
|
6
|
+
#
|
7
|
+
# This assumes that every worker run is passed to `slow?` exactly once, so that
|
8
|
+
# the data can be stored
|
4
9
|
module ScoutApm
|
5
10
|
class SlowJobPolicy
|
6
|
-
|
7
|
-
CAPTURE_DETAIL = "capture_detail",
|
8
|
-
CAPTURE_NONE = "capture_none",
|
9
|
-
]
|
10
|
-
|
11
|
-
# Adjust speed points. See the function
|
12
|
-
POINT_MULTIPLIER_SPEED = 0.25
|
13
|
-
|
14
|
-
# For each minute we haven't seen an endpoint
|
15
|
-
POINT_MULTIPLIER_AGE = 0.25
|
16
|
-
|
17
|
-
# Outliers are worth up to "1000ms" of weight
|
18
|
-
POINT_MULTIPLIER_PERCENTILE = 1.0
|
19
|
-
|
20
|
-
# A hash of Job Names to the last time we stored a slow trace for it.
|
21
|
-
#
|
22
|
-
# Defaults to a start time that is pretty close to application boot time.
|
23
|
-
# So the "age" of an endpoint we've never seen is the time the application
|
24
|
-
# has been running.
|
25
|
-
attr_reader :last_seen
|
11
|
+
DEFAULT_HISTOGRAM_SIZE = 50
|
26
12
|
|
13
|
+
QUANTILE = 95
|
27
14
|
|
28
|
-
def initialize
|
29
|
-
|
30
|
-
@last_seen = Hash.new { |h, k| h[k] = zero_time }
|
15
|
+
def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
|
16
|
+
@histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
|
31
17
|
end
|
32
18
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
# make it into this minute's payload.
|
40
|
-
#
|
41
|
-
# Due to the combining nature of the agent & layaway file, there's no
|
42
|
-
# guarantee that a high scoring local champion will still be a winner when
|
43
|
-
# they go up to "regionals" and are compared against the other processes
|
44
|
-
# running on a node.
|
45
|
-
def score(request)
|
46
|
-
unique_name = request.unique_name
|
47
|
-
if unique_name == :unknown
|
48
|
-
return -1 # A negative score, should never be good enough to store.
|
49
|
-
end
|
50
|
-
|
51
|
-
total_time = request.root_layer.total_call_time
|
52
|
-
|
53
|
-
# How long has it been since we've seen this?
|
54
|
-
age = Time.now - last_seen[unique_name]
|
55
|
-
|
56
|
-
# What approximate percentile was this request?
|
57
|
-
percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
|
58
|
-
|
59
|
-
return speed_points(total_time) + percentile_points(percentile) + age_points(age)
|
60
|
-
end
|
61
|
-
|
62
|
-
private
|
63
|
-
|
64
|
-
def unique_name_for(request)
|
65
|
-
scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
|
66
|
-
if scope_layer
|
67
|
-
scope_layer.legacy_metric_name
|
68
|
-
else
|
69
|
-
:unknown
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
# Time in seconds
|
74
|
-
# Logarithm keeps huge times from swamping the other metrics.
|
75
|
-
# 1+ is necessary to keep the log function in positive territory.
|
76
|
-
def speed_points(time)
|
77
|
-
Math.log(1 + time) * POINT_MULTIPLIER_SPEED
|
78
|
-
end
|
79
|
-
|
80
|
-
def percentile_points(percentile)
|
81
|
-
if percentile < 40
|
82
|
-
0.4 # Don't put much emphasis on capturing low percentiles.
|
83
|
-
elsif percentile < 60
|
84
|
-
1.4 # Highest here to get mean traces
|
85
|
-
elsif percentile < 90
|
86
|
-
0.7 # Between 60 & 90% is fine.
|
87
|
-
elsif percentile >= 90
|
88
|
-
1.4 # Highest here to get 90+%ile traces
|
89
|
-
else
|
90
|
-
# impossible.
|
91
|
-
percentile
|
92
|
-
end
|
93
|
-
end
|
19
|
+
# worker: just the worker class name. "PasswordResetJob" or similar
|
20
|
+
# total_time: runtime of the job in seconds
|
21
|
+
# returns true if this request should be stored in higher trace detail, false otherwise
|
22
|
+
def slow?(worker, total_time)
|
23
|
+
@histograms[worker].add(total_time)
|
24
|
+
return false if @histograms[worker].total == 1 # First call is never slow
|
94
25
|
|
95
|
-
|
96
|
-
age / 60.0 * POINT_MULTIPLIER_AGE
|
26
|
+
total_time >= @histograms[worker].quantile(QUANTILE)
|
97
27
|
end
|
98
28
|
end
|
99
29
|
end
|
@@ -14,10 +14,13 @@ module ScoutApm
|
|
14
14
|
alias_method :total_call_time, :total_time
|
15
15
|
|
16
16
|
attr_reader :metrics
|
17
|
+
attr_reader :allocation_metrics
|
18
|
+
attr_reader :mem_delta
|
19
|
+
attr_reader :allocations
|
20
|
+
attr_reader :hostname
|
21
|
+
attr_reader :seconds_since_startup
|
17
22
|
|
18
|
-
|
19
|
-
|
20
|
-
def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, score)
|
23
|
+
def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, allocation_metrics, mem_delta, allocations)
|
21
24
|
@queue_name = queue_name
|
22
25
|
@job_name = job_name
|
23
26
|
@time = time
|
@@ -25,28 +28,17 @@ module ScoutApm
|
|
25
28
|
@exclusive_time = exclusive_time
|
26
29
|
@context = context
|
27
30
|
@metrics = metrics
|
28
|
-
@
|
31
|
+
@allocation_metrics = allocation_metrics
|
32
|
+
@mem_delta = mem_delta
|
33
|
+
@allocations = allocations
|
34
|
+
@seconds_since_startup = (Time.now - ScoutApm::Agent.instance.process_start_time)
|
35
|
+
@hostname = ScoutApm::Environment.instance.hostname
|
36
|
+
ScoutApm::Agent.instance.logger.debug { "Slow Job [#{metric_name}] - Call Time: #{total_call_time} Mem Delta: #{mem_delta}"}
|
29
37
|
end
|
30
38
|
|
31
39
|
def metric_name
|
32
40
|
"Job/#{queue_name}/#{job_name}"
|
33
41
|
end
|
34
42
|
|
35
|
-
########################
|
36
|
-
# Scorable interface
|
37
|
-
#
|
38
|
-
# Needed so we can merge ScoredItemSet instances
|
39
|
-
def call
|
40
|
-
self
|
41
|
-
end
|
42
|
-
|
43
|
-
def name
|
44
|
-
metric_name
|
45
|
-
end
|
46
|
-
|
47
|
-
def score
|
48
|
-
@score
|
49
|
-
end
|
50
|
-
|
51
43
|
end
|
52
44
|
end
|