scout_apm 2.0.0.pre → 2.0.0.pre2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.markdown +22 -5
- data/Rakefile +5 -0
- data/lib/scout_apm.rb +4 -0
- data/lib/scout_apm/agent.rb +22 -8
- data/lib/scout_apm/agent/reporting.rb +8 -3
- data/lib/scout_apm/attribute_arranger.rb +4 -0
- data/lib/scout_apm/bucket_name_splitter.rb +3 -3
- data/lib/scout_apm/config.rb +5 -2
- data/lib/scout_apm/histogram.rb +20 -0
- data/lib/scout_apm/instant_reporting.rb +40 -0
- data/lib/scout_apm/instruments/action_controller_rails_3_rails4.rb +11 -1
- data/lib/scout_apm/instruments/percentile_sampler.rb +38 -0
- data/lib/scout_apm/layaway.rb +1 -4
- data/lib/scout_apm/layaway_file.rb +26 -2
- data/lib/scout_apm/layer.rb +1 -1
- data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
- data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
- data/lib/scout_apm/layer_converters/slow_request_converter.rb +37 -24
- data/lib/scout_apm/metric_meta.rb +5 -1
- data/lib/scout_apm/metric_set.rb +15 -6
- data/lib/scout_apm/reporter.rb +9 -3
- data/lib/scout_apm/request_histograms.rb +46 -0
- data/lib/scout_apm/scored_item_set.rb +79 -0
- data/lib/scout_apm/serializers/payload_serializer_to_json.rb +2 -0
- data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
- data/lib/scout_apm/slow_job_policy.rb +89 -19
- data/lib/scout_apm/slow_job_record.rb +18 -1
- data/lib/scout_apm/slow_request_policy.rb +80 -12
- data/lib/scout_apm/slow_transaction.rb +22 -3
- data/lib/scout_apm/store.rb +35 -13
- data/lib/scout_apm/tracked_request.rb +63 -11
- data/lib/scout_apm/utils/backtrace_parser.rb +4 -4
- data/lib/scout_apm/utils/sql_sanitizer.rb +1 -1
- data/lib/scout_apm/utils/sql_sanitizer_regex.rb +2 -2
- data/lib/scout_apm/utils/sql_sanitizer_regex_1_8_7.rb +2 -2
- data/lib/scout_apm/version.rb +1 -1
- data/scout_apm.gemspec +1 -0
- data/test/test_helper.rb +4 -3
- data/test/unit/layaway_test.rb +5 -8
- data/test/unit/metric_set_test.rb +101 -0
- data/test/unit/scored_item_set_test.rb +65 -0
- data/test/unit/serializers/payload_serializer_test.rb +2 -1
- data/test/unit/slow_item_set_test.rb +2 -1
- data/test/unit/slow_request_policy_test.rb +42 -0
- data/test/unit/sql_sanitizer_test.rb +6 -0
- metadata +28 -3
@@ -19,8 +19,9 @@ module ScoutApm
|
|
19
19
|
attr_reader :allocations
|
20
20
|
attr_reader :hostname
|
21
21
|
attr_reader :seconds_since_startup
|
22
|
+
attr_reader :score
|
22
23
|
|
23
|
-
def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, allocation_metrics, mem_delta, allocations)
|
24
|
+
def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, allocation_metrics, mem_delta, allocations, score)
|
24
25
|
@queue_name = queue_name
|
25
26
|
@job_name = job_name
|
26
27
|
@time = time
|
@@ -33,6 +34,7 @@ module ScoutApm
|
|
33
34
|
@allocations = allocations
|
34
35
|
@seconds_since_startup = (Time.now - ScoutApm::Agent.instance.process_start_time)
|
35
36
|
@hostname = ScoutApm::Environment.instance.hostname
|
37
|
+
@score = score
|
36
38
|
ScoutApm::Agent.instance.logger.debug { "Slow Job [#{metric_name}] - Call Time: #{total_call_time} Mem Delta: #{mem_delta}"}
|
37
39
|
end
|
38
40
|
|
@@ -40,5 +42,20 @@ module ScoutApm
|
|
40
42
|
"Job/#{queue_name}/#{job_name}"
|
41
43
|
end
|
42
44
|
|
45
|
+
########################
|
46
|
+
# Scorable interface
|
47
|
+
#
|
48
|
+
# Needed so we can merge ScoredItemSet instances
|
49
|
+
def call
|
50
|
+
self
|
51
|
+
end
|
52
|
+
|
53
|
+
def name
|
54
|
+
metric_name
|
55
|
+
end
|
56
|
+
|
57
|
+
def score
|
58
|
+
@score
|
59
|
+
end
|
43
60
|
end
|
44
61
|
end
|
@@ -1,8 +1,5 @@
|
|
1
1
|
# Long running class that determines if, and in how much detail a potentially
|
2
2
|
# slow transaction should be recorded.
|
3
|
-
#
|
4
|
-
# Rules:
|
5
|
-
# - Runtime must be slower than a threshold
|
6
3
|
|
7
4
|
module ScoutApm
|
8
5
|
class SlowRequestPolicy
|
@@ -11,21 +8,92 @@ module ScoutApm
|
|
11
8
|
CAPTURE_NONE = "capture_none",
|
12
9
|
]
|
13
10
|
|
14
|
-
#
|
15
|
-
|
11
|
+
# Multiplier used to adjust speed points. See the speed_points method.
|
12
|
+
POINT_MULTIPLIER_SPEED = 0.25
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
# For each minute we haven't seen an endpoint
|
15
|
+
POINT_MULTIPLIER_AGE = 0.25
|
16
|
+
|
17
|
+
# Outliers are worth up to "1000ms" of weight
|
18
|
+
POINT_MULTIPLIER_PERCENTILE = 1.0
|
19
|
+
|
20
|
+
# A hash of Endpoint Name to the last time we stored a slow transaction for it.
|
21
|
+
#
|
22
|
+
# Defaults to a start time that is pretty close to application boot time.
|
23
|
+
# So the "age" of an endpoint we've never seen is the time the application
|
24
|
+
# has been running.
|
25
|
+
attr_reader :last_seen
|
26
|
+
|
27
|
+
|
28
|
+
def initialize
|
29
|
+
zero_time = Time.now
|
30
|
+
@last_seen = Hash.new { |h, k| h[k] = zero_time }
|
31
|
+
end
|
32
|
+
|
33
|
+
def stored!(request)
|
34
|
+
last_seen[unique_name_for(request)] = Time.now
|
35
|
+
end
|
36
|
+
|
37
|
+
# Determine if this request trace should be fully analyzed by scoring it
|
38
|
+
# across several metrics, and then determining if that's good enough to
|
39
|
+
# make it into this minute's payload.
|
40
|
+
#
|
41
|
+
# Due to the combining nature of the agent & layaway file, there's no
|
42
|
+
# guarantee that a high scoring local champion will still be a winner when
|
43
|
+
# they go up to "regionals" and are compared against the other processes
|
44
|
+
# running on a node.
|
45
|
+
def score(request)
|
46
|
+
unique_name = request.unique_name
|
47
|
+
if unique_name == :unknown
|
48
|
+
return -1 # A negative score, should never be good enough to store.
|
22
49
|
end
|
50
|
+
|
51
|
+
total_time = request.root_layer.total_call_time
|
52
|
+
|
53
|
+
# How long has it been since we've seen this?
|
54
|
+
age = Time.now - last_seen[unique_name]
|
55
|
+
|
56
|
+
# What approximate percentile was this request?
|
57
|
+
percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
|
58
|
+
|
59
|
+
return speed_points(total_time) + percentile_points(percentile) + age_points(age)
|
23
60
|
end
|
24
61
|
|
25
62
|
private
|
26
63
|
|
27
|
-
def
|
28
|
-
|
64
|
+
def unique_name_for(request)
|
65
|
+
scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
|
66
|
+
if scope_layer
|
67
|
+
scope_layer.legacy_metric_name
|
68
|
+
else
|
69
|
+
:unknown
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Time in seconds
|
74
|
+
# Logarithm keeps huge times from swamping the other metrics.
|
75
|
+
# 1+ is necessary to keep the log function in positive territory.
|
76
|
+
def speed_points(time)
|
77
|
+
Math.log(1 + time) * POINT_MULTIPLIER_SPEED
|
78
|
+
end
|
79
|
+
|
80
|
+
def percentile_points(percentile)
|
81
|
+
if percentile < 40
|
82
|
+
0.4 # Don't put much emphasis on capturing low percentiles.
|
83
|
+
elsif percentile < 60
|
84
|
+
1.4 # Highest here to get mean traces
|
85
|
+
elsif percentile < 90
|
86
|
+
0.7 # Between 60 & 90% is fine.
|
87
|
+
elsif percentile >= 90
|
88
|
+
1.4 # Highest here to get 90+%ile traces
|
89
|
+
else
|
90
|
+
# impossible.
|
91
|
+
percentile
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def age_points(age)
|
96
|
+
age / 60.0 * POINT_MULTIPLIER_AGE
|
29
97
|
end
|
30
98
|
end
|
31
99
|
end
|
@@ -17,7 +17,7 @@ module ScoutApm
|
|
17
17
|
attr_accessor :hostname # hack - we need to reset these server side.
|
18
18
|
attr_accessor :seconds_since_startup # hack - we need to reset these server side.
|
19
19
|
|
20
|
-
def initialize(uri, metric_name, total_call_time, metrics, allocation_metrics, context, time, raw_stackprof, mem_delta, allocations)
|
20
|
+
def initialize(uri, metric_name, total_call_time, metrics, allocation_metrics, context, time, raw_stackprof, mem_delta, allocations, score)
|
21
21
|
@uri = uri
|
22
22
|
@metric_name = metric_name
|
23
23
|
@total_call_time = total_call_time
|
@@ -27,11 +27,14 @@ module ScoutApm
|
|
27
27
|
@time = time
|
28
28
|
@prof = ScoutApm::StackprofTreeCollapser.new(raw_stackprof).call
|
29
29
|
@raw_prof = raw_stackprof # Send whole data up to server
|
30
|
+
|
30
31
|
@mem_delta = mem_delta
|
31
32
|
@allocations = allocations
|
32
33
|
@seconds_since_startup = (Time.now - ScoutApm::Agent.instance.process_start_time)
|
33
34
|
@hostname = ScoutApm::Environment.instance.hostname
|
34
|
-
|
35
|
+
|
36
|
+
@score = score
|
37
|
+
ScoutApm::Agent.instance.logger.debug { "Slow Request [#{uri}] - Call Time: #{total_call_time} Mem Delta: #{mem_delta} Score: #{score}"}
|
35
38
|
end
|
36
39
|
|
37
40
|
# Used to remove metrics when the payload will be too large.
|
@@ -45,12 +48,28 @@ module ScoutApm
|
|
45
48
|
end
|
46
49
|
|
47
50
|
def as_json
|
48
|
-
json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof, :mem_delta, :allocations, :seconds_since_startup, :hostname]
|
51
|
+
json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :score, :prof, :mem_delta, :allocations, :seconds_since_startup, :hostname]
|
49
52
|
ScoutApm::AttributeArranger.call(self, json_attributes)
|
50
53
|
end
|
51
54
|
|
52
55
|
def context_hash
|
53
56
|
context.to_hash
|
54
57
|
end
|
58
|
+
|
59
|
+
########################
|
60
|
+
# Scorable interface
|
61
|
+
#
|
62
|
+
# Needed so we can merge ScoredItemSet instances
|
63
|
+
def call
|
64
|
+
self
|
65
|
+
end
|
66
|
+
|
67
|
+
def name
|
68
|
+
metric_name
|
69
|
+
end
|
70
|
+
|
71
|
+
def score
|
72
|
+
@score
|
73
|
+
end
|
55
74
|
end
|
56
75
|
end
|
data/lib/scout_apm/store.rb
CHANGED
@@ -22,7 +22,7 @@ module ScoutApm
|
|
22
22
|
# Save newly collected metrics
|
23
23
|
def track!(metrics, options={})
|
24
24
|
@mutex.synchronize {
|
25
|
-
current_period.
|
25
|
+
current_period.absorb_metrics!(metrics)
|
26
26
|
}
|
27
27
|
end
|
28
28
|
|
@@ -107,11 +107,12 @@ module ScoutApm
|
|
107
107
|
|
108
108
|
# One period of Storage. Typically 1 minute
|
109
109
|
class StoreReportingPeriod
|
110
|
-
# A SlowItemSet to store slow transactions in
|
111
|
-
attr_reader :slow_transactions
|
112
110
|
|
113
|
-
# A
|
114
|
-
attr_reader :
|
111
|
+
# A ScoredItemSet holding the "best" request traces for the period
|
112
|
+
attr_reader :request_traces
|
113
|
+
|
114
|
+
# A ScoredItemSet holding the "best" job traces for the period
|
115
|
+
attr_reader :job_traces
|
115
116
|
|
116
117
|
# A StoreReportingPeriodTimestamp representing the time that this
|
117
118
|
# collection of metrics is for
|
@@ -122,31 +123,50 @@ module ScoutApm
|
|
122
123
|
def initialize(timestamp)
|
123
124
|
@timestamp = timestamp
|
124
125
|
|
125
|
-
@
|
126
|
-
@
|
126
|
+
@request_traces = ScoredItemSet.new
|
127
|
+
@job_traces = ScoredItemSet.new
|
127
128
|
|
128
129
|
@metric_set = MetricSet.new
|
129
130
|
@jobs = Hash.new
|
130
131
|
end
|
131
132
|
|
133
|
+
# Merges another StoreReportingPeriod into this one
|
134
|
+
def merge(other)
|
135
|
+
self.
|
136
|
+
merge_metrics!(other.metric_set).
|
137
|
+
merge_slow_transactions!(other.slow_transactions_payload).
|
138
|
+
merge_jobs!(other.jobs).
|
139
|
+
merge_slow_jobs!(other.slow_jobs_payload)
|
140
|
+
self
|
141
|
+
end
|
142
|
+
|
132
143
|
#################################
|
133
144
|
# Add metrics as they are recorded
|
134
145
|
#################################
|
135
|
-
|
146
|
+
|
147
|
+
# For absorbing an array of metric {Meta => Stat} records
|
148
|
+
def absorb_metrics!(metrics)
|
136
149
|
metric_set.absorb_all(metrics)
|
137
150
|
self
|
138
151
|
end
|
139
152
|
|
153
|
+
# For merging when you have another metric_set object
|
154
|
+
# Makes sure that you don't duplicate error count records
|
155
|
+
def merge_metrics!(other_metric_set)
|
156
|
+
metric_set.combine!(other_metric_set)
|
157
|
+
self
|
158
|
+
end
|
159
|
+
|
140
160
|
def merge_slow_transactions!(new_transactions)
|
141
161
|
Array(new_transactions).each do |one_transaction|
|
142
|
-
|
162
|
+
request_traces << one_transaction
|
143
163
|
end
|
144
164
|
|
145
165
|
self
|
146
166
|
end
|
147
167
|
|
148
168
|
def merge_jobs!(jobs)
|
149
|
-
jobs.each do |job|
|
169
|
+
Array(jobs).each do |job|
|
150
170
|
if @jobs.has_key?(job)
|
151
171
|
@jobs[job].combine!(job)
|
152
172
|
else
|
@@ -159,8 +179,10 @@ module ScoutApm
|
|
159
179
|
|
160
180
|
def merge_slow_jobs!(new_jobs)
|
161
181
|
Array(new_jobs).each do |job|
|
162
|
-
|
182
|
+
job_traces << job
|
163
183
|
end
|
184
|
+
|
185
|
+
self
|
164
186
|
end
|
165
187
|
|
166
188
|
#################################
|
@@ -171,7 +193,7 @@ module ScoutApm
|
|
171
193
|
end
|
172
194
|
|
173
195
|
def slow_transactions_payload
|
174
|
-
|
196
|
+
request_traces.to_a
|
175
197
|
end
|
176
198
|
|
177
199
|
def jobs
|
@@ -179,7 +201,7 @@ module ScoutApm
|
|
179
201
|
end
|
180
202
|
|
181
203
|
def slow_jobs_payload
|
182
|
-
|
204
|
+
job_traces.to_a
|
183
205
|
end
|
184
206
|
|
185
207
|
#################################
|
@@ -39,6 +39,10 @@ module ScoutApm
|
|
39
39
|
# with same names across multiple types.
|
40
40
|
attr_accessor :call_counts
|
41
41
|
|
42
|
+
# if there's an instant_key, pass the transaction trace on for immediate reporting (in addition to the usual background aggregation)
|
43
|
+
# this is set in the controller instrumentation (see ActionControllerRails3Rails4)
|
44
|
+
attr_accessor :instant_key
|
45
|
+
|
42
46
|
BACKTRACE_THRESHOLD = 0.5 # the minimum threshold in seconds to record the backtrace for a metric.
|
43
47
|
|
44
48
|
def initialize
|
@@ -50,6 +54,7 @@ module ScoutApm
|
|
50
54
|
@root_layer = nil
|
51
55
|
@stackprof = nil
|
52
56
|
@error = false
|
57
|
+
@instant_key = nil
|
53
58
|
@mem_start = mem_usage
|
54
59
|
end
|
55
60
|
|
@@ -67,6 +72,17 @@ module ScoutApm
|
|
67
72
|
return if ignoring_children?
|
68
73
|
|
69
74
|
layer = @layers.pop
|
75
|
+
|
76
|
+
# Safeguard against a mismatch in the layer tracking in an instrument.
|
77
|
+
# This class works under the assumption that start & stop layers are
|
78
|
+
# lined up correctly. If stop_layer gets called twice, when it should
|
79
|
+
# only have been called once, you'll end up with this error.
|
80
|
+
if layer.nil?
|
81
|
+
ScoutApm::Agent.instance.logger.warn("Error stopping layer, was nil. Root Layer: #{@root_layer.inspect}")
|
82
|
+
stop_request
|
83
|
+
return
|
84
|
+
end
|
85
|
+
|
70
86
|
layer.record_stop_time!
|
71
87
|
layer.record_allocations!
|
72
88
|
|
@@ -87,7 +103,7 @@ module ScoutApm
|
|
87
103
|
# instrumentation early, and gradually learn more about the request that
|
88
104
|
# actually happened as we go (for instance, the # of records found, or the
|
89
105
|
# actual SQL generated).
|
90
|
-
#
|
106
|
+
#
|
91
107
|
# Returns nil in the case there is no current layer. That would be normal
|
92
108
|
# for a completed TrackedRequest
|
93
109
|
def current_layer
|
@@ -202,6 +218,10 @@ module ScoutApm
|
|
202
218
|
request_type == "web"
|
203
219
|
end
|
204
220
|
|
221
|
+
def instant?
|
222
|
+
instant_key
|
223
|
+
end
|
224
|
+
|
205
225
|
###################################
|
206
226
|
# Persist the Request
|
207
227
|
###################################
|
@@ -211,27 +231,59 @@ module ScoutApm
|
|
211
231
|
def record!
|
212
232
|
@recorded = true
|
213
233
|
|
234
|
+
# Update immediate and long-term histograms for both job and web requests
|
235
|
+
if unique_name != :unknown
|
236
|
+
ScoutApm::Agent.instance.request_histograms.add(unique_name, root_layer.total_call_time)
|
237
|
+
ScoutApm::Agent.instance.request_histograms_resettable.add(unique_name, root_layer.total_call_time)
|
238
|
+
end
|
239
|
+
|
214
240
|
metrics = LayerConverters::MetricConverter.new(self).call
|
215
241
|
ScoutApm::Agent.instance.store.track!(metrics)
|
216
242
|
|
217
|
-
slow, slow_metrics = LayerConverters::SlowRequestConverter.new(self).call
|
218
|
-
ScoutApm::Agent.instance.store.track_slow_transaction!(slow)
|
219
|
-
ScoutApm::Agent.instance.store.track!(slow_metrics)
|
220
|
-
|
221
243
|
error_metrics = LayerConverters::ErrorConverter.new(self).call
|
222
244
|
ScoutApm::Agent.instance.store.track!(error_metrics)
|
223
245
|
|
224
|
-
|
225
|
-
ScoutApm::Agent.instance.store.track!(
|
246
|
+
allocation_metrics = LayerConverters::AllocationMetricConverter.new(self).call
|
247
|
+
ScoutApm::Agent.instance.store.track!(allocation_metrics)
|
248
|
+
|
249
|
+
if web?
|
250
|
+
# Don't #call this - that's the job of the ScoredItemSet later.
|
251
|
+
slow_converter = LayerConverters::SlowRequestConverter.new(self)
|
252
|
+
ScoutApm::Agent.instance.store.track_slow_transaction!(slow_converter)
|
253
|
+
|
254
|
+
queue_time_metrics = LayerConverters::RequestQueueTimeConverter.new(self).call
|
255
|
+
ScoutApm::Agent.instance.store.track!(queue_time_metrics)
|
226
256
|
|
227
|
-
|
228
|
-
|
257
|
+
# If there's an instant_key, it means we need to report this right away
|
258
|
+
if instant?
|
259
|
+
trace = slow_converter.call
|
260
|
+
ScoutApm::InstantReporting.new(trace, instant_key).call()
|
261
|
+
end
|
262
|
+
end
|
229
263
|
|
230
|
-
|
231
|
-
|
264
|
+
if job?
|
265
|
+
job_metrics = LayerConverters::JobConverter.new(self).call
|
266
|
+
ScoutApm::Agent.instance.store.track_job!(job_metrics)
|
267
|
+
|
268
|
+
job_converter = LayerConverters::SlowJobConverter.new(self)
|
269
|
+
ScoutApm::Agent.instance.store.track_slow_job!(job_converter)
|
270
|
+
end
|
232
271
|
|
233
272
|
allocation_metrics = LayerConverters::AllocationMetricConverter.new(self).call
|
234
273
|
ScoutApm::Agent.instance.store.track!(allocation_metrics)
|
274
|
+
|
275
|
+
end
|
276
|
+
|
277
|
+
# Only call this after the request is complete
|
278
|
+
def unique_name
|
279
|
+
@unique_name ||= begin
|
280
|
+
scope_layer = LayerConverters::ConverterBase.new(self).scope_layer
|
281
|
+
if scope_layer
|
282
|
+
scope_layer.legacy_metric_name
|
283
|
+
else
|
284
|
+
:unknown
|
285
|
+
end
|
286
|
+
end
|
235
287
|
end
|
236
288
|
|
237
289
|
# Have we already persisted this request?
|
@@ -6,21 +6,21 @@ module ScoutApm
|
|
6
6
|
module Utils
|
7
7
|
class BacktraceParser
|
8
8
|
|
9
|
+
APP_FRAMES = 3 # will return up to 3 frames from the app stack.
|
10
|
+
|
9
11
|
def initialize(call_stack)
|
10
12
|
@call_stack = call_stack
|
11
13
|
# We can't use a constant as it'd be too early to fetch environment info
|
12
14
|
@@app_dir_regex ||= /\A(#{ScoutApm::Environment.instance.root.to_s.gsub('/','\/')}\/)(app\/(.+))/.freeze
|
13
15
|
end
|
14
16
|
|
15
|
-
# Given a call stack Array, grabs the first
|
17
|
+
# Given a call stack Array, grabs the first +APP_FRAMES+ callers within the application root directory.
|
16
18
|
def call
|
17
|
-
# We used to return an array of up to 5 elements...this will return a single element-array for backwards compatibility.
|
18
|
-
# Only the first element is used in Github code display.
|
19
19
|
stack = []
|
20
20
|
@call_stack.each_with_index do |c,i|
|
21
21
|
if m = c.match(@@app_dir_regex)
|
22
22
|
stack << m[2]
|
23
|
-
break
|
23
|
+
break if stack.size == APP_FRAMES
|
24
24
|
end
|
25
25
|
end
|
26
26
|
stack
|