scout_apm 2.0.0.pre → 2.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.markdown +22 -5
  4. data/Rakefile +5 -0
  5. data/lib/scout_apm.rb +4 -0
  6. data/lib/scout_apm/agent.rb +22 -8
  7. data/lib/scout_apm/agent/reporting.rb +8 -3
  8. data/lib/scout_apm/attribute_arranger.rb +4 -0
  9. data/lib/scout_apm/bucket_name_splitter.rb +3 -3
  10. data/lib/scout_apm/config.rb +5 -2
  11. data/lib/scout_apm/histogram.rb +20 -0
  12. data/lib/scout_apm/instant_reporting.rb +40 -0
  13. data/lib/scout_apm/instruments/action_controller_rails_3_rails4.rb +11 -1
  14. data/lib/scout_apm/instruments/percentile_sampler.rb +38 -0
  15. data/lib/scout_apm/layaway.rb +1 -4
  16. data/lib/scout_apm/layaway_file.rb +26 -2
  17. data/lib/scout_apm/layer.rb +1 -1
  18. data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
  19. data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
  20. data/lib/scout_apm/layer_converters/slow_request_converter.rb +37 -24
  21. data/lib/scout_apm/metric_meta.rb +5 -1
  22. data/lib/scout_apm/metric_set.rb +15 -6
  23. data/lib/scout_apm/reporter.rb +9 -3
  24. data/lib/scout_apm/request_histograms.rb +46 -0
  25. data/lib/scout_apm/scored_item_set.rb +79 -0
  26. data/lib/scout_apm/serializers/payload_serializer_to_json.rb +2 -0
  27. data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
  28. data/lib/scout_apm/slow_job_policy.rb +89 -19
  29. data/lib/scout_apm/slow_job_record.rb +18 -1
  30. data/lib/scout_apm/slow_request_policy.rb +80 -12
  31. data/lib/scout_apm/slow_transaction.rb +22 -3
  32. data/lib/scout_apm/store.rb +35 -13
  33. data/lib/scout_apm/tracked_request.rb +63 -11
  34. data/lib/scout_apm/utils/backtrace_parser.rb +4 -4
  35. data/lib/scout_apm/utils/sql_sanitizer.rb +1 -1
  36. data/lib/scout_apm/utils/sql_sanitizer_regex.rb +2 -2
  37. data/lib/scout_apm/utils/sql_sanitizer_regex_1_8_7.rb +2 -2
  38. data/lib/scout_apm/version.rb +1 -1
  39. data/scout_apm.gemspec +1 -0
  40. data/test/test_helper.rb +4 -3
  41. data/test/unit/layaway_test.rb +5 -8
  42. data/test/unit/metric_set_test.rb +101 -0
  43. data/test/unit/scored_item_set_test.rb +65 -0
  44. data/test/unit/serializers/payload_serializer_test.rb +2 -1
  45. data/test/unit/slow_item_set_test.rb +2 -1
  46. data/test/unit/slow_request_policy_test.rb +42 -0
  47. data/test/unit/sql_sanitizer_test.rb +6 -0
  48. metadata +28 -3
@@ -19,8 +19,9 @@ module ScoutApm
19
19
  attr_reader :allocations
20
20
  attr_reader :hostname
21
21
  attr_reader :seconds_since_startup
22
+ attr_reader :score
22
23
 
23
- def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, allocation_metrics, mem_delta, allocations)
24
+ def initialize(queue_name, job_name, time, total_time, exclusive_time, context, metrics, allocation_metrics, mem_delta, allocations, score)
24
25
  @queue_name = queue_name
25
26
  @job_name = job_name
26
27
  @time = time
@@ -33,6 +34,7 @@ module ScoutApm
33
34
  @allocations = allocations
34
35
  @seconds_since_startup = (Time.now - ScoutApm::Agent.instance.process_start_time)
35
36
  @hostname = ScoutApm::Environment.instance.hostname
37
+ @score = score
36
38
  ScoutApm::Agent.instance.logger.debug { "Slow Job [#{metric_name}] - Call Time: #{total_call_time} Mem Delta: #{mem_delta}"}
37
39
  end
38
40
 
@@ -40,5 +42,20 @@ module ScoutApm
40
42
  "Job/#{queue_name}/#{job_name}"
41
43
  end
42
44
 
45
+ ########################
46
+ # Scorable interface
47
+ #
48
+ # Needed so we can merge ScoredItemSet instances
49
+ def call
50
+ self
51
+ end
52
+
53
+ def name
54
+ metric_name
55
+ end
56
+
57
+ def score
58
+ @score
59
+ end
43
60
  end
44
61
  end
@@ -1,8 +1,5 @@
1
1
  # Long running class that determines if, and in how much detail a potentially
2
2
  # slow transaction should be recorded in
3
- #
4
- # Rules:
5
- # - Runtime must be slower than a threshold
6
3
 
7
4
  module ScoutApm
8
5
  class SlowRequestPolicy
@@ -11,21 +8,92 @@ module ScoutApm
11
8
  CAPTURE_NONE = "capture_none",
12
9
  ]
13
10
 
14
- # It's not slow unless it's at least this slow
15
- SLOW_REQUEST_TIME_THRESHOLD = 2.0 # seconds
11
+ # Multiplier applied to speed points. See the speed_points method below.
12
+ POINT_MULTIPLIER_SPEED = 0.25
16
13
 
17
- def capture_type(time)
18
- if !slow_enough?(time)
19
- CAPTURE_NONE
20
- else
21
- CAPTURE_DETAIL
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Endpoint Name to the last time we stored a slow transaction for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
26
+
27
+
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
31
+ end
32
+
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this request trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
22
49
  end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
23
60
  end
24
61
 
25
62
  private
26
63
 
27
- def slow_enough?(time)
28
- time > SLOW_REQUEST_TIME_THRESHOLD
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
94
+
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
29
97
  end
30
98
  end
31
99
  end
@@ -17,7 +17,7 @@ module ScoutApm
17
17
  attr_accessor :hostname # hack - we need to reset these server side.
18
18
  attr_accessor :seconds_since_startup # hack - we need to reset these server side.
19
19
 
20
- def initialize(uri, metric_name, total_call_time, metrics, allocation_metrics, context, time, raw_stackprof, mem_delta, allocations)
20
+ def initialize(uri, metric_name, total_call_time, metrics, allocation_metrics, context, time, raw_stackprof, mem_delta, allocations, score)
21
21
  @uri = uri
22
22
  @metric_name = metric_name
23
23
  @total_call_time = total_call_time
@@ -27,11 +27,14 @@ module ScoutApm
27
27
  @time = time
28
28
  @prof = ScoutApm::StackprofTreeCollapser.new(raw_stackprof).call
29
29
  @raw_prof = raw_stackprof # Send whole data up to server
30
+
30
31
  @mem_delta = mem_delta
31
32
  @allocations = allocations
32
33
  @seconds_since_startup = (Time.now - ScoutApm::Agent.instance.process_start_time)
33
34
  @hostname = ScoutApm::Environment.instance.hostname
34
- ScoutApm::Agent.instance.logger.debug { "Slow Request [#{uri}] - Call Time: #{total_call_time} Mem Delta: #{mem_delta}"}
35
+
36
+ @score = score
37
+ ScoutApm::Agent.instance.logger.debug { "Slow Request [#{uri}] - Call Time: #{total_call_time} Mem Delta: #{mem_delta} Score: #{score}"}
35
38
  end
36
39
 
37
40
  # Used to remove metrics when the payload will be too large.
@@ -45,12 +48,28 @@ module ScoutApm
45
48
  end
46
49
 
47
50
  def as_json
48
- json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :prof, :mem_delta, :allocations, :seconds_since_startup, :hostname]
51
+ json_attributes = [:key, :time, :total_call_time, :uri, [:context, :context_hash], :score, :prof, :mem_delta, :allocations, :seconds_since_startup, :hostname]
49
52
  ScoutApm::AttributeArranger.call(self, json_attributes)
50
53
  end
51
54
 
52
55
  def context_hash
53
56
  context.to_hash
54
57
  end
58
+
59
+ ########################
60
+ # Scorable interface
61
+ #
62
+ # Needed so we can merge ScoredItemSet instances
63
+ def call
64
+ self
65
+ end
66
+
67
+ def name
68
+ metric_name
69
+ end
70
+
71
+ def score
72
+ @score
73
+ end
55
74
  end
56
75
  end
@@ -22,7 +22,7 @@ module ScoutApm
22
22
  # Save newly collected metrics
23
23
  def track!(metrics, options={})
24
24
  @mutex.synchronize {
25
- current_period.merge_metrics!(metrics)
25
+ current_period.absorb_metrics!(metrics)
26
26
  }
27
27
  end
28
28
 
@@ -107,11 +107,12 @@ module ScoutApm
107
107
 
108
108
  # One period of Storage. Typically 1 minute
109
109
  class StoreReportingPeriod
110
- # A SlowItemSet to store slow transactions in
111
- attr_reader :slow_transactions
112
110
 
113
- # A SlowItemSet to store slow jobs in
114
- attr_reader :slow_jobs
111
+ # A ScoredItemSet holding the "best" request traces for the period
112
+ attr_reader :request_traces
113
+
114
+ # A ScoredItemSet holding the "best" job traces for the period
115
+ attr_reader :job_traces
115
116
 
116
117
  # A StoreReportingPeriodTimestamp representing the time that this
117
118
  # collection of metrics is for
@@ -122,31 +123,50 @@ module ScoutApm
122
123
  def initialize(timestamp)
123
124
  @timestamp = timestamp
124
125
 
125
- @slow_transactions = SlowItemSet.new
126
- @slow_jobs = SlowItemSet.new
126
+ @request_traces = ScoredItemSet.new
127
+ @job_traces = ScoredItemSet.new
127
128
 
128
129
  @metric_set = MetricSet.new
129
130
  @jobs = Hash.new
130
131
  end
131
132
 
133
+ # Merges another StoreReportingPeriod into this one
134
+ def merge(other)
135
+ self.
136
+ merge_metrics!(other.metric_set).
137
+ merge_slow_transactions!(other.slow_transactions_payload).
138
+ merge_jobs!(other.jobs).
139
+ merge_slow_jobs!(other.slow_jobs_payload)
140
+ self
141
+ end
142
+
132
143
  #################################
133
144
  # Add metrics as they are recorded
134
145
  #################################
135
- def merge_metrics!(metrics)
146
+
147
+ # For absorbing an array of metric {Meta => Stat} records
148
+ def absorb_metrics!(metrics)
136
149
  metric_set.absorb_all(metrics)
137
150
  self
138
151
  end
139
152
 
153
+ # For merging when you have another metric_set object
154
+ # Makes sure that you don't duplicate error count records
155
+ def merge_metrics!(other_metric_set)
156
+ metric_set.combine!(other_metric_set)
157
+ self
158
+ end
159
+
140
160
  def merge_slow_transactions!(new_transactions)
141
161
  Array(new_transactions).each do |one_transaction|
142
- slow_transactions << one_transaction
162
+ request_traces << one_transaction
143
163
  end
144
164
 
145
165
  self
146
166
  end
147
167
 
148
168
  def merge_jobs!(jobs)
149
- jobs.each do |job|
169
+ Array(jobs).each do |job|
150
170
  if @jobs.has_key?(job)
151
171
  @jobs[job].combine!(job)
152
172
  else
@@ -159,8 +179,10 @@ module ScoutApm
159
179
 
160
180
  def merge_slow_jobs!(new_jobs)
161
181
  Array(new_jobs).each do |job|
162
- slow_jobs << job
182
+ job_traces << job
163
183
  end
184
+
185
+ self
164
186
  end
165
187
 
166
188
  #################################
@@ -171,7 +193,7 @@ module ScoutApm
171
193
  end
172
194
 
173
195
  def slow_transactions_payload
174
- slow_transactions.to_a
196
+ request_traces.to_a
175
197
  end
176
198
 
177
199
  def jobs
@@ -179,7 +201,7 @@ module ScoutApm
179
201
  end
180
202
 
181
203
  def slow_jobs_payload
182
- slow_jobs.to_a
204
+ job_traces.to_a
183
205
  end
184
206
 
185
207
  #################################
@@ -39,6 +39,10 @@ module ScoutApm
39
39
  # with same names across multiple types.
40
40
  attr_accessor :call_counts
41
41
 
42
+ # if there's an instant_key, pass the transaction trace on for immediate reporting (in addition to the usual background aggregation)
43
+ # this is set in the controller instrumentation (see ActionControllerRails3Rails4)
44
+ attr_accessor :instant_key
45
+
42
46
  BACKTRACE_THRESHOLD = 0.5 # the minimum threshold in seconds to record the backtrace for a metric.
43
47
 
44
48
  def initialize
@@ -50,6 +54,7 @@ module ScoutApm
50
54
  @root_layer = nil
51
55
  @stackprof = nil
52
56
  @error = false
57
+ @instant_key = nil
53
58
  @mem_start = mem_usage
54
59
  end
55
60
 
@@ -67,6 +72,17 @@ module ScoutApm
67
72
  return if ignoring_children?
68
73
 
69
74
  layer = @layers.pop
75
+
76
+ # Safeguard against a mismatch in the layer tracking in an instrument.
77
+ # This class works under the assumption that start & stop layers are
78
+ # lined up correctly. If stop_layer gets called twice, when it should
79
+ # only have been called once, you'll end up with this error.
80
+ if layer.nil?
81
+ ScoutApm::Agent.instance.logger.warn("Error stopping layer, was nil. Root Layer: #{@root_layer.inspect}")
82
+ stop_request
83
+ return
84
+ end
85
+
70
86
  layer.record_stop_time!
71
87
  layer.record_allocations!
72
88
 
@@ -87,7 +103,7 @@ module ScoutApm
87
103
  # instrumentation early, and gradually learn more about the request that
88
104
  # actually happened as we go (for instance, the # of records found, or the
89
105
  # actual SQL generated).
90
- #
106
+ #
91
107
  # Returns nil in the case there is no current layer. That would be normal
92
108
  # for a completed TrackedRequest
93
109
  def current_layer
@@ -202,6 +218,10 @@ module ScoutApm
202
218
  request_type == "web"
203
219
  end
204
220
 
221
+ def instant?
222
+ instant_key
223
+ end
224
+
205
225
  ###################################
206
226
  # Persist the Request
207
227
  ###################################
@@ -211,27 +231,59 @@ module ScoutApm
211
231
  def record!
212
232
  @recorded = true
213
233
 
234
+ # Update immediate and long-term histograms for both job and web requests
235
+ if unique_name != :unknown
236
+ ScoutApm::Agent.instance.request_histograms.add(unique_name, root_layer.total_call_time)
237
+ ScoutApm::Agent.instance.request_histograms_resettable.add(unique_name, root_layer.total_call_time)
238
+ end
239
+
214
240
  metrics = LayerConverters::MetricConverter.new(self).call
215
241
  ScoutApm::Agent.instance.store.track!(metrics)
216
242
 
217
- slow, slow_metrics = LayerConverters::SlowRequestConverter.new(self).call
218
- ScoutApm::Agent.instance.store.track_slow_transaction!(slow)
219
- ScoutApm::Agent.instance.store.track!(slow_metrics)
220
-
221
243
  error_metrics = LayerConverters::ErrorConverter.new(self).call
222
244
  ScoutApm::Agent.instance.store.track!(error_metrics)
223
245
 
224
- queue_time_metrics = LayerConverters::RequestQueueTimeConverter.new(self).call
225
- ScoutApm::Agent.instance.store.track!(queue_time_metrics)
246
+ allocation_metrics = LayerConverters::AllocationMetricConverter.new(self).call
247
+ ScoutApm::Agent.instance.store.track!(allocation_metrics)
248
+
249
+ if web?
250
+ # Don't #call this - that's the job of the ScoredItemSet later.
251
+ slow_converter = LayerConverters::SlowRequestConverter.new(self)
252
+ ScoutApm::Agent.instance.store.track_slow_transaction!(slow_converter)
253
+
254
+ queue_time_metrics = LayerConverters::RequestQueueTimeConverter.new(self).call
255
+ ScoutApm::Agent.instance.store.track!(queue_time_metrics)
226
256
 
227
- job = LayerConverters::JobConverter.new(self).call
228
- ScoutApm::Agent.instance.store.track_job!(job)
257
+ # If there's an instant_key, it means we need to report this right away
258
+ if instant?
259
+ trace = slow_converter.call
260
+ ScoutApm::InstantReporting.new(trace, instant_key).call()
261
+ end
262
+ end
229
263
 
230
- slow_job = LayerConverters::SlowJobConverter.new(self).call
231
- ScoutApm::Agent.instance.store.track_slow_job!(slow_job)
264
+ if job?
265
+ job_metrics = LayerConverters::JobConverter.new(self).call
266
+ ScoutApm::Agent.instance.store.track_job!(job_metrics)
267
+
268
+ job_converter = LayerConverters::SlowJobConverter.new(self)
269
+ ScoutApm::Agent.instance.store.track_slow_job!(job_converter)
270
+ end
232
271
 
233
272
  allocation_metrics = LayerConverters::AllocationMetricConverter.new(self).call
234
273
  ScoutApm::Agent.instance.store.track!(allocation_metrics)
274
+
275
+ end
276
+
277
+ # Only call this after the request is complete
278
+ def unique_name
279
+ @unique_name ||= begin
280
+ scope_layer = LayerConverters::ConverterBase.new(self).scope_layer
281
+ if scope_layer
282
+ scope_layer.legacy_metric_name
283
+ else
284
+ :unknown
285
+ end
286
+ end
235
287
  end
236
288
 
237
289
  # Have we already persisted this request?
@@ -6,21 +6,21 @@ module ScoutApm
6
6
  module Utils
7
7
  class BacktraceParser
8
8
 
9
+ APP_FRAMES = 3 # will return up to 3 frames from the app stack.
10
+
9
11
  def initialize(call_stack)
10
12
  @call_stack = call_stack
11
13
  # We can't use a constant as it'd be too early to fetch environment info
12
14
  @@app_dir_regex ||= /\A(#{ScoutApm::Environment.instance.root.to_s.gsub('/','\/')}\/)(app\/(.+))/.freeze
13
15
  end
14
16
 
15
- # Given a call stack Array, grabs the first call within the application root directory.
17
+ # Given a call stack Array, grabs the first +APP_FRAMES+ callers within the application root directory.
16
18
  def call
17
- # We used to return an array of up to 5 elements...this will return a single element-array for backwards compatibility.
18
- # Only the first element is used in Github code display.
19
19
  stack = []
20
20
  @call_stack.each_with_index do |c,i|
21
21
  if m = c.match(@@app_dir_regex)
22
22
  stack << m[2]
23
- break
23
+ break if stack.size == APP_FRAMES
24
24
  end
25
25
  end
26
26
  stack