scout_apm 2.0.0.pre → 2.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.markdown +22 -5
  4. data/Rakefile +5 -0
  5. data/lib/scout_apm.rb +4 -0
  6. data/lib/scout_apm/agent.rb +22 -8
  7. data/lib/scout_apm/agent/reporting.rb +8 -3
  8. data/lib/scout_apm/attribute_arranger.rb +4 -0
  9. data/lib/scout_apm/bucket_name_splitter.rb +3 -3
  10. data/lib/scout_apm/config.rb +5 -2
  11. data/lib/scout_apm/histogram.rb +20 -0
  12. data/lib/scout_apm/instant_reporting.rb +40 -0
  13. data/lib/scout_apm/instruments/action_controller_rails_3_rails4.rb +11 -1
  14. data/lib/scout_apm/instruments/percentile_sampler.rb +38 -0
  15. data/lib/scout_apm/layaway.rb +1 -4
  16. data/lib/scout_apm/layaway_file.rb +26 -2
  17. data/lib/scout_apm/layer.rb +1 -1
  18. data/lib/scout_apm/layer_converters/converter_base.rb +6 -4
  19. data/lib/scout_apm/layer_converters/slow_job_converter.rb +21 -13
  20. data/lib/scout_apm/layer_converters/slow_request_converter.rb +37 -24
  21. data/lib/scout_apm/metric_meta.rb +5 -1
  22. data/lib/scout_apm/metric_set.rb +15 -6
  23. data/lib/scout_apm/reporter.rb +9 -3
  24. data/lib/scout_apm/request_histograms.rb +46 -0
  25. data/lib/scout_apm/scored_item_set.rb +79 -0
  26. data/lib/scout_apm/serializers/payload_serializer_to_json.rb +2 -0
  27. data/lib/scout_apm/serializers/slow_jobs_serializer_to_json.rb +2 -0
  28. data/lib/scout_apm/slow_job_policy.rb +89 -19
  29. data/lib/scout_apm/slow_job_record.rb +18 -1
  30. data/lib/scout_apm/slow_request_policy.rb +80 -12
  31. data/lib/scout_apm/slow_transaction.rb +22 -3
  32. data/lib/scout_apm/store.rb +35 -13
  33. data/lib/scout_apm/tracked_request.rb +63 -11
  34. data/lib/scout_apm/utils/backtrace_parser.rb +4 -4
  35. data/lib/scout_apm/utils/sql_sanitizer.rb +1 -1
  36. data/lib/scout_apm/utils/sql_sanitizer_regex.rb +2 -2
  37. data/lib/scout_apm/utils/sql_sanitizer_regex_1_8_7.rb +2 -2
  38. data/lib/scout_apm/version.rb +1 -1
  39. data/scout_apm.gemspec +1 -0
  40. data/test/test_helper.rb +4 -3
  41. data/test/unit/layaway_test.rb +5 -8
  42. data/test/unit/metric_set_test.rb +101 -0
  43. data/test/unit/scored_item_set_test.rb +65 -0
  44. data/test/unit/serializers/payload_serializer_test.rb +2 -1
  45. data/test/unit/slow_item_set_test.rb +2 -1
  46. data/test/unit/slow_request_policy_test.rb +42 -0
  47. data/test/unit/sql_sanitizer_test.rb +6 -0
  48. metadata +28 -3
@@ -19,10 +19,12 @@ module ScoutApm
19
19
  # render :update
20
20
  # end
21
21
  def scope_layer
22
- @scope_layer ||= walker.walk do |layer|
23
- if layer.type == "Controller"
24
- break layer
25
- end
22
+ @scope_layer ||= find_first_layer_of_type("Controller") || find_first_layer_of_type("Job")
23
+ end
24
+
25
+ def find_first_layer_of_type(layer_type)
26
+ walker.walk do |layer|
27
+ return layer if layer.type == layer_type
26
28
  end
27
29
  end
28
30
  end
@@ -4,15 +4,29 @@ module ScoutApm
4
4
  def initialize(*)
5
5
  @backtraces = []
6
6
  super
7
+
8
+ # After call to super, so @request is populated
9
+ @points = if request.job?
10
+ ScoutApm::Agent.instance.slow_job_policy.score(request)
11
+ else
12
+ -1
13
+ end
7
14
  end
8
15
 
9
- def call
10
- return unless request.job?
16
+ def name
17
+ request.unique_name
18
+ end
19
+
20
+ def score
21
+ @points
22
+ end
11
23
 
12
- job_name = [queue_layer.name, job_layer.name]
24
+ def call
25
+ return nil unless request.job?
26
+ return nil unless queue_layer
27
+ return nil unless job_layer
13
28
 
14
- slow_enough = ScoutApm::Agent.instance.slow_job_policy.slow?(job_name, root_layer.total_call_time)
15
- return unless slow_enough
29
+ ScoutApm::Agent.instance.slow_job_policy.stored!(request)
16
30
 
17
31
  # record the change in memory usage
18
32
  mem_delta = ScoutApm::Instruments::Process::ProcessMemory.rss_to_mb(request.capture_mem_delta!)
@@ -32,8 +46,8 @@ module ScoutApm
32
46
  timing_metrics,
33
47
  allocation_metrics,
34
48
  mem_delta,
35
- job_layer.total_allocations
36
- )
49
+ job_layer.total_allocations,
50
+ score)
37
51
  end
38
52
 
39
53
  def queue_layer
@@ -44,12 +58,6 @@ module ScoutApm
44
58
  @job_layer ||= find_first_layer_of_type("Job")
45
59
  end
46
60
 
47
- def find_first_layer_of_type(layer_type)
48
- walker.walk do |layer|
49
- return layer if layer.type == layer_type
50
- end
51
- end
52
-
53
61
  def create_metrics
54
62
  metric_hash = Hash.new
55
63
  allocation_metric_hash = Hash.new
@@ -4,25 +4,34 @@ module ScoutApm
4
4
  def initialize(*)
5
5
  @backtraces = [] # An Array of MetricMetas that have a backtrace
6
6
  super
7
+
8
+ # After call to super, so @request is populated
9
+ @points = if request.web?
10
+ ScoutApm::Agent.instance.slow_request_policy.score(request)
11
+ else
12
+ -1
13
+ end
14
+ end
15
+
16
+ def name
17
+ request.unique_name
18
+ end
19
+
20
+ def score
21
+ @points
7
22
  end
8
23
 
24
+ # Unconditionally attempts to convert this into a SlowTransaction object.
25
+ # Can return nil if the request didn't have any scope_layer.
9
26
  def call
10
27
  scope = scope_layer
11
- return [nil, {}] unless scope
28
+ return nil unless scope
12
29
 
13
- policy = ScoutApm::Agent.instance.slow_request_policy.capture_type(root_layer.total_call_time)
14
- if policy == ScoutApm::SlowRequestPolicy::CAPTURE_NONE
15
- return [nil, {}]
16
- end
30
+ ScoutApm::Agent.instance.slow_request_policy.stored!(request)
17
31
 
18
32
  # record the change in memory usage
19
33
  mem_delta = ScoutApm::Instruments::Process::ProcessMemory.rss_to_mb(@request.capture_mem_delta!)
20
34
 
21
- # increment the slow transaction count if this is a slow transaction.
22
- meta = MetricMeta.new("SlowTransaction/#{scope.legacy_metric_name}")
23
- stat = MetricStats.new
24
- stat.update!(1)
25
-
26
35
  uri = request.annotations[:uri] || ""
27
36
 
28
37
  timing_metrics, allocation_metrics = create_metrics
@@ -30,23 +39,27 @@ module ScoutApm
30
39
  allocation_metrics = {}
31
40
  end
32
41
 
42
+ ScoutApm::Agent.instance.config.value("ignore_traces").each do |pattern|
43
+ if /#{pattern}/ =~ uri
44
+ ScoutApm::Agent.instance.logger.debug("Skipped recording a trace for #{uri} due to `ignore_traces` pattern: #{pattern}")
45
+ return nil
46
+ end
47
+ end
48
+
33
49
  # Disable stackprof output for now
34
50
  stackprof = [] # request.stackprof
35
51
 
36
- [
37
- SlowTransaction.new(uri,
38
- scope.legacy_metric_name,
39
- root_layer.total_call_time,
40
- timing_metrics,
41
- allocation_metrics,
42
- request.context,
43
- root_layer.stop_time,
44
- stackprof,
45
- mem_delta,
46
- root_layer.total_allocations
47
- ),
48
- { meta => stat }
49
- ]
52
+ SlowTransaction.new(uri,
53
+ scope.legacy_metric_name,
54
+ root_layer.total_call_time,
55
+ timing_metrics,
56
+ allocation_metrics,
57
+ request.context,
58
+ root_layer.stop_time,
59
+ stackprof,
60
+ mem_delta,
61
+ root_layer.total_allocations,
62
+ @points)
50
63
  end
51
64
 
52
65
  # Iterates over the TrackedRequest's MetricMetas that have backtraces and attaches each to correct MetricMeta in the Metric Hash.
@@ -17,7 +17,11 @@ class MetricMeta
17
17
 
18
18
  # Unsure if type or bucket is a better name.
19
19
  def type
20
- bucket
20
+ bucket_type
21
+ end
22
+
23
+ def name
24
+ bucket_name
21
25
  end
22
26
 
23
27
  # A key metric is the "core" of a request - either the Rails controller reached, or the background Job executed
@@ -2,7 +2,7 @@ module ScoutApm
2
2
  class MetricSet
3
3
  # We can't aggregate CPU, Memory, Capacity, or Controller, so pass through these metrics directly
4
4
  # TODO: Figure out a way to not have this duplicate what's in Samplers, and also on server's ingest
5
- PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction"]
5
+ PASSTHROUGH_METRICS = ["CPU", "Memory", "Instance", "Controller", "SlowTransaction", "Percentile", "Job"]
6
6
 
7
7
  attr_reader :metrics
8
8
 
@@ -23,11 +23,15 @@ module ScoutApm
23
23
  @metrics[meta].combine!(stat)
24
24
 
25
25
  elsif meta.type == "Errors" # Sadly special cased, we want both raw and aggregate values
26
- @metrics[meta] ||= MetricStats.new
27
- @metrics[meta].combine!(stat)
28
- agg_meta = MetricMeta.new("Errors/Request", :scope => meta.scope)
29
- @metrics[agg_meta] ||= MetricStats.new
30
- @metrics[agg_meta].combine!(stat)
26
+ # When combining MetricSets from different processes, skip re-adding the
27
+ @metrics[meta] ||= MetricStats.new
28
+ @metrics[meta].combine!(stat)
29
+
30
+ if !@combine_in_progress
31
+ agg_meta = MetricMeta.new("Errors/Request", :scope => meta.scope)
32
+ @metrics[agg_meta] ||= MetricStats.new
33
+ @metrics[agg_meta].combine!(stat)
34
+ end
31
35
 
32
36
  else # Combine down to a single /all key
33
37
  agg_meta = MetricMeta.new("#{meta.type}/all", :scope => meta.scope)
@@ -36,8 +40,13 @@ module ScoutApm
36
40
  end
37
41
  end
38
42
 
43
+ # Sets a combine_in_progress flag to prevent double-counting Error metrics.
44
+ # Without it, the Errors/Request number would be increasingly off as
45
+ # metric_sets get merged in.
39
46
  def combine!(other)
47
+ @combine_in_progress = true
40
48
  absorb_all(other.metrics)
49
+ @combine_in_progress = false
41
50
  self
42
51
  end
43
52
  end
@@ -8,17 +8,21 @@ module ScoutApm
8
8
  attr_reader :config
9
9
  attr_reader :logger
10
10
  attr_reader :type
11
+ attr_reader :instant_key
11
12
 
12
- def initialize(type = :checkin, config=Agent.instance.config, logger=Agent.instance.logger)
13
+ def initialize(type = :checkin, config=Agent.instance.config, logger=Agent.instance.logger, instant_key=nil)
13
14
  @config = config
14
15
  @logger = logger
15
16
  @type = type
17
+ @instant_key = instant_key
16
18
  end
17
19
 
18
20
  # TODO: Parse & return a real response object, not the HTTP Response object
19
21
  def report(payload, headers = {})
20
- Array(config.value('host')).each do |host|
22
+ # Some posts (typically ones under development) bypass the ingestion pipeline and go directly to the webserver. They use direct_host instead of host
23
+ hosts = [:deploy_hook, :instant_trace].include?(type) ? config.value('direct_host') : config.value('host')
21
24
 
25
+ Array(hosts).each do |host|
22
26
  full_uri = uri(host)
23
27
  response = post(full_uri, payload, headers)
24
28
  unless response && response.is_a?(Net::HTTPSuccess)
@@ -34,7 +38,9 @@ module ScoutApm
34
38
  when :app_server_load
35
39
  URI.parse("#{host}/apps/app_server_load.scout?key=#{config.value('key')}&name=#{CGI.escape(Environment.instance.application_name)}")
36
40
  when :deploy_hook
37
- URI.parse("https://apm.scoutapp.com/apps/deploy.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}")
41
+ URI.parse("#{host}/apps/deploy.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}")
42
+ when :instant_trace
43
+ URI.parse("#{host}/apps/instant_trace.scout?key=#{config.value('key')}&name=#{CGI.escape(config.value('name'))}&instant_key=#{instant_key}")
38
44
  end.tap{|u| logger.debug("Posting to #{u.to_s}")}
39
45
  end
40
46
 
@@ -0,0 +1,46 @@
1
+ module ScoutApm
2
+ class RequestHistograms
3
+ DEFAULT_HISTOGRAM_SIZE = 50
4
+
5
+ # Private Accessor:
6
+ # A hash of Endpoint Name to an approximate histogram
7
+ #
8
+ # Each time a new request is requested to see if it's slow or not, we
9
+ # should insert it into the histogram, and get the approximate percentile
10
+ # of that time
11
+ attr_reader :histograms
12
+ private :histograms
13
+
14
+ attr_reader :histogram_size
15
+
16
+ def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
17
+ @histogram_size = histogram_size
18
+ initialize_histograms_hash
19
+ end
20
+
21
+ def each_name
22
+ @histograms.keys.each { |n| yield n }
23
+ end
24
+
25
+ def add(item, value)
26
+ @histograms[item].add(value)
27
+ end
28
+
29
+ def approximate_quantile_of_value(item, value)
30
+ @histograms[item].approximate_quantile_of_value(value)
31
+ end
32
+
33
+ def quantile(item, q)
34
+ @histograms[item].quantile(q)
35
+ end
36
+
37
+ # Wipes all histograms, setting them back to empty
38
+ def reset_all!
39
+ initialize_histograms_hash
40
+ end
41
+
42
+ def initialize_histograms_hash
43
+ @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,79 @@
1
+ # Attempts to keep the highest score.
2
+ #
3
+ # Each item must respond to:
4
+ # #call to get the storable item
5
+ # #name to get a unique identifier of the storable
6
+ # #score to get a numeric score, where higher is better
7
+ module ScoutApm
8
+ class ScoredItemSet
9
+ include Enumerable
10
+
11
+ # A number larger than any score we will actually get.
12
+ ARBITRARILY_LARGE = 100000000
13
+
14
+ # Without otherwise saying, default the size to this
15
+ DEFAULT_MAX_SIZE = 10
16
+
17
+ attr_reader :max_size
18
+ attr_reader :items
19
+
20
+ def initialize(max_size = DEFAULT_MAX_SIZE)
21
+ @items = {}
22
+ @max_size = max_size
23
+ end
24
+
25
+ def each
26
+ items.each do |(_, (_, item))|
27
+ yield item
28
+ end
29
+ end
30
+
31
+ # This function is a large if statement, with a few branches. See inline comments for each branch.
32
+ def <<(new_item)
33
+ return if new_item.name == :unknown
34
+
35
+ # If we have this item in the hash already, compare the new & old ones, and store
36
+ # the new one only if it's higher score.
37
+ if items.has_key?(new_item.name)
38
+ if new_item.score > items[new_item.name].first
39
+ store!(new_item)
40
+ end
41
+
42
+
43
+ # If the set is full, then we have to see if we evict anything to store
44
+ # this one
45
+ elsif full?
46
+ smallest_name, smallest_score = items.inject([nil, ARBITRARILY_LARGE]) do |(memo_name, memo_score), (name, (stored_score, _))|
47
+ if stored_score < memo_score
48
+ [name, stored_score]
49
+ else
50
+ [memo_name, memo_score]
51
+ end
52
+ end
53
+
54
+ if smallest_score < new_item.score
55
+ items.delete(smallest_name)
56
+ store!(new_item)
57
+ end
58
+
59
+
60
+ # Set isn't full, and we've not seen this new_item, so go ahead and store it.
61
+ else
62
+ store!(new_item)
63
+ end
64
+ end
65
+
66
+
67
+ private
68
+
69
+ def full?
70
+ items.size >= max_size
71
+ end
72
+
73
+ def store!(new_item)
74
+ if !new_item.name.nil? # Never store a nil name.
75
+ items[new_item.name] = [new_item.score, new_item.call]
76
+ end
77
+ end
78
+ end
79
+ end
@@ -59,6 +59,8 @@ module ScoutApm
59
59
  "[#{all_the_elements.join(",")}]"
60
60
  when Numeric
61
61
  formatee
62
+ when Time
63
+ %Q["#{formatee.iso8601}"]
62
64
  when nil
63
65
  "null"
64
66
  else # strings and everything
@@ -24,6 +24,8 @@ module ScoutApm
24
24
  "metrics" => MetricsToJsonSerializer.new(job.metrics).as_json, # New style of metrics
25
25
  "allocation_metrics" => MetricsToJsonSerializer.new(job.allocation_metrics).as_json, # New style of metrics
26
26
  "context" => job.context.to_hash,
27
+
28
+ "score" => job.score,
27
29
  }
28
30
  end
29
31
  end
@@ -1,29 +1,99 @@
1
- # Create one of these at startup time, and ask it if a certain worker's
2
- # processing time is slow enough for us to collect a slow trace.
3
- #
4
- # Keeps track of a histogram of times for each worker class (separately), and
5
- # uses a percentile of normal to mark individual runs as "slow".
6
- #
7
- # This assumes that all worker calls will be requested once to `slow?`, so that
8
- # the data can be stored
1
+ # Long running class that determines if, and in how much detail a potentially
2
+ # slow job should be recorded in
3
+
9
4
  module ScoutApm
10
5
  class SlowJobPolicy
11
- DEFAULT_HISTOGRAM_SIZE = 50
6
+ CAPTURE_TYPES = [
7
+ CAPTURE_DETAIL = "capture_detail",
8
+ CAPTURE_NONE = "capture_none",
9
+ ]
10
+
11
+ # Adjust speed points. See the speed_points function below.
12
+ POINT_MULTIPLIER_SPEED = 0.25
13
+
14
+ # For each minute we haven't seen an endpoint
15
+ POINT_MULTIPLIER_AGE = 0.25
16
+
17
+ # Outliers are worth up to "1000ms" of weight
18
+ POINT_MULTIPLIER_PERCENTILE = 1.0
19
+
20
+ # A hash of Job Names to the last time we stored a slow trace for it.
21
+ #
22
+ # Defaults to a start time that is pretty close to application boot time.
23
+ # So the "age" of an endpoint we've never seen is the time the application
24
+ # has been running.
25
+ attr_reader :last_seen
12
26
 
13
- QUANTILE = 95
14
27
 
15
- def initialize(histogram_size = DEFAULT_HISTOGRAM_SIZE)
16
- @histograms = Hash.new { |h, k| h[k] = NumericHistogram.new(histogram_size) }
28
+ def initialize
29
+ zero_time = Time.now
30
+ @last_seen = Hash.new { |h, k| h[k] = zero_time }
17
31
  end
18
32
 
19
- # worker: just the worker class name. "PasswordResetJob" or similar
20
- # total_time: runtime of the job in seconds
21
- # returns true if this request should be stored in higher trace detail, false otherwise
22
- def slow?(worker, total_time)
23
- @histograms[worker].add(total_time)
24
- return false if @histograms[worker].total == 1 # First call is never slow
33
+ def stored!(request)
34
+ last_seen[unique_name_for(request)] = Time.now
35
+ end
36
+
37
+ # Determine if this job trace should be fully analyzed by scoring it
38
+ # across several metrics, and then determining if that's good enough to
39
+ # make it into this minute's payload.
40
+ #
41
+ # Due to the combining nature of the agent & layaway file, there's no
42
+ # guarantee that a high scoring local champion will still be a winner when
43
+ # they go up to "regionals" and are compared against the other processes
44
+ # running on a node.
45
+ def score(request)
46
+ unique_name = request.unique_name
47
+ if unique_name == :unknown
48
+ return -1 # A negative score, should never be good enough to store.
49
+ end
50
+
51
+ total_time = request.root_layer.total_call_time
52
+
53
+ # How long has it been since we've seen this?
54
+ age = Time.now - last_seen[unique_name]
55
+
56
+ # What approximate percentile was this request?
57
+ percentile = ScoutApm::Agent.instance.request_histograms.approximate_quantile_of_value(unique_name, total_time)
58
+
59
+ return speed_points(total_time) + percentile_points(percentile) + age_points(age)
60
+ end
61
+
62
+ private
63
+
64
+ def unique_name_for(request)
65
+ scope_layer = LayerConverters::ConverterBase.new(request).scope_layer
66
+ if scope_layer
67
+ scope_layer.legacy_metric_name
68
+ else
69
+ :unknown
70
+ end
71
+ end
72
+
73
+ # Time in seconds
74
+ # Logarithm keeps huge times from swamping the other metrics.
75
+ # 1+ is necessary to keep the log function in positive territory.
76
+ def speed_points(time)
77
+ Math.log(1 + time) * POINT_MULTIPLIER_SPEED
78
+ end
79
+
80
+ def percentile_points(percentile)
81
+ if percentile < 40
82
+ 0.4 # Don't put much emphasis on capturing low percentiles.
83
+ elsif percentile < 60
84
+ 1.4 # Highest here to get mean traces
85
+ elsif percentile < 90
86
+ 0.7 # Between 60 & 90% is fine.
87
+ elsif percentile >= 90
88
+ 1.4 # Highest here to get 90+%ile traces
89
+ else
90
+ # impossible.
91
+ percentile
92
+ end
93
+ end
25
94
 
26
- total_time >= @histograms[worker].quantile(QUANTILE)
95
+ def age_points(age)
96
+ age / 60.0 * POINT_MULTIPLIER_AGE
27
97
  end
28
98
  end
29
99
  end