ddtrace 0.52.0 → 0.53.0

Files changed (72)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +65 -1
  3. data/ddtrace.gemspec +1 -1
  4. data/docs/DevelopmentGuide.md +1 -6
  5. data/docs/GettingStarted.md +66 -16
  6. data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
  7. data/lib/datadog/ci/ext/environment.rb +41 -4
  8. data/lib/datadog/contrib.rb +2 -0
  9. data/lib/datadog/core/environment/vm_cache.rb +46 -0
  10. data/lib/ddtrace/configuration/agent_settings_resolver.rb +107 -40
  11. data/lib/ddtrace/configuration/components.rb +1 -1
  12. data/lib/ddtrace/configuration/settings.rb +13 -3
  13. data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
  14. data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
  15. data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
  16. data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
  17. data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
  18. data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
  19. data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
  20. data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
  21. data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
  22. data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
  23. data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
  24. data/lib/ddtrace/contrib/active_job/event.rb +54 -0
  25. data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
  26. data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
  27. data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
  28. data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
  29. data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
  30. data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
  31. data/lib/ddtrace/contrib/active_job/events.rb +39 -0
  32. data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
  33. data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
  34. data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
  35. data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
  36. data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
  37. data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
  38. data/lib/ddtrace/contrib/rails/framework.rb +22 -0
  39. data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
  40. data/lib/ddtrace/contrib/registerable.rb +0 -1
  41. data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
  42. data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
  43. data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
  44. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
  45. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
  46. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
  47. data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
  48. data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
  49. data/lib/ddtrace/ext/git.rb +12 -0
  50. data/lib/ddtrace/ext/profiling.rb +1 -0
  51. data/lib/ddtrace/ext/runtime.rb +3 -0
  52. data/lib/ddtrace/ext/transport.rb +11 -0
  53. data/lib/ddtrace/profiling/collectors/stack.rb +71 -27
  54. data/lib/ddtrace/profiling/encoding/profile.rb +9 -1
  55. data/lib/ddtrace/profiling/events/stack.rb +7 -7
  56. data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
  57. data/lib/ddtrace/profiling/pprof/stack_sample.rb +28 -2
  58. data/lib/ddtrace/profiling/tasks/setup.rb +0 -1
  59. data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +1 -1
  60. data/lib/ddtrace/profiling/trace_identifiers/helper.rb +3 -3
  61. data/lib/ddtrace/profiling/transport/http.rb +8 -17
  62. data/lib/ddtrace/runtime/metrics.rb +14 -0
  63. data/lib/ddtrace/span.rb +7 -19
  64. data/lib/ddtrace/tracer.rb +1 -1
  65. data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
  66. data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
  67. data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
  68. data/lib/ddtrace/transport/http/builder.rb +13 -6
  69. data/lib/ddtrace/transport/http.rb +5 -11
  70. data/lib/ddtrace/utils/time.rb +5 -6
  71. data/lib/ddtrace/version.rb +1 -1
  72. metadata +27 -2

data/lib/ddtrace/contrib/sidekiq/patcher.rb
@@ -34,8 +34,34 @@ module Datadog
  config.server_middleware do |chain|
  chain.add(Sidekiq::ServerTracer)
  end
+
+ patch_server_internals if Integration.compatible_with_server_internal_tracing?
  end
  end
+
+ def patch_server_internals
+ patch_server_heartbeat
+ patch_server_job_fetch
+ patch_server_scheduled_push
+ end
+
+ def patch_server_heartbeat
+ require 'ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat'
+
+ ::Sidekiq::Launcher.prepend(ServerInternalTracer::Heartbeat)
+ end
+
+ def patch_server_job_fetch
+ require 'ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch'
+
+ ::Sidekiq::Processor.prepend(ServerInternalTracer::JobFetch)
+ end
+
+ def patch_server_scheduled_push
+ require 'ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push'
+
+ ::Sidekiq::Scheduled::Poller.prepend(ServerInternalTracer::ScheduledPush)
+ end
  end
  end
  end

data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb
@@ -0,0 +1,30 @@
+ # typed: true
+
+ module Datadog
+ module Contrib
+ module Sidekiq
+ module ServerInternalTracer
+ # Trace when a Sidekiq process has a heartbeat
+ module Heartbeat
+ private
+
+ def ❤ # rubocop:disable Naming/AsciiIdentifiers, Naming/MethodName
+ configuration = Datadog.configuration[:sidekiq]
+
+ configuration[:tracer].trace(Ext::SPAN_HEARTBEAT) do |span|
+ span.service = configuration[:service_name]
+ span.span_type = Datadog::Ext::AppTypes::WORKER
+
+ # Set analytics sample rate
+ if Contrib::Analytics.enabled?(configuration[:analytics_enabled])
+ Contrib::Analytics.set_sample_rate(span, configuration[:analytics_sample_rate])
+ end
+
+ super
+ end
+ end
+ end
+ end
+ end
+ end
+ end

data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb
@@ -0,0 +1,30 @@
+ # typed: true
+
+ module Datadog
+ module Contrib
+ module Sidekiq
+ module ServerInternalTracer
+ # Trace when Sidekiq looks for another job to work
+ module JobFetch
+ private
+
+ def fetch
+ configuration = Datadog.configuration[:sidekiq]
+
+ configuration[:tracer].trace(Ext::SPAN_JOB_FETCH) do |span|
+ span.service = configuration[:service_name]
+ span.span_type = Datadog::Ext::AppTypes::WORKER
+
+ # Set analytics sample rate
+ if Contrib::Analytics.enabled?(configuration[:analytics_enabled])
+ Contrib::Analytics.set_sample_rate(span, configuration[:analytics_sample_rate])
+ end
+
+ super
+ end
+ end
+ end
+ end
+ end
+ end
+ end

data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb
@@ -0,0 +1,29 @@
+ # typed: true
+
+ module Datadog
+ module Contrib
+ module Sidekiq
+ module ServerInternalTracer
+ # Trace when Sidekiq checks to see if there are scheduled jobs that need to be worked
+ # https://github.com/mperham/sidekiq/wiki/Scheduled-Jobs
+ module ScheduledPush
+ def enqueue
+ configuration = Datadog.configuration[:sidekiq]
+
+ configuration[:tracer].trace(Ext::SPAN_SCHEDULED_PUSH) do |span|
+ span.service = configuration[:service_name]
+ span.span_type = Datadog::Ext::AppTypes::WORKER
+
+ # Set analytics sample rate
+ if Contrib::Analytics.enabled?(configuration[:analytics_enabled])
+ Contrib::Analytics.set_sample_rate(span, configuration[:analytics_sample_rate])
+ end
+
+ super
+ end
+ end
+ end
+ end
+ end
+ end
+ end
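
Note: the server-internal tracer modules above only produce spans when the Sidekiq integration is enabled in the worker process and the installed Sidekiq version passes the new `compatible_with_server_internal_tracing?` check (see the patcher hunk). A minimal sketch of enabling it with this release line's configuration API; the service name is illustrative:

Datadog.configure do |c|
  # Enables job tracing and, on compatible Sidekiq versions, the new
  # server-internal spans (heartbeat, job fetch, scheduled push).
  c.use :sidekiq, service_name: 'sidekiq-workers'
end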

data/lib/ddtrace/contrib/sinatra/env.rb
@@ -10,7 +10,8 @@ module Datadog
  module_function

  def datadog_span(env, app)
- env[Ext::RACK_ENV_REQUEST_SPAN][app]
+ request_span = env[Ext::RACK_ENV_REQUEST_SPAN]
+ request_span && request_span[app]
  end

  def set_datadog_span(env, app, span)

data/lib/ddtrace/contrib/sinatra/tracer.rb
@@ -5,7 +5,7 @@ require 'ddtrace/ext/app_types'
  require 'ddtrace/ext/errors'
  require 'ddtrace/ext/http'
  require 'ddtrace/propagation/http_propagator'
-
+ require 'ddtrace/utils/only_once'
  require 'ddtrace/contrib/sinatra/ext'
  require 'ddtrace/contrib/sinatra/tracer_middleware'
  require 'ddtrace/contrib/sinatra/env'
@@ -77,6 +77,9 @@ module Datadog

  # Method overrides for Sinatra::Base
  module Base
+ MISSING_REQUEST_SPAN_ONLY_ONCE = Datadog::Utils::OnlyOnce.new
+ private_constant :MISSING_REQUEST_SPAN_ONLY_ONCE
+
  def render(engine, data, *)
  tracer = Datadog.configuration[:sinatra][:tracer]
  return super unless tracer.enabled
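
Note: `Datadog::Utils::OnlyOnce` guards the warning added in the next hunk so it is logged at most once per process. A usage sketch based on the `#run` interface visible in this diff:

ONLY_ONCE = Datadog::Utils::OnlyOnce.new

3.times do
  ONLY_ONCE.run { puts 'printed a single time' }
end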
@@ -121,8 +124,18 @@ module Datadog
  else
  Sinatra::Env.datadog_span(env, self.class)
  end
- if sinatra_request_span # DEV: Is it possible for sinatra_request_span to ever be nil here?
+ if sinatra_request_span
  sinatra_request_span.resource = span.resource
+ else
+ MISSING_REQUEST_SPAN_ONLY_ONCE.run do
+ Datadog.logger.warn do
+ 'Sinatra integration is misconfigured, reported traces will be missing request metadata ' \
+ 'such as path and HTTP status code. ' \
+ 'Did you forget to add `register Datadog::Contrib::Sinatra::Tracer` to your ' \
+ '`Sinatra::Base` subclass? ' \
+ 'See <https://docs.datadoghq.com/tracing/setup_overview/setup/ruby/#sinatra> for more details.'
+ end
+ end
  end

  Contrib::Analytics.set_measured(span)
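
Note: the registration that the new warning asks for looks like the following sketch for a modular application (the class name is illustrative):

require 'sinatra/base'
require 'ddtrace'

class MyApp < Sinatra::Base
  # Without this, the request span is missing and the warning above is logged once.
  register Datadog::Contrib::Sinatra::Tracer

  get '/' do
    'Hello world!'
  end
end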

data/lib/ddtrace/ext/git.rb
@@ -15,6 +15,18 @@ module Datadog
  TAG_COMMIT_COMMITTER_NAME = 'git.commit.committer.name'.freeze
  TAG_COMMIT_MESSAGE = 'git.commit.message'.freeze
  TAG_COMMIT_SHA = 'git.commit.sha'.freeze
+
+ ENV_REPOSITORY_URL = 'DD_GIT_REPOSITORY_URL'.freeze
+ ENV_COMMIT_SHA = 'DD_GIT_COMMIT_SHA'.freeze
+ ENV_BRANCH = 'DD_GIT_BRANCH'.freeze
+ ENV_TAG = 'DD_GIT_TAG'.freeze
+ ENV_COMMIT_MESSAGE = 'DD_GIT_COMMIT_MESSAGE'.freeze
+ ENV_COMMIT_AUTHOR_NAME = 'DD_GIT_COMMIT_AUTHOR_NAME'.freeze
+ ENV_COMMIT_AUTHOR_EMAIL = 'DD_GIT_COMMIT_AUTHOR_EMAIL'.freeze
+ ENV_COMMIT_AUTHOR_DATE = 'DD_GIT_COMMIT_AUTHOR_DATE'.freeze
+ ENV_COMMIT_COMMITTER_NAME = 'DD_GIT_COMMIT_COMMITTER_NAME'.freeze
+ ENV_COMMIT_COMMITTER_EMAIL = 'DD_GIT_COMMIT_COMMITTER_EMAIL'.freeze
+ ENV_COMMIT_COMMITTER_DATE = 'DD_GIT_COMMIT_COMMITTER_DATE'.freeze
  end
  end
  end
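
Note: these variables appear intended to let users supply git metadata explicitly when it is not auto-detected (the CI environment detection in data/lib/datadog/ci/ext/environment.rb also changed in this release). A sketch of populating them before the test process starts; the values are placeholders:

# e.g. in a CI bootstrap script or spec_helper, before ddtrace loads
ENV['DD_GIT_REPOSITORY_URL'] ||= 'https://github.com/example/repo.git'
ENV['DD_GIT_COMMIT_SHA']     ||= `git rev-parse HEAD`.strip
ENV['DD_GIT_BRANCH']         ||= `git rev-parse --abbrev-ref HEAD`.strip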

data/lib/ddtrace/ext/profiling.rb
@@ -6,6 +6,7 @@ module Datadog
  ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
  ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
  ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
+ ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze

  module Pprof
  LABEL_KEY_SPAN_ID = 'span id'.freeze

data/lib/ddtrace/ext/runtime.rb
@@ -6,6 +6,7 @@ module Datadog
  module Runtime
  TAG_ID = 'runtime-id'.freeze
  TAG_LANG = 'language'.freeze
+ TAG_PID = 'system.pid'.freeze

  # Metrics
  module Metrics
@@ -14,6 +15,8 @@ module Datadog
  METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
  METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
  METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
+ METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
+ METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze

  TAG_SERVICE = 'service'.freeze
  end
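
Note: the two new runtime metrics line up with the new data/lib/datadog/core/environment/vm_cache.rb file in this release. As a sketch of where such counters come from on MRI (key availability varies by Ruby version, so treat the hash access as an assumption):

stat = RubyVM.stat
constant_cache_generation = stat[:global_constant_state] # bumped when constants are (re)defined; absent on newer Rubies
method_cache_generation   = stat[:global_method_state]   # bumped on global method cache invalidation; pre-3.0 MRI only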

data/lib/ddtrace/ext/transport.rb
@@ -3,6 +3,7 @@ module Datadog
  module Ext
  module Transport
  module HTTP
+ ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
  DEFAULT_HOST = '127.0.0.1'.freeze
  DEFAULT_PORT = 8126
  DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
  HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
  HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
  end
+
+ module Test
+ ADAPTER = :test
+ end
+
+ module UnixSocket
+ ADAPTER = :unix
+ DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
+ DEFAULT_TIMEOUT_SECONDS = 1
+ end
  end
  end
  end
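
Note: with the new `:unix` adapter constant and default socket path, the trace transport can be pointed at the agent's APM socket. A sketch using the `transport_options` hook documented for this release line (assuming the agent is listening at the new `DEFAULT_PATH`):

Datadog.configure do |c|
  c.tracer.transport_options = proc do |t|
    # :unix matches Ext::Transport::UnixSocket::ADAPTER above
    t.adapter :unix, '/var/run/datadog/apm.socket'
  end
end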

data/lib/ddtrace/profiling/collectors/stack.rb
@@ -18,6 +18,12 @@ module Datadog
  DEFAULT_MAX_TIME_USAGE_PCT = 2.0
  MIN_INTERVAL = 0.01
  THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+ THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+
+ # This default was picked based on the current sampling performance and on expected concurrency on an average
+ # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+ # optimizes for coverage (less chance to miss what a given thread is doing).
+ DEFAULT_MAX_THREADS_SAMPLED = 16

  attr_reader \
  :recorder,
@@ -33,6 +39,7 @@ module Datadog
  trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
  ignore_thread: nil,
  max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+ max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
  thread_api: Thread,
  fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
  interval: MIN_INTERVAL,
@@ -43,6 +50,7 @@ module Datadog
  @trace_identifiers_helper = trace_identifiers_helper
  @ignore_thread = ignore_thread
  @max_time_usage_pct = max_time_usage_pct
+ @max_threads_sampled = max_threads_sampled
  @thread_api = thread_api

  # Workers::Async::Thread settings
@@ -60,10 +68,13 @@ module Datadog
  @build_backtrace_location = method(:build_backtrace_location).to_proc
  # Cache this buffer, since it's pretty expensive to keep accessing it
  @stack_sample_event_recorder = recorder[Events::StackSample]
+ # See below for details on why this is needed
+ @needs_process_waiter_workaround =
+ Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+ Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
  end

  def start
- @last_wall_time = Datadog::Utils::Time.get_time
  reset_cpu_time_tracking
  perform
  end
@@ -87,24 +98,14 @@ module Datadog

  def collect_events
  events = []
-
- # Compute wall time interval
- current_wall_time = Datadog::Utils::Time.get_time
- last_wall_time = if instance_variable_defined?(:@last_wall_time)
- @last_wall_time
- else
- current_wall_time
- end
-
- wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
- @last_wall_time = current_wall_time
+ current_wall_time_ns = get_current_wall_time_timestamp_ns

  # Collect backtraces from each thread
- thread_api.list.each do |thread|
+ threads_to_sample.each do |thread|
  next unless thread.alive?
  next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)

- event = collect_thread_event(thread, wall_time_interval_ns)
+ event = collect_thread_event(thread, current_wall_time_ns)
  events << event unless event.nil?
  end

@@ -114,7 +115,7 @@ module Datadog
  events
  end

- def collect_thread_event(thread, wall_time_interval_ns)
+ def collect_thread_event(thread, current_wall_time_ns)
  locations = thread.backtrace_locations
  return if locations.nil?

@@ -126,8 +127,10 @@ module Datadog
  locations = convert_backtrace_locations(locations)

  thread_id = thread.respond_to?(:pthread_thread_id) ? thread.pthread_thread_id : thread.object_id
- trace_id, span_id, trace_resource_container = trace_identifiers_helper.trace_identifiers_for(thread)
+ trace_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
  cpu_time = get_cpu_time_interval!(thread)
+ wall_time_interval_ns =
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)

  Events::StackSample.new(
  nil,
@@ -136,7 +139,7 @@ module Datadog
  thread_id,
  trace_id,
  span_id,
- trace_resource_container,
+ trace_resource,
  cpu_time,
  wall_time_interval_ns
  )
@@ -156,14 +159,7 @@ module Datadog
  # *before* the thread had time to finish the initialization
  return unless current_cpu_time_ns

- last_cpu_time_ns = (thread.thread_variable_get(THREAD_LAST_CPU_TIME_KEY) || current_cpu_time_ns)
- interval = current_cpu_time_ns - last_cpu_time_ns
-
- # Update CPU time for thread
- thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-
- # Return interval
- interval
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
  end

  def compute_wait_time(used_time)
@@ -237,10 +233,10 @@ module Datadog
  end

  # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
- # clean up any leftover per-thread cpu time counters, so that the first sample after starting doesn't end up with:
+ # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
  #
  # a) negative time: At least on my test docker container, and on the reliability environment, after the process
- # forks, the clock reference changes and (old cpu time - new cpu time) can be < 0
+ # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
  #
  # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
  # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +244,57 @@ module Datadog
  # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
  def reset_cpu_time_tracking
  thread_api.list.each do |thread|
+ # See below for details on why this is needed
+ next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
  thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+ thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
+ end
+ end
+
+ def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+ # See cthread.rb for more details, but this is a workaround for https://bugs.ruby-lang.org/issues/17807 ;
+ # using all thread_variable related methods on these instances also triggers a crash and for now we just
+ # skip it for the affected Rubies
+ return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+ last_value = thread.thread_variable_get(key) || current_value
+ thread.thread_variable_set(key, current_value)
+
+ current_value - last_value
+ end
+
+ # Whenever there are more than max_threads_sampled active, we only sample a subset of them.
+ # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+ # a big burst of work all at once (sample everything), and instead do a little work each time
+ # (sample a bit by bit).
+ #
+ # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+ # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+ # them more often, if they are slower, we take them less often -- which again means that over a longer period
+ # we should take sample roughly the same samples.
+ #
+ # One downside of this approach is that if there really are many threads, the resulting wall clock times
+ # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+ # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+ # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+ # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+ # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+ # threads).
+ #
+ def threads_to_sample
+ all_threads = thread_api.list
+
+ if all_threads.size > @max_threads_sampled
+ all_threads.sample(@max_threads_sampled)
+ else
+ all_threads
  end
  end
+
+ def get_current_wall_time_timestamp_ns
+ Datadog::Utils::Time.get_time(:nanosecond)
+ end
  end
  end
  end

data/lib/ddtrace/profiling/encoding/profile.rb
@@ -24,8 +24,16 @@ module Datadog
  flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }

  Datadog.logger.debug do
+ max_events = Datadog.configuration.profiling.advanced.max_events
+ events_sampled =
+ if flush.event_count == max_events
+ 'max events limit hit, events were sampled [profile will be biased], '
+ else
+ ''
+ end
+
  "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
- "events: #{flush.event_count} (#{template.debug_statistics})"
+ "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
  end

  # Build the profile and encode it

data/lib/ddtrace/profiling/events/stack.rb
@@ -13,7 +13,7 @@ module Datadog
  :thread_id,
  :trace_id,
  :span_id,
- :trace_resource_container
+ :trace_resource

  def initialize(
  timestamp,
@@ -22,7 +22,7 @@ module Datadog
  thread_id,
  trace_id,
  span_id,
- trace_resource_container
+ trace_resource
  )
  super(timestamp)

@@ -31,14 +31,14 @@ module Datadog
  @thread_id = thread_id
  @trace_id = trace_id
  @span_id = span_id
- @trace_resource_container = trace_resource_container
+ @trace_resource = trace_resource

  @hash = [
  thread_id,
  trace_id,
  span_id,
- # trace_resource_container is deliberately not included -- events that share the same (trace_id, span_id)
- # pair should also have the same trace_resource_container
+ # trace_resource is deliberately not included -- events that share the same (trace_id, span_id)
+ # trace_resource might not match between pairs, but they refer to the same trace.
  frames.collect(&:hash),
  total_frame_count
  ].hash
@@ -58,7 +58,7 @@ module Datadog
  thread_id,
  trace_id,
  span_id,
- trace_resource_container,
+ trace_resource,
  cpu_time_interval_ns,
  wall_time_interval_ns
  )
@@ -69,7 +69,7 @@ module Datadog
  thread_id,
  trace_id,
  span_id,
- trace_resource_container
+ trace_resource
  )

  @cpu_time_interval_ns = cpu_time_interval_ns

data/lib/ddtrace/profiling/pprof/converter.rb
@@ -25,20 +25,19 @@ module Datadog
  # [key, EventGroup]
  event_groups = {}

+ # Aggregate each event into a group
+ # with identical properties, but different values.
  events.each do |event|
  key = yield(event)
- values = build_sample_values(event)
+ values = build_event_values(event)

  unless key.nil?
  if event_groups.key?(key)
- # Update values for group
- group_values = event_groups[key].values
- group_values.each_with_index do |group_value, i|
- group_values[i] = group_value + values[i]
- end
+ # Update existing group from event
+ update_group(event_groups[key], event, values)
  else
  # Add new group
- event_groups[key] = EventGroup.new(event, values)
+ event_groups[key] = new_group(event, values)
  end
  end
  end
@@ -57,7 +56,7 @@ module Datadog
  index
  end

- def build_sample_values(stack_sample)
+ def build_event_values(event)
  # Build a value array that matches the length of the sample types
  # Populate all values with "no value" by default
  Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog

  # Represents a grouped event
  # 'sample' is an example event object from the group.
- # 'values' is the the summation of the group's sample values
+ # 'values' is the summation of the group's sample values
  EventGroup = Struct.new(:sample, :values)

  # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
  "Mapping for sample value type '#{type}' to index is unknown."
  end
  end
+
+ protected
+
+ def new_group(event, values)
+ EventGroup.new(event, values)
+ end
+
+ def update_group(event_group, event, values)
+ # Update values for group
+ group_values = event_group.values
+ group_values.each_with_index do |group_value, i|
+ group_values[i] = group_value + values[i]
+ end
+ end
  end
  end
  end

data/lib/ddtrace/profiling/pprof/stack_sample.rb
@@ -30,6 +30,7 @@ module Datadog
  def initialize(*_)
  super

+ @most_recent_trace_samples = {}
  @processed_unique_stacks = 0
  @processed_with_trace_ids = 0
  end
@@ -40,9 +41,28 @@ module Datadog
  end

  def stack_sample_group_key(stack_sample)
+ # We want to make sure we have the most recent sample for any trace.
+ # (This is done here to save an iteration over all samples.)
+ update_most_recent_trace_sample(stack_sample)
+
  stack_sample.hash
  end

+ # Track the most recent sample for each trace
+ def update_most_recent_trace_sample(stack_sample)
+ return unless stack_sample.trace_id && stack_sample.trace_resource
+
+ # Update trace resource with most recent value
+ if (most_recent_trace_sample = @most_recent_trace_samples[stack_sample.trace_id])
+ if most_recent_trace_sample.timestamp < stack_sample.timestamp
+ @most_recent_trace_samples[stack_sample.trace_id] = stack_sample
+ end
+ else
+ # Add trace resource
+ @most_recent_trace_samples[stack_sample.trace_id] = stack_sample
+ end
+ end
+
  def build_samples(stack_samples)
  groups = group_events(stack_samples, &method(:stack_sample_group_key))
  groups.collect do |_group_key, group|
@@ -64,7 +84,7 @@ module Datadog
  )
  end

- def build_sample_values(stack_sample)
+ def build_event_values(stack_sample)
  no_value = Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE
  values = super(stack_sample)
  values[sample_value_index(:cpu_time_ns)] = stack_sample.cpu_time_interval_ns || no_value
@@ -96,7 +116,13 @@ module Datadog
  str: builder.string_table.fetch(span_id.to_s)
  )

- trace_resource = stack_sample.trace_resource_container && stack_sample.trace_resource_container.latest
+ # Use most up-to-date trace resource, if available.
+ # Otherwise, use the trace resource provided.
+ trace_resource = (
+ @most_recent_trace_samples[stack_sample.trace_id] \
+ || stack_sample
+ ).trace_resource
+
  if trace_resource && !trace_resource.empty?
  labels << Perftools::Profiles::Label.new(
  key: builder.string_table.fetch(Datadog::Ext::Profiling::Pprof::LABEL_KEY_TRACE_ENDPOINT),

data/lib/ddtrace/profiling/tasks/setup.rb
@@ -1,5 +1,4 @@
  # typed: false
- require 'ddtrace'
  require 'ddtrace/utils/only_once'
  require 'ddtrace/profiling'
  require 'ddtrace/profiling/ext/cpu'

data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb
@@ -33,7 +33,7 @@ module Datadog
  def maybe_extract_resource(root_span)
  return unless root_span

- root_span.resource_container if root_span.span_type == Datadog::Ext::HTTP::TYPE_INBOUND
+ root_span.resource if root_span.span_type == Datadog::Ext::HTTP::TYPE_INBOUND
  end
  end
  end

data/lib/ddtrace/profiling/trace_identifiers/helper.rb
@@ -20,10 +20,10 @@ module Datadog
  def initialize(
  tracer:,
  # If this is disabled, the helper will strip the optional trace_resource_container even if provided by the api
- extract_trace_resource:,
+ endpoint_collection_enabled:,
  supported_apis: DEFAULT_SUPPORTED_APIS.map { |api| api.new(tracer: tracer) }
  )
- @extract_trace_resource = extract_trace_resource
+ @endpoint_collection_enabled = endpoint_collection_enabled
  @supported_apis = supported_apis
  end

@@ -34,7 +34,7 @@ module Datadog
  trace_identifiers = api.trace_identifiers_for(thread)

  if trace_identifiers
- return @endpoint_collection_enabled ? trace_identifiers : trace_identifiers[0..1]
  end
  end
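
Note: the renamed `endpoint_collection_enabled` flag above pairs with the new `DD_PROFILING_ENDPOINT_COLLECTION_ENABLED` variable added in data/lib/ddtrace/ext/profiling.rb. A sketch of turning the feature off, assuming the variable is read as a boolean when the profiler is configured:

# Stop attaching the root span resource (endpoint name) to profiling samples.
ENV['DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'] = 'false'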