ddtrace 0.52.0 → 0.54.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +174 -11
  3. data/ddtrace.gemspec +6 -3
  4. data/docs/DevelopmentGuide.md +1 -6
  5. data/docs/GettingStarted.md +109 -18
  6. data/docs/ProfilingDevelopment.md +2 -2
  7. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +86 -0
  8. data/ext/ddtrace_profiling_native_extension/clock_id.h +4 -0
  9. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +52 -0
  10. data/ext/ddtrace_profiling_native_extension/clock_id_noop.c +14 -0
  11. data/ext/ddtrace_profiling_native_extension/extconf.rb +177 -8
  12. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +35 -0
  13. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
  14. data/ext/ddtrace_profiling_native_extension/profiling.c +6 -1
  15. data/lib/datadog/ci/contrib/cucumber/formatter.rb +1 -0
  16. data/lib/datadog/ci/contrib/rspec/example.rb +1 -0
  17. data/lib/datadog/ci/contrib/rspec/integration.rb +2 -2
  18. data/lib/datadog/ci/ext/environment.rb +64 -22
  19. data/lib/datadog/ci/ext/test.rb +1 -0
  20. data/lib/datadog/ci/test.rb +5 -1
  21. data/lib/datadog/contrib.rb +2 -0
  22. data/lib/datadog/core/environment/vm_cache.rb +46 -0
  23. data/lib/ddtrace/buffer.rb +28 -16
  24. data/lib/ddtrace/configuration/agent_settings_resolver.rb +131 -53
  25. data/lib/ddtrace/configuration/components.rb +1 -1
  26. data/lib/ddtrace/configuration/settings.rb +13 -3
  27. data/lib/ddtrace/context.rb +10 -2
  28. data/lib/ddtrace/contrib/action_cable/instrumentation.rb +46 -0
  29. data/lib/ddtrace/contrib/action_cable/patcher.rb +1 -0
  30. data/lib/ddtrace/contrib/action_mailer/configuration/settings.rb +32 -0
  31. data/lib/ddtrace/contrib/action_mailer/event.rb +50 -0
  32. data/lib/ddtrace/contrib/action_mailer/events/deliver.rb +54 -0
  33. data/lib/ddtrace/contrib/action_mailer/events/process.rb +41 -0
  34. data/lib/ddtrace/contrib/action_mailer/events.rb +31 -0
  35. data/lib/ddtrace/contrib/action_mailer/ext.rb +32 -0
  36. data/lib/ddtrace/contrib/action_mailer/integration.rb +45 -0
  37. data/lib/ddtrace/contrib/action_mailer/patcher.rb +27 -0
  38. data/lib/ddtrace/contrib/active_job/configuration/settings.rb +33 -0
  39. data/lib/ddtrace/contrib/active_job/event.rb +54 -0
  40. data/lib/ddtrace/contrib/active_job/events/discard.rb +46 -0
  41. data/lib/ddtrace/contrib/active_job/events/enqueue.rb +45 -0
  42. data/lib/ddtrace/contrib/active_job/events/enqueue_at.rb +45 -0
  43. data/lib/ddtrace/contrib/active_job/events/enqueue_retry.rb +47 -0
  44. data/lib/ddtrace/contrib/active_job/events/perform.rb +45 -0
  45. data/lib/ddtrace/contrib/active_job/events/retry_stopped.rb +46 -0
  46. data/lib/ddtrace/contrib/active_job/events.rb +39 -0
  47. data/lib/ddtrace/contrib/active_job/ext.rb +32 -0
  48. data/lib/ddtrace/contrib/active_job/integration.rb +46 -0
  49. data/lib/ddtrace/contrib/active_job/log_injection.rb +21 -0
  50. data/lib/ddtrace/contrib/active_job/patcher.rb +33 -0
  51. data/lib/ddtrace/contrib/auto_instrument.rb +0 -1
  52. data/lib/ddtrace/contrib/delayed_job/plugin.rb +2 -2
  53. data/lib/ddtrace/contrib/mongodb/instrumentation.rb +1 -1
  54. data/lib/ddtrace/contrib/mongodb/integration.rb +5 -0
  55. data/lib/ddtrace/contrib/rails/auto_instrument_railtie.rb +0 -1
  56. data/lib/ddtrace/contrib/rails/configuration/settings.rb +7 -0
  57. data/lib/ddtrace/contrib/rails/framework.rb +24 -1
  58. data/lib/ddtrace/contrib/rails/patcher.rb +19 -10
  59. data/lib/ddtrace/contrib/redis/instrumentation.rb +90 -0
  60. data/lib/ddtrace/contrib/redis/patcher.rb +2 -84
  61. data/lib/ddtrace/contrib/registerable.rb +0 -1
  62. data/lib/ddtrace/contrib/resque/integration.rb +1 -5
  63. data/lib/ddtrace/contrib/sidekiq/ext.rb +3 -0
  64. data/lib/ddtrace/contrib/sidekiq/integration.rb +10 -0
  65. data/lib/ddtrace/contrib/sidekiq/patcher.rb +26 -0
  66. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/heartbeat.rb +30 -0
  67. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/job_fetch.rb +30 -0
  68. data/lib/ddtrace/contrib/sidekiq/server_internal_tracer/scheduled_push.rb +29 -0
  69. data/lib/ddtrace/contrib/sinatra/env.rb +2 -1
  70. data/lib/ddtrace/contrib/sinatra/tracer.rb +15 -2
  71. data/lib/ddtrace/ext/git.rb +12 -0
  72. data/lib/ddtrace/ext/priority.rb +6 -4
  73. data/lib/ddtrace/ext/profiling.rb +8 -11
  74. data/lib/ddtrace/ext/runtime.rb +3 -0
  75. data/lib/ddtrace/ext/transport.rb +11 -0
  76. data/lib/ddtrace/metrics.rb +2 -2
  77. data/lib/ddtrace/profiling/collectors/stack.rb +112 -72
  78. data/lib/ddtrace/profiling/encoding/profile.rb +10 -2
  79. data/lib/ddtrace/profiling/events/stack.rb +13 -13
  80. data/lib/ddtrace/profiling/native_extension.rb +23 -1
  81. data/lib/ddtrace/profiling/pprof/builder.rb +8 -2
  82. data/lib/ddtrace/profiling/pprof/converter.rb +22 -9
  83. data/lib/ddtrace/profiling/pprof/stack_sample.rb +32 -9
  84. data/lib/ddtrace/profiling/pprof/template.rb +2 -2
  85. data/lib/ddtrace/profiling/scheduler.rb +20 -4
  86. data/lib/ddtrace/profiling/tasks/setup.rb +21 -13
  87. data/lib/ddtrace/profiling/trace_identifiers/ddtrace.rb +10 -9
  88. data/lib/ddtrace/profiling/trace_identifiers/helper.rb +5 -5
  89. data/lib/ddtrace/profiling/transport/http/api/endpoint.rb +8 -15
  90. data/lib/ddtrace/profiling/transport/http.rb +8 -17
  91. data/lib/ddtrace/profiling.rb +0 -2
  92. data/lib/ddtrace/runtime/metrics.rb +14 -0
  93. data/lib/ddtrace/sampler.rb +18 -8
  94. data/lib/ddtrace/sampling/rule_sampler.rb +13 -1
  95. data/lib/ddtrace/span.rb +7 -19
  96. data/lib/ddtrace/tracer.rb +1 -1
  97. data/lib/ddtrace/transport/http/adapters/net.rb +13 -3
  98. data/lib/ddtrace/transport/http/adapters/test.rb +4 -2
  99. data/lib/ddtrace/transport/http/adapters/unix_socket.rb +23 -12
  100. data/lib/ddtrace/transport/http/builder.rb +13 -6
  101. data/lib/ddtrace/transport/http.rb +5 -11
  102. data/lib/ddtrace/utils/time.rb +11 -6
  103. data/lib/ddtrace/version.rb +2 -2
  104. data/lib/ddtrace/workers/{loop.rb → interval_loop.rb} +0 -16
  105. data/lib/ddtrace/workers/polling.rb +1 -1
  106. metadata +40 -10
  107. data/lib/ddtrace/profiling/ext/cpu.rb +0 -67
  108. data/lib/ddtrace/profiling/ext/cthread.rb +0 -156
@@ -4,13 +4,15 @@ module Datadog
4
4
  # Priority is a hint given to the backend so that it knows which traces to reject or keep.
5
5
  # In a distributed context, it should be set before any context propagation (fork, RPC calls) to be effective.
6
6
  module Priority
7
- # Use this to explicitely inform the backend that a trace should be rejected and not stored.
7
+ # Use this to explicitly inform the backend that a trace MUST be rejected and not stored.
8
+ # This includes rules and rate limits configured by the user through the {RuleSampler}.
8
9
  USER_REJECT = -1
9
- # Used by the builtin sampler to inform the backend that a trace should be rejected and not stored.
10
+ # Used by the {PrioritySampler} to inform the backend that a trace should be rejected and not stored.
10
11
  AUTO_REJECT = 0
11
- # Used by the builtin sampler to inform the backend that a trace should be kept and stored.
12
+ # Used by the {PrioritySampler} to inform the backend that a trace should be kept and stored.
12
13
  AUTO_KEEP = 1
13
- # Use this to explicitely inform the backend that a trace should be kept and stored.
14
+ # Use this to explicitly inform the backend that a trace MUST be kept and stored.
15
+ # This includes rules and rate limits configured by the user through the {RuleSampler}.
14
16
  USER_KEEP = 2
15
17
  end
16
18
  end
@@ -6,11 +6,12 @@ module Datadog
6
6
  ENV_UPLOAD_TIMEOUT = 'DD_PROFILING_UPLOAD_TIMEOUT'.freeze
7
7
  ENV_MAX_FRAMES = 'DD_PROFILING_MAX_FRAMES'.freeze
8
8
  ENV_AGENTLESS = 'DD_PROFILING_AGENTLESS'.freeze
9
+ ENV_ENDPOINT_COLLECTION_ENABLED = 'DD_PROFILING_ENDPOINT_COLLECTION_ENABLED'.freeze
9
10
 
10
11
  module Pprof
12
+ LABEL_KEY_LOCAL_ROOT_SPAN_ID = 'local root span id'.freeze
11
13
  LABEL_KEY_SPAN_ID = 'span id'.freeze
12
14
  LABEL_KEY_THREAD_ID = 'thread id'.freeze
13
- LABEL_KEY_TRACE_ID = 'trace id'.freeze
14
15
  LABEL_KEY_TRACE_ENDPOINT = 'trace endpoint'.freeze
15
16
  SAMPLE_VALUE_NO_VALUE = 0
16
17
  VALUE_TYPE_CPU = 'cpu-time'.freeze
@@ -22,13 +23,9 @@ module Datadog
22
23
  module HTTP
23
24
  URI_TEMPLATE_DD_API = 'https://intake.profile.%s/'.freeze
24
25
 
25
- FORM_FIELD_DATA = 'data[0]'.freeze
26
- FORM_FIELD_FORMAT = 'format'.freeze
27
- FORM_FIELD_FORMAT_PPROF = 'pprof'.freeze
28
- FORM_FIELD_RECORDING_END = 'recording-end'.freeze
29
- FORM_FIELD_RECORDING_START = 'recording-start'.freeze
30
- FORM_FIELD_RUNTIME = 'runtime'.freeze
31
- FORM_FIELD_RUNTIME_ID = 'runtime-id'.freeze
26
+ FORM_FIELD_RECORDING_START = 'start'.freeze
27
+ FORM_FIELD_RECORDING_END = 'end'.freeze
28
+ FORM_FIELD_FAMILY = 'family'.freeze
32
29
  FORM_FIELD_TAG_ENV = 'env'.freeze
33
30
  FORM_FIELD_TAG_HOST = 'host'.freeze
34
31
  FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
@@ -42,13 +39,13 @@ module Datadog
42
39
  FORM_FIELD_TAG_SERVICE = 'service'.freeze
43
40
  FORM_FIELD_TAG_VERSION = 'version'.freeze
44
41
  FORM_FIELD_TAGS = 'tags'.freeze
45
- FORM_FIELD_TYPES = 'types[0]'.freeze
46
- FORM_FIELD_TYPES_AUTO = 'auto'.freeze
42
+ FORM_FIELD_INTAKE_VERSION = 'version'.freeze
47
43
 
48
44
  HEADER_CONTENT_TYPE = 'Content-Type'.freeze
49
45
  HEADER_CONTENT_TYPE_OCTET_STREAM = 'application/octet-stream'.freeze
50
46
 
51
- PPROF_DEFAULT_FILENAME = 'profile.pb.gz'.freeze
47
+ FORM_FIELD_PPROF_DATA = 'data[rubyprofile.pprof]'.freeze
48
+ PPROF_DEFAULT_FILENAME = 'rubyprofile.pprof.gz'.freeze
52
49
  end
53
50
  end
54
51
  end
@@ -6,6 +6,7 @@ module Datadog
6
6
  module Runtime
7
7
  TAG_ID = 'runtime-id'.freeze
8
8
  TAG_LANG = 'language'.freeze
9
+ TAG_PID = 'system.pid'.freeze
9
10
 
10
11
  # Metrics
11
12
  module Metrics
@@ -14,6 +15,8 @@ module Datadog
14
15
  METRIC_CLASS_COUNT = 'runtime.ruby.class_count'.freeze
15
16
  METRIC_GC_PREFIX = 'runtime.ruby.gc'.freeze
16
17
  METRIC_THREAD_COUNT = 'runtime.ruby.thread_count'.freeze
18
+ METRIC_GLOBAL_CONSTANT_STATE = 'runtime.ruby.global_constant_state'.freeze
19
+ METRIC_GLOBAL_METHOD_STATE = 'runtime.ruby.global_method_state'.freeze
17
20
 
18
21
  TAG_SERVICE = 'service'.freeze
19
22
  end
@@ -3,6 +3,7 @@ module Datadog
3
3
  module Ext
4
4
  module Transport
5
5
  module HTTP
6
+ ADAPTER = :net_http # DEV: Rename to simply `:http`, as Net::HTTP is an implementation detail.
6
7
  DEFAULT_HOST = '127.0.0.1'.freeze
7
8
  DEFAULT_PORT = 8126
8
9
  DEFAULT_TIMEOUT_SECONDS = 1
@@ -16,6 +17,16 @@ module Datadog
16
17
  HEADER_META_LANG_INTERPRETER = 'Datadog-Meta-Lang-Interpreter'.freeze
17
18
  HEADER_META_TRACER_VERSION = 'Datadog-Meta-Tracer-Version'.freeze
18
19
  end
20
+
21
+ module Test
22
+ ADAPTER = :test
23
+ end
24
+
25
+ module UnixSocket
26
+ ADAPTER = :unix
27
+ DEFAULT_PATH = '/var/run/datadog/apm.socket'.freeze
28
+ DEFAULT_TIMEOUT_SECONDS = 1
29
+ end
19
30
  end
20
31
  end
21
32
  end
@@ -31,7 +31,7 @@ module Datadog
31
31
  !version.nil? && version >= Gem::Version.new('3.3.0') &&
32
32
  # dogstatsd-ruby >= 5.0 & < 5.3.0 has known issues with process forks
33
33
  # and does not support the single thread mode we use to avoid this problem.
34
- !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.2'))
34
+ !(version >= Gem::Version.new('5.0') && version < Gem::Version.new('5.3'))
35
35
  end
36
36
 
37
37
  def enabled?
@@ -274,7 +274,7 @@ module Datadog
274
274
  IGNORED_STATSD_ONLY_ONCE.run do
275
275
  Datadog.logger.warn(
276
276
  'Ignoring user-supplied statsd instance as currently-installed version of dogstastd-ruby is incompatible. ' \
277
- "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.2'` on your Gemfile or gems.rb file."
277
+ "To fix this, ensure that you have `gem 'dogstatsd-ruby', '~> 5.3'` on your Gemfile or gems.rb file."
278
278
  )
279
279
  end
280
280
  end
@@ -1,4 +1,6 @@
1
1
  # typed: true
2
+
3
+ require 'ddtrace/profiling/native_extension'
2
4
  require 'ddtrace/profiling/backtrace_location'
3
5
  require 'ddtrace/profiling/events/stack'
4
6
  require 'ddtrace/utils/only_once'
@@ -18,6 +20,13 @@ module Datadog
18
20
  DEFAULT_MAX_TIME_USAGE_PCT = 2.0
19
21
  MIN_INTERVAL = 0.01
20
22
  THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
23
+ THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
24
+ SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
25
+
26
+ # This default was picked based on the current sampling performance and on expected concurrency on an average
27
+ # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
28
+ # optimizes for coverage (less chance to miss what a given thread is doing).
29
+ DEFAULT_MAX_THREADS_SAMPLED = 16
21
30
 
22
31
  attr_reader \
23
32
  :recorder,
@@ -25,7 +34,8 @@ module Datadog
25
34
  :trace_identifiers_helper,
26
35
  :ignore_thread,
27
36
  :max_time_usage_pct,
28
- :thread_api
37
+ :thread_api,
38
+ :cpu_time_provider
29
39
 
30
40
  def initialize(
31
41
  recorder,
@@ -33,7 +43,9 @@ module Datadog
33
43
  trace_identifiers_helper:, # Usually an instance of Datadog::Profiling::TraceIdentifiers::Helper
34
44
  ignore_thread: nil,
35
45
  max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
46
+ max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
36
47
  thread_api: Thread,
48
+ cpu_time_provider: Datadog::Profiling::NativeExtension,
37
49
  fork_policy: Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
38
50
  interval: MIN_INTERVAL,
39
51
  enabled: true
@@ -43,7 +55,10 @@ module Datadog
43
55
  @trace_identifiers_helper = trace_identifiers_helper
44
56
  @ignore_thread = ignore_thread
45
57
  @max_time_usage_pct = max_time_usage_pct
58
+ @max_threads_sampled = max_threads_sampled
46
59
  @thread_api = thread_api
60
+ # Only set the provider if it's able to work in the current Ruby/OS combo
61
+ @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?
47
62
 
48
63
  # Workers::Async::Thread settings
49
64
  self.fork_policy = fork_policy
@@ -54,16 +69,17 @@ module Datadog
54
69
  # Workers::Polling settings
55
70
  self.enabled = enabled
56
71
 
57
- @warn_about_missing_cpu_time_instrumentation_only_once = Datadog::Utils::OnlyOnce.new
58
-
59
72
  # Cache this proc, since it's pretty expensive to keep recreating it
60
73
  @build_backtrace_location = method(:build_backtrace_location).to_proc
61
74
  # Cache this buffer, since it's pretty expensive to keep accessing it
62
75
  @stack_sample_event_recorder = recorder[Events::StackSample]
76
+ # See below for details on why this is needed
77
+ @needs_process_waiter_workaround =
78
+ Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
79
+ Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
63
80
  end
64
81
 
65
82
  def start
66
- @last_wall_time = Datadog::Utils::Time.get_time
67
83
  reset_cpu_time_tracking
68
84
  perform
69
85
  end
@@ -72,10 +88,6 @@ module Datadog
72
88
  collect_and_wait
73
89
  end
74
90
 
75
- def loop_back_off?
76
- false
77
- end
78
-
79
91
  def collect_and_wait
80
92
  run_time = Datadog::Utils::Time.measure do
81
93
  collect_events
@@ -87,24 +99,14 @@ module Datadog
87
99
 
88
100
  def collect_events
89
101
  events = []
90
-
91
- # Compute wall time interval
92
- current_wall_time = Datadog::Utils::Time.get_time
93
- last_wall_time = if instance_variable_defined?(:@last_wall_time)
94
- @last_wall_time
95
- else
96
- current_wall_time
97
- end
98
-
99
- wall_time_interval_ns = ((current_wall_time - last_wall_time).round(9) * 1e9).to_i
100
- @last_wall_time = current_wall_time
102
+ current_wall_time_ns = get_current_wall_time_timestamp_ns
101
103
 
102
104
  # Collect backtraces from each thread
103
- thread_api.list.each do |thread|
105
+ threads_to_sample.each do |thread|
104
106
  next unless thread.alive?
105
107
  next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
106
108
 
107
- event = collect_thread_event(thread, wall_time_interval_ns)
109
+ event = collect_thread_event(thread, current_wall_time_ns)
108
110
  events << event unless event.nil?
109
111
  end
110
112
 
@@ -114,10 +116,30 @@ module Datadog
114
116
  events
115
117
  end
116
118
 
117
- def collect_thread_event(thread, wall_time_interval_ns)
119
+ def collect_thread_event(thread, current_wall_time_ns)
118
120
  locations = thread.backtrace_locations
119
121
  return if locations.nil?
120
122
 
123
+ # Having empty locations means that the thread is alive, but we don't know what it's doing:
124
+ #
125
+ # 1. It can be starting up
126
+ # ```
127
+ # > Thread.new { sleep }.backtrace
128
+ # => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
129
+ # ```
130
+ # 2. It can be running native code
131
+ # ```
132
+ # > t = Process.detach(fork { sleep })
133
+ # => #<Process::Waiter:0x00007ffe7285f7a0 run>
134
+ # > t.backtrace
135
+ # => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
136
+ # ```
137
+ # This effect has been observed in threads created by the Iodine web server and the ffi gem
138
+ #
139
+ # To give customers visibility into these threads, we replace the empty stack with one containing a
140
+ # synthetic placeholder frame, so that these threads are properly represented in the UX.
141
+ locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
142
+
121
143
  # Get actual stack size then trim the stack
122
144
  stack_size = locations.length
123
145
  locations = locations[0..(max_frames - 1)]
@@ -125,45 +147,33 @@ module Datadog
125
147
  # Convert backtrace locations into structs
126
148
  locations = convert_backtrace_locations(locations)
127
149
 
128
- thread_id = thread.respond_to?(:pthread_thread_id) ? thread.pthread_thread_id : thread.object_id
129
- trace_id, span_id, trace_resource_container = trace_identifiers_helper.trace_identifiers_for(thread)
150
+ thread_id = thread.object_id
151
+ root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
130
152
  cpu_time = get_cpu_time_interval!(thread)
153
+ wall_time_interval_ns =
154
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
131
155
 
132
156
  Events::StackSample.new(
133
157
  nil,
134
158
  locations,
135
159
  stack_size,
136
160
  thread_id,
137
- trace_id,
161
+ root_span_id,
138
162
  span_id,
139
- trace_resource_container,
163
+ trace_resource,
140
164
  cpu_time,
141
165
  wall_time_interval_ns
142
166
  )
143
167
  end
144
168
 
145
169
  def get_cpu_time_interval!(thread)
146
- # Return if we can't get the current CPU time
147
- unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed?
148
- warn_about_missing_cpu_time_instrumentation(thread)
149
- return
150
- end
170
+ return unless cpu_time_provider
151
171
 
152
- current_cpu_time_ns = thread.cpu_time(:nanosecond)
172
+ current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)
153
173
 
154
- # NOTE: This can still be nil even when all of the checks above passed because of a race: there's a bit of
155
- # initialization that needs to be done by the thread itself, and it's possible for us to try to sample
156
- # *before* the thread had time to finish the initialization
157
174
  return unless current_cpu_time_ns
158
175
 
159
- last_cpu_time_ns = (thread.thread_variable_get(THREAD_LAST_CPU_TIME_KEY) || current_cpu_time_ns)
160
- interval = current_cpu_time_ns - last_cpu_time_ns
161
-
162
- # Update CPU time for thread
163
- thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
164
-
165
- # Return interval
166
- interval
176
+ get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
167
177
  end
168
178
 
169
179
  def compute_wait_time(used_time)
@@ -209,38 +219,11 @@ module Datadog
209
219
 
210
220
  private
211
221
 
212
- def warn_about_missing_cpu_time_instrumentation(thread)
213
- @warn_about_missing_cpu_time_instrumentation_only_once.run do
214
- # Is the profiler thread instrumented? If it is, then we know instrumentation is available, but seems to be
215
- # missing on this thread we just found.
216
- #
217
- # As far as we know, it can be missing due to one the following:
218
- #
219
- # a) The thread was started before we installed our instrumentation.
220
- # In this case, the fix is to make sure ddtrace gets loaded before any other parts of the application.
221
- #
222
- # b) The thread was started using the Ruby native APIs (e.g. from a C extension such as ffi).
223
- # Known cases right now that trigger this are the ethon/typhoeus gems.
224
- # We currently have no solution for this case; these threads will always be missing our CPU instrumentation.
225
- #
226
- # c) The thread was started with `Thread.start`/`Thread.fork` and hasn't yet enabled the instrumentation.
227
- # When threads are started using these APIs, there's a small time window during which the thread has started
228
- # but our code to apply the instrumentation hasn't run yet; in these cases it's just a matter of allowing
229
- # it to run and our instrumentation to be applied.
230
- #
231
- if thread_api.current.respond_to?(:cpu_time) && thread_api.current.cpu_time
232
- Datadog.logger.debug(
233
- "Thread ('#{thread}') is missing profiling instrumentation; other threads should be unaffected"
234
- )
235
- end
236
- end
237
- end
238
-
239
222
  # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
240
- # clean up any leftover per-thread cpu time counters, so that the first sample after starting doesn't end up with:
223
+ # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
241
224
  #
242
225
  # a) negative time: At least on my test docker container, and on the reliability environment, after the process
243
- # forks, the clock reference changes and (old cpu time - new cpu time) can be < 0
226
+ # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
244
227
  #
245
228
  # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
246
229
  # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
@@ -248,9 +231,66 @@ module Datadog
248
231
  # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
249
232
  def reset_cpu_time_tracking
250
233
  thread_api.list.each do |thread|
234
+ # See below for details on why this is needed
235
+ next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
236
+
251
237
  thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
238
+ thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
252
239
  end
253
240
  end
241
+
242
+ def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
243
+ # Process::Waiter crash workaround:
244
+ #
245
+ # This is a workaround for a Ruby VM segfault (usually something like
246
+ # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
247
+ # See https://bugs.ruby-lang.org/issues/17807 for details.
248
+ #
249
+ # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
250
+ # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
251
+ # shows up when the `Process.detach` API gets used.
252
+ # In the specs you'll find crash regression tests that include a way of reproducing it.
253
+ #
254
+ # As a workaround for now, we just skip it for the affected Rubies
255
+ return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
256
+
257
+ last_value = thread.thread_variable_get(key) || current_value
258
+ thread.thread_variable_set(key, current_value)
259
+
260
+ current_value - last_value
261
+ end
262
+
263
+ # Whenever there are more than max_threads_sampled threads active, we only sample a subset of them.
264
+ # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
265
+ # a big burst of work all at once (sample everything), and instead do a little work each time
266
+ # (sample bit by bit).
267
+ #
268
+ # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
269
+ # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
270
+ # them more often, if they are slower, we take them less often -- which again means that over a longer period
271
+ # we should end up taking roughly the same number of samples.
272
+ #
273
+ # One downside of this approach is that if there really are many threads, the resulting wall clock times
274
+ # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
275
+ # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
276
+ # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
277
+ # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
278
+ # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
279
+ # threads).
280
+ #
281
+ def threads_to_sample
282
+ all_threads = thread_api.list
283
+
284
+ if all_threads.size > @max_threads_sampled
285
+ all_threads.sample(@max_threads_sampled)
286
+ else
287
+ all_threads
288
+ end
289
+ end
290
+
291
+ def get_current_wall_time_timestamp_ns
292
+ Datadog::Utils::Time.get_time(:nanosecond)
293
+ end
254
294
  end
255
295
  end
256
296
  end
@@ -24,12 +24,20 @@ module Datadog
24
24
  flush.event_groups.each { |event_group| template.add_events!(event_group.event_class, event_group.events) }
25
25
 
26
26
  Datadog.logger.debug do
27
+ max_events = Datadog.configuration.profiling.advanced.max_events
28
+ events_sampled =
29
+ if flush.event_count == max_events
30
+ 'max events limit hit, events were sampled [profile will be biased], '
31
+ else
32
+ ''
33
+ end
34
+
27
35
  "Encoding profile covering #{flush.start.iso8601} to #{flush.finish.iso8601}, " \
28
- "events: #{flush.event_count} (#{template.debug_statistics})"
36
+ "events: #{flush.event_count} (#{events_sampled}#{template.debug_statistics})"
29
37
  end
30
38
 
31
39
  # Build the profile and encode it
32
- template.to_pprof
40
+ template.to_pprof(start: flush.start, finish: flush.finish)
33
41
  end
34
42
  end
35
43
  end
@@ -11,34 +11,34 @@ module Datadog
11
11
  :frames,
12
12
  :total_frame_count,
13
13
  :thread_id,
14
- :trace_id,
14
+ :root_span_id,
15
15
  :span_id,
16
- :trace_resource_container
16
+ :trace_resource
17
17
 
18
18
  def initialize(
19
19
  timestamp,
20
20
  frames,
21
21
  total_frame_count,
22
22
  thread_id,
23
- trace_id,
23
+ root_span_id,
24
24
  span_id,
25
- trace_resource_container
25
+ trace_resource
26
26
  )
27
27
  super(timestamp)
28
28
 
29
29
  @frames = frames
30
30
  @total_frame_count = total_frame_count
31
31
  @thread_id = thread_id
32
- @trace_id = trace_id
32
+ @root_span_id = root_span_id
33
33
  @span_id = span_id
34
- @trace_resource_container = trace_resource_container
34
+ @trace_resource = trace_resource
35
35
 
36
36
  @hash = [
37
37
  thread_id,
38
- trace_id,
38
+ root_span_id,
39
39
  span_id,
40
- # trace_resource_container is deliberately not included -- events that share the same (trace_id, span_id)
41
- # pair should also have the same trace_resource_container
40
+ # trace_resource is deliberately not included -- events that share the same (root_span_id, span_id) refer
41
+ # to the same trace
42
42
  frames.collect(&:hash),
43
43
  total_frame_count
44
44
  ].hash
@@ -56,9 +56,9 @@ module Datadog
56
56
  frames,
57
57
  total_frame_count,
58
58
  thread_id,
59
- trace_id,
59
+ root_span_id,
60
60
  span_id,
61
- trace_resource_container,
61
+ trace_resource,
62
62
  cpu_time_interval_ns,
63
63
  wall_time_interval_ns
64
64
  )
@@ -67,9 +67,9 @@ module Datadog
67
67
  frames,
68
68
  total_frame_count,
69
69
  thread_id,
70
- trace_id,
70
+ root_span_id,
71
71
  span_id,
72
- trace_resource_container
72
+ trace_resource
73
73
  )
74
74
 
75
75
  @cpu_time_interval_ns = cpu_time_interval_ns
@@ -2,7 +2,8 @@
2
2
  module Datadog
3
3
  module Profiling
4
4
  # This module contains classes and methods which are implemented using native code in the
5
- # ext/ddtrace_profiling_native_extension folder
5
+ # ext/ddtrace_profiling_native_extension folder, as well as some Ruby-level utilities that don't make sense to
6
+ # write using C
6
7
  module NativeExtension
7
8
  private_class_method def self.working?
8
9
  native_working?
@@ -13,6 +14,27 @@ module Datadog
13
14
  false
14
15
  end
15
16
  end
17
+
18
+ unless singleton_class.method_defined?(:clock_id_for)
19
+ def self.clock_id_for(_)
20
+ nil
21
+ end
22
+ end
23
+
24
+ def self.cpu_time_ns_for(thread)
25
+ clock_id =
26
+ begin
27
+ clock_id_for(thread)
28
+ rescue Errno::ESRCH
29
+ nil
30
+ end
31
+
32
+ begin
33
+ ::Process.clock_gettime(clock_id, :nanosecond) if clock_id
34
+ rescue Errno::EINVAL
35
+ nil
36
+ end
37
+ end
16
38
  end
17
39
  end
18
40
  end
@@ -4,6 +4,7 @@
4
4
  require 'ddtrace/profiling/flush'
5
5
  require 'ddtrace/profiling/pprof/message_set'
6
6
  require 'ddtrace/profiling/pprof/string_table'
7
+ require 'ddtrace/utils/time'
7
8
 
8
9
  module Datadog
9
10
  module Profiling
@@ -47,14 +48,19 @@ module Datadog
47
48
  Perftools::Profiles::Profile.encode(profile).force_encoding(DEFAULT_ENCODING)
48
49
  end
49
50
 
50
- def build_profile
51
+ def build_profile(start:, finish:)
52
+ start_ns = Datadog::Utils::Time.as_utc_epoch_ns(start)
53
+ finish_ns = Datadog::Utils::Time.as_utc_epoch_ns(finish)
54
+
51
55
  Perftools::Profiles::Profile.new(
52
56
  sample_type: @sample_types.messages,
53
57
  sample: @samples,
54
58
  mapping: @mappings.messages,
55
59
  location: @locations.values,
56
60
  function: @functions.messages,
57
- string_table: @string_table.strings
61
+ string_table: @string_table.strings,
62
+ time_nanos: start_ns,
63
+ duration_nanos: finish_ns - start_ns,
58
64
  )
59
65
  end
60
66
 
@@ -25,20 +25,19 @@ module Datadog
25
25
  # [key, EventGroup]
26
26
  event_groups = {}
27
27
 
28
+ # Aggregate each event into a group
29
+ # with identical properties, but different values.
28
30
  events.each do |event|
29
31
  key = yield(event)
30
- values = build_sample_values(event)
32
+ values = build_event_values(event)
31
33
 
32
34
  unless key.nil?
33
35
  if event_groups.key?(key)
34
- # Update values for group
35
- group_values = event_groups[key].values
36
- group_values.each_with_index do |group_value, i|
37
- group_values[i] = group_value + values[i]
38
- end
36
+ # Update existing group from event
37
+ update_group(event_groups[key], event, values)
39
38
  else
40
39
  # Add new group
41
- event_groups[key] = EventGroup.new(event, values)
40
+ event_groups[key] = new_group(event, values)
42
41
  end
43
42
  end
44
43
  end
@@ -57,7 +56,7 @@ module Datadog
57
56
  index
58
57
  end
59
58
 
60
- def build_sample_values(stack_sample)
59
+ def build_event_values(event)
61
60
  # Build a value array that matches the length of the sample types
62
61
  # Populate all values with "no value" by default
63
62
  Array.new(@sample_type_mappings.length, Datadog::Ext::Profiling::Pprof::SAMPLE_VALUE_NO_VALUE)
@@ -69,7 +68,7 @@ module Datadog
69
68
 
70
69
  # Represents a grouped event
71
70
  # 'sample' is an example event object from the group.
72
- # 'values' is the the summation of the group's sample values
71
+ # 'values' is the summation of the group's sample values
73
72
  EventGroup = Struct.new(:sample, :values)
74
73
 
75
74
  # Error when the mapping of a sample type to value index is unknown
@@ -84,6 +83,20 @@ module Datadog
84
83
  "Mapping for sample value type '#{type}' to index is unknown."
85
84
  end
86
85
  end
86
+
87
+ protected
88
+
89
+ def new_group(event, values)
90
+ EventGroup.new(event, values)
91
+ end
92
+
93
+ def update_group(event_group, event, values)
94
+ # Update values for group
95
+ group_values = event_group.values
96
+ group_values.each_with_index do |group_value, i|
97
+ group_values[i] = group_value + values[i]
98
+ end
99
+ end
87
100
  end
88
101
  end
89
102
  end