ddtrace 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -16
  3. data/CHANGELOG.md +31 -2
  4. data/LICENSE-3rdparty.csv +3 -2
  5. data/README.md +2 -2
  6. data/ddtrace.gemspec +12 -3
  7. data/docs/GettingStarted.md +19 -2
  8. data/docs/ProfilingDevelopment.md +8 -8
  9. data/docs/UpgradeGuide.md +3 -3
  10. data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +118 -0
  11. data/ext/ddtrace_profiling_loader/extconf.rb +53 -0
  12. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +31 -5
  13. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +0 -8
  14. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +278 -0
  15. data/ext/ddtrace_profiling_native_extension/extconf.rb +70 -100
  16. data/ext/ddtrace_profiling_native_extension/libddprof_helpers.h +13 -0
  17. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +186 -0
  18. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +579 -7
  19. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +30 -0
  20. data/ext/ddtrace_profiling_native_extension/profiling.c +7 -0
  21. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +139 -0
  22. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +28 -0
  23. data/lib/datadog/appsec/autoload.rb +2 -2
  24. data/lib/datadog/appsec/configuration/settings.rb +19 -0
  25. data/lib/datadog/appsec/configuration.rb +8 -0
  26. data/lib/datadog/appsec/contrib/rack/gateway/watcher.rb +76 -33
  27. data/lib/datadog/appsec/contrib/rack/integration.rb +1 -0
  28. data/lib/datadog/appsec/contrib/rack/patcher.rb +0 -1
  29. data/lib/datadog/appsec/contrib/rack/reactive/request_body.rb +64 -0
  30. data/lib/datadog/appsec/contrib/rack/request.rb +6 -0
  31. data/lib/datadog/appsec/contrib/rack/request_body_middleware.rb +41 -0
  32. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +60 -5
  33. data/lib/datadog/appsec/contrib/rails/gateway/watcher.rb +81 -0
  34. data/lib/datadog/appsec/contrib/rails/patcher.rb +34 -1
  35. data/lib/datadog/appsec/contrib/rails/reactive/action.rb +68 -0
  36. data/lib/datadog/appsec/contrib/rails/request.rb +33 -0
  37. data/lib/datadog/appsec/contrib/sinatra/gateway/watcher.rb +124 -0
  38. data/lib/datadog/appsec/contrib/sinatra/patcher.rb +69 -2
  39. data/lib/datadog/appsec/contrib/sinatra/reactive/routed.rb +63 -0
  40. data/lib/datadog/appsec/event.rb +33 -18
  41. data/lib/datadog/appsec/extensions.rb +0 -3
  42. data/lib/datadog/appsec/processor.rb +45 -2
  43. data/lib/datadog/appsec/rate_limiter.rb +5 -0
  44. data/lib/datadog/appsec/reactive/operation.rb +0 -1
  45. data/lib/datadog/ci/ext/environment.rb +21 -7
  46. data/lib/datadog/core/configuration/agent_settings_resolver.rb +1 -1
  47. data/lib/datadog/core/configuration/components.rb +22 -4
  48. data/lib/datadog/core/configuration/settings.rb +3 -3
  49. data/lib/datadog/core/configuration.rb +7 -5
  50. data/lib/datadog/core/environment/cgroup.rb +3 -1
  51. data/lib/datadog/core/environment/container.rb +2 -1
  52. data/lib/datadog/core/environment/variable_helpers.rb +26 -2
  53. data/lib/datadog/core/logging/ext.rb +11 -0
  54. data/lib/datadog/core/metrics/client.rb +15 -5
  55. data/lib/datadog/core/runtime/metrics.rb +1 -1
  56. data/lib/datadog/core/workers/async.rb +3 -1
  57. data/lib/datadog/core/workers/runtime_metrics.rb +0 -3
  58. data/lib/datadog/core.rb +6 -0
  59. data/lib/datadog/kit/enable_core_dumps.rb +50 -0
  60. data/lib/datadog/kit/identity.rb +63 -0
  61. data/lib/datadog/kit.rb +11 -0
  62. data/lib/datadog/opentracer/tracer.rb +0 -2
  63. data/lib/datadog/profiling/collectors/old_stack.rb +298 -0
  64. data/lib/datadog/profiling/collectors/stack.rb +6 -287
  65. data/lib/datadog/profiling/encoding/profile.rb +0 -1
  66. data/lib/datadog/profiling/ext.rb +1 -1
  67. data/lib/datadog/profiling/flush.rb +1 -1
  68. data/lib/datadog/profiling/load_native_extension.rb +22 -0
  69. data/lib/datadog/profiling/recorder.rb +1 -1
  70. data/lib/datadog/profiling/scheduler.rb +1 -1
  71. data/lib/datadog/profiling/stack_recorder.rb +33 -0
  72. data/lib/datadog/profiling/tag_builder.rb +48 -0
  73. data/lib/datadog/profiling/tasks/exec.rb +2 -2
  74. data/lib/datadog/profiling/tasks/setup.rb +6 -4
  75. data/lib/datadog/profiling.rb +29 -27
  76. data/lib/datadog/tracing/buffer.rb +9 -3
  77. data/lib/datadog/tracing/contrib/action_view/patcher.rb +0 -1
  78. data/lib/datadog/tracing/contrib/active_record/configuration/resolver.rb +2 -2
  79. data/lib/datadog/tracing/contrib/active_record/utils.rb +1 -1
  80. data/lib/datadog/tracing/contrib/active_record/vendor/connection_specification.rb +1 -1
  81. data/lib/datadog/tracing/contrib/active_support/notifications/subscription.rb +4 -2
  82. data/lib/datadog/tracing/contrib/concurrent_ruby/context_composite_executor_service.rb +10 -3
  83. data/lib/datadog/tracing/contrib/dalli/patcher.rb +0 -1
  84. data/lib/datadog/tracing/contrib/delayed_job/patcher.rb +0 -1
  85. data/lib/datadog/tracing/contrib/elasticsearch/integration.rb +9 -3
  86. data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +38 -2
  87. data/lib/datadog/tracing/contrib/ethon/patcher.rb +0 -1
  88. data/lib/datadog/tracing/contrib/extensions.rb +0 -2
  89. data/lib/datadog/tracing/contrib/faraday/patcher.rb +0 -1
  90. data/lib/datadog/tracing/contrib/grape/patcher.rb +0 -1
  91. data/lib/datadog/tracing/contrib/graphql/patcher.rb +0 -1
  92. data/lib/datadog/tracing/contrib/grpc/patcher.rb +0 -1
  93. data/lib/datadog/tracing/contrib/kafka/patcher.rb +0 -1
  94. data/lib/datadog/tracing/contrib/lograge/instrumentation.rb +2 -1
  95. data/lib/datadog/tracing/contrib/qless/patcher.rb +0 -1
  96. data/lib/datadog/tracing/contrib/que/patcher.rb +0 -1
  97. data/lib/datadog/tracing/contrib/racecar/patcher.rb +0 -1
  98. data/lib/datadog/tracing/contrib/rails/log_injection.rb +3 -16
  99. data/lib/datadog/tracing/contrib/rake/instrumentation.rb +2 -2
  100. data/lib/datadog/tracing/contrib/rake/patcher.rb +0 -1
  101. data/lib/datadog/tracing/contrib/redis/patcher.rb +0 -1
  102. data/lib/datadog/tracing/contrib/resque/patcher.rb +0 -1
  103. data/lib/datadog/tracing/contrib/rest_client/patcher.rb +0 -1
  104. data/lib/datadog/tracing/contrib/semantic_logger/instrumentation.rb +2 -1
  105. data/lib/datadog/tracing/contrib/sidekiq/configuration/settings.rb +1 -0
  106. data/lib/datadog/tracing/contrib/sidekiq/server_tracer.rb +20 -1
  107. data/lib/datadog/tracing/contrib/sinatra/framework.rb +11 -0
  108. data/lib/datadog/tracing/contrib/sinatra/patcher.rb +0 -1
  109. data/lib/datadog/tracing/contrib/sneakers/patcher.rb +0 -1
  110. data/lib/datadog/tracing/contrib/sucker_punch/patcher.rb +0 -1
  111. data/lib/datadog/tracing/event.rb +2 -1
  112. data/lib/datadog/tracing/sampling/priority_sampler.rb +4 -5
  113. data/lib/datadog/tracing/sampling/rule.rb +12 -6
  114. data/lib/datadog/tracing/sampling/rule_sampler.rb +3 -5
  115. data/lib/datadog/tracing/span_operation.rb +2 -3
  116. data/lib/datadog/tracing/trace_operation.rb +0 -1
  117. data/lib/ddtrace/transport/http/client.rb +2 -1
  118. data/lib/ddtrace/transport/http/response.rb +34 -4
  119. data/lib/ddtrace/transport/io/client.rb +3 -1
  120. data/lib/ddtrace/version.rb +1 -1
  121. data/lib/ddtrace.rb +1 -0
  122. metadata +43 -6
--- /dev/null
+++ b/data/lib/datadog/profiling/collectors/old_stack.rb
@@ -0,0 +1,298 @@
+ # typed: true
+
+ require 'datadog/core/utils/only_once'
+ require 'datadog/core/utils/time'
+ require 'datadog/core/worker'
+ require 'datadog/core/workers/polling'
+ require 'datadog/profiling/backtrace_location'
+ require 'datadog/profiling/events/stack'
+ require 'datadog/profiling/native_extension'
+
+ module Datadog
+   module Profiling
+     module Collectors
+       # Collects stack trace samples from Ruby threads for both CPU-time (if available) and wall-clock.
+       # Runs on its own background thread.
+       #
+       # This class has the prefix "Old" because it will be deprecated by the new native CPU Profiler
+       class OldStack < Core::Worker # rubocop:disable Metrics/ClassLength
+         include Core::Workers::Polling
+
+         DEFAULT_MAX_TIME_USAGE_PCT = 2.0
+         MIN_INTERVAL = 0.01
+         THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+         THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+         SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
+
+         # This default was picked based on the current sampling performance and on expected concurrency on an average
+         # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+         # optimizes for coverage (less chance to miss what a given thread is doing).
+         DEFAULT_MAX_THREADS_SAMPLED = 16
+
+         attr_reader \
+           :recorder,
+           :max_frames,
+           :trace_identifiers_helper,
+           :ignore_thread,
+           :max_time_usage_pct,
+           :thread_api,
+           :cpu_time_provider
+
+         def initialize(
+           recorder,
+           max_frames:,
+           trace_identifiers_helper:, # Usually an instance of Profiling::TraceIdentifiers::Helper
+           ignore_thread: nil,
+           max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+           max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
+           thread_api: Thread,
+           cpu_time_provider: Profiling::NativeExtension,
+           fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
+           interval: MIN_INTERVAL,
+           enabled: true
+         )
+           @recorder = recorder
+           @max_frames = max_frames
+           @trace_identifiers_helper = trace_identifiers_helper
+           @ignore_thread = ignore_thread
+           @max_time_usage_pct = max_time_usage_pct
+           @max_threads_sampled = max_threads_sampled
+           @thread_api = thread_api
+           # Only set the provider if it's able to work in the current Ruby/OS combo
+           @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?
+
+           # Workers::Async::Thread settings
+           self.fork_policy = fork_policy
+
+           # Workers::IntervalLoop settings
+           self.loop_base_interval = interval
+
+           # Workers::Polling settings
+           self.enabled = enabled
+
+           # Cache this proc, since it's pretty expensive to keep recreating it
+           @build_backtrace_location = method(:build_backtrace_location).to_proc
+           # Cache this buffer, since it's pretty expensive to keep accessing it
+           @stack_sample_event_recorder = recorder[Events::StackSample]
+           # See below for details on why this is needed
+           @needs_process_waiter_workaround =
+             Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+             Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
+         end
+
+         def start
+           reset_cpu_time_tracking
+           perform
+         end
+
+         def perform
+           collect_and_wait
+         end
+
+         def collect_and_wait
+           run_time = Core::Utils::Time.measure do
+             collect_events
+           end
+
+           # Update wait time to throttle profiling
+           self.loop_wait_time = compute_wait_time(run_time)
+         end
+
+         def collect_events
+           events = []
+           current_wall_time_ns = get_current_wall_time_timestamp_ns
+
+           # Collect backtraces from each thread
+           threads_to_sample.each do |thread|
+             next unless thread.alive?
+             next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
+
+             event = collect_thread_event(thread, current_wall_time_ns)
+             events << event unless event.nil?
+           end
+
+           # Send events to recorder
+           recorder.push(events) unless events.empty?
+
+           events
+         end
+
+         def collect_thread_event(thread, current_wall_time_ns)
+           locations = thread.backtrace_locations
+           return if locations.nil?
+
+           # Having empty locations means that the thread is alive, but we don't know what it's doing:
+           #
+           # 1. It can be starting up
+           #    ```
+           #    > Thread.new { sleep }.backtrace
+           #    => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
+           #    ```
+           # 2. It can be running native code
+           #    ```
+           #    > t = Process.detach(fork { sleep })
+           #    => #<Process::Waiter:0x00007ffe7285f7a0 run>
+           #    > t.backtrace
+           #    => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
+           #    ```
+           #    This effect has been observed in threads created by the Iodine web server and the ffi gem
+           #
+           # To give customers visibility into these threads, we replace the empty stack with one containing a
+           # synthetic placeholder frame, so that these threads are properly represented in the UX.
+           locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
+
+           # Get actual stack size then trim the stack
+           stack_size = locations.length
+           locations = locations[0..(max_frames - 1)]
+
+           # Convert backtrace locations into structs
+           locations = convert_backtrace_locations(locations)
+
+           thread_id = thread.object_id
+           root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
+           cpu_time = get_cpu_time_interval!(thread)
+           wall_time_interval_ns =
+             get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
+
+           Events::StackSample.new(
+             nil,
+             locations,
+             stack_size,
+             thread_id,
+             root_span_id,
+             span_id,
+             trace_resource,
+             cpu_time,
+             wall_time_interval_ns
+           )
+         end
+
+         def get_cpu_time_interval!(thread)
+           return unless cpu_time_provider
+
+           current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)
+
+           return unless current_cpu_time_ns
+
+           get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
+         end
+
+         def compute_wait_time(used_time)
+           # We took used_time to get the last sample.
+           #
+           # What we're computing here is -- if used_time corresponds to max_time_usage_pct of the time we should
+           # spend working, how much is (100% - max_time_usage_pct) of the time?
+           #
+           # For instance, if we took 10ms to sample, and max_time_usage_pct is 1%, then the other 99% is 990ms, which
+           # means we need to sleep for 990ms to guarantee that we don't spend more than 1% of the time working.
+           used_time_ns = used_time * 1e9
+           interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns
+           [interval / 1e9, MIN_INTERVAL].max
+         end
+
+         # Convert backtrace locations into structs
+         # Re-use old backtrace location objects if they already exist in the buffer
+         def convert_backtrace_locations(locations)
+           locations.collect do |location|
+             # Re-use existing BacktraceLocation if identical copy, otherwise build a new one.
+             @stack_sample_event_recorder.cache(:backtrace_locations).fetch(
+               # Function name
+               location.base_label,
+               # Line number
+               location.lineno,
+               # Filename
+               location.path,
+               # Build function
+               &@build_backtrace_location
+             )
+           end
+         end
+
+         def build_backtrace_location(_id, base_label, lineno, path)
+           string_table = @stack_sample_event_recorder.string_table
+
+           Profiling::BacktraceLocation.new(
+             string_table.fetch_string(base_label),
+             lineno,
+             string_table.fetch_string(path)
+           )
+         end
+
+         private
+
+         # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
+         # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
+         #
+         # a) negative time: At least on my test docker container, and on the reliability environment, after the process
+         #    forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
+         #
+         # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
+         #    restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
+         #
+         # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
+         def reset_cpu_time_tracking
+           thread_api.list.each do |thread|
+             # See below for details on why this is needed
+             next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+             thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+             thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
+           end
+         end
+
+         def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+           # Process::Waiter crash workaround:
+           #
+           # This is a workaround for a Ruby VM segfault (usually something like
+           # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
+           # See https://bugs.ruby-lang.org/issues/17807 for details.
+           #
+           # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
+           # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
+           # shows up when the `Process.detach` API gets used.
+           # In the specs you'll find crash regression tests that include a way of reproducing it.
+           #
+           # As a workaround, for now we just skip it for the affected Rubies
+           return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+           last_value = thread.thread_variable_get(key) || current_value
+           thread.thread_variable_set(key, current_value)
+
+           current_value - last_value
+         end
+
+         # Whenever there are more than max_threads_sampled threads active, we only sample a subset of them.
+         # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+         # a big burst of work all at once (sample everything), and instead do a little work each time
+         # (sample a bit by bit).
+         #
+         # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+         # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+         # them more often, if they are slower, we take them less often -- which again means that over a longer period
+         # we should take roughly the same samples.
+         #
+         # One downside of this approach is that if there really are many threads, the resulting wall clock times
+         # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+         # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+         # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+         # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+         # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+         # threads).
+         #
+         def threads_to_sample
+           all_threads = thread_api.list
+
+           if all_threads.size > @max_threads_sampled
+             all_threads.sample(@max_threads_sampled)
+           else
+             all_threads
+           end
+         end
+
+         def get_current_wall_time_timestamp_ns
+           Core::Utils::Time.get_time(:nanosecond)
+         end
+       end
+     end
+   end
+ end
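
The throttling logic in `compute_wait_time` above deserves a worked example. The following standalone sketch is not part of the gem; it re-derives the same formula from the diff, with `max_time_usage_pct` passed in explicitly, and checks the arithmetic for a 10ms sample against the default 2% budget:

```ruby
# Standalone re-derivation of OldStack#compute_wait_time, for illustration only.
MIN_INTERVAL = 0.01 # seconds, mirroring the constant in the class above

def compute_wait_time(used_time, max_time_usage_pct)
  used_time_ns = used_time * 1e9
  # If used_time is to be at most max_time_usage_pct of the total time, then
  # the remaining (100 - max_time_usage_pct)% is how long we must wait.
  interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns
  [interval / 1e9, MIN_INTERVAL].max
end

# A 10ms sample under the default 2% budget: total = 0.010 / 0.02 = 0.5s,
# so we wait 0.5 - 0.010 = 0.49s before the next sample.
puts compute_wait_time(0.010, 2.0) # => 0.49
```

The same function reproduces the 1% example from the code comment: `compute_wait_time(0.010, 1.0)` returns `0.99`, i.e. the 990ms of sleep described above.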
--- a/data/lib/datadog/profiling/collectors/stack.rb
+++ b/data/lib/datadog/profiling/collectors/stack.rb
@@ -1,295 +1,14 @@
- # typed: true
-
- require 'datadog/core/utils/only_once'
- require 'datadog/core/utils/time'
- require 'datadog/core/worker'
- require 'datadog/core/workers/polling'
- require 'datadog/profiling/backtrace_location'
- require 'datadog/profiling/events/stack'
- require 'datadog/profiling/native_extension'
+ # typed: false

  module Datadog
    module Profiling
      module Collectors
-       # Collects stack trace samples from Ruby threads for both CPU-time (if available) and wall-clock.
-       # Runs on its own background thread.
+       # Used to gather a stack trace from a given Ruby thread. Almost all of this class is implemented as native code.
        #
-       class Stack < Core::Worker # rubocop:disable Metrics/ClassLength
-         include Core::Workers::Polling
-
-         DEFAULT_MAX_TIME_USAGE_PCT = 2.0
-         MIN_INTERVAL = 0.01
-         THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
-         THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
-         SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
-
-         # This default was picked based on the current sampling performance and on expected concurrency on an average
-         # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
-         # optimizes for coverage (less chance to miss what a given thread is doing).
-         DEFAULT_MAX_THREADS_SAMPLED = 16
-
-         attr_reader \
-           :recorder,
-           :max_frames,
-           :trace_identifiers_helper,
-           :ignore_thread,
-           :max_time_usage_pct,
-           :thread_api,
-           :cpu_time_provider
-
-         def initialize(
-           recorder,
-           max_frames:,
-           trace_identifiers_helper:, # Usually an instance of Profiling::TraceIdentifiers::Helper
-           ignore_thread: nil,
-           max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
-           max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
-           thread_api: Thread,
-           cpu_time_provider: Profiling::NativeExtension,
-           fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
-           interval: MIN_INTERVAL,
-           enabled: true
-         )
-           @recorder = recorder
-           @max_frames = max_frames
-           @trace_identifiers_helper = trace_identifiers_helper
-           @ignore_thread = ignore_thread
-           @max_time_usage_pct = max_time_usage_pct
-           @max_threads_sampled = max_threads_sampled
-           @thread_api = thread_api
-           # Only set the provider if it's able to work in the current Ruby/OS combo
-           @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?
-
-           # Workers::Async::Thread settings
-           self.fork_policy = fork_policy
-
-           # Workers::IntervalLoop settings
-           self.loop_base_interval = interval
-
-           # Workers::Polling settings
-           self.enabled = enabled
-
-           # Cache this proc, since it's pretty expensive to keep recreating it
-           @build_backtrace_location = method(:build_backtrace_location).to_proc
-           # Cache this buffer, since it's pretty expensive to keep accessing it
-           @stack_sample_event_recorder = recorder[Events::StackSample]
-           # See below for details on why this is needed
-           @needs_process_waiter_workaround =
-             Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
-             Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
-         end
-
-         def start
-           reset_cpu_time_tracking
-           perform
-         end
-
-         def perform
-           collect_and_wait
-         end
-
-         def collect_and_wait
-           run_time = Core::Utils::Time.measure do
-             collect_events
-           end
-
-           # Update wait time to throttle profiling
-           self.loop_wait_time = compute_wait_time(run_time)
-         end
-
-         def collect_events
-           events = []
-           current_wall_time_ns = get_current_wall_time_timestamp_ns
-
-           # Collect backtraces from each thread
-           threads_to_sample.each do |thread|
-             next unless thread.alive?
-             next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
-
-             event = collect_thread_event(thread, current_wall_time_ns)
-             events << event unless event.nil?
-           end
-
-           # Send events to recorder
-           recorder.push(events) unless events.empty?
-
-           events
-         end
-
-         def collect_thread_event(thread, current_wall_time_ns)
-           locations = thread.backtrace_locations
-           return if locations.nil?
-
-           # Having empty locations means that the thread is alive, but we don't know what it's doing:
-           #
-           # 1. It can be starting up
-           #    ```
-           #    > Thread.new { sleep }.backtrace
-           #    => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
-           #    ```
-           # 2. It can be running native code
-           #    ```
-           #    > t = Process.detach(fork { sleep })
-           #    => #<Process::Waiter:0x00007ffe7285f7a0 run>
-           #    > t.backtrace
-           #    => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
-           #    ```
-           #    This effect has been observed in threads created by the Iodine web server and the ffi gem
-           #
-           # To give customers visibility into these threads, we replace the empty stack with one containing a
-           # synthetic placeholder frame, so that these threads are properly represented in the UX.
-           locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
-
-           # Get actual stack size then trim the stack
-           stack_size = locations.length
-           locations = locations[0..(max_frames - 1)]
-
-           # Convert backtrace locations into structs
-           locations = convert_backtrace_locations(locations)
-
-           thread_id = thread.object_id
-           root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
-           cpu_time = get_cpu_time_interval!(thread)
-           wall_time_interval_ns =
-             get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
-
-           Events::StackSample.new(
-             nil,
-             locations,
-             stack_size,
-             thread_id,
-             root_span_id,
-             span_id,
-             trace_resource,
-             cpu_time,
-             wall_time_interval_ns
-           )
-         end
-
-         def get_cpu_time_interval!(thread)
-           return unless cpu_time_provider
-
-           current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)
-
-           return unless current_cpu_time_ns
-
-           get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-         end
-
-         def compute_wait_time(used_time)
-           # We took used_time to get the last sample.
-           #
-           # What we're computing here is -- if used_time corresponds to max_time_usage_pct of the time we should
-           # spend working, how much is (100% - max_time_usage_pct) of the time?
-           #
-           # For instance, if we took 10ms to sample, and max_time_usage_pct is 1%, then the other 99% is 990ms, which
-           # means we need to sleep for 990ms to guarantee that we don't spend more than 1% of the time working.
-           used_time_ns = used_time * 1e9
-           interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns
-           [interval / 1e9, MIN_INTERVAL].max
-         end
-
-         # Convert backtrace locations into structs
-         # Re-use old backtrace location objects if they already exist in the buffer
-         def convert_backtrace_locations(locations)
-           locations.collect do |location|
-             # Re-use existing BacktraceLocation if identical copy, otherwise build a new one.
-             @stack_sample_event_recorder.cache(:backtrace_locations).fetch(
-               # Function name
-               location.base_label,
-               # Line number
-               location.lineno,
-               # Filename
-               location.path,
-               # Build function
-               &@build_backtrace_location
-             )
-           end
-         end
-
-         def build_backtrace_location(_id, base_label, lineno, path)
-           string_table = @stack_sample_event_recorder.string_table
-
-           Profiling::BacktraceLocation.new(
-             string_table.fetch_string(base_label),
-             lineno,
-             string_table.fetch_string(path)
-           )
-         end
-
-         private
-
-         # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
-         # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
-         #
-         # a) negative time: At least on my test docker container, and on the reliability environment, after the process
-         #    forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
-         #
-         # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
-         #    restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
-         #
-         # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
-         def reset_cpu_time_tracking
-           thread_api.list.each do |thread|
-             # See below for details on why this is needed
-             next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
-
-             thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
-             thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
-           end
-         end
-
-         def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
-           # Process::Waiter crash workaround:
-           #
-           # This is a workaround for a Ruby VM segfault (usually something like
-           # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
-           # See https://bugs.ruby-lang.org/issues/17807 for details.
-           #
-           # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
-           # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
-           # shows up when the `Process.detach` API gets used.
-           # In the specs you'll find crash regression tests that include a way of reproducing it.
-           #
-           # As a workaround, for now we just skip it for the affected Rubies
-           return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
-
-           last_value = thread.thread_variable_get(key) || current_value
-           thread.thread_variable_set(key, current_value)
-
-           current_value - last_value
-         end
-
-         # Whenever there are more than max_threads_sampled threads active, we only sample a subset of them.
-         # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
-         # a big burst of work all at once (sample everything), and instead do a little work each time
-         # (sample a bit by bit).
-         #
-         # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
-         # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
-         # them more often, if they are slower, we take them less often -- which again means that over a longer period
-         # we should take roughly the same samples.
-         #
-         # One downside of this approach is that if there really are many threads, the resulting wall clock times
-         # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
-         # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
-         # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
-         # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
-         # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
-         # threads).
-         #
-         def threads_to_sample
-           all_threads = thread_api.list
-
-           if all_threads.size > @max_threads_sampled
-             all_threads.sample(@max_threads_sampled)
-           else
-             all_threads
-           end
-         end
-
-         def get_current_wall_time_timestamp_ns
-           Core::Utils::Time.get_time(:nanosecond)
+       # Methods prefixed with _native_ are implemented in `collectors_stack.c`
+       class Stack
+         def sample(thread, recorder_instance, metric_values_hash, labels_array, max_frames: 400)
+           self.class._native_sample(thread, recorder_instance, metric_values_hash, labels_array, max_frames)
          end
        end
      end
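
The replacement class reduces the Ruby-side API to a single `sample` call that delegates to native code. Below is a rough usage sketch only, inferred from the `sample` signature in the diff: the recorder comes from the new `stack_recorder.rb` added in this release, the metric values and labels are hypothetical placeholders (the diff does not document which keys the native code accepts), and running this requires the gem's native extensions to have been built.

```ruby
require 'datadog/profiling'

# Hedged sketch, inferred from the `sample` signature above; not a documented API.
recorder = Datadog::Profiling::StackRecorder.new        # added in this release
collector = Datadog::Profiling::Collectors::Stack.new

collector.sample(
  Thread.current,                                       # thread to walk
  recorder,                                             # where the sample gets written
  { 'wall-time' => 1_000_000 },                         # metric values (hypothetical keys)
  [['thread id', Thread.current.object_id.to_s]],       # labels (hypothetical)
  max_frames: 400
)
```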
--- a/data/lib/datadog/profiling/encoding/profile.rb
+++ b/data/lib/datadog/profiling/encoding/profile.rb
@@ -3,7 +3,6 @@
  require 'set'
  require 'time'

- require 'datadog/core'
  require 'datadog/profiling/flush'
  require 'datadog/profiling/pprof/template'

--- a/data/lib/datadog/profiling/ext.rb
+++ b/data/lib/datadog/profiling/ext.rb
@@ -30,7 +30,7 @@ module Datadog
  FORM_FIELD_TAG_ENV = 'env'.freeze
  FORM_FIELD_TAG_HOST = 'host'.freeze
  FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
- FORM_FIELD_TAG_PID = 'pid'.freeze
+ FORM_FIELD_TAG_PID = 'process_id'.freeze
  FORM_FIELD_TAG_PROFILER_VERSION = 'profiler_version'.freeze
  FORM_FIELD_TAG_RUNTIME = 'runtime'.freeze
  FORM_FIELD_TAG_RUNTIME_ENGINE = 'runtime_engine'.freeze
--- a/data/lib/datadog/profiling/flush.rb
+++ b/data/lib/datadog/profiling/flush.rb
@@ -6,7 +6,7 @@ require 'datadog/core/environment/socket'
  module Datadog
    module Profiling
      # Entity class used to represent metadata for a given profile
-     Flush = Struct.new(
+     OldFlush = Struct.new(
        :start,
        :finish,
        :event_groups,
--- /dev/null
+++ b/data/lib/datadog/profiling/load_native_extension.rb
@@ -0,0 +1,22 @@
+ # typed: ignore
+
+ # This file is used to load the profiling native extension. It works in two steps:
+ #
+ # 1. Load the ddtrace_profiling_loader extension. This extension will be used to load the actual extension, but in
+ #    a special way that avoids exposing native-level code symbols. See `ddtrace_profiling_loader.c` for more details.
+ #
+ # 2. Use the Datadog::Profiling::Loader exposed by the ddtrace_profiling_loader extension to load the actual
+ #    profiling native extension.
+ #
+ # All code in this file is deliberately at the top level; this makes it so this file is executed only once,
+ # the first time it gets required, to avoid any issues with the native extension being initialized more than once.
+
+ require "ddtrace_profiling_loader.#{RUBY_VERSION}_#{RUBY_PLATFORM}"
+
+ extension_name = "ddtrace_profiling_native_extension.#{RUBY_VERSION}_#{RUBY_PLATFORM}"
+ full_file_path = "#{__dir__}/../../#{extension_name}.#{RbConfig::CONFIG['DLEXT']}"
+ init_function_name = "Init_#{extension_name.split('.').first}"
+
+ status, result = Datadog::Profiling::Loader._native_load(full_file_path, init_function_name)
+
+ raise "Failure to load #{extension_name} due to #{result}" if status == :error
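
To make the loader's name computation concrete, here is a small standalone sketch of the same string logic. The resolved values shown in the comments assume Ruby 3.1.2 on x86_64 Linux and will differ on other interpreters and platforms:

```ruby
# Same name computation as load_native_extension.rb, extracted for illustration.
extension_name = "ddtrace_profiling_native_extension.#{RUBY_VERSION}_#{RUBY_PLATFORM}"
init_function_name = "Init_#{extension_name.split('.').first}"

# Assuming Ruby 3.1.2 on x86_64-linux, this resolves to:
#   extension_name     => "ddtrace_profiling_native_extension.3.1.2_x86_64-linux"
#   init_function_name => "Init_ddtrace_profiling_native_extension"
# The shared object suffix RbConfig::CONFIG['DLEXT'] is "so" on Linux and
# "bundle" on macOS, so the loaded file ends in ".3.1.2_x86_64-linux.so" here.
puts extension_name
puts init_function_name
```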
--- a/data/lib/datadog/profiling/recorder.rb
+++ b/data/lib/datadog/profiling/recorder.rb
@@ -75,7 +75,7 @@ module Datadog

  code_provenance = @code_provenance_collector.refresh.generate_json if @code_provenance_collector

- Flush.new(
+ OldFlush.new(
    start: start,
    finish: finish,
    event_groups: event_groups,
--- a/data/lib/datadog/profiling/scheduler.rb
+++ b/data/lib/datadog/profiling/scheduler.rb
@@ -114,7 +114,7 @@ module Datadog
  # Sleep for a bit to cause misalignment between profilers in multi-process applications
  #
  # When not being run in a loop, it means the scheduler has not been started or was stopped, and thus
- # a) it's being shutting down (and is trying to report the last profile)
+ # a) it's being shut down (and is trying to report the last profile)
  # b) it's being run as a one-shot, usually in a test
  # ...so in those cases we don't sleep
  #