ddtrace 1.0.0 → 1.1.0

Files changed (122)
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -16
  3. data/CHANGELOG.md +31 -2
  4. data/LICENSE-3rdparty.csv +3 -2
  5. data/README.md +2 -2
  6. data/ddtrace.gemspec +12 -3
  7. data/docs/GettingStarted.md +19 -2
  8. data/docs/ProfilingDevelopment.md +8 -8
  9. data/docs/UpgradeGuide.md +3 -3
  10. data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +118 -0
  11. data/ext/ddtrace_profiling_loader/extconf.rb +53 -0
  12. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +31 -5
  13. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +0 -8
  14. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +278 -0
  15. data/ext/ddtrace_profiling_native_extension/extconf.rb +70 -100
  16. data/ext/ddtrace_profiling_native_extension/libddprof_helpers.h +13 -0
  17. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +186 -0
  18. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +579 -7
  19. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +30 -0
  20. data/ext/ddtrace_profiling_native_extension/profiling.c +7 -0
  21. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +139 -0
  22. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +28 -0
  23. data/lib/datadog/appsec/autoload.rb +2 -2
  24. data/lib/datadog/appsec/configuration/settings.rb +19 -0
  25. data/lib/datadog/appsec/configuration.rb +8 -0
  26. data/lib/datadog/appsec/contrib/rack/gateway/watcher.rb +76 -33
  27. data/lib/datadog/appsec/contrib/rack/integration.rb +1 -0
  28. data/lib/datadog/appsec/contrib/rack/patcher.rb +0 -1
  29. data/lib/datadog/appsec/contrib/rack/reactive/request_body.rb +64 -0
  30. data/lib/datadog/appsec/contrib/rack/request.rb +6 -0
  31. data/lib/datadog/appsec/contrib/rack/request_body_middleware.rb +41 -0
  32. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +60 -5
  33. data/lib/datadog/appsec/contrib/rails/gateway/watcher.rb +81 -0
  34. data/lib/datadog/appsec/contrib/rails/patcher.rb +34 -1
  35. data/lib/datadog/appsec/contrib/rails/reactive/action.rb +68 -0
  36. data/lib/datadog/appsec/contrib/rails/request.rb +33 -0
  37. data/lib/datadog/appsec/contrib/sinatra/gateway/watcher.rb +124 -0
  38. data/lib/datadog/appsec/contrib/sinatra/patcher.rb +69 -2
  39. data/lib/datadog/appsec/contrib/sinatra/reactive/routed.rb +63 -0
  40. data/lib/datadog/appsec/event.rb +33 -18
  41. data/lib/datadog/appsec/extensions.rb +0 -3
  42. data/lib/datadog/appsec/processor.rb +45 -2
  43. data/lib/datadog/appsec/rate_limiter.rb +5 -0
  44. data/lib/datadog/appsec/reactive/operation.rb +0 -1
  45. data/lib/datadog/ci/ext/environment.rb +21 -7
  46. data/lib/datadog/core/configuration/agent_settings_resolver.rb +1 -1
  47. data/lib/datadog/core/configuration/components.rb +22 -4
  48. data/lib/datadog/core/configuration/settings.rb +3 -3
  49. data/lib/datadog/core/configuration.rb +7 -5
  50. data/lib/datadog/core/environment/cgroup.rb +3 -1
  51. data/lib/datadog/core/environment/container.rb +2 -1
  52. data/lib/datadog/core/environment/variable_helpers.rb +26 -2
  53. data/lib/datadog/core/logging/ext.rb +11 -0
  54. data/lib/datadog/core/metrics/client.rb +15 -5
  55. data/lib/datadog/core/runtime/metrics.rb +1 -1
  56. data/lib/datadog/core/workers/async.rb +3 -1
  57. data/lib/datadog/core/workers/runtime_metrics.rb +0 -3
  58. data/lib/datadog/core.rb +6 -0
  59. data/lib/datadog/kit/enable_core_dumps.rb +50 -0
  60. data/lib/datadog/kit/identity.rb +63 -0
  61. data/lib/datadog/kit.rb +11 -0
  62. data/lib/datadog/opentracer/tracer.rb +0 -2
  63. data/lib/datadog/profiling/collectors/old_stack.rb +298 -0
  64. data/lib/datadog/profiling/collectors/stack.rb +6 -287
  65. data/lib/datadog/profiling/encoding/profile.rb +0 -1
  66. data/lib/datadog/profiling/ext.rb +1 -1
  67. data/lib/datadog/profiling/flush.rb +1 -1
  68. data/lib/datadog/profiling/load_native_extension.rb +22 -0
  69. data/lib/datadog/profiling/recorder.rb +1 -1
  70. data/lib/datadog/profiling/scheduler.rb +1 -1
  71. data/lib/datadog/profiling/stack_recorder.rb +33 -0
  72. data/lib/datadog/profiling/tag_builder.rb +48 -0
  73. data/lib/datadog/profiling/tasks/exec.rb +2 -2
  74. data/lib/datadog/profiling/tasks/setup.rb +6 -4
  75. data/lib/datadog/profiling.rb +29 -27
  76. data/lib/datadog/tracing/buffer.rb +9 -3
  77. data/lib/datadog/tracing/contrib/action_view/patcher.rb +0 -1
  78. data/lib/datadog/tracing/contrib/active_record/configuration/resolver.rb +2 -2
  79. data/lib/datadog/tracing/contrib/active_record/utils.rb +1 -1
  80. data/lib/datadog/tracing/contrib/active_record/vendor/connection_specification.rb +1 -1
  81. data/lib/datadog/tracing/contrib/active_support/notifications/subscription.rb +4 -2
  82. data/lib/datadog/tracing/contrib/concurrent_ruby/context_composite_executor_service.rb +10 -3
  83. data/lib/datadog/tracing/contrib/dalli/patcher.rb +0 -1
  84. data/lib/datadog/tracing/contrib/delayed_job/patcher.rb +0 -1
  85. data/lib/datadog/tracing/contrib/elasticsearch/integration.rb +9 -3
  86. data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +38 -2
  87. data/lib/datadog/tracing/contrib/ethon/patcher.rb +0 -1
  88. data/lib/datadog/tracing/contrib/extensions.rb +0 -2
  89. data/lib/datadog/tracing/contrib/faraday/patcher.rb +0 -1
  90. data/lib/datadog/tracing/contrib/grape/patcher.rb +0 -1
  91. data/lib/datadog/tracing/contrib/graphql/patcher.rb +0 -1
  92. data/lib/datadog/tracing/contrib/grpc/patcher.rb +0 -1
  93. data/lib/datadog/tracing/contrib/kafka/patcher.rb +0 -1
  94. data/lib/datadog/tracing/contrib/lograge/instrumentation.rb +2 -1
  95. data/lib/datadog/tracing/contrib/qless/patcher.rb +0 -1
  96. data/lib/datadog/tracing/contrib/que/patcher.rb +0 -1
  97. data/lib/datadog/tracing/contrib/racecar/patcher.rb +0 -1
  98. data/lib/datadog/tracing/contrib/rails/log_injection.rb +3 -16
  99. data/lib/datadog/tracing/contrib/rake/instrumentation.rb +2 -2
  100. data/lib/datadog/tracing/contrib/rake/patcher.rb +0 -1
  101. data/lib/datadog/tracing/contrib/redis/patcher.rb +0 -1
  102. data/lib/datadog/tracing/contrib/resque/patcher.rb +0 -1
  103. data/lib/datadog/tracing/contrib/rest_client/patcher.rb +0 -1
  104. data/lib/datadog/tracing/contrib/semantic_logger/instrumentation.rb +2 -1
  105. data/lib/datadog/tracing/contrib/sidekiq/configuration/settings.rb +1 -0
  106. data/lib/datadog/tracing/contrib/sidekiq/server_tracer.rb +20 -1
  107. data/lib/datadog/tracing/contrib/sinatra/framework.rb +11 -0
  108. data/lib/datadog/tracing/contrib/sinatra/patcher.rb +0 -1
  109. data/lib/datadog/tracing/contrib/sneakers/patcher.rb +0 -1
  110. data/lib/datadog/tracing/contrib/sucker_punch/patcher.rb +0 -1
  111. data/lib/datadog/tracing/event.rb +2 -1
  112. data/lib/datadog/tracing/sampling/priority_sampler.rb +4 -5
  113. data/lib/datadog/tracing/sampling/rule.rb +12 -6
  114. data/lib/datadog/tracing/sampling/rule_sampler.rb +3 -5
  115. data/lib/datadog/tracing/span_operation.rb +2 -3
  116. data/lib/datadog/tracing/trace_operation.rb +0 -1
  117. data/lib/ddtrace/transport/http/client.rb +2 -1
  118. data/lib/ddtrace/transport/http/response.rb +34 -4
  119. data/lib/ddtrace/transport/io/client.rb +3 -1
  120. data/lib/ddtrace/version.rb +1 -1
  121. data/lib/ddtrace.rb +1 -0
  122. metadata +43 -6
@@ -0,0 +1,298 @@
+# typed: true
+
+require 'datadog/core/utils/only_once'
+require 'datadog/core/utils/time'
+require 'datadog/core/worker'
+require 'datadog/core/workers/polling'
+require 'datadog/profiling/backtrace_location'
+require 'datadog/profiling/events/stack'
+require 'datadog/profiling/native_extension'
+
+module Datadog
+  module Profiling
+    module Collectors
+      # Collects stack trace samples from Ruby threads for both CPU-time (if available) and wall-clock.
+      # Runs on its own background thread.
+      #
+      # This class has the prefix "Old" because it will be deprecated by the new native CPU Profiler
+      class OldStack < Core::Worker # rubocop:disable Metrics/ClassLength
+        include Core::Workers::Polling
+
+        DEFAULT_MAX_TIME_USAGE_PCT = 2.0
+        MIN_INTERVAL = 0.01
+        THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
+        THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
+        SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
+
+        # This default was picked based on the current sampling performance and on expected concurrency on an average
+        # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
+        # optimizes for coverage (less chance to miss what a given thread is doing).
+        DEFAULT_MAX_THREADS_SAMPLED = 16
+
+        attr_reader \
+          :recorder,
+          :max_frames,
+          :trace_identifiers_helper,
+          :ignore_thread,
+          :max_time_usage_pct,
+          :thread_api,
+          :cpu_time_provider
+
+        def initialize(
+          recorder,
+          max_frames:,
+          trace_identifiers_helper:, # Usually an instance of Profiling::TraceIdentifiers::Helper
+          ignore_thread: nil,
+          max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
+          max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
+          thread_api: Thread,
+          cpu_time_provider: Profiling::NativeExtension,
+          fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
+          interval: MIN_INTERVAL,
+          enabled: true
+        )
+          @recorder = recorder
+          @max_frames = max_frames
+          @trace_identifiers_helper = trace_identifiers_helper
+          @ignore_thread = ignore_thread
+          @max_time_usage_pct = max_time_usage_pct
+          @max_threads_sampled = max_threads_sampled
+          @thread_api = thread_api
+          # Only set the provider if it's able to work in the current Ruby/OS combo
+          @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?
+
+          # Workers::Async::Thread settings
+          self.fork_policy = fork_policy
+
+          # Workers::IntervalLoop settings
+          self.loop_base_interval = interval
+
+          # Workers::Polling settings
+          self.enabled = enabled
+
+          # Cache this proc, since it's pretty expensive to keep recreating it
+          @build_backtrace_location = method(:build_backtrace_location).to_proc
+          # Cache this buffer, since it's pretty expensive to keep accessing it
+          @stack_sample_event_recorder = recorder[Events::StackSample]
+          # See below for details on why this is needed
+          @needs_process_waiter_workaround =
+            Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
+            Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
+        end
+
+        def start
+          reset_cpu_time_tracking
+          perform
+        end
+
+        def perform
+          collect_and_wait
+        end
+
+        def collect_and_wait
+          run_time = Core::Utils::Time.measure do
+            collect_events
+          end
+
+          # Update wait time to throttle profiling
+          self.loop_wait_time = compute_wait_time(run_time)
+        end
+
+        def collect_events
+          events = []
+          current_wall_time_ns = get_current_wall_time_timestamp_ns
+
+          # Collect backtraces from each thread
+          threads_to_sample.each do |thread|
+            next unless thread.alive?
+            next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
+
+            event = collect_thread_event(thread, current_wall_time_ns)
+            events << event unless event.nil?
+          end
+
+          # Send events to recorder
+          recorder.push(events) unless events.empty?
+
+          events
+        end
+
+        def collect_thread_event(thread, current_wall_time_ns)
+          locations = thread.backtrace_locations
+          return if locations.nil?
+
+          # Having empty locations means that the thread is alive, but we don't know what it's doing:
+          #
+          # 1. It can be starting up
+          # ```
+          # > Thread.new { sleep }.backtrace
+          # => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
+          # ```
+          # 2. It can be running native code
+          # ```
+          # > t = Process.detach(fork { sleep })
+          # => #<Process::Waiter:0x00007ffe7285f7a0 run>
+          # > t.backtrace
+          # => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
+          # ```
+          # This effect has been observed in threads created by the Iodine web server and the ffi gem
+          #
+          # To give customers visibility into these threads, we replace the empty stack with one containing a
+          # synthetic placeholder frame, so that these threads are properly represented in the UX.
+          locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
+
+          # Get actual stack size then trim the stack
+          stack_size = locations.length
+          locations = locations[0..(max_frames - 1)]
+
+          # Convert backtrace locations into structs
+          locations = convert_backtrace_locations(locations)
+
+          thread_id = thread.object_id
+          root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
+          cpu_time = get_cpu_time_interval!(thread)
+          wall_time_interval_ns =
+            get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
+
+          Events::StackSample.new(
+            nil,
+            locations,
+            stack_size,
+            thread_id,
+            root_span_id,
+            span_id,
+            trace_resource,
+            cpu_time,
+            wall_time_interval_ns
+          )
+        end
+
+        def get_cpu_time_interval!(thread)
+          return unless cpu_time_provider
+
+          current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)
+
+          return unless current_cpu_time_ns
+
+          get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
+        end
+
+        def compute_wait_time(used_time)
+          # We took used_time to get the last sample.
+          #
+          # What we're computing here is -- if used_time corresponds to max_time_usage_pct of the time we should
+          # spend working, how much is (100% - max_time_usage_pct) of the time?
+          #
+          # For instance, if we took 10ms to sample, and max_time_usage_pct is 1%, then the other 99% is 990ms, which
+          # means we need to sleep for 990ms to guarantee that we don't spend more than 1% of the time working.
+          used_time_ns = used_time * 1e9
+          interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns
+          [interval / 1e9, MIN_INTERVAL].max
+        end
+
+        # Convert backtrace locations into structs
+        # Re-use old backtrace location objects if they already exist in the buffer
+        def convert_backtrace_locations(locations)
+          locations.collect do |location|
+            # Re-use existing BacktraceLocation if identical copy, otherwise build a new one.
+            @stack_sample_event_recorder.cache(:backtrace_locations).fetch(
+              # Function name
+              location.base_label,
+              # Line number
+              location.lineno,
+              # Filename
+              location.path,
+              # Build function
+              &@build_backtrace_location
+            )
+          end
+        end
+
+        def build_backtrace_location(_id, base_label, lineno, path)
+          string_table = @stack_sample_event_recorder.string_table
+
+          Profiling::BacktraceLocation.new(
+            string_table.fetch_string(base_label),
+            lineno,
+            string_table.fetch_string(path)
+          )
+        end
+
+        private
+
+        # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
+        # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
+        #
+        # a) negative time: At least on my test docker container, and on the reliability environment, after the process
+        # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
+        #
+        # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
+        # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
+        #
+        # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
+        def reset_cpu_time_tracking
+          thread_api.list.each do |thread|
+            # See below for details on why this is needed
+            next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+            thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
+            thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
+          end
+        end
+
+        def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
+          # Process::Waiter crash workaround:
+          #
+          # This is a workaround for a Ruby VM segfault (usually something like
+          # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
+          # See https://bugs.ruby-lang.org/issues/17807 for details.
+          #
+          # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
+          # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
+          # shows up when the `Process.detach` API gets used.
+          # In the specs you'll find crash regression tests that include a way of reproducing it.
+          #
+          # As workaround for now we just skip it for the affected Rubies
+          return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
+
+          last_value = thread.thread_variable_get(key) || current_value
+          thread.thread_variable_set(key, current_value)
+
+          current_value - last_value
+        end
+
+        # Whenever there are more than max_threads_sampled active, we only sample a subset of them.
+        # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
+        # a big burst of work all at once (sample everything), and instead do a little work each time
+        # (sample a bit by bit).
+        #
+        # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
+        # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
+        # them more often, if they are slower, we take them less often -- which again means that over a longer period
+        # we should take sample roughly the same samples.
+        #
+        # One downside of this approach is that if there really are many threads, the resulting wall clock times
+        # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
+        # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
+        # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
+        # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
+        # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
+        # threads).
+        #
+        def threads_to_sample
+          all_threads = thread_api.list
+
+          if all_threads.size > @max_threads_sampled
+            all_threads.sample(@max_threads_sampled)
+          else
+            all_threads
+          end
+        end
+
+        def get_current_wall_time_timestamp_ns
+          Core::Utils::Time.get_time(:nanosecond)
+        end
+      end
+    end
+  end
+end
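
Note: the wait-time throttling in compute_wait_time above can be checked in isolation. Below is a minimal, self-contained sketch in plain Ruby (not part of the gem) that restates the formula with the same numbers used in the comment.

    # Standalone restatement of OldStack#compute_wait_time (plain Ruby, no ddtrace required).
    MIN_INTERVAL = 0.01 # seconds, same floor as the collector above

    def compute_wait_time(used_time, max_time_usage_pct)
      used_time_ns = used_time * 1e9
      interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns
      [interval / 1e9, MIN_INTERVAL].max
    end

    puts compute_wait_time(0.010, 1.0)  # => 0.99 -- 10ms of sampling work, 1% budget, sleep ~990ms
    puts compute_wait_time(0.010, 2.0)  # => 0.49 -- default 2% budget, sleep ~490ms
    puts compute_wait_time(0.0001, 2.0) # => 0.01 -- very fast sample, clamped to MIN_INTERVAL
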
@@ -1,295 +1,14 @@
-# typed: true
-
-require 'datadog/core/utils/only_once'
-require 'datadog/core/utils/time'
-require 'datadog/core/worker'
-require 'datadog/core/workers/polling'
-require 'datadog/profiling/backtrace_location'
-require 'datadog/profiling/events/stack'
-require 'datadog/profiling/native_extension'
+# typed: false

 module Datadog
   module Profiling
     module Collectors
-      # Collects stack trace samples from Ruby threads for both CPU-time (if available) and wall-clock.
-      # Runs on its own background thread.
+      # Used to gather a stack trace from a given Ruby thread. Almost all of this class is implemented as native code.
       #
-      class Stack < Core::Worker # rubocop:disable Metrics/ClassLength
-        include Core::Workers::Polling
-
-        DEFAULT_MAX_TIME_USAGE_PCT = 2.0
-        MIN_INTERVAL = 0.01
-        THREAD_LAST_CPU_TIME_KEY = :datadog_profiler_last_cpu_time
-        THREAD_LAST_WALL_CLOCK_KEY = :datadog_profiler_last_wall_clock
-        SYNTHETIC_STACK_IN_NATIVE_CODE = [BacktraceLocation.new('', 0, 'In native code').freeze].freeze
-
-        # This default was picked based on the current sampling performance and on expected concurrency on an average
-        # Ruby MRI application. Lowering this optimizes for latency (less impact each time we sample), and raising
-        # optimizes for coverage (less chance to miss what a given thread is doing).
-        DEFAULT_MAX_THREADS_SAMPLED = 16
-
-        attr_reader \
-          :recorder,
-          :max_frames,
-          :trace_identifiers_helper,
-          :ignore_thread,
-          :max_time_usage_pct,
-          :thread_api,
-          :cpu_time_provider
-
-        def initialize(
-          recorder,
-          max_frames:,
-          trace_identifiers_helper:, # Usually an instance of Profiling::TraceIdentifiers::Helper
-          ignore_thread: nil,
-          max_time_usage_pct: DEFAULT_MAX_TIME_USAGE_PCT,
-          max_threads_sampled: DEFAULT_MAX_THREADS_SAMPLED,
-          thread_api: Thread,
-          cpu_time_provider: Profiling::NativeExtension,
-          fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
-          interval: MIN_INTERVAL,
-          enabled: true
-        )
-          @recorder = recorder
-          @max_frames = max_frames
-          @trace_identifiers_helper = trace_identifiers_helper
-          @ignore_thread = ignore_thread
-          @max_time_usage_pct = max_time_usage_pct
-          @max_threads_sampled = max_threads_sampled
-          @thread_api = thread_api
-          # Only set the provider if it's able to work in the current Ruby/OS combo
-          @cpu_time_provider = cpu_time_provider unless cpu_time_provider.cpu_time_ns_for(thread_api.current).nil?
-
-          # Workers::Async::Thread settings
-          self.fork_policy = fork_policy
-
-          # Workers::IntervalLoop settings
-          self.loop_base_interval = interval
-
-          # Workers::Polling settings
-          self.enabled = enabled
-
-          # Cache this proc, since it's pretty expensive to keep recreating it
-          @build_backtrace_location = method(:build_backtrace_location).to_proc
-          # Cache this buffer, since it's pretty expensive to keep accessing it
-          @stack_sample_event_recorder = recorder[Events::StackSample]
-          # See below for details on why this is needed
-          @needs_process_waiter_workaround =
-            Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.3') &&
-            Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.7')
-        end
-
-        def start
-          reset_cpu_time_tracking
-          perform
-        end
-
-        def perform
-          collect_and_wait
-        end
-
-        def collect_and_wait
-          run_time = Core::Utils::Time.measure do
-            collect_events
-          end
-
-          # Update wait time to throttle profiling
-          self.loop_wait_time = compute_wait_time(run_time)
-        end
-
-        def collect_events
-          events = []
-          current_wall_time_ns = get_current_wall_time_timestamp_ns
-
-          # Collect backtraces from each thread
-          threads_to_sample.each do |thread|
-            next unless thread.alive?
-            next if ignore_thread.is_a?(Proc) && ignore_thread.call(thread)
-
-            event = collect_thread_event(thread, current_wall_time_ns)
-            events << event unless event.nil?
-          end
-
-          # Send events to recorder
-          recorder.push(events) unless events.empty?
-
-          events
-        end
-
-        def collect_thread_event(thread, current_wall_time_ns)
-          locations = thread.backtrace_locations
-          return if locations.nil?
-
-          # Having empty locations means that the thread is alive, but we don't know what it's doing:
-          #
-          # 1. It can be starting up
-          # ```
-          # > Thread.new { sleep }.backtrace
-          # => [] # <-- note the thread hasn't actually started running sleep yet, we got there first
-          # ```
-          # 2. It can be running native code
-          # ```
-          # > t = Process.detach(fork { sleep })
-          # => #<Process::Waiter:0x00007ffe7285f7a0 run>
-          # > t.backtrace
-          # => [] # <-- this can happen even minutes later, e.g. it's not a race as in 1.
-          # ```
-          # This effect has been observed in threads created by the Iodine web server and the ffi gem
-          #
-          # To give customers visibility into these threads, we replace the empty stack with one containing a
-          # synthetic placeholder frame, so that these threads are properly represented in the UX.
-          locations = SYNTHETIC_STACK_IN_NATIVE_CODE if locations.empty?
-
-          # Get actual stack size then trim the stack
-          stack_size = locations.length
-          locations = locations[0..(max_frames - 1)]
-
-          # Convert backtrace locations into structs
-          locations = convert_backtrace_locations(locations)
-
-          thread_id = thread.object_id
-          root_span_id, span_id, trace_resource = trace_identifiers_helper.trace_identifiers_for(thread)
-          cpu_time = get_cpu_time_interval!(thread)
-          wall_time_interval_ns =
-            get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_WALL_CLOCK_KEY, current_wall_time_ns)
-
-          Events::StackSample.new(
-            nil,
-            locations,
-            stack_size,
-            thread_id,
-            root_span_id,
-            span_id,
-            trace_resource,
-            cpu_time,
-            wall_time_interval_ns
-          )
-        end
-
-        def get_cpu_time_interval!(thread)
-          return unless cpu_time_provider
-
-          current_cpu_time_ns = cpu_time_provider.cpu_time_ns_for(thread)
-
-          return unless current_cpu_time_ns
-
-          get_elapsed_since_last_sample_and_set_value(thread, THREAD_LAST_CPU_TIME_KEY, current_cpu_time_ns)
-        end
-
-        def compute_wait_time(used_time)
-          # We took used_time to get the last sample.
-          #
-          # What we're computing here is -- if used_time corresponds to max_time_usage_pct of the time we should
-          # spend working, how much is (100% - max_time_usage_pct) of the time?
-          #
-          # For instance, if we took 10ms to sample, and max_time_usage_pct is 1%, then the other 99% is 990ms, which
-          # means we need to sleep for 990ms to guarantee that we don't spend more than 1% of the time working.
-          used_time_ns = used_time * 1e9
-          interval = (used_time_ns / (max_time_usage_pct / 100.0)) - used_time_ns
-          [interval / 1e9, MIN_INTERVAL].max
-        end
-
-        # Convert backtrace locations into structs
-        # Re-use old backtrace location objects if they already exist in the buffer
-        def convert_backtrace_locations(locations)
-          locations.collect do |location|
-            # Re-use existing BacktraceLocation if identical copy, otherwise build a new one.
-            @stack_sample_event_recorder.cache(:backtrace_locations).fetch(
-              # Function name
-              location.base_label,
-              # Line number
-              location.lineno,
-              # Filename
-              location.path,
-              # Build function
-              &@build_backtrace_location
-            )
-          end
-        end
-
-        def build_backtrace_location(_id, base_label, lineno, path)
-          string_table = @stack_sample_event_recorder.string_table
-
-          Profiling::BacktraceLocation.new(
-            string_table.fetch_string(base_label),
-            lineno,
-            string_table.fetch_string(path)
-          )
-        end
-
-        private
-
-        # If the profiler is started for a while, stopped and then restarted OR whenever the process forks, we need to
-        # clean up any leftover per-thread counters, so that the first sample after starting doesn't end up with:
-        #
-        # a) negative time: At least on my test docker container, and on the reliability environment, after the process
-        # forks, the cpu time reference changes and (old cpu time - new cpu time) can be < 0
-        #
-        # b) large amount of time: if the profiler was started, then stopped for some amount of time, and then
-        # restarted, we don't want the first sample to be "blamed" for multiple minutes of CPU time
-        #
-        # By resetting the last cpu time seen, we start with a clean slate every time we start the stack collector.
-        def reset_cpu_time_tracking
-          thread_api.list.each do |thread|
-            # See below for details on why this is needed
-            next if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
-
-            thread.thread_variable_set(THREAD_LAST_CPU_TIME_KEY, nil)
-            thread.thread_variable_set(THREAD_LAST_WALL_CLOCK_KEY, nil)
-          end
-        end
-
-        def get_elapsed_since_last_sample_and_set_value(thread, key, current_value)
-          # Process::Waiter crash workaround:
-          #
-          # This is a workaround for a Ruby VM segfault (usually something like
-          # "[BUG] Segmentation fault at 0x0000000000000008") in the affected Ruby versions.
-          # See https://bugs.ruby-lang.org/issues/17807 for details.
-          #
-          # In those Ruby versions, there's a very special subclass of `Thread` called `Process::Waiter` that causes VM
-          # crashes whenever something tries to read its instance or thread variables. This subclass of thread only
-          # shows up when the `Process.detach` API gets used.
-          # In the specs you'll find crash regression tests that include a way of reproducing it.
-          #
-          # As workaround for now we just skip it for the affected Rubies
-          return 0 if @needs_process_waiter_workaround && thread.is_a?(::Process::Waiter)
-
-          last_value = thread.thread_variable_get(key) || current_value
-          thread.thread_variable_set(key, current_value)
-
-          current_value - last_value
-        end
-
-        # Whenever there are more than max_threads_sampled active, we only sample a subset of them.
-        # We do this to avoid impacting the latency of the service being profiled. We want to avoid doing
-        # a big burst of work all at once (sample everything), and instead do a little work each time
-        # (sample a bit by bit).
-        #
-        # Because we pick the threads to sample randomly, we'll eventually sample all threads -- just not at once.
-        # Notice also that this will interact with our dynamic sampling mechanism -- if samples are faster, we take
-        # them more often, if they are slower, we take them less often -- which again means that over a longer period
-        # we should take sample roughly the same samples.
-        #
-        # One downside of this approach is that if there really are many threads, the resulting wall clock times
-        # in a one minute profile may "drift" around the 60 second mark, e.g. maybe we only sampled a thread once per
-        # second and only 59 times, so we'll report 59s, but on the next report we'll include the missing one, so
-        # then the result will be 61s. I've observed 60 +- 1.68 secs for an app with ~65 threads, given the
-        # default maximum of 16 threads. This seems a reasonable enough margin of error given the improvement to
-        # latency (especially on such a large application! -> even bigger latency impact if we tried to sample all
-        # threads).
-        #
-        def threads_to_sample
-          all_threads = thread_api.list
-
-          if all_threads.size > @max_threads_sampled
-            all_threads.sample(@max_threads_sampled)
-          else
-            all_threads
-          end
-        end
-
-        def get_current_wall_time_timestamp_ns
-          Core::Utils::Time.get_time(:nanosecond)
+      # Methods prefixed with _native_ are implemented in `collectors_stack.c`
+      class Stack
+        def sample(thread, recorder_instance, metric_values_hash, labels_array, max_frames: 400)
+          self.class._native_sample(thread, recorder_instance, metric_values_hash, labels_array, max_frames)
         end
       end
     end
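
Note: the thread-subset sampling removed from this file (and kept in old_stack.rb above) is easy to reason about on its own. A minimal sketch in plain Ruby, using strings as stand-ins for Thread.list entries:

    # Standalone version of OldStack#threads_to_sample: with more live threads than the
    # limit, pick a random subset each pass, so over many passes all threads still get sampled.
    MAX_THREADS_SAMPLED = 16

    def threads_to_sample(all_threads, max_threads_sampled = MAX_THREADS_SAMPLED)
      if all_threads.size > max_threads_sampled
        all_threads.sample(max_threads_sampled) # Array#sample draws a random subset
      else
        all_threads
      end
    end

    all_threads = Array.new(65) { |i| "thread-#{i}" } # stand-ins for Thread.list entries
    p threads_to_sample(all_threads).size # => 16
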
@@ -3,7 +3,6 @@
 require 'set'
 require 'time'

-require 'datadog/core'
 require 'datadog/profiling/flush'
 require 'datadog/profiling/pprof/template'

@@ -30,7 +30,7 @@ module Datadog
 FORM_FIELD_TAG_ENV = 'env'.freeze
 FORM_FIELD_TAG_HOST = 'host'.freeze
 FORM_FIELD_TAG_LANGUAGE = 'language'.freeze
-FORM_FIELD_TAG_PID = 'pid'.freeze
+FORM_FIELD_TAG_PID = 'process_id'.freeze
 FORM_FIELD_TAG_PROFILER_VERSION = 'profiler_version'.freeze
 FORM_FIELD_TAG_RUNTIME = 'runtime'.freeze
 FORM_FIELD_TAG_RUNTIME_ENGINE = 'runtime_engine'.freeze
@@ -6,7 +6,7 @@ require 'datadog/core/environment/socket'
 module Datadog
   module Profiling
     # Entity class used to represent metadata for a given profile
-    Flush = Struct.new(
+    OldFlush = Struct.new(
       :start,
       :finish,
       :event_groups,
@@ -0,0 +1,22 @@
+# typed: ignore
+
+# This file is used to load the profiling native extension. It works in two steps:
+#
+# 1. Load the ddtrace_profiling_loader extension. This extension will be used to load the actual extension, but in
+# a special way that avoids exposing native-level code symbols. See `ddtrace_profiling_loader.c` for more details.
+#
+# 2. Use the Datadog::Profiling::Loader exposed by the ddtrace_profiling_loader extension to load the actual
+# profiling native extension.
+#
+# All code on this file is on-purpose at the top-level; this makes it so this file is executed only once,
+# the first time it gets required, to avoid any issues with the native extension being initialized more than once.
+
+require "ddtrace_profiling_loader.#{RUBY_VERSION}_#{RUBY_PLATFORM}"
+
+extension_name = "ddtrace_profiling_native_extension.#{RUBY_VERSION}_#{RUBY_PLATFORM}"
+full_file_path = "#{__dir__}/../../#{extension_name}.#{RbConfig::CONFIG['DLEXT']}"
+init_function_name = "Init_#{extension_name.split('.').first}"
+
+status, result = Datadog::Profiling::Loader._native_load(full_file_path, init_function_name)
+
+raise "Failure to load #{extension_name} due to #{result}" if status == :error
@@ -75,7 +75,7 @@ module Datadog

 code_provenance = @code_provenance_collector.refresh.generate_json if @code_provenance_collector

-Flush.new(
+OldFlush.new(
   start: start,
   finish: finish,
   event_groups: event_groups,
@@ -114,7 +114,7 @@ module Datadog
 # Sleep for a bit to cause misalignment between profilers in multi-process applications
 #
 # When not being run in a loop, it means the scheduler has not been started or was stopped, and thus
-# a) it's being shutting down (and is trying to report the last profile)
+# a) it's being shut down (and is trying to report the last profile)
 # b) it's being run as a one-shot, usually in a test
 # ...so in those cases we don't sleep
 #