ddtrace 1.5.2 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +44 -1
  3. data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +9 -2
  4. data/ext/ddtrace_profiling_loader/extconf.rb +17 -0
  5. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +38 -2
  6. data/ext/ddtrace_profiling_native_extension/clock_id.h +1 -0
  7. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +1 -0
  8. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +517 -42
  9. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +3 -0
  10. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +208 -30
  11. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +156 -46
  12. data/ext/ddtrace_profiling_native_extension/collectors_stack.h +11 -2
  13. data/ext/ddtrace_profiling_native_extension/extconf.rb +11 -1
  14. data/ext/ddtrace_profiling_native_extension/http_transport.c +83 -64
  15. data/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h +4 -4
  16. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +3 -2
  17. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +59 -0
  18. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
  19. data/ext/ddtrace_profiling_native_extension/profiling.c +10 -0
  20. data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +0 -1
  21. data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +4 -2
  22. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +45 -29
  23. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +7 -7
  24. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +4 -0
  25. data/lib/datadog/appsec/event.rb +6 -0
  26. data/lib/datadog/core/configuration/components.rb +20 -14
  27. data/lib/datadog/core/configuration/settings.rb +42 -4
  28. data/lib/datadog/core/diagnostics/environment_logger.rb +5 -1
  29. data/lib/datadog/core/utils/compression.rb +5 -1
  30. data/lib/datadog/core.rb +0 -54
  31. data/lib/datadog/profiling/collectors/cpu_and_wall_time.rb +12 -2
  32. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +5 -3
  33. data/lib/datadog/profiling/exporter.rb +2 -4
  34. data/lib/datadog/profiling/http_transport.rb +1 -1
  35. data/lib/datadog/tracing/configuration/ext.rb +1 -0
  36. data/lib/datadog/tracing/contrib/aws/instrumentation.rb +2 -0
  37. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  38. data/lib/datadog/tracing/contrib/dalli/instrumentation.rb +4 -0
  39. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +2 -0
  40. data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +3 -0
  41. data/lib/datadog/tracing/contrib/ethon/easy_patch.rb +2 -0
  42. data/lib/datadog/tracing/contrib/ethon/multi_patch.rb +2 -0
  43. data/lib/datadog/tracing/contrib/excon/middleware.rb +2 -0
  44. data/lib/datadog/tracing/contrib/ext.rb +6 -0
  45. data/lib/datadog/tracing/contrib/faraday/middleware.rb +2 -0
  46. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +5 -0
  47. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/server.rb +7 -1
  48. data/lib/datadog/tracing/contrib/grpc/ext.rb +2 -0
  49. data/lib/datadog/tracing/contrib/hanami/action_tracer.rb +47 -0
  50. data/lib/datadog/tracing/contrib/hanami/configuration/settings.rb +22 -0
  51. data/lib/datadog/tracing/contrib/hanami/ext.rb +24 -0
  52. data/lib/datadog/tracing/contrib/hanami/integration.rb +44 -0
  53. data/lib/datadog/tracing/contrib/hanami/patcher.rb +33 -0
  54. data/lib/datadog/tracing/contrib/hanami/plugin.rb +23 -0
  55. data/lib/datadog/tracing/contrib/hanami/renderer_policy_tracing.rb +41 -0
  56. data/lib/datadog/tracing/contrib/hanami/router_tracing.rb +44 -0
  57. data/lib/datadog/tracing/contrib/http/instrumentation.rb +2 -0
  58. data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +2 -0
  59. data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +2 -0
  60. data/lib/datadog/tracing/contrib/mongodb/ext.rb +7 -0
  61. data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +4 -0
  62. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +12 -0
  63. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  64. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +16 -0
  65. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +12 -0
  66. data/lib/datadog/tracing/contrib/pg/ext.rb +2 -1
  67. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +34 -18
  68. data/lib/datadog/tracing/contrib/propagation/sql_comment/comment.rb +43 -0
  69. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +32 -0
  70. data/lib/datadog/tracing/contrib/propagation/sql_comment/mode.rb +28 -0
  71. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +49 -0
  72. data/lib/datadog/tracing/contrib/rack/middlewares.rb +11 -5
  73. data/lib/datadog/tracing/contrib/redis/ext.rb +2 -0
  74. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +4 -2
  75. data/lib/datadog/tracing/contrib/redis/patcher.rb +41 -0
  76. data/lib/datadog/tracing/contrib/redis/tags.rb +5 -0
  77. data/lib/datadog/tracing/contrib/rest_client/request_patch.rb +2 -0
  78. data/lib/datadog/tracing/contrib/sinatra/env.rb +12 -23
  79. data/lib/datadog/tracing/contrib/sinatra/ext.rb +7 -3
  80. data/lib/datadog/tracing/contrib/sinatra/patcher.rb +2 -2
  81. data/lib/datadog/tracing/contrib/sinatra/tracer.rb +8 -80
  82. data/lib/datadog/tracing/contrib/sinatra/tracer_middleware.rb +14 -9
  83. data/lib/datadog/tracing/contrib.rb +1 -0
  84. data/lib/datadog/tracing/distributed/datadog_tags_codec.rb +84 -0
  85. data/lib/datadog/tracing/distributed/headers/datadog.rb +122 -30
  86. data/lib/datadog/tracing/distributed/headers/ext.rb +2 -0
  87. data/lib/datadog/tracing/flush.rb +1 -1
  88. data/lib/datadog/tracing/metadata/ext.rb +8 -0
  89. data/lib/datadog/tracing/propagation/http.rb +9 -1
  90. data/lib/datadog/tracing/sampling/ext.rb +31 -0
  91. data/lib/datadog/tracing/sampling/priority_sampler.rb +46 -4
  92. data/lib/datadog/tracing/sampling/rate_by_key_sampler.rb +8 -9
  93. data/lib/datadog/tracing/sampling/rate_by_service_sampler.rb +29 -5
  94. data/lib/datadog/tracing/sampling/rate_sampler.rb +10 -3
  95. data/lib/datadog/tracing/sampling/rule_sampler.rb +4 -3
  96. data/lib/datadog/tracing/sampling/span/ext.rb +0 -4
  97. data/lib/datadog/tracing/sampling/span/rule.rb +1 -1
  98. data/lib/datadog/tracing/sampling/span/sampler.rb +14 -3
  99. data/lib/datadog/tracing/trace_digest.rb +3 -0
  100. data/lib/datadog/tracing/trace_operation.rb +10 -0
  101. data/lib/datadog/tracing/trace_segment.rb +6 -0
  102. data/lib/datadog/tracing/tracer.rb +3 -1
  103. data/lib/datadog/tracing/writer.rb +7 -0
  104. data/lib/ddtrace/transport/trace_formatter.rb +7 -0
  105. data/lib/ddtrace/transport/traces.rb +1 -1
  106. data/lib/ddtrace/version.rb +2 -2
  107. metadata +18 -14
  108. data/lib/datadog/profiling/old_ext.rb +0 -42
  109. data/lib/datadog/profiling/transport/http/api/endpoint.rb +0 -85
  110. data/lib/datadog/profiling/transport/http/api/instance.rb +0 -38
  111. data/lib/datadog/profiling/transport/http/api/spec.rb +0 -42
  112. data/lib/datadog/profiling/transport/http/api.rb +0 -45
  113. data/lib/datadog/profiling/transport/http/builder.rb +0 -30
  114. data/lib/datadog/profiling/transport/http/client.rb +0 -37
  115. data/lib/datadog/profiling/transport/http/response.rb +0 -21
  116. data/lib/datadog/profiling/transport/http.rb +0 -118
@@ -3,4 +3,7 @@
3
3
  #include <ruby.h>
4
4
 
5
5
  VALUE cpu_and_wall_time_collector_sample(VALUE self_instance);
6
+ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance);
7
+ void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance);
8
+ void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance);
6
9
  VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object);
@@ -64,18 +64,28 @@ struct cpu_and_wall_time_worker_state {
64
64
  // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
65
65
  // atomic operations. stdatomic.h seems a nice thing to reach out for.
66
66
  volatile bool should_run;
67
-
67
+ bool gc_profiling_enabled;
68
68
  VALUE cpu_and_wall_time_collector_instance;
69
+
69
70
  // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
70
71
  // the CpuAndWallTimeWorker thread
71
72
  VALUE failure_exception;
73
+
74
+ // Used to get gc start/finish information
75
+ VALUE gc_tracepoint;
72
76
  };
73
77
 
74
78
  static VALUE _native_new(VALUE klass);
75
- static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
79
+ static VALUE _native_initialize(
80
+ DDTRACE_UNUSED VALUE _self,
81
+ VALUE self_instance,
82
+ VALUE cpu_and_wall_time_collector_instance,
83
+ VALUE gc_profiling_enabled
84
+ );
76
85
  static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
77
86
  static VALUE _native_sampling_loop(VALUE self, VALUE instance);
78
87
  static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
88
+ static VALUE stop(VALUE self_instance, VALUE optional_exception);
79
89
  static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
80
90
  static void remove_sigprof_signal_handler(void);
81
91
  static void block_sigprof_signal_handler_from_running_in_current_thread(void);
@@ -90,6 +100,13 @@ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
90
100
  static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
91
101
  static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
92
102
  static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
103
+ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
104
+ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
105
+ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
106
+ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
107
+ static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
108
+ static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
109
+ static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
93
110
 
94
111
  // Global state -- be very careful when accessing or modifying it
95
112
 
@@ -119,13 +136,17 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
119
136
  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
120
137
  rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
121
138
 
122
- rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
139
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
123
140
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
124
141
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
125
142
  rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
126
143
  rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
127
144
  rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
128
145
  rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
146
+ rb_define_singleton_method(testing_module, "_native_trigger_sample", _native_trigger_sample, 0);
147
+ rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
148
+ rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
149
+ rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
129
150
  }
130
151
 
131
152
  // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -145,17 +166,28 @@ static VALUE _native_new(VALUE klass) {
145
166
  struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
146
167
 
147
168
  state->should_run = false;
169
+ state->gc_profiling_enabled = false;
148
170
  state->cpu_and_wall_time_collector_instance = Qnil;
149
171
  state->failure_exception = Qnil;
172
+ state->gc_tracepoint = Qnil;
150
173
 
151
174
  return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
152
175
  }
153
176
 
154
- static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
177
+ static VALUE _native_initialize(
178
+ DDTRACE_UNUSED VALUE _self,
179
+ VALUE self_instance,
180
+ VALUE cpu_and_wall_time_collector_instance,
181
+ VALUE gc_profiling_enabled
182
+ ) {
183
+ ENFORCE_BOOLEAN(gc_profiling_enabled);
184
+
155
185
  struct cpu_and_wall_time_worker_state *state;
156
186
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
157
187
 
188
+ state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
158
189
  state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
190
+ state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
159
191
 
160
192
  return Qtrue;
161
193
  }
@@ -166,6 +198,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
166
198
 
167
199
  rb_gc_mark(state->cpu_and_wall_time_collector_instance);
168
200
  rb_gc_mark(state->failure_exception);
201
+ rb_gc_mark(state->gc_tracepoint);
169
202
  }
170
203
 
171
204
  // Called in a background thread created in CpuAndWallTimeWorker#start
@@ -173,11 +206,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
173
206
  struct cpu_and_wall_time_worker_state *state;
174
207
  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
175
208
 
176
- if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
177
- rb_raise(
178
- rb_eRuntimeError,
179
- "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
180
- );
209
+ if (active_sampler_owner_thread != Qnil) {
210
+ if (is_thread_alive(active_sampler_owner_thread)) {
211
+ rb_raise(
212
+ rb_eRuntimeError,
213
+ "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
214
+ );
215
+ } else {
216
+ // The previously active thread seems to have died without cleaning up after itself.
217
+ // In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
218
+ // first as:
219
+ // a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
220
+ // being kept around
221
+ // b) If this is the same instance of the CpuAndWallTimeWorker, and we call enable on a tracepoint that is already
222
+ // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.
223
+
224
+ struct cpu_and_wall_time_worker_state *old_state;
225
+ TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
226
+ rb_tracepoint_disable(old_state->gc_tracepoint);
227
+ }
181
228
  }
182
229
 
183
230
  // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
@@ -189,6 +236,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
189
236
  block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
190
237
 
191
238
  install_sigprof_signal_handler(handle_sampling_signal);
239
+ if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
192
240
 
193
241
  // Release GVL, get to the actual work!
194
242
  int exception_state;
@@ -196,6 +244,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
196
244
 
197
245
  // The sample trigger loop finished (either cleanly or with an error); let's clean up
198
246
 
247
+ rb_tracepoint_disable(state->gc_tracepoint);
199
248
  remove_sigprof_signal_handler();
200
249
  active_sampler_instance = Qnil;
201
250
  active_sampler_owner_thread = Qnil;
@@ -209,10 +258,18 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
209
258
  }
210
259
 
211
260
  static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
261
+ return stop(self_instance, /* optional_exception: */ Qnil);
262
+ }
263
+
264
+ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
212
265
  struct cpu_and_wall_time_worker_state *state;
213
266
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
214
267
 
215
268
  state->should_run = false;
269
+ state->failure_exception = optional_exception;
270
+
271
+ // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
272
+ rb_tracepoint_disable(state->gc_tracepoint);
216
273
 
217
274
  return Qtrue;
218
275
  }
@@ -264,10 +321,16 @@ static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
264
321
  pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
265
322
  }
266
323
 
324
+ // NOTE: Remember that this will run in the thread and within the scope of user code, including user C code.
325
+ // We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
326
+ // we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
267
327
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
268
328
  if (!ruby_thread_has_gvl_p()) {
269
329
  return; // Not safe to enqueue a sample from this thread
270
330
  }
331
+ if (!ddtrace_rb_ractor_main_p()) {
332
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
333
+ }
271
334
 
272
335
  // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
273
336
  // a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
@@ -315,34 +378,23 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
315
378
  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
316
379
  if (instance == Qnil) return;
317
380
 
381
+ // @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
382
+ // it's running on the main Ractor, but just in case...
383
+ if (!ddtrace_rb_ractor_main_p()) {
384
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
385
+ }
386
+
318
387
  struct cpu_and_wall_time_worker_state *state;
319
388
  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
320
389
 
321
390
  // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
322
- VALUE (*function_to_call_safely)(VALUE) = cpu_and_wall_time_collector_sample;
323
- VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
324
- VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
325
- VALUE exception_handler_function_arg = instance;
326
- rb_rescue2(
327
- function_to_call_safely,
328
- function_to_call_safely_arg,
329
- exception_handler_function,
330
- exception_handler_function_arg,
331
- rb_eException, // rb_eException is the base class of all Ruby exceptions
332
- 0 // Required by API to be the last argument
333
- );
391
+ safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
334
392
  }
335
393
 
336
- static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
337
- struct cpu_and_wall_time_worker_state *state;
338
- TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
339
-
340
- state->should_run = false;
341
- state->failure_exception = exception;
342
-
343
- return Qnil;
344
- }
394
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
345
395
 
396
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
397
+ // It SHOULD NOT be used for other purposes.
346
398
  static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
347
399
  struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
348
400
  if (sigaction(SIGPROF, NULL, &existing_signal_handler_config) != 0) {
@@ -370,6 +422,8 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
370
422
  return Qnil;
371
423
  }
372
424
 
425
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
426
+ // It SHOULD NOT be used for other purposes.
373
427
  static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
374
428
  return \
375
429
  (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
@@ -380,12 +434,136 @@ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
380
434
  /* Does nothing on purpose */
381
435
  }
382
436
 
437
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
438
+ // It SHOULD NOT be used for other purposes.
383
439
  static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
384
440
  install_sigprof_signal_handler(testing_signal_handler);
385
441
  return Qtrue;
386
442
  }
387
443
 
444
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
445
+ // It SHOULD NOT be used for other purposes.
388
446
  static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
389
447
  remove_sigprof_signal_handler();
390
448
  return Qtrue;
391
449
  }
450
+
451
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
452
+ // It SHOULD NOT be used for other purposes.
453
+ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self) {
454
+ sample_from_postponed_job(NULL);
455
+ return Qtrue;
456
+ }
457
+
458
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
459
+ // It SHOULD NOT be used for other purposes.
460
+ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance) {
461
+ struct cpu_and_wall_time_worker_state *state;
462
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
463
+
464
+ return state->gc_tracepoint;
465
+ }
466
+
467
+ // Implements tracking of cpu-time and wall-time spent doing GC. This function is called by Ruby from the `gc_tracepoint`
468
+ // when the RUBY_INTERNAL_EVENT_GC_ENTER and RUBY_INTERNAL_EVENT_GC_EXIT events are triggered.
469
+ //
470
+ // See the comments on
471
+ // * cpu_and_wall_time_collector_on_gc_start
472
+ // * cpu_and_wall_time_collector_on_gc_finish
473
+ // * cpu_and_wall_time_collector_sample_after_gc
474
+ //
475
+ // For the expected times in which to call them, and their assumptions.
476
+ //
477
+ // Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection,
478
+ // *NO ALLOCATION* is allowed. This function, and any function it calls, must never trigger memory or object allocation.
479
+ // This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
480
+ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
481
+ if (!ddtrace_rb_ractor_main_p()) {
482
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
483
+ }
484
+
485
+ int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
486
+ if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event
487
+
488
+ VALUE instance = active_sampler_instance; // Read from global variable
489
+
490
+ // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
491
+ // and disabled before it is cleared, but just in case...
492
+ if (instance == Qnil) return;
493
+
494
+ struct cpu_and_wall_time_worker_state *state;
495
+ if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
496
+ // This should never fail the the above check passes
497
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
498
+
499
+ if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
500
+ cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
501
+ } else if (event == RUBY_INTERNAL_EVENT_GC_EXIT) {
502
+ // Design: In an earlier iteration of this feature (see https://github.com/DataDog/dd-trace-rb/pull/2308) we
503
+ // actually had a single method to implement the behavior of both cpu_and_wall_time_collector_on_gc_finish
504
+ // and cpu_and_wall_time_collector_sample_after_gc (the latter is called via after_gc_from_postponed_job).
505
+ //
506
+ // Unfortunately, then we discovered the safety issue around no allocations, and thus decided to separate them -- so that
507
+ // the sampling could run outside the tight safety constraints of the garbage collection process.
508
+ //
509
+ // There is a downside: The sample is now taken very very shortly afterwards the GC finishes, and not immediately
510
+ // as the GC finishes, which means the stack captured may by affected by "skid", e.g. point slightly after where
511
+ // it should be pointing at.
512
+ // Alternatives to solve this would be to capture no stack for garbage collection (as we do for Java and .net);
513
+ // making the sampling process allocation-safe (very hard); or separate stack sampling from sample recording,
514
+ // e.g. enabling us to capture the stack in cpu_and_wall_time_collector_on_gc_finish and do the rest later
515
+ // (medium hard).
516
+
517
+ cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
518
+ // We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after it
519
+ // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
520
+ rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
521
+ }
522
+ }
523
+
524
+ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
525
+ VALUE instance = active_sampler_instance; // Read from global variable
526
+
527
+ // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
528
+ if (instance == Qnil) return;
529
+
530
+ // @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
531
+ // it's running on the main Ractor, but just in case...
532
+ if (!ddtrace_rb_ractor_main_p()) {
533
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
534
+ }
535
+
536
+ struct cpu_and_wall_time_worker_state *state;
537
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
538
+
539
+ // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
540
+ safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
541
+ }
542
+
543
+ // Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
544
+ // exception gets raised within
545
+ static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
546
+ VALUE exception_handler_function_arg = instance;
547
+ rb_rescue2(
548
+ function_to_call_safely,
549
+ function_to_call_safely_arg,
550
+ handle_sampling_failure,
551
+ exception_handler_function_arg,
552
+ rb_eException, // rb_eException is the base class of all Ruby exceptions
553
+ 0 // Required by API to be the last argument
554
+ );
555
+ }
556
+
557
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
558
+ // It SHOULD NOT be used for other purposes.
559
+ static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self) {
560
+ handle_sampling_signal(0, NULL, NULL);
561
+ return Qtrue;
562
+ }
563
+
564
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
565
+ // It SHOULD NOT be used for other purposes.
566
+ static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self) {
567
+ sample_from_postponed_job(NULL);
568
+ return Qtrue;
569
+ }