ddtrace 1.5.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +44 -1
  3. data/ext/ddtrace_profiling_loader/ddtrace_profiling_loader.c +9 -2
  4. data/ext/ddtrace_profiling_loader/extconf.rb +17 -0
  5. data/ext/ddtrace_profiling_native_extension/NativeExtensionDesign.md +38 -2
  6. data/ext/ddtrace_profiling_native_extension/clock_id.h +1 -0
  7. data/ext/ddtrace_profiling_native_extension/clock_id_from_pthread.c +1 -0
  8. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.c +517 -42
  9. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time.h +3 -0
  10. data/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +208 -30
  11. data/ext/ddtrace_profiling_native_extension/collectors_stack.c +156 -46
  12. data/ext/ddtrace_profiling_native_extension/collectors_stack.h +11 -2
  13. data/ext/ddtrace_profiling_native_extension/extconf.rb +11 -1
  14. data/ext/ddtrace_profiling_native_extension/http_transport.c +83 -64
  15. data/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h +4 -4
  16. data/ext/ddtrace_profiling_native_extension/native_extension_helpers.rb +3 -2
  17. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.c +59 -0
  18. data/ext/ddtrace_profiling_native_extension/private_vm_api_access.h +3 -0
  19. data/ext/ddtrace_profiling_native_extension/profiling.c +10 -0
  20. data/ext/ddtrace_profiling_native_extension/ruby_helpers.c +0 -1
  21. data/ext/ddtrace_profiling_native_extension/ruby_helpers.h +4 -2
  22. data/ext/ddtrace_profiling_native_extension/stack_recorder.c +45 -29
  23. data/ext/ddtrace_profiling_native_extension/stack_recorder.h +7 -7
  24. data/lib/datadog/appsec/contrib/rack/request_middleware.rb +4 -0
  25. data/lib/datadog/appsec/event.rb +6 -0
  26. data/lib/datadog/core/configuration/components.rb +20 -14
  27. data/lib/datadog/core/configuration/settings.rb +42 -4
  28. data/lib/datadog/core/diagnostics/environment_logger.rb +5 -1
  29. data/lib/datadog/core/utils/compression.rb +5 -1
  30. data/lib/datadog/core.rb +0 -54
  31. data/lib/datadog/profiling/collectors/cpu_and_wall_time.rb +12 -2
  32. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +5 -3
  33. data/lib/datadog/profiling/exporter.rb +2 -4
  34. data/lib/datadog/profiling/http_transport.rb +1 -1
  35. data/lib/datadog/tracing/configuration/ext.rb +1 -0
  36. data/lib/datadog/tracing/contrib/aws/instrumentation.rb +2 -0
  37. data/lib/datadog/tracing/contrib/dalli/ext.rb +1 -0
  38. data/lib/datadog/tracing/contrib/dalli/instrumentation.rb +4 -0
  39. data/lib/datadog/tracing/contrib/elasticsearch/ext.rb +2 -0
  40. data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +3 -0
  41. data/lib/datadog/tracing/contrib/ethon/easy_patch.rb +2 -0
  42. data/lib/datadog/tracing/contrib/ethon/multi_patch.rb +2 -0
  43. data/lib/datadog/tracing/contrib/excon/middleware.rb +2 -0
  44. data/lib/datadog/tracing/contrib/ext.rb +6 -0
  45. data/lib/datadog/tracing/contrib/faraday/middleware.rb +2 -0
  46. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/client.rb +5 -0
  47. data/lib/datadog/tracing/contrib/grpc/datadog_interceptor/server.rb +7 -1
  48. data/lib/datadog/tracing/contrib/grpc/ext.rb +2 -0
  49. data/lib/datadog/tracing/contrib/hanami/action_tracer.rb +47 -0
  50. data/lib/datadog/tracing/contrib/hanami/configuration/settings.rb +22 -0
  51. data/lib/datadog/tracing/contrib/hanami/ext.rb +24 -0
  52. data/lib/datadog/tracing/contrib/hanami/integration.rb +44 -0
  53. data/lib/datadog/tracing/contrib/hanami/patcher.rb +33 -0
  54. data/lib/datadog/tracing/contrib/hanami/plugin.rb +23 -0
  55. data/lib/datadog/tracing/contrib/hanami/renderer_policy_tracing.rb +41 -0
  56. data/lib/datadog/tracing/contrib/hanami/router_tracing.rb +44 -0
  57. data/lib/datadog/tracing/contrib/http/instrumentation.rb +2 -0
  58. data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +2 -0
  59. data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +2 -0
  60. data/lib/datadog/tracing/contrib/mongodb/ext.rb +7 -0
  61. data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +4 -0
  62. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +12 -0
  63. data/lib/datadog/tracing/contrib/mysql2/ext.rb +1 -0
  64. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +16 -0
  65. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +12 -0
  66. data/lib/datadog/tracing/contrib/pg/ext.rb +2 -1
  67. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +34 -18
  68. data/lib/datadog/tracing/contrib/propagation/sql_comment/comment.rb +43 -0
  69. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +32 -0
  70. data/lib/datadog/tracing/contrib/propagation/sql_comment/mode.rb +28 -0
  71. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +49 -0
  72. data/lib/datadog/tracing/contrib/rack/middlewares.rb +11 -5
  73. data/lib/datadog/tracing/contrib/redis/ext.rb +2 -0
  74. data/lib/datadog/tracing/contrib/redis/instrumentation.rb +4 -2
  75. data/lib/datadog/tracing/contrib/redis/patcher.rb +41 -0
  76. data/lib/datadog/tracing/contrib/redis/tags.rb +5 -0
  77. data/lib/datadog/tracing/contrib/rest_client/request_patch.rb +2 -0
  78. data/lib/datadog/tracing/contrib/sinatra/env.rb +12 -23
  79. data/lib/datadog/tracing/contrib/sinatra/ext.rb +7 -3
  80. data/lib/datadog/tracing/contrib/sinatra/patcher.rb +2 -2
  81. data/lib/datadog/tracing/contrib/sinatra/tracer.rb +8 -80
  82. data/lib/datadog/tracing/contrib/sinatra/tracer_middleware.rb +14 -9
  83. data/lib/datadog/tracing/contrib.rb +1 -0
  84. data/lib/datadog/tracing/distributed/datadog_tags_codec.rb +84 -0
  85. data/lib/datadog/tracing/distributed/headers/datadog.rb +122 -30
  86. data/lib/datadog/tracing/distributed/headers/ext.rb +2 -0
  87. data/lib/datadog/tracing/flush.rb +1 -1
  88. data/lib/datadog/tracing/metadata/ext.rb +8 -0
  89. data/lib/datadog/tracing/propagation/http.rb +9 -1
  90. data/lib/datadog/tracing/sampling/ext.rb +31 -0
  91. data/lib/datadog/tracing/sampling/priority_sampler.rb +46 -4
  92. data/lib/datadog/tracing/sampling/rate_by_key_sampler.rb +8 -9
  93. data/lib/datadog/tracing/sampling/rate_by_service_sampler.rb +29 -5
  94. data/lib/datadog/tracing/sampling/rate_sampler.rb +10 -3
  95. data/lib/datadog/tracing/sampling/rule_sampler.rb +4 -3
  96. data/lib/datadog/tracing/sampling/span/ext.rb +0 -4
  97. data/lib/datadog/tracing/sampling/span/rule.rb +1 -1
  98. data/lib/datadog/tracing/sampling/span/sampler.rb +14 -3
  99. data/lib/datadog/tracing/trace_digest.rb +3 -0
  100. data/lib/datadog/tracing/trace_operation.rb +10 -0
  101. data/lib/datadog/tracing/trace_segment.rb +6 -0
  102. data/lib/datadog/tracing/tracer.rb +3 -1
  103. data/lib/datadog/tracing/writer.rb +7 -0
  104. data/lib/ddtrace/transport/trace_formatter.rb +7 -0
  105. data/lib/ddtrace/transport/traces.rb +1 -1
  106. data/lib/ddtrace/version.rb +2 -2
  107. metadata +18 -14
  108. data/lib/datadog/profiling/old_ext.rb +0 -42
  109. data/lib/datadog/profiling/transport/http/api/endpoint.rb +0 -85
  110. data/lib/datadog/profiling/transport/http/api/instance.rb +0 -38
  111. data/lib/datadog/profiling/transport/http/api/spec.rb +0 -42
  112. data/lib/datadog/profiling/transport/http/api.rb +0 -45
  113. data/lib/datadog/profiling/transport/http/builder.rb +0 -30
  114. data/lib/datadog/profiling/transport/http/client.rb +0 -37
  115. data/lib/datadog/profiling/transport/http/response.rb +0 -21
  116. data/lib/datadog/profiling/transport/http.rb +0 -118
@@ -3,4 +3,7 @@
3
3
  #include <ruby.h>
4
4
 
5
5
  VALUE cpu_and_wall_time_collector_sample(VALUE self_instance);
6
+ VALUE cpu_and_wall_time_collector_sample_after_gc(VALUE self_instance);
7
+ void cpu_and_wall_time_collector_on_gc_start(VALUE self_instance);
8
+ void cpu_and_wall_time_collector_on_gc_finish(VALUE self_instance);
6
9
  VALUE enforce_cpu_and_wall_time_collector_instance(VALUE object);
@@ -64,18 +64,28 @@ struct cpu_and_wall_time_worker_state {
64
64
  // telling the sampling trigger loop to stop, but if we ever need to communicate more, we should move to actual
65
65
  // atomic operations. stdatomic.h seems a nice thing to reach out for.
66
66
  volatile bool should_run;
67
-
67
+ bool gc_profiling_enabled;
68
68
  VALUE cpu_and_wall_time_collector_instance;
69
+
69
70
  // When something goes wrong during sampling, we record the Ruby exception here, so that it can be "re-raised" on
70
71
  // the CpuAndWallTimeWorker thread
71
72
  VALUE failure_exception;
73
+
74
+ // Used to get gc start/finish information
75
+ VALUE gc_tracepoint;
72
76
  };
73
77
 
74
78
  static VALUE _native_new(VALUE klass);
75
- static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance);
79
+ static VALUE _native_initialize(
80
+ DDTRACE_UNUSED VALUE _self,
81
+ VALUE self_instance,
82
+ VALUE cpu_and_wall_time_collector_instance,
83
+ VALUE gc_profiling_enabled
84
+ );
76
85
  static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
77
86
  static VALUE _native_sampling_loop(VALUE self, VALUE instance);
78
87
  static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance);
88
+ static VALUE stop(VALUE self_instance, VALUE optional_exception);
79
89
  static void install_sigprof_signal_handler(void (*signal_handler_function)(int, siginfo_t *, void *));
80
90
  static void remove_sigprof_signal_handler(void);
81
91
  static void block_sigprof_signal_handler_from_running_in_current_thread(void);
@@ -90,6 +100,13 @@ static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance);
90
100
  static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext);
91
101
  static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self);
92
102
  static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self);
103
+ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self);
104
+ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance);
105
+ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused);
106
+ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused);
107
+ static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance);
108
+ static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self);
109
+ static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self);
93
110
 
94
111
  // Global state -- be very careful when accessing or modifying it
95
112
 
@@ -119,13 +136,17 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
119
136
  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
120
137
  rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
121
138
 
122
- rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 2);
139
+ rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 3);
123
140
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
124
141
  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 1);
125
142
  rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
126
143
  rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
127
144
  rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
128
145
  rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
146
+ rb_define_singleton_method(testing_module, "_native_trigger_sample", _native_trigger_sample, 0);
147
+ rb_define_singleton_method(testing_module, "_native_gc_tracepoint", _native_gc_tracepoint, 1);
148
+ rb_define_singleton_method(testing_module, "_native_simulate_handle_sampling_signal", _native_simulate_handle_sampling_signal, 0);
149
+ rb_define_singleton_method(testing_module, "_native_simulate_sample_from_postponed_job", _native_simulate_sample_from_postponed_job, 0);
129
150
  }
130
151
 
131
152
  // This structure is used to define a Ruby object that stores a pointer to a struct cpu_and_wall_time_worker_state
@@ -145,17 +166,28 @@ static VALUE _native_new(VALUE klass) {
145
166
  struct cpu_and_wall_time_worker_state *state = ruby_xcalloc(1, sizeof(struct cpu_and_wall_time_worker_state));
146
167
 
147
168
  state->should_run = false;
169
+ state->gc_profiling_enabled = false;
148
170
  state->cpu_and_wall_time_collector_instance = Qnil;
149
171
  state->failure_exception = Qnil;
172
+ state->gc_tracepoint = Qnil;
150
173
 
151
174
  return TypedData_Wrap_Struct(klass, &cpu_and_wall_time_worker_typed_data, state);
152
175
  }
153
176
 
154
- static VALUE _native_initialize(DDTRACE_UNUSED VALUE _self, VALUE self_instance, VALUE cpu_and_wall_time_collector_instance) {
177
+ static VALUE _native_initialize(
178
+ DDTRACE_UNUSED VALUE _self,
179
+ VALUE self_instance,
180
+ VALUE cpu_and_wall_time_collector_instance,
181
+ VALUE gc_profiling_enabled
182
+ ) {
183
+ ENFORCE_BOOLEAN(gc_profiling_enabled);
184
+
155
185
  struct cpu_and_wall_time_worker_state *state;
156
186
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
157
187
 
188
+ state->gc_profiling_enabled = (gc_profiling_enabled == Qtrue);
158
189
  state->cpu_and_wall_time_collector_instance = enforce_cpu_and_wall_time_collector_instance(cpu_and_wall_time_collector_instance);
190
+ state->gc_tracepoint = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_GC_ENTER | RUBY_INTERNAL_EVENT_GC_EXIT, on_gc_event, NULL /* unused */);
159
191
 
160
192
  return Qtrue;
161
193
  }
@@ -166,6 +198,7 @@ static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr) {
166
198
 
167
199
  rb_gc_mark(state->cpu_and_wall_time_collector_instance);
168
200
  rb_gc_mark(state->failure_exception);
201
+ rb_gc_mark(state->gc_tracepoint);
169
202
  }
170
203
 
171
204
  // Called in a background thread created in CpuAndWallTimeWorker#start
@@ -173,11 +206,25 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
173
206
  struct cpu_and_wall_time_worker_state *state;
174
207
  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
175
208
 
176
- if (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread)) {
177
- rb_raise(
178
- rb_eRuntimeError,
179
- "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
180
- );
209
+ if (active_sampler_owner_thread != Qnil) {
210
+ if (is_thread_alive(active_sampler_owner_thread)) {
211
+ rb_raise(
212
+ rb_eRuntimeError,
213
+ "Could not start CpuAndWallTimeWorker: There's already another instance of CpuAndWallTimeWorker active in a different thread"
214
+ );
215
+ } else {
216
+ // The previously active thread seems to have died without cleaning up after itself.
217
+ // In this case, we can still go ahead and start the profiler BUT we make sure to disable any existing GC tracepoint
218
+ // first as:
219
+ // a) If this is a new instance of the CpuAndWallTimeWorker, we don't want the tracepoint from the old instance
220
+ // being kept around
221
+ // b) If this is the same instance of the CpuAndWallTimeWorker and we call enable on a tracepoint that is already
222
+ // enabled, it will start firing more than once, see https://bugs.ruby-lang.org/issues/19114 for details.
223
+
224
+ struct cpu_and_wall_time_worker_state *old_state;
225
+ TypedData_Get_Struct(active_sampler_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, old_state);
226
+ rb_tracepoint_disable(old_state->gc_tracepoint);
227
+ }
181
228
  }
182
229
 
183
230
  // This write to a global is thread-safe BECAUSE we're still holding on to the global VM lock at this point
@@ -189,6 +236,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
189
236
  block_sigprof_signal_handler_from_running_in_current_thread(); // We want to interrupt the thread with the global VM lock, never this one
190
237
 
191
238
  install_sigprof_signal_handler(handle_sampling_signal);
239
+ if (state->gc_profiling_enabled) rb_tracepoint_enable(state->gc_tracepoint);
192
240
 
193
241
  // Release GVL, get to the actual work!
194
242
  int exception_state;
@@ -196,6 +244,7 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
196
244
 
197
245
  // The sample trigger loop finished (either cleanly or with an error); let's clean up
198
246
 
247
+ rb_tracepoint_disable(state->gc_tracepoint);
199
248
  remove_sigprof_signal_handler();
200
249
  active_sampler_instance = Qnil;
201
250
  active_sampler_owner_thread = Qnil;
@@ -209,10 +258,18 @@ static VALUE _native_sampling_loop(DDTRACE_UNUSED VALUE _self, VALUE instance) {
209
258
  }
210
259
 
211
260
  static VALUE _native_stop(DDTRACE_UNUSED VALUE _self, VALUE self_instance) {
261
+ return stop(self_instance, /* optional_exception: */ Qnil);
262
+ }
263
+
264
+ static VALUE stop(VALUE self_instance, VALUE optional_exception) {
212
265
  struct cpu_and_wall_time_worker_state *state;
213
266
  TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
214
267
 
215
268
  state->should_run = false;
269
+ state->failure_exception = optional_exception;
270
+
271
+ // Disable the GC tracepoint as soon as possible, so the VM doesn't keep on calling it
272
+ rb_tracepoint_disable(state->gc_tracepoint);
216
273
 
217
274
  return Qtrue;
218
275
  }
@@ -264,10 +321,16 @@ static void block_sigprof_signal_handler_from_running_in_current_thread(void) {
264
321
  pthread_sigmask(SIG_BLOCK, &signals_to_block, NULL);
265
322
  }
266
323
 
324
+ // NOTE: Remember that this will run in the thread and within the scope of user code, including user C code.
325
+ // We need to be careful not to change any state that may be observed OR to restore it if we do. For instance, if anything
326
+ // we do here can set `errno`, then we must be careful to restore the old `errno` after the fact.
267
327
  static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED siginfo_t *_info, DDTRACE_UNUSED void *_ucontext) {
268
328
  if (!ruby_thread_has_gvl_p()) {
269
329
  return; // Not safe to enqueue a sample from this thread
270
330
  }
331
+ if (!ddtrace_rb_ractor_main_p()) {
332
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
333
+ }
271
334
 
272
335
  // We implicitly assume there can be no concurrent nor nested calls to handle_sampling_signal because
273
336
  // a) we get triggered using SIGPROF, and the docs state a second SIGPROF will not interrupt an existing one
@@ -315,34 +378,23 @@ static void sample_from_postponed_job(DDTRACE_UNUSED void *_unused) {
315
378
  // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
316
379
  if (instance == Qnil) return;
317
380
 
381
+ // @ivoanjo: I'm not sure this can ever happen because `handle_sampling_signal` only enqueues this callback if
382
+ // it's running on the main Ractor, but just in case...
383
+ if (!ddtrace_rb_ractor_main_p()) {
384
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
385
+ }
386
+
318
387
  struct cpu_and_wall_time_worker_state *state;
319
388
  TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
320
389
 
321
390
  // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
322
- VALUE (*function_to_call_safely)(VALUE) = cpu_and_wall_time_collector_sample;
323
- VALUE function_to_call_safely_arg = state->cpu_and_wall_time_collector_instance;
324
- VALUE (*exception_handler_function)(VALUE, VALUE) = handle_sampling_failure;
325
- VALUE exception_handler_function_arg = instance;
326
- rb_rescue2(
327
- function_to_call_safely,
328
- function_to_call_safely_arg,
329
- exception_handler_function,
330
- exception_handler_function_arg,
331
- rb_eException, // rb_eException is the base class of all Ruby exceptions
332
- 0 // Required by API to be the last argument
333
- );
391
+ safely_call(cpu_and_wall_time_collector_sample, state->cpu_and_wall_time_collector_instance, instance);
334
392
  }
335
393
 
336
- static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) {
337
- struct cpu_and_wall_time_worker_state *state;
338
- TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
339
-
340
- state->should_run = false;
341
- state->failure_exception = exception;
342
-
343
- return Qnil;
344
- }
394
+ static VALUE handle_sampling_failure(VALUE self_instance, VALUE exception) { return stop(self_instance, exception); }
345
395
 
396
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
397
+ // It SHOULD NOT be used for other purposes.
346
398
  static VALUE _native_current_sigprof_signal_handler(DDTRACE_UNUSED VALUE self) {
347
399
  struct sigaction existing_signal_handler_config = {.sa_sigaction = NULL};
348
400
  if (sigaction(SIGPROF, NULL, &existing_signal_handler_config) != 0) {
@@ -370,6 +422,8 @@ static VALUE release_gvl_and_run_sampling_trigger_loop(VALUE instance) {
370
422
  return Qnil;
371
423
  }
372
424
 
425
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
426
+ // It SHOULD NOT be used for other purposes.
373
427
  static VALUE _native_is_running(DDTRACE_UNUSED VALUE self, VALUE instance) {
374
428
  return \
375
429
  (active_sampler_owner_thread != Qnil && is_thread_alive(active_sampler_owner_thread) && active_sampler_instance == instance) ?
@@ -380,12 +434,136 @@ static void testing_signal_handler(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si
380
434
  /* Does nothing on purpose */
381
435
  }
382
436
 
437
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
438
+ // It SHOULD NOT be used for other purposes.
383
439
  static VALUE _native_install_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
384
440
  install_sigprof_signal_handler(testing_signal_handler);
385
441
  return Qtrue;
386
442
  }
387
443
 
444
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
445
+ // It SHOULD NOT be used for other purposes.
388
446
  static VALUE _native_remove_testing_signal_handler(DDTRACE_UNUSED VALUE self) {
389
447
  remove_sigprof_signal_handler();
390
448
  return Qtrue;
391
449
  }
450
+
451
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
452
+ // It SHOULD NOT be used for other purposes.
453
+ static VALUE _native_trigger_sample(DDTRACE_UNUSED VALUE self) {
454
+ sample_from_postponed_job(NULL);
455
+ return Qtrue;
456
+ }
457
+
458
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
459
+ // It SHOULD NOT be used for other purposes.
460
+ static VALUE _native_gc_tracepoint(DDTRACE_UNUSED VALUE self, VALUE instance) {
461
+ struct cpu_and_wall_time_worker_state *state;
462
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
463
+
464
+ return state->gc_tracepoint;
465
+ }
466
+
467
+ // Implements tracking of cpu-time and wall-time spent doing GC. This function is called by Ruby from the `gc_tracepoint`
468
+ // when the RUBY_INTERNAL_EVENT_GC_ENTER and RUBY_INTERNAL_EVENT_GC_EXIT events are triggered.
469
+ //
470
+ // See the comments on
471
+ // * cpu_and_wall_time_collector_on_gc_start
472
+ // * cpu_and_wall_time_collector_on_gc_finish
473
+ // * cpu_and_wall_time_collector_sample_after_gc
474
+ //
475
+ // For the expected times in which to call them, and their assumptions.
476
+ //
477
+ // Safety: This function gets called while Ruby is doing garbage collection. While Ruby is doing garbage collection,
478
+ // *NO ALLOCATION* is allowed. This function, and any function it calls, must never trigger memory or object allocation.
479
+ // This includes exceptions and use of ruby_xcalloc (because xcalloc can trigger GC)!
480
+ static void on_gc_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
481
+ if (!ddtrace_rb_ractor_main_p()) {
482
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
483
+ }
484
+
485
+ int event = rb_tracearg_event_flag(rb_tracearg_from_tracepoint(tracepoint_data));
486
+ if (event != RUBY_INTERNAL_EVENT_GC_ENTER && event != RUBY_INTERNAL_EVENT_GC_EXIT) return; // Unknown event
487
+
488
+ VALUE instance = active_sampler_instance; // Read from global variable
489
+
490
+ // This should not happen in a normal situation because the tracepoint is always enabled after the instance is set
491
+ // and disabled before it is cleared, but just in case...
492
+ if (instance == Qnil) return;
493
+
494
+ struct cpu_and_wall_time_worker_state *state;
495
+ if (!rb_typeddata_is_kind_of(instance, &cpu_and_wall_time_worker_typed_data)) return;
496
+ // This should never fail if the above check passes
497
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
498
+
499
+ if (event == RUBY_INTERNAL_EVENT_GC_ENTER) {
500
+ cpu_and_wall_time_collector_on_gc_start(state->cpu_and_wall_time_collector_instance);
501
+ } else if (event == RUBY_INTERNAL_EVENT_GC_EXIT) {
502
+ // Design: In an earlier iteration of this feature (see https://github.com/DataDog/dd-trace-rb/pull/2308) we
503
+ // actually had a single method to implement the behavior of both cpu_and_wall_time_collector_on_gc_finish
504
+ // and cpu_and_wall_time_collector_sample_after_gc (the latter is called via after_gc_from_postponed_job).
505
+ //
506
+ // Unfortunately, then we discovered the safety issue around no allocations, and thus decided to separate them -- so that
507
+ // the sampling could run outside the tight safety constraints of the garbage collection process.
508
+ //
509
+ // There is a downside: The sample is now taken very shortly after the GC finishes, and not immediately
510
+ // as the GC finishes, which means the stack captured may be affected by "skid", i.e. point slightly after where
511
+ // it should be pointing at.
512
+ // Alternatives to solve this would be to capture no stack for garbage collection (as we do for Java and .net);
513
+ // making the sampling process allocation-safe (very hard); or separate stack sampling from sample recording,
514
+ // e.g. enabling us to capture the stack in cpu_and_wall_time_collector_on_gc_finish and do the rest later
515
+ // (medium hard).
516
+
517
+ cpu_and_wall_time_collector_on_gc_finish(state->cpu_and_wall_time_collector_instance);
518
+ // We use rb_postponed_job_register_one to ask Ruby to run cpu_and_wall_time_collector_sample_after_gc after it
519
+ // fully finishes the garbage collection, so that one is allowed to do allocations and throw exceptions as usual.
520
+ rb_postponed_job_register_one(0, after_gc_from_postponed_job, NULL);
521
+ }
522
+ }
523
+
524
+ static void after_gc_from_postponed_job(DDTRACE_UNUSED void *_unused) {
525
+ VALUE instance = active_sampler_instance; // Read from global variable
526
+
527
+ // This can potentially happen if the CpuAndWallTimeWorker was stopped while the postponed job was waiting to be executed; nothing to do
528
+ if (instance == Qnil) return;
529
+
530
+ // @ivoanjo: I'm not sure this can ever happen because `on_gc_event` only enqueues this callback if
531
+ // it's running on the main Ractor, but just in case...
532
+ if (!ddtrace_rb_ractor_main_p()) {
533
+ return; // We're not on the main Ractor; we currently don't support profiling non-main Ractors
534
+ }
535
+
536
+ struct cpu_and_wall_time_worker_state *state;
537
+ TypedData_Get_Struct(instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
538
+
539
+ // Trigger sampling using the Collectors::CpuAndWallTime; rescue against any exceptions that happen during sampling
540
+ safely_call(cpu_and_wall_time_collector_sample_after_gc, state->cpu_and_wall_time_collector_instance, instance);
541
+ }
542
+
543
+ // Equivalent to Ruby begin/rescue call, where we call a C function and jump to the exception handler if an
544
+ // exception gets raised within
545
+ static void safely_call(VALUE (*function_to_call_safely)(VALUE), VALUE function_to_call_safely_arg, VALUE instance) {
546
+ VALUE exception_handler_function_arg = instance;
547
+ rb_rescue2(
548
+ function_to_call_safely,
549
+ function_to_call_safely_arg,
550
+ handle_sampling_failure,
551
+ exception_handler_function_arg,
552
+ rb_eException, // rb_eException is the base class of all Ruby exceptions
553
+ 0 // Required by API to be the last argument
554
+ );
555
+ }
556
+
557
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
558
+ // It SHOULD NOT be used for other purposes.
559
+ static VALUE _native_simulate_handle_sampling_signal(DDTRACE_UNUSED VALUE self) {
560
+ handle_sampling_signal(0, NULL, NULL);
561
+ return Qtrue;
562
+ }
563
+
564
+ // This method exists only to enable testing Datadog::Profiling::Collectors::CpuAndWallTimeWorker behavior using RSpec.
565
+ // It SHOULD NOT be used for other purposes.
566
+ static VALUE _native_simulate_sample_from_postponed_job(DDTRACE_UNUSED VALUE self) {
567
+ sample_from_postponed_job(NULL);
568
+ return Qtrue;
569
+ }