datadog 2.3.0 → 2.4.0

Files changed (129)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +37 -1
  3. data/ext/datadog_profiling_loader/datadog_profiling_loader.c +9 -1
  4. data/ext/datadog_profiling_loader/extconf.rb +10 -22
  5. data/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +148 -30
  6. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +4 -2
  7. data/ext/datadog_profiling_native_extension/collectors_stack.c +89 -46
  8. data/ext/datadog_profiling_native_extension/collectors_thread_context.c +580 -29
  9. data/ext/datadog_profiling_native_extension/collectors_thread_context.h +9 -1
  10. data/ext/datadog_profiling_native_extension/datadog_ruby_common.c +0 -27
  11. data/ext/datadog_profiling_native_extension/datadog_ruby_common.h +0 -4
  12. data/ext/datadog_profiling_native_extension/extconf.rb +38 -21
  13. data/ext/datadog_profiling_native_extension/gvl_profiling_helper.c +50 -0
  14. data/ext/datadog_profiling_native_extension/gvl_profiling_helper.h +75 -0
  15. data/ext/datadog_profiling_native_extension/heap_recorder.c +20 -6
  16. data/ext/datadog_profiling_native_extension/http_transport.c +38 -6
  17. data/ext/datadog_profiling_native_extension/private_vm_api_access.c +52 -1
  18. data/ext/datadog_profiling_native_extension/private_vm_api_access.h +3 -0
  19. data/ext/datadog_profiling_native_extension/profiling.c +1 -1
  20. data/ext/datadog_profiling_native_extension/stack_recorder.h +1 -0
  21. data/ext/libdatadog_api/crashtracker.c +20 -18
  22. data/ext/libdatadog_api/datadog_ruby_common.c +0 -27
  23. data/ext/libdatadog_api/datadog_ruby_common.h +0 -4
  24. data/ext/libdatadog_extconf_helpers.rb +1 -1
  25. data/lib/datadog/appsec/assets/waf_rules/recommended.json +2184 -108
  26. data/lib/datadog/appsec/assets/waf_rules/strict.json +1430 -2
  27. data/lib/datadog/appsec/component.rb +29 -8
  28. data/lib/datadog/appsec/configuration/settings.rb +2 -2
  29. data/lib/datadog/appsec/contrib/devise/patcher/authenticatable_patch.rb +1 -0
  30. data/lib/datadog/appsec/contrib/devise/patcher/rememberable_patch.rb +21 -0
  31. data/lib/datadog/appsec/contrib/devise/patcher.rb +12 -2
  32. data/lib/datadog/appsec/contrib/graphql/appsec_trace.rb +0 -14
  33. data/lib/datadog/appsec/contrib/graphql/gateway/multiplex.rb +67 -31
  34. data/lib/datadog/appsec/contrib/graphql/gateway/watcher.rb +18 -15
  35. data/lib/datadog/appsec/contrib/graphql/integration.rb +14 -1
  36. data/lib/datadog/appsec/contrib/rack/gateway/request.rb +2 -5
  37. data/lib/datadog/appsec/event.rb +1 -1
  38. data/lib/datadog/appsec/processor/rule_loader.rb +3 -1
  39. data/lib/datadog/appsec/processor/rule_merger.rb +33 -15
  40. data/lib/datadog/appsec/processor.rb +36 -37
  41. data/lib/datadog/appsec/rate_limiter.rb +25 -40
  42. data/lib/datadog/appsec/remote.rb +7 -3
  43. data/lib/datadog/appsec.rb +2 -2
  44. data/lib/datadog/core/configuration/components.rb +4 -3
  45. data/lib/datadog/core/configuration/settings.rb +84 -5
  46. data/lib/datadog/core/crashtracking/component.rb +1 -1
  47. data/lib/datadog/core/environment/execution.rb +5 -5
  48. data/lib/datadog/core/metrics/client.rb +7 -0
  49. data/lib/datadog/core/rate_limiter.rb +183 -0
  50. data/lib/datadog/core/remote/client/capabilities.rb +4 -3
  51. data/lib/datadog/core/remote/component.rb +4 -2
  52. data/lib/datadog/core/remote/negotiation.rb +4 -4
  53. data/lib/datadog/core/remote/tie.rb +2 -0
  54. data/lib/datadog/core/runtime/metrics.rb +1 -1
  55. data/lib/datadog/core/telemetry/component.rb +2 -0
  56. data/lib/datadog/core/telemetry/event.rb +12 -7
  57. data/lib/datadog/core/telemetry/logger.rb +51 -0
  58. data/lib/datadog/core/telemetry/logging.rb +50 -14
  59. data/lib/datadog/core/telemetry/request.rb +13 -1
  60. data/lib/datadog/core/utils/time.rb +12 -0
  61. data/lib/datadog/di/code_tracker.rb +168 -0
  62. data/lib/datadog/di/configuration/settings.rb +163 -0
  63. data/lib/datadog/di/configuration.rb +11 -0
  64. data/lib/datadog/di/error.rb +31 -0
  65. data/lib/datadog/di/extensions.rb +16 -0
  66. data/lib/datadog/di/probe.rb +133 -0
  67. data/lib/datadog/di/probe_builder.rb +41 -0
  68. data/lib/datadog/di/redactor.rb +188 -0
  69. data/lib/datadog/di/serializer.rb +193 -0
  70. data/lib/datadog/di.rb +14 -0
  71. data/lib/datadog/opentelemetry/sdk/propagator.rb +2 -0
  72. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +12 -10
  73. data/lib/datadog/profiling/collectors/info.rb +12 -3
  74. data/lib/datadog/profiling/collectors/thread_context.rb +26 -0
  75. data/lib/datadog/profiling/component.rb +20 -4
  76. data/lib/datadog/profiling/http_transport.rb +6 -1
  77. data/lib/datadog/profiling/scheduler.rb +2 -0
  78. data/lib/datadog/profiling/stack_recorder.rb +3 -0
  79. data/lib/datadog/single_step_instrument.rb +12 -0
  80. data/lib/datadog/tracing/contrib/action_cable/instrumentation.rb +8 -12
  81. data/lib/datadog/tracing/contrib/action_pack/action_controller/instrumentation.rb +5 -0
  82. data/lib/datadog/tracing/contrib/action_pack/action_dispatch/instrumentation.rb +78 -0
  83. data/lib/datadog/tracing/contrib/action_pack/action_dispatch/patcher.rb +33 -0
  84. data/lib/datadog/tracing/contrib/action_pack/patcher.rb +2 -0
  85. data/lib/datadog/tracing/contrib/active_record/configuration/resolver.rb +4 -0
  86. data/lib/datadog/tracing/contrib/active_record/events/instantiation.rb +3 -1
  87. data/lib/datadog/tracing/contrib/active_record/events/sql.rb +3 -1
  88. data/lib/datadog/tracing/contrib/active_support/cache/events/cache.rb +5 -1
  89. data/lib/datadog/tracing/contrib/aws/instrumentation.rb +5 -0
  90. data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +6 -1
  91. data/lib/datadog/tracing/contrib/faraday/middleware.rb +9 -0
  92. data/lib/datadog/tracing/contrib/grape/endpoint.rb +19 -0
  93. data/lib/datadog/tracing/contrib/graphql/patcher.rb +9 -12
  94. data/lib/datadog/tracing/contrib/graphql/trace_patcher.rb +3 -3
  95. data/lib/datadog/tracing/contrib/graphql/tracing_patcher.rb +3 -3
  96. data/lib/datadog/tracing/contrib/graphql/unified_trace.rb +13 -9
  97. data/lib/datadog/tracing/contrib/graphql/unified_trace_patcher.rb +6 -3
  98. data/lib/datadog/tracing/contrib/http/instrumentation.rb +18 -15
  99. data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +6 -5
  100. data/lib/datadog/tracing/contrib/httpclient/patcher.rb +1 -14
  101. data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +5 -0
  102. data/lib/datadog/tracing/contrib/httprb/patcher.rb +1 -14
  103. data/lib/datadog/tracing/contrib/lograge/patcher.rb +1 -2
  104. data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +2 -0
  105. data/lib/datadog/tracing/contrib/opensearch/patcher.rb +13 -6
  106. data/lib/datadog/tracing/contrib/patcher.rb +2 -1
  107. data/lib/datadog/tracing/contrib/presto/patcher.rb +1 -13
  108. data/lib/datadog/tracing/contrib/rack/middlewares.rb +27 -0
  109. data/lib/datadog/tracing/contrib/redis/tags.rb +4 -0
  110. data/lib/datadog/tracing/contrib/sinatra/tracer.rb +4 -0
  111. data/lib/datadog/tracing/contrib/stripe/request.rb +3 -2
  112. data/lib/datadog/tracing/distributed/propagation.rb +7 -0
  113. data/lib/datadog/tracing/metadata/ext.rb +2 -0
  114. data/lib/datadog/tracing/remote.rb +5 -2
  115. data/lib/datadog/tracing/sampling/matcher.rb +6 -1
  116. data/lib/datadog/tracing/sampling/rate_sampler.rb +1 -1
  117. data/lib/datadog/tracing/sampling/rule.rb +2 -0
  118. data/lib/datadog/tracing/sampling/rule_sampler.rb +9 -5
  119. data/lib/datadog/tracing/sampling/span/ext.rb +1 -1
  120. data/lib/datadog/tracing/sampling/span/rule.rb +2 -2
  121. data/lib/datadog/tracing/trace_operation.rb +26 -2
  122. data/lib/datadog/tracing/tracer.rb +14 -12
  123. data/lib/datadog/tracing/transport/http/client.rb +1 -0
  124. data/lib/datadog/tracing/transport/io/client.rb +1 -0
  125. data/lib/datadog/tracing/workers/trace_writer.rb +1 -1
  126. data/lib/datadog/tracing/workers.rb +1 -1
  127. data/lib/datadog/version.rb +1 -1
  128. metadata +25 -8
  129. data/lib/datadog/tracing/sampling/rate_limiter.rb +0 -185
data/ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -76,6 +76,11 @@
  #define MISSING_TRACER_CONTEXT_KEY 0
  #define TIME_BETWEEN_GC_EVENTS_NS MILLIS_AS_NS(10)

+ // This is used as a placeholder to mark threads that are allowed to be profiled (enabled)
+ // (e.g. to avoid trying to gvl profile threads that are not from the main Ractor)
+ // and for which there's no data yet
+ #define GVL_WAITING_ENABLED_EMPTY RUBY_FIXNUM_MAX
+
  static ID at_active_span_id; // id of :@active_span in Ruby
  static ID at_active_trace_id; // id of :@active_trace in Ruby
  static ID at_id_id; // id of :@id in Ruby
@@ -86,6 +91,26 @@ static ID at_otel_values_id; // id of :@otel_values in Ruby
  static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby
  static ID at_datadog_trace_id; // id of :@datadog_trace in Ruby

+ // Used to support reading trace identifiers from the opentelemetry Ruby library when the ddtrace gem tracing
+ // integration is NOT in use.
+ static ID at_span_id_id; // id of :@span_id in Ruby
+ static ID at_trace_id_id; // id of :@trace_id in Ruby
+ static ID at_entries_id; // id of :@entries in Ruby
+ static ID at_context_id; // id of :@context in Ruby
+ static ID at_kind_id; // id of :@kind in Ruby
+ static ID at_name_id; // id of :@name in Ruby
+ static ID server_id; // id of :server in Ruby
+ static ID otel_context_storage_id; // id of :__opentelemetry_context_storage__ in Ruby
+
+ // This is used by `thread_context_collector_on_gvl_running`. Because when that method gets called we're not sure if
+ // it's safe to access the state of the thread context collector, we store this setting as a global value. This does
+ // mean this setting is shared among all thread context collectors, and thus it's "last writer wins".
+ // In production this should not be a problem: there should only be one profiler, which is the last one created,
+ // and that'll be the one that last wrote this setting.
+ static uint32_t global_waiting_for_gvl_threshold_ns = MILLIS_AS_NS(10);
+
+ enum otel_context_enabled {otel_context_enabled_false, otel_context_enabled_only, otel_context_enabled_both};
+
  // Contains state for a single ThreadContext instance
  struct thread_context_collector_state {
  // Note: Places in this file that usually need to be changed when this struct is changed are tagged with
@@ -112,6 +137,8 @@ struct thread_context_collector_state {
  bool endpoint_collection_enabled;
  // Used to omit timestamps / timeline events from collected data
  bool timeline_enabled;
+ // Used to control context collection
+ enum otel_context_enabled otel_context_enabled;
  // Used to omit class information from collected allocation data
  bool allocation_type_enabled;
  // Used when calling monotonic_to_system_epoch_ns
@@ -119,6 +146,8 @@ struct thread_context_collector_state {
  // Used to identify the main thread, to give it a fallback name
  VALUE main_thread;
  // Used when extracting trace identifiers from otel spans. Lazily initialized.
+ // Qtrue serves as a marker we've not yet extracted it; when we try to extract it, we set it to an object if
+ // successful and Qnil if not.
  VALUE otel_current_span_key;

  struct stats {
@@ -164,6 +193,12 @@ struct trace_identifiers {
  VALUE trace_endpoint;
  };

+ struct otel_span {
+ VALUE span;
+ VALUE span_id;
+ VALUE trace_id;
+ };
+
  static void thread_context_collector_typed_data_mark(void *state_ptr);
  static void thread_context_collector_typed_data_free(void *state_ptr);
  static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _value, st_data_t _argument);
@@ -177,13 +212,15 @@ static VALUE _native_initialize(
  VALUE tracer_context_key,
  VALUE endpoint_collection_enabled,
  VALUE timeline_enabled,
+ VALUE waiting_for_gvl_threshold_ns,
+ VALUE otel_context_enabled,
  VALUE allocation_type_enabled
  );
  static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
  static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
  static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
- static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
- void update_metrics_and_sample(
+ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state);
+ static void update_metrics_and_sample(
  struct thread_context_collector_state *state,
  VALUE thread_being_sampled,
  VALUE stack_from_thread,
@@ -201,7 +238,8 @@ static void trigger_sample_for_thread(
  sample_values values,
  long current_monotonic_wall_time_ns,
  ddog_CharSlice *ruby_vm_type,
- ddog_CharSlice *class_name
+ ddog_CharSlice *class_name,
+ bool is_gvl_waiting_state
  );
  static VALUE _native_thread_list(VALUE self);
  static struct per_thread_context *get_or_create_context_for(VALUE thread, struct thread_context_collector_state *state);
@@ -237,6 +275,26 @@ static void ddtrace_otel_trace_identifiers_for(
  VALUE otel_values
  );
  static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples);
+ static bool handle_gvl_waiting(
+ struct thread_context_collector_state *state,
+ VALUE thread_being_sampled,
+ VALUE stack_from_thread,
+ struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
+ long current_cpu_time_ns
+ );
+ static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread);
+ static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread);
+ static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread);
+ static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread);
+ static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns);
+ static void otel_without_ddtrace_trace_identifiers_for(
+ struct thread_context_collector_state *state,
+ VALUE thread,
+ struct trace_identifiers *trace_identifiers_result
+ );
+ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
+ static uint64_t otel_span_id_to_uint(VALUE otel_span_id);

  void collectors_thread_context_init(VALUE profiling_module) {
  VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
@@ -254,20 +312,27 @@ void collectors_thread_context_init(VALUE profiling_module) {
  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
  rb_define_alloc_func(collectors_thread_context_class, _native_new);

- rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, 7);
+ rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, 9);
  rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1);
  rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
  rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
  rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3);
  rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
  rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
- rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
+ rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2);
  rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0);
  rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
  rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);
  rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1);
  rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0);
  rb_define_singleton_method(testing_module, "_native_sample_skipped_allocation_samples", _native_sample_skipped_allocation_samples, 2);
+ #ifndef NO_GVL_INSTRUMENTATION
+ rb_define_singleton_method(testing_module, "_native_on_gvl_waiting", _native_on_gvl_waiting, 1);
+ rb_define_singleton_method(testing_module, "_native_gvl_waiting_at_for", _native_gvl_waiting_at_for, 1);
+ rb_define_singleton_method(testing_module, "_native_on_gvl_running", _native_on_gvl_running, 1);
+ rb_define_singleton_method(testing_module, "_native_sample_after_gvl_running", _native_sample_after_gvl_running, 2);
+ rb_define_singleton_method(testing_module, "_native_apply_delta_to_cpu_time_at_previous_sample_ns", _native_apply_delta_to_cpu_time_at_previous_sample_ns, 3);
+ #endif

  at_active_span_id = rb_intern_const("@active_span");
  at_active_trace_id = rb_intern_const("@active_trace");
@@ -278,6 +343,19 @@ void collectors_thread_context_init(VALUE profiling_module) {
  at_otel_values_id = rb_intern_const("@otel_values");
  at_parent_span_id_id = rb_intern_const("@parent_span_id");
  at_datadog_trace_id = rb_intern_const("@datadog_trace");
+ at_span_id_id = rb_intern_const("@span_id");
+ at_trace_id_id = rb_intern_const("@trace_id");
+ at_entries_id = rb_intern_const("@entries");
+ at_context_id = rb_intern_const("@context");
+ at_kind_id = rb_intern_const("@kind");
+ at_name_id = rb_intern_const("@name");
+ server_id = rb_intern_const("server");
+ otel_context_storage_id = rb_intern_const("__opentelemetry_context_storage__");
+
+ #ifndef NO_GVL_INSTRUMENTATION
+ // This will raise if Ruby already ran out of thread-local keys
+ gvl_profiling_init();
+ #endif

  gc_profiling_init();
  }
@@ -357,11 +435,12 @@ static VALUE _native_new(VALUE klass) {
  state->thread_list_buffer = thread_list_buffer;
  state->endpoint_collection_enabled = true;
  state->timeline_enabled = true;
+ state->otel_context_enabled = otel_context_enabled_false;
  state->allocation_type_enabled = true;
  state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
  VALUE main_thread = rb_thread_main();
  state->main_thread = main_thread;
- state->otel_current_span_key = Qnil;
+ state->otel_current_span_key = Qtrue;
  state->gc_tracking.wall_time_at_previous_gc_ns = INVALID_TIME;
  state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = 0;

@@ -377,6 +456,7 @@ static VALUE _native_new(VALUE klass) {
  return instance;
  }

+ // TODO: Convert this to use options like CpuAndWallTimeWorker
  static VALUE _native_initialize(
  DDTRACE_UNUSED VALUE _self,
  VALUE collector_instance,
@@ -385,10 +465,13 @@ static VALUE _native_initialize(
  VALUE tracer_context_key,
  VALUE endpoint_collection_enabled,
  VALUE timeline_enabled,
+ VALUE waiting_for_gvl_threshold_ns,
+ VALUE otel_context_enabled,
  VALUE allocation_type_enabled
  ) {
  ENFORCE_BOOLEAN(endpoint_collection_enabled);
  ENFORCE_BOOLEAN(timeline_enabled);
+ ENFORCE_TYPE(waiting_for_gvl_threshold_ns, T_FIXNUM);
  ENFORCE_BOOLEAN(allocation_type_enabled);

  struct thread_context_collector_state *state;
@@ -401,8 +484,19 @@
  state->recorder_instance = enforce_recorder_instance(recorder_instance);
  state->endpoint_collection_enabled = (endpoint_collection_enabled == Qtrue);
  state->timeline_enabled = (timeline_enabled == Qtrue);
+ if (otel_context_enabled == Qfalse || otel_context_enabled == Qnil) {
+ state->otel_context_enabled = otel_context_enabled_false;
+ } else if (otel_context_enabled == ID2SYM(rb_intern("only"))) {
+ state->otel_context_enabled = otel_context_enabled_only;
+ } else if (otel_context_enabled == ID2SYM(rb_intern("both"))) {
+ state->otel_context_enabled = otel_context_enabled_both;
+ } else {
+ rb_raise(rb_eArgError, "Unexpected value for otel_context_enabled: %+" PRIsVALUE, otel_context_enabled);
+ }
  state->allocation_type_enabled = (allocation_type_enabled == Qtrue);

+ global_waiting_for_gvl_threshold_ns = NUM2UINT(waiting_for_gvl_threshold_ns);
+
  if (RTEST(tracer_context_key)) {
  ENFORCE_TYPE(tracer_context_key, T_SYMBOL);
  // Note about rb_to_id and dynamic symbols: calling `rb_to_id` prevents symbols from ever being garbage collected.
@@ -433,13 +527,22 @@ static VALUE _native_on_gc_start(DDTRACE_UNUSED VALUE self, VALUE collector_inst
  // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
  static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
- thread_context_collector_on_gc_finish(collector_instance);
+ (void) !thread_context_collector_on_gc_finish(collector_instance);
  return Qtrue;
  }

  // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
- static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
+ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) {
+ ENFORCE_BOOLEAN(reset_monotonic_to_system_state);
+
+ struct thread_context_collector_state *state;
+ TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+ if (reset_monotonic_to_system_state == Qtrue) {
+ state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
+ }
+
  thread_context_collector_sample_after_gc(collector_instance);
  return Qtrue;
  }
@@ -502,7 +605,7 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic
  );
  }

- void update_metrics_and_sample(
+ static void update_metrics_and_sample(
  struct thread_context_collector_state *state,
  VALUE thread_being_sampled,
  VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
@@ -511,12 +614,17 @@ void update_metrics_and_sample(
  long current_cpu_time_ns,
  long current_monotonic_wall_time_ns
  ) {
- long cpu_time_elapsed_ns = update_time_since_previous_sample(
+ bool is_gvl_waiting_state =
+ handle_gvl_waiting(state, thread_being_sampled, stack_from_thread, thread_context, sampling_buffer, current_cpu_time_ns);
+
+ // Don't assign/update cpu during "Waiting for GVL"
+ long cpu_time_elapsed_ns = is_gvl_waiting_state ? 0 : update_time_since_previous_sample(
  &thread_context->cpu_time_at_previous_sample_ns,
  current_cpu_time_ns,
  thread_context->gc_tracking.cpu_time_at_start_ns,
  IS_NOT_WALL_TIME
  );
+
  long wall_time_elapsed_ns = update_time_since_previous_sample(
  &thread_context->wall_time_at_previous_sample_ns,
  current_monotonic_wall_time_ns,
@@ -528,6 +636,21 @@
  IS_WALL_TIME
  );

+ // A thread enters "Waiting for GVL", well, as the name implies, without the GVL.
+ //
+ // As a consequence, it's possible that a thread enters "Waiting for GVL" in parallel with the current thread working
+ // on sampling, and thus for the `current_monotonic_wall_time_ns` (which is recorded at the start of sampling)
+ // to be < the time at which we started Waiting for GVL.
+ //
+ // All together, this means that when `handle_gvl_waiting` creates an extra sample (see comments on that function for
+ // what the extra sample is), it's possible that there's no more wall-time to be assigned.
+ // Thus, in this case, we don't want to produce a sample representing Waiting for GVL with a wall-time of 0, and
+ // thus we skip creating such a sample.
+ if (is_gvl_waiting_state && wall_time_elapsed_ns == 0) return;
+ // ...you may also wonder: is there any other situation where it makes sense to produce a sample with
+ // wall_time_elapsed_ns == 0? I believe that yes, because the sample still includes a timestamp and a stack, but we
+ // may revisit/change our minds on this in the future.
+
  trigger_sample_for_thread(
  state,
  thread_being_sampled,
@@ -537,7 +660,8 @@
  (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns},
  current_monotonic_wall_time_ns,
  NULL,
- NULL
+ NULL,
+ is_gvl_waiting_state
  );
  }

@@ -583,6 +707,7 @@ void thread_context_collector_on_gc_start(VALUE self_instance) {
  //
  // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
  // Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
+ __attribute__((warn_unused_result))
  bool thread_context_collector_on_gc_finish(VALUE self_instance) {
  struct thread_context_collector_state *state;
  if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return false;
@@ -718,7 +843,8 @@ static void trigger_sample_for_thread(
  long current_monotonic_wall_time_ns,
  // These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point?
  ddog_CharSlice *ruby_vm_type,
- ddog_CharSlice *class_name
+ ddog_CharSlice *class_name,
+ bool is_gvl_waiting_state
  ) {
  int max_label_count =
  1 + // thread id
@@ -759,6 +885,11 @@
  struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
  trace_identifiers_for(state, thread, &trace_identifiers_result);

+ if (!trace_identifiers_result.valid && state->otel_context_enabled != otel_context_enabled_false) {
+ // If we couldn't get something with ddtrace, let's see if we can get some trace identifiers from opentelemetry directly
+ otel_without_ddtrace_trace_identifiers_for(state, thread, &trace_identifiers_result);
+ }
+
  if (trace_identifiers_result.valid) {
  labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("local root span id"), .num = trace_identifiers_result.local_root_span_id};
  labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("span id"), .num = trace_identifiers_result.span_id};
@@ -837,7 +968,12 @@
  sampling_buffer,
  state->recorder_instance,
  values,
- (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns}
+ (sample_labels) {
+ .labels = slice_labels,
+ .state_label = state_label,
+ .end_timestamp_ns = end_timestamp_ns,
+ .is_gvl_waiting_state = is_gvl_waiting_state,
+ }
  );
  }

@@ -887,9 +1023,9 @@ static struct per_thread_context *get_context_for(VALUE thread, struct thread_co
  // to either run Ruby code during sampling (not great), or otherwise use some of the VM private APIs to detect this.
  //
  static bool is_logging_gem_monkey_patch(VALUE invoke_file_location) {
- int logging_gem_path_len = strlen(LOGGING_GEM_PATH);
+ unsigned long logging_gem_path_len = strlen(LOGGING_GEM_PATH);
  char *invoke_file = StringValueCStr(invoke_file_location);
- int invoke_file_len = strlen(invoke_file);
+ unsigned long invoke_file_len = strlen(invoke_file);

  if (invoke_file_len < logging_gem_path_len) return false;

@@ -937,6 +1073,20 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c
  // These will only be used during a GC operation
  thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME;
  thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME;
+
+ #ifndef NO_GVL_INSTRUMENTATION
+ // We use this special location to store data that can be accessed without any
+ // kind of synchronization (e.g. by threads without the GVL).
+ //
+ // We set this marker here for two purposes:
+ // * To make sure there's no stale data from a previous execution of the profiler.
+ // * To mark threads that are actually being profiled
+ //
+ // (Setting this is potentially a race, but what we want is to avoid _stale_ data, so
+ // if this gets set concurrently with context initialization, then such a value will belong
+ // to the current profiler instance, so that's OK)
+ gvl_profiling_state_thread_object_set(thread, GVL_WAITING_ENABLED_EMPTY);
+ #endif
  }

  static void free_context(struct per_thread_context* thread_context) {
@@ -960,6 +1110,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc
  rb_str_concat(result, rb_sprintf(" stats=%"PRIsVALUE, stats_as_ruby_hash(state)));
  rb_str_concat(result, rb_sprintf(" endpoint_collection_enabled=%"PRIsVALUE, state->endpoint_collection_enabled ? Qtrue : Qfalse));
  rb_str_concat(result, rb_sprintf(" timeline_enabled=%"PRIsVALUE, state->timeline_enabled ? Qtrue : Qfalse));
+ rb_str_concat(result, rb_sprintf(" otel_context_enabled=%d", state->otel_context_enabled));
  rb_str_concat(result, rb_sprintf(" allocation_type_enabled=%"PRIsVALUE, state->allocation_type_enabled ? Qtrue : Qfalse));
  rb_str_concat(result, rb_sprintf(
  " time_converter_state={.system_epoch_ns_reference=%ld, .delta_to_epoch_ns=%ld}",
@@ -969,6 +1120,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc
  rb_str_concat(result, rb_sprintf(" main_thread=%"PRIsVALUE, state->main_thread));
  rb_str_concat(result, rb_sprintf(" gc_tracking=%"PRIsVALUE, gc_tracking_as_ruby_hash(state)));
  rb_str_concat(result, rb_sprintf(" otel_current_span_key=%"PRIsVALUE, state->otel_current_span_key));
+ rb_str_concat(result, rb_sprintf(" global_waiting_for_gvl_threshold_ns=%u", global_waiting_for_gvl_threshold_ns));

  return result;
  }
@@ -996,6 +1148,10 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value

  ID2SYM(rb_intern("gc_tracking.cpu_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_start_ns),
  ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns),
+
+ #ifndef NO_GVL_INSTRUMENTATION
+ ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM(gvl_profiling_state_thread_object_get(thread)),
+ #endif
  };
  for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]);

@@ -1146,6 +1302,7 @@ static VALUE _native_gc_tracking(DDTRACE_UNUSED VALUE _self, VALUE collector_ins

  // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
  static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result) {
+ if (state->otel_context_enabled == otel_context_enabled_only) return;
  if (state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY) return;

  VALUE current_context = rb_thread_local_aref(thread, state->tracer_context_key);
@@ -1200,7 +1357,7 @@ static bool should_collect_resource(VALUE root_span) {
  if (root_span_type == Qnil) return false;
  ENFORCE_TYPE(root_span_type, T_STRING);

- int root_span_type_length = RSTRING_LEN(root_span_type);
+ long root_span_type_length = RSTRING_LEN(root_span_type);
  const char *root_span_type_value = StringValuePtr(root_span_type);

  bool is_web_request =
@@ -1223,6 +1380,9 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector
  struct thread_context_collector_state *state;
  TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

+ // Release all context memory before clearing the existing context
+ st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */);
+
  st_clear(state->hash_map_per_thread_context);

  state->stats = (struct stats) {}; // Resets all stats back to zero
@@ -1326,7 +1486,8 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in
  (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true},
  INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions
  &ruby_vm_type,
- optional_class_name
+ optional_class_name,
+ false
  );
  }

@@ -1372,25 +1533,29 @@ static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) {
  }
  }

+ // Used to access OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY). Will raise exceptions if it fails.
+ static VALUE read_otel_current_span_key_const(DDTRACE_UNUSED VALUE _unused) {
+ VALUE opentelemetry_module = rb_const_get(rb_cObject, rb_intern("OpenTelemetry"));
+ ENFORCE_TYPE(opentelemetry_module, T_MODULE);
+ VALUE trace_module = rb_const_get(opentelemetry_module, rb_intern("Trace"));
+ ENFORCE_TYPE(trace_module, T_MODULE);
+ return rb_const_get(trace_module, rb_intern("CURRENT_SPAN_KEY"));
+ }
+
  static VALUE get_otel_current_span_key(struct thread_context_collector_state *state) {
- if (state->otel_current_span_key == Qnil) {
- VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog"));
- VALUE opentelemetry_module = rb_const_get(datadog_module, rb_intern("OpenTelemetry"));
- VALUE api_module = rb_const_get(opentelemetry_module, rb_intern("API"));
- VALUE context_module = rb_const_get(api_module, rb_intern_const("Context"));
- VALUE current_span_key = rb_const_get(context_module, rb_intern_const("CURRENT_SPAN_KEY"));
-
- if (current_span_key == Qnil) {
- rb_raise(rb_eRuntimeError, "Unexpected: Missing Datadog::OpenTelemetry::API::Context::CURRENT_SPAN_KEY");
- }
+ if (state->otel_current_span_key == Qtrue) { // Qtrue means we haven't tried to extract it yet
+ // If this fails, we want to fail gracefully, rather than raise an exception (e.g. if the opentelemetry gem
+ // gets refactored, we should not fall on our face)
+ VALUE span_key = rb_protect(read_otel_current_span_key_const, Qnil, NULL);

- state->otel_current_span_key = current_span_key;
+ // Note that this gets set to Qnil if we failed to extract the correct value, and thus we won't try to extract it again
+ state->otel_current_span_key = span_key;
  }

  return state->otel_current_span_key;
  }

- // This method gets used when ddtrace is being used indirectly via the otel APIs. Information gets stored slightly
+ // This method gets used when ddtrace is being used indirectly via the opentelemetry APIs. Information gets stored slightly
  // differently, and this codepath handles it.
  static void ddtrace_otel_trace_identifiers_for(
  struct thread_context_collector_state *state,
@@ -1410,6 +1575,7 @@ static void ddtrace_otel_trace_identifiers_for(
  if (resolved_numeric_span_id == Qnil) return;

  VALUE otel_current_span_key = get_otel_current_span_key(state);
+ if (otel_current_span_key == Qnil) return;
  VALUE current_trace = *active_trace;

  // ddtrace uses a different structure when spans are created from otel, where each otel span will have a unique ddtrace
@@ -1462,3 +1628,388 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
  thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples));
  return Qtrue;
  }
+
+ // This method differs from trace_identifiers_for/ddtrace_otel_trace_identifiers_for to support the situation where
+ // the opentelemetry ruby library is being used for tracing AND the ddtrace tracing bits are not involved at all.
+ //
+ // Thus, in this case, we're directly reading from the opentelemetry stuff, which is different to how ddtrace tracing
+ // does it.
+ //
+ // This is somewhat brittle: we're coupling on internal details of the opentelemetry gem to get what we need. In the
+ // future maybe the otel ruby folks would be open to having a nice public way of getting this data that suits the
+ // usecase of profilers.
+ // Until then, the strategy below is to be extremely defensive, and if anything is out of place, we immediately return
+ // and give up on getting trace data from opentelemetry. (Thus, worst case would be -- you upgrade opentelemetry and
+ // profiling features relying on reading this data stop working, but you'll still get profiles and the app will be
+ // otherwise undisturbed).
+ //
+ // Specifically, the way this works is:
+ // 1. The latest entry in the opentelemetry context storage represents the current span (if any). We take the span id
+ // and trace id from this span.
+ // 2. To find the local root span id, we walk the context storage backwards from the current span, and find the earliest
+ // entry in the context storage that has the same trace id as the current span; we use the found span as the local
+ // root span id.
+ // This matches the semantics of how ddtrace tracing creates a TraceOperation and assigns a local root span to it.
+ static void otel_without_ddtrace_trace_identifiers_for(
+ struct thread_context_collector_state *state,
+ VALUE thread,
+ struct trace_identifiers *trace_identifiers_result
+ ) {
+ VALUE context_storage = rb_thread_local_aref(thread, otel_context_storage_id /* __opentelemetry_context_storage__ */);
+
+ // If it exists, context_storage is expected to be an Array[OpenTelemetry::Context]
+ if (context_storage == Qnil || !RB_TYPE_P(context_storage, T_ARRAY)) return;
+
+ VALUE otel_current_span_key = get_otel_current_span_key(state);
+ if (otel_current_span_key == Qnil) return;
+
+ int active_context_index = RARRAY_LEN(context_storage) - 1;
+ if (active_context_index < 0) return;
+
+ struct otel_span active_span = otel_span_from(rb_ary_entry(context_storage, active_context_index), otel_current_span_key);
+ if (active_span.span == Qnil) return;
+
+ struct otel_span local_root_span = active_span;
+
+ // Now find the oldest span starting from the active span that still has the same trace id as the active span
+ for (int i = active_context_index - 1; i >= 0; i--) {
+ struct otel_span checking_span = otel_span_from(rb_ary_entry(context_storage, i), otel_current_span_key);
+ if (checking_span.span == Qnil) return;
+
+ if (rb_str_equal(active_span.trace_id, checking_span.trace_id) == Qfalse) break;
+
+ local_root_span = checking_span;
+ }
+
+ // Convert the span ids into uint64_t to match what the Datadog tracer does
+ trace_identifiers_result->span_id = otel_span_id_to_uint(active_span.span_id);
+ trace_identifiers_result->local_root_span_id = otel_span_id_to_uint(local_root_span.span_id);
+
+ if (trace_identifiers_result->span_id == 0 || trace_identifiers_result->local_root_span_id == 0) return;
+
+ trace_identifiers_result->valid = true;
+
+ if (!state->endpoint_collection_enabled) return;
+
+ VALUE root_span_type = rb_ivar_get(local_root_span.span, at_kind_id /* @kind */);
+ // We filter out spans that don't have `kind: :server`
+ if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return;
+
+ VALUE trace_resource = rb_ivar_get(local_root_span.span, at_name_id /* @name */);
+ if (!RB_TYPE_P(trace_resource, T_STRING)) return;
+
+ trace_identifiers_result->trace_endpoint = trace_resource;
+ }
+
+ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key) {
+ struct otel_span failed = {.span = Qnil, .span_id = Qnil, .trace_id = Qnil};
+
+ if (otel_context == Qnil) return failed;
+
+ VALUE context_entries = rb_ivar_get(otel_context, at_entries_id /* @entries */);
+ if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;
+
+ // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
+ VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
+ if (span == Qnil) return failed;
+
+ // If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
+ VALUE span_context = rb_ivar_get(span, at_context_id /* @context */);
+ if (span_context == Qnil) return failed;
+
+ VALUE span_id = rb_ivar_get(span_context, at_span_id_id /* @span_id */);
+ VALUE trace_id = rb_ivar_get(span_context, at_trace_id_id /* @trace_id */);
+ if (span_id == Qnil || trace_id == Qnil || !RB_TYPE_P(span_id, T_STRING) || !RB_TYPE_P(trace_id, T_STRING)) return failed;
+
+ return (struct otel_span) {.span = span, .span_id = span_id, .trace_id = trace_id};
+ }
+
+ // Otel span ids are represented as a big-endian 8-byte string
+ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
+ if (!RB_TYPE_P(otel_span_id, T_STRING) || RSTRING_LEN(otel_span_id) != 8) { return 0; }
+
+ unsigned char *span_bytes = (unsigned char*) StringValuePtr(otel_span_id);
+
+ return \
+ ((uint64_t)span_bytes[0] << 56) |
+ ((uint64_t)span_bytes[1] << 48) |
+ ((uint64_t)span_bytes[2] << 40) |
+ ((uint64_t)span_bytes[3] << 32) |
+ ((uint64_t)span_bytes[4] << 24) |
+ ((uint64_t)span_bytes[5] << 16) |
+ ((uint64_t)span_bytes[6] << 8) |
+ ((uint64_t)span_bytes[7]);
+ }
+
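Side note (not part of the gem's diff): the otel_span_id_to_uint conversion above is the standard big-endian shift-and-or reconstruction. The following minimal standalone sketch shows the same arithmetic applied to an arbitrary, made-up 8-byte id, for readers who want to check the byte order.

#include <stdint.h>
#include <stdio.h>

// Same shift-and-or conversion as otel_span_id_to_uint above, applied to an
// illustrative id (bytes chosen arbitrarily, not from any real trace).
int main(void) {
  unsigned char span_bytes[8] = {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF};
  uint64_t span_id = 0;
  for (int i = 0; i < 8; i++) span_id = (span_id << 8) | span_bytes[i];
  printf("0x%016llx\n", (unsigned long long) span_id); // prints 0x0123456789abcdef
  return 0;
}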
+ #ifndef NO_GVL_INSTRUMENTATION
+ // This function can get called from outside the GVL and even on non-main Ractors
+ void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread) {
+ // Because this function gets called from a thread that is NOT holding the GVL, we avoid touching the
+ // per-thread context directly.
+ //
+ // Instead, we ask Ruby to hold the data we need in Ruby's own special per-thread context area
+ // that's thread-safe and built for this kind of use
+ //
+ // Also, this function can get called on the non-main Ractor. We deal with this by checking if the value in the context
+ // is non-zero, since only `initialize_context` ever sets the value from 0 to non-zero for threads it sees.
+ intptr_t thread_being_profiled = gvl_profiling_state_get(thread);
+ if (!thread_being_profiled) return;
+
+ long current_monotonic_wall_time_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
+ if (current_monotonic_wall_time_ns <= 0 || current_monotonic_wall_time_ns > GVL_WAITING_ENABLED_EMPTY) return;
+
+ gvl_profiling_state_set(thread, current_monotonic_wall_time_ns);
+ }
+
+ // This function can get called from outside the GVL and even on non-main Ractors
+ __attribute__((warn_unused_result))
+ bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
+ intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);
+
+ // Thread was not being profiled / not waiting on gvl
+ if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
+
+ // @ivoanjo: I'm not sure if this can happen -- It means we should've sampled already but haven't gotten the chance yet?
+ if (gvl_waiting_at < 0) return true;
+
+ long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;
+
+ bool should_sample = waiting_for_gvl_duration_ns >= waiting_for_gvl_threshold_ns;
+
+ if (should_sample) {
+ // We flip the gvl_waiting_at to negative to mark that the thread is now running and no longer waiting
+ intptr_t gvl_waiting_at_is_now_running = -gvl_waiting_at;
+
+ gvl_profiling_state_set(thread, gvl_waiting_at_is_now_running);
+ } else {
+ // We decided not to sample. Let's mark the thread back to the initial "enabled but empty" state
+ gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
+ }
+
+ return should_sample;
+ }
+
+ __attribute__((warn_unused_result))
+ bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
+ return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
+ }
+
+ // Why does this method need to exist?
+ //
+ // You may be surprised to see that if we never call this function (from cpu_and_wall_time_worker), Waiting for GVL
+ // samples will still show up.
+ // This is because regular cpu/wall-time samples also use `update_metrics_and_sample` which will do the right thing
+ // and push "Waiting for GVL" samples as needed.
+ //
+ // The reason this method needs to exist and be called very shortly after thread_context_collector_on_gvl_running
+ // returning true is to ensure accuracy of both the timing and stack for the Waiting for GVL sample.
+ //
+ // Timing:
+ // Because we currently only record the timestamp when the Waiting for GVL started and not when the Waiting for GVL ended,
+ // we rely on pushing a sample as soon as possible when the Waiting for GVL ends so that the timestamp of the sample
+ // actually matches when we stopped waiting.
+ //
+ // Stack:
+ // If the thread starts working without the end of the Waiting for GVL sample, then by the time the thread is sampled
+ // via the regular cpu/wall-time samples mechanism, the stack can be be inaccurate (e.g. does not correctly pinpoint
+ // where the waiting happened).
+ //
+ // Arguably, the last sample after Waiting for GVL ended (when gvl_waiting_at < 0) should always come from this method
+ // and not a regular cpu/wall-time sample BUT since all of these things are happening in parallel/concurrently I suspect
+ // it's possible for a regular sample to kick in just before this one.
+ //
+ // ---
+ //
+ // NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
+ // exists to enable testing.
+ VALUE thread_context_collector_sample_after_gvl_running_with_thread(VALUE self_instance, VALUE current_thread) {
+ struct thread_context_collector_state *state;
+ TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+ if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "GVL profiling requires timeline to be enabled");
+
+ intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread);
+
+ if (gvl_waiting_at >= 0) {
+ // @ivoanjo: I'm not sure if this can ever happen. This means that we're not on the same thread
+ // that ran `thread_context_collector_on_gvl_running` and made the decision to sample OR a regular sample was
+ // triggered ahead of us.
+ // We do nothing in this case.
+ return Qfalse;
+ }
+
+ struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
+
+ // We don't actually account for cpu-time during Waiting for GVL. BUT, we may chose to push an
+ // extra sample to represent the period prior to Waiting for GVL. To support that, we retrieve the current
+ // cpu-time of the thread and let `update_metrics_and_sample` decide what to do with it.
+ long cpu_time_for_thread = cpu_time_now_ns(thread_context);
+
+ // TODO: Should we update the dynamic sampling rate overhead tracking with this sample as well?
+
+ update_metrics_and_sample(
+ state,
+ /* thread_being_sampled: */ current_thread,
+ /* stack_from_thread: */ current_thread,
+ thread_context,
+ thread_context->sampling_buffer,
+ cpu_time_for_thread,
+ monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
+ );
+
+ return Qtrue; // To allow this to be called from rb_rescue2
+ }
+
+ VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
+ return thread_context_collector_sample_after_gvl_running_with_thread(self_instance, rb_thread_current());
+ }
+
+ // This method is intended to be called from update_metrics_and_sample. It exists to handle extra sampling steps we
+ // need to take when sampling cpu/wall-time for a thread that's in the "Waiting for GVL" state.
+ __attribute__((warn_unused_result))
+ static bool handle_gvl_waiting(
+ struct thread_context_collector_state *state,
+ VALUE thread_being_sampled,
+ VALUE stack_from_thread,
+ struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
+ long current_cpu_time_ns
+ ) {
+ intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread_being_sampled);
+
+ bool is_gvl_waiting_state = gvl_waiting_at != 0 && gvl_waiting_at != GVL_WAITING_ENABLED_EMPTY;
+
+ if (!is_gvl_waiting_state) return false;
+
+ // We can be in one of 2 situations here:
+ //
+ // 1. The current sample is the first one after we entered the "Waiting for GVL" state
+ // (wall_time_at_previous_sample_ns < abs(gvl_waiting_at))
+ //
+ // time ─────►
+ // ...──────────────┬───────────────────...
+ // Other state │ Waiting for GVL
+ // ...──────────────┴───────────────────...
+ // ▲ ▲
+ // └─ Previous sample └─ Regular sample (caller)
+ //
+ // In this case, we'll want to push two samples: a) one for the current time (handled by the caller), b) an extra sample
+ // to represent the remaining cpu/wall time before the "Waiting for GVL" started:
+ //
+ // time ─────►
+ // ...──────────────┬───────────────────...
+ // Other state │ Waiting for GVL
+ // ...──────────────┴───────────────────...
+ // ▲ ▲ ▲
+ // └─ Prev... └─ Extra sample └─ Regular sample (caller)
+ //
+ // 2. The current sample is the n-th one after we entered the "Waiting for GVL" state
+ // (wall_time_at_previous_sample_ns > abs(gvl_waiting_at))
+ //
+ // time ─────►
+ // ...──────────────┬───────────────────────────────────────────────...
+ // Other state │ Waiting for GVL
+ // ...──────────────┴───────────────────────────────────────────────...
+ // ▲ ▲ ▲
+ // └─ Previous sample └─ Previous sample └─ Regular sample (caller)
+ //
+ // In this case, we just report back to the caller that the thread is in the "Waiting for GVL" state.
+ //
+ // ---
+ //
+ // Overall, gvl_waiting_at will be > 0 if still in the "Waiting for GVL" state and < 0 if we actually reached the end of
+ // the wait.
+ //
+ // It doesn't really matter if the thread is still waiting or just reached the end of the wait: each sample represents
+ // a snapshot at time ending now, so if the state finished, it just means the next sample will be a regular one.
+
+ if (gvl_waiting_at < 0) {
+ // Negative means the waiting for GVL just ended, so we clear the state, so next samples no longer represent waiting
+ gvl_profiling_state_thread_object_set(thread_being_sampled, GVL_WAITING_ENABLED_EMPTY);
+ }
+
+ long gvl_waiting_started_wall_time_ns = labs(gvl_waiting_at);
+
+ if (thread_context->wall_time_at_previous_sample_ns < gvl_waiting_started_wall_time_ns) { // situation 1 above
+ long cpu_time_elapsed_ns = update_time_since_previous_sample(
+ &thread_context->cpu_time_at_previous_sample_ns,
+ current_cpu_time_ns,
+ thread_context->gc_tracking.cpu_time_at_start_ns,
+ IS_NOT_WALL_TIME
+ );
+
+ long duration_until_start_of_gvl_waiting_ns = update_time_since_previous_sample(
+ &thread_context->wall_time_at_previous_sample_ns,
+ gvl_waiting_started_wall_time_ns,
+ INVALID_TIME,
+ IS_WALL_TIME
+ );
+
+ // Push extra sample
+ trigger_sample_for_thread(
+ state,
+ thread_being_sampled,
+ stack_from_thread,
+ thread_context,
+ sampling_buffer,
+ (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = duration_until_start_of_gvl_waiting_ns},
+ gvl_waiting_started_wall_time_ns,
+ NULL,
+ NULL,
+ false // This is the extra sample before the wait begun; only the next sample will be in the gvl waiting state
+ );
+ }
+
+ return true;
+ }
+
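Side note (not part of the gem's diff): a concrete, made-up set of timestamps may help read "situation 1" above. With a previous sample at 100ns, a wait that started at 150ns, and the caller sampling at 200ns, the extra sample covers the 50ns before the wait and the caller's sample covers the 50ns spent waiting. The standalone sketch below just does that arithmetic.

#include <stdio.h>

// Illustrative numbers only: previous sample at 100ns, wait starts at 150ns,
// caller samples at 200ns (situation 1 in the diagram above).
int main(void) {
  long wall_time_at_previous_sample_ns = 100;
  long gvl_waiting_started_wall_time_ns = 150;
  long current_monotonic_wall_time_ns = 200;

  // Extra sample: wall time spent *before* the wait started
  long extra_sample_wall_ns = gvl_waiting_started_wall_time_ns - wall_time_at_previous_sample_ns;
  // Regular (caller) sample: wall time spent waiting for the GVL
  long waiting_sample_wall_ns = current_monotonic_wall_time_ns - gvl_waiting_started_wall_time_ns;

  printf("extra sample: %ldns, waiting sample: %ldns\n", extra_sample_wall_ns, waiting_sample_wall_ns);
  return 0;
}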
+ static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) {
+ ENFORCE_THREAD(thread);
+
+ thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread));
+ return Qnil;
+ }
+
+ static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) {
+ ENFORCE_THREAD(thread);
+
+ intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);
+ return LONG2NUM(gvl_waiting_at);
+ }
+
+ static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
+ ENFORCE_THREAD(thread);
+
+ return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
+ }
+
+ static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
+ ENFORCE_THREAD(thread);
+
+ return thread_context_collector_sample_after_gvl_running_with_thread(collector_instance, thread);
+ }
+
+ static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
+ ENFORCE_THREAD(thread);
+
+ struct thread_context_collector_state *state;
+ TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+ struct per_thread_context *thread_context = get_context_for(thread, state);
+ if (thread_context == NULL) rb_raise(rb_eArgError, "Unexpected: This method cannot be used unless the per-thread context for the thread already exists");
+
+ thread_context->cpu_time_at_previous_sample_ns += NUM2LONG(delta_ns);
+
+ return Qtrue;
+ }
+
+ #else
+ static bool handle_gvl_waiting(
+ DDTRACE_UNUSED struct thread_context_collector_state *state,
+ DDTRACE_UNUSED VALUE thread_being_sampled,
+ DDTRACE_UNUSED VALUE stack_from_thread,
+ DDTRACE_UNUSED struct per_thread_context *thread_context,
+ DDTRACE_UNUSED sampling_buffer* sampling_buffer,
+ DDTRACE_UNUSED long current_cpu_time_ns
+ ) { return false; }
+ #endif // NO_GVL_INSTRUMENTATION
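
Side note (not part of the gem's diff): the per-thread gvl_waiting slot read and written throughout the functions above is effectively a tiny state machine encoded in a single intptr_t. The standalone sketch below summarizes that encoding as read from the code; the sentinel value and function name here are simplified stand-ins, not the real gvl_profiling_helper machinery.

#include <stdint.h>
#include <stdio.h>

// Simplified stand-in for RUBY_FIXNUM_MAX; the real code uses the Ruby constant.
#define GVL_WAITING_ENABLED_EMPTY INTPTR_MAX

// Encoding used by the collector, as read from the functions above:
//   0                         -> thread is not being profiled
//   GVL_WAITING_ENABLED_EMPTY -> profiled, but not currently waiting for the GVL
//   > 0                       -> waiting since this monotonic timestamp (ns)
//   < 0                       -> wait just ended; |value| is when the wait had started
static const char *describe(intptr_t gvl_waiting_at) {
  if (gvl_waiting_at == 0) return "not profiled";
  if (gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return "profiled, not waiting";
  if (gvl_waiting_at > 0) return "waiting for GVL since timestamp";
  return "wait just ended (started at -value)";
}

int main(void) {
  intptr_t examples[] = {0, GVL_WAITING_ENABLED_EMPTY, 123456789, -123456789};
  for (int i = 0; i < 4; i++) printf("%lld -> %s\n", (long long) examples[i], describe(examples[i]));
  return 0;
}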