datadog 2.2.0 → 2.4.0

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (196)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +87 -2
  3. data/ext/datadog_profiling_loader/datadog_profiling_loader.c +9 -1
  4. data/ext/datadog_profiling_loader/extconf.rb +14 -26
  5. data/ext/datadog_profiling_native_extension/clock_id.h +1 -0
  6. data/ext/datadog_profiling_native_extension/clock_id_from_pthread.c +1 -2
  7. data/ext/datadog_profiling_native_extension/clock_id_noop.c +1 -2
  8. data/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +257 -69
  9. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +53 -28
  10. data/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h +34 -4
  11. data/ext/datadog_profiling_native_extension/collectors_idle_sampling_helper.c +4 -0
  12. data/ext/datadog_profiling_native_extension/collectors_stack.c +136 -81
  13. data/ext/datadog_profiling_native_extension/collectors_stack.h +2 -2
  14. data/ext/datadog_profiling_native_extension/collectors_thread_context.c +661 -48
  15. data/ext/datadog_profiling_native_extension/collectors_thread_context.h +10 -1
  16. data/ext/datadog_profiling_native_extension/datadog_ruby_common.c +83 -0
  17. data/ext/datadog_profiling_native_extension/datadog_ruby_common.h +53 -0
  18. data/ext/datadog_profiling_native_extension/extconf.rb +91 -69
  19. data/ext/datadog_profiling_native_extension/gvl_profiling_helper.c +50 -0
  20. data/ext/datadog_profiling_native_extension/gvl_profiling_helper.h +75 -0
  21. data/ext/datadog_profiling_native_extension/heap_recorder.c +54 -12
  22. data/ext/datadog_profiling_native_extension/heap_recorder.h +3 -1
  23. data/ext/datadog_profiling_native_extension/helpers.h +6 -17
  24. data/ext/datadog_profiling_native_extension/http_transport.c +41 -9
  25. data/ext/datadog_profiling_native_extension/libdatadog_helpers.c +0 -86
  26. data/ext/datadog_profiling_native_extension/libdatadog_helpers.h +2 -23
  27. data/ext/datadog_profiling_native_extension/native_extension_helpers.rb +61 -172
  28. data/ext/datadog_profiling_native_extension/private_vm_api_access.c +116 -139
  29. data/ext/datadog_profiling_native_extension/private_vm_api_access.h +20 -11
  30. data/ext/datadog_profiling_native_extension/profiling.c +1 -3
  31. data/ext/datadog_profiling_native_extension/ruby_helpers.c +0 -33
  32. data/ext/datadog_profiling_native_extension/ruby_helpers.h +1 -26
  33. data/ext/datadog_profiling_native_extension/setup_signal_handler.h +1 -0
  34. data/ext/datadog_profiling_native_extension/stack_recorder.c +14 -2
  35. data/ext/datadog_profiling_native_extension/stack_recorder.h +2 -0
  36. data/ext/datadog_profiling_native_extension/time_helpers.c +0 -15
  37. data/ext/datadog_profiling_native_extension/time_helpers.h +36 -6
  38. data/ext/{datadog_profiling_native_extension → libdatadog_api}/crashtracker.c +37 -22
  39. data/ext/libdatadog_api/datadog_ruby_common.c +83 -0
  40. data/ext/libdatadog_api/datadog_ruby_common.h +53 -0
  41. data/ext/libdatadog_api/extconf.rb +108 -0
  42. data/ext/libdatadog_api/macos_development.md +26 -0
  43. data/ext/libdatadog_extconf_helpers.rb +130 -0
  44. data/lib/datadog/appsec/assets/waf_rules/recommended.json +2184 -108
  45. data/lib/datadog/appsec/assets/waf_rules/strict.json +1430 -2
  46. data/lib/datadog/appsec/component.rb +29 -8
  47. data/lib/datadog/appsec/configuration/settings.rb +2 -2
  48. data/lib/datadog/appsec/contrib/devise/patcher/authenticatable_patch.rb +1 -0
  49. data/lib/datadog/appsec/contrib/devise/patcher/rememberable_patch.rb +21 -0
  50. data/lib/datadog/appsec/contrib/devise/patcher.rb +12 -2
  51. data/lib/datadog/appsec/contrib/graphql/appsec_trace.rb +35 -0
  52. data/lib/datadog/appsec/contrib/graphql/gateway/multiplex.rb +109 -0
  53. data/lib/datadog/appsec/contrib/graphql/gateway/watcher.rb +71 -0
  54. data/lib/datadog/appsec/contrib/graphql/integration.rb +54 -0
  55. data/lib/datadog/appsec/contrib/graphql/patcher.rb +37 -0
  56. data/lib/datadog/appsec/contrib/graphql/reactive/multiplex.rb +59 -0
  57. data/lib/datadog/appsec/contrib/rack/gateway/request.rb +3 -6
  58. data/lib/datadog/appsec/event.rb +1 -1
  59. data/lib/datadog/appsec/processor/actions.rb +1 -1
  60. data/lib/datadog/appsec/processor/rule_loader.rb +3 -1
  61. data/lib/datadog/appsec/processor/rule_merger.rb +33 -15
  62. data/lib/datadog/appsec/processor.rb +36 -37
  63. data/lib/datadog/appsec/rate_limiter.rb +25 -40
  64. data/lib/datadog/appsec/remote.rb +7 -3
  65. data/lib/datadog/appsec/response.rb +15 -1
  66. data/lib/datadog/appsec.rb +3 -2
  67. data/lib/datadog/core/configuration/components.rb +18 -15
  68. data/lib/datadog/core/configuration/settings.rb +135 -9
  69. data/lib/datadog/core/crashtracking/agent_base_url.rb +21 -0
  70. data/lib/datadog/core/crashtracking/component.rb +111 -0
  71. data/lib/datadog/core/crashtracking/tag_builder.rb +39 -0
  72. data/lib/datadog/core/diagnostics/environment_logger.rb +8 -11
  73. data/lib/datadog/core/environment/execution.rb +5 -5
  74. data/lib/datadog/core/metrics/client.rb +7 -0
  75. data/lib/datadog/core/rate_limiter.rb +183 -0
  76. data/lib/datadog/core/remote/client/capabilities.rb +4 -3
  77. data/lib/datadog/core/remote/component.rb +4 -2
  78. data/lib/datadog/core/remote/negotiation.rb +4 -4
  79. data/lib/datadog/core/remote/tie.rb +2 -0
  80. data/lib/datadog/core/runtime/metrics.rb +1 -1
  81. data/lib/datadog/core/telemetry/component.rb +51 -2
  82. data/lib/datadog/core/telemetry/emitter.rb +9 -11
  83. data/lib/datadog/core/telemetry/event.rb +37 -1
  84. data/lib/datadog/core/telemetry/ext.rb +1 -0
  85. data/lib/datadog/core/telemetry/http/adapters/net.rb +10 -12
  86. data/lib/datadog/core/telemetry/http/ext.rb +3 -0
  87. data/lib/datadog/core/telemetry/http/transport.rb +38 -9
  88. data/lib/datadog/core/telemetry/logger.rb +51 -0
  89. data/lib/datadog/core/telemetry/logging.rb +71 -0
  90. data/lib/datadog/core/telemetry/request.rb +13 -1
  91. data/lib/datadog/core/utils/at_fork_monkey_patch.rb +102 -0
  92. data/lib/datadog/core/utils/time.rb +12 -0
  93. data/lib/datadog/di/code_tracker.rb +168 -0
  94. data/lib/datadog/di/configuration/settings.rb +163 -0
  95. data/lib/datadog/di/configuration.rb +11 -0
  96. data/lib/datadog/di/error.rb +31 -0
  97. data/lib/datadog/di/extensions.rb +16 -0
  98. data/lib/datadog/di/probe.rb +133 -0
  99. data/lib/datadog/di/probe_builder.rb +41 -0
  100. data/lib/datadog/di/redactor.rb +188 -0
  101. data/lib/datadog/di/serializer.rb +193 -0
  102. data/lib/datadog/di.rb +14 -0
  103. data/lib/datadog/kit/appsec/events.rb +2 -4
  104. data/lib/datadog/opentelemetry/sdk/propagator.rb +2 -0
  105. data/lib/datadog/opentelemetry/sdk/span_processor.rb +10 -0
  106. data/lib/datadog/opentelemetry/sdk/trace/span.rb +23 -0
  107. data/lib/datadog/profiling/collectors/code_provenance.rb +7 -7
  108. data/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb +28 -26
  109. data/lib/datadog/profiling/collectors/idle_sampling_helper.rb +11 -13
  110. data/lib/datadog/profiling/collectors/info.rb +15 -6
  111. data/lib/datadog/profiling/collectors/thread_context.rb +30 -2
  112. data/lib/datadog/profiling/component.rb +89 -95
  113. data/lib/datadog/profiling/exporter.rb +3 -3
  114. data/lib/datadog/profiling/ext/dir_monkey_patches.rb +3 -3
  115. data/lib/datadog/profiling/ext.rb +21 -21
  116. data/lib/datadog/profiling/flush.rb +1 -1
  117. data/lib/datadog/profiling/http_transport.rb +14 -7
  118. data/lib/datadog/profiling/load_native_extension.rb +5 -5
  119. data/lib/datadog/profiling/preload.rb +1 -1
  120. data/lib/datadog/profiling/profiler.rb +5 -8
  121. data/lib/datadog/profiling/scheduler.rb +33 -25
  122. data/lib/datadog/profiling/stack_recorder.rb +3 -0
  123. data/lib/datadog/profiling/tag_builder.rb +2 -2
  124. data/lib/datadog/profiling/tasks/exec.rb +5 -5
  125. data/lib/datadog/profiling/tasks/setup.rb +16 -35
  126. data/lib/datadog/profiling.rb +4 -5
  127. data/lib/datadog/single_step_instrument.rb +12 -0
  128. data/lib/datadog/tracing/contrib/action_cable/instrumentation.rb +8 -12
  129. data/lib/datadog/tracing/contrib/action_pack/action_controller/instrumentation.rb +5 -0
  130. data/lib/datadog/tracing/contrib/action_pack/action_dispatch/instrumentation.rb +78 -0
  131. data/lib/datadog/tracing/contrib/action_pack/action_dispatch/patcher.rb +33 -0
  132. data/lib/datadog/tracing/contrib/action_pack/patcher.rb +2 -0
  133. data/lib/datadog/tracing/contrib/active_record/configuration/resolver.rb +4 -0
  134. data/lib/datadog/tracing/contrib/active_record/events/instantiation.rb +3 -1
  135. data/lib/datadog/tracing/contrib/active_record/events/sql.rb +4 -1
  136. data/lib/datadog/tracing/contrib/active_support/cache/events/cache.rb +5 -1
  137. data/lib/datadog/tracing/contrib/aws/instrumentation.rb +5 -0
  138. data/lib/datadog/tracing/contrib/elasticsearch/patcher.rb +6 -1
  139. data/lib/datadog/tracing/contrib/ext.rb +14 -0
  140. data/lib/datadog/tracing/contrib/faraday/middleware.rb +9 -0
  141. data/lib/datadog/tracing/contrib/grape/endpoint.rb +19 -0
  142. data/lib/datadog/tracing/contrib/graphql/patcher.rb +9 -12
  143. data/lib/datadog/tracing/contrib/graphql/trace_patcher.rb +3 -3
  144. data/lib/datadog/tracing/contrib/graphql/tracing_patcher.rb +3 -3
  145. data/lib/datadog/tracing/contrib/graphql/unified_trace.rb +14 -10
  146. data/lib/datadog/tracing/contrib/graphql/unified_trace_patcher.rb +10 -4
  147. data/lib/datadog/tracing/contrib/http/instrumentation.rb +18 -15
  148. data/lib/datadog/tracing/contrib/httpclient/instrumentation.rb +6 -5
  149. data/lib/datadog/tracing/contrib/httpclient/patcher.rb +1 -14
  150. data/lib/datadog/tracing/contrib/httprb/instrumentation.rb +5 -0
  151. data/lib/datadog/tracing/contrib/httprb/patcher.rb +1 -14
  152. data/lib/datadog/tracing/contrib/lograge/patcher.rb +15 -0
  153. data/lib/datadog/tracing/contrib/mongodb/subscribers.rb +2 -0
  154. data/lib/datadog/tracing/contrib/mysql2/configuration/settings.rb +5 -0
  155. data/lib/datadog/tracing/contrib/mysql2/instrumentation.rb +17 -13
  156. data/lib/datadog/tracing/contrib/opensearch/patcher.rb +13 -6
  157. data/lib/datadog/tracing/contrib/patcher.rb +2 -1
  158. data/lib/datadog/tracing/contrib/pg/configuration/settings.rb +5 -0
  159. data/lib/datadog/tracing/contrib/pg/instrumentation.rb +4 -1
  160. data/lib/datadog/tracing/contrib/presto/patcher.rb +1 -13
  161. data/lib/datadog/tracing/contrib/propagation/sql_comment/ext.rb +28 -0
  162. data/lib/datadog/tracing/contrib/propagation/sql_comment/mode.rb +5 -1
  163. data/lib/datadog/tracing/contrib/propagation/sql_comment.rb +22 -10
  164. data/lib/datadog/tracing/contrib/rack/middlewares.rb +27 -0
  165. data/lib/datadog/tracing/contrib/redis/tags.rb +4 -0
  166. data/lib/datadog/tracing/contrib/sinatra/tracer.rb +4 -0
  167. data/lib/datadog/tracing/contrib/stripe/request.rb +3 -2
  168. data/lib/datadog/tracing/contrib/trilogy/configuration/settings.rb +5 -0
  169. data/lib/datadog/tracing/contrib/trilogy/instrumentation.rb +4 -1
  170. data/lib/datadog/tracing/diagnostics/environment_logger.rb +14 -16
  171. data/lib/datadog/tracing/distributed/propagation.rb +7 -0
  172. data/lib/datadog/tracing/metadata/errors.rb +9 -1
  173. data/lib/datadog/tracing/metadata/ext.rb +6 -0
  174. data/lib/datadog/tracing/pipeline/span_filter.rb +2 -2
  175. data/lib/datadog/tracing/remote.rb +5 -2
  176. data/lib/datadog/tracing/sampling/matcher.rb +6 -1
  177. data/lib/datadog/tracing/sampling/rate_sampler.rb +1 -1
  178. data/lib/datadog/tracing/sampling/rule.rb +2 -0
  179. data/lib/datadog/tracing/sampling/rule_sampler.rb +9 -5
  180. data/lib/datadog/tracing/sampling/span/ext.rb +1 -1
  181. data/lib/datadog/tracing/sampling/span/rule.rb +2 -2
  182. data/lib/datadog/tracing/span.rb +9 -2
  183. data/lib/datadog/tracing/span_event.rb +41 -0
  184. data/lib/datadog/tracing/span_operation.rb +6 -2
  185. data/lib/datadog/tracing/trace_operation.rb +26 -2
  186. data/lib/datadog/tracing/tracer.rb +14 -12
  187. data/lib/datadog/tracing/transport/http/client.rb +1 -0
  188. data/lib/datadog/tracing/transport/io/client.rb +1 -0
  189. data/lib/datadog/tracing/transport/serializable_trace.rb +3 -0
  190. data/lib/datadog/tracing/workers/trace_writer.rb +1 -1
  191. data/lib/datadog/tracing/workers.rb +1 -1
  192. data/lib/datadog/version.rb +1 -1
  193. metadata +46 -11
  194. data/lib/datadog/profiling/crashtracker.rb +0 -91
  195. data/lib/datadog/profiling/ext/forking.rb +0 -98
  196. data/lib/datadog/tracing/sampling/rate_limiter.rb +0 -185
@@ -76,6 +76,11 @@
  #define MISSING_TRACER_CONTEXT_KEY 0
  #define TIME_BETWEEN_GC_EVENTS_NS MILLIS_AS_NS(10)

+ // This is used as a placeholder to mark threads that are allowed to be profiled (enabled)
+ // (e.g. to avoid trying to gvl profile threads that are not from the main Ractor)
+ // and for which there's no data yet
+ #define GVL_WAITING_ENABLED_EMPTY RUBY_FIXNUM_MAX
+
  static ID at_active_span_id; // id of :@active_span in Ruby
  static ID at_active_trace_id; // id of :@active_trace in Ruby
  static ID at_id_id; // id of :@id in Ruby
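Since GVL_WAITING_ENABLED_EMPTY is referenced throughout the rest of this diff, the following minimal sketch (illustrative only, not part of the diff) shows how a reader of the per-thread GVL slot could distinguish the three cases. It assumes the accessor gvl_profiling_state_thread_object_get introduced elsewhere in this diff returns the raw integer stored in the slot, and that a slot that was never initialized reads as zero:

// Illustrative sketch only: interpreting the per-thread GVL profiling slot.
// Assumes an untouched slot reads as 0 (an assumption, not stated by the diff).
static const char *describe_gvl_slot(VALUE thread) {
  long gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);

  if (gvl_waiting_at == 0) {
    return "not being profiled (e.g. thread not from the main Ractor)";
  } else if (gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) {
    return "profiled, but no waiting-for-GVL data yet";
  } else {
    return "started waiting for the GVL at this monotonic timestamp (ns)";
  }
}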
@@ -86,13 +91,34 @@ static ID at_otel_values_id; // id of :@otel_values in Ruby
  static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby
  static ID at_datadog_trace_id; // id of :@datadog_trace in Ruby

+ // Used to support reading trace identifiers from the opentelemetry Ruby library when the ddtrace gem tracing
+ // integration is NOT in use.
+ static ID at_span_id_id; // id of :@span_id in Ruby
+ static ID at_trace_id_id; // id of :@trace_id in Ruby
+ static ID at_entries_id; // id of :@entries in Ruby
+ static ID at_context_id; // id of :@context in Ruby
+ static ID at_kind_id; // id of :@kind in Ruby
+ static ID at_name_id; // id of :@name in Ruby
+ static ID server_id; // id of :server in Ruby
+ static ID otel_context_storage_id; // id of :__opentelemetry_context_storage__ in Ruby
+
+ // This is used by `thread_context_collector_on_gvl_running`. Because when that method gets called we're not sure if
+ // it's safe to access the state of the thread context collector, we store this setting as a global value. This does
+ // mean this setting is shared among all thread context collectors, and thus it's "last writer wins".
+ // In production this should not be a problem: there should only be one profiler, which is the last one created,
+ // and that'll be the one that last wrote this setting.
+ static uint32_t global_waiting_for_gvl_threshold_ns = MILLIS_AS_NS(10);
+
+ enum otel_context_enabled {otel_context_enabled_false, otel_context_enabled_only, otel_context_enabled_both};
+
  // Contains state for a single ThreadContext instance
  struct thread_context_collector_state {
  // Note: Places in this file that usually need to be changed when this struct is changed are tagged with
  // "Update this when modifying state struct"

  // Required by Datadog::Profiling::Collectors::Stack as a scratch buffer during sampling
- sampling_buffer *sampling_buffer;
+ ddog_prof_Location *locations;
+ uint16_t max_frames;
  // Hashmap <Thread Object, struct per_thread_context>
  st_table *hash_map_per_thread_context;
  // Datadog::Profiling::StackRecorder instance
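Because global_waiting_for_gvl_threshold_ns is a plain global, the "last writer wins" behavior described in the comment above is easy to picture. A hypothetical sketch (not from the diff; the helper names are illustrative stand-ins for what _native_initialize and the GVL-running check do):

#include <stdbool.h>
#include <stdint.h>

#define MILLIS_AS_NS(value) ((uint64_t)(value) * 1000 * 1000)

// Whichever collector was initialized last decides the threshold all readers see.
static uint32_t global_waiting_for_gvl_threshold_ns = MILLIS_AS_NS(10);

// Stand-in for the write performed during collector initialization.
static void initialize_collector(uint32_t waiting_for_gvl_threshold_ns) {
  global_waiting_for_gvl_threshold_ns = waiting_for_gvl_threshold_ns; // overwrites any earlier collector's value
}

// Stand-in for the check done when a thread regains the GVL: only waits at or
// above the (latest) threshold are worth flagging for an extra sample.
static bool waited_long_enough(uint64_t waiting_time_ns) {
  return waiting_time_ns >= global_waiting_for_gvl_threshold_ns;
}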
@@ -111,6 +137,8 @@ struct thread_context_collector_state {
  bool endpoint_collection_enabled;
  // Used to omit timestamps / timeline events from collected data
  bool timeline_enabled;
+ // Used to control context collection
+ enum otel_context_enabled otel_context_enabled;
  // Used to omit class information from collected allocation data
  bool allocation_type_enabled;
  // Used when calling monotonic_to_system_epoch_ns
@@ -118,6 +146,8 @@ struct thread_context_collector_state {
  // Used to identify the main thread, to give it a fallback name
  VALUE main_thread;
  // Used when extracting trace identifiers from otel spans. Lazily initialized.
+ // Qtrue serves as a marker we've not yet extracted it; when we try to extract it, we set it to an object if
+ // successful and Qnil if not.
  VALUE otel_current_span_key;

  struct stats {
@@ -138,6 +168,7 @@ struct thread_context_collector_state {

  // Tracks per-thread state
  struct per_thread_context {
+ sampling_buffer *sampling_buffer;
  char thread_id[THREAD_ID_LIMIT_CHARS];
  ddog_CharSlice thread_id_char_slice;
  char thread_invoke_location[THREAD_INVOKE_LOCATION_LIMIT_CHARS];
@@ -162,6 +193,12 @@ struct trace_identifiers {
  VALUE trace_endpoint;
  };

+ struct otel_span {
+ VALUE span;
+ VALUE span_id;
+ VALUE trace_id;
+ };
+
  static void thread_context_collector_typed_data_mark(void *state_ptr);
  static void thread_context_collector_typed_data_free(void *state_ptr);
  static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _value, st_data_t _argument);
@@ -175,17 +212,20 @@ static VALUE _native_initialize(
  VALUE tracer_context_key,
  VALUE endpoint_collection_enabled,
  VALUE timeline_enabled,
+ VALUE waiting_for_gvl_threshold_ns,
+ VALUE otel_context_enabled,
  VALUE allocation_type_enabled
  );
  static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
  static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
  static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
- static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
- void update_metrics_and_sample(
+ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state);
+ static void update_metrics_and_sample(
  struct thread_context_collector_state *state,
  VALUE thread_being_sampled,
- VALUE profiler_overhead_stack_thread,
+ VALUE stack_from_thread,
  struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
  long current_cpu_time_ns,
  long current_monotonic_wall_time_ns
  );
@@ -194,15 +234,18 @@ static void trigger_sample_for_thread(
  VALUE thread,
  VALUE stack_from_thread,
  struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
  sample_values values,
  long current_monotonic_wall_time_ns,
  ddog_CharSlice *ruby_vm_type,
- ddog_CharSlice *class_name
+ ddog_CharSlice *class_name,
+ bool is_gvl_waiting_state
  );
  static VALUE _native_thread_list(VALUE self);
  static struct per_thread_context *get_or_create_context_for(VALUE thread, struct thread_context_collector_state *state);
  static struct per_thread_context *get_context_for(VALUE thread, struct thread_context_collector_state *state);
  static void initialize_context(VALUE thread, struct per_thread_context *thread_context, struct thread_context_collector_state *state);
+ static void free_context(struct per_thread_context* thread_context);
  static VALUE _native_inspect(VALUE self, VALUE collector_instance);
  static VALUE per_thread_context_st_table_as_ruby_hash(struct thread_context_collector_state *state);
  static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value_context, st_data_t result_hash);
@@ -231,6 +274,27 @@ static void ddtrace_otel_trace_identifiers_for(
  VALUE active_span,
  VALUE otel_values
  );
+ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples);
+ static bool handle_gvl_waiting(
+ struct thread_context_collector_state *state,
+ VALUE thread_being_sampled,
+ VALUE stack_from_thread,
+ struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
+ long current_cpu_time_ns
+ );
+ static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread);
+ static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread);
+ static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread);
+ static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread);
+ static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns);
+ static void otel_without_ddtrace_trace_identifiers_for(
+ struct thread_context_collector_state *state,
+ VALUE thread,
+ struct trace_identifiers *trace_identifiers_result
+ );
+ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
+ static uint64_t otel_span_id_to_uint(VALUE otel_span_id);

  void collectors_thread_context_init(VALUE profiling_module) {
  VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
@@ -248,19 +312,27 @@ void collectors_thread_context_init(VALUE profiling_module) {
  // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
  rb_define_alloc_func(collectors_thread_context_class, _native_new);

- rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, 7);
+ rb_define_singleton_method(collectors_thread_context_class, "_native_initialize", _native_initialize, 9);
  rb_define_singleton_method(collectors_thread_context_class, "_native_inspect", _native_inspect, 1);
  rb_define_singleton_method(collectors_thread_context_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
  rb_define_singleton_method(testing_module, "_native_sample", _native_sample, 2);
  rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3);
  rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
  rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
- rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
+ rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2);
  rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0);
  rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
  rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);
  rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1);
  rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0);
+ rb_define_singleton_method(testing_module, "_native_sample_skipped_allocation_samples", _native_sample_skipped_allocation_samples, 2);
+ #ifndef NO_GVL_INSTRUMENTATION
+ rb_define_singleton_method(testing_module, "_native_on_gvl_waiting", _native_on_gvl_waiting, 1);
+ rb_define_singleton_method(testing_module, "_native_gvl_waiting_at_for", _native_gvl_waiting_at_for, 1);
+ rb_define_singleton_method(testing_module, "_native_on_gvl_running", _native_on_gvl_running, 1);
+ rb_define_singleton_method(testing_module, "_native_sample_after_gvl_running", _native_sample_after_gvl_running, 2);
+ rb_define_singleton_method(testing_module, "_native_apply_delta_to_cpu_time_at_previous_sample_ns", _native_apply_delta_to_cpu_time_at_previous_sample_ns, 3);
+ #endif

  at_active_span_id = rb_intern_const("@active_span");
  at_active_trace_id = rb_intern_const("@active_trace");
@@ -271,6 +343,19 @@ void collectors_thread_context_init(VALUE profiling_module) {
  at_otel_values_id = rb_intern_const("@otel_values");
  at_parent_span_id_id = rb_intern_const("@parent_span_id");
  at_datadog_trace_id = rb_intern_const("@datadog_trace");
+ at_span_id_id = rb_intern_const("@span_id");
+ at_trace_id_id = rb_intern_const("@trace_id");
+ at_entries_id = rb_intern_const("@entries");
+ at_context_id = rb_intern_const("@context");
+ at_kind_id = rb_intern_const("@kind");
+ at_name_id = rb_intern_const("@name");
+ server_id = rb_intern_const("server");
+ otel_context_storage_id = rb_intern_const("__opentelemetry_context_storage__");
+
+ #ifndef NO_GVL_INSTRUMENTATION
+ // This will raise if Ruby already ran out of thread-local keys
+ gvl_profiling_init();
+ #endif

  gc_profiling_init();
  }
@@ -308,7 +393,7 @@ static void thread_context_collector_typed_data_free(void *state_ptr) {

  // Important: Remember that we're only guaranteed to see here what's been set in _native_new, aka
  // pointers that have been set NULL there may still be NULL here.
- if (state->sampling_buffer != NULL) sampling_buffer_free(state->sampling_buffer);
+ if (state->locations != NULL) ruby_xfree(state->locations);

  // Free each entry in the map
  st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */);
@@ -327,8 +412,8 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, DDTRACE_UNUSED

  // Used to clear each of the per_thread_contexts inside the hash_map_per_thread_context
  static int hash_map_per_thread_context_free_values(DDTRACE_UNUSED st_data_t _thread, st_data_t value_per_thread_context, DDTRACE_UNUSED st_data_t _argument) {
- struct per_thread_context *per_thread_context = (struct per_thread_context*) value_per_thread_context;
- ruby_xfree(per_thread_context);
+ struct per_thread_context *thread_context = (struct per_thread_context*) value_per_thread_context;
+ free_context(thread_context);
  return ST_CONTINUE;
  }

@@ -339,25 +424,39 @@ static VALUE _native_new(VALUE klass) {
  // being leaked.

  // Update this when modifying state struct
- state->sampling_buffer = NULL;
+ state->locations = NULL;
+ state->max_frames = 0;
  state->hash_map_per_thread_context =
  // "numtable" is an awful name, but TL;DR it's what should be used when keys are `VALUE`s.
  st_init_numtable();
  state->recorder_instance = Qnil;
  state->tracer_context_key = MISSING_TRACER_CONTEXT_KEY;
- state->thread_list_buffer = rb_ary_new();
+ VALUE thread_list_buffer = rb_ary_new();
+ state->thread_list_buffer = thread_list_buffer;
  state->endpoint_collection_enabled = true;
  state->timeline_enabled = true;
+ state->otel_context_enabled = otel_context_enabled_false;
  state->allocation_type_enabled = true;
  state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
- state->main_thread = rb_thread_main();
- state->otel_current_span_key = Qnil;
+ VALUE main_thread = rb_thread_main();
+ state->main_thread = main_thread;
+ state->otel_current_span_key = Qtrue;
  state->gc_tracking.wall_time_at_previous_gc_ns = INVALID_TIME;
  state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = 0;

- return TypedData_Wrap_Struct(klass, &thread_context_collector_typed_data, state);
+ // Note: Remember to keep any new allocated objects that get stored in the state also on the stack + mark them with
+ // RB_GC_GUARD -- otherwise it's possible for a GC to run and
+ // since the instance representing the state does not yet exist, such objects will not get marked.
+
+ VALUE instance = TypedData_Wrap_Struct(klass, &thread_context_collector_typed_data, state);
+
+ RB_GC_GUARD(thread_list_buffer);
+ RB_GC_GUARD(main_thread); // Arguably not needed, but perhaps can be removed in some future Ruby release?
+
+ return instance;
  }

+ // TODO: Convert this to use options like CpuAndWallTimeWorker
  static VALUE _native_initialize(
  DDTRACE_UNUSED VALUE _self,
  VALUE collector_instance,
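The RB_GC_GUARD dance in _native_new above reflects a general pattern in Ruby C extensions. Below is a hypothetical, simplified sketch of why the locals and guards are needed; my_state and my_type are illustrative stand-ins, not names from this diff:

#include <ruby.h>

// Simplified stand-ins for the extension's state struct and TypedData type.
struct my_state { VALUE buffer; };
extern const rb_data_type_t my_type;

static VALUE my_new(VALUE klass) {
  struct my_state *state = ruby_xcalloc(1, sizeof(struct my_state));

  // Keep the new object in a local too: until the wrapper instance exists,
  // the GC cannot reach anything stored only inside native memory.
  VALUE buffer = rb_ary_new();
  state->buffer = buffer;

  // This allocation can itself trigger GC; without the guard below, the
  // compiler may have already discarded `buffer` from the stack, and the
  // array could be collected out from under us.
  VALUE instance = TypedData_Wrap_Struct(klass, &my_type, state);

  RB_GC_GUARD(buffer); // forces `buffer` to stay live on the stack until here
  return instance;
}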
@@ -366,26 +465,38 @@ static VALUE _native_initialize(
  VALUE tracer_context_key,
  VALUE endpoint_collection_enabled,
  VALUE timeline_enabled,
+ VALUE waiting_for_gvl_threshold_ns,
+ VALUE otel_context_enabled,
  VALUE allocation_type_enabled
  ) {
  ENFORCE_BOOLEAN(endpoint_collection_enabled);
  ENFORCE_BOOLEAN(timeline_enabled);
+ ENFORCE_TYPE(waiting_for_gvl_threshold_ns, T_FIXNUM);
  ENFORCE_BOOLEAN(allocation_type_enabled);

  struct thread_context_collector_state *state;
  TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

- int max_frames_requested = NUM2INT(max_frames);
- if (max_frames_requested < 0) rb_raise(rb_eArgError, "Invalid max_frames: value must not be negative");
-
  // Update this when modifying state struct
- state->sampling_buffer = sampling_buffer_new(max_frames_requested);
+ state->max_frames = sampling_buffer_check_max_frames(NUM2INT(max_frames));
+ state->locations = ruby_xcalloc(state->max_frames, sizeof(ddog_prof_Location));
  // hash_map_per_thread_context is already initialized, nothing to do here
  state->recorder_instance = enforce_recorder_instance(recorder_instance);
  state->endpoint_collection_enabled = (endpoint_collection_enabled == Qtrue);
  state->timeline_enabled = (timeline_enabled == Qtrue);
+ if (otel_context_enabled == Qfalse || otel_context_enabled == Qnil) {
+ state->otel_context_enabled = otel_context_enabled_false;
+ } else if (otel_context_enabled == ID2SYM(rb_intern("only"))) {
+ state->otel_context_enabled = otel_context_enabled_only;
+ } else if (otel_context_enabled == ID2SYM(rb_intern("both"))) {
+ state->otel_context_enabled = otel_context_enabled_both;
+ } else {
+ rb_raise(rb_eArgError, "Unexpected value for otel_context_enabled: %+" PRIsVALUE, otel_context_enabled);
+ }
  state->allocation_type_enabled = (allocation_type_enabled == Qtrue);

+ global_waiting_for_gvl_threshold_ns = NUM2UINT(waiting_for_gvl_threshold_ns);
+
  if (RTEST(tracer_context_key)) {
  ENFORCE_TYPE(tracer_context_key, T_SYMBOL);
  // Note about rb_to_id and dynamic symbols: calling `rb_to_id` prevents symbols from ever being garbage collected.
@@ -416,13 +527,22 @@ static VALUE _native_on_gc_start(DDTRACE_UNUSED VALUE self, VALUE collector_inst
  // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
  static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
- thread_context_collector_on_gc_finish(collector_instance);
+ (void) !thread_context_collector_on_gc_finish(collector_instance);
  return Qtrue;
  }

  // This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
  // It SHOULD NOT be used for other purposes.
- static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
+ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) {
+ ENFORCE_BOOLEAN(reset_monotonic_to_system_state);
+
+ struct thread_context_collector_state *state;
+ TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+ if (reset_monotonic_to_system_state == Qtrue) {
+ state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
+ }
+
  thread_context_collector_sample_after_gc(collector_instance);
  return Qtrue;
  }
@@ -461,6 +581,7 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic
  /* thread_being_sampled: */ thread,
  /* stack_from_thread: */ thread,
  thread_context,
+ thread_context->sampling_buffer,
  current_cpu_time_ns,
  current_monotonic_wall_time_ns
  );
@@ -477,25 +598,33 @@
  /* thread_being_sampled: */ current_thread,
  /* stack_from_thread: */ profiler_overhead_stack_thread,
  current_thread_context,
+ // Here we use the overhead thread's sampling buffer so as to not invalidate the cache in the buffer of the thread being sampled
+ get_or_create_context_for(profiler_overhead_stack_thread, state)->sampling_buffer,
  cpu_time_now_ns(current_thread_context),
  monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
  );
  }

- void update_metrics_and_sample(
+ static void update_metrics_and_sample(
  struct thread_context_collector_state *state,
  VALUE thread_being_sampled,
  VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
  struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
  long current_cpu_time_ns,
  long current_monotonic_wall_time_ns
  ) {
- long cpu_time_elapsed_ns = update_time_since_previous_sample(
+ bool is_gvl_waiting_state =
+ handle_gvl_waiting(state, thread_being_sampled, stack_from_thread, thread_context, sampling_buffer, current_cpu_time_ns);
+
+ // Don't assign/update cpu during "Waiting for GVL"
+ long cpu_time_elapsed_ns = is_gvl_waiting_state ? 0 : update_time_since_previous_sample(
  &thread_context->cpu_time_at_previous_sample_ns,
  current_cpu_time_ns,
  thread_context->gc_tracking.cpu_time_at_start_ns,
  IS_NOT_WALL_TIME
  );
+
  long wall_time_elapsed_ns = update_time_since_previous_sample(
  &thread_context->wall_time_at_previous_sample_ns,
  current_monotonic_wall_time_ns,
@@ -507,15 +636,32 @@ void update_metrics_and_sample(
  IS_WALL_TIME
  );

+ // A thread enters "Waiting for GVL", well, as the name implies, without the GVL.
+ //
+ // As a consequence, it's possible that a thread enters "Waiting for GVL" in parallel with the current thread working
+ // on sampling, and thus for the `current_monotonic_wall_time_ns` (which is recorded at the start of sampling)
+ // to be < the time at which we started Waiting for GVL.
+ //
+ // All together, this means that when `handle_gvl_waiting` creates an extra sample (see comments on that function for
+ // what the extra sample is), it's possible that there's no more wall-time to be assigned.
+ // Thus, in this case, we don't want to produce a sample representing Waiting for GVL with a wall-time of 0, and
+ // thus we skip creating such a sample.
+ if (is_gvl_waiting_state && wall_time_elapsed_ns == 0) return;
+ // ...you may also wonder: is there any other situation where it makes sense to produce a sample with
+ // wall_time_elapsed_ns == 0? I believe that yes, because the sample still includes a timestamp and a stack, but we
+ // may revisit/change our minds on this in the future.
+
  trigger_sample_for_thread(
  state,
  thread_being_sampled,
  stack_from_thread,
  thread_context,
+ sampling_buffer,
  (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns},
  current_monotonic_wall_time_ns,
  NULL,
- NULL
+ NULL,
+ is_gvl_waiting_state
  );
  }

@@ -561,6 +707,7 @@ void thread_context_collector_on_gc_start(VALUE self_instance) {
  //
  // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
  // Assumption 2: This function is called from the main Ractor (if Ruby has support for Ractors).
+ __attribute__((warn_unused_result))
  bool thread_context_collector_on_gc_finish(VALUE self_instance) {
  struct thread_context_collector_state *state;
  if (!rb_typeddata_is_kind_of(self_instance, &thread_context_collector_typed_data)) return false;
@@ -661,7 +808,6 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance) {
  }

  record_placeholder_stack(
- state->sampling_buffer,
  state->recorder_instance,
  (sample_values) {
  // This event gets both a regular cpu/wall-time duration, as a normal cpu/wall-time sample would, as well as a
@@ -692,11 +838,13 @@ static void trigger_sample_for_thread(
  VALUE thread,
  VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
  struct per_thread_context *thread_context,
+ sampling_buffer* sampling_buffer,
  sample_values values,
  long current_monotonic_wall_time_ns,
  // These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point?
  ddog_CharSlice *ruby_vm_type,
- ddog_CharSlice *class_name
+ ddog_CharSlice *class_name,
+ bool is_gvl_waiting_state
  ) {
  int max_label_count =
  1 + // thread id
@@ -737,6 +885,11 @@
  struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
  trace_identifiers_for(state, thread, &trace_identifiers_result);

+ if (!trace_identifiers_result.valid && state->otel_context_enabled != otel_context_enabled_false) {
+ // If we couldn't get something with ddtrace, let's see if we can get some trace identifiers from opentelemetry directly
+ otel_without_ddtrace_trace_identifiers_for(state, thread, &trace_identifiers_result);
+ }
+
  if (trace_identifiers_result.valid) {
  labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("local root span id"), .num = trace_identifiers_result.local_root_span_id};
  labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("span id"), .num = trace_identifiers_result.span_id};
@@ -812,10 +965,15 @@

  sample_thread(
  stack_from_thread,
- state->sampling_buffer,
+ sampling_buffer,
  state->recorder_instance,
  values,
- (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns}
+ (sample_labels) {
+ .labels = slice_labels,
+ .state_label = state_label,
+ .end_timestamp_ns = end_timestamp_ns,
+ .is_gvl_waiting_state = is_gvl_waiting_state,
+ }
  );
  }

@@ -865,9 +1023,9 @@ static struct per_thread_context *get_context_for(VALUE thread, struct thread_co
  // to either run Ruby code during sampling (not great), or otherwise use some of the VM private APIs to detect this.
  //
  static bool is_logging_gem_monkey_patch(VALUE invoke_file_location) {
- int logging_gem_path_len = strlen(LOGGING_GEM_PATH);
+ unsigned long logging_gem_path_len = strlen(LOGGING_GEM_PATH);
  char *invoke_file = StringValueCStr(invoke_file_location);
- int invoke_file_len = strlen(invoke_file);
+ unsigned long invoke_file_len = strlen(invoke_file);

  if (invoke_file_len < logging_gem_path_len) return false;

@@ -875,6 +1033,8 @@
  }

  static void initialize_context(VALUE thread, struct per_thread_context *thread_context, struct thread_context_collector_state *state) {
+ thread_context->sampling_buffer = sampling_buffer_new(state->max_frames, state->locations);
+
  snprintf(thread_context->thread_id, THREAD_ID_LIMIT_CHARS, "%"PRIu64" (%lu)", native_thread_id_for(thread), (unsigned long) thread_id_for(thread));
  thread_context->thread_id_char_slice = (ddog_CharSlice) {.ptr = thread_context->thread_id, .len = strlen(thread_context->thread_id)};

@@ -913,6 +1073,25 @@
  // These will only be used during a GC operation
  thread_context->gc_tracking.cpu_time_at_start_ns = INVALID_TIME;
  thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME;
+
+ #ifndef NO_GVL_INSTRUMENTATION
+ // We use this special location to store data that can be accessed without any
+ // kind of synchronization (e.g. by threads without the GVL).
+ //
+ // We set this marker here for two purposes:
+ // * To make sure there's no stale data from a previous execution of the profiler.
+ // * To mark threads that are actually being profiled
+ //
+ // (Setting this is potentially a race, but what we want is to avoid _stale_ data, so
+ // if this gets set concurrently with context initialization, then such a value will belong
+ // to the current profiler instance, so that's OK)
+ gvl_profiling_state_thread_object_set(thread, GVL_WAITING_ENABLED_EMPTY);
+ #endif
+ }
+
+ static void free_context(struct per_thread_context* thread_context) {
+ sampling_buffer_free(thread_context->sampling_buffer);
+ ruby_xfree(thread_context);
  }

  static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) {
@@ -922,6 +1101,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc
  VALUE result = rb_str_new2(" (native state)");

  // Update this when modifying state struct
+ rb_str_concat(result, rb_sprintf(" max_frames=%d", state->max_frames));
  rb_str_concat(result, rb_sprintf(" hash_map_per_thread_context=%"PRIsVALUE, per_thread_context_st_table_as_ruby_hash(state)));
  rb_str_concat(result, rb_sprintf(" recorder_instance=%"PRIsVALUE, state->recorder_instance));
  VALUE tracer_context_key = state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY ? Qnil : ID2SYM(state->tracer_context_key);
@@ -930,6 +1110,7 @@
  rb_str_concat(result, rb_sprintf(" stats=%"PRIsVALUE, stats_as_ruby_hash(state)));
  rb_str_concat(result, rb_sprintf(" endpoint_collection_enabled=%"PRIsVALUE, state->endpoint_collection_enabled ? Qtrue : Qfalse));
  rb_str_concat(result, rb_sprintf(" timeline_enabled=%"PRIsVALUE, state->timeline_enabled ? Qtrue : Qfalse));
+ rb_str_concat(result, rb_sprintf(" otel_context_enabled=%d", state->otel_context_enabled));
  rb_str_concat(result, rb_sprintf(" allocation_type_enabled=%"PRIsVALUE, state->allocation_type_enabled ? Qtrue : Qfalse));
  rb_str_concat(result, rb_sprintf(
  " time_converter_state={.system_epoch_ns_reference=%ld, .delta_to_epoch_ns=%ld}",
@@ -939,6 +1120,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc
  rb_str_concat(result, rb_sprintf(" main_thread=%"PRIsVALUE, state->main_thread));
  rb_str_concat(result, rb_sprintf(" gc_tracking=%"PRIsVALUE, gc_tracking_as_ruby_hash(state)));
  rb_str_concat(result, rb_sprintf(" otel_current_span_key=%"PRIsVALUE, state->otel_current_span_key));
+ rb_str_concat(result, rb_sprintf(" global_waiting_for_gvl_threshold_ns=%u", global_waiting_for_gvl_threshold_ns));

  return result;
  }
@@ -966,6 +1148,10 @@ static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value

  ID2SYM(rb_intern("gc_tracking.cpu_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.cpu_time_at_start_ns),
  ID2SYM(rb_intern("gc_tracking.wall_time_at_start_ns")), /* => */ LONG2NUM(thread_context->gc_tracking.wall_time_at_start_ns),
+
+ #ifndef NO_GVL_INSTRUMENTATION
+ ID2SYM(rb_intern("gvl_waiting_at")), /* => */ LONG2NUM(gvl_profiling_state_thread_object_get(thread)),
+ #endif
  };
  for (long unsigned int i = 0; i < VALUE_COUNT(arguments); i += 2) rb_hash_aset(context_as_hash, arguments[i], arguments[i+1]);

@@ -1006,7 +1192,7 @@ static int remove_if_dead_thread(st_data_t key_thread, st_data_t value_context,

  if (is_thread_alive(thread)) return ST_CONTINUE;

- ruby_xfree(thread_context);
+ free_context(thread_context);
  return ST_DELETE;
  }

@@ -1116,6 +1302,7 @@ static VALUE _native_gc_tracking(DDTRACE_UNUSED VALUE _self, VALUE collector_ins

  // Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
  static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result) {
+ if (state->otel_context_enabled == otel_context_enabled_only) return;
  if (state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY) return;

  VALUE current_context = rb_thread_local_aref(thread, state->tracer_context_key);
@@ -1170,7 +1357,7 @@ static bool should_collect_resource(VALUE root_span) {
  if (root_span_type == Qnil) return false;
  ENFORCE_TYPE(root_span_type, T_STRING);

- int root_span_type_length = RSTRING_LEN(root_span_type);
+ long root_span_type_length = RSTRING_LEN(root_span_type);
  const char *root_span_type_value = StringValuePtr(root_span_type);

  bool is_web_request =
@@ -1193,6 +1380,9 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector
  struct thread_context_collector_state *state;
  TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

+ // Release all context memory before clearing the existing context
+ st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */);
+
  st_clear(state->hash_map_per_thread_context);

  state->stats = (struct stats) {}; // Resets all stats back to zero
@@ -1257,7 +1447,7 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in
  // Thus, we need to make sure there's actually a class before getting its name.

  if (klass != 0) {
- const char *name = rb_obj_classname(new_object);
+ const char *name = rb_class2name(klass);
  size_t name_length = name != NULL ? strlen(name) : 0;

  if (name_length > 0) {
@@ -1285,15 +1475,19 @@

  track_object(state->recorder_instance, new_object, sample_weight, optional_class_name);

+ struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
+
  trigger_sample_for_thread(
  state,
  /* thread: */ current_thread,
  /* stack_from_thread: */ current_thread,
- get_or_create_context_for(current_thread, state),
- (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1},
+ thread_context,
+ thread_context->sampling_buffer,
+ (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true},
  INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions
  &ruby_vm_type,
- optional_class_name
+ optional_class_name,
+ false
  );
  }

@@ -1339,25 +1533,29 @@ static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) {
  }
  }

+ // Used to access OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY). Will raise exceptions if it fails.
+ static VALUE read_otel_current_span_key_const(DDTRACE_UNUSED VALUE _unused) {
+ VALUE opentelemetry_module = rb_const_get(rb_cObject, rb_intern("OpenTelemetry"));
+ ENFORCE_TYPE(opentelemetry_module, T_MODULE);
+ VALUE trace_module = rb_const_get(opentelemetry_module, rb_intern("Trace"));
+ ENFORCE_TYPE(trace_module, T_MODULE);
+ return rb_const_get(trace_module, rb_intern("CURRENT_SPAN_KEY"));
+ }
+
  static VALUE get_otel_current_span_key(struct thread_context_collector_state *state) {
- if (state->otel_current_span_key == Qnil) {
- VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog"));
- VALUE opentelemetry_module = rb_const_get(datadog_module, rb_intern("OpenTelemetry"));
- VALUE api_module = rb_const_get(opentelemetry_module, rb_intern("API"));
- VALUE context_module = rb_const_get(api_module, rb_intern_const("Context"));
- VALUE current_span_key = rb_const_get(context_module, rb_intern_const("CURRENT_SPAN_KEY"));
-
- if (current_span_key == Qnil) {
- rb_raise(rb_eRuntimeError, "Unexpected: Missing Datadog::OpenTelemetry::API::Context::CURRENT_SPAN_KEY");
- }
+ if (state->otel_current_span_key == Qtrue) { // Qtrue means we haven't tried to extract it yet
+ // If this fails, we want to fail gracefully, rather than raise an exception (e.g. if the opentelemetry gem
+ // gets refactored, we should not fall on our face)
+ VALUE span_key = rb_protect(read_otel_current_span_key_const, Qnil, NULL);

- state->otel_current_span_key = current_span_key;
+ // Note that this gets set to Qnil if we failed to extract the correct value, and thus we won't try to extract it again
+ state->otel_current_span_key = span_key;
  }

  return state->otel_current_span_key;
  }

- // This method gets used when ddtrace is being used indirectly via the otel APIs. Information gets stored slightly
+ // This method gets used when ddtrace is being used indirectly via the opentelemetry APIs. Information gets stored slightly
  // differently, and this codepath handles it.
  static void ddtrace_otel_trace_identifiers_for(
  struct thread_context_collector_state *state,
@@ -1377,6 +1575,7 @@ static void ddtrace_otel_trace_identifiers_for(
  if (resolved_numeric_span_id == Qnil) return;

  VALUE otel_current_span_key = get_otel_current_span_key(state);
+ if (otel_current_span_key == Qnil) return;
  VALUE current_trace = *active_trace;

  // ddtrace uses a different structure when spans are created from otel, where each otel span will have a unique ddtrace
@@ -1400,3 +1599,417 @@ static void ddtrace_otel_trace_identifiers_for(
1400
1599
  *active_trace = current_trace;
1401
1600
  *numeric_span_id = resolved_numeric_span_id;
1402
1601
  }
1602
+
1603
+ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int skipped_samples) {
1604
+ struct thread_context_collector_state *state;
1605
+ TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
1606
+
1607
+ ddog_prof_Label labels[] = {
1608
+ // Providing .num = 0 should not be needed but the tracer-2.7 docker image ships a buggy gcc that complains about this
1609
+ {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("SS"), .num = 0},
1610
+ {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Skipped Samples"), .num = 0},
1611
+       {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Skipped Samples)"), .num = 0},
+     };
+     ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = sizeof(labels) / sizeof(labels[0])};
+
+     record_placeholder_stack(
+       state->recorder_instance,
+       (sample_values) {.alloc_samples = skipped_samples},
+       (sample_labels) {
+         .labels = slice_labels,
+         .state_label = NULL,
+         .end_timestamp_ns = 0, // For now we're not collecting timestamps for allocation events
+       },
+       DDOG_CHARSLICE_C("Skipped Samples")
+     );
+   }
+
+   static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples) {
+     thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples));
+     return Qtrue;
+   }
+
+   // This method differs from trace_identifiers_for/ddtrace_otel_trace_identifiers_for to support the situation where
+   // the opentelemetry ruby library is being used for tracing AND the ddtrace tracing bits are not involved at all.
+   //
+   // Thus, in this case, we're reading directly from the opentelemetry internals, which is different from how ddtrace
+   // tracing does it.
+   //
+   // This is somewhat brittle: we're coupling to internal details of the opentelemetry gem to get what we need. In the
+   // future maybe the otel ruby folks would be open to having a nice public way of getting this data that suits the
+   // use case of profilers.
+   // Until then, the strategy below is to be extremely defensive, and if anything is out of place, we immediately return
+   // and give up on getting trace data from opentelemetry. (Thus, worst case would be -- you upgrade opentelemetry and
+   // profiling features relying on reading this data stop working, but you'll still get profiles and the app will be
+   // otherwise undisturbed).
+   //
+   // Specifically, the way this works is:
+   // 1. The latest entry in the opentelemetry context storage represents the current span (if any). We take the span id
+   //    and trace id from this span.
+   // 2. To find the local root span id, we walk the context storage backwards from the current span, and find the
+   //    earliest entry in the context storage that has the same trace id as the current span; we use that span's id as
+   //    the local root span id.
+   // This matches the semantics of how ddtrace tracing creates a TraceOperation and assigns a local root span to it.
+   static void otel_without_ddtrace_trace_identifiers_for(
+     struct thread_context_collector_state *state,
+     VALUE thread,
+     struct trace_identifiers *trace_identifiers_result
+   ) {
+     VALUE context_storage = rb_thread_local_aref(thread, otel_context_storage_id /* __opentelemetry_context_storage__ */);
+
+     // If it exists, context_storage is expected to be an Array[OpenTelemetry::Context]
+     if (context_storage == Qnil || !RB_TYPE_P(context_storage, T_ARRAY)) return;
+
+     VALUE otel_current_span_key = get_otel_current_span_key(state);
+     if (otel_current_span_key == Qnil) return;
+
+     int active_context_index = RARRAY_LEN(context_storage) - 1;
+     if (active_context_index < 0) return;
+
+     struct otel_span active_span = otel_span_from(rb_ary_entry(context_storage, active_context_index), otel_current_span_key);
+     if (active_span.span == Qnil) return;
+
+     struct otel_span local_root_span = active_span;
+
+     // Now find the oldest span, starting from the active span, that still has the same trace id as the active span
+     for (int i = active_context_index - 1; i >= 0; i--) {
+       struct otel_span checking_span = otel_span_from(rb_ary_entry(context_storage, i), otel_current_span_key);
+       if (checking_span.span == Qnil) return;
+
+       if (rb_str_equal(active_span.trace_id, checking_span.trace_id) == Qfalse) break;
+
+       local_root_span = checking_span;
+     }
+
+     // Convert the span ids into uint64_t to match what the Datadog tracer does
+     trace_identifiers_result->span_id = otel_span_id_to_uint(active_span.span_id);
+     trace_identifiers_result->local_root_span_id = otel_span_id_to_uint(local_root_span.span_id);
+
+     if (trace_identifiers_result->span_id == 0 || trace_identifiers_result->local_root_span_id == 0) return;
+
+     trace_identifiers_result->valid = true;
+
+     if (!state->endpoint_collection_enabled) return;
+
+     VALUE root_span_type = rb_ivar_get(local_root_span.span, at_kind_id /* @kind */);
+     // We filter out spans that don't have `kind: :server`
+     if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return;
+
+     VALUE trace_resource = rb_ivar_get(local_root_span.span, at_name_id /* @name */);
+     if (!RB_TYPE_P(trace_resource, T_STRING)) return;
+
+     trace_identifiers_result->trace_endpoint = trace_resource;
+   }
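
Editor's note: the backwards walk above is easier to follow without the Ruby C API in the way. Below is a minimal standalone sketch (not part of this diff) that models context storage entries as plain C structs; the struct, field names, and data are illustrative only.

```c
#include <stdio.h>
#include <string.h>

// Illustrative stand-in for an OpenTelemetry context entry (hypothetical names).
typedef struct { const char *trace_id; const char *span_id; } example_span;

int main(void) {
  // Context storage, oldest first; the last entry holds the active span.
  example_span storage[] = {
    {.trace_id = "trace-a", .span_id = "a-root"},
    {.trace_id = "trace-b", .span_id = "b-root"},       // earliest entry for trace-b
    {.trace_id = "trace-b", .span_id = "b-child"},
    {.trace_id = "trace-b", .span_id = "b-grandchild"}, // active span
  };
  int active = (int) (sizeof(storage) / sizeof(storage[0])) - 1;

  // Walk backwards while the trace id still matches the active span's trace id;
  // the earliest matching entry becomes the local root span.
  example_span local_root = storage[active];
  for (int i = active - 1; i >= 0; i--) {
    if (strcmp(storage[i].trace_id, storage[active].trace_id) != 0) break;
    local_root = storage[i];
  }

  printf("span_id=%s local_root_span_id=%s\n", storage[active].span_id, local_root.span_id);
  // => span_id=b-grandchild local_root_span_id=b-root
  return 0;
}
```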
+
+   static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key) {
+     struct otel_span failed = {.span = Qnil, .span_id = Qnil, .trace_id = Qnil};
+
+     if (otel_context == Qnil) return failed;
+
+     VALUE context_entries = rb_ivar_get(otel_context, at_entries_id /* @entries */);
+     if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;
+
+     // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
+     VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
+     if (span == Qnil) return failed;
+
+     // If it exists, span_context is expected to be an OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
+     VALUE span_context = rb_ivar_get(span, at_context_id /* @context */);
+     if (span_context == Qnil) return failed;
+
+     VALUE span_id = rb_ivar_get(span_context, at_span_id_id /* @span_id */);
+     VALUE trace_id = rb_ivar_get(span_context, at_trace_id_id /* @trace_id */);
+     if (span_id == Qnil || trace_id == Qnil || !RB_TYPE_P(span_id, T_STRING) || !RB_TYPE_P(trace_id, T_STRING)) return failed;
+
+     return (struct otel_span) {.span = span, .span_id = span_id, .trace_id = trace_id};
+   }
+
+   // Otel span ids are represented as a big-endian 8-byte string
+   static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
+     if (!RB_TYPE_P(otel_span_id, T_STRING) || RSTRING_LEN(otel_span_id) != 8) { return 0; }
+
+     unsigned char *span_bytes = (unsigned char*) StringValuePtr(otel_span_id);
+
+     return \
+       ((uint64_t)span_bytes[0] << 56) |
+       ((uint64_t)span_bytes[1] << 48) |
+       ((uint64_t)span_bytes[2] << 40) |
+       ((uint64_t)span_bytes[3] << 32) |
+       ((uint64_t)span_bytes[4] << 24) |
+       ((uint64_t)span_bytes[5] << 16) |
+       ((uint64_t)span_bytes[6] << 8) |
+       ((uint64_t)span_bytes[7]);
+   }
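
Editor's note: the shift-and-or chain above is a plain big-endian decode of 8 bytes into a uint64_t. A standalone check (not part of this diff; the helper name and sample bytes are made up) of what it computes:

```c
#include <stdint.h>
#include <stdio.h>

// Same big-endian 8-byte decode as otel_span_id_to_uint, minus the Ruby type checks.
static uint64_t be_bytes_to_uint64(const unsigned char bytes[8]) {
  uint64_t result = 0;
  for (int i = 0; i < 8; i++) result = (result << 8) | bytes[i];
  return result;
}

int main(void) {
  // A span id of "\x00\x00\x00\x00\x00\x00\x04\xD2" should decode to 1234 (0x04D2).
  const unsigned char example_span_id[8] = {0, 0, 0, 0, 0, 0, 0x04, 0xD2};
  printf("%llu\n", (unsigned long long) be_bytes_to_uint64(example_span_id)); // => 1234
  return 0;
}
```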
+
+   #ifndef NO_GVL_INSTRUMENTATION
+   // This function can get called from outside the GVL and even on non-main Ractors
+   void thread_context_collector_on_gvl_waiting(gvl_profiling_thread thread) {
+     // Because this function gets called from a thread that is NOT holding the GVL, we avoid touching the
+     // per-thread context directly.
+     //
+     // Instead, we ask Ruby to hold the data we need in Ruby's own special per-thread context area
+     // that's thread-safe and built for this kind of use.
+     //
+     // Also, this function can get called on the non-main Ractor. We deal with this by checking if the value in the context
+     // is non-zero, since only `initialize_context` ever sets the value from 0 to non-zero for threads it sees.
+     intptr_t thread_being_profiled = gvl_profiling_state_get(thread);
+     if (!thread_being_profiled) return;
+
+     long current_monotonic_wall_time_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
+     if (current_monotonic_wall_time_ns <= 0 || current_monotonic_wall_time_ns > GVL_WAITING_ENABLED_EMPTY) return;
+
+     gvl_profiling_state_set(thread, current_monotonic_wall_time_ns);
+   }
+
+   // This function can get called from outside the GVL and even on non-main Ractors
+   __attribute__((warn_unused_result))
+   bool thread_context_collector_on_gvl_running_with_threshold(gvl_profiling_thread thread, uint32_t waiting_for_gvl_threshold_ns) {
+     intptr_t gvl_waiting_at = gvl_profiling_state_get(thread);
+
+     // Thread was not being profiled / not waiting on gvl
+     if (gvl_waiting_at == 0 || gvl_waiting_at == GVL_WAITING_ENABLED_EMPTY) return false;
+
+     // @ivoanjo: I'm not sure if this can happen -- it means we should've sampled already but haven't gotten the chance yet?
+     if (gvl_waiting_at < 0) return true;
+
+     long waiting_for_gvl_duration_ns = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE) - gvl_waiting_at;
+
+     bool should_sample = waiting_for_gvl_duration_ns >= waiting_for_gvl_threshold_ns;
+
+     if (should_sample) {
+       // We flip gvl_waiting_at to negative to mark that the thread is now running and no longer waiting
+       intptr_t gvl_waiting_at_is_now_running = -gvl_waiting_at;
+
+       gvl_profiling_state_set(thread, gvl_waiting_at_is_now_running);
+     } else {
+       // We decided not to sample. Let's mark the thread back to the initial "enabled but empty" state
+       gvl_profiling_state_set(thread, GVL_WAITING_ENABLED_EMPTY);
+     }
+
+     return should_sample;
+   }
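
Editor's note: the per-thread gvl profiling state used above is a small state machine packed into a single intptr_t: 0 means "not profiled", the GVL_WAITING_ENABLED_EMPTY sentinel means "profiled but not waiting", a positive value is the timestamp the wait started, and a negative value marks "wait ended, sample pending". The standalone sketch below (not part of this diff; the sentinel value and names are illustrative) mirrors the transition the function above performs on a plain intptr_t:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Illustrative stand-in for the real sentinel (the actual value lives in
// gvl_profiling_helper.h, which is not shown in this part of the diff).
#define EXAMPLE_WAITING_ENABLED_EMPTY INTPTR_MAX

// Mirrors thread_context_collector_on_gvl_running_with_threshold, operating on a
// plain intptr_t instead of the per-thread Ruby storage.
static bool on_gvl_running(intptr_t *state, long now_ns, uint32_t threshold_ns) {
  intptr_t gvl_waiting_at = *state;
  if (gvl_waiting_at == 0 || gvl_waiting_at == EXAMPLE_WAITING_ENABLED_EMPTY) return false;
  if (gvl_waiting_at < 0) return true; // Wait already ended; sample still pending

  bool should_sample = (now_ns - gvl_waiting_at) >= threshold_ns;
  // Negative timestamp == "wait ended, sample me"; sentinel == "profiled, not waiting"
  *state = should_sample ? -gvl_waiting_at : EXAMPLE_WAITING_ENABLED_EMPTY;
  return should_sample;
}

int main(void) {
  intptr_t state = 1000; // waiting since t=1000ns
  printf("%d\n", on_gvl_running(&state, 1000 + 20000, 10000)); // 1: waited 20us >= 10us
  printf("%ld\n", (long) state);                               // -1000: wait just ended
  return 0;
}
```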
+
+   __attribute__((warn_unused_result))
+   bool thread_context_collector_on_gvl_running(gvl_profiling_thread thread) {
+     return thread_context_collector_on_gvl_running_with_threshold(thread, global_waiting_for_gvl_threshold_ns);
+   }
+
+   // Why does this method need to exist?
+   //
+   // You may be surprised to see that if we never call this function (from cpu_and_wall_time_worker), Waiting for GVL
+   // samples will still show up.
+   // This is because regular cpu/wall-time samples also use `update_metrics_and_sample` which will do the right thing
+   // and push "Waiting for GVL" samples as needed.
+   //
+   // The reason this method needs to exist and be called very shortly after thread_context_collector_on_gvl_running
+   // returns true is to ensure accuracy of both the timing and stack for the Waiting for GVL sample.
+   //
+   // Timing:
+   // Because we currently only record the timestamp when the Waiting for GVL started and not when the Waiting for GVL ended,
+   // we rely on pushing a sample as soon as possible when the Waiting for GVL ends so that the timestamp of the sample
+   // actually matches when we stopped waiting.
+   //
+   // Stack:
+   // If the thread starts working without the end of the Waiting for GVL sample, then by the time the thread is sampled
+   // via the regular cpu/wall-time samples mechanism, the stack can be inaccurate (e.g. it does not correctly pinpoint
+   // where the waiting happened).
+   //
+   // Arguably, the last sample after Waiting for GVL ended (when gvl_waiting_at < 0) should always come from this method
+   // and not a regular cpu/wall-time sample, BUT since all of these things are happening in parallel/concurrently I suspect
+   // it's possible for a regular sample to kick in just before this one.
+   //
+   // ---
+   //
+   // NOTE: In normal use, current_thread is expected to be == rb_thread_current(); the `current_thread` parameter only
+   // exists to enable testing.
+   VALUE thread_context_collector_sample_after_gvl_running_with_thread(VALUE self_instance, VALUE current_thread) {
+     struct thread_context_collector_state *state;
+     TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+     if (!state->timeline_enabled) rb_raise(rb_eRuntimeError, "GVL profiling requires timeline to be enabled");
+
+     intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(current_thread);
+
+     if (gvl_waiting_at >= 0) {
+       // @ivoanjo: I'm not sure if this can ever happen. This means that we're not on the same thread
+       // that ran `thread_context_collector_on_gvl_running` and made the decision to sample, OR a regular sample was
+       // triggered ahead of us.
+       // We do nothing in this case.
+       return Qfalse;
+     }
+
+     struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);
+
+     // We don't actually account for cpu-time during Waiting for GVL. BUT, we may choose to push an
+     // extra sample to represent the period prior to Waiting for GVL. To support that, we retrieve the current
+     // cpu-time of the thread and let `update_metrics_and_sample` decide what to do with it.
+     long cpu_time_for_thread = cpu_time_now_ns(thread_context);
+
+     // TODO: Should we update the dynamic sampling rate overhead tracking with this sample as well?
+
+     update_metrics_and_sample(
+       state,
+       /* thread_being_sampled: */ current_thread,
+       /* stack_from_thread: */ current_thread,
+       thread_context,
+       thread_context->sampling_buffer,
+       cpu_time_for_thread,
+       monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
+     );
+
+     return Qtrue; // To allow this to be called from rb_rescue2
+   }
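
Editor's note: the Qtrue/Qfalse returns make this function usable as the protected body of rb_rescue2 (a real Ruby C API call; the function pointer types below match Ruby 2.7+). The wrapper and handler below are a hypothetical sketch of what a caller could look like, not code from this diff:

```c
#include <ruby.h>

// Declared in collectors_thread_context.h (assumed visible to the caller).
VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance);

// Hypothetical rescue handler: swallow any exception raised while sampling so it
// doesn't propagate into the calling thread.
static VALUE swallow_exception(VALUE data, VALUE exception) {
  (void) data; (void) exception;
  return Qfalse;
}

// Hypothetical wrapper: sample, but never let a Ruby exception escape.
static VALUE sample_after_gvl_running_safely(VALUE collector_instance) {
  return rb_rescue2(
    thread_context_collector_sample_after_gvl_running, collector_instance,
    swallow_exception, Qnil,
    rb_eException, (VALUE) 0 // Vararg list of exception classes, terminated by 0
  );
}
```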
+
+   VALUE thread_context_collector_sample_after_gvl_running(VALUE self_instance) {
+     return thread_context_collector_sample_after_gvl_running_with_thread(self_instance, rb_thread_current());
+   }
+
+   // This method is intended to be called from update_metrics_and_sample. It exists to handle extra sampling steps we
+   // need to take when sampling cpu/wall-time for a thread that's in the "Waiting for GVL" state.
+   __attribute__((warn_unused_result))
+   static bool handle_gvl_waiting(
+     struct thread_context_collector_state *state,
+     VALUE thread_being_sampled,
+     VALUE stack_from_thread,
+     struct per_thread_context *thread_context,
+     sampling_buffer* sampling_buffer,
+     long current_cpu_time_ns
+   ) {
+     intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread_being_sampled);
+
+     bool is_gvl_waiting_state = gvl_waiting_at != 0 && gvl_waiting_at != GVL_WAITING_ENABLED_EMPTY;
+
+     if (!is_gvl_waiting_state) return false;
+
+     // We can be in one of 2 situations here:
+     //
+     // 1. The current sample is the first one after we entered the "Waiting for GVL" state
+     //    (wall_time_at_previous_sample_ns < abs(gvl_waiting_at))
+     //
+     //                            time ─────►
+     //   ...──────────────┬───────────────────...
+     //     Other state    │  Waiting for GVL
+     //   ...──────────────┴───────────────────...
+     //            ▲                           ▲
+     //            └─ Previous sample          └─ Regular sample (caller)
+     //
+     // In this case, we'll want to push two samples: a) one for the current time (handled by the caller), b) an extra
+     // sample to represent the remaining cpu/wall time before the "Waiting for GVL" started:
+     //
+     //                            time ─────►
+     //   ...──────────────┬───────────────────...
+     //     Other state    │  Waiting for GVL
+     //   ...──────────────┴───────────────────...
+     //         ▲          ▲                   ▲
+     //         └─ Prev... └─ Extra sample     └─ Regular sample (caller)
+     //
+     // 2. The current sample is the n-th one after we entered the "Waiting for GVL" state
+     //    (wall_time_at_previous_sample_ns > abs(gvl_waiting_at))
+     //
+     //                            time ─────►
+     //   ...──────────────┬───────────────────────────────────────────────...
+     //     Other state    │                Waiting for GVL
+     //   ...──────────────┴───────────────────────────────────────────────...
+     //            ▲                       ▲                               ▲
+     //            └─ Previous sample      └─ Previous sample              └─ Regular sample (caller)
+     //
+     // In this case, we just report back to the caller that the thread is in the "Waiting for GVL" state.
+     //
+     // ---
+     //
+     // Overall, gvl_waiting_at will be > 0 if still in the "Waiting for GVL" state and < 0 if we actually reached the
+     // end of the wait.
+     //
+     // It doesn't really matter if the thread is still waiting or just reached the end of the wait: each sample represents
+     // a snapshot at time ending now, so if the state finished, it just means the next sample will be a regular one.
+
+     if (gvl_waiting_at < 0) {
+       // Negative means the waiting for GVL just ended, so we clear the state so that subsequent samples no longer
+       // represent waiting
+       gvl_profiling_state_thread_object_set(thread_being_sampled, GVL_WAITING_ENABLED_EMPTY);
+     }
+
+     long gvl_waiting_started_wall_time_ns = labs(gvl_waiting_at);
+
+     if (thread_context->wall_time_at_previous_sample_ns < gvl_waiting_started_wall_time_ns) { // situation 1 above
+       long cpu_time_elapsed_ns = update_time_since_previous_sample(
+         &thread_context->cpu_time_at_previous_sample_ns,
+         current_cpu_time_ns,
+         thread_context->gc_tracking.cpu_time_at_start_ns,
+         IS_NOT_WALL_TIME
+       );
+
+       long duration_until_start_of_gvl_waiting_ns = update_time_since_previous_sample(
+         &thread_context->wall_time_at_previous_sample_ns,
+         gvl_waiting_started_wall_time_ns,
+         INVALID_TIME,
+         IS_WALL_TIME
+       );
+
+       // Push extra sample
+       trigger_sample_for_thread(
+         state,
+         thread_being_sampled,
+         stack_from_thread,
+         thread_context,
+         sampling_buffer,
+         (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = duration_until_start_of_gvl_waiting_ns},
+         gvl_waiting_started_wall_time_ns,
+         NULL,
+         NULL,
+         false // This is the extra sample before the wait began; only the next sample will be in the gvl waiting state
+       );
+     }
+
+     return true;
+   }
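
Editor's note: a worked numeric instance of situation 1 above (not part of this diff; all timestamps are made up). The arithmetic mirrors how the wall time gets split between the extra sample and the caller's regular sample:

```c
#include <stdio.h>

int main(void) {
  // Hypothetical timeline, in ns: previous sample at t=100, the thread started
  // waiting for the GVL at t=150, and the regular sample fires at t=200.
  long wall_time_at_previous_sample_ns = 100;
  long gvl_waiting_started_wall_time_ns = 150;
  long now_ns = 200;

  // Situation 1 applies: the previous sample predates the start of the wait.
  if (wall_time_at_previous_sample_ns < gvl_waiting_started_wall_time_ns) {
    // Extra sample covers [previous sample, start of wait) => 50ns of "Other state"
    long extra_sample_wall_time_ns = gvl_waiting_started_wall_time_ns - wall_time_at_previous_sample_ns;
    // The caller's regular sample covers [start of wait, now) => 50ns of Waiting for GVL
    long regular_sample_wall_time_ns = now_ns - gvl_waiting_started_wall_time_ns;
    printf("extra=%ldns regular=%ldns\n", extra_sample_wall_time_ns, regular_sample_wall_time_ns);
  }
  return 0;
}
```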
+
+   static VALUE _native_on_gvl_waiting(DDTRACE_UNUSED VALUE self, VALUE thread) {
+     ENFORCE_THREAD(thread);
+
+     thread_context_collector_on_gvl_waiting(thread_from_thread_object(thread));
+     return Qnil;
+   }
+
+   static VALUE _native_gvl_waiting_at_for(DDTRACE_UNUSED VALUE self, VALUE thread) {
+     ENFORCE_THREAD(thread);
+
+     intptr_t gvl_waiting_at = gvl_profiling_state_thread_object_get(thread);
+     return LONG2NUM(gvl_waiting_at);
+   }
+
+   static VALUE _native_on_gvl_running(DDTRACE_UNUSED VALUE self, VALUE thread) {
+     ENFORCE_THREAD(thread);
+
+     return thread_context_collector_on_gvl_running(thread_from_thread_object(thread)) ? Qtrue : Qfalse;
+   }
+
+   static VALUE _native_sample_after_gvl_running(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread) {
+     ENFORCE_THREAD(thread);
+
+     return thread_context_collector_sample_after_gvl_running_with_thread(collector_instance, thread);
+   }
+
+   static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE thread, VALUE delta_ns) {
+     ENFORCE_THREAD(thread);
+
+     struct thread_context_collector_state *state;
+     TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);
+
+     struct per_thread_context *thread_context = get_context_for(thread, state);
+     if (thread_context == NULL) rb_raise(rb_eArgError, "Unexpected: This method cannot be used unless the per-thread context for the thread already exists");
+
+     thread_context->cpu_time_at_previous_sample_ns += NUM2LONG(delta_ns);
+
+     return Qtrue;
+   }
+
+   #else
+   static bool handle_gvl_waiting(
+     DDTRACE_UNUSED struct thread_context_collector_state *state,
+     DDTRACE_UNUSED VALUE thread_being_sampled,
+     DDTRACE_UNUSED VALUE stack_from_thread,
+     DDTRACE_UNUSED struct per_thread_context *thread_context,
+     DDTRACE_UNUSED sampling_buffer* sampling_buffer,
+     DDTRACE_UNUSED long current_cpu_time_ns
+   ) { return false; }
+   #endif // NO_GVL_INSTRUMENTATION