rperf 0.7.0 → 0.9.0

data/ext/rperf/rperf.c CHANGED
@@ -8,6 +8,7 @@
  #include <unistd.h>
  #include <signal.h>
  #include <stdatomic.h>
+ #include <sched.h>
  #ifdef __linux__
  #include <sys/syscall.h>
  #endif
@@ -24,8 +25,10 @@
  #ifdef __linux__
  #define RPERF_USE_TIMER_SIGNAL 1
  #define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
+ #define RPERF_COND_CLOCK CLOCK_MONOTONIC
  #else
  #define RPERF_USE_TIMER_SIGNAL 0
+ #define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
  #endif

  #define RPERF_MAX_STACK_DEPTH 512
@@ -38,21 +41,21 @@
  #define RPERF_STACK_POOL_INITIAL 4096
  #define RPERF_PAUSED(prof) ((prof)->profile_refcount == 0)

- /* Synthetic frame IDs (reserved in frame_table, 0-based) */
- #define RPERF_SYNTHETIC_GVL_BLOCKED 0
- #define RPERF_SYNTHETIC_GVL_WAIT 1
- #define RPERF_SYNTHETIC_GC_MARKING 2
- #define RPERF_SYNTHETIC_GC_SWEEPING 3
- #define RPERF_SYNTHETIC_COUNT 4
+ /* VM state values (stored in samples, not as stack frames) */
+ enum rperf_vm_state {
+ RPERF_VM_STATE_NORMAL = 0,
+ RPERF_VM_STATE_GVL_BLOCKED = 1,
+ RPERF_VM_STATE_GVL_WAIT = 2,
+ RPERF_VM_STATE_GC_MARKING = 3,
+ RPERF_VM_STATE_GC_SWEEPING = 4,
+ };

  /* ---- Data structures ---- */

- enum rperf_sample_type {
- RPERF_SAMPLE_NORMAL = 0,
- RPERF_SAMPLE_GVL_BLOCKED = 1, /* off-GVL: SUSPENDED → READY */
- RPERF_SAMPLE_GVL_WAIT = 2, /* GVL wait: READY → RESUMED */
- RPERF_SAMPLE_GC_MARKING = 3, /* GC marking phase */
- RPERF_SAMPLE_GC_SWEEPING = 4, /* GC sweeping phase */
+
+ enum rperf_mode {
+ RPERF_MODE_CPU = 0,
+ RPERF_MODE_WALL = 1,
  };

  enum rperf_gc_phase {
@@ -65,7 +68,7 @@ typedef struct rperf_sample {
  int depth;
  size_t frame_start; /* index into frame_pool */
  int64_t weight;
- int type; /* rperf_sample_type */
+ enum rperf_vm_state vm_state;
  int thread_seq; /* thread sequence number (1-based) */
  int label_set_id; /* label set ID (0 = no labels) */
  } rperf_sample_t;
@@ -87,7 +90,7 @@ typedef struct rperf_sample_buffer {

  typedef struct rperf_frame_table {
  _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
- size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
+ _Atomic(size_t) count; /* = next frame_id */
  size_t capacity;
  uint32_t *buckets; /* open addressing: stores index into keys[] */
  size_t bucket_capacity;
@@ -103,9 +106,10 @@ typedef struct rperf_frame_table {

  typedef struct rperf_agg_entry {
  uint32_t frame_start; /* offset into stack_pool */
- int depth; /* includes synthetic frame */
+ int depth;
  int thread_seq;
  int label_set_id; /* label set ID (0 = no labels) */
+ enum rperf_vm_state vm_state;
  int64_t weight; /* accumulated */
  uint32_t hash; /* cached hash value */
  int used; /* 0 = empty, 1 = used */
@@ -122,7 +126,6 @@ typedef struct rperf_agg_table {

  typedef struct rperf_thread_data {
  int64_t prev_time_ns;
- int64_t prev_wall_ns;
  /* GVL event tracking */
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
  int64_t ready_at_ns; /* wall time at READY */
@@ -145,11 +148,13 @@ typedef struct rperf_stats {
  size_t trigger_count;
  size_t sampling_count;
  int64_t sampling_total_ns;
+ size_t dropped_samples; /* samples lost due to allocation failure */
+ size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
  } rperf_stats_t;

  typedef struct rperf_profiler {
  int frequency;
- int mode; /* 0 = cpu, 1 = wall */
+ enum rperf_mode mode;
  _Atomic int running;
  pthread_t worker_thread; /* combined timer + aggregation */
  #if RPERF_USE_TIMER_SIGNAL
@@ -188,6 +193,7 @@ typedef struct rperf_profiler {
  * profile_inc/dec transitions 0↔1 arm/disarm the timer.
  * Modified only under GVL, so plain int is safe. */
  int profile_refcount;
+ int worker_paused; /* 1 when nanosleep worker is in paused cond_wait */
  } rperf_profiler_t;

  static rperf_profiler_t g_profiler;
@@ -218,21 +224,50 @@ rperf_profiler_mark(void *ptr)
  * If we see an old count, both old and new keys arrays have valid
  * data (old keys are kept alive in old_keys[]). */
  {
- size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
+ size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
  VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
  if (ft_keys && ft_count > 0) {
- rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
- ft_keys + ft_count);
+ rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
  }
  }
  }

+ static size_t
+ rperf_profiler_memsize(const void *ptr)
+ {
+ const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
+ size_t size = sizeof(rperf_profiler_t);
+ int i;
+
+ /* Double-buffered sample storage */
+ for (i = 0; i < 2; i++) {
+ const rperf_sample_buffer_t *buf = &prof->buffers[i];
+ size += buf->sample_capacity * sizeof(rperf_sample_t);
+ size += buf->frame_pool_capacity * sizeof(VALUE);
+ }
+
+ /* Frame table */
+ size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
+ size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
+ for (i = 0; i < prof->frame_table.old_keys_count; i++) {
+ /* old_keys entries are previous keys arrays; exact sizes unknown,
+ * but the pointer array itself is accounted for below. */
+ }
+ size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
+
+ /* Aggregation table */
+ size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
+ size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
+
+ return size;
+ }
+
  static const rb_data_type_t rperf_profiler_type = {
  .wrap_struct_name = "rperf_profiler",
  .function = {
  .dmark = rperf_profiler_mark,
  .dfree = NULL,
- .dsize = NULL,
+ .dsize = rperf_profiler_memsize,
  },
  };

@@ -259,9 +294,9 @@ rperf_wall_time_ns(void)
  /* ---- Get current thread's time based on profiler mode ---- */

  static int64_t
- rperf_current_time_ns(rperf_profiler_t *prof, rperf_thread_data_t *td)
+ rperf_current_time_ns(rperf_profiler_t *prof)
  {
- if (prof->mode == 0) {
+ if (prof->mode == RPERF_MODE_CPU) {
  return rperf_cpu_time_ns();
  } else {
  return rperf_wall_time_ns();
@@ -302,6 +337,7 @@ static int
  rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
  {
  if (buf->sample_count >= buf->sample_capacity) {
+ if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
  size_t new_cap = buf->sample_capacity * 2;
  rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
  buf->samples,
@@ -320,6 +356,7 @@ static int
  rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
  {
  while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
+ if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
  size_t new_cap = buf->frame_pool_capacity * 2;
  VALUE *new_pool = (VALUE *)realloc(
  buf->frame_pool,
@@ -340,7 +377,7 @@ rperf_frame_table_init(rperf_frame_table_t *ft)
  VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
  if (!keys) return -1;
  atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
- ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
+ ft->count = 0;
  ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
  ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
  if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
@@ -372,6 +409,7 @@ rperf_frame_table_free(rperf_frame_table_t *ft)
  static void
  rperf_frame_table_rehash(rperf_frame_table_t *ft)
  {
+ if (ft->bucket_capacity > SIZE_MAX / 2) return;
  size_t new_cap = ft->bucket_capacity * 2;
  uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
  if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -379,7 +417,7 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)

  VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
  size_t i;
- for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+ for (i = 0; i < ft->count; i++) {
  uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
  size_t idx = h % new_cap;
  while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
@@ -400,11 +438,13 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  uint32_t h = (uint32_t)(fval >> 3);
  size_t idx = h % ft->bucket_capacity;

+ size_t probes = 0;
  while (1) {
  uint32_t slot = ft->buckets[idx];
  if (slot == RPERF_FRAME_TABLE_EMPTY) break;
  if (keys[slot] == fval) return slot;
  idx = (idx + 1) % ft->bucket_capacity;
+ if (++probes >= ft->bucket_capacity) return RPERF_FRAME_TABLE_EMPTY; /* table full */
  }

  /* Insert new entry. Grow keys array if capacity is exhausted.
@@ -412,6 +452,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  * the old keys pointer. Instead, allocate new, copy, swap pointer
  * atomically, and keep old array alive until stop. */
  if (ft->count >= ft->capacity) {
+ if (ft->capacity > SIZE_MAX / 2) return RPERF_FRAME_TABLE_EMPTY;
  size_t new_cap = ft->capacity * 2;
  VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
  if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
@@ -434,7 +475,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  keys[frame_id] = fval;
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
  * so GC dmark never reads uninitialized keys[count-1]. */
- __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
+ atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
  ft->buckets[idx] = frame_id;

  /* Rehash if load factor > 0.7 */
@@ -448,7 +489,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */

  static uint32_t
- rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
+ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id, enum rperf_vm_state vm_state)
  {
  uint32_t h = 2166136261u;
  int i;
@@ -460,6 +501,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
  h *= 16777619u;
  h ^= (uint32_t)label_set_id;
  h *= 16777619u;
+ h ^= (uint32_t)vm_state;
+ h *= 16777619u;
  return h;
  }

@@ -488,6 +531,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
  static void
  rperf_agg_table_rehash(rperf_agg_table_t *at)
  {
+ if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
  size_t new_cap = at->bucket_capacity * 2;
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
  if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -512,6 +556,7 @@ static int
  rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
  {
  while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
+ if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
  size_t new_cap = at->stack_pool_capacity * 2;
  uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
  new_cap * sizeof(uint32_t));
@@ -522,36 +567,40 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
  return 0;
  }

- /* Insert or merge a stack into the aggregation table */
- static void
+ /* Insert or merge a stack into the aggregation table.
+ * Returns 0 on success, -1 on failure (table full or allocation failure). */
+ static int
  rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
  int depth, int thread_seq, int label_set_id,
- int64_t weight, uint32_t hash)
+ enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
  {
  size_t idx = hash % at->bucket_capacity;

+ size_t probes = 0;
  while (1) {
  rperf_agg_entry_t *e = &at->buckets[idx];
  if (!e->used) break;
  if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
- e->label_set_id == label_set_id &&
+ e->label_set_id == label_set_id && e->vm_state == vm_state &&
  memcmp(at->stack_pool + e->frame_start, frame_ids,
  depth * sizeof(uint32_t)) == 0) {
  /* Match — merge weight */
  e->weight += weight;
- return;
+ return 0;
  }
  idx = (idx + 1) % at->bucket_capacity;
+ if (++probes >= at->bucket_capacity) return -1; /* table full */
  }

  /* New entry — append frame_ids to stack_pool */
- if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
+ if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;

  rperf_agg_entry_t *e = &at->buckets[idx];
  e->frame_start = (uint32_t)at->stack_pool_count;
  e->depth = depth;
  e->thread_seq = thread_seq;
  e->label_set_id = label_set_id;
+ e->vm_state = vm_state;
  e->weight = weight;
  e->hash = hash;
  e->used = 1;
@@ -565,6 +614,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
  if (at->count * 10 > at->bucket_capacity * 7) {
  rperf_agg_table_rehash(at);
  }
+ return 0;
  }

  /* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
@@ -573,47 +623,46 @@ static void
  rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
  {
  size_t i;
- uint32_t temp_ids[RPERF_MAX_STACK_DEPTH + 1];
+ uint32_t temp_ids[RPERF_MAX_STACK_DEPTH];

  for (i = 0; i < buf->sample_count; i++) {
  rperf_sample_t *s = &buf->samples[i];
- int off = 0;
  uint32_t hash;
  int j;

- /* Prepend synthetic frame if needed */
- if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
- temp_ids[off++] = RPERF_SYNTHETIC_GVL_BLOCKED;
- } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
- temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
- } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
- temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
- } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
- temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
- }
+ /* Clamp depth to temp_ids[] capacity */
+ if (s->depth > RPERF_MAX_STACK_DEPTH)
+ s->depth = RPERF_MAX_STACK_DEPTH;

  /* Convert VALUE frames to frame_ids */
  int overflow = 0;
  for (j = 0; j < s->depth; j++) {
+ if (s->frame_start + j >= buf->frame_pool_count) break;
  VALUE fval = buf->frame_pool[s->frame_start + j];
  uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
  if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
- temp_ids[off + j] = fid;
+ temp_ids[j] = fid;
+ }
+ if (overflow) {
+ /* frame_table full — count remaining samples as dropped */
+ prof->stats.dropped_aggregation += buf->sample_count - i;
+ break;
  }
- if (overflow) break; /* frame_table full, stop aggregating this buffer */

- int total_depth = off + s->depth;
- hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
+ hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);

- rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
- s->thread_seq, s->label_set_id, s->weight, hash);
+ if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
+ s->thread_seq, s->label_set_id, s->vm_state,
+ s->weight, hash) < 0) {
+ prof->stats.dropped_aggregation++;
+ }
  }

  /* Reset buffer for reuse.
  * Release fence: ensure all frame_table inserts are visible (to GC dmark)
  * before frame_pool_count is cleared, so dmark always has at least one
  * source (frame_table or frame_pool) covering each VALUE. */
- __atomic_thread_fence(__ATOMIC_RELEASE);
+ atomic_thread_fence(memory_order_release);
  buf->sample_count = 0;
  buf->frame_pool_count = 0;
  }
@@ -656,7 +705,7 @@ rperf_try_swap(rperf_profiler_t *prof)
  /* Write a sample into a specific buffer. No swap check. */
  static int
  rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
- int64_t weight, int type, int thread_seq, int label_set_id)
+ int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
  {
  if (weight <= 0) return 0;
  if (rperf_ensure_sample_capacity(buf) < 0) return -1;
@@ -665,7 +714,7 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
  sample->depth = depth;
  sample->frame_start = frame_start;
  sample->weight = weight;
- sample->type = type;
+ sample->vm_state = vm_state;
  sample->thread_seq = thread_seq;
  sample->label_set_id = label_set_id;
  buf->sample_count++;
@@ -674,10 +723,11 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,

  static void
  rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
- int64_t weight, int type, int thread_seq, int label_set_id)
+ int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
  {
  rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
- rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
+ if (rperf_write_sample(buf, frame_start, depth, weight, vm_state, thread_seq, label_set_id) < 0)
+ prof->stats.dropped_samples++;
  rperf_try_swap(prof);
  }

@@ -689,8 +739,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
  {
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
  if (!td) return NULL;
- td->prev_time_ns = rperf_current_time_ns(prof, td);
- td->prev_wall_ns = rperf_wall_time_ns();
+ int64_t t = rperf_current_time_ns(prof);
+ if (t < 0) { free(td); return NULL; }
+ td->prev_time_ns = t;
  td->thread_seq = ++prof->next_thread_seq;
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
  return td;
@@ -712,7 +763,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
  is_first = 1;
  }

- int64_t time_now = rperf_current_time_ns(prof, td);
+ int64_t time_now = rperf_current_time_ns(prof);
  if (time_now < 0) return;

  /* Capture backtrace into active buffer's frame_pool */
@@ -727,13 +778,12 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
  /* Record normal sample (skip if first time — no prev_time, or if paused) */
  if (!is_first && !RPERF_PAUSED(prof)) {
  int64_t weight = time_now - td->prev_time_ns;
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
  }

  /* Save timestamp for READY/RESUMED */
  td->suspended_at_ns = wall_now;
  td->prev_time_ns = time_now;
- td->prev_wall_ns = wall_now;
  }

  static void
@@ -764,7 +814,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
  * Both samples are written directly into the same buffer before calling
  * rperf_try_swap, so that a swap triggered by the first sample cannot
  * move the second into a different buffer with a stale frame_start. */
- if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
+ if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
  rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
  size_t frame_start = buf->frame_pool_count;
@@ -776,13 +826,15 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
  /* Write both samples into the same buf, then swap-check once */
  if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
  int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
- rperf_write_sample(buf, frame_start, depth, blocked_ns,
- RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
+ if (rperf_write_sample(buf, frame_start, depth, blocked_ns,
+ RPERF_VM_STATE_GVL_BLOCKED, td->thread_seq, td->label_set_id) < 0)
+ prof->stats.dropped_samples++;
  }
  if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
  int64_t wait_ns = wall_now - td->ready_at_ns;
- rperf_write_sample(buf, frame_start, depth, wait_ns,
- RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
+ if (rperf_write_sample(buf, frame_start, depth, wait_ns,
+ RPERF_VM_STATE_GVL_WAIT, td->thread_seq, td->label_set_id) < 0)
+ prof->stats.dropped_samples++;
  }

  rperf_try_swap(prof);
@@ -790,9 +842,8 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
  skip_gvl:

  /* Reset prev times to current — next timer sample measures from resume */
- int64_t time_now = rperf_current_time_ns(prof, td);
+ int64_t time_now = rperf_current_time_ns(prof);
  if (time_now >= 0) td->prev_time_ns = time_now;
- td->prev_wall_ns = wall_now;

  /* Clear suspended state */
  td->suspended_at_ns = 0;
@@ -861,9 +912,9 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE

  int64_t wall_now = rperf_wall_time_ns();
  int64_t weight = wall_now - prof->gc.enter_ns;
- int type = (prof->gc.phase == RPERF_GC_SWEEPING)
- ? RPERF_SAMPLE_GC_SWEEPING
- : RPERF_SAMPLE_GC_MARKING;
+ enum rperf_vm_state vm_state = (prof->gc.phase == RPERF_GC_SWEEPING)
+ ? RPERF_VM_STATE_GC_SWEEPING
+ : RPERF_VM_STATE_GC_MARKING;

  /* Capture backtrace here (not at GC_ENTER) so that frame_start
  * always indexes into the current active buffer. The Ruby stack
@@ -882,24 +933,22 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
  }
  buf->frame_pool_count += depth;

- rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
+ rperf_record_sample(prof, frame_start, depth, weight, vm_state, prof->gc.thread_seq, prof->gc.label_set_id);
  prof->gc.enter_ns = 0;
  }
  }

  /* ---- Sampling callback (postponed job) — current thread only ---- */

- static void
- rperf_sample_job(void *arg)
+ /* Core sampling logic, parameterized by mode constant.
+ * Called from rperf_sample_cpu/rperf_sample_wall so the compiler
+ * can inline and eliminate mode branches at compile time. */
+ static inline void
+ rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
  {
- rperf_profiler_t *prof = (rperf_profiler_t *)arg;
-
- if (!prof->running) return;
- if (RPERF_PAUSED(prof)) return;
-
- /* Measure sampling overhead */
+ /* Measure sampling overhead (wall time — runs under GVL, no I/O) */
  struct timespec ts_start, ts_end;
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_start);
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);

  VALUE thread = rb_thread_current();

@@ -911,12 +960,11 @@ rperf_sample_job(void *arg)
  return; /* Skip first sample for this thread */
  }

- int64_t time_now = rperf_current_time_ns(prof, td);
+ int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
  if (time_now < 0) return;

  int64_t weight = time_now - td->prev_time_ns;
  td->prev_time_ns = time_now;
- td->prev_wall_ns = rperf_wall_time_ns();

  if (weight <= 0) return;

@@ -930,15 +978,35 @@ rperf_sample_job(void *arg)
  if (depth <= 0) return;
  buf->frame_pool_count += depth;

- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);

- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
+ clock_gettime(CLOCK_MONOTONIC, &ts_end);
  prof->stats.sampling_count++;
  prof->stats.sampling_total_ns +=
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
  (ts_end.tv_nsec - ts_start.tv_nsec);
  }

+ static void
+ rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
+
+ static void
+ rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
+
+ static void
+ rperf_sample_job(void *arg)
+ {
+ rperf_profiler_t *prof = (rperf_profiler_t *)arg;
+
+ if (!prof->running) return;
+ if (RPERF_PAUSED(prof)) return;
+
+ if (prof->mode == RPERF_MODE_CPU)
+ rperf_sample_cpu(prof);
+ else
+ rperf_sample_wall(prof);
+ }
+
  /* ---- Worker thread: timer + aggregation ---- */

  #if RPERF_USE_TIMER_SIGNAL
@@ -984,7 +1052,7 @@ rperf_worker_nanosleep_func(void *arg)
  struct timespec deadline;
  long interval_ns = 1000000000L / prof->frequency;

- clock_gettime(CLOCK_REALTIME, &deadline);
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
  deadline.tv_nsec += interval_ns;
  if (deadline.tv_nsec >= 1000000000L) {
  deadline.tv_sec++;
@@ -994,10 +1062,12 @@ rperf_worker_nanosleep_func(void *arg)
  CHECKED(pthread_mutex_lock(&prof->worker_mutex));
  while (prof->running) {
  if (RPERF_PAUSED(prof)) {
- /* Paused: wait indefinitely until signaled (resume or stop) */
+ /* Paused: mark as paused so disarm can confirm, then wait */
+ prof->worker_paused = 1;
  CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+ prof->worker_paused = 0;
  /* Reset deadline on wake to avoid burst of catch-up triggers */
- clock_gettime(CLOCK_REALTIME, &deadline);
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
  deadline.tv_nsec += interval_ns;
  if (deadline.tv_nsec >= 1000000000L) {
  deadline.tv_sec++;
@@ -1068,14 +1138,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
  result = rb_hash_new();

  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
- ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
+ ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+ if (prof->stats.dropped_samples > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
+ if (prof->stats.dropped_aggregation > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
  rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
- SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
+ SIZET2NUM(prof->frame_table.count));
  rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
  SIZET2NUM(prof->agg_table.count));

@@ -1094,11 +1168,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
  {
  rperf_frame_table_t *ft = &prof->frame_table;
  VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
- for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+ for (i = 0; i < ft->count; i++) {
  rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
  }

@@ -1110,11 +1180,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)

  VALUE frames = rb_ary_new_capa(e->depth);
  for (j = 0; j < e->depth; j++) {
+ if (e->frame_start + j >= at->stack_pool_count) break;
  uint32_t fid = at->stack_pool[e->frame_start + j];
+ if (fid >= ft->count) break;
  rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
  }

- VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
+ VALUE sample = rb_ary_new_capa(5);
+ rb_ary_push(sample, frames);
+ rb_ary_push(sample, LONG2NUM(e->weight));
+ rb_ary_push(sample, INT2NUM(e->thread_seq));
+ rb_ary_push(sample, INT2NUM(e->label_set_id));
+ rb_ary_push(sample, INT2NUM(e->vm_state));
  rb_ary_push(samples_ary, sample);
  }
  }
@@ -1141,7 +1218,7 @@ static VALUE
  rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
  {
  int frequency = NUM2INT(vfreq);
- int mode = NUM2INT(vmode);
+ enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
  int aggregate = RTEST(vagg) ? 1 : 0;
  #if RPERF_USE_TIMER_SIGNAL
  int sig = NUM2INT(vsig);
@@ -1159,13 +1236,27 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
  g_profiler.stats.sampling_count = 0;
  g_profiler.stats.sampling_total_ns = 0;
  g_profiler.stats.trigger_count = 0;
+ g_profiler.stats.dropped_samples = 0;
+ g_profiler.stats.dropped_aggregation = 0;
  atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
  atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
  g_profiler.label_sets = Qnil;

  /* Initialize worker mutex/cond */
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
+ #ifdef __linux__
+ {
+ /* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
+ * system clock adjustments (NTP etc.) don't affect timer intervals. */
+ pthread_condattr_t cond_attr;
+ CHECKED(pthread_condattr_init(&cond_attr));
+ CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
+ CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
+ CHECKED(pthread_condattr_destroy(&cond_attr));
+ }
+ #else
  CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
+ #endif

  /* Initialize sample buffer(s) */
  if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
@@ -1244,6 +1335,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL

  g_profiler.running = 1;
  g_profiler.profile_refcount = RTEST(vdefer) ? 0 : 1;
+ g_profiler.worker_paused = 0;

  #if RPERF_USE_TIMER_SIGNAL
  g_profiler.timer_signal = timer_signal;
@@ -1347,9 +1439,7 @@ timer_fail:
  static VALUE
  rb_rperf_stop(VALUE self)
  {
- VALUE result, samples_ary;
- size_t i;
- int j;
+ VALUE result;

  if (!g_profiler.running) {
  return Qnil;
@@ -1416,15 +1506,22 @@ rb_rperf_stop(VALUE self)
  rperf_agg_table_free(&g_profiler.agg_table);
  } else {
  /* Raw samples path (aggregate: false) */
+ VALUE samples_ary;
+ size_t i;
+ int j;
  rperf_sample_buffer_t *buf = &g_profiler.buffers[0];

  result = rb_hash_new();
  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
- ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
+ ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+ if (g_profiler.stats.dropped_samples > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
+ if (g_profiler.stats.dropped_aggregation > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
  {
  struct timespec stop_monotonic;
@@ -1441,29 +1538,20 @@ rb_rperf_stop(VALUE self)
  samples_ary = rb_ary_new_capa((long)buf->sample_count);
  for (i = 0; i < buf->sample_count; i++) {
  rperf_sample_t *s = &buf->samples[i];
- VALUE frames = rb_ary_new_capa(s->depth + 1);
-
- /* Prepend synthetic frame at leaf position (index 0) */
- if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
- rb_ary_push(frames, syn);
- } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
- rb_ary_push(frames, syn);
- } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
- rb_ary_push(frames, syn);
- } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
- rb_ary_push(frames, syn);
- }
+ VALUE frames = rb_ary_new_capa(s->depth);

  for (j = 0; j < s->depth; j++) {
+ if (s->frame_start + j >= buf->frame_pool_count) break;
  VALUE fval = buf->frame_pool[s->frame_start + j];
  rb_ary_push(frames, rperf_resolve_frame(fval));
  }

- VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
+ VALUE sample = rb_ary_new_capa(5);
+ rb_ary_push(sample, frames);
+ rb_ary_push(sample, LONG2NUM(s->weight));
+ rb_ary_push(sample, INT2NUM(s->thread_seq));
+ rb_ary_push(sample, INT2NUM(s->label_set_id));
+ rb_ary_push(sample, INT2NUM(s->vm_state));
  rb_ary_push(samples_ary, sample);
  }
  rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
@@ -1498,6 +1586,8 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
  prof->stats.trigger_count = 0;
  prof->stats.sampling_count = 0;
  prof->stats.sampling_total_ns = 0;
+ prof->stats.dropped_samples = 0;
+ prof->stats.dropped_aggregation = 0;

  /* Reset start timestamps so next snapshot's duration_ns covers
  * only the period since this clear. */
@@ -1619,7 +1709,15 @@ rperf_disarm_timer(rperf_profiler_t *prof)
  return;
  }
  #endif
- /* nanosleep mode: worker will see RPERF_PAUSED on next iteration */
+ /* nanosleep mode: wake the worker and wait until it enters paused state */
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+ while (!prof->worker_paused) {
+ CHECKED(pthread_cond_signal(&prof->worker_cond));
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+ sched_yield();
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+ }
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
  }

  /* Helper: reset prev_time_ns for all threads (called on resume to avoid
@@ -1633,8 +1731,7 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
  VALUE thread = RARRAY_AREF(threads, i);
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
  if (td) {
- td->prev_time_ns = rperf_current_time_ns(prof, td);
- td->prev_wall_ns = rperf_wall_time_ns();
+ td->prev_time_ns = rperf_current_time_ns(prof);
  }
  }
  }
@@ -1659,6 +1756,7 @@ static VALUE
  rb_rperf_profile_dec(VALUE self)
  {
  if (!g_profiler.running) return Qfalse;
+ if (g_profiler.profile_refcount <= 0) return Qfalse;
  g_profiler.profile_refcount--;
  if (g_profiler.profile_refcount == 0) {
  rperf_disarm_timer(&g_profiler);
@@ -1673,6 +1771,12 @@ rb_rperf_running_p(VALUE self)
  return g_profiler.running ? Qtrue : Qfalse;
  }

+ static VALUE
+ rb_rperf_profiler_wrapper(VALUE self)
+ {
+ return g_profiler_wrapper;
+ }
+
  /* ---- Fork safety ---- */

  static void
@@ -1683,6 +1787,14 @@ rperf_after_fork_child(void)
  /* Mark as not running — timer doesn't exist in child */
  g_profiler.running = 0;

+ /* Re-initialize mutex/condvar — they may have been locked by the parent's
+ * worker thread at fork time and are in an undefined state in the child.
+ * POSIX says only async-signal-safe functions should be called in atfork
+ * child handlers, but pthread_mutex_init is safe on Linux/glibc/musl and
+ * this is the standard pattern (e.g., Python, Go do the same). */
+ pthread_mutex_init(&g_profiler.worker_mutex, NULL);
+ pthread_cond_init(&g_profiler.worker_cond, NULL);
+
  #if RPERF_USE_TIMER_SIGNAL
  /* timer_create timers are not inherited across fork, but pending signals may be.
  * Block the signal, drain any pending instances, then restore old handler. */
@@ -1723,6 +1835,7 @@ rperf_after_fork_child(void)
  /* Reset stats */
  g_profiler.stats.sampling_count = 0;
  g_profiler.stats.sampling_total_ns = 0;
+ g_profiler.stats.dropped_samples = 0;
  g_profiler.profile_refcount = 0;
  atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
  }
@@ -1743,6 +1856,7 @@ Init_rperf(void)
  rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
  rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
  rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
+ rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);

  memset(&g_profiler, 0, sizeof(g_profiler));
  g_profiler.label_sets = Qnil;
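
The worker-timer change in this release (the new RPERF_COND_CLOCK define and the pthread_condattr_setclock block in rb_rperf_start) comes down to one portable pattern: time the nanosleep worker's condition-variable wait against CLOCK_MONOTONIC on Linux, so NTP or manual clock adjustments cannot stretch or shrink the sampling interval, and fall back to CLOCK_REALTIME where pthread_condattr_setclock is unavailable (macOS, per the comment in the diff). The sketch below isolates that pattern outside the rperf structs; WAIT_CLOCK, cond_init_with_clock, and timed_tick are illustrative names, not part of the gem.

#include <errno.h>
#include <pthread.h>
#include <time.h>

/* Clock the condvar wait is timed against. Linux can bind the condvar to
 * CLOCK_MONOTONIC; macOS has no pthread_condattr_setclock, so the wait
 * falls back to CLOCK_REALTIME and is subject to wall-clock adjustments. */
#ifdef __linux__
#define WAIT_CLOCK CLOCK_MONOTONIC
#else
#define WAIT_CLOCK CLOCK_REALTIME
#endif

static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv;

static void
cond_init_with_clock(void)
{
#ifdef __linux__
    pthread_condattr_t attr;
    pthread_condattr_init(&attr);
    pthread_condattr_setclock(&attr, WAIT_CLOCK);
    pthread_cond_init(&cv, &attr);
    pthread_condattr_destroy(&attr);
#else
    pthread_cond_init(&cv, NULL);
#endif
}

/* Sleep for interval_ns unless the condvar is signalled first.
 * Returns 1 if the interval elapsed, 0 if woken early. */
static int
timed_tick(long interval_ns)
{
    struct timespec deadline;

    /* Absolute deadline on WAIT_CLOCK, the same clock the condvar was
     * initialized with; mixing clocks would make the timeout meaningless. */
    clock_gettime(WAIT_CLOCK, &deadline);
    deadline.tv_nsec += interval_ns;
    if (deadline.tv_nsec >= 1000000000L) {
        deadline.tv_sec++;
        deadline.tv_nsec -= 1000000000L;
    }

    pthread_mutex_lock(&mu);
    int rc = pthread_cond_timedwait(&cv, &mu, &deadline);
    pthread_mutex_unlock(&mu);
    return rc == ETIMEDOUT;
}

int main(void)
{
    cond_init_with_clock();
    /* Three ~10 ms ticks, i.e. 1000000000L / frequency with frequency = 100,
     * mirroring how rperf_worker_nanosleep_func derives its interval. */
    for (int i = 0; i < 3; i++)
        timed_tick(1000000000L / 100);
    return 0;
}

The essential point, which RPERF_COND_CLOCK encodes in the diff, is that the clock passed to clock_gettime when computing the absolute deadline must match the clock the condition variable was initialized with.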