rperf 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/rperf/rperf.c CHANGED
@@ -7,13 +7,19 @@
7
7
  #include <stdlib.h>
8
8
  #include <unistd.h>
9
9
  #include <signal.h>
10
- #include <assert.h>
10
+ #include <stdatomic.h>
11
11
  #ifdef __linux__
12
12
  #include <sys/syscall.h>
13
13
  #endif
14
14
 
15
- /* Checked pthread wrappers — assert on unexpected errors */
16
- #define CHECKED(call) do { int _r = (call); assert(_r == 0 && #call); (void)_r; } while (0)
15
+ /* Checked pthread wrappers — always active regardless of NDEBUG */
16
+ #define CHECKED(call) do { \
17
+ int _r = (call); \
18
+ if (_r != 0) { \
19
+ fprintf(stderr, "rperf: %s failed: %s\n", #call, strerror(_r)); \
20
+ abort(); \
21
+ } \
22
+ } while (0)
17
23
 
18
24
  #ifdef __linux__
19
25
  #define RPERF_USE_TIMER_SIGNAL 1
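The reworked CHECKED macro above reports the failing call and aborts even in NDEBUG builds, where the old assert-based version would have silently discarded pthread errors. A minimal standalone sketch of the same pattern follows; the mutex and main function are illustrative only and are not part of rperf.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Abort with a readable message on any nonzero pthread return code. */
    #define CHECKED(call) do { \
        int _r = (call); \
        if (_r != 0) { \
            fprintf(stderr, "example: %s failed: %s\n", #call, strerror(_r)); \
            abort(); \
        } \
    } while (0)

    int main(void) {
        pthread_mutex_t m;
        CHECKED(pthread_mutex_init(&m, NULL));
        CHECKED(pthread_mutex_lock(&m));
        CHECKED(pthread_mutex_unlock(&m));
        CHECKED(pthread_mutex_destroy(&m));
        return 0;
    }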
@@ -26,7 +32,8 @@
26
32
  #define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
27
33
  #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
28
34
  #define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
29
- #define RPERF_FRAME_TABLE_INITIAL 65536 /* pre-allocate to avoid realloc race with GC dmark */
35
+ #define RPERF_FRAME_TABLE_INITIAL 4096
36
+ #define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
30
37
  #define RPERF_AGG_TABLE_INITIAL 1024
31
38
  #define RPERF_STACK_POOL_INITIAL 4096
32
39
 
@@ -59,6 +66,7 @@ typedef struct rperf_sample {
59
66
  int64_t weight;
60
67
  int type; /* rperf_sample_type */
61
68
  int thread_seq; /* thread sequence number (1-based) */
69
+ int label_set_id; /* label set ID (0 = no labels) */
62
70
  } rperf_sample_t;
63
71
 
64
72
  /* ---- Sample buffer (double-buffered) ---- */
@@ -77,11 +85,15 @@ typedef struct rperf_sample_buffer {
77
85
  #define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
78
86
 
79
87
  typedef struct rperf_frame_table {
80
- VALUE *keys; /* unique VALUE array (GC mark target) */
88
+ _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
81
89
  size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
82
90
  size_t capacity;
83
91
  uint32_t *buckets; /* open addressing: stores index into keys[] */
84
92
  size_t bucket_capacity;
93
+ /* Old keys arrays kept alive for GC dmark safety until stop */
94
+ VALUE **old_keys;
95
+ int old_keys_count;
96
+ int old_keys_capacity;
85
97
  } rperf_frame_table_t;
86
98
 
87
99
  /* ---- Aggregation table: stack → weight ---- */
@@ -92,6 +104,7 @@ typedef struct rperf_agg_entry {
92
104
  uint32_t frame_start; /* offset into stack_pool */
93
105
  int depth; /* includes synthetic frame */
94
106
  int thread_seq;
107
+ int label_set_id; /* label set ID (0 = no labels) */
95
108
  int64_t weight; /* accumulated */
96
109
  uint32_t hash; /* cached hash value */
97
110
  int used; /* 0 = empty, 1 = used */
@@ -107,54 +120,68 @@ typedef struct rperf_agg_table {
107
120
  } rperf_agg_table_t;
108
121
 
109
122
  typedef struct rperf_thread_data {
110
- int64_t prev_cpu_ns;
123
+ int64_t prev_time_ns;
111
124
  int64_t prev_wall_ns;
112
125
  /* GVL event tracking */
113
126
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
114
127
  int64_t ready_at_ns; /* wall time at READY */
115
- size_t suspended_frame_start; /* saved stack in frame_pool */
116
- int suspended_frame_depth; /* saved stack depth */
117
128
  int thread_seq; /* thread sequence number (1-based) */
129
+ int label_set_id; /* current label set ID (0 = no labels) */
118
130
  } rperf_thread_data_t;
119
131
 
132
+ /* ---- GC tracking state ---- */
133
+
134
+ typedef struct rperf_gc_state {
135
+ int phase; /* rperf_gc_phase */
136
+ int64_t enter_ns; /* wall time at GC_ENTER */
137
+ int thread_seq; /* thread_seq at GC_ENTER */
138
+ int label_set_id; /* label_set_id at GC_ENTER */
139
+ } rperf_gc_state_t;
140
+
141
+ /* ---- Sampling overhead stats ---- */
142
+
143
+ typedef struct rperf_stats {
144
+ size_t trigger_count;
145
+ size_t sampling_count;
146
+ int64_t sampling_total_ns;
147
+ } rperf_stats_t;
148
+
120
149
  typedef struct rperf_profiler {
121
150
  int frequency;
122
151
  int mode; /* 0 = cpu, 1 = wall */
123
- volatile int running;
152
+ _Atomic int running;
124
153
  pthread_t worker_thread; /* combined timer + aggregation */
125
154
  #if RPERF_USE_TIMER_SIGNAL
126
155
  timer_t timer_id;
127
156
  int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
128
- volatile pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
157
+ _Atomic pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
158
+ struct sigaction old_sigaction; /* saved handler to restore on stop */
129
159
  #endif
130
160
  rb_postponed_job_handle_t pj_handle;
131
161
  int aggregate; /* 1 = aggregate samples, 0 = raw */
132
162
  /* Double-buffered sample storage (only buffers[0] used when !aggregate) */
133
163
  rperf_sample_buffer_t buffers[2];
134
- int active_idx; /* 0 or 1 */
164
+ _Atomic int active_idx; /* 0 or 1 */
135
165
  /* Aggregation (only used when aggregate=1) */
136
166
  rperf_frame_table_t frame_table;
137
167
  rperf_agg_table_t agg_table;
138
- volatile int swap_ready; /* 1 = standby buffer ready for aggregation */
168
+ _Atomic int swap_ready; /* 1 = standby buffer ready for aggregation */
139
169
  pthread_mutex_t worker_mutex;
140
170
  pthread_cond_t worker_cond;
141
171
  rb_internal_thread_specific_key_t ts_key;
142
172
  rb_internal_thread_event_hook_t *thread_hook;
143
173
  /* GC tracking */
144
- int gc_phase; /* rperf_gc_phase */
145
- int64_t gc_enter_ns; /* wall time at GC_ENTER */
146
- size_t gc_frame_start; /* saved stack at GC_ENTER */
147
- int gc_frame_depth; /* saved stack depth */
148
- int gc_thread_seq; /* thread_seq at GC_ENTER */
174
+ rperf_gc_state_t gc;
149
175
  /* Timing metadata for pprof */
150
176
  struct timespec start_realtime; /* CLOCK_REALTIME at start */
151
177
  struct timespec start_monotonic; /* CLOCK_MONOTONIC at start */
152
178
  /* Thread sequence counter */
153
179
  int next_thread_seq;
154
180
  /* Sampling overhead stats */
155
- size_t trigger_count;
156
- size_t sampling_count;
157
- int64_t sampling_total_ns;
181
+ rperf_stats_t stats;
182
+ /* Label sets: Ruby Array of Hash objects, managed from Ruby side.
183
+ * Index 0 is reserved (no labels). GC-marked via profiler_mark. */
184
+ VALUE label_sets; /* Ruby Array or Qnil */
158
185
  } rperf_profiler_t;
159
186
 
160
187
  static rperf_profiler_t g_profiler;
@@ -175,10 +202,22 @@ rperf_profiler_mark(void *ptr)
175
202
  buf->frame_pool + buf->frame_pool_count);
176
203
  }
177
204
  }
178
- /* Mark frame_table keys (unique frame VALUEs) */
179
- if (prof->frame_table.keys && prof->frame_table.count > 0) {
180
- rb_gc_mark_locations(prof->frame_table.keys + RPERF_SYNTHETIC_COUNT,
181
- prof->frame_table.keys + prof->frame_table.count);
205
+ /* Mark label_sets array */
206
+ if (prof->label_sets != Qnil) {
207
+ rb_gc_mark(prof->label_sets);
208
+ }
209
+ /* Mark frame_table keys (unique frame VALUEs).
210
+ * Acquire count to synchronize with the release-store in insert,
211
+ * ensuring we see the keys pointer that is valid for [0, count).
212
+ * If we see an old count, both old and new keys arrays have valid
213
+ * data (old keys are kept alive in old_keys[]). */
214
+ {
215
+ size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
216
+ VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
217
+ if (ft_keys && ft_count > 0) {
218
+ rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
219
+ ft_keys + ft_count);
220
+ }
182
221
  }
183
222
  }
184
223
 
@@ -288,21 +327,38 @@ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
288
327
 
289
328
  /* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
290
329
 
291
- static void
330
+ static int
292
331
  rperf_frame_table_init(rperf_frame_table_t *ft)
293
332
  {
294
333
  ft->capacity = RPERF_FRAME_TABLE_INITIAL;
295
- ft->keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
334
+ VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
335
+ if (!keys) return -1;
336
+ atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
296
337
  ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
297
338
  ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
298
339
  ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
340
+ if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
299
341
  memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
342
+ ft->old_keys_count = 0;
343
+ ft->old_keys_capacity = RPERF_FRAME_TABLE_OLD_KEYS_INITIAL;
344
+ ft->old_keys = (VALUE **)malloc(ft->old_keys_capacity * sizeof(VALUE *));
345
+ if (!ft->old_keys) {
346
+ free(ft->buckets);
347
+ free(keys);
348
+ atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed);
349
+ return -1;
350
+ }
351
+ return 0;
300
352
  }
301
353
 
302
354
  static void
303
355
  rperf_frame_table_free(rperf_frame_table_t *ft)
304
356
  {
305
- free(ft->keys);
357
+ int i;
358
+ for (i = 0; i < ft->old_keys_count; i++)
359
+ free(ft->old_keys[i]);
360
+ free(ft->old_keys);
361
+ free(atomic_load_explicit(&ft->keys, memory_order_relaxed));
306
362
  free(ft->buckets);
307
363
  memset(ft, 0, sizeof(*ft));
308
364
  }
@@ -312,11 +368,13 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
312
368
  {
313
369
  size_t new_cap = ft->bucket_capacity * 2;
314
370
  uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
371
+ if (!new_buckets) return; /* keep using current buckets at higher load factor */
315
372
  memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
316
373
 
374
+ VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
317
375
  size_t i;
318
376
  for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
319
- uint32_t h = (uint32_t)(ft->keys[i] >> 3); /* shift out tag bits */
377
+ uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
320
378
  size_t idx = h % new_cap;
321
379
  while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
322
380
  idx = (idx + 1) % new_cap;
@@ -332,25 +390,42 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
332
390
  static uint32_t
333
391
  rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
334
392
  {
393
+ VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
335
394
  uint32_t h = (uint32_t)(fval >> 3);
336
395
  size_t idx = h % ft->bucket_capacity;
337
396
 
338
397
  while (1) {
339
398
  uint32_t slot = ft->buckets[idx];
340
399
  if (slot == RPERF_FRAME_TABLE_EMPTY) break;
341
- if (ft->keys[slot] == fval) return slot;
400
+ if (keys[slot] == fval) return slot;
342
401
  idx = (idx + 1) % ft->bucket_capacity;
343
402
  }
344
403
 
345
- /* Insert new entry.
346
- * keys array is pre-allocated and never realloc'd to avoid race with GC dmark.
347
- * If capacity is exhausted, return EMPTY to signal aggregation should stop. */
404
+ /* Insert new entry. Grow keys array if capacity is exhausted.
405
+ * Cannot realloc in-place because GC dmark may concurrently read
406
+ * the old keys pointer. Instead, allocate new, copy, swap pointer
407
+ * atomically, and keep old array alive until stop. */
348
408
  if (ft->count >= ft->capacity) {
349
- return RPERF_FRAME_TABLE_EMPTY;
409
+ size_t new_cap = ft->capacity * 2;
410
+ VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
411
+ if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
412
+ memcpy(new_keys, keys, ft->capacity * sizeof(VALUE));
413
+ /* Save old keys for deferred free (GC dmark safety) */
414
+ if (ft->old_keys_count >= ft->old_keys_capacity) {
415
+ int new_old_cap = ft->old_keys_capacity * 2;
416
+ VALUE **new_old = (VALUE **)realloc(ft->old_keys, new_old_cap * sizeof(VALUE *));
417
+ if (!new_old) { free(new_keys); return RPERF_FRAME_TABLE_EMPTY; }
418
+ ft->old_keys = new_old;
419
+ ft->old_keys_capacity = new_old_cap;
420
+ }
421
+ ft->old_keys[ft->old_keys_count++] = keys;
422
+ keys = new_keys;
423
+ atomic_store_explicit(&ft->keys, new_keys, memory_order_release);
424
+ ft->capacity = new_cap;
350
425
  }
351
426
 
352
427
  uint32_t frame_id = (uint32_t)ft->count;
353
- ft->keys[frame_id] = fval;
428
+ keys[frame_id] = fval;
354
429
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
355
430
  * so GC dmark never reads uninitialized keys[count-1]. */
356
431
  __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
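The grow path above replaces the old fixed-capacity table: because the GC mark callback may read the keys pointer concurrently, growth allocates a new array, copies the old contents, publishes the new pointer with a release store, and retires the old array instead of freeing it. A minimal sketch of that publish/retire protocol under the same assumptions (single writer, concurrent readers); the names are illustrative, not the gem's.

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        _Atomic(long *) items;     /* published array (a reader may still hold it) */
        _Atomic size_t  count;     /* published element count */
        size_t capacity;
        long  *retired[32];        /* old arrays kept alive until teardown */
        int    retired_count;
    } table_t;

    int table_init(table_t *t, size_t cap) {
        long *a = calloc(cap, sizeof(long));
        if (!a) return -1;
        atomic_init(&t->items, a);
        atomic_init(&t->count, 0);
        t->capacity = cap;
        t->retired_count = 0;
        return 0;
    }

    /* Single writer: grow by copy, publish the new pointer, retire the old one. */
    int table_push(table_t *t, long v) {
        long *cur = atomic_load_explicit(&t->items, memory_order_relaxed);
        size_t n = atomic_load_explicit(&t->count, memory_order_relaxed);
        if (n == t->capacity) {
            if (t->retired_count >= 32) return -1;
            long *bigger = calloc(t->capacity * 2, sizeof(long));
            if (!bigger) return -1;
            memcpy(bigger, cur, t->capacity * sizeof(long));
            t->retired[t->retired_count++] = cur;  /* defer free: a reader may still use it */
            atomic_store_explicit(&t->items, bigger, memory_order_release);
            t->capacity *= 2;
            cur = bigger;
        }
        cur[n] = v;
        /* Release: the new slot becomes visible before the larger count does. */
        atomic_store_explicit(&t->count, n + 1, memory_order_release);
        return 0;
    }

    /* Concurrent reader (the GC mark callback plays this role in rperf):
     * acquire the count first, then the pointer; either pointer it observes
     * holds valid data for indexes [0, count). */
    long table_sum(table_t *t) {
        size_t n = atomic_load_explicit(&t->count, memory_order_acquire);
        long *cur = atomic_load_explicit(&t->items, memory_order_acquire);
        long s = 0;
        for (size_t i = 0; i < n; i++) s += cur[i];
        return s;
    }

Freeing the retired arrays only at shutdown trades a small amount of memory for never invalidating a pointer a reader might still be traversing, which mirrors the old_keys[] bookkeeping above.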
@@ -367,7 +442,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
367
442
  /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
368
443
 
369
444
  static uint32_t
370
- rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
445
+ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
371
446
  {
372
447
  uint32_t h = 2166136261u;
373
448
  int i;
@@ -377,18 +452,23 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
377
452
  }
378
453
  h ^= (uint32_t)thread_seq;
379
454
  h *= 16777619u;
455
+ h ^= (uint32_t)label_set_id;
456
+ h *= 16777619u;
380
457
  return h;
381
458
  }
382
459
 
383
- static void
460
+ static int
384
461
  rperf_agg_table_init(rperf_agg_table_t *at)
385
462
  {
386
463
  at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
387
464
  at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
465
+ if (!at->buckets) return -1;
388
466
  at->count = 0;
389
467
  at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
390
468
  at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
469
+ if (!at->stack_pool) { free(at->buckets); at->buckets = NULL; return -1; }
391
470
  at->stack_pool_count = 0;
471
+ return 0;
392
472
  }
393
473
 
394
474
  static void
@@ -404,6 +484,7 @@ rperf_agg_table_rehash(rperf_agg_table_t *at)
404
484
  {
405
485
  size_t new_cap = at->bucket_capacity * 2;
406
486
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
487
+ if (!new_buckets) return; /* keep using current buckets at higher load factor */
407
488
 
408
489
  size_t i;
409
490
  for (i = 0; i < at->bucket_capacity; i++) {
@@ -438,7 +519,8 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
438
519
  /* Insert or merge a stack into the aggregation table */
439
520
  static void
440
521
  rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
441
- int depth, int thread_seq, int64_t weight, uint32_t hash)
522
+ int depth, int thread_seq, int label_set_id,
523
+ int64_t weight, uint32_t hash)
442
524
  {
443
525
  size_t idx = hash % at->bucket_capacity;
444
526
 
@@ -446,6 +528,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
446
528
  rperf_agg_entry_t *e = &at->buckets[idx];
447
529
  if (!e->used) break;
448
530
  if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
531
+ e->label_set_id == label_set_id &&
449
532
  memcmp(at->stack_pool + e->frame_start, frame_ids,
450
533
  depth * sizeof(uint32_t)) == 0) {
451
534
  /* Match — merge weight */
@@ -462,6 +545,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
462
545
  e->frame_start = (uint32_t)at->stack_pool_count;
463
546
  e->depth = depth;
464
547
  e->thread_seq = thread_seq;
548
+ e->label_set_id = label_set_id;
465
549
  e->weight = weight;
466
550
  e->hash = hash;
467
551
  e->used = 1;
@@ -513,10 +597,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
513
597
  if (overflow) break; /* frame_table full, stop aggregating this buffer */
514
598
 
515
599
  int total_depth = off + s->depth;
516
- hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
600
+ hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
517
601
 
518
602
  rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
519
- s->thread_seq, s->weight, hash);
603
+ s->thread_seq, s->label_set_id, s->weight, hash);
520
604
  }
521
605
 
522
606
  /* Reset buffer for reuse.
@@ -535,10 +619,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
535
619
  static void
536
620
  rperf_try_aggregate(rperf_profiler_t *prof)
537
621
  {
538
- if (!prof->aggregate || !prof->swap_ready) return;
539
- int standby_idx = prof->active_idx ^ 1;
622
+ if (!prof->aggregate || !atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return;
623
+ int standby_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire) ^ 1;
540
624
  rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
541
- prof->swap_ready = 0;
625
+ atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
542
626
  }
543
627
 
544
628
  /* ---- Record a sample ---- */
@@ -547,25 +631,29 @@ static void
547
631
  rperf_try_swap(rperf_profiler_t *prof)
548
632
  {
549
633
  if (!prof->aggregate) return;
550
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
634
+ int idx = atomic_load_explicit(&prof->active_idx, memory_order_relaxed);
635
+ rperf_sample_buffer_t *buf = &prof->buffers[idx];
551
636
  if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
552
- if (prof->swap_ready) return; /* standby still being aggregated */
637
+ if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return; /* standby still being aggregated */
553
638
 
554
- /* Swap active buffer */
555
- prof->active_idx ^= 1;
556
- prof->swap_ready = 1;
639
+ /* Swap active buffer: release ensures buffer writes are visible to worker */
640
+ atomic_store_explicit(&prof->active_idx, idx ^ 1, memory_order_release);
557
641
 
558
- /* Wake worker thread */
642
+ /* Set swap_ready under mutex and signal, preventing lost wakeup:
643
+ * the worker checks swap_ready while holding the same mutex. */
644
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
645
+ atomic_store_explicit(&prof->swap_ready, 1, memory_order_release);
559
646
  CHECKED(pthread_cond_signal(&prof->worker_cond));
647
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
560
648
  }
561
649
 
562
- static void
563
- rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
564
- int64_t weight, int type, int thread_seq)
650
+ /* Write a sample into a specific buffer. No swap check. */
651
+ static int
652
+ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
653
+ int64_t weight, int type, int thread_seq, int label_set_id)
565
654
  {
566
- if (weight <= 0) return;
567
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
568
- if (rperf_ensure_sample_capacity(buf) < 0) return;
655
+ if (weight <= 0) return 0;
656
+ if (rperf_ensure_sample_capacity(buf) < 0) return -1;
569
657
 
570
658
  rperf_sample_t *sample = &buf->samples[buf->sample_count];
571
659
  sample->depth = depth;
@@ -573,8 +661,17 @@ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
573
661
  sample->weight = weight;
574
662
  sample->type = type;
575
663
  sample->thread_seq = thread_seq;
664
+ sample->label_set_id = label_set_id;
576
665
  buf->sample_count++;
666
+ return 0;
667
+ }
577
668
 
669
+ static void
670
+ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
671
+ int64_t weight, int type, int thread_seq, int label_set_id)
672
+ {
673
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
674
+ rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
578
675
  rperf_try_swap(prof);
579
676
  }
580
677
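The swap path above now sets swap_ready and signals while holding worker_mutex, and the worker (in a later hunk) re-checks the flag in a loop around pthread_cond_wait; together these close the lost-wakeup window and tolerate spurious wakeups. A self-contained sketch of that producer/consumer handshake, using illustrative names rather than the gem's:

    #include <pthread.h>
    #include <stdatomic.h>

    static pthread_mutex_t mtx  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static _Atomic int ready   = 0;
    static _Atomic int running = 1;

    /* Producer: flip the flag and signal under the mutex, so the consumer
     * cannot test the flag and block between our store and our signal. */
    static void publish(void) {
        pthread_mutex_lock(&mtx);
        atomic_store_explicit(&ready, 1, memory_order_release);
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&mtx);
    }

    /* Consumer: predicate loop around the wait; spurious wakeups are harmless. */
    static void *consumer(void *arg) {
        pthread_mutex_lock(&mtx);
        while (atomic_load_explicit(&running, memory_order_relaxed)) {
            while (atomic_load_explicit(&running, memory_order_relaxed) &&
                   !atomic_load_explicit(&ready, memory_order_acquire))
                pthread_cond_wait(&cond, &mtx);
            if (atomic_load_explicit(&ready, memory_order_acquire)) {
                /* ... drain the standby buffer here ... */
                atomic_store_explicit(&ready, 0, memory_order_release);
            }
        }
        pthread_mutex_unlock(&mtx);
        return arg;
    }

    int main(void) {
        pthread_t worker;
        pthread_create(&worker, NULL, consumer, NULL);
        publish();                    /* hand work to the worker */
        pthread_mutex_lock(&mtx);     /* shutdown follows the same locked pattern */
        atomic_store_explicit(&running, 0, memory_order_relaxed);
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&mtx);
        pthread_join(worker, NULL);
        return 0;
    }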
 
@@ -586,7 +683,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
586
683
  {
587
684
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
588
685
  if (!td) return NULL;
589
- td->prev_cpu_ns = rperf_current_time_ns(prof, td);
686
+ td->prev_time_ns = rperf_current_time_ns(prof, td);
590
687
  td->prev_wall_ns = rperf_wall_time_ns();
591
688
  td->thread_seq = ++prof->next_thread_seq;
592
689
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -596,12 +693,11 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
596
693
  /* ---- Thread event hooks ---- */
597
694
 
598
695
  static void
599
- rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
696
+ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
600
697
  {
601
698
  /* Has GVL — safe to call Ruby APIs */
602
699
  int64_t wall_now = rperf_wall_time_ns();
603
700
 
604
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
605
701
  int is_first = 0;
606
702
 
607
703
  if (td == NULL) {
@@ -614,7 +710,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
614
710
  if (time_now < 0) return;
615
711
 
616
712
  /* Capture backtrace into active buffer's frame_pool */
617
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
713
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
618
714
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
619
715
  size_t frame_start = buf->frame_pool_count;
620
716
  int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
@@ -624,34 +720,29 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
624
720
 
625
721
  /* Record normal sample (skip if first time — no prev_time) */
626
722
  if (!is_first) {
627
- int64_t weight = time_now - td->prev_cpu_ns;
628
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
723
+ int64_t weight = time_now - td->prev_time_ns;
724
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
629
725
  }
630
726
 
631
- /* Save stack and timestamp for READY/RESUMED */
727
+ /* Save timestamp for READY/RESUMED */
632
728
  td->suspended_at_ns = wall_now;
633
- td->suspended_frame_start = frame_start;
634
- td->suspended_frame_depth = depth;
635
- td->prev_cpu_ns = time_now;
729
+ td->prev_time_ns = time_now;
636
730
  td->prev_wall_ns = wall_now;
637
731
  }
638
732
 
639
733
  static void
640
- rperf_handle_ready(rperf_profiler_t *prof, VALUE thread)
734
+ rperf_handle_ready(rperf_thread_data_t *td)
641
735
  {
642
736
  /* May NOT have GVL — only simple C operations allowed */
643
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
644
737
  if (!td) return;
645
738
 
646
739
  td->ready_at_ns = rperf_wall_time_ns();
647
740
  }
648
741
 
649
742
  static void
650
- rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
743
+ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
651
744
  {
652
745
  /* Has GVL */
653
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
654
-
655
746
  if (td == NULL) {
656
747
  td = rperf_thread_data_create(prof, thread);
657
748
  if (!td) return;
@@ -659,36 +750,52 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
659
750
 
660
751
  int64_t wall_now = rperf_wall_time_ns();
661
752
 
662
- /* Record GVL blocked/wait samples (wall mode only) */
663
- if (prof->mode == 1 && td->suspended_frame_depth > 0) {
753
+ /* Record GVL blocked/wait samples (wall mode only).
754
+ * Capture backtrace here (not at SUSPENDED) so that frame_start always
755
+ * indexes into the current active buffer, avoiding mismatch after a
756
+ * double-buffer swap. The Ruby stack is unchanged while off-GVL.
757
+ *
758
+ * Both samples are written directly into the same buffer before calling
759
+ * rperf_try_swap, so that a swap triggered by the first sample cannot
760
+ * move the second into a different buffer with a stale frame_start. */
761
+ if (prof->mode == 1 && td->suspended_at_ns > 0) {
762
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
763
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
764
+ size_t frame_start = buf->frame_pool_count;
765
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
766
+ &buf->frame_pool[frame_start], NULL);
767
+ if (depth <= 0) goto skip_gvl;
768
+ buf->frame_pool_count += depth;
769
+
770
+ /* Write both samples into the same buf, then swap-check once */
664
771
  if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
665
772
  int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
666
- rperf_record_sample(prof, td->suspended_frame_start,
667
- td->suspended_frame_depth, blocked_ns,
668
- RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
773
+ rperf_write_sample(buf, frame_start, depth, blocked_ns,
774
+ RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
669
775
  }
670
776
  if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
671
777
  int64_t wait_ns = wall_now - td->ready_at_ns;
672
- rperf_record_sample(prof, td->suspended_frame_start,
673
- td->suspended_frame_depth, wait_ns,
674
- RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
778
+ rperf_write_sample(buf, frame_start, depth, wait_ns,
779
+ RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
675
780
  }
781
+
782
+ rperf_try_swap(prof);
676
783
  }
784
+ skip_gvl:
677
785
 
678
786
  /* Reset prev times to current — next timer sample measures from resume */
679
787
  int64_t time_now = rperf_current_time_ns(prof, td);
680
- if (time_now >= 0) td->prev_cpu_ns = time_now;
788
+ if (time_now >= 0) td->prev_time_ns = time_now;
681
789
  td->prev_wall_ns = wall_now;
682
790
 
683
791
  /* Clear suspended state */
684
- td->suspended_frame_depth = 0;
792
+ td->suspended_at_ns = 0;
685
793
  td->ready_at_ns = 0;
686
794
  }
687
795
 
688
796
  static void
689
- rperf_handle_exited(rperf_profiler_t *prof, VALUE thread)
797
+ rperf_handle_exited(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
690
798
  {
691
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
692
799
  if (td) {
693
800
  free(td);
694
801
  rb_internal_thread_specific_set(thread, prof->ts_key, NULL);
@@ -702,15 +809,16 @@ rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_da
702
809
  if (!prof->running) return;
703
810
 
704
811
  VALUE thread = data->thread;
812
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
705
813
 
706
814
  if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
707
- rperf_handle_suspended(prof, thread);
815
+ rperf_handle_suspended(prof, thread, td);
708
816
  else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
709
- rperf_handle_ready(prof, thread);
817
+ rperf_handle_ready(td);
710
818
  else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
711
- rperf_handle_resumed(prof, thread);
819
+ rperf_handle_resumed(prof, thread, td);
712
820
  else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
713
- rperf_handle_exited(prof, thread);
821
+ rperf_handle_exited(prof, thread, td);
714
822
  }
715
823
 
716
824
  /* ---- GC event hook ---- */
@@ -722,50 +830,53 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
722
830
  if (!prof->running) return;
723
831
 
724
832
  if (event & RUBY_INTERNAL_EVENT_GC_START) {
725
- prof->gc_phase = RPERF_GC_MARKING;
833
+ prof->gc.phase = RPERF_GC_MARKING;
726
834
  }
727
835
  else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
728
- prof->gc_phase = RPERF_GC_SWEEPING;
836
+ prof->gc.phase = RPERF_GC_SWEEPING;
729
837
  }
730
838
  else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
731
- prof->gc_phase = RPERF_GC_NONE;
839
+ prof->gc.phase = RPERF_GC_NONE;
732
840
  }
733
841
  else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
734
- /* Capture backtrace and timestamp at GC entry */
735
- prof->gc_enter_ns = rperf_wall_time_ns();
736
-
737
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
738
- if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
739
- size_t frame_start = buf->frame_pool_count;
740
- int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
741
- &buf->frame_pool[frame_start], NULL);
742
- if (depth <= 0) {
743
- prof->gc_frame_depth = 0;
744
- return;
745
- }
746
- buf->frame_pool_count += depth;
747
- prof->gc_frame_start = frame_start;
748
- prof->gc_frame_depth = depth;
749
-
750
- /* Save thread_seq for the GC_EXIT sample */
842
+ /* Save timestamp, thread_seq, and label_set_id; backtrace is captured at GC_EXIT
843
+ * to avoid buffer mismatch after a double-buffer swap. */
844
+ prof->gc.enter_ns = rperf_wall_time_ns();
751
845
  {
752
846
  VALUE thread = rb_thread_current();
753
847
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
754
- prof->gc_thread_seq = td ? td->thread_seq : 0;
848
+ prof->gc.thread_seq = td ? td->thread_seq : 0;
849
+ prof->gc.label_set_id = td ? td->label_set_id : 0;
755
850
  }
756
851
  }
757
852
  else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
758
- if (prof->gc_frame_depth <= 0) return;
853
+ if (prof->gc.enter_ns <= 0) return;
759
854
 
760
855
  int64_t wall_now = rperf_wall_time_ns();
761
- int64_t weight = wall_now - prof->gc_enter_ns;
762
- int type = (prof->gc_phase == RPERF_GC_SWEEPING)
856
+ int64_t weight = wall_now - prof->gc.enter_ns;
857
+ int type = (prof->gc.phase == RPERF_GC_SWEEPING)
763
858
  ? RPERF_SAMPLE_GC_SWEEPING
764
859
  : RPERF_SAMPLE_GC_MARKING;
765
860
 
766
- rperf_record_sample(prof, prof->gc_frame_start,
767
- prof->gc_frame_depth, weight, type, prof->gc_thread_seq);
768
- prof->gc_frame_depth = 0;
861
+ /* Capture backtrace here (not at GC_ENTER) so that frame_start
862
+ * always indexes into the current active buffer. The Ruby stack
863
+ * is unchanged during GC. */
864
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
865
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) {
866
+ prof->gc.enter_ns = 0;
867
+ return;
868
+ }
869
+ size_t frame_start = buf->frame_pool_count;
870
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
871
+ &buf->frame_pool[frame_start], NULL);
872
+ if (depth <= 0) {
873
+ prof->gc.enter_ns = 0;
874
+ return;
875
+ }
876
+ buf->frame_pool_count += depth;
877
+
878
+ rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
879
+ prof->gc.enter_ns = 0;
769
880
  }
770
881
  }
771
882
 
@@ -795,14 +906,14 @@ rperf_sample_job(void *arg)
795
906
  int64_t time_now = rperf_current_time_ns(prof, td);
796
907
  if (time_now < 0) return;
797
908
 
798
- int64_t weight = time_now - td->prev_cpu_ns;
799
- td->prev_cpu_ns = time_now;
909
+ int64_t weight = time_now - td->prev_time_ns;
910
+ td->prev_time_ns = time_now;
800
911
  td->prev_wall_ns = rperf_wall_time_ns();
801
912
 
802
913
  if (weight <= 0) return;
803
914
 
804
915
  /* Capture backtrace and record sample */
805
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
916
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
806
917
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
807
918
 
808
919
  size_t frame_start = buf->frame_pool_count;
@@ -811,11 +922,11 @@ rperf_sample_job(void *arg)
811
922
  if (depth <= 0) return;
812
923
  buf->frame_pool_count += depth;
813
924
 
814
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
925
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
815
926
 
816
927
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
817
- prof->sampling_count++;
818
- prof->sampling_total_ns +=
928
+ prof->stats.sampling_count++;
929
+ prof->stats.sampling_total_ns +=
819
930
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
820
931
  (ts_end.tv_nsec - ts_start.tv_nsec);
821
932
  }
@@ -826,7 +937,7 @@ rperf_sample_job(void *arg)
826
937
  static void
827
938
  rperf_signal_handler(int sig)
828
939
  {
829
- g_profiler.trigger_count++;
940
+ g_profiler.stats.trigger_count++;
830
941
  rb_postponed_job_trigger(g_profiler.pj_handle);
831
942
  }
832
943
 
@@ -845,7 +956,8 @@ rperf_worker_signal_func(void *arg)
845
956
  CHECKED(pthread_cond_signal(&prof->worker_cond));
846
957
 
847
958
  while (prof->running) {
848
- CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
959
+ while (prof->running && !atomic_load_explicit(&prof->swap_ready, memory_order_acquire))
960
+ CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
849
961
  rperf_try_aggregate(prof);
850
962
  }
851
963
  CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
@@ -874,9 +986,12 @@ rperf_worker_nanosleep_func(void *arg)
874
986
  CHECKED(pthread_mutex_lock(&prof->worker_mutex));
875
987
  while (prof->running) {
876
988
  int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
877
- assert(ret == 0 || ret == ETIMEDOUT);
989
+ if (ret != 0 && ret != ETIMEDOUT) {
990
+ fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
991
+ abort();
992
+ }
878
993
  if (ret == ETIMEDOUT) {
879
- prof->trigger_count++;
994
+ prof->stats.trigger_count++;
880
995
  rb_postponed_job_trigger(prof->pj_handle);
881
996
  /* Advance deadline by interval */
882
997
  deadline.tv_nsec += interval_ns;
@@ -900,66 +1015,117 @@ rperf_resolve_frame(VALUE fval)
900
1015
  VALUE label = rb_profile_frame_full_label(fval);
901
1016
 
902
1017
  if (NIL_P(path)) path = rb_str_new_lit("<C method>");
903
-
904
- if (NIL_P(path)) path = rb_str_new_cstr("");
905
1018
  if (NIL_P(label)) label = rb_str_new_cstr("");
906
1019
 
907
1020
  return rb_ary_new3(2, path, label);
908
1021
  }
909
1022
 
910
- /* ---- Ruby API ---- */
1023
+ /* ---- Shared helpers for stop/snapshot ---- */
911
1024
 
1025
+ /* Flush pending sample buffers into agg_table.
1026
+ * Caller must ensure no concurrent access (worker joined or mutex held). */
1027
+ static void
1028
+ rperf_flush_buffers(rperf_profiler_t *prof)
1029
+ {
1030
+ int cur_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire);
1031
+ if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) {
1032
+ int standby_idx = cur_idx ^ 1;
1033
+ rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
1034
+ atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
1035
+ }
1036
+ rperf_aggregate_buffer(prof, &prof->buffers[cur_idx]);
1037
+ }
1038
+
1039
+ /* Build result hash from aggregated data (agg_table + frame_table).
1040
+ * Does NOT free any resources. Caller must hold GVL. */
912
1041
  static VALUE
913
- rb_rperf_start(int argc, VALUE *argv, VALUE self)
1042
+ rperf_build_aggregated_result(rperf_profiler_t *prof)
914
1043
  {
915
- VALUE opts;
916
- int frequency = 1000;
917
- int mode = 0; /* 0 = cpu, 1 = wall */
918
- int aggregate = 1; /* default: aggregate */
919
- #if RPERF_USE_TIMER_SIGNAL
920
- int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
921
- #endif
1044
+ VALUE result, samples_ary;
1045
+ size_t i;
1046
+ int j;
922
1047
 
923
- rb_scan_args(argc, argv, ":", &opts);
924
- if (!NIL_P(opts)) {
925
- VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
926
- if (!NIL_P(vagg)) {
927
- aggregate = RTEST(vagg) ? 1 : 0;
928
- }
929
- VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
930
- if (!NIL_P(vfreq)) {
931
- frequency = NUM2INT(vfreq);
932
- if (frequency <= 0 || frequency > 1000000) {
933
- rb_raise(rb_eArgError, "frequency must be between 1 and 1000000");
934
- }
1048
+ result = rb_hash_new();
1049
+
1050
+ rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1051
+ ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
1052
+ rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
1053
+ rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
1054
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
1055
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
1056
+ rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
1057
+ rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1058
+ SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
1059
+ rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
1060
+ SIZET2NUM(prof->agg_table.count));
1061
+
1062
+ {
1063
+ struct timespec now_monotonic;
1064
+ int64_t start_ns, duration_ns;
1065
+ clock_gettime(CLOCK_MONOTONIC, &now_monotonic);
1066
+ start_ns = (int64_t)prof->start_realtime.tv_sec * 1000000000LL
1067
+ + (int64_t)prof->start_realtime.tv_nsec;
1068
+ duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
1069
+ + ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
1070
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1071
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1072
+ }
1073
+
1074
+ {
1075
+ rperf_frame_table_t *ft = &prof->frame_table;
1076
+ VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
1077
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
1078
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
1079
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
1080
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
1081
+ for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
1082
+ rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
935
1083
  }
936
- VALUE vmode = rb_hash_aref(opts, ID2SYM(rb_intern("mode")));
937
- if (!NIL_P(vmode)) {
938
- ID mode_id = SYM2ID(vmode);
939
- if (mode_id == rb_intern("cpu")) {
940
- mode = 0;
941
- } else if (mode_id == rb_intern("wall")) {
942
- mode = 1;
943
- } else {
944
- rb_raise(rb_eArgError, "mode must be :cpu or :wall");
1084
+
1085
+ rperf_agg_table_t *at = &prof->agg_table;
1086
+ samples_ary = rb_ary_new();
1087
+ for (i = 0; i < at->bucket_capacity; i++) {
1088
+ rperf_agg_entry_t *e = &at->buckets[i];
1089
+ if (!e->used) continue;
1090
+
1091
+ VALUE frames = rb_ary_new_capa(e->depth);
1092
+ for (j = 0; j < e->depth; j++) {
1093
+ uint32_t fid = at->stack_pool[e->frame_start + j];
1094
+ rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
945
1095
  }
1096
+
1097
+ VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
1098
+ rb_ary_push(samples_ary, sample);
946
1099
  }
1100
+ }
1101
+
1102
+ rb_hash_aset(result, ID2SYM(rb_intern("aggregated_samples")), samples_ary);
1103
+
1104
+ if (prof->label_sets != Qnil) {
1105
+ rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), prof->label_sets);
1106
+ }
1107
+
1108
+ return result;
1109
+ }
1110
+
1111
+ /* ---- Ruby API ---- */
1112
+
1113
+ /* _c_start(frequency, mode, aggregate, signal)
1114
+ * frequency: Integer (Hz)
1115
+ * mode: 0 = cpu, 1 = wall
1116
+ * aggregate: 0 or 1
1117
+ * signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
1118
+ */
1119
+ static VALUE
1120
+ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
1121
+ {
1122
+ int frequency = NUM2INT(vfreq);
1123
+ int mode = NUM2INT(vmode);
1124
+ int aggregate = RTEST(vagg) ? 1 : 0;
947
1125
  #if RPERF_USE_TIMER_SIGNAL
948
- VALUE vsig = rb_hash_aref(opts, ID2SYM(rb_intern("signal")));
949
- if (!NIL_P(vsig)) {
950
- if (RTEST(vsig)) {
951
- timer_signal = NUM2INT(vsig);
952
- if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
953
- rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
954
- SIGRTMIN, SIGRTMAX);
955
- }
956
- } else {
957
- /* signal: false or signal: 0 → use nanosleep thread */
958
- timer_signal = 0;
959
- }
960
- }
1126
+ int sig = NUM2INT(vsig);
1127
+ int timer_signal = (sig < 0) ? RPERF_TIMER_SIGNAL_DEFAULT : sig;
961
1128
  #endif
962
- }
963
1129
 
964
1130
  if (g_profiler.running) {
965
1131
  rb_raise(rb_eRuntimeError, "Rperf is already running");
@@ -969,11 +1135,12 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
969
1135
  g_profiler.mode = mode;
970
1136
  g_profiler.aggregate = aggregate;
971
1137
  g_profiler.next_thread_seq = 0;
972
- g_profiler.sampling_count = 0;
973
- g_profiler.sampling_total_ns = 0;
974
- g_profiler.trigger_count = 0;
975
- g_profiler.active_idx = 0;
976
- g_profiler.swap_ready = 0;
1138
+ g_profiler.stats.sampling_count = 0;
1139
+ g_profiler.stats.sampling_total_ns = 0;
1140
+ g_profiler.stats.trigger_count = 0;
1141
+ atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
1142
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1143
+ g_profiler.label_sets = Qnil;
977
1144
 
978
1145
  /* Initialize worker mutex/cond */
979
1146
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
@@ -994,13 +1161,26 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
994
1161
  }
995
1162
 
996
1163
  /* Initialize aggregation structures */
997
- rperf_frame_table_init(&g_profiler.frame_table);
998
- rperf_agg_table_init(&g_profiler.agg_table);
1164
+ if (rperf_frame_table_init(&g_profiler.frame_table) < 0) {
1165
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1166
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1167
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1168
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1169
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate frame table");
1170
+ }
1171
+ if (rperf_agg_table_init(&g_profiler.agg_table) < 0) {
1172
+ rperf_frame_table_free(&g_profiler.frame_table);
1173
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1174
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1175
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1176
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1177
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate aggregation table");
1178
+ }
999
1179
  }
1000
1180
 
1001
1181
  /* Register GC event hook */
1002
- g_profiler.gc_phase = RPERF_GC_NONE;
1003
- g_profiler.gc_frame_depth = 0;
1182
+ g_profiler.gc.phase = RPERF_GC_NONE;
1183
+ g_profiler.gc.enter_ns = 0;
1004
1184
  rb_add_event_hook(rperf_gc_event_hook,
1005
1185
  RUBY_INTERNAL_EVENT_GC_START |
1006
1186
  RUBY_INTERNAL_EVENT_GC_END_MARK |
@@ -1023,6 +1203,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1023
1203
  VALUE cur_thread = rb_thread_current();
1024
1204
  rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
1025
1205
  if (!td) {
1206
+ rb_remove_event_hook(rperf_gc_event_hook);
1026
1207
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1027
1208
  g_profiler.thread_hook = NULL;
1028
1209
  if (g_profiler.aggregate) {
@@ -1053,14 +1234,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1053
1234
  memset(&sa, 0, sizeof(sa));
1054
1235
  sa.sa_handler = rperf_signal_handler;
1055
1236
  sa.sa_flags = SA_RESTART;
1056
- sigaction(g_profiler.timer_signal, &sa, NULL);
1237
+ if (sigaction(g_profiler.timer_signal, &sa, &g_profiler.old_sigaction) != 0) {
1238
+ g_profiler.running = 0;
1239
+ goto timer_fail;
1240
+ }
1057
1241
 
1058
1242
  /* Start worker thread first to get its kernel TID */
1059
1243
  g_profiler.worker_tid = 0;
1060
1244
  if (pthread_create(&g_profiler.worker_thread, NULL,
1061
1245
  rperf_worker_signal_func, &g_profiler) != 0) {
1062
1246
  g_profiler.running = 0;
1063
- signal(g_profiler.timer_signal, SIG_DFL);
1247
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1064
1248
  goto timer_fail;
1065
1249
  }
1066
1250
 
@@ -1078,7 +1262,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1078
1262
  sev._sigev_un._tid = g_profiler.worker_tid;
1079
1263
  if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
1080
1264
  g_profiler.running = 0;
1081
- signal(g_profiler.timer_signal, SIG_DFL);
1265
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1082
1266
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1083
1267
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1084
1268
  goto timer_fail;
@@ -1087,7 +1271,14 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1087
1271
  its.it_value.tv_sec = 0;
1088
1272
  its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
1089
1273
  its.it_interval = its.it_value;
1090
- timer_settime(g_profiler.timer_id, 0, &its, NULL);
1274
+ if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
1275
+ timer_delete(g_profiler.timer_id);
1276
+ g_profiler.running = 0;
1277
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1278
+ CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1279
+ CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1280
+ goto timer_fail;
1281
+ }
1091
1282
  } else
1092
1283
  #endif
1093
1284
  {
@@ -1109,6 +1300,7 @@ timer_fail:
1109
1300
  rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
1110
1301
  }
1111
1302
  }
1303
+ rb_remove_event_hook(rperf_gc_event_hook);
1112
1304
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1113
1305
  g_profiler.thread_hook = NULL;
1114
1306
  if (g_profiler.aggregate) {
@@ -1139,17 +1331,28 @@ rb_rperf_stop(VALUE self)
1139
1331
  g_profiler.running = 0;
1140
1332
  #if RPERF_USE_TIMER_SIGNAL
1141
1333
  if (g_profiler.timer_signal > 0) {
1334
+ /* Delete timer first to stop generating new signals.
1335
+ * Do NOT restore signal handler yet — the worker thread may still have
1336
+ * pending timer signals. rperf_signal_handler handles them harmlessly. */
1142
1337
  timer_delete(g_profiler.timer_id);
1143
- signal(g_profiler.timer_signal, SIG_IGN);
1144
1338
  }
1145
1339
  #endif
1146
1340
 
1147
- /* Wake and join worker thread */
1341
+ /* Wake and join worker thread.
1342
+ * Any pending timer signals are still handled by rperf_signal_handler
1343
+ * (just increments trigger_count + calls rb_postponed_job_trigger). */
1148
1344
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1149
1345
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1150
1346
  CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1151
1347
  CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1152
1348
 
1349
+ #if RPERF_USE_TIMER_SIGNAL
1350
+ if (g_profiler.timer_signal > 0) {
1351
+ /* Worker thread is gone — safe to restore old signal handler now. */
1352
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1353
+ }
1354
+ #endif
1355
+
1153
1356
  if (g_profiler.thread_hook) {
1154
1357
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1155
1358
  g_profiler.thread_hook = NULL;
@@ -1159,13 +1362,8 @@ rb_rperf_stop(VALUE self)
1159
1362
  rb_remove_event_hook(rperf_gc_event_hook);
1160
1363
 
1161
1364
  if (g_profiler.aggregate) {
1162
- /* Aggregate remaining samples from both buffers */
1163
- if (g_profiler.swap_ready) {
1164
- int standby_idx = g_profiler.active_idx ^ 1;
1165
- rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
1166
- g_profiler.swap_ready = 0;
1167
- }
1168
- rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
1365
+ /* Worker thread is joined; no concurrent access. */
1366
+ rperf_flush_buffers(&g_profiler);
1169
1367
  }
1170
1368
 
1171
1369
  /* Clean up thread-specific data for all live threads */
@@ -1183,72 +1381,8 @@ rb_rperf_stop(VALUE self)
1183
1381
  }
1184
1382
  }
1185
1383
 
1186
- /* Build result hash */
1187
- result = rb_hash_new();
1188
-
1189
- /* mode */
1190
- rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1191
- ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
1192
-
1193
- /* frequency */
1194
- rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1195
-
1196
- /* trigger_count, sampling_count, sampling_time_ns */
1197
- rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
1198
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
1199
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
1200
-
1201
- /* aggregation stats */
1202
- if (g_profiler.aggregate) {
1203
- rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1204
- SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
1205
- rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
1206
- SIZET2NUM(g_profiler.agg_table.count));
1207
- }
1208
-
1209
- /* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
1210
- {
1211
- struct timespec stop_monotonic;
1212
- int64_t start_ns, duration_ns;
1213
- clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
1214
- start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
1215
- + (int64_t)g_profiler.start_realtime.tv_nsec;
1216
- duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
1217
- + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
1218
- rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1219
- rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1220
- }
1221
-
1222
1384
  if (g_profiler.aggregate) {
1223
- /* Build samples from aggregation table.
1224
- * Use a Ruby array for resolved frames so GC protects them. */
1225
- rperf_frame_table_t *ft = &g_profiler.frame_table;
1226
- VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
1227
- /* Synthetic frames */
1228
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
1229
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
1230
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
1231
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
1232
- /* Real frames */
1233
- for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
1234
- rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
1235
- }
1236
-
1237
- rperf_agg_table_t *at = &g_profiler.agg_table;
1238
- samples_ary = rb_ary_new();
1239
- for (i = 0; i < at->bucket_capacity; i++) {
1240
- rperf_agg_entry_t *e = &at->buckets[i];
1241
- if (!e->used) continue;
1242
-
1243
- VALUE frames = rb_ary_new_capa(e->depth);
1244
- for (j = 0; j < e->depth; j++) {
1245
- uint32_t fid = at->stack_pool[e->frame_start + j];
1246
- rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
1247
- }
1248
-
1249
- VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
1250
- rb_ary_push(samples_ary, sample);
1251
- }
1385
+ result = rperf_build_aggregated_result(&g_profiler);
1252
1386
 
1253
1387
  rperf_sample_buffer_free(&g_profiler.buffers[1]);
1254
1388
  rperf_frame_table_free(&g_profiler.frame_table);
@@ -1256,6 +1390,27 @@ rb_rperf_stop(VALUE self)
1256
1390
  } else {
1257
1391
  /* Raw samples path (aggregate: false) */
1258
1392
  rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
1393
+
1394
+ result = rb_hash_new();
1395
+ rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1396
+ ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
1397
+ rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1398
+ rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
1399
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
1400
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
1401
+ rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
1402
+ {
1403
+ struct timespec stop_monotonic;
1404
+ int64_t start_ns, duration_ns;
1405
+ clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
1406
+ start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
1407
+ + (int64_t)g_profiler.start_realtime.tv_nsec;
1408
+ duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
1409
+ + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
1410
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1411
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1412
+ }
1413
+
1259
1414
  samples_ary = rb_ary_new_capa((long)buf->sample_count);
1260
1415
  for (i = 0; i < buf->sample_count; i++) {
1261
1416
  rperf_sample_t *s = &buf->samples[i];
@@ -1281,11 +1436,14 @@ rb_rperf_stop(VALUE self)
1281
1436
  rb_ary_push(frames, rperf_resolve_frame(fval));
1282
1437
  }
1283
1438
 
1284
- VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
1439
+ VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
1285
1440
  rb_ary_push(samples_ary, sample);
1286
1441
  }
1442
+ rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
1443
+ if (g_profiler.label_sets != Qnil) {
1444
+ rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), g_profiler.label_sets);
1445
+ }
1287
1446
  }
1288
- rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
1289
1447
 
1290
1448
  /* Cleanup */
1291
1449
  rperf_sample_buffer_free(&g_profiler.buffers[0]);
@@ -1293,6 +1451,113 @@ rb_rperf_stop(VALUE self)
1293
1451
  return result;
1294
1452
  }
1295
1453
 
1454
+ /* ---- Snapshot: read aggregated data without stopping ---- */
1455
+
1456
+ /* Clear aggregated data for the next interval.
1457
+ * Caller must hold GVL + worker_mutex.
1458
+ * Keeps allocations intact for reuse. Does NOT touch frame_table
1459
+ * (frame IDs must stay stable — dmark may be iterating keys outside GVL,
1460
+ * and existing threads reference frame IDs via their thread_data). */
1461
+ static void
1462
+ rperf_clear_aggregated_data(rperf_profiler_t *prof)
1463
+ {
1464
+ /* Clear agg_table entries (keep allocation) */
1465
+ memset(prof->agg_table.buckets, 0,
1466
+ prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t));
1467
+ prof->agg_table.count = 0;
1468
+ prof->agg_table.stack_pool_count = 0;
1469
+
1470
+ /* Reset stats */
1471
+ prof->stats.trigger_count = 0;
1472
+ prof->stats.sampling_count = 0;
1473
+ prof->stats.sampling_total_ns = 0;
1474
+
1475
+ /* Reset start timestamps so next snapshot's duration_ns covers
1476
+ * only the period since this clear. */
1477
+ clock_gettime(CLOCK_REALTIME, &prof->start_realtime);
1478
+ clock_gettime(CLOCK_MONOTONIC, &prof->start_monotonic);
1479
+ }
1480
+
1481
+ static VALUE
1482
+ rb_rperf_snapshot(VALUE self, VALUE vclear)
1483
+ {
1484
+ VALUE result;
1485
+
1486
+ if (!g_profiler.running) {
1487
+ return Qnil;
1488
+ }
1489
+
1490
+ if (!g_profiler.aggregate) {
1491
+ rb_raise(rb_eRuntimeError, "snapshot requires aggregate mode (aggregate: true)");
1492
+ }
1493
+
1494
+ /* GVL is held → no postponed jobs fire → no new samples written.
1495
+ * Lock worker_mutex to pause worker thread's aggregation. */
1496
+ CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
1497
+ rperf_flush_buffers(&g_profiler);
1498
+
1499
+ /* Build result while mutex is held. If clear is requested, we must
1500
+ * also clear under the same lock to avoid a window where the worker
1501
+ * could aggregate into the table between build and clear. */
1502
+ result = rperf_build_aggregated_result(&g_profiler);
1503
+
1504
+ if (RTEST(vclear)) {
1505
+ rperf_clear_aggregated_data(&g_profiler);
1506
+ }
1507
+
1508
+ CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
1509
+
1510
+ return result;
1511
+ }
1512
+
1513
+ /* ---- Label API ---- */
1514
+
1515
+ /* _c_set_label(label_set_id) — set current thread's label_set_id.
1516
+ * Called from Ruby with GVL held. */
1517
+ static VALUE
1518
+ rb_rperf_set_label(VALUE self, VALUE vid)
1519
+ {
1520
+ if (!g_profiler.running) return vid;
1521
+
1522
+ int label_set_id = NUM2INT(vid);
1523
+ VALUE thread = rb_thread_current();
1524
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
1525
+ if (td == NULL) {
1526
+ td = rperf_thread_data_create(&g_profiler, thread);
1527
+ if (!td) rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
1528
+ }
1529
+ td->label_set_id = label_set_id;
1530
+ return vid;
1531
+ }
1532
+
1533
+ /* _c_get_label() — get current thread's label_set_id.
1534
+ * Returns 0 if not profiling or thread not yet seen. */
1535
+ static VALUE
1536
+ rb_rperf_get_label(VALUE self)
1537
+ {
1538
+ if (!g_profiler.running) return INT2FIX(0);
1539
+
1540
+ VALUE thread = rb_thread_current();
1541
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
1542
+ if (td == NULL) return INT2FIX(0);
1543
+ return INT2NUM(td->label_set_id);
1544
+ }
1545
+
1546
+ /* _c_set_label_sets(ary) — store label_sets Ruby Array for result building */
1547
+ static VALUE
1548
+ rb_rperf_set_label_sets(VALUE self, VALUE ary)
1549
+ {
1550
+ g_profiler.label_sets = ary;
1551
+ return ary;
1552
+ }
1553
+
1554
+ /* _c_get_label_sets() — get label_sets Ruby Array */
1555
+ static VALUE
1556
+ rb_rperf_get_label_sets(VALUE self)
1557
+ {
1558
+ return g_profiler.label_sets;
1559
+ }
1560
+
1296
1561
  /* ---- Fork safety ---- */
1297
1562
 
1298
1563
  static void
@@ -1304,9 +1569,20 @@ rperf_after_fork_child(void)
1304
1569
  g_profiler.running = 0;
1305
1570
 
1306
1571
  #if RPERF_USE_TIMER_SIGNAL
1307
- /* timer_create timers are not inherited across fork; reset signal handler */
1572
+ /* timer_create timers are not inherited across fork, but pending signals may be.
1573
+ * Block the signal, drain any pending instances, then restore old handler. */
1308
1574
  if (g_profiler.timer_signal > 0) {
1309
- signal(g_profiler.timer_signal, SIG_DFL);
1575
+ sigset_t block_set, old_set;
1576
+ struct timespec zero_ts = {0, 0};
1577
+
1578
+ sigemptyset(&block_set);
1579
+ sigaddset(&block_set, g_profiler.timer_signal);
1580
+ pthread_sigmask(SIG_BLOCK, &block_set, &old_set);
1581
+
1582
+ while (sigtimedwait(&block_set, NULL, &zero_ts) > 0) {}
1583
+
1584
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1585
+ pthread_sigmask(SIG_SETMASK, &old_set, NULL);
1310
1586
  }
1311
1587
  #endif
1312
1588
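The post-fork cleanup above blocks the timer signal, drains any instances queued before the fork with a zero-timeout sigtimedwait, and only then restores the saved handler and mask. A small standalone sketch of that drain-and-restore sequence; signo and old_act stand in for the profiler's timer_signal and old_sigaction and this is not the gem's code.

    #define _POSIX_C_SOURCE 200809L
    #include <signal.h>
    #include <time.h>

    /* Consume every pending instance of `signo` without running a handler,
     * then put back the previously saved disposition and signal mask. */
    void drain_and_restore(int signo, const struct sigaction *old_act) {
        sigset_t block_set, old_set;
        struct timespec zero = {0, 0};

        sigemptyset(&block_set);
        sigaddset(&block_set, signo);
        pthread_sigmask(SIG_BLOCK, &block_set, &old_set);

        while (sigtimedwait(&block_set, NULL, &zero) > 0) { }

        sigaction(signo, old_act, NULL);
        pthread_sigmask(SIG_SETMASK, &old_set, NULL);
    }

Draining before restoring keeps a queued real-time signal from being delivered to whatever handler or default action the parent process had installed.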
 
@@ -1326,12 +1602,13 @@ rperf_after_fork_child(void)
1326
1602
  }
1327
1603
 
1328
1604
  /* Reset GC state */
1329
- g_profiler.gc_phase = 0;
1605
+ g_profiler.gc.phase = 0;
1606
+ g_profiler.gc.enter_ns = 0;
1330
1607
 
1331
1608
  /* Reset stats */
1332
- g_profiler.sampling_count = 0;
1333
- g_profiler.sampling_total_ns = 0;
1334
- g_profiler.swap_ready = 0;
1609
+ g_profiler.stats.sampling_count = 0;
1610
+ g_profiler.stats.sampling_total_ns = 0;
1611
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1335
1612
  }
1336
1613
 
1337
1614
  /* ---- Init ---- */
@@ -1340,10 +1617,16 @@ void
1340
1617
  Init_rperf(void)
1341
1618
  {
1342
1619
  VALUE mRperf = rb_define_module("Rperf");
1343
- rb_define_module_function(mRperf, "_c_start", rb_rperf_start, -1);
1620
+ rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
1344
1621
  rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
1622
+ rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
1623
+ rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
1624
+ rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
1625
+ rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
1626
+ rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
1345
1627
 
1346
1628
  memset(&g_profiler, 0, sizeof(g_profiler));
1629
+ g_profiler.label_sets = Qnil;
1347
1630
  g_profiler.pj_handle = rb_postponed_job_preregister(0, rperf_sample_job, &g_profiler);
1348
1631
  g_profiler.ts_key = rb_internal_thread_specific_key_create();
1349
1632