RubyGems - rperf - Versions diffs - 0.4.0 → 0.5.0 - Mend

rperf 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ab923fe1fc0a0d6928941271cdffc979012af73d6d0bd0aa5c5d43a95e9451c2
-  data.tar.gz: 74a0200ec71ae3743d2b99d578df0b484d23dea57285385209e23b0748a95564
+  metadata.gz: 3413c4c6ed0cdc0897428bf01fc0fec17a4d14f1c2883e9e5afa0cff110247dc
+  data.tar.gz: '097b06203ce4648a860f2816635d6dfac52f8e5987aa381653cec874d52abf7c'
 SHA512:
-  metadata.gz: b2d95c3e58fd883efebfcad8506a5249dee8c7322fb53e75a25afcd5050bbb1885fb620eef18a86e74fa54cb542ba83b1761f13630746862c619477e022b09db
-  data.tar.gz: ee4236170102e0be1cd13749389a29679ba7361c4c77db98ed708e9b509e8e1c28ba3b47a2d8a4b704ada2fe5a11859bc61c0476d8f061fbd43703168232f5f6
+  metadata.gz: 37065071f049a27eb1bab9f859ed39499022489a19aa8ecd91b3dc35cb6052ffb6b2fbc02c67ea46a94e8dba7644f2b23760d72d2dda7b998ccf3c61c304e225
+  data.tar.gz: 686ab430d58e5dd5163ae65a2bd330a76e57cf0dd72e7eac2b7c61621a03007bd724cac20b1e452766870ff33de325855e199bc3d873d004344f7b26b9b6614f

data/docs/help.md CHANGED Viewed

@@ -10,6 +10,7 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
     rperf record [options] command [args...]
     rperf stat [options] command [args...]
+    rperf exec [options] command [args...]
     rperf report [options] [file]
     rperf help
@@ -41,6 +42,20 @@ Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
 GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
 Use --report to add flat and cumulative top-50 function tables.
+### exec: Run command and print full profile report to stderr.
+Like `stat --report`. Uses wall mode by default. No file output by default.
+    -o, --output PATH       Also save profile to file (default: none)
+    -f, --frequency HZ      Sampling frequency in Hz (default: 1000)
+    -m, --mode MODE         cpu or wall (default: wall)
+    --signal VALUE          Timer signal (Linux only): signal number, or 'false'
+                            for nanosleep thread (default: auto)
+    -v, --verbose           Print additional sampling statistics
+Shows: user/sys/real time, time breakdown, GC/memory/OS stats, profiler overhead,
+and flat/cumulative top-50 function tables.
 ### report: Open pprof profile with go tool pprof. Requires Go.
     --top                   Print top functions by flat time
@@ -67,6 +82,8 @@ Default (no flag): opens diff in browser.
     rperf stat ruby app.rb
     rperf stat --report ruby app.rb
     rperf stat -o profile.pb.gz ruby app.rb
+    rperf exec ruby app.rb
+    rperf exec -m cpu ruby app.rb
     rperf report
     rperf report --top profile.pb.gz
     rperf diff before.pb.gz after.pb.gz
@@ -106,16 +123,22 @@ Rperf.save("profile.txt", data)
 nil if profiler was not running; otherwise a Hash:
 ```ruby
-{ mode: :cpu,             # or :wall
+{ mode: :cpu,                      # or :wall
   frequency: 500,
   sampling_count: 1234,
   sampling_time_ns: 56789,
-  start_time_ns: 17740..., # CLOCK_REALTIME epoch nanos
-  duration_ns: 10000000,   # profiling duration in nanos
-  samples: [               # Array of [frames, weight, thread_seq]
-    [frames, weight, seq], #   frames: [[path, label], ...] deepest-first
-    ...                    #   weight: Integer (nanoseconds)
-  ] }                      #   seq: Integer (thread sequence, 1-based)
+  detected_thread_count: 4,        # threads seen during profiling
+  start_time_ns: 17740...,         # CLOCK_REALTIME epoch nanos
+  duration_ns: 10000000,           # profiling duration in nanos
+  aggregated_samples: [            # when aggregate: true (default)
+    [frames, weight, seq],         #   frames: [[path, label], ...] deepest-first
+    ...                            #   weight: Integer (nanoseconds, merged per unique stack)
+  ],                               #   seq: Integer (thread sequence, 1-based)
+  # --- OR ---
+  raw_samples: [           # when aggregate: false
+    [frames, weight, seq], #   one entry per timer sample (not merged)
+    ...
+  ] }
 ```
 ### Rperf.save(path, data, format: nil)

data/exe/rperf CHANGED Viewed

@@ -72,6 +72,7 @@ HELP_TEXT = File.read(File.expand_path("../docs/help.md", __dir__))
 USAGE = "Usage: rperf record [options] command [args...]\n" \
        "       rperf stat [options] command [args...]\n" \
+       "       rperf exec [options] command [args...]\n" \
        "       rperf report [options] [file]\n" \
        "       rperf diff [options] base.pb.gz target.pb.gz\n" \
        "       rperf help\n"
@@ -120,7 +121,7 @@ when "diff"
     else            exec("go", "tool", "pprof", "-http=localhost:#{find_available_port}", "-diff_base=#{base_file}", target_file)
     end
   end
-when "record", "stat"
+when "record", "stat", "exec"
   # continue below
 else
   $stderr.puts "Unknown subcommand: #{subcommand.inspect}" if subcommand
@@ -128,22 +129,23 @@ else
   exit 1
 end
-output = (subcommand == "stat") ? nil : "rperf.data"
+output = (subcommand == "record") ? "rperf.data" : nil
 frequency = 1000
-mode = (subcommand == "stat") ? "wall" : "cpu"
+mode = (subcommand == "record") ? "cpu" : "wall"
 format = nil
 signal = nil
 verbose = false
 aggregate = true
-stat_report = false
+stat_report = (subcommand == "exec")
 parser = OptionParser.new do |opts|
   opts.banner = case subcommand
                 when "record" then "Usage: rperf record [options] command [args...]"
                 when "stat"   then "Usage: rperf stat [options] command [args...]"
+                when "exec"   then "Usage: rperf exec [options] command [args...]"
                 end
-  opts.on("-o", "--output PATH", "Output file#{subcommand == 'stat' ? ' (default: none)' : ' (default: rperf.data)'}") do |v|
+  opts.on("-o", "--output PATH", "Output file#{subcommand == 'record' ? ' (default: rperf.data)' : ' (default: none)'}") do |v|
     output = v
   end
@@ -151,7 +153,7 @@ parser = OptionParser.new do |opts|
     frequency = v
   end
-  default_mode = (subcommand == "stat") ? "wall" : "cpu"
+  default_mode = (subcommand == "record") ? "cpu" : "wall"
   opts.on("-m", "--mode MODE", %w[cpu wall], "Profiling mode: cpu or wall (default: #{default_mode})") do |v|
     mode = v
   end
@@ -208,6 +210,29 @@ if ARGV.empty?
   exit 1
 end
+if frequency <= 0
+  $stderr.puts "Error: frequency must be a positive integer (got #{frequency})"
+  exit 1
+end
+if frequency > 10_000
+  $stderr.puts "Error: frequency must be <= 10000 (10KHz), got #{frequency}"
+  exit 1
+end
+if signal && signal != "false"
+  unless RUBY_PLATFORM =~ /linux/
+    $stderr.puts "Error: signal mode is only supported on Linux"
+    exit 1
+  end
+  sig_num = signal.to_i
+  uncatchable = [Signal.list["KILL"], Signal.list["STOP"]].compact
+  if uncatchable.include?(sig_num)
+    $stderr.puts "Error: signal #{sig_num} (#{Signal.signame(sig_num)}) cannot be caught; use a different signal"
+    exit 1
+  end
+end
 # Add lib dir to RUBYLIB so -rrperf can find the extension
 lib_dir = File.expand_path("../lib", __dir__)
 ENV["RUBYLIB"] = [lib_dir, ENV["RUBYLIB"]].compact.join(File::PATH_SEPARATOR)
@@ -221,7 +246,7 @@ ENV["RPERF_VERBOSE"] = "1" if verbose
 ENV["RPERF_SIGNAL"] = signal if signal
 ENV["RPERF_AGGREGATE"] = "0" unless aggregate
-if subcommand == "stat"
+if subcommand == "stat" || subcommand == "exec"
   ENV["RPERF_STAT"] = "1"
   ENV["RPERF_STAT_COMMAND"] = ARGV.join(" ")
   ENV["RPERF_STAT_REPORT"] = "1" if stat_report

data/ext/rperf/rperf.c CHANGED Viewed

@@ -7,13 +7,19 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <signal.h>
-#include <assert.h>
+#include <stdatomic.h>
 #ifdef __linux__
 #include <sys/syscall.h>
 #endif
-/* Checked pthread wrappers — assert on unexpected errors */
-#define CHECKED(call) do { int _r = (call); assert(_r == 0 && #call); (void)_r; } while (0)
+/* Checked pthread wrappers — always active regardless of NDEBUG */
+#define CHECKED(call) do { \
+    int _r = (call); \
+    if (_r != 0) { \
+        fprintf(stderr, "rperf: %s failed: %s\n", #call, strerror(_r)); \
+        abort(); \
+    } \
+} while (0)
 #ifdef __linux__
 #define RPERF_USE_TIMER_SIGNAL 1
@@ -26,7 +32,8 @@
 #define RPERF_INITIAL_SAMPLES 16384  /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
 #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
 #define RPERF_AGG_THRESHOLD 10000  /* aggregate every N samples */
-#define RPERF_FRAME_TABLE_INITIAL 65536  /* pre-allocate to avoid realloc race with GC dmark */
+#define RPERF_FRAME_TABLE_INITIAL 4096
+#define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
 #define RPERF_AGG_TABLE_INITIAL 1024
 #define RPERF_STACK_POOL_INITIAL 4096
@@ -77,11 +84,15 @@ typedef struct rperf_sample_buffer {
 #define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
 typedef struct rperf_frame_table {
-    VALUE *keys;              /* unique VALUE array (GC mark target) */
+    _Atomic(VALUE *) keys;    /* unique VALUE array (GC mark target) */
     size_t count;             /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
     size_t capacity;
     uint32_t *buckets;        /* open addressing: stores index into keys[] */
     size_t bucket_capacity;
+    /* Old keys arrays kept alive for GC dmark safety until stop */
+    VALUE **old_keys;
+    int old_keys_count;
+    int old_keys_capacity;
 } rperf_frame_table_t;
 /* ---- Aggregation table: stack → weight ---- */
@@ -107,54 +118,63 @@ typedef struct rperf_agg_table {
 } rperf_agg_table_t;
 typedef struct rperf_thread_data {
-    int64_t prev_cpu_ns;
+    int64_t prev_time_ns;
     int64_t prev_wall_ns;
     /* GVL event tracking */
     int64_t suspended_at_ns;        /* wall time at SUSPENDED */
     int64_t ready_at_ns;            /* wall time at READY */
-    size_t suspended_frame_start;   /* saved stack in frame_pool */
-    int suspended_frame_depth;      /* saved stack depth */
     int thread_seq;                 /* thread sequence number (1-based) */
 } rperf_thread_data_t;
+/* ---- GC tracking state ---- */
+typedef struct rperf_gc_state {
+    int phase;                /* rperf_gc_phase */
+    int64_t enter_ns;         /* wall time at GC_ENTER */
+    int thread_seq;           /* thread_seq at GC_ENTER */
+} rperf_gc_state_t;
+/* ---- Sampling overhead stats ---- */
+typedef struct rperf_stats {
+    size_t trigger_count;
+    size_t sampling_count;
+    int64_t sampling_total_ns;
+} rperf_stats_t;
 typedef struct rperf_profiler {
     int frequency;
     int mode; /* 0 = cpu, 1 = wall */
-    volatile int running;
+    _Atomic int running;
     pthread_t worker_thread;     /* combined timer + aggregation */
 #if RPERF_USE_TIMER_SIGNAL
     timer_t timer_id;
     int timer_signal;     /* >0: use timer signal, 0: use nanosleep thread */
-    volatile pid_t worker_tid;   /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
+    _Atomic pid_t worker_tid;    /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
+    struct sigaction old_sigaction;  /* saved handler to restore on stop */
 #endif
     rb_postponed_job_handle_t pj_handle;
     int aggregate;               /* 1 = aggregate samples, 0 = raw */
     /* Double-buffered sample storage (only buffers[0] used when !aggregate) */
     rperf_sample_buffer_t buffers[2];
-    int active_idx;              /* 0 or 1 */
+    _Atomic int active_idx;      /* 0 or 1 */
     /* Aggregation (only used when aggregate=1) */
     rperf_frame_table_t frame_table;
     rperf_agg_table_t agg_table;
-    volatile int swap_ready;     /* 1 = standby buffer ready for aggregation */
+    _Atomic int swap_ready;      /* 1 = standby buffer ready for aggregation */
     pthread_mutex_t worker_mutex;
     pthread_cond_t worker_cond;
     rb_internal_thread_specific_key_t ts_key;
     rb_internal_thread_event_hook_t *thread_hook;
     /* GC tracking */
-    int gc_phase;                /* rperf_gc_phase */
-    int64_t gc_enter_ns;         /* wall time at GC_ENTER */
-    size_t gc_frame_start;       /* saved stack at GC_ENTER */
-    int gc_frame_depth;          /* saved stack depth */
-    int gc_thread_seq;           /* thread_seq at GC_ENTER */
+    rperf_gc_state_t gc;
     /* Timing metadata for pprof */
     struct timespec start_realtime;   /* CLOCK_REALTIME at start */
     struct timespec start_monotonic;  /* CLOCK_MONOTONIC at start */
     /* Thread sequence counter */
     int next_thread_seq;
     /* Sampling overhead stats */
-    size_t trigger_count;
-    size_t sampling_count;
-    int64_t sampling_total_ns;
+    rperf_stats_t stats;
 } rperf_profiler_t;
 static rperf_profiler_t g_profiler;
@@ -175,10 +195,18 @@ rperf_profiler_mark(void *ptr)
                                 buf->frame_pool + buf->frame_pool_count);
         }
     }
-    /* Mark frame_table keys (unique frame VALUEs) */
-    if (prof->frame_table.keys && prof->frame_table.count > 0) {
-        rb_gc_mark_locations(prof->frame_table.keys + RPERF_SYNTHETIC_COUNT,
-                            prof->frame_table.keys + prof->frame_table.count);
+    /* Mark frame_table keys (unique frame VALUEs).
+     * Acquire count to synchronize with the release-store in insert,
+     * ensuring we see the keys pointer that is valid for [0, count).
+     * If we see an old count, both old and new keys arrays have valid
+     * data (old keys are kept alive in old_keys[]). */
+    {
+        size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
+        VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
+        if (ft_keys && ft_count > 0) {
+            rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
+                                ft_keys + ft_count);
+        }
     }
 }
@@ -288,21 +316,38 @@ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
 /* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
-static void
+static int
 rperf_frame_table_init(rperf_frame_table_t *ft)
 {
     ft->capacity = RPERF_FRAME_TABLE_INITIAL;
-    ft->keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
+    VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
+    if (!keys) return -1;
+    atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
     ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
     ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
     ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
+    if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
     memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
+    ft->old_keys_count = 0;
+    ft->old_keys_capacity = RPERF_FRAME_TABLE_OLD_KEYS_INITIAL;
+    ft->old_keys = (VALUE **)malloc(ft->old_keys_capacity * sizeof(VALUE *));
+    if (!ft->old_keys) {
+        free(ft->buckets);
+        free(keys);
+        atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed);
+        return -1;
+    }
+    return 0;
 }
 static void
 rperf_frame_table_free(rperf_frame_table_t *ft)
 {
-    free(ft->keys);
+    int i;
+    for (i = 0; i < ft->old_keys_count; i++)
+        free(ft->old_keys[i]);
+    free(ft->old_keys);
+    free(atomic_load_explicit(&ft->keys, memory_order_relaxed));
     free(ft->buckets);
     memset(ft, 0, sizeof(*ft));
 }
@@ -312,11 +357,13 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 {
     size_t new_cap = ft->bucket_capacity * 2;
     uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
+    if (!new_buckets) return; /* keep using current buckets at higher load factor */
     memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
+    VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     size_t i;
     for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
-        uint32_t h = (uint32_t)(ft->keys[i] >> 3); /* shift out tag bits */
+        uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
         size_t idx = h % new_cap;
         while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
             idx = (idx + 1) % new_cap;
@@ -332,25 +379,42 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 static uint32_t
 rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 {
+    VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     uint32_t h = (uint32_t)(fval >> 3);
     size_t idx = h % ft->bucket_capacity;
     while (1) {
         uint32_t slot = ft->buckets[idx];
         if (slot == RPERF_FRAME_TABLE_EMPTY) break;
-        if (ft->keys[slot] == fval) return slot;
+        if (keys[slot] == fval) return slot;
         idx = (idx + 1) % ft->bucket_capacity;
     }
-    /* Insert new entry.
-     * keys array is pre-allocated and never realloc'd to avoid race with GC dmark.
-     * If capacity is exhausted, return EMPTY to signal aggregation should stop. */
+    /* Insert new entry.  Grow keys array if capacity is exhausted.
+     * Cannot realloc in-place because GC dmark may concurrently read
+     * the old keys pointer.  Instead, allocate new, copy, swap pointer
+     * atomically, and keep old array alive until stop. */
     if (ft->count >= ft->capacity) {
-        return RPERF_FRAME_TABLE_EMPTY;
+        size_t new_cap = ft->capacity * 2;
+        VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
+        if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
+        memcpy(new_keys, keys, ft->capacity * sizeof(VALUE));
+        /* Save old keys for deferred free (GC dmark safety) */
+        if (ft->old_keys_count >= ft->old_keys_capacity) {
+            int new_old_cap = ft->old_keys_capacity * 2;
+            VALUE **new_old = (VALUE **)realloc(ft->old_keys, new_old_cap * sizeof(VALUE *));
+            if (!new_old) { free(new_keys); return RPERF_FRAME_TABLE_EMPTY; }
+            ft->old_keys = new_old;
+            ft->old_keys_capacity = new_old_cap;
+        }
+        ft->old_keys[ft->old_keys_count++] = keys;
+        keys = new_keys;
+        atomic_store_explicit(&ft->keys, new_keys, memory_order_release);
+        ft->capacity = new_cap;
     }
     uint32_t frame_id = (uint32_t)ft->count;
-    ft->keys[frame_id] = fval;
+    keys[frame_id] = fval;
     /* Store fence: ensure keys[frame_id] is visible before count is incremented,
      * so GC dmark never reads uninitialized keys[count-1]. */
     __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
@@ -380,15 +444,18 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
     return h;
 }
-static void
+static int
 rperf_agg_table_init(rperf_agg_table_t *at)
 {
     at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
     at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
+    if (!at->buckets) return -1;
     at->count = 0;
     at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
     at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
+    if (!at->stack_pool) { free(at->buckets); at->buckets = NULL; return -1; }
     at->stack_pool_count = 0;
+    return 0;
 }
 static void
@@ -404,6 +471,7 @@ rperf_agg_table_rehash(rperf_agg_table_t *at)
 {
     size_t new_cap = at->bucket_capacity * 2;
     rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
+    if (!new_buckets) return; /* keep using current buckets at higher load factor */
     size_t i;
     for (i = 0; i < at->bucket_capacity; i++) {
@@ -535,10 +603,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
 static void
 rperf_try_aggregate(rperf_profiler_t *prof)
 {
-    if (!prof->aggregate || !prof->swap_ready) return;
-    int standby_idx = prof->active_idx ^ 1;
+    if (!prof->aggregate || !atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return;
+    int standby_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire) ^ 1;
     rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
-    prof->swap_ready = 0;
+    atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
 }
 /* ---- Record a sample ---- */
@@ -547,25 +615,29 @@ static void
 rperf_try_swap(rperf_profiler_t *prof)
 {
     if (!prof->aggregate) return;
-    rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
+    int idx = atomic_load_explicit(&prof->active_idx, memory_order_relaxed);
+    rperf_sample_buffer_t *buf = &prof->buffers[idx];
     if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
-    if (prof->swap_ready) return; /* standby still being aggregated */
+    if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return; /* standby still being aggregated */
-    /* Swap active buffer */
-    prof->active_idx ^= 1;
-    prof->swap_ready = 1;
+    /* Swap active buffer: release ensures buffer writes are visible to worker */
+    atomic_store_explicit(&prof->active_idx, idx ^ 1, memory_order_release);
-    /* Wake worker thread */
+    /* Set swap_ready under mutex and signal, preventing lost wakeup:
+     * the worker checks swap_ready while holding the same mutex. */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    atomic_store_explicit(&prof->swap_ready, 1, memory_order_release);
     CHECKED(pthread_cond_signal(&prof->worker_cond));
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
 }
-static void
-rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
-                    int64_t weight, int type, int thread_seq)
+/* Write a sample into a specific buffer. No swap check. */
+static int
+rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
+                   int64_t weight, int type, int thread_seq)
 {
-    if (weight <= 0) return;
-    rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
-    if (rperf_ensure_sample_capacity(buf) < 0) return;
+    if (weight <= 0) return 0;
+    if (rperf_ensure_sample_capacity(buf) < 0) return -1;
     rperf_sample_t *sample = &buf->samples[buf->sample_count];
     sample->depth = depth;
@@ -574,7 +646,15 @@ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
     sample->type = type;
     sample->thread_seq = thread_seq;
     buf->sample_count++;
+    return 0;
+}
+static void
+rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
+                    int64_t weight, int type, int thread_seq)
+{
+    rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
+    rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq);
     rperf_try_swap(prof);
 }
@@ -586,7 +666,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 {
     rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
     if (!td) return NULL;
-    td->prev_cpu_ns = rperf_current_time_ns(prof, td);
+    td->prev_time_ns = rperf_current_time_ns(prof, td);
     td->prev_wall_ns = rperf_wall_time_ns();
     td->thread_seq = ++prof->next_thread_seq;
     rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -614,7 +694,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
     if (time_now < 0) return;
     /* Capture backtrace into active buffer's frame_pool */
-    rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
+    rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
     if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
     size_t frame_start = buf->frame_pool_count;
     int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
@@ -624,15 +704,13 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
     /* Record normal sample (skip if first time — no prev_time) */
     if (!is_first) {
-        int64_t weight = time_now - td->prev_cpu_ns;
+        int64_t weight = time_now - td->prev_time_ns;
         rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
     }
-    /* Save stack and timestamp for READY/RESUMED */
+    /* Save timestamp for READY/RESUMED */
     td->suspended_at_ns = wall_now;
-    td->suspended_frame_start = frame_start;
-    td->suspended_frame_depth = depth;
-    td->prev_cpu_ns = time_now;
+    td->prev_time_ns = time_now;
     td->prev_wall_ns = wall_now;
 }
@@ -659,29 +737,46 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
     int64_t wall_now = rperf_wall_time_ns();
-    /* Record GVL blocked/wait samples (wall mode only) */
-    if (prof->mode == 1 && td->suspended_frame_depth > 0) {
+    /* Record GVL blocked/wait samples (wall mode only).
+     * Capture backtrace here (not at SUSPENDED) so that frame_start always
+     * indexes into the current active buffer, avoiding mismatch after a
+     * double-buffer swap. The Ruby stack is unchanged while off-GVL.
+     *
+     * Both samples are written directly into the same buffer before calling
+     * rperf_try_swap, so that a swap triggered by the first sample cannot
+     * move the second into a different buffer with a stale frame_start. */
+    if (prof->mode == 1 && td->suspended_at_ns > 0) {
+        rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
+        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
+        size_t frame_start = buf->frame_pool_count;
+        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
+                                      &buf->frame_pool[frame_start], NULL);
+        if (depth <= 0) goto skip_gvl;
+        buf->frame_pool_count += depth;
+        /* Write both samples into the same buf, then swap-check once */
         if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
             int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
-            rperf_record_sample(prof, td->suspended_frame_start,
-                                td->suspended_frame_depth, blocked_ns,
-                                RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
+            rperf_write_sample(buf, frame_start, depth, blocked_ns,
+                               RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
         }
         if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
             int64_t wait_ns = wall_now - td->ready_at_ns;
-            rperf_record_sample(prof, td->suspended_frame_start,
-                                td->suspended_frame_depth, wait_ns,
-                                RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
+            rperf_write_sample(buf, frame_start, depth, wait_ns,
+                               RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
         }
+        rperf_try_swap(prof);
     }
+skip_gvl:
     /* Reset prev times to current — next timer sample measures from resume */
     int64_t time_now = rperf_current_time_ns(prof, td);
-    if (time_now >= 0) td->prev_cpu_ns = time_now;
+    if (time_now >= 0) td->prev_time_ns = time_now;
     td->prev_wall_ns = wall_now;
     /* Clear suspended state */
-    td->suspended_frame_depth = 0;
+    td->suspended_at_ns = 0;
     td->ready_at_ns = 0;
 }
@@ -722,50 +817,52 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
     if (!prof->running) return;
     if (event & RUBY_INTERNAL_EVENT_GC_START) {
-        prof->gc_phase = RPERF_GC_MARKING;
+        prof->gc.phase = RPERF_GC_MARKING;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
-        prof->gc_phase = RPERF_GC_SWEEPING;
+        prof->gc.phase = RPERF_GC_SWEEPING;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
-        prof->gc_phase = RPERF_GC_NONE;
+        prof->gc.phase = RPERF_GC_NONE;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
-        /* Capture backtrace and timestamp at GC entry */
-        prof->gc_enter_ns = rperf_wall_time_ns();
-        rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
-        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
-        size_t frame_start = buf->frame_pool_count;
-        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
-                                      &buf->frame_pool[frame_start], NULL);
-        if (depth <= 0) {
-            prof->gc_frame_depth = 0;
-            return;
-        }
-        buf->frame_pool_count += depth;
-        prof->gc_frame_start = frame_start;
-        prof->gc_frame_depth = depth;
-        /* Save thread_seq for the GC_EXIT sample */
+        /* Save timestamp and thread_seq; backtrace is captured at GC_EXIT
+         * to avoid buffer mismatch after a double-buffer swap. */
+        prof->gc.enter_ns = rperf_wall_time_ns();
         {
             VALUE thread = rb_thread_current();
             rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
-            prof->gc_thread_seq = td ? td->thread_seq : 0;
+            prof->gc.thread_seq = td ? td->thread_seq : 0;
         }
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
-        if (prof->gc_frame_depth <= 0) return;
+        if (prof->gc.enter_ns <= 0) return;
         int64_t wall_now = rperf_wall_time_ns();
-        int64_t weight = wall_now - prof->gc_enter_ns;
-        int type = (prof->gc_phase == RPERF_GC_SWEEPING)
+        int64_t weight = wall_now - prof->gc.enter_ns;
+        int type = (prof->gc.phase == RPERF_GC_SWEEPING)
                    ? RPERF_SAMPLE_GC_SWEEPING
                    : RPERF_SAMPLE_GC_MARKING;
-        rperf_record_sample(prof, prof->gc_frame_start,
-                            prof->gc_frame_depth, weight, type, prof->gc_thread_seq);
-        prof->gc_frame_depth = 0;
+        /* Capture backtrace here (not at GC_ENTER) so that frame_start
+         * always indexes into the current active buffer. The Ruby stack
+         * is unchanged during GC. */
+        rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
+        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) {
+            prof->gc.enter_ns = 0;
+            return;
+        }
+        size_t frame_start = buf->frame_pool_count;
+        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
+                                      &buf->frame_pool[frame_start], NULL);
+        if (depth <= 0) {
+            prof->gc.enter_ns = 0;
+            return;
+        }
+        buf->frame_pool_count += depth;
+        rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq);
+        prof->gc.enter_ns = 0;
     }
 }
@@ -795,14 +892,14 @@ rperf_sample_job(void *arg)
     int64_t time_now = rperf_current_time_ns(prof, td);
     if (time_now < 0) return;
-    int64_t weight = time_now - td->prev_cpu_ns;
-    td->prev_cpu_ns = time_now;
+    int64_t weight = time_now - td->prev_time_ns;
+    td->prev_time_ns = time_now;
     td->prev_wall_ns = rperf_wall_time_ns();
     if (weight <= 0) return;
     /* Capture backtrace and record sample */
-    rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
+    rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
     if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
     size_t frame_start = buf->frame_pool_count;
@@ -814,8 +911,8 @@ rperf_sample_job(void *arg)
     rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
-    prof->sampling_count++;
-    prof->sampling_total_ns +=
+    prof->stats.sampling_count++;
+    prof->stats.sampling_total_ns +=
         ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
         (ts_end.tv_nsec - ts_start.tv_nsec);
 }
@@ -826,7 +923,7 @@ rperf_sample_job(void *arg)
 static void
 rperf_signal_handler(int sig)
 {
-    g_profiler.trigger_count++;
+    g_profiler.stats.trigger_count++;
     rb_postponed_job_trigger(g_profiler.pj_handle);
 }
@@ -845,7 +942,8 @@ rperf_worker_signal_func(void *arg)
     CHECKED(pthread_cond_signal(&prof->worker_cond));
     while (prof->running) {
-        CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+        while (prof->running && !atomic_load_explicit(&prof->swap_ready, memory_order_acquire))
+            CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
         rperf_try_aggregate(prof);
     }
     CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
@@ -874,9 +972,12 @@ rperf_worker_nanosleep_func(void *arg)
     CHECKED(pthread_mutex_lock(&prof->worker_mutex));
     while (prof->running) {
         int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
-        assert(ret == 0 || ret == ETIMEDOUT);
+        if (ret != 0 && ret != ETIMEDOUT) {
+            fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
+            abort();
+        }
         if (ret == ETIMEDOUT) {
-            prof->trigger_count++;
+            prof->stats.trigger_count++;
             rb_postponed_job_trigger(prof->pj_handle);
             /* Advance deadline by interval */
             deadline.tv_nsec += interval_ns;
@@ -900,8 +1001,6 @@ rperf_resolve_frame(VALUE fval)
     VALUE label = rb_profile_frame_full_label(fval);
     if (NIL_P(path))  path  = rb_str_new_lit("<C method>");
-    if (NIL_P(path))  path  = rb_str_new_cstr("");
     if (NIL_P(label)) label = rb_str_new_cstr("");
     return rb_ary_new3(2, path, label);
@@ -909,58 +1008,23 @@ rperf_resolve_frame(VALUE fval)
 /* ---- Ruby API ---- */
+/* _c_start(frequency, mode, aggregate, signal)
+ *   frequency: Integer (Hz)
+ *   mode:      0 = cpu, 1 = wall
+ *   aggregate: 0 or 1
+ *   signal:    Integer (RT signal number, 0 = nanosleep, -1 = default)
+ */
 static VALUE
-rb_rperf_start(int argc, VALUE *argv, VALUE self)
+rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
 {
-    VALUE opts;
-    int frequency = 1000;
-    int mode = 0; /* 0 = cpu, 1 = wall */
-    int aggregate = 1; /* default: aggregate */
+    int frequency = NUM2INT(vfreq);
+    int mode = NUM2INT(vmode);
+    int aggregate = RTEST(vagg) ? 1 : 0;
 #if RPERF_USE_TIMER_SIGNAL
-    int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
+    int sig = NUM2INT(vsig);
+    int timer_signal = (sig < 0) ? RPERF_TIMER_SIGNAL_DEFAULT : sig;
 #endif
-    rb_scan_args(argc, argv, ":", &opts);
-    if (!NIL_P(opts)) {
-        VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
-        if (!NIL_P(vagg)) {
-            aggregate = RTEST(vagg) ? 1 : 0;
-        }
-        VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
-        if (!NIL_P(vfreq)) {
-            frequency = NUM2INT(vfreq);
-            if (frequency <= 0 || frequency > 1000000) {
-                rb_raise(rb_eArgError, "frequency must be between 1 and 1000000");
-            }
-        }
-        VALUE vmode = rb_hash_aref(opts, ID2SYM(rb_intern("mode")));
-        if (!NIL_P(vmode)) {
-            ID mode_id = SYM2ID(vmode);
-            if (mode_id == rb_intern("cpu")) {
-                mode = 0;
-            } else if (mode_id == rb_intern("wall")) {
-                mode = 1;
-            } else {
-                rb_raise(rb_eArgError, "mode must be :cpu or :wall");
-            }
-        }
-#if RPERF_USE_TIMER_SIGNAL
-        VALUE vsig = rb_hash_aref(opts, ID2SYM(rb_intern("signal")));
-        if (!NIL_P(vsig)) {
-            if (RTEST(vsig)) {
-                timer_signal = NUM2INT(vsig);
-                if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
-                    rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
-                             SIGRTMIN, SIGRTMAX);
-                }
-            } else {
-                /* signal: false or signal: 0 → use nanosleep thread */
-                timer_signal = 0;
-            }
-        }
-#endif
-    }
     if (g_profiler.running) {
         rb_raise(rb_eRuntimeError, "Rperf is already running");
     }
@@ -969,11 +1033,11 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     g_profiler.mode = mode;
     g_profiler.aggregate = aggregate;
     g_profiler.next_thread_seq = 0;
-    g_profiler.sampling_count = 0;
-    g_profiler.sampling_total_ns = 0;
-    g_profiler.trigger_count = 0;
-    g_profiler.active_idx = 0;
-    g_profiler.swap_ready = 0;
+    g_profiler.stats.sampling_count = 0;
+    g_profiler.stats.sampling_total_ns = 0;
+    g_profiler.stats.trigger_count = 0;
+    atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
+    atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
     /* Initialize worker mutex/cond */
     CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
@@ -994,13 +1058,26 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
         }
         /* Initialize aggregation structures */
-        rperf_frame_table_init(&g_profiler.frame_table);
-        rperf_agg_table_init(&g_profiler.agg_table);
+        if (rperf_frame_table_init(&g_profiler.frame_table) < 0) {
+            rperf_sample_buffer_free(&g_profiler.buffers[0]);
+            rperf_sample_buffer_free(&g_profiler.buffers[1]);
+            CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
+            CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
+            rb_raise(rb_eNoMemError, "rperf: failed to allocate frame table");
+        }
+        if (rperf_agg_table_init(&g_profiler.agg_table) < 0) {
+            rperf_frame_table_free(&g_profiler.frame_table);
+            rperf_sample_buffer_free(&g_profiler.buffers[0]);
+            rperf_sample_buffer_free(&g_profiler.buffers[1]);
+            CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
+            CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
+            rb_raise(rb_eNoMemError, "rperf: failed to allocate aggregation table");
+        }
     }
     /* Register GC event hook */
-    g_profiler.gc_phase = RPERF_GC_NONE;
-    g_profiler.gc_frame_depth = 0;
+    g_profiler.gc.phase = RPERF_GC_NONE;
+    g_profiler.gc.enter_ns = 0;
     rb_add_event_hook(rperf_gc_event_hook,
                       RUBY_INTERNAL_EVENT_GC_START |
                       RUBY_INTERNAL_EVENT_GC_END_MARK |
@@ -1023,6 +1100,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
         VALUE cur_thread = rb_thread_current();
         rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
         if (!td) {
+            rb_remove_event_hook(rperf_gc_event_hook);
             rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
             g_profiler.thread_hook = NULL;
             if (g_profiler.aggregate) {
@@ -1053,14 +1131,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
         memset(&sa, 0, sizeof(sa));
         sa.sa_handler = rperf_signal_handler;
         sa.sa_flags = SA_RESTART;
-        sigaction(g_profiler.timer_signal, &sa, NULL);
+        if (sigaction(g_profiler.timer_signal, &sa, &g_profiler.old_sigaction) != 0) {
+            g_profiler.running = 0;
+            goto timer_fail;
+        }
         /* Start worker thread first to get its kernel TID */
         g_profiler.worker_tid = 0;
         if (pthread_create(&g_profiler.worker_thread, NULL,
                            rperf_worker_signal_func, &g_profiler) != 0) {
             g_profiler.running = 0;
-            signal(g_profiler.timer_signal, SIG_DFL);
+            sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
             goto timer_fail;
         }
@@ -1078,7 +1159,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
         sev._sigev_un._tid = g_profiler.worker_tid;
         if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
             g_profiler.running = 0;
-            signal(g_profiler.timer_signal, SIG_DFL);
+            sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
             CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
             CHECKED(pthread_join(g_profiler.worker_thread, NULL));
             goto timer_fail;
@@ -1087,7 +1168,14 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
         its.it_value.tv_sec = 0;
         its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
         its.it_interval = its.it_value;
-        timer_settime(g_profiler.timer_id, 0, &its, NULL);
+        if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
+            timer_delete(g_profiler.timer_id);
+            g_profiler.running = 0;
+            sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
+            CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
+            CHECKED(pthread_join(g_profiler.worker_thread, NULL));
+            goto timer_fail;
+        }
     } else
 #endif
     {
@@ -1109,6 +1197,7 @@ timer_fail:
                 rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
             }
         }
+        rb_remove_event_hook(rperf_gc_event_hook);
         rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
         g_profiler.thread_hook = NULL;
         if (g_profiler.aggregate) {
@@ -1139,17 +1228,28 @@ rb_rperf_stop(VALUE self)
     g_profiler.running = 0;
 #if RPERF_USE_TIMER_SIGNAL
     if (g_profiler.timer_signal > 0) {
+        /* Delete timer first to stop generating new signals.
+         * Do NOT restore signal handler yet — the worker thread may still have
+         * pending timer signals.  rperf_signal_handler handles them harmlessly. */
         timer_delete(g_profiler.timer_id);
-        signal(g_profiler.timer_signal, SIG_IGN);
     }
 #endif
-    /* Wake and join worker thread */
+    /* Wake and join worker thread.
+     * Any pending timer signals are still handled by rperf_signal_handler
+     * (just increments trigger_count + calls rb_postponed_job_trigger). */
     CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
     CHECKED(pthread_join(g_profiler.worker_thread, NULL));
     CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
     CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
+#if RPERF_USE_TIMER_SIGNAL
+    if (g_profiler.timer_signal > 0) {
+        /* Worker thread is gone — safe to restore old signal handler now. */
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
+    }
+#endif
     if (g_profiler.thread_hook) {
         rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
         g_profiler.thread_hook = NULL;
@@ -1159,13 +1259,15 @@ rb_rperf_stop(VALUE self)
     rb_remove_event_hook(rperf_gc_event_hook);
     if (g_profiler.aggregate) {
+        /* Worker thread is joined; no concurrent access to these atomics. */
+        int cur_idx = atomic_load_explicit(&g_profiler.active_idx, memory_order_relaxed);
         /* Aggregate remaining samples from both buffers */
-        if (g_profiler.swap_ready) {
-            int standby_idx = g_profiler.active_idx ^ 1;
+        if (atomic_load_explicit(&g_profiler.swap_ready, memory_order_relaxed)) {
+            int standby_idx = cur_idx ^ 1;
             rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
-            g_profiler.swap_ready = 0;
+            atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
         }
-        rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
+        rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[cur_idx]);
     }
     /* Clean up thread-specific data for all live threads */
@@ -1193,10 +1295,11 @@ rb_rperf_stop(VALUE self)
     /* frequency */
     rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
-    /* trigger_count, sampling_count, sampling_time_ns */
-    rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
-    rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
-    rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
+    /* trigger_count, sampling_count, sampling_time_ns, detected_thread_count */
+    rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
+    rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
+    rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+    rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
     /* aggregation stats */
     if (g_profiler.aggregate) {
@@ -1231,7 +1334,7 @@ rb_rperf_stop(VALUE self)
         rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"),  rb_str_new_lit("[GC sweeping]")));
         /* Real frames */
         for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
-            rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
+            rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
         }
         rperf_agg_table_t *at = &g_profiler.agg_table;
@@ -1285,7 +1388,9 @@ rb_rperf_stop(VALUE self)
             rb_ary_push(samples_ary, sample);
         }
     }
-    rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
+    rb_hash_aset(result,
+                 ID2SYM(rb_intern(g_profiler.aggregate ? "aggregated_samples" : "raw_samples")),
+                 samples_ary);
     /* Cleanup */
     rperf_sample_buffer_free(&g_profiler.buffers[0]);
@@ -1304,9 +1409,20 @@ rperf_after_fork_child(void)
     g_profiler.running = 0;
 #if RPERF_USE_TIMER_SIGNAL
-    /* timer_create timers are not inherited across fork; reset signal handler */
+    /* timer_create timers are not inherited across fork, but pending signals may be.
+     * Block the signal, drain any pending instances, then restore old handler. */
     if (g_profiler.timer_signal > 0) {
-        signal(g_profiler.timer_signal, SIG_DFL);
+        sigset_t block_set, old_set;
+        struct timespec zero_ts = {0, 0};
+        sigemptyset(&block_set);
+        sigaddset(&block_set, g_profiler.timer_signal);
+        pthread_sigmask(SIG_BLOCK, &block_set, &old_set);
+        while (sigtimedwait(&block_set, NULL, &zero_ts) > 0) {}
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
+        pthread_sigmask(SIG_SETMASK, &old_set, NULL);
     }
 #endif
@@ -1326,12 +1442,13 @@ rperf_after_fork_child(void)
     }
     /* Reset GC state */
-    g_profiler.gc_phase = 0;
+    g_profiler.gc.phase = 0;
+    g_profiler.gc.enter_ns = 0;
     /* Reset stats */
-    g_profiler.sampling_count = 0;
-    g_profiler.sampling_total_ns = 0;
-    g_profiler.swap_ready = 0;
+    g_profiler.stats.sampling_count = 0;
+    g_profiler.stats.sampling_total_ns = 0;
+    atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
 }
 /* ---- Init ---- */
@@ -1340,7 +1457,7 @@ void
 Init_rperf(void)
 {
     VALUE mRperf = rb_define_module("Rperf");
-    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, -1);
+    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
     rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
     memset(&g_profiler, 0, sizeof(g_profiler));

data/lib/rperf/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Rperf
-  VERSION = "0.4.0"
+  VERSION = "0.5.0"
 end

data/lib/rperf.rb CHANGED Viewed

@@ -24,14 +24,25 @@ module Rperf
   #   .txt       → text report (human/AI readable flat + cumulative table)
   #   otherwise (.pb.gz etc) → pprof protobuf (gzip compressed)
   def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil, aggregate: true)
+    raise ArgumentError, "frequency must be a positive integer (got #{frequency.inspect})" unless frequency.is_a?(Integer) && frequency > 0
+    raise ArgumentError, "frequency must be <= 10000 (10KHz), got #{frequency}" if frequency > 10_000
+    raise ArgumentError, "mode must be :cpu or :wall, got #{mode.inspect}" unless %i[cpu wall].include?(mode)
+    c_mode = mode == :cpu ? 0 : 1
+    c_signal = signal.nil? ? -1 : (signal ? signal.to_i : 0)
+    if c_signal > 0
+      raise ArgumentError, "signal mode is only supported on Linux" unless RUBY_PLATFORM =~ /linux/
+      uncatchable = [Signal.list["KILL"], Signal.list["STOP"]].compact
+      if uncatchable.include?(c_signal)
+        name = Signal.signame(c_signal) rescue c_signal.to_s
+        raise ArgumentError, "signal #{c_signal} (#{name}) cannot be caught; use a different signal"
+      end
+    end
     @verbose = verbose || ENV["RPERF_VERBOSE"] == "1"
     @output = output
     @format = format
     @stat = stat
     @stat_start_mono = Process.clock_gettime(Process::CLOCK_MONOTONIC) if @stat
-    c_opts = { frequency: frequency, mode: mode, aggregate: aggregate }
-    c_opts[:signal] = signal unless signal.nil?
-    _c_start(**c_opts)
+    _c_start(frequency, c_mode, aggregate, c_signal)
     if block_given?
       begin
@@ -46,6 +57,21 @@ module Rperf
     data = _c_stop
     return unless data
+    # When aggregate: false, C extension returns :raw_samples but not
+    # :aggregated_samples.  Build aggregated view so encoders always work.
+    if data[:raw_samples] && !data[:aggregated_samples]
+      merged = {}
+      data[:raw_samples].each do |frames, weight, thread_seq|
+        key = [frames, thread_seq || 0]
+        if merged.key?(key)
+          merged[key] += weight
+        else
+          merged[key] = weight
+        end
+      end
+      data[:aggregated_samples] = merged.map { |(frames, ts), w| [frames, w, ts] }
+    end
     print_stats(data) if @verbose
     print_stat(data) if @stat
@@ -148,7 +174,7 @@ module Rperf
   # Samples from C are now [[path_str, label_str], ...], weight]
   def self.print_top(data)
-    samples_raw = data[:samples]
+    samples_raw = data[:aggregated_samples]
     return if !samples_raw || samples_raw.empty?
     result = compute_flat_cum(samples_raw)
@@ -180,7 +206,7 @@ module Rperf
   private_constant :STAT_PCT_LINE, :STAT_LINE
   def self.print_stat(data)
-    samples_raw = data[:samples] || []
+    samples_raw = data[:aggregated_samples] || []
     real_ns = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - @stat_start_mono) * 1_000_000_000).to_i
     times = Process.times
     user_ns = (times.utime * 1_000_000_000).to_i
@@ -198,7 +224,7 @@ module Rperf
     if samples_raw.size > 0
       breakdown, total_weight = compute_stat_breakdown(samples_raw)
       print_stat_breakdown(breakdown, total_weight)
-      print_stat_runtime_info
+      print_stat_runtime_info(data)
       print_stat_system_info
       print_stat_report(data) if ENV["RPERF_STAT_REPORT"] == "1"
       print_stat_footer(samples_raw, real_ns, data)
@@ -246,7 +272,9 @@ module Rperf
   end
   private_class_method :print_stat_breakdown
-  def self.print_stat_runtime_info
+  def self.print_stat_runtime_info(data)
+    thread_count = data[:detected_thread_count] || 0
+    $stderr.puts STAT_LINE.call(format_integer(thread_count), "  ", "[Ruby] detected threads") if thread_count > 0
     gc = GC.stat
     $stderr.puts STAT_LINE.call(format_ms(gc[:time] * 1_000_000), "ms",
                                 "[Ruby] GC time (%s count: %s minor, %s major)" % [
@@ -391,7 +419,7 @@ module Rperf
     module_function
     def encode(data, top_n: 50, header: true)
-      samples_raw = data[:samples]
+      samples_raw = data[:aggregated_samples]
       mode = data[:mode] || :cpu
       frequency = data[:frequency] || 0
@@ -433,8 +461,10 @@ module Rperf
     module_function
     def encode(data)
+      samples = data[:aggregated_samples]
+      return "" if !samples || samples.empty?
       merged = Hash.new(0)
-      data[:samples].each do |frames, weight|
+      samples.each do |frames, weight|
         key = frames.reverse.map { |_, label| label }.join(";")
         merged[key] += weight
       end
@@ -451,7 +481,7 @@ module Rperf
     module_function
     def encode(data)
-      samples_raw = data[:samples]
+      samples_raw = data[:aggregated_samples]
       frequency = data[:frequency]
       interval_ns = 1_000_000_000 / frequency
       mode = data[:mode] || :cpu
@@ -537,7 +567,7 @@ module Rperf
       # field 6: string_table (repeated string)
       string_table.each do |s|
-        buf << encode_bytes(6, s.encode("UTF-8"))
+        buf << encode_bytes(6, s.encode("UTF-8", invalid: :replace, undef: :replace))
       end
       # field 9: time_nanos (int64)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rperf
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Koichi Sasada