rperf 0.4.0 → 0.5.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ab923fe1fc0a0d6928941271cdffc979012af73d6d0bd0aa5c5d43a95e9451c2
4
- data.tar.gz: 74a0200ec71ae3743d2b99d578df0b484d23dea57285385209e23b0748a95564
3
+ metadata.gz: 3413c4c6ed0cdc0897428bf01fc0fec17a4d14f1c2883e9e5afa0cff110247dc
4
+ data.tar.gz: '097b06203ce4648a860f2816635d6dfac52f8e5987aa381653cec874d52abf7c'
5
5
  SHA512:
6
- metadata.gz: b2d95c3e58fd883efebfcad8506a5249dee8c7322fb53e75a25afcd5050bbb1885fb620eef18a86e74fa54cb542ba83b1761f13630746862c619477e022b09db
7
- data.tar.gz: ee4236170102e0be1cd13749389a29679ba7361c4c77db98ed708e9b509e8e1c28ba3b47a2d8a4b704ada2fe5a11859bc61c0476d8f061fbd43703168232f5f6
6
+ metadata.gz: 37065071f049a27eb1bab9f859ed39499022489a19aa8ecd91b3dc35cb6052ffb6b2fbc02c67ea46a94e8dba7644f2b23760d72d2dda7b998ccf3c61c304e225
7
+ data.tar.gz: 686ab430d58e5dd5163ae65a2bd330a76e57cf0dd72e7eac2b7c61621a03007bd724cac20b1e452766870ff33de325855e199bc3d873d004344f7b26b9b6614f
data/docs/help.md CHANGED
@@ -10,6 +10,7 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
10
10
 
11
11
  rperf record [options] command [args...]
12
12
  rperf stat [options] command [args...]
13
+ rperf exec [options] command [args...]
13
14
  rperf report [options] [file]
14
15
  rperf help
15
16
 
@@ -41,6 +42,20 @@ Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
41
42
  GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
42
43
  Use --report to add flat and cumulative top-50 function tables.
43
44
 
45
+ ### exec: Run command and print full profile report to stderr.
46
+
47
+ Like `stat --report`. Uses wall mode by default. No file output by default.
48
+
49
+ -o, --output PATH Also save profile to file (default: none)
50
+ -f, --frequency HZ Sampling frequency in Hz (default: 1000)
51
+ -m, --mode MODE cpu or wall (default: wall)
52
+ --signal VALUE Timer signal (Linux only): signal number, or 'false'
53
+ for nanosleep thread (default: auto)
54
+ -v, --verbose Print additional sampling statistics
55
+
56
+ Shows: user/sys/real time, time breakdown, GC/memory/OS stats, profiler overhead,
57
+ and flat/cumulative top-50 function tables.
58
+
44
59
  ### report: Open pprof profile with go tool pprof. Requires Go.
45
60
 
46
61
  --top Print top functions by flat time
@@ -67,6 +82,8 @@ Default (no flag): opens diff in browser.
67
82
  rperf stat ruby app.rb
68
83
  rperf stat --report ruby app.rb
69
84
  rperf stat -o profile.pb.gz ruby app.rb
85
+ rperf exec ruby app.rb
86
+ rperf exec -m cpu ruby app.rb
70
87
  rperf report
71
88
  rperf report --top profile.pb.gz
72
89
  rperf diff before.pb.gz after.pb.gz
@@ -106,16 +123,22 @@ Rperf.save("profile.txt", data)
106
123
  nil if profiler was not running; otherwise a Hash:
107
124
 
108
125
  ```ruby
109
- { mode: :cpu, # or :wall
126
+ { mode: :cpu, # or :wall
110
127
  frequency: 500,
111
128
  sampling_count: 1234,
112
129
  sampling_time_ns: 56789,
113
- start_time_ns: 17740..., # CLOCK_REALTIME epoch nanos
114
- duration_ns: 10000000, # profiling duration in nanos
115
- samples: [ # Array of [frames, weight, thread_seq]
116
- [frames, weight, seq], # frames: [[path, label], ...] deepest-first
117
- ... # weight: Integer (nanoseconds)
118
- ] } # seq: Integer (thread sequence, 1-based)
130
+ detected_thread_count: 4, # threads seen during profiling
131
+ start_time_ns: 17740..., # CLOCK_REALTIME epoch nanos
132
+ duration_ns: 10000000, # profiling duration in nanos
133
+ aggregated_samples: [ # when aggregate: true (default)
134
+ [frames, weight, seq], # frames: [[path, label], ...] deepest-first
135
+ ... # weight: Integer (nanoseconds, merged per unique stack)
136
+ ], # seq: Integer (thread sequence, 1-based)
137
+ # --- OR ---
138
+ raw_samples: [ # when aggregate: false
139
+ [frames, weight, seq], # one entry per timer sample (not merged)
140
+ ...
141
+ ] }
119
142
  ```
120
143
 
121
144
  ### Rperf.save(path, data, format: nil)
data/exe/rperf CHANGED
@@ -72,6 +72,7 @@ HELP_TEXT = File.read(File.expand_path("../docs/help.md", __dir__))
72
72
 
73
73
  USAGE = "Usage: rperf record [options] command [args...]\n" \
74
74
  " rperf stat [options] command [args...]\n" \
75
+ " rperf exec [options] command [args...]\n" \
75
76
  " rperf report [options] [file]\n" \
76
77
  " rperf diff [options] base.pb.gz target.pb.gz\n" \
77
78
  " rperf help\n"
@@ -120,7 +121,7 @@ when "diff"
120
121
  else exec("go", "tool", "pprof", "-http=localhost:#{find_available_port}", "-diff_base=#{base_file}", target_file)
121
122
  end
122
123
  end
123
- when "record", "stat"
124
+ when "record", "stat", "exec"
124
125
  # continue below
125
126
  else
126
127
  $stderr.puts "Unknown subcommand: #{subcommand.inspect}" if subcommand
@@ -128,22 +129,23 @@ else
128
129
  exit 1
129
130
  end
130
131
 
131
- output = (subcommand == "stat") ? nil : "rperf.data"
132
+ output = (subcommand == "record") ? "rperf.data" : nil
132
133
  frequency = 1000
133
- mode = (subcommand == "stat") ? "wall" : "cpu"
134
+ mode = (subcommand == "record") ? "cpu" : "wall"
134
135
  format = nil
135
136
  signal = nil
136
137
  verbose = false
137
138
  aggregate = true
138
- stat_report = false
139
+ stat_report = (subcommand == "exec")
139
140
 
140
141
  parser = OptionParser.new do |opts|
141
142
  opts.banner = case subcommand
142
143
  when "record" then "Usage: rperf record [options] command [args...]"
143
144
  when "stat" then "Usage: rperf stat [options] command [args...]"
145
+ when "exec" then "Usage: rperf exec [options] command [args...]"
144
146
  end
145
147
 
146
- opts.on("-o", "--output PATH", "Output file#{subcommand == 'stat' ? ' (default: none)' : ' (default: rperf.data)'}") do |v|
148
+ opts.on("-o", "--output PATH", "Output file#{subcommand == 'record' ? ' (default: rperf.data)' : ' (default: none)'}") do |v|
147
149
  output = v
148
150
  end
149
151
 
@@ -151,7 +153,7 @@ parser = OptionParser.new do |opts|
151
153
  frequency = v
152
154
  end
153
155
 
154
- default_mode = (subcommand == "stat") ? "wall" : "cpu"
156
+ default_mode = (subcommand == "record") ? "cpu" : "wall"
155
157
  opts.on("-m", "--mode MODE", %w[cpu wall], "Profiling mode: cpu or wall (default: #{default_mode})") do |v|
156
158
  mode = v
157
159
  end
@@ -208,6 +210,29 @@ if ARGV.empty?
208
210
  exit 1
209
211
  end
210
212
 
213
+ if frequency <= 0
214
+ $stderr.puts "Error: frequency must be a positive integer (got #{frequency})"
215
+ exit 1
216
+ end
217
+
218
+ if frequency > 10_000
219
+ $stderr.puts "Error: frequency must be <= 10000 (10KHz), got #{frequency}"
220
+ exit 1
221
+ end
222
+
223
+ if signal && signal != "false"
224
+ unless RUBY_PLATFORM =~ /linux/
225
+ $stderr.puts "Error: signal mode is only supported on Linux"
226
+ exit 1
227
+ end
228
+ sig_num = signal.to_i
229
+ uncatchable = [Signal.list["KILL"], Signal.list["STOP"]].compact
230
+ if uncatchable.include?(sig_num)
231
+ $stderr.puts "Error: signal #{sig_num} (#{Signal.signame(sig_num)}) cannot be caught; use a different signal"
232
+ exit 1
233
+ end
234
+ end
235
+
211
236
  # Add lib dir to RUBYLIB so -rrperf can find the extension
212
237
  lib_dir = File.expand_path("../lib", __dir__)
213
238
  ENV["RUBYLIB"] = [lib_dir, ENV["RUBYLIB"]].compact.join(File::PATH_SEPARATOR)
@@ -221,7 +246,7 @@ ENV["RPERF_VERBOSE"] = "1" if verbose
221
246
  ENV["RPERF_SIGNAL"] = signal if signal
222
247
  ENV["RPERF_AGGREGATE"] = "0" unless aggregate
223
248
 
224
- if subcommand == "stat"
249
+ if subcommand == "stat" || subcommand == "exec"
225
250
  ENV["RPERF_STAT"] = "1"
226
251
  ENV["RPERF_STAT_COMMAND"] = ARGV.join(" ")
227
252
  ENV["RPERF_STAT_REPORT"] = "1" if stat_report
data/ext/rperf/rperf.c CHANGED
@@ -7,13 +7,19 @@
7
7
  #include <stdlib.h>
8
8
  #include <unistd.h>
9
9
  #include <signal.h>
10
- #include <assert.h>
10
+ #include <stdatomic.h>
11
11
  #ifdef __linux__
12
12
  #include <sys/syscall.h>
13
13
  #endif
14
14
 
15
- /* Checked pthread wrappers — assert on unexpected errors */
16
- #define CHECKED(call) do { int _r = (call); assert(_r == 0 && #call); (void)_r; } while (0)
15
+ /* Checked pthread wrappers — always active regardless of NDEBUG */
16
+ #define CHECKED(call) do { \
17
+ int _r = (call); \
18
+ if (_r != 0) { \
19
+ fprintf(stderr, "rperf: %s failed: %s\n", #call, strerror(_r)); \
20
+ abort(); \
21
+ } \
22
+ } while (0)
17
23
 
18
24
  #ifdef __linux__
19
25
  #define RPERF_USE_TIMER_SIGNAL 1
@@ -26,7 +32,8 @@
26
32
  #define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
27
33
  #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
28
34
  #define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
29
- #define RPERF_FRAME_TABLE_INITIAL 65536 /* pre-allocate to avoid realloc race with GC dmark */
35
+ #define RPERF_FRAME_TABLE_INITIAL 4096
36
+ #define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
30
37
  #define RPERF_AGG_TABLE_INITIAL 1024
31
38
  #define RPERF_STACK_POOL_INITIAL 4096
32
39
 
@@ -77,11 +84,15 @@ typedef struct rperf_sample_buffer {
77
84
  #define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
78
85
 
79
86
  typedef struct rperf_frame_table {
80
- VALUE *keys; /* unique VALUE array (GC mark target) */
87
+ _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
81
88
  size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
82
89
  size_t capacity;
83
90
  uint32_t *buckets; /* open addressing: stores index into keys[] */
84
91
  size_t bucket_capacity;
92
+ /* Old keys arrays kept alive for GC dmark safety until stop */
93
+ VALUE **old_keys;
94
+ int old_keys_count;
95
+ int old_keys_capacity;
85
96
  } rperf_frame_table_t;
86
97
 
87
98
  /* ---- Aggregation table: stack → weight ---- */
@@ -107,54 +118,63 @@ typedef struct rperf_agg_table {
107
118
  } rperf_agg_table_t;
108
119
 
109
120
  typedef struct rperf_thread_data {
110
- int64_t prev_cpu_ns;
121
+ int64_t prev_time_ns;
111
122
  int64_t prev_wall_ns;
112
123
  /* GVL event tracking */
113
124
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
114
125
  int64_t ready_at_ns; /* wall time at READY */
115
- size_t suspended_frame_start; /* saved stack in frame_pool */
116
- int suspended_frame_depth; /* saved stack depth */
117
126
  int thread_seq; /* thread sequence number (1-based) */
118
127
  } rperf_thread_data_t;
119
128
 
129
+ /* ---- GC tracking state ---- */
130
+
131
+ typedef struct rperf_gc_state {
132
+ int phase; /* rperf_gc_phase */
133
+ int64_t enter_ns; /* wall time at GC_ENTER */
134
+ int thread_seq; /* thread_seq at GC_ENTER */
135
+ } rperf_gc_state_t;
136
+
137
+ /* ---- Sampling overhead stats ---- */
138
+
139
+ typedef struct rperf_stats {
140
+ size_t trigger_count;
141
+ size_t sampling_count;
142
+ int64_t sampling_total_ns;
143
+ } rperf_stats_t;
144
+
120
145
  typedef struct rperf_profiler {
121
146
  int frequency;
122
147
  int mode; /* 0 = cpu, 1 = wall */
123
- volatile int running;
148
+ _Atomic int running;
124
149
  pthread_t worker_thread; /* combined timer + aggregation */
125
150
  #if RPERF_USE_TIMER_SIGNAL
126
151
  timer_t timer_id;
127
152
  int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
128
- volatile pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
153
+ _Atomic pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
154
+ struct sigaction old_sigaction; /* saved handler to restore on stop */
129
155
  #endif
130
156
  rb_postponed_job_handle_t pj_handle;
131
157
  int aggregate; /* 1 = aggregate samples, 0 = raw */
132
158
  /* Double-buffered sample storage (only buffers[0] used when !aggregate) */
133
159
  rperf_sample_buffer_t buffers[2];
134
- int active_idx; /* 0 or 1 */
160
+ _Atomic int active_idx; /* 0 or 1 */
135
161
  /* Aggregation (only used when aggregate=1) */
136
162
  rperf_frame_table_t frame_table;
137
163
  rperf_agg_table_t agg_table;
138
- volatile int swap_ready; /* 1 = standby buffer ready for aggregation */
164
+ _Atomic int swap_ready; /* 1 = standby buffer ready for aggregation */
139
165
  pthread_mutex_t worker_mutex;
140
166
  pthread_cond_t worker_cond;
141
167
  rb_internal_thread_specific_key_t ts_key;
142
168
  rb_internal_thread_event_hook_t *thread_hook;
143
169
  /* GC tracking */
144
- int gc_phase; /* rperf_gc_phase */
145
- int64_t gc_enter_ns; /* wall time at GC_ENTER */
146
- size_t gc_frame_start; /* saved stack at GC_ENTER */
147
- int gc_frame_depth; /* saved stack depth */
148
- int gc_thread_seq; /* thread_seq at GC_ENTER */
170
+ rperf_gc_state_t gc;
149
171
  /* Timing metadata for pprof */
150
172
  struct timespec start_realtime; /* CLOCK_REALTIME at start */
151
173
  struct timespec start_monotonic; /* CLOCK_MONOTONIC at start */
152
174
  /* Thread sequence counter */
153
175
  int next_thread_seq;
154
176
  /* Sampling overhead stats */
155
- size_t trigger_count;
156
- size_t sampling_count;
157
- int64_t sampling_total_ns;
177
+ rperf_stats_t stats;
158
178
  } rperf_profiler_t;
159
179
 
160
180
  static rperf_profiler_t g_profiler;
@@ -175,10 +195,18 @@ rperf_profiler_mark(void *ptr)
175
195
  buf->frame_pool + buf->frame_pool_count);
176
196
  }
177
197
  }
178
- /* Mark frame_table keys (unique frame VALUEs) */
179
- if (prof->frame_table.keys && prof->frame_table.count > 0) {
180
- rb_gc_mark_locations(prof->frame_table.keys + RPERF_SYNTHETIC_COUNT,
181
- prof->frame_table.keys + prof->frame_table.count);
198
+ /* Mark frame_table keys (unique frame VALUEs).
199
+ * Acquire count to synchronize with the release-store in insert,
200
+ * ensuring we see the keys pointer that is valid for [0, count).
201
+ * If we see an old count, both old and new keys arrays have valid
202
+ * data (old keys are kept alive in old_keys[]). */
203
+ {
204
+ size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
205
+ VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
206
+ if (ft_keys && ft_count > 0) {
207
+ rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
208
+ ft_keys + ft_count);
209
+ }
182
210
  }
183
211
  }
184
212
 
@@ -288,21 +316,38 @@ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
288
316
 
289
317
  /* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
290
318
 
291
- static void
319
+ static int
292
320
  rperf_frame_table_init(rperf_frame_table_t *ft)
293
321
  {
294
322
  ft->capacity = RPERF_FRAME_TABLE_INITIAL;
295
- ft->keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
323
+ VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
324
+ if (!keys) return -1;
325
+ atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
296
326
  ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
297
327
  ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
298
328
  ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
329
+ if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
299
330
  memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
331
+ ft->old_keys_count = 0;
332
+ ft->old_keys_capacity = RPERF_FRAME_TABLE_OLD_KEYS_INITIAL;
333
+ ft->old_keys = (VALUE **)malloc(ft->old_keys_capacity * sizeof(VALUE *));
334
+ if (!ft->old_keys) {
335
+ free(ft->buckets);
336
+ free(keys);
337
+ atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed);
338
+ return -1;
339
+ }
340
+ return 0;
300
341
  }
301
342
 
302
343
  static void
303
344
  rperf_frame_table_free(rperf_frame_table_t *ft)
304
345
  {
305
- free(ft->keys);
346
+ int i;
347
+ for (i = 0; i < ft->old_keys_count; i++)
348
+ free(ft->old_keys[i]);
349
+ free(ft->old_keys);
350
+ free(atomic_load_explicit(&ft->keys, memory_order_relaxed));
306
351
  free(ft->buckets);
307
352
  memset(ft, 0, sizeof(*ft));
308
353
  }
@@ -312,11 +357,13 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
312
357
  {
313
358
  size_t new_cap = ft->bucket_capacity * 2;
314
359
  uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
360
+ if (!new_buckets) return; /* keep using current buckets at higher load factor */
315
361
  memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
316
362
 
363
+ VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
317
364
  size_t i;
318
365
  for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
319
- uint32_t h = (uint32_t)(ft->keys[i] >> 3); /* shift out tag bits */
366
+ uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
320
367
  size_t idx = h % new_cap;
321
368
  while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
322
369
  idx = (idx + 1) % new_cap;
@@ -332,25 +379,42 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
332
379
  static uint32_t
333
380
  rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
334
381
  {
382
+ VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
335
383
  uint32_t h = (uint32_t)(fval >> 3);
336
384
  size_t idx = h % ft->bucket_capacity;
337
385
 
338
386
  while (1) {
339
387
  uint32_t slot = ft->buckets[idx];
340
388
  if (slot == RPERF_FRAME_TABLE_EMPTY) break;
341
- if (ft->keys[slot] == fval) return slot;
389
+ if (keys[slot] == fval) return slot;
342
390
  idx = (idx + 1) % ft->bucket_capacity;
343
391
  }
344
392
 
345
- /* Insert new entry.
346
- * keys array is pre-allocated and never realloc'd to avoid race with GC dmark.
347
- * If capacity is exhausted, return EMPTY to signal aggregation should stop. */
393
+ /* Insert new entry. Grow keys array if capacity is exhausted.
394
+ * Cannot realloc in-place because GC dmark may concurrently read
395
+ * the old keys pointer. Instead, allocate new, copy, swap pointer
396
+ * atomically, and keep old array alive until stop. */
348
397
  if (ft->count >= ft->capacity) {
349
- return RPERF_FRAME_TABLE_EMPTY;
398
+ size_t new_cap = ft->capacity * 2;
399
+ VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
400
+ if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
401
+ memcpy(new_keys, keys, ft->capacity * sizeof(VALUE));
402
+ /* Save old keys for deferred free (GC dmark safety) */
403
+ if (ft->old_keys_count >= ft->old_keys_capacity) {
404
+ int new_old_cap = ft->old_keys_capacity * 2;
405
+ VALUE **new_old = (VALUE **)realloc(ft->old_keys, new_old_cap * sizeof(VALUE *));
406
+ if (!new_old) { free(new_keys); return RPERF_FRAME_TABLE_EMPTY; }
407
+ ft->old_keys = new_old;
408
+ ft->old_keys_capacity = new_old_cap;
409
+ }
410
+ ft->old_keys[ft->old_keys_count++] = keys;
411
+ keys = new_keys;
412
+ atomic_store_explicit(&ft->keys, new_keys, memory_order_release);
413
+ ft->capacity = new_cap;
350
414
  }
351
415
 
352
416
  uint32_t frame_id = (uint32_t)ft->count;
353
- ft->keys[frame_id] = fval;
417
+ keys[frame_id] = fval;
354
418
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
355
419
  * so GC dmark never reads uninitialized keys[count-1]. */
356
420
  __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
@@ -380,15 +444,18 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
380
444
  return h;
381
445
  }
382
446
 
383
- static void
447
+ static int
384
448
  rperf_agg_table_init(rperf_agg_table_t *at)
385
449
  {
386
450
  at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
387
451
  at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
452
+ if (!at->buckets) return -1;
388
453
  at->count = 0;
389
454
  at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
390
455
  at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
456
+ if (!at->stack_pool) { free(at->buckets); at->buckets = NULL; return -1; }
391
457
  at->stack_pool_count = 0;
458
+ return 0;
392
459
  }
393
460
 
394
461
  static void
@@ -404,6 +471,7 @@ rperf_agg_table_rehash(rperf_agg_table_t *at)
404
471
  {
405
472
  size_t new_cap = at->bucket_capacity * 2;
406
473
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
474
+ if (!new_buckets) return; /* keep using current buckets at higher load factor */
407
475
 
408
476
  size_t i;
409
477
  for (i = 0; i < at->bucket_capacity; i++) {
@@ -535,10 +603,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
535
603
  static void
536
604
  rperf_try_aggregate(rperf_profiler_t *prof)
537
605
  {
538
- if (!prof->aggregate || !prof->swap_ready) return;
539
- int standby_idx = prof->active_idx ^ 1;
606
+ if (!prof->aggregate || !atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return;
607
+ int standby_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire) ^ 1;
540
608
  rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
541
- prof->swap_ready = 0;
609
+ atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
542
610
  }
543
611
 
544
612
  /* ---- Record a sample ---- */
@@ -547,25 +615,29 @@ static void
547
615
  rperf_try_swap(rperf_profiler_t *prof)
548
616
  {
549
617
  if (!prof->aggregate) return;
550
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
618
+ int idx = atomic_load_explicit(&prof->active_idx, memory_order_relaxed);
619
+ rperf_sample_buffer_t *buf = &prof->buffers[idx];
551
620
  if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
552
- if (prof->swap_ready) return; /* standby still being aggregated */
621
+ if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return; /* standby still being aggregated */
553
622
 
554
- /* Swap active buffer */
555
- prof->active_idx ^= 1;
556
- prof->swap_ready = 1;
623
+ /* Swap active buffer: release ensures buffer writes are visible to worker */
624
+ atomic_store_explicit(&prof->active_idx, idx ^ 1, memory_order_release);
557
625
 
558
- /* Wake worker thread */
626
+ /* Set swap_ready under mutex and signal, preventing lost wakeup:
627
+ * the worker checks swap_ready while holding the same mutex. */
628
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
629
+ atomic_store_explicit(&prof->swap_ready, 1, memory_order_release);
559
630
  CHECKED(pthread_cond_signal(&prof->worker_cond));
631
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
560
632
  }
561
633
 
562
- static void
563
- rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
564
- int64_t weight, int type, int thread_seq)
634
+ /* Write a sample into a specific buffer. No swap check. */
635
+ static int
636
+ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
637
+ int64_t weight, int type, int thread_seq)
565
638
  {
566
- if (weight <= 0) return;
567
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
568
- if (rperf_ensure_sample_capacity(buf) < 0) return;
639
+ if (weight <= 0) return 0;
640
+ if (rperf_ensure_sample_capacity(buf) < 0) return -1;
569
641
 
570
642
  rperf_sample_t *sample = &buf->samples[buf->sample_count];
571
643
  sample->depth = depth;
@@ -574,7 +646,15 @@ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
574
646
  sample->type = type;
575
647
  sample->thread_seq = thread_seq;
576
648
  buf->sample_count++;
649
+ return 0;
650
+ }
577
651
 
652
+ static void
653
+ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
654
+ int64_t weight, int type, int thread_seq)
655
+ {
656
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
657
+ rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq);
578
658
  rperf_try_swap(prof);
579
659
  }
580
660
 
@@ -586,7 +666,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
586
666
  {
587
667
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
588
668
  if (!td) return NULL;
589
- td->prev_cpu_ns = rperf_current_time_ns(prof, td);
669
+ td->prev_time_ns = rperf_current_time_ns(prof, td);
590
670
  td->prev_wall_ns = rperf_wall_time_ns();
591
671
  td->thread_seq = ++prof->next_thread_seq;
592
672
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -614,7 +694,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
614
694
  if (time_now < 0) return;
615
695
 
616
696
  /* Capture backtrace into active buffer's frame_pool */
617
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
697
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
618
698
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
619
699
  size_t frame_start = buf->frame_pool_count;
620
700
  int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
@@ -624,15 +704,13 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
624
704
 
625
705
  /* Record normal sample (skip if first time — no prev_time) */
626
706
  if (!is_first) {
627
- int64_t weight = time_now - td->prev_cpu_ns;
707
+ int64_t weight = time_now - td->prev_time_ns;
628
708
  rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
629
709
  }
630
710
 
631
- /* Save stack and timestamp for READY/RESUMED */
711
+ /* Save timestamp for READY/RESUMED */
632
712
  td->suspended_at_ns = wall_now;
633
- td->suspended_frame_start = frame_start;
634
- td->suspended_frame_depth = depth;
635
- td->prev_cpu_ns = time_now;
713
+ td->prev_time_ns = time_now;
636
714
  td->prev_wall_ns = wall_now;
637
715
  }
638
716
 
@@ -659,29 +737,46 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
659
737
 
660
738
  int64_t wall_now = rperf_wall_time_ns();
661
739
 
662
- /* Record GVL blocked/wait samples (wall mode only) */
663
- if (prof->mode == 1 && td->suspended_frame_depth > 0) {
740
+ /* Record GVL blocked/wait samples (wall mode only).
741
+ * Capture backtrace here (not at SUSPENDED) so that frame_start always
742
+ * indexes into the current active buffer, avoiding mismatch after a
743
+ * double-buffer swap. The Ruby stack is unchanged while off-GVL.
744
+ *
745
+ * Both samples are written directly into the same buffer before calling
746
+ * rperf_try_swap, so that a swap triggered by the first sample cannot
747
+ * move the second into a different buffer with a stale frame_start. */
748
+ if (prof->mode == 1 && td->suspended_at_ns > 0) {
749
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
750
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
751
+ size_t frame_start = buf->frame_pool_count;
752
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
753
+ &buf->frame_pool[frame_start], NULL);
754
+ if (depth <= 0) goto skip_gvl;
755
+ buf->frame_pool_count += depth;
756
+
757
+ /* Write both samples into the same buf, then swap-check once */
664
758
  if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
665
759
  int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
666
- rperf_record_sample(prof, td->suspended_frame_start,
667
- td->suspended_frame_depth, blocked_ns,
668
- RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
760
+ rperf_write_sample(buf, frame_start, depth, blocked_ns,
761
+ RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
669
762
  }
670
763
  if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
671
764
  int64_t wait_ns = wall_now - td->ready_at_ns;
672
- rperf_record_sample(prof, td->suspended_frame_start,
673
- td->suspended_frame_depth, wait_ns,
674
- RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
765
+ rperf_write_sample(buf, frame_start, depth, wait_ns,
766
+ RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
675
767
  }
768
+
769
+ rperf_try_swap(prof);
676
770
  }
771
+ skip_gvl:
677
772
 
678
773
  /* Reset prev times to current — next timer sample measures from resume */
679
774
  int64_t time_now = rperf_current_time_ns(prof, td);
680
- if (time_now >= 0) td->prev_cpu_ns = time_now;
775
+ if (time_now >= 0) td->prev_time_ns = time_now;
681
776
  td->prev_wall_ns = wall_now;
682
777
 
683
778
  /* Clear suspended state */
684
- td->suspended_frame_depth = 0;
779
+ td->suspended_at_ns = 0;
685
780
  td->ready_at_ns = 0;
686
781
  }
687
782
 
@@ -722,50 +817,52 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
722
817
  if (!prof->running) return;
723
818
 
724
819
  if (event & RUBY_INTERNAL_EVENT_GC_START) {
725
- prof->gc_phase = RPERF_GC_MARKING;
820
+ prof->gc.phase = RPERF_GC_MARKING;
726
821
  }
727
822
  else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
728
- prof->gc_phase = RPERF_GC_SWEEPING;
823
+ prof->gc.phase = RPERF_GC_SWEEPING;
729
824
  }
730
825
  else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
731
- prof->gc_phase = RPERF_GC_NONE;
826
+ prof->gc.phase = RPERF_GC_NONE;
732
827
  }
733
828
  else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
734
- /* Capture backtrace and timestamp at GC entry */
735
- prof->gc_enter_ns = rperf_wall_time_ns();
736
-
737
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
738
- if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
739
- size_t frame_start = buf->frame_pool_count;
740
- int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
741
- &buf->frame_pool[frame_start], NULL);
742
- if (depth <= 0) {
743
- prof->gc_frame_depth = 0;
744
- return;
745
- }
746
- buf->frame_pool_count += depth;
747
- prof->gc_frame_start = frame_start;
748
- prof->gc_frame_depth = depth;
749
-
750
- /* Save thread_seq for the GC_EXIT sample */
829
+ /* Save timestamp and thread_seq; backtrace is captured at GC_EXIT
830
+ * to avoid buffer mismatch after a double-buffer swap. */
831
+ prof->gc.enter_ns = rperf_wall_time_ns();
751
832
  {
752
833
  VALUE thread = rb_thread_current();
753
834
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
754
- prof->gc_thread_seq = td ? td->thread_seq : 0;
835
+ prof->gc.thread_seq = td ? td->thread_seq : 0;
755
836
  }
756
837
  }
757
838
  else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
758
- if (prof->gc_frame_depth <= 0) return;
839
+ if (prof->gc.enter_ns <= 0) return;
759
840
 
760
841
  int64_t wall_now = rperf_wall_time_ns();
761
- int64_t weight = wall_now - prof->gc_enter_ns;
762
- int type = (prof->gc_phase == RPERF_GC_SWEEPING)
842
+ int64_t weight = wall_now - prof->gc.enter_ns;
843
+ int type = (prof->gc.phase == RPERF_GC_SWEEPING)
763
844
  ? RPERF_SAMPLE_GC_SWEEPING
764
845
  : RPERF_SAMPLE_GC_MARKING;
765
846
 
766
- rperf_record_sample(prof, prof->gc_frame_start,
767
- prof->gc_frame_depth, weight, type, prof->gc_thread_seq);
768
- prof->gc_frame_depth = 0;
847
+ /* Capture backtrace here (not at GC_ENTER) so that frame_start
848
+ * always indexes into the current active buffer. The Ruby stack
849
+ * is unchanged during GC. */
850
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
851
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) {
852
+ prof->gc.enter_ns = 0;
853
+ return;
854
+ }
855
+ size_t frame_start = buf->frame_pool_count;
856
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
857
+ &buf->frame_pool[frame_start], NULL);
858
+ if (depth <= 0) {
859
+ prof->gc.enter_ns = 0;
860
+ return;
861
+ }
862
+ buf->frame_pool_count += depth;
863
+
864
+ rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq);
865
+ prof->gc.enter_ns = 0;
769
866
  }
770
867
  }
771
868
 
@@ -795,14 +892,14 @@ rperf_sample_job(void *arg)
795
892
  int64_t time_now = rperf_current_time_ns(prof, td);
796
893
  if (time_now < 0) return;
797
894
 
798
- int64_t weight = time_now - td->prev_cpu_ns;
799
- td->prev_cpu_ns = time_now;
895
+ int64_t weight = time_now - td->prev_time_ns;
896
+ td->prev_time_ns = time_now;
800
897
  td->prev_wall_ns = rperf_wall_time_ns();
801
898
 
802
899
  if (weight <= 0) return;
803
900
 
804
901
  /* Capture backtrace and record sample */
805
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
902
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
806
903
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
807
904
 
808
905
  size_t frame_start = buf->frame_pool_count;
@@ -814,8 +911,8 @@ rperf_sample_job(void *arg)
814
911
  rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
815
912
 
816
913
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
817
- prof->sampling_count++;
818
- prof->sampling_total_ns +=
914
+ prof->stats.sampling_count++;
915
+ prof->stats.sampling_total_ns +=
819
916
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
820
917
  (ts_end.tv_nsec - ts_start.tv_nsec);
821
918
  }
@@ -826,7 +923,7 @@ rperf_sample_job(void *arg)
826
923
  static void
827
924
  rperf_signal_handler(int sig)
828
925
  {
829
- g_profiler.trigger_count++;
926
+ g_profiler.stats.trigger_count++;
830
927
  rb_postponed_job_trigger(g_profiler.pj_handle);
831
928
  }
832
929
 
@@ -845,7 +942,8 @@ rperf_worker_signal_func(void *arg)
845
942
  CHECKED(pthread_cond_signal(&prof->worker_cond));
846
943
 
847
944
  while (prof->running) {
848
- CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
945
+ while (prof->running && !atomic_load_explicit(&prof->swap_ready, memory_order_acquire))
946
+ CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
849
947
  rperf_try_aggregate(prof);
850
948
  }
851
949
  CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
@@ -874,9 +972,12 @@ rperf_worker_nanosleep_func(void *arg)
874
972
  CHECKED(pthread_mutex_lock(&prof->worker_mutex));
875
973
  while (prof->running) {
876
974
  int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
877
- assert(ret == 0 || ret == ETIMEDOUT);
975
+ if (ret != 0 && ret != ETIMEDOUT) {
976
+ fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
977
+ abort();
978
+ }
878
979
  if (ret == ETIMEDOUT) {
879
- prof->trigger_count++;
980
+ prof->stats.trigger_count++;
880
981
  rb_postponed_job_trigger(prof->pj_handle);
881
982
  /* Advance deadline by interval */
882
983
  deadline.tv_nsec += interval_ns;
@@ -900,8 +1001,6 @@ rperf_resolve_frame(VALUE fval)
900
1001
  VALUE label = rb_profile_frame_full_label(fval);
901
1002
 
902
1003
  if (NIL_P(path)) path = rb_str_new_lit("<C method>");
903
-
904
- if (NIL_P(path)) path = rb_str_new_cstr("");
905
1004
  if (NIL_P(label)) label = rb_str_new_cstr("");
906
1005
 
907
1006
  return rb_ary_new3(2, path, label);
@@ -909,58 +1008,23 @@ rperf_resolve_frame(VALUE fval)
909
1008
 
910
1009
  /* ---- Ruby API ---- */
911
1010
 
1011
+ /* _c_start(frequency, mode, aggregate, signal)
1012
+ * frequency: Integer (Hz)
1013
+ * mode: 0 = cpu, 1 = wall
1014
+ * aggregate: 0 or 1
1015
+ * signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
1016
+ */
912
1017
  static VALUE
913
- rb_rperf_start(int argc, VALUE *argv, VALUE self)
1018
+ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
914
1019
  {
915
- VALUE opts;
916
- int frequency = 1000;
917
- int mode = 0; /* 0 = cpu, 1 = wall */
918
- int aggregate = 1; /* default: aggregate */
1020
+ int frequency = NUM2INT(vfreq);
1021
+ int mode = NUM2INT(vmode);
1022
+ int aggregate = RTEST(vagg) ? 1 : 0;
919
1023
  #if RPERF_USE_TIMER_SIGNAL
920
- int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
1024
+ int sig = NUM2INT(vsig);
1025
+ int timer_signal = (sig < 0) ? RPERF_TIMER_SIGNAL_DEFAULT : sig;
921
1026
  #endif
922
1027
 
923
- rb_scan_args(argc, argv, ":", &opts);
924
- if (!NIL_P(opts)) {
925
- VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
926
- if (!NIL_P(vagg)) {
927
- aggregate = RTEST(vagg) ? 1 : 0;
928
- }
929
- VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
930
- if (!NIL_P(vfreq)) {
931
- frequency = NUM2INT(vfreq);
932
- if (frequency <= 0 || frequency > 1000000) {
933
- rb_raise(rb_eArgError, "frequency must be between 1 and 1000000");
934
- }
935
- }
936
- VALUE vmode = rb_hash_aref(opts, ID2SYM(rb_intern("mode")));
937
- if (!NIL_P(vmode)) {
938
- ID mode_id = SYM2ID(vmode);
939
- if (mode_id == rb_intern("cpu")) {
940
- mode = 0;
941
- } else if (mode_id == rb_intern("wall")) {
942
- mode = 1;
943
- } else {
944
- rb_raise(rb_eArgError, "mode must be :cpu or :wall");
945
- }
946
- }
947
- #if RPERF_USE_TIMER_SIGNAL
948
- VALUE vsig = rb_hash_aref(opts, ID2SYM(rb_intern("signal")));
949
- if (!NIL_P(vsig)) {
950
- if (RTEST(vsig)) {
951
- timer_signal = NUM2INT(vsig);
952
- if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
953
- rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
954
- SIGRTMIN, SIGRTMAX);
955
- }
956
- } else {
957
- /* signal: false or signal: 0 → use nanosleep thread */
958
- timer_signal = 0;
959
- }
960
- }
961
- #endif
962
- }
963
-
964
1028
  if (g_profiler.running) {
965
1029
  rb_raise(rb_eRuntimeError, "Rperf is already running");
966
1030
  }
@@ -969,11 +1033,11 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
969
1033
  g_profiler.mode = mode;
970
1034
  g_profiler.aggregate = aggregate;
971
1035
  g_profiler.next_thread_seq = 0;
972
- g_profiler.sampling_count = 0;
973
- g_profiler.sampling_total_ns = 0;
974
- g_profiler.trigger_count = 0;
975
- g_profiler.active_idx = 0;
976
- g_profiler.swap_ready = 0;
1036
+ g_profiler.stats.sampling_count = 0;
1037
+ g_profiler.stats.sampling_total_ns = 0;
1038
+ g_profiler.stats.trigger_count = 0;
1039
+ atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
1040
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
977
1041
 
978
1042
  /* Initialize worker mutex/cond */
979
1043
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
@@ -994,13 +1058,26 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
994
1058
  }
995
1059
 
996
1060
  /* Initialize aggregation structures */
997
- rperf_frame_table_init(&g_profiler.frame_table);
998
- rperf_agg_table_init(&g_profiler.agg_table);
1061
+ if (rperf_frame_table_init(&g_profiler.frame_table) < 0) {
1062
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1063
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1064
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1065
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1066
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate frame table");
1067
+ }
1068
+ if (rperf_agg_table_init(&g_profiler.agg_table) < 0) {
1069
+ rperf_frame_table_free(&g_profiler.frame_table);
1070
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1071
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1072
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1073
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1074
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate aggregation table");
1075
+ }
999
1076
  }
1000
1077
 
1001
1078
  /* Register GC event hook */
1002
- g_profiler.gc_phase = RPERF_GC_NONE;
1003
- g_profiler.gc_frame_depth = 0;
1079
+ g_profiler.gc.phase = RPERF_GC_NONE;
1080
+ g_profiler.gc.enter_ns = 0;
1004
1081
  rb_add_event_hook(rperf_gc_event_hook,
1005
1082
  RUBY_INTERNAL_EVENT_GC_START |
1006
1083
  RUBY_INTERNAL_EVENT_GC_END_MARK |
@@ -1023,6 +1100,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1023
1100
  VALUE cur_thread = rb_thread_current();
1024
1101
  rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
1025
1102
  if (!td) {
1103
+ rb_remove_event_hook(rperf_gc_event_hook);
1026
1104
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1027
1105
  g_profiler.thread_hook = NULL;
1028
1106
  if (g_profiler.aggregate) {
@@ -1053,14 +1131,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1053
1131
  memset(&sa, 0, sizeof(sa));
1054
1132
  sa.sa_handler = rperf_signal_handler;
1055
1133
  sa.sa_flags = SA_RESTART;
1056
- sigaction(g_profiler.timer_signal, &sa, NULL);
1134
+ if (sigaction(g_profiler.timer_signal, &sa, &g_profiler.old_sigaction) != 0) {
1135
+ g_profiler.running = 0;
1136
+ goto timer_fail;
1137
+ }
1057
1138
 
1058
1139
  /* Start worker thread first to get its kernel TID */
1059
1140
  g_profiler.worker_tid = 0;
1060
1141
  if (pthread_create(&g_profiler.worker_thread, NULL,
1061
1142
  rperf_worker_signal_func, &g_profiler) != 0) {
1062
1143
  g_profiler.running = 0;
1063
- signal(g_profiler.timer_signal, SIG_DFL);
1144
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1064
1145
  goto timer_fail;
1065
1146
  }
1066
1147
 
@@ -1078,7 +1159,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1078
1159
  sev._sigev_un._tid = g_profiler.worker_tid;
1079
1160
  if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
1080
1161
  g_profiler.running = 0;
1081
- signal(g_profiler.timer_signal, SIG_DFL);
1162
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1082
1163
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1083
1164
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1084
1165
  goto timer_fail;
@@ -1087,7 +1168,14 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1087
1168
  its.it_value.tv_sec = 0;
1088
1169
  its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
1089
1170
  its.it_interval = its.it_value;
1090
- timer_settime(g_profiler.timer_id, 0, &its, NULL);
1171
+ if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
1172
+ timer_delete(g_profiler.timer_id);
1173
+ g_profiler.running = 0;
1174
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1175
+ CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1176
+ CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1177
+ goto timer_fail;
1178
+ }
1091
1179
  } else
1092
1180
  #endif
1093
1181
  {
@@ -1109,6 +1197,7 @@ timer_fail:
1109
1197
  rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
1110
1198
  }
1111
1199
  }
1200
+ rb_remove_event_hook(rperf_gc_event_hook);
1112
1201
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1113
1202
  g_profiler.thread_hook = NULL;
1114
1203
  if (g_profiler.aggregate) {
@@ -1139,17 +1228,28 @@ rb_rperf_stop(VALUE self)
1139
1228
  g_profiler.running = 0;
1140
1229
  #if RPERF_USE_TIMER_SIGNAL
1141
1230
  if (g_profiler.timer_signal > 0) {
1231
+ /* Delete timer first to stop generating new signals.
1232
+ * Do NOT restore signal handler yet — the worker thread may still have
1233
+ * pending timer signals. rperf_signal_handler handles them harmlessly. */
1142
1234
  timer_delete(g_profiler.timer_id);
1143
- signal(g_profiler.timer_signal, SIG_IGN);
1144
1235
  }
1145
1236
  #endif
1146
1237
 
1147
- /* Wake and join worker thread */
1238
+ /* Wake and join worker thread.
1239
+ * Any pending timer signals are still handled by rperf_signal_handler
1240
+ * (just increments trigger_count + calls rb_postponed_job_trigger). */
1148
1241
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1149
1242
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1150
1243
  CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1151
1244
  CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1152
1245
 
1246
+ #if RPERF_USE_TIMER_SIGNAL
1247
+ if (g_profiler.timer_signal > 0) {
1248
+ /* Worker thread is gone — safe to restore old signal handler now. */
1249
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1250
+ }
1251
+ #endif
1252
+
1153
1253
  if (g_profiler.thread_hook) {
1154
1254
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1155
1255
  g_profiler.thread_hook = NULL;
@@ -1159,13 +1259,15 @@ rb_rperf_stop(VALUE self)
1159
1259
  rb_remove_event_hook(rperf_gc_event_hook);
1160
1260
 
1161
1261
  if (g_profiler.aggregate) {
1262
+ /* Worker thread is joined; no concurrent access to these atomics. */
1263
+ int cur_idx = atomic_load_explicit(&g_profiler.active_idx, memory_order_relaxed);
1162
1264
  /* Aggregate remaining samples from both buffers */
1163
- if (g_profiler.swap_ready) {
1164
- int standby_idx = g_profiler.active_idx ^ 1;
1265
+ if (atomic_load_explicit(&g_profiler.swap_ready, memory_order_relaxed)) {
1266
+ int standby_idx = cur_idx ^ 1;
1165
1267
  rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
1166
- g_profiler.swap_ready = 0;
1268
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1167
1269
  }
1168
- rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
1270
+ rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[cur_idx]);
1169
1271
  }
1170
1272
 
1171
1273
  /* Clean up thread-specific data for all live threads */
@@ -1193,10 +1295,11 @@ rb_rperf_stop(VALUE self)
1193
1295
  /* frequency */
1194
1296
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1195
1297
 
1196
- /* trigger_count, sampling_count, sampling_time_ns */
1197
- rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
1198
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
1199
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
1298
+ /* trigger_count, sampling_count, sampling_time_ns, detected_thread_count */
1299
+ rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
1300
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
1301
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
1302
+ rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
1200
1303
 
1201
1304
  /* aggregation stats */
1202
1305
  if (g_profiler.aggregate) {
@@ -1231,7 +1334,7 @@ rb_rperf_stop(VALUE self)
1231
1334
  rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
1232
1335
  /* Real frames */
1233
1336
  for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
1234
- rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
1337
+ rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
1235
1338
  }
1236
1339
 
1237
1340
  rperf_agg_table_t *at = &g_profiler.agg_table;
@@ -1285,7 +1388,9 @@ rb_rperf_stop(VALUE self)
1285
1388
  rb_ary_push(samples_ary, sample);
1286
1389
  }
1287
1390
  }
1288
- rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
1391
+ rb_hash_aset(result,
1392
+ ID2SYM(rb_intern(g_profiler.aggregate ? "aggregated_samples" : "raw_samples")),
1393
+ samples_ary);
1289
1394
 
1290
1395
  /* Cleanup */
1291
1396
  rperf_sample_buffer_free(&g_profiler.buffers[0]);
@@ -1304,9 +1409,20 @@ rperf_after_fork_child(void)
1304
1409
  g_profiler.running = 0;
1305
1410
 
1306
1411
  #if RPERF_USE_TIMER_SIGNAL
1307
- /* timer_create timers are not inherited across fork; reset signal handler */
1412
+ /* timer_create timers are not inherited across fork, but pending signals may be.
1413
+ * Block the signal, drain any pending instances, then restore old handler. */
1308
1414
  if (g_profiler.timer_signal > 0) {
1309
- signal(g_profiler.timer_signal, SIG_DFL);
1415
+ sigset_t block_set, old_set;
1416
+ struct timespec zero_ts = {0, 0};
1417
+
1418
+ sigemptyset(&block_set);
1419
+ sigaddset(&block_set, g_profiler.timer_signal);
1420
+ pthread_sigmask(SIG_BLOCK, &block_set, &old_set);
1421
+
1422
+ while (sigtimedwait(&block_set, NULL, &zero_ts) > 0) {}
1423
+
1424
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1425
+ pthread_sigmask(SIG_SETMASK, &old_set, NULL);
1310
1426
  }
1311
1427
  #endif
1312
1428
 
@@ -1326,12 +1442,13 @@ rperf_after_fork_child(void)
1326
1442
  }
1327
1443
 
1328
1444
  /* Reset GC state */
1329
- g_profiler.gc_phase = 0;
1445
+ g_profiler.gc.phase = 0;
1446
+ g_profiler.gc.enter_ns = 0;
1330
1447
 
1331
1448
  /* Reset stats */
1332
- g_profiler.sampling_count = 0;
1333
- g_profiler.sampling_total_ns = 0;
1334
- g_profiler.swap_ready = 0;
1449
+ g_profiler.stats.sampling_count = 0;
1450
+ g_profiler.stats.sampling_total_ns = 0;
1451
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1335
1452
  }
1336
1453
 
1337
1454
  /* ---- Init ---- */
@@ -1340,7 +1457,7 @@ void
1340
1457
  Init_rperf(void)
1341
1458
  {
1342
1459
  VALUE mRperf = rb_define_module("Rperf");
1343
- rb_define_module_function(mRperf, "_c_start", rb_rperf_start, -1);
1460
+ rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
1344
1461
  rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
1345
1462
 
1346
1463
  memset(&g_profiler, 0, sizeof(g_profiler));
data/lib/rperf/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rperf
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
data/lib/rperf.rb CHANGED
@@ -24,14 +24,25 @@ module Rperf
24
24
  # .txt → text report (human/AI readable flat + cumulative table)
25
25
  # otherwise (.pb.gz etc) → pprof protobuf (gzip compressed)
26
26
  def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil, aggregate: true)
27
+ raise ArgumentError, "frequency must be a positive integer (got #{frequency.inspect})" unless frequency.is_a?(Integer) && frequency > 0
28
+ raise ArgumentError, "frequency must be <= 10000 (10KHz), got #{frequency}" if frequency > 10_000
29
+ raise ArgumentError, "mode must be :cpu or :wall, got #{mode.inspect}" unless %i[cpu wall].include?(mode)
30
+ c_mode = mode == :cpu ? 0 : 1
31
+ c_signal = signal.nil? ? -1 : (signal ? signal.to_i : 0)
32
+ if c_signal > 0
33
+ raise ArgumentError, "signal mode is only supported on Linux" unless RUBY_PLATFORM =~ /linux/
34
+ uncatchable = [Signal.list["KILL"], Signal.list["STOP"]].compact
35
+ if uncatchable.include?(c_signal)
36
+ name = Signal.signame(c_signal) rescue c_signal.to_s
37
+ raise ArgumentError, "signal #{c_signal} (#{name}) cannot be caught; use a different signal"
38
+ end
39
+ end
27
40
  @verbose = verbose || ENV["RPERF_VERBOSE"] == "1"
28
41
  @output = output
29
42
  @format = format
30
43
  @stat = stat
31
44
  @stat_start_mono = Process.clock_gettime(Process::CLOCK_MONOTONIC) if @stat
32
- c_opts = { frequency: frequency, mode: mode, aggregate: aggregate }
33
- c_opts[:signal] = signal unless signal.nil?
34
- _c_start(**c_opts)
45
+ _c_start(frequency, c_mode, aggregate, c_signal)
35
46
 
36
47
  if block_given?
37
48
  begin
@@ -46,6 +57,21 @@ module Rperf
46
57
  data = _c_stop
47
58
  return unless data
48
59
 
60
+ # When aggregate: false, C extension returns :raw_samples but not
61
+ # :aggregated_samples. Build aggregated view so encoders always work.
62
+ if data[:raw_samples] && !data[:aggregated_samples]
63
+ merged = {}
64
+ data[:raw_samples].each do |frames, weight, thread_seq|
65
+ key = [frames, thread_seq || 0]
66
+ if merged.key?(key)
67
+ merged[key] += weight
68
+ else
69
+ merged[key] = weight
70
+ end
71
+ end
72
+ data[:aggregated_samples] = merged.map { |(frames, ts), w| [frames, w, ts] }
73
+ end
74
+
49
75
  print_stats(data) if @verbose
50
76
  print_stat(data) if @stat
51
77
 
@@ -148,7 +174,7 @@ module Rperf
148
174
 
149
175
  # Samples from C are now [[path_str, label_str], ...], weight]
150
176
  def self.print_top(data)
151
- samples_raw = data[:samples]
177
+ samples_raw = data[:aggregated_samples]
152
178
  return if !samples_raw || samples_raw.empty?
153
179
 
154
180
  result = compute_flat_cum(samples_raw)
@@ -180,7 +206,7 @@ module Rperf
180
206
  private_constant :STAT_PCT_LINE, :STAT_LINE
181
207
 
182
208
  def self.print_stat(data)
183
- samples_raw = data[:samples] || []
209
+ samples_raw = data[:aggregated_samples] || []
184
210
  real_ns = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - @stat_start_mono) * 1_000_000_000).to_i
185
211
  times = Process.times
186
212
  user_ns = (times.utime * 1_000_000_000).to_i
@@ -198,7 +224,7 @@ module Rperf
198
224
  if samples_raw.size > 0
199
225
  breakdown, total_weight = compute_stat_breakdown(samples_raw)
200
226
  print_stat_breakdown(breakdown, total_weight)
201
- print_stat_runtime_info
227
+ print_stat_runtime_info(data)
202
228
  print_stat_system_info
203
229
  print_stat_report(data) if ENV["RPERF_STAT_REPORT"] == "1"
204
230
  print_stat_footer(samples_raw, real_ns, data)
@@ -246,7 +272,9 @@ module Rperf
246
272
  end
247
273
  private_class_method :print_stat_breakdown
248
274
 
249
- def self.print_stat_runtime_info
275
+ def self.print_stat_runtime_info(data)
276
+ thread_count = data[:detected_thread_count] || 0
277
+ $stderr.puts STAT_LINE.call(format_integer(thread_count), " ", "[Ruby] detected threads") if thread_count > 0
250
278
  gc = GC.stat
251
279
  $stderr.puts STAT_LINE.call(format_ms(gc[:time] * 1_000_000), "ms",
252
280
  "[Ruby] GC time (%s count: %s minor, %s major)" % [
@@ -391,7 +419,7 @@ module Rperf
391
419
  module_function
392
420
 
393
421
  def encode(data, top_n: 50, header: true)
394
- samples_raw = data[:samples]
422
+ samples_raw = data[:aggregated_samples]
395
423
  mode = data[:mode] || :cpu
396
424
  frequency = data[:frequency] || 0
397
425
 
@@ -433,8 +461,10 @@ module Rperf
433
461
  module_function
434
462
 
435
463
  def encode(data)
464
+ samples = data[:aggregated_samples]
465
+ return "" if !samples || samples.empty?
436
466
  merged = Hash.new(0)
437
- data[:samples].each do |frames, weight|
467
+ samples.each do |frames, weight|
438
468
  key = frames.reverse.map { |_, label| label }.join(";")
439
469
  merged[key] += weight
440
470
  end
@@ -451,7 +481,7 @@ module Rperf
451
481
  module_function
452
482
 
453
483
  def encode(data)
454
- samples_raw = data[:samples]
484
+ samples_raw = data[:aggregated_samples]
455
485
  frequency = data[:frequency]
456
486
  interval_ns = 1_000_000_000 / frequency
457
487
  mode = data[:mode] || :cpu
@@ -537,7 +567,7 @@ module Rperf
537
567
 
538
568
  # field 6: string_table (repeated string)
539
569
  string_table.each do |s|
540
- buf << encode_bytes(6, s.encode("UTF-8"))
570
+ buf << encode_bytes(6, s.encode("UTF-8", invalid: :replace, undef: :replace))
541
571
  end
542
572
 
543
573
  # field 9: time_nanos (int64)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rperf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Sasada