rperf 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/rperf/rperf.c CHANGED
@@ -7,13 +7,19 @@
7
7
  #include <stdlib.h>
8
8
  #include <unistd.h>
9
9
  #include <signal.h>
10
- #include <assert.h>
10
+ #include <stdatomic.h>
11
11
  #ifdef __linux__
12
12
  #include <sys/syscall.h>
13
13
  #endif
14
14
 
15
- /* Checked pthread wrappers — assert on unexpected errors */
16
- #define CHECKED(call) do { int _r = (call); assert(_r == 0 && #call); (void)_r; } while (0)
15
+ /* Checked pthread wrappers — always active regardless of NDEBUG */
16
+ #define CHECKED(call) do { \
17
+ int _r = (call); \
18
+ if (_r != 0) { \
19
+ fprintf(stderr, "rperf: %s failed: %s\n", #call, strerror(_r)); \
20
+ abort(); \
21
+ } \
22
+ } while (0)
17
23
 
18
24
  #ifdef __linux__
19
25
  #define RPERF_USE_TIMER_SIGNAL 1
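The reworked CHECKED macro above reports the failing call and aborts even in NDEBUG builds, where the old assert-based version would have silently discarded pthread errors. A minimal standalone sketch of the same pattern follows; the mutex and main function are illustrative only and are not part of rperf.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Abort with a readable message on any nonzero pthread return code. */
    #define CHECKED(call) do { \
        int _r = (call); \
        if (_r != 0) { \
            fprintf(stderr, "example: %s failed: %s\n", #call, strerror(_r)); \
            abort(); \
        } \
    } while (0)

    int main(void) {
        pthread_mutex_t m;
        CHECKED(pthread_mutex_init(&m, NULL));
        CHECKED(pthread_mutex_lock(&m));
        CHECKED(pthread_mutex_unlock(&m));
        CHECKED(pthread_mutex_destroy(&m));
        return 0;
    }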
@@ -26,7 +32,8 @@
26
32
  #define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
27
33
  #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
28
34
  #define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
29
- #define RPERF_FRAME_TABLE_INITIAL 65536 /* pre-allocate to avoid realloc race with GC dmark */
35
+ #define RPERF_FRAME_TABLE_INITIAL 4096
36
+ #define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
30
37
  #define RPERF_AGG_TABLE_INITIAL 1024
31
38
  #define RPERF_STACK_POOL_INITIAL 4096
32
39
 
@@ -59,6 +66,7 @@ typedef struct rperf_sample {
59
66
  int64_t weight;
60
67
  int type; /* rperf_sample_type */
61
68
  int thread_seq; /* thread sequence number (1-based) */
69
+ int label_set_id; /* label set ID (0 = no labels) */
62
70
  } rperf_sample_t;
63
71
 
64
72
  /* ---- Sample buffer (double-buffered) ---- */
@@ -77,11 +85,15 @@ typedef struct rperf_sample_buffer {
77
85
  #define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
78
86
 
79
87
  typedef struct rperf_frame_table {
80
- VALUE *keys; /* unique VALUE array (GC mark target) */
88
+ _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
81
89
  size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
82
90
  size_t capacity;
83
91
  uint32_t *buckets; /* open addressing: stores index into keys[] */
84
92
  size_t bucket_capacity;
93
+ /* Old keys arrays kept alive for GC dmark safety until stop */
94
+ VALUE **old_keys;
95
+ int old_keys_count;
96
+ int old_keys_capacity;
85
97
  } rperf_frame_table_t;
86
98
 
87
99
  /* ---- Aggregation table: stack → weight ---- */
@@ -92,6 +104,7 @@ typedef struct rperf_agg_entry {
92
104
  uint32_t frame_start; /* offset into stack_pool */
93
105
  int depth; /* includes synthetic frame */
94
106
  int thread_seq;
107
+ int label_set_id; /* label set ID (0 = no labels) */
95
108
  int64_t weight; /* accumulated */
96
109
  uint32_t hash; /* cached hash value */
97
110
  int used; /* 0 = empty, 1 = used */
@@ -107,54 +120,68 @@ typedef struct rperf_agg_table {
107
120
  } rperf_agg_table_t;
108
121
 
109
122
  typedef struct rperf_thread_data {
110
- int64_t prev_cpu_ns;
123
+ int64_t prev_time_ns;
111
124
  int64_t prev_wall_ns;
112
125
  /* GVL event tracking */
113
126
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
114
127
  int64_t ready_at_ns; /* wall time at READY */
115
- size_t suspended_frame_start; /* saved stack in frame_pool */
116
- int suspended_frame_depth; /* saved stack depth */
117
128
  int thread_seq; /* thread sequence number (1-based) */
129
+ int label_set_id; /* current label set ID (0 = no labels) */
118
130
  } rperf_thread_data_t;
119
131
 
132
+ /* ---- GC tracking state ---- */
133
+
134
+ typedef struct rperf_gc_state {
135
+ int phase; /* rperf_gc_phase */
136
+ int64_t enter_ns; /* wall time at GC_ENTER */
137
+ int thread_seq; /* thread_seq at GC_ENTER */
138
+ int label_set_id; /* label_set_id at GC_ENTER */
139
+ } rperf_gc_state_t;
140
+
141
+ /* ---- Sampling overhead stats ---- */
142
+
143
+ typedef struct rperf_stats {
144
+ size_t trigger_count;
145
+ size_t sampling_count;
146
+ int64_t sampling_total_ns;
147
+ } rperf_stats_t;
148
+
120
149
  typedef struct rperf_profiler {
121
150
  int frequency;
122
151
  int mode; /* 0 = cpu, 1 = wall */
123
- volatile int running;
152
+ _Atomic int running;
124
153
  pthread_t worker_thread; /* combined timer + aggregation */
125
154
  #if RPERF_USE_TIMER_SIGNAL
126
155
  timer_t timer_id;
127
156
  int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
128
- volatile pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
157
+ _Atomic pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
158
+ struct sigaction old_sigaction; /* saved handler to restore on stop */
129
159
  #endif
130
160
  rb_postponed_job_handle_t pj_handle;
131
161
  int aggregate; /* 1 = aggregate samples, 0 = raw */
132
162
  /* Double-buffered sample storage (only buffers[0] used when !aggregate) */
133
163
  rperf_sample_buffer_t buffers[2];
134
- int active_idx; /* 0 or 1 */
164
+ _Atomic int active_idx; /* 0 or 1 */
135
165
  /* Aggregation (only used when aggregate=1) */
136
166
  rperf_frame_table_t frame_table;
137
167
  rperf_agg_table_t agg_table;
138
- volatile int swap_ready; /* 1 = standby buffer ready for aggregation */
168
+ _Atomic int swap_ready; /* 1 = standby buffer ready for aggregation */
139
169
  pthread_mutex_t worker_mutex;
140
170
  pthread_cond_t worker_cond;
141
171
  rb_internal_thread_specific_key_t ts_key;
142
172
  rb_internal_thread_event_hook_t *thread_hook;
143
173
  /* GC tracking */
144
- int gc_phase; /* rperf_gc_phase */
145
- int64_t gc_enter_ns; /* wall time at GC_ENTER */
146
- size_t gc_frame_start; /* saved stack at GC_ENTER */
147
- int gc_frame_depth; /* saved stack depth */
148
- int gc_thread_seq; /* thread_seq at GC_ENTER */
174
+ rperf_gc_state_t gc;
149
175
  /* Timing metadata for pprof */
150
176
  struct timespec start_realtime; /* CLOCK_REALTIME at start */
151
177
  struct timespec start_monotonic; /* CLOCK_MONOTONIC at start */
152
178
  /* Thread sequence counter */
153
179
  int next_thread_seq;
154
180
  /* Sampling overhead stats */
155
- size_t trigger_count;
156
- size_t sampling_count;
157
- int64_t sampling_total_ns;
181
+ rperf_stats_t stats;
182
+ /* Label sets: Ruby Array of Hash objects, managed from Ruby side.
183
+ * Index 0 is reserved (no labels). GC-marked via profiler_mark. */
184
+ VALUE label_sets; /* Ruby Array or Qnil */
158
185
  } rperf_profiler_t;
159
186
 
160
187
  static rperf_profiler_t g_profiler;
@@ -175,10 +202,22 @@ rperf_profiler_mark(void *ptr)
175
202
  buf->frame_pool + buf->frame_pool_count);
176
203
  }
177
204
  }
178
- /* Mark frame_table keys (unique frame VALUEs) */
179
- if (prof->frame_table.keys && prof->frame_table.count > 0) {
180
- rb_gc_mark_locations(prof->frame_table.keys + RPERF_SYNTHETIC_COUNT,
181
- prof->frame_table.keys + prof->frame_table.count);
205
+ /* Mark label_sets array */
206
+ if (prof->label_sets != Qnil) {
207
+ rb_gc_mark(prof->label_sets);
208
+ }
209
+ /* Mark frame_table keys (unique frame VALUEs).
210
+ * Acquire count to synchronize with the release-store in insert,
211
+ * ensuring we see the keys pointer that is valid for [0, count).
212
+ * If we see an old count, both old and new keys arrays have valid
213
+ * data (old keys are kept alive in old_keys[]). */
214
+ {
215
+ size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
216
+ VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
217
+ if (ft_keys && ft_count > 0) {
218
+ rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
219
+ ft_keys + ft_count);
220
+ }
182
221
  }
183
222
  }
184
223
 
@@ -288,21 +327,38 @@ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
288
327
 
289
328
  /* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
290
329
 
291
- static void
330
+ static int
292
331
  rperf_frame_table_init(rperf_frame_table_t *ft)
293
332
  {
294
333
  ft->capacity = RPERF_FRAME_TABLE_INITIAL;
295
- ft->keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
334
+ VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
335
+ if (!keys) return -1;
336
+ atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
296
337
  ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
297
338
  ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
298
339
  ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
340
+ if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
299
341
  memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
342
+ ft->old_keys_count = 0;
343
+ ft->old_keys_capacity = RPERF_FRAME_TABLE_OLD_KEYS_INITIAL;
344
+ ft->old_keys = (VALUE **)malloc(ft->old_keys_capacity * sizeof(VALUE *));
345
+ if (!ft->old_keys) {
346
+ free(ft->buckets);
347
+ free(keys);
348
+ atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed);
349
+ return -1;
350
+ }
351
+ return 0;
300
352
  }
301
353
 
302
354
  static void
303
355
  rperf_frame_table_free(rperf_frame_table_t *ft)
304
356
  {
305
- free(ft->keys);
357
+ int i;
358
+ for (i = 0; i < ft->old_keys_count; i++)
359
+ free(ft->old_keys[i]);
360
+ free(ft->old_keys);
361
+ free(atomic_load_explicit(&ft->keys, memory_order_relaxed));
306
362
  free(ft->buckets);
307
363
  memset(ft, 0, sizeof(*ft));
308
364
  }
@@ -312,11 +368,13 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
312
368
  {
313
369
  size_t new_cap = ft->bucket_capacity * 2;
314
370
  uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
371
+ if (!new_buckets) return; /* keep using current buckets at higher load factor */
315
372
  memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
316
373
 
374
+ VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
317
375
  size_t i;
318
376
  for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
319
- uint32_t h = (uint32_t)(ft->keys[i] >> 3); /* shift out tag bits */
377
+ uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
320
378
  size_t idx = h % new_cap;
321
379
  while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
322
380
  idx = (idx + 1) % new_cap;
@@ -332,25 +390,42 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
332
390
  static uint32_t
333
391
  rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
334
392
  {
393
+ VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
335
394
  uint32_t h = (uint32_t)(fval >> 3);
336
395
  size_t idx = h % ft->bucket_capacity;
337
396
 
338
397
  while (1) {
339
398
  uint32_t slot = ft->buckets[idx];
340
399
  if (slot == RPERF_FRAME_TABLE_EMPTY) break;
341
- if (ft->keys[slot] == fval) return slot;
400
+ if (keys[slot] == fval) return slot;
342
401
  idx = (idx + 1) % ft->bucket_capacity;
343
402
  }
344
403
 
345
- /* Insert new entry.
346
- * keys array is pre-allocated and never realloc'd to avoid race with GC dmark.
347
- * If capacity is exhausted, return EMPTY to signal aggregation should stop. */
404
+ /* Insert new entry. Grow keys array if capacity is exhausted.
405
+ * Cannot realloc in-place because GC dmark may concurrently read
406
+ * the old keys pointer. Instead, allocate new, copy, swap pointer
407
+ * atomically, and keep old array alive until stop. */
348
408
  if (ft->count >= ft->capacity) {
349
- return RPERF_FRAME_TABLE_EMPTY;
409
+ size_t new_cap = ft->capacity * 2;
410
+ VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
411
+ if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
412
+ memcpy(new_keys, keys, ft->capacity * sizeof(VALUE));
413
+ /* Save old keys for deferred free (GC dmark safety) */
414
+ if (ft->old_keys_count >= ft->old_keys_capacity) {
415
+ int new_old_cap = ft->old_keys_capacity * 2;
416
+ VALUE **new_old = (VALUE **)realloc(ft->old_keys, new_old_cap * sizeof(VALUE *));
417
+ if (!new_old) { free(new_keys); return RPERF_FRAME_TABLE_EMPTY; }
418
+ ft->old_keys = new_old;
419
+ ft->old_keys_capacity = new_old_cap;
420
+ }
421
+ ft->old_keys[ft->old_keys_count++] = keys;
422
+ keys = new_keys;
423
+ atomic_store_explicit(&ft->keys, new_keys, memory_order_release);
424
+ ft->capacity = new_cap;
350
425
  }
351
426
 
352
427
  uint32_t frame_id = (uint32_t)ft->count;
353
- ft->keys[frame_id] = fval;
428
+ keys[frame_id] = fval;
354
429
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
355
430
  * so GC dmark never reads uninitialized keys[count-1]. */
356
431
  __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
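The grow path above replaces the old fixed-capacity table: because the GC mark callback may read the keys pointer concurrently, growth allocates a new array, copies the old contents, publishes the new pointer with a release store, and retires the old array instead of freeing it. A minimal sketch of that publish/retire protocol under the same assumptions (single writer, concurrent readers); the names are illustrative, not the gem's.

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        _Atomic(long *) items;     /* published array (a reader may still hold it) */
        _Atomic size_t  count;     /* published element count */
        size_t capacity;
        long  *retired[32];        /* old arrays kept alive until teardown */
        int    retired_count;
    } table_t;

    int table_init(table_t *t, size_t cap) {
        long *a = calloc(cap, sizeof(long));
        if (!a) return -1;
        atomic_init(&t->items, a);
        atomic_init(&t->count, 0);
        t->capacity = cap;
        t->retired_count = 0;
        return 0;
    }

    /* Single writer: grow by copy, publish the new pointer, retire the old one. */
    int table_push(table_t *t, long v) {
        long *cur = atomic_load_explicit(&t->items, memory_order_relaxed);
        size_t n = atomic_load_explicit(&t->count, memory_order_relaxed);
        if (n == t->capacity) {
            if (t->retired_count >= 32) return -1;
            long *bigger = calloc(t->capacity * 2, sizeof(long));
            if (!bigger) return -1;
            memcpy(bigger, cur, t->capacity * sizeof(long));
            t->retired[t->retired_count++] = cur;  /* defer free: a reader may still use it */
            atomic_store_explicit(&t->items, bigger, memory_order_release);
            t->capacity *= 2;
            cur = bigger;
        }
        cur[n] = v;
        /* Release: the new slot becomes visible before the larger count does. */
        atomic_store_explicit(&t->count, n + 1, memory_order_release);
        return 0;
    }

    /* Concurrent reader (the GC mark callback plays this role in rperf):
     * acquire the count first, then the pointer; either pointer it observes
     * holds valid data for indexes [0, count). */
    long table_sum(table_t *t) {
        size_t n = atomic_load_explicit(&t->count, memory_order_acquire);
        long *cur = atomic_load_explicit(&t->items, memory_order_acquire);
        long s = 0;
        for (size_t i = 0; i < n; i++) s += cur[i];
        return s;
    }

Freeing the retired arrays only at shutdown trades a small amount of memory for never invalidating a pointer a reader might still be traversing, which mirrors the old_keys[] bookkeeping above.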
@@ -367,7 +442,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
367
442
  /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
368
443
 
369
444
  static uint32_t
370
- rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
445
+ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
371
446
  {
372
447
  uint32_t h = 2166136261u;
373
448
  int i;
@@ -377,18 +452,23 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
377
452
  }
378
453
  h ^= (uint32_t)thread_seq;
379
454
  h *= 16777619u;
455
+ h ^= (uint32_t)label_set_id;
456
+ h *= 16777619u;
380
457
  return h;
381
458
  }
382
459
 
383
- static void
460
+ static int
384
461
  rperf_agg_table_init(rperf_agg_table_t *at)
385
462
  {
386
463
  at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
387
464
  at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
465
+ if (!at->buckets) return -1;
388
466
  at->count = 0;
389
467
  at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
390
468
  at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
469
+ if (!at->stack_pool) { free(at->buckets); at->buckets = NULL; return -1; }
391
470
  at->stack_pool_count = 0;
471
+ return 0;
392
472
  }
393
473
 
394
474
  static void
@@ -404,6 +484,7 @@ rperf_agg_table_rehash(rperf_agg_table_t *at)
404
484
  {
405
485
  size_t new_cap = at->bucket_capacity * 2;
406
486
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
487
+ if (!new_buckets) return; /* keep using current buckets at higher load factor */
407
488
 
408
489
  size_t i;
409
490
  for (i = 0; i < at->bucket_capacity; i++) {
@@ -438,7 +519,8 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
438
519
  /* Insert or merge a stack into the aggregation table */
439
520
  static void
440
521
  rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
441
- int depth, int thread_seq, int64_t weight, uint32_t hash)
522
+ int depth, int thread_seq, int label_set_id,
523
+ int64_t weight, uint32_t hash)
442
524
  {
443
525
  size_t idx = hash % at->bucket_capacity;
444
526
 
@@ -446,6 +528,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
446
528
  rperf_agg_entry_t *e = &at->buckets[idx];
447
529
  if (!e->used) break;
448
530
  if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
531
+ e->label_set_id == label_set_id &&
449
532
  memcmp(at->stack_pool + e->frame_start, frame_ids,
450
533
  depth * sizeof(uint32_t)) == 0) {
451
534
  /* Match — merge weight */
@@ -462,6 +545,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
462
545
  e->frame_start = (uint32_t)at->stack_pool_count;
463
546
  e->depth = depth;
464
547
  e->thread_seq = thread_seq;
548
+ e->label_set_id = label_set_id;
465
549
  e->weight = weight;
466
550
  e->hash = hash;
467
551
  e->used = 1;
@@ -513,10 +597,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
513
597
  if (overflow) break; /* frame_table full, stop aggregating this buffer */
514
598
 
515
599
  int total_depth = off + s->depth;
516
- hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
600
+ hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
517
601
 
518
602
  rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
519
- s->thread_seq, s->weight, hash);
603
+ s->thread_seq, s->label_set_id, s->weight, hash);
520
604
  }
521
605
 
522
606
  /* Reset buffer for reuse.
@@ -535,10 +619,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
535
619
  static void
536
620
  rperf_try_aggregate(rperf_profiler_t *prof)
537
621
  {
538
- if (!prof->aggregate || !prof->swap_ready) return;
539
- int standby_idx = prof->active_idx ^ 1;
622
+ if (!prof->aggregate || !atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return;
623
+ int standby_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire) ^ 1;
540
624
  rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
541
- prof->swap_ready = 0;
625
+ atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
542
626
  }
543
627
 
544
628
  /* ---- Record a sample ---- */
@@ -547,25 +631,29 @@ static void
547
631
  rperf_try_swap(rperf_profiler_t *prof)
548
632
  {
549
633
  if (!prof->aggregate) return;
550
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
634
+ int idx = atomic_load_explicit(&prof->active_idx, memory_order_relaxed);
635
+ rperf_sample_buffer_t *buf = &prof->buffers[idx];
551
636
  if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
552
- if (prof->swap_ready) return; /* standby still being aggregated */
637
+ if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return; /* standby still being aggregated */
553
638
 
554
- /* Swap active buffer */
555
- prof->active_idx ^= 1;
556
- prof->swap_ready = 1;
639
+ /* Swap active buffer: release ensures buffer writes are visible to worker */
640
+ atomic_store_explicit(&prof->active_idx, idx ^ 1, memory_order_release);
557
641
 
558
- /* Wake worker thread */
642
+ /* Set swap_ready under mutex and signal, preventing lost wakeup:
643
+ * the worker checks swap_ready while holding the same mutex. */
644
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
645
+ atomic_store_explicit(&prof->swap_ready, 1, memory_order_release);
559
646
  CHECKED(pthread_cond_signal(&prof->worker_cond));
647
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
560
648
  }
561
649
 
562
- static void
563
- rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
564
- int64_t weight, int type, int thread_seq)
650
+ /* Write a sample into a specific buffer. No swap check. */
651
+ static int
652
+ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
653
+ int64_t weight, int type, int thread_seq, int label_set_id)
565
654
  {
566
- if (weight <= 0) return;
567
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
568
- if (rperf_ensure_sample_capacity(buf) < 0) return;
655
+ if (weight <= 0) return 0;
656
+ if (rperf_ensure_sample_capacity(buf) < 0) return -1;
569
657
 
570
658
  rperf_sample_t *sample = &buf->samples[buf->sample_count];
571
659
  sample->depth = depth;
@@ -573,8 +661,17 @@ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
573
661
  sample->weight = weight;
574
662
  sample->type = type;
575
663
  sample->thread_seq = thread_seq;
664
+ sample->label_set_id = label_set_id;
576
665
  buf->sample_count++;
666
+ return 0;
667
+ }
577
668
 
669
+ static void
670
+ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
671
+ int64_t weight, int type, int thread_seq, int label_set_id)
672
+ {
673
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
674
+ rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
578
675
  rperf_try_swap(prof);
579
676
  }
580
677
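The swap path above now sets swap_ready and signals while holding worker_mutex, and the worker (in a later hunk) re-checks the flag in a loop around pthread_cond_wait; together these close the lost-wakeup window and tolerate spurious wakeups. A self-contained sketch of that producer/consumer handshake, using illustrative names rather than the gem's:

    #include <pthread.h>
    #include <stdatomic.h>

    static pthread_mutex_t mtx  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static _Atomic int ready   = 0;
    static _Atomic int running = 1;

    /* Producer: flip the flag and signal under the mutex, so the consumer
     * cannot test the flag and block between our store and our signal. */
    static void publish(void) {
        pthread_mutex_lock(&mtx);
        atomic_store_explicit(&ready, 1, memory_order_release);
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&mtx);
    }

    /* Consumer: predicate loop around the wait; spurious wakeups are harmless. */
    static void *consumer(void *arg) {
        pthread_mutex_lock(&mtx);
        while (atomic_load_explicit(&running, memory_order_relaxed)) {
            while (atomic_load_explicit(&running, memory_order_relaxed) &&
                   !atomic_load_explicit(&ready, memory_order_acquire))
                pthread_cond_wait(&cond, &mtx);
            if (atomic_load_explicit(&ready, memory_order_acquire)) {
                /* ... drain the standby buffer here ... */
                atomic_store_explicit(&ready, 0, memory_order_release);
            }
        }
        pthread_mutex_unlock(&mtx);
        return arg;
    }

    int main(void) {
        pthread_t worker;
        pthread_create(&worker, NULL, consumer, NULL);
        publish();                    /* hand work to the worker */
        pthread_mutex_lock(&mtx);     /* shutdown follows the same locked pattern */
        atomic_store_explicit(&running, 0, memory_order_relaxed);
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&mtx);
        pthread_join(worker, NULL);
        return 0;
    }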
 
@@ -586,7 +683,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
586
683
  {
587
684
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
588
685
  if (!td) return NULL;
589
- td->prev_cpu_ns = rperf_current_time_ns(prof, td);
686
+ td->prev_time_ns = rperf_current_time_ns(prof, td);
590
687
  td->prev_wall_ns = rperf_wall_time_ns();
591
688
  td->thread_seq = ++prof->next_thread_seq;
592
689
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -596,12 +693,11 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
596
693
  /* ---- Thread event hooks ---- */
597
694
 
598
695
  static void
599
- rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
696
+ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
600
697
  {
601
698
  /* Has GVL — safe to call Ruby APIs */
602
699
  int64_t wall_now = rperf_wall_time_ns();
603
700
 
604
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
605
701
  int is_first = 0;
606
702
 
607
703
  if (td == NULL) {
@@ -614,7 +710,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
614
710
  if (time_now < 0) return;
615
711
 
616
712
  /* Capture backtrace into active buffer's frame_pool */
617
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
713
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
618
714
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
619
715
  size_t frame_start = buf->frame_pool_count;
620
716
  int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
@@ -624,34 +720,29 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
624
720
 
625
721
  /* Record normal sample (skip if first time — no prev_time) */
626
722
  if (!is_first) {
627
- int64_t weight = time_now - td->prev_cpu_ns;
628
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
723
+ int64_t weight = time_now - td->prev_time_ns;
724
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
629
725
  }
630
726
 
631
- /* Save stack and timestamp for READY/RESUMED */
727
+ /* Save timestamp for READY/RESUMED */
632
728
  td->suspended_at_ns = wall_now;
633
- td->suspended_frame_start = frame_start;
634
- td->suspended_frame_depth = depth;
635
- td->prev_cpu_ns = time_now;
729
+ td->prev_time_ns = time_now;
636
730
  td->prev_wall_ns = wall_now;
637
731
  }
638
732
 
639
733
  static void
640
- rperf_handle_ready(rperf_profiler_t *prof, VALUE thread)
734
+ rperf_handle_ready(rperf_thread_data_t *td)
641
735
  {
642
736
  /* May NOT have GVL — only simple C operations allowed */
643
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
644
737
  if (!td) return;
645
738
 
646
739
  td->ready_at_ns = rperf_wall_time_ns();
647
740
  }
648
741
 
649
742
  static void
650
- rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
743
+ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
651
744
  {
652
745
  /* Has GVL */
653
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
654
-
655
746
  if (td == NULL) {
656
747
  td = rperf_thread_data_create(prof, thread);
657
748
  if (!td) return;
@@ -659,36 +750,52 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
659
750
 
660
751
  int64_t wall_now = rperf_wall_time_ns();
661
752
 
662
- /* Record GVL blocked/wait samples (wall mode only) */
663
- if (prof->mode == 1 && td->suspended_frame_depth > 0) {
753
+ /* Record GVL blocked/wait samples (wall mode only).
754
+ * Capture backtrace here (not at SUSPENDED) so that frame_start always
755
+ * indexes into the current active buffer, avoiding mismatch after a
756
+ * double-buffer swap. The Ruby stack is unchanged while off-GVL.
757
+ *
758
+ * Both samples are written directly into the same buffer before calling
759
+ * rperf_try_swap, so that a swap triggered by the first sample cannot
760
+ * move the second into a different buffer with a stale frame_start. */
761
+ if (prof->mode == 1 && td->suspended_at_ns > 0) {
762
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
763
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
764
+ size_t frame_start = buf->frame_pool_count;
765
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
766
+ &buf->frame_pool[frame_start], NULL);
767
+ if (depth <= 0) goto skip_gvl;
768
+ buf->frame_pool_count += depth;
769
+
770
+ /* Write both samples into the same buf, then swap-check once */
664
771
  if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
665
772
  int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
666
- rperf_record_sample(prof, td->suspended_frame_start,
667
- td->suspended_frame_depth, blocked_ns,
668
- RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
773
+ rperf_write_sample(buf, frame_start, depth, blocked_ns,
774
+ RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
669
775
  }
670
776
  if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
671
777
  int64_t wait_ns = wall_now - td->ready_at_ns;
672
- rperf_record_sample(prof, td->suspended_frame_start,
673
- td->suspended_frame_depth, wait_ns,
674
- RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
778
+ rperf_write_sample(buf, frame_start, depth, wait_ns,
779
+ RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
675
780
  }
781
+
782
+ rperf_try_swap(prof);
676
783
  }
784
+ skip_gvl:
677
785
 
678
786
  /* Reset prev times to current — next timer sample measures from resume */
679
787
  int64_t time_now = rperf_current_time_ns(prof, td);
680
- if (time_now >= 0) td->prev_cpu_ns = time_now;
788
+ if (time_now >= 0) td->prev_time_ns = time_now;
681
789
  td->prev_wall_ns = wall_now;
682
790
 
683
791
  /* Clear suspended state */
684
- td->suspended_frame_depth = 0;
792
+ td->suspended_at_ns = 0;
685
793
  td->ready_at_ns = 0;
686
794
  }
687
795
 
688
796
  static void
689
- rperf_handle_exited(rperf_profiler_t *prof, VALUE thread)
797
+ rperf_handle_exited(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
690
798
  {
691
- rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
692
799
  if (td) {
693
800
  free(td);
694
801
  rb_internal_thread_specific_set(thread, prof->ts_key, NULL);
@@ -702,15 +809,16 @@ rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_da
702
809
  if (!prof->running) return;
703
810
 
704
811
  VALUE thread = data->thread;
812
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
705
813
 
706
814
  if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
707
- rperf_handle_suspended(prof, thread);
815
+ rperf_handle_suspended(prof, thread, td);
708
816
  else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
709
- rperf_handle_ready(prof, thread);
817
+ rperf_handle_ready(td);
710
818
  else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
711
- rperf_handle_resumed(prof, thread);
819
+ rperf_handle_resumed(prof, thread, td);
712
820
  else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
713
- rperf_handle_exited(prof, thread);
821
+ rperf_handle_exited(prof, thread, td);
714
822
  }
715
823
 
716
824
  /* ---- GC event hook ---- */
@@ -722,50 +830,53 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
722
830
  if (!prof->running) return;
723
831
 
724
832
  if (event & RUBY_INTERNAL_EVENT_GC_START) {
725
- prof->gc_phase = RPERF_GC_MARKING;
833
+ prof->gc.phase = RPERF_GC_MARKING;
726
834
  }
727
835
  else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
728
- prof->gc_phase = RPERF_GC_SWEEPING;
836
+ prof->gc.phase = RPERF_GC_SWEEPING;
729
837
  }
730
838
  else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
731
- prof->gc_phase = RPERF_GC_NONE;
839
+ prof->gc.phase = RPERF_GC_NONE;
732
840
  }
733
841
  else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
734
- /* Capture backtrace and timestamp at GC entry */
735
- prof->gc_enter_ns = rperf_wall_time_ns();
736
-
737
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
738
- if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
739
- size_t frame_start = buf->frame_pool_count;
740
- int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
741
- &buf->frame_pool[frame_start], NULL);
742
- if (depth <= 0) {
743
- prof->gc_frame_depth = 0;
744
- return;
745
- }
746
- buf->frame_pool_count += depth;
747
- prof->gc_frame_start = frame_start;
748
- prof->gc_frame_depth = depth;
749
-
750
- /* Save thread_seq for the GC_EXIT sample */
842
+ /* Save timestamp, thread_seq, and label_set_id; backtrace is captured at GC_EXIT
843
+ * to avoid buffer mismatch after a double-buffer swap. */
844
+ prof->gc.enter_ns = rperf_wall_time_ns();
751
845
  {
752
846
  VALUE thread = rb_thread_current();
753
847
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
754
- prof->gc_thread_seq = td ? td->thread_seq : 0;
848
+ prof->gc.thread_seq = td ? td->thread_seq : 0;
849
+ prof->gc.label_set_id = td ? td->label_set_id : 0;
755
850
  }
756
851
  }
757
852
  else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
758
- if (prof->gc_frame_depth <= 0) return;
853
+ if (prof->gc.enter_ns <= 0) return;
759
854
 
760
855
  int64_t wall_now = rperf_wall_time_ns();
761
- int64_t weight = wall_now - prof->gc_enter_ns;
762
- int type = (prof->gc_phase == RPERF_GC_SWEEPING)
856
+ int64_t weight = wall_now - prof->gc.enter_ns;
857
+ int type = (prof->gc.phase == RPERF_GC_SWEEPING)
763
858
  ? RPERF_SAMPLE_GC_SWEEPING
764
859
  : RPERF_SAMPLE_GC_MARKING;
765
860
 
766
- rperf_record_sample(prof, prof->gc_frame_start,
767
- prof->gc_frame_depth, weight, type, prof->gc_thread_seq);
768
- prof->gc_frame_depth = 0;
861
+ /* Capture backtrace here (not at GC_ENTER) so that frame_start
862
+ * always indexes into the current active buffer. The Ruby stack
863
+ * is unchanged during GC. */
864
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
865
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) {
866
+ prof->gc.enter_ns = 0;
867
+ return;
868
+ }
869
+ size_t frame_start = buf->frame_pool_count;
870
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
871
+ &buf->frame_pool[frame_start], NULL);
872
+ if (depth <= 0) {
873
+ prof->gc.enter_ns = 0;
874
+ return;
875
+ }
876
+ buf->frame_pool_count += depth;
877
+
878
+ rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
879
+ prof->gc.enter_ns = 0;
769
880
  }
770
881
  }
771
882
 
@@ -795,14 +906,14 @@ rperf_sample_job(void *arg)
795
906
  int64_t time_now = rperf_current_time_ns(prof, td);
796
907
  if (time_now < 0) return;
797
908
 
798
- int64_t weight = time_now - td->prev_cpu_ns;
799
- td->prev_cpu_ns = time_now;
909
+ int64_t weight = time_now - td->prev_time_ns;
910
+ td->prev_time_ns = time_now;
800
911
  td->prev_wall_ns = rperf_wall_time_ns();
801
912
 
802
913
  if (weight <= 0) return;
803
914
 
804
915
  /* Capture backtrace and record sample */
805
- rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
916
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
806
917
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
807
918
 
808
919
  size_t frame_start = buf->frame_pool_count;
@@ -811,11 +922,11 @@ rperf_sample_job(void *arg)
811
922
  if (depth <= 0) return;
812
923
  buf->frame_pool_count += depth;
813
924
 
814
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
925
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
815
926
 
816
927
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
817
- prof->sampling_count++;
818
- prof->sampling_total_ns +=
928
+ prof->stats.sampling_count++;
929
+ prof->stats.sampling_total_ns +=
819
930
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
820
931
  (ts_end.tv_nsec - ts_start.tv_nsec);
821
932
  }
@@ -826,7 +937,7 @@ rperf_sample_job(void *arg)
826
937
  static void
827
938
  rperf_signal_handler(int sig)
828
939
  {
829
- g_profiler.trigger_count++;
940
+ g_profiler.stats.trigger_count++;
830
941
  rb_postponed_job_trigger(g_profiler.pj_handle);
831
942
  }
832
943
 
@@ -845,7 +956,8 @@ rperf_worker_signal_func(void *arg)
845
956
  CHECKED(pthread_cond_signal(&prof->worker_cond));
846
957
 
847
958
  while (prof->running) {
848
- CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
959
+ while (prof->running && !atomic_load_explicit(&prof->swap_ready, memory_order_acquire))
960
+ CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
849
961
  rperf_try_aggregate(prof);
850
962
  }
851
963
  CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
@@ -874,9 +986,12 @@ rperf_worker_nanosleep_func(void *arg)
874
986
  CHECKED(pthread_mutex_lock(&prof->worker_mutex));
875
987
  while (prof->running) {
876
988
  int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
877
- assert(ret == 0 || ret == ETIMEDOUT);
989
+ if (ret != 0 && ret != ETIMEDOUT) {
990
+ fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
991
+ abort();
992
+ }
878
993
  if (ret == ETIMEDOUT) {
879
- prof->trigger_count++;
994
+ prof->stats.trigger_count++;
880
995
  rb_postponed_job_trigger(prof->pj_handle);
881
996
  /* Advance deadline by interval */
882
997
  deadline.tv_nsec += interval_ns;
@@ -900,66 +1015,117 @@ rperf_resolve_frame(VALUE fval)
900
1015
  VALUE label = rb_profile_frame_full_label(fval);
901
1016
 
902
1017
  if (NIL_P(path)) path = rb_str_new_lit("<C method>");
903
-
904
- if (NIL_P(path)) path = rb_str_new_cstr("");
905
1018
  if (NIL_P(label)) label = rb_str_new_cstr("");
906
1019
 
907
1020
  return rb_ary_new3(2, path, label);
908
1021
  }
909
1022
 
910
- /* ---- Ruby API ---- */
1023
+ /* ---- Shared helpers for stop/snapshot ---- */
911
1024
 
1025
+ /* Flush pending sample buffers into agg_table.
1026
+ * Caller must ensure no concurrent access (worker joined or mutex held). */
1027
+ static void
1028
+ rperf_flush_buffers(rperf_profiler_t *prof)
1029
+ {
1030
+ int cur_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire);
1031
+ if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) {
1032
+ int standby_idx = cur_idx ^ 1;
1033
+ rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
1034
+ atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
1035
+ }
1036
+ rperf_aggregate_buffer(prof, &prof->buffers[cur_idx]);
1037
+ }
1038
+
1039
+ /* Build result hash from aggregated data (agg_table + frame_table).
1040
+ * Does NOT free any resources. Caller must hold GVL. */
912
1041
  static VALUE
913
- rb_rperf_start(int argc, VALUE *argv, VALUE self)
1042
+ rperf_build_aggregated_result(rperf_profiler_t *prof)
914
1043
  {
915
- VALUE opts;
916
- int frequency = 1000;
917
- int mode = 0; /* 0 = cpu, 1 = wall */
918
- int aggregate = 1; /* default: aggregate */
919
- #if RPERF_USE_TIMER_SIGNAL
920
- int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
921
- #endif
1044
+ VALUE result, samples_ary;
1045
+ size_t i;
1046
+ int j;
922
1047
 
923
- rb_scan_args(argc, argv, ":", &opts);
924
- if (!NIL_P(opts)) {
925
- VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
926
- if (!NIL_P(vagg)) {
927
- aggregate = RTEST(vagg) ? 1 : 0;
928
- }
929
- VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
930
- if (!NIL_P(vfreq)) {
931
- frequency = NUM2INT(vfreq);
932
- if (frequency <= 0 || frequency > 1000000) {
933
- rb_raise(rb_eArgError, "frequency must be between 1 and 1000000");
934
- }
1048
+ result = rb_hash_new();
1049
+
1050
+ rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1051
+ ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
1052
+ rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
1053
+ rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
1054
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
1055
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
1056
+ rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
1057
+ rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1058
+ SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
1059
+ rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
1060
+ SIZET2NUM(prof->agg_table.count));
1061
+
1062
+ {
1063
+ struct timespec now_monotonic;
1064
+ int64_t start_ns, duration_ns;
1065
+ clock_gettime(CLOCK_MONOTONIC, &now_monotonic);
1066
+ start_ns = (int64_t)prof->start_realtime.tv_sec * 1000000000LL
1067
+ + (int64_t)prof->start_realtime.tv_nsec;
1068
+ duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
1069
+ + ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
1070
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1071
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1072
+ }
1073
+
1074
+ {
1075
+ rperf_frame_table_t *ft = &prof->frame_table;
1076
+ VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
1077
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
1078
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
1079
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
1080
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
1081
+ for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
1082
+ rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
935
1083
  }
936
- VALUE vmode = rb_hash_aref(opts, ID2SYM(rb_intern("mode")));
937
- if (!NIL_P(vmode)) {
938
- ID mode_id = SYM2ID(vmode);
939
- if (mode_id == rb_intern("cpu")) {
940
- mode = 0;
941
- } else if (mode_id == rb_intern("wall")) {
942
- mode = 1;
943
- } else {
944
- rb_raise(rb_eArgError, "mode must be :cpu or :wall");
1084
+
1085
+ rperf_agg_table_t *at = &prof->agg_table;
1086
+ samples_ary = rb_ary_new();
1087
+ for (i = 0; i < at->bucket_capacity; i++) {
1088
+ rperf_agg_entry_t *e = &at->buckets[i];
1089
+ if (!e->used) continue;
1090
+
1091
+ VALUE frames = rb_ary_new_capa(e->depth);
1092
+ for (j = 0; j < e->depth; j++) {
1093
+ uint32_t fid = at->stack_pool[e->frame_start + j];
1094
+ rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
945
1095
  }
1096
+
1097
+ VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
1098
+ rb_ary_push(samples_ary, sample);
946
1099
  }
1100
+ }
1101
+
1102
+ rb_hash_aset(result, ID2SYM(rb_intern("aggregated_samples")), samples_ary);
1103
+
1104
+ if (prof->label_sets != Qnil) {
1105
+ rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), prof->label_sets);
1106
+ }
1107
+
1108
+ return result;
1109
+ }
1110
+
1111
+ /* ---- Ruby API ---- */
1112
+
1113
+ /* _c_start(frequency, mode, aggregate, signal)
1114
+ * frequency: Integer (Hz)
1115
+ * mode: 0 = cpu, 1 = wall
1116
+ * aggregate: 0 or 1
1117
+ * signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
1118
+ */
1119
+ static VALUE
1120
+ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
1121
+ {
1122
+ int frequency = NUM2INT(vfreq);
1123
+ int mode = NUM2INT(vmode);
1124
+ int aggregate = RTEST(vagg) ? 1 : 0;
947
1125
  #if RPERF_USE_TIMER_SIGNAL
948
- VALUE vsig = rb_hash_aref(opts, ID2SYM(rb_intern("signal")));
949
- if (!NIL_P(vsig)) {
950
- if (RTEST(vsig)) {
951
- timer_signal = NUM2INT(vsig);
952
- if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
953
- rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
954
- SIGRTMIN, SIGRTMAX);
955
- }
956
- } else {
957
- /* signal: false or signal: 0 → use nanosleep thread */
958
- timer_signal = 0;
959
- }
960
- }
1126
+ int sig = NUM2INT(vsig);
1127
+ int timer_signal = (sig < 0) ? RPERF_TIMER_SIGNAL_DEFAULT : sig;
961
1128
  #endif
962
- }
963
1129
 
964
1130
  if (g_profiler.running) {
965
1131
  rb_raise(rb_eRuntimeError, "Rperf is already running");
@@ -969,11 +1135,12 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
969
1135
  g_profiler.mode = mode;
970
1136
  g_profiler.aggregate = aggregate;
971
1137
  g_profiler.next_thread_seq = 0;
972
- g_profiler.sampling_count = 0;
973
- g_profiler.sampling_total_ns = 0;
974
- g_profiler.trigger_count = 0;
975
- g_profiler.active_idx = 0;
976
- g_profiler.swap_ready = 0;
1138
+ g_profiler.stats.sampling_count = 0;
1139
+ g_profiler.stats.sampling_total_ns = 0;
1140
+ g_profiler.stats.trigger_count = 0;
1141
+ atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
1142
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1143
+ g_profiler.label_sets = Qnil;
977
1144
 
978
1145
  /* Initialize worker mutex/cond */
979
1146
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
@@ -994,13 +1161,26 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
994
1161
  }
995
1162
 
996
1163
  /* Initialize aggregation structures */
997
- rperf_frame_table_init(&g_profiler.frame_table);
998
- rperf_agg_table_init(&g_profiler.agg_table);
1164
+ if (rperf_frame_table_init(&g_profiler.frame_table) < 0) {
1165
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1166
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1167
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1168
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1169
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate frame table");
1170
+ }
1171
+ if (rperf_agg_table_init(&g_profiler.agg_table) < 0) {
1172
+ rperf_frame_table_free(&g_profiler.frame_table);
1173
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1174
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1175
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1176
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1177
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate aggregation table");
1178
+ }
999
1179
  }
1000
1180
 
1001
1181
  /* Register GC event hook */
1002
- g_profiler.gc_phase = RPERF_GC_NONE;
1003
- g_profiler.gc_frame_depth = 0;
1182
+ g_profiler.gc.phase = RPERF_GC_NONE;
1183
+ g_profiler.gc.enter_ns = 0;
1004
1184
  rb_add_event_hook(rperf_gc_event_hook,
1005
1185
  RUBY_INTERNAL_EVENT_GC_START |
1006
1186
  RUBY_INTERNAL_EVENT_GC_END_MARK |
@@ -1023,6 +1203,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1023
1203
  VALUE cur_thread = rb_thread_current();
1024
1204
  rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
1025
1205
  if (!td) {
1206
+ rb_remove_event_hook(rperf_gc_event_hook);
1026
1207
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1027
1208
  g_profiler.thread_hook = NULL;
1028
1209
  if (g_profiler.aggregate) {
@@ -1053,14 +1234,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1053
1234
  memset(&sa, 0, sizeof(sa));
1054
1235
  sa.sa_handler = rperf_signal_handler;
1055
1236
  sa.sa_flags = SA_RESTART;
1056
- sigaction(g_profiler.timer_signal, &sa, NULL);
1237
+ if (sigaction(g_profiler.timer_signal, &sa, &g_profiler.old_sigaction) != 0) {
1238
+ g_profiler.running = 0;
1239
+ goto timer_fail;
1240
+ }
1057
1241
 
1058
1242
  /* Start worker thread first to get its kernel TID */
1059
1243
  g_profiler.worker_tid = 0;
1060
1244
  if (pthread_create(&g_profiler.worker_thread, NULL,
1061
1245
  rperf_worker_signal_func, &g_profiler) != 0) {
1062
1246
  g_profiler.running = 0;
1063
- signal(g_profiler.timer_signal, SIG_DFL);
1247
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1064
1248
  goto timer_fail;
1065
1249
  }
1066
1250
 
@@ -1078,7 +1262,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1078
1262
  sev._sigev_un._tid = g_profiler.worker_tid;
1079
1263
  if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
1080
1264
  g_profiler.running = 0;
1081
- signal(g_profiler.timer_signal, SIG_DFL);
1265
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1082
1266
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1083
1267
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1084
1268
  goto timer_fail;
@@ -1087,7 +1271,14 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
1087
1271
  its.it_value.tv_sec = 0;
1088
1272
  its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
1089
1273
  its.it_interval = its.it_value;
1090
- timer_settime(g_profiler.timer_id, 0, &its, NULL);
1274
+ if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
1275
+ timer_delete(g_profiler.timer_id);
1276
+ g_profiler.running = 0;
1277
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1278
+ CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1279
+ CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1280
+ goto timer_fail;
1281
+ }
1091
1282
  } else
1092
1283
  #endif
1093
1284
  {
@@ -1109,6 +1300,7 @@ timer_fail:
1109
1300
  rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
1110
1301
  }
1111
1302
  }
1303
+ rb_remove_event_hook(rperf_gc_event_hook);
1112
1304
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1113
1305
  g_profiler.thread_hook = NULL;
1114
1306
  if (g_profiler.aggregate) {
@@ -1139,17 +1331,28 @@ rb_rperf_stop(VALUE self)
1139
1331
  g_profiler.running = 0;
1140
1332
  #if RPERF_USE_TIMER_SIGNAL
1141
1333
  if (g_profiler.timer_signal > 0) {
1334
+ /* Delete timer first to stop generating new signals.
1335
+ * Do NOT restore signal handler yet — the worker thread may still have
1336
+ * pending timer signals. rperf_signal_handler handles them harmlessly. */
1142
1337
  timer_delete(g_profiler.timer_id);
1143
- signal(g_profiler.timer_signal, SIG_IGN);
1144
1338
  }
1145
1339
  #endif
1146
1340
 
1147
- /* Wake and join worker thread */
1341
+ /* Wake and join worker thread.
1342
+ * Any pending timer signals are still handled by rperf_signal_handler
1343
+ * (just increments trigger_count + calls rb_postponed_job_trigger). */
1148
1344
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1149
1345
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1150
1346
  CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1151
1347
  CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
1152
1348
 
1349
+ #if RPERF_USE_TIMER_SIGNAL
1350
+ if (g_profiler.timer_signal > 0) {
1351
+ /* Worker thread is gone — safe to restore old signal handler now. */
1352
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1353
+ }
1354
+ #endif
1355
+
1153
1356
  if (g_profiler.thread_hook) {
1154
1357
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
1155
1358
  g_profiler.thread_hook = NULL;
@@ -1159,13 +1362,8 @@ rb_rperf_stop(VALUE self)
1159
1362
  rb_remove_event_hook(rperf_gc_event_hook);
1160
1363
 
1161
1364
  if (g_profiler.aggregate) {
1162
- /* Aggregate remaining samples from both buffers */
1163
- if (g_profiler.swap_ready) {
1164
- int standby_idx = g_profiler.active_idx ^ 1;
1165
- rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
1166
- g_profiler.swap_ready = 0;
1167
- }
1168
- rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
1365
+ /* Worker thread is joined; no concurrent access. */
1366
+ rperf_flush_buffers(&g_profiler);
1169
1367
  }
1170
1368
 
1171
1369
  /* Clean up thread-specific data for all live threads */
@@ -1183,72 +1381,8 @@ rb_rperf_stop(VALUE self)
1183
1381
  }
1184
1382
  }
1185
1383
 
1186
- /* Build result hash */
1187
- result = rb_hash_new();
1188
-
1189
- /* mode */
1190
- rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1191
- ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
1192
-
1193
- /* frequency */
1194
- rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1195
-
1196
- /* trigger_count, sampling_count, sampling_time_ns */
1197
- rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
1198
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
1199
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
1200
-
1201
- /* aggregation stats */
1202
- if (g_profiler.aggregate) {
1203
- rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1204
- SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
1205
- rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
1206
- SIZET2NUM(g_profiler.agg_table.count));
1207
- }
1208
-
1209
- /* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
1210
- {
1211
- struct timespec stop_monotonic;
1212
- int64_t start_ns, duration_ns;
1213
- clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
1214
- start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
1215
- + (int64_t)g_profiler.start_realtime.tv_nsec;
1216
- duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
1217
- + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
1218
- rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1219
- rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1220
- }
1221
-
1222
1384
  if (g_profiler.aggregate) {
1223
- /* Build samples from aggregation table.
1224
- * Use a Ruby array for resolved frames so GC protects them. */
1225
- rperf_frame_table_t *ft = &g_profiler.frame_table;
1226
- VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
1227
- /* Synthetic frames */
1228
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
1229
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
1230
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
1231
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
1232
- /* Real frames */
1233
- for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
1234
- rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
1235
- }
1236
-
1237
- rperf_agg_table_t *at = &g_profiler.agg_table;
1238
- samples_ary = rb_ary_new();
1239
- for (i = 0; i < at->bucket_capacity; i++) {
1240
- rperf_agg_entry_t *e = &at->buckets[i];
1241
- if (!e->used) continue;
1242
-
1243
- VALUE frames = rb_ary_new_capa(e->depth);
1244
- for (j = 0; j < e->depth; j++) {
1245
- uint32_t fid = at->stack_pool[e->frame_start + j];
1246
- rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
1247
- }
1248
-
1249
- VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
1250
- rb_ary_push(samples_ary, sample);
1251
- }
1385
+ result = rperf_build_aggregated_result(&g_profiler);
1252
1386
 
1253
1387
  rperf_sample_buffer_free(&g_profiler.buffers[1]);
1254
1388
  rperf_frame_table_free(&g_profiler.frame_table);
@@ -1256,6 +1390,27 @@ rb_rperf_stop(VALUE self)
1256
1390
  } else {
1257
1391
  /* Raw samples path (aggregate: false) */
1258
1392
  rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
1393
+
1394
+ result = rb_hash_new();
1395
+ rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1396
+ ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
1397
+ rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1398
+ rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
1399
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
1400
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
1401
+ rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
1402
+ {
1403
+ struct timespec stop_monotonic;
1404
+ int64_t start_ns, duration_ns;
1405
+ clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
1406
+ start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
1407
+ + (int64_t)g_profiler.start_realtime.tv_nsec;
1408
+ duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
1409
+ + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
1410
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1411
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1412
+ }
1413
+
1259
1414
  samples_ary = rb_ary_new_capa((long)buf->sample_count);
1260
1415
  for (i = 0; i < buf->sample_count; i++) {
1261
1416
  rperf_sample_t *s = &buf->samples[i];
@@ -1281,11 +1436,14 @@ rb_rperf_stop(VALUE self)
1281
1436
  rb_ary_push(frames, rperf_resolve_frame(fval));
1282
1437
  }
1283
1438
 
1284
- VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
1439
+ VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
1285
1440
  rb_ary_push(samples_ary, sample);
1286
1441
  }
1442
+ rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
1443
+ if (g_profiler.label_sets != Qnil) {
1444
+ rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), g_profiler.label_sets);
1445
+ }
1287
1446
  }
1288
- rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
1289
1447
 
1290
1448
  /* Cleanup */
1291
1449
  rperf_sample_buffer_free(&g_profiler.buffers[0]);
@@ -1293,6 +1451,113 @@ rb_rperf_stop(VALUE self)
1293
1451
  return result;
1294
1452
  }
1295
1453
 
1454
+ /* ---- Snapshot: read aggregated data without stopping ---- */
1455
+
1456
+ /* Clear aggregated data for the next interval.
1457
+ * Caller must hold GVL + worker_mutex.
1458
+ * Keeps allocations intact for reuse. Does NOT touch frame_table
1459
+ * (frame IDs must stay stable — dmark may be iterating keys outside GVL,
1460
+ * and existing threads reference frame IDs via their thread_data). */
1461
+ static void
1462
+ rperf_clear_aggregated_data(rperf_profiler_t *prof)
1463
+ {
1464
+ /* Clear agg_table entries (keep allocation) */
1465
+ memset(prof->agg_table.buckets, 0,
1466
+ prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t));
1467
+ prof->agg_table.count = 0;
1468
+ prof->agg_table.stack_pool_count = 0;
1469
+
1470
+ /* Reset stats */
1471
+ prof->stats.trigger_count = 0;
1472
+ prof->stats.sampling_count = 0;
1473
+ prof->stats.sampling_total_ns = 0;
1474
+
1475
+ /* Reset start timestamps so next snapshot's duration_ns covers
1476
+ * only the period since this clear. */
1477
+ clock_gettime(CLOCK_REALTIME, &prof->start_realtime);
1478
+ clock_gettime(CLOCK_MONOTONIC, &prof->start_monotonic);
1479
+ }
1480
+
1481
+ static VALUE
1482
+ rb_rperf_snapshot(VALUE self, VALUE vclear)
1483
+ {
1484
+ VALUE result;
1485
+
1486
+ if (!g_profiler.running) {
1487
+ return Qnil;
1488
+ }
1489
+
1490
+ if (!g_profiler.aggregate) {
1491
+ rb_raise(rb_eRuntimeError, "snapshot requires aggregate mode (aggregate: true)");
1492
+ }
1493
+
1494
+ /* GVL is held → no postponed jobs fire → no new samples written.
1495
+ * Lock worker_mutex to pause worker thread's aggregation. */
1496
+ CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
1497
+ rperf_flush_buffers(&g_profiler);
1498
+
1499
+ /* Build result while mutex is held. If clear is requested, we must
1500
+ * also clear under the same lock to avoid a window where the worker
1501
+ * could aggregate into the table between build and clear. */
1502
+ result = rperf_build_aggregated_result(&g_profiler);
1503
+
1504
+ if (RTEST(vclear)) {
1505
+ rperf_clear_aggregated_data(&g_profiler);
1506
+ }
1507
+
1508
+ CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
1509
+
1510
+ return result;
1511
+ }
1512
+
1513
+ /* ---- Label API ---- */
1514
+
1515
+ /* _c_set_label(label_set_id) — set current thread's label_set_id.
1516
+ * Called from Ruby with GVL held. */
1517
+ static VALUE
1518
+ rb_rperf_set_label(VALUE self, VALUE vid)
1519
+ {
1520
+ if (!g_profiler.running) return vid;
1521
+
1522
+ int label_set_id = NUM2INT(vid);
1523
+ VALUE thread = rb_thread_current();
1524
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
1525
+ if (td == NULL) {
1526
+ td = rperf_thread_data_create(&g_profiler, thread);
1527
+ if (!td) rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
1528
+ }
1529
+ td->label_set_id = label_set_id;
1530
+ return vid;
1531
+ }
1532
+
1533
+ /* _c_get_label() — get current thread's label_set_id.
1534
+ * Returns 0 if not profiling or thread not yet seen. */
1535
+ static VALUE
1536
+ rb_rperf_get_label(VALUE self)
1537
+ {
1538
+ if (!g_profiler.running) return INT2FIX(0);
1539
+
1540
+ VALUE thread = rb_thread_current();
1541
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
1542
+ if (td == NULL) return INT2FIX(0);
1543
+ return INT2NUM(td->label_set_id);
1544
+ }
1545
+
1546
+ /* _c_set_label_sets(ary) — store label_sets Ruby Array for result building */
1547
+ static VALUE
1548
+ rb_rperf_set_label_sets(VALUE self, VALUE ary)
1549
+ {
1550
+ g_profiler.label_sets = ary;
1551
+ return ary;
1552
+ }
1553
+
1554
+ /* _c_get_label_sets() — get label_sets Ruby Array */
1555
+ static VALUE
1556
+ rb_rperf_get_label_sets(VALUE self)
1557
+ {
1558
+ return g_profiler.label_sets;
1559
+ }
1560
+
1296
1561
  /* ---- Fork safety ---- */
1297
1562
 
1298
1563
  static void
@@ -1304,9 +1569,20 @@ rperf_after_fork_child(void)
1304
1569
  g_profiler.running = 0;
1305
1570
 
1306
1571
  #if RPERF_USE_TIMER_SIGNAL
1307
- /* timer_create timers are not inherited across fork; reset signal handler */
1572
+ /* timer_create timers are not inherited across fork, but pending signals may be.
1573
+ * Block the signal, drain any pending instances, then restore old handler. */
1308
1574
  if (g_profiler.timer_signal > 0) {
1309
- signal(g_profiler.timer_signal, SIG_DFL);
1575
+ sigset_t block_set, old_set;
1576
+ struct timespec zero_ts = {0, 0};
1577
+
1578
+ sigemptyset(&block_set);
1579
+ sigaddset(&block_set, g_profiler.timer_signal);
1580
+ pthread_sigmask(SIG_BLOCK, &block_set, &old_set);
1581
+
1582
+ while (sigtimedwait(&block_set, NULL, &zero_ts) > 0) {}
1583
+
1584
+ sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1585
+ pthread_sigmask(SIG_SETMASK, &old_set, NULL);
1310
1586
  }
1311
1587
  #endif
1312
1588
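The post-fork cleanup above blocks the timer signal, drains any instances queued before the fork with a zero-timeout sigtimedwait, and only then restores the saved handler and mask. A small standalone sketch of that drain-and-restore sequence; signo and old_act stand in for the profiler's timer_signal and old_sigaction and this is not the gem's code.

    #define _POSIX_C_SOURCE 200809L
    #include <signal.h>
    #include <time.h>

    /* Consume every pending instance of `signo` without running a handler,
     * then put back the previously saved disposition and signal mask. */
    void drain_and_restore(int signo, const struct sigaction *old_act) {
        sigset_t block_set, old_set;
        struct timespec zero = {0, 0};

        sigemptyset(&block_set);
        sigaddset(&block_set, signo);
        pthread_sigmask(SIG_BLOCK, &block_set, &old_set);

        while (sigtimedwait(&block_set, NULL, &zero) > 0) { }

        sigaction(signo, old_act, NULL);
        pthread_sigmask(SIG_SETMASK, &old_set, NULL);
    }

Draining before restoring keeps a queued real-time signal from being delivered to whatever handler or default action the parent process had installed.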
 
@@ -1326,12 +1602,13 @@ rperf_after_fork_child(void)
1326
1602
  }
1327
1603
 
1328
1604
  /* Reset GC state */
1329
- g_profiler.gc_phase = 0;
1605
+ g_profiler.gc.phase = 0;
1606
+ g_profiler.gc.enter_ns = 0;
1330
1607
 
1331
1608
  /* Reset stats */
1332
- g_profiler.sampling_count = 0;
1333
- g_profiler.sampling_total_ns = 0;
1334
- g_profiler.swap_ready = 0;
1609
+ g_profiler.stats.sampling_count = 0;
1610
+ g_profiler.stats.sampling_total_ns = 0;
1611
+ atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1335
1612
  }
1336
1613
 
1337
1614
  /* ---- Init ---- */
@@ -1340,10 +1617,16 @@ void
1340
1617
  Init_rperf(void)
1341
1618
  {
1342
1619
  VALUE mRperf = rb_define_module("Rperf");
1343
- rb_define_module_function(mRperf, "_c_start", rb_rperf_start, -1);
1620
+ rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
1344
1621
  rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
1622
+ rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
1623
+ rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
1624
+ rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
1625
+ rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
1626
+ rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
1345
1627
 
1346
1628
  memset(&g_profiler, 0, sizeof(g_profiler));
1629
+ g_profiler.label_sets = Qnil;
1347
1630
  g_profiler.pj_handle = rb_postponed_job_preregister(0, rperf_sample_job, &g_profiler);
1348
1631
  g_profiler.ts_key = rb_internal_thread_specific_key_create();
1349
1632