rperf 0.7.0 → 0.9.0

data/ext/rperf/rperf.c CHANGED
@@ -8,6 +8,7 @@
  #include <unistd.h>
  #include <signal.h>
  #include <stdatomic.h>
+ #include <sched.h>
  #ifdef __linux__
  #include <sys/syscall.h>
  #endif
@@ -24,8 +25,10 @@
  #ifdef __linux__
  #define RPERF_USE_TIMER_SIGNAL 1
  #define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
+ #define RPERF_COND_CLOCK CLOCK_MONOTONIC
  #else
  #define RPERF_USE_TIMER_SIGNAL 0
+ #define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
  #endif

  #define RPERF_MAX_STACK_DEPTH 512
@@ -38,21 +41,21 @@
  #define RPERF_STACK_POOL_INITIAL 4096
  #define RPERF_PAUSED(prof) ((prof)->profile_refcount == 0)

- /* Synthetic frame IDs (reserved in frame_table, 0-based) */
- #define RPERF_SYNTHETIC_GVL_BLOCKED 0
- #define RPERF_SYNTHETIC_GVL_WAIT 1
- #define RPERF_SYNTHETIC_GC_MARKING 2
- #define RPERF_SYNTHETIC_GC_SWEEPING 3
- #define RPERF_SYNTHETIC_COUNT 4
+ /* VM state values (stored in samples, not as stack frames) */
+ enum rperf_vm_state {
+ RPERF_VM_STATE_NORMAL = 0,
+ RPERF_VM_STATE_GVL_BLOCKED = 1,
+ RPERF_VM_STATE_GVL_WAIT = 2,
+ RPERF_VM_STATE_GC_MARKING = 3,
+ RPERF_VM_STATE_GC_SWEEPING = 4,
+ };

  /* ---- Data structures ---- */

- enum rperf_sample_type {
- RPERF_SAMPLE_NORMAL = 0,
- RPERF_SAMPLE_GVL_BLOCKED = 1, /* off-GVL: SUSPENDED → READY */
- RPERF_SAMPLE_GVL_WAIT = 2, /* GVL wait: READY → RESUMED */
- RPERF_SAMPLE_GC_MARKING = 3, /* GC marking phase */
- RPERF_SAMPLE_GC_SWEEPING = 4, /* GC sweeping phase */
+
+ enum rperf_mode {
+ RPERF_MODE_CPU = 0,
+ RPERF_MODE_WALL = 1,
  };

  enum rperf_gc_phase {
@@ -65,7 +68,7 @@ typedef struct rperf_sample {
  int depth;
  size_t frame_start; /* index into frame_pool */
  int64_t weight;
- int type; /* rperf_sample_type */
+ enum rperf_vm_state vm_state;
  int thread_seq; /* thread sequence number (1-based) */
  int label_set_id; /* label set ID (0 = no labels) */
  } rperf_sample_t;
@@ -87,7 +90,7 @@ typedef struct rperf_sample_buffer {

  typedef struct rperf_frame_table {
  _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
- size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
+ _Atomic(size_t) count; /* = next frame_id */
  size_t capacity;
  uint32_t *buckets; /* open addressing: stores index into keys[] */
  size_t bucket_capacity;
@@ -103,9 +106,10 @@ typedef struct rperf_frame_table {

  typedef struct rperf_agg_entry {
  uint32_t frame_start; /* offset into stack_pool */
- int depth; /* includes synthetic frame */
+ int depth;
  int thread_seq;
  int label_set_id; /* label set ID (0 = no labels) */
+ enum rperf_vm_state vm_state;
  int64_t weight; /* accumulated */
  uint32_t hash; /* cached hash value */
  int used; /* 0 = empty, 1 = used */
@@ -122,7 +126,6 @@ typedef struct rperf_agg_table {

  typedef struct rperf_thread_data {
  int64_t prev_time_ns;
- int64_t prev_wall_ns;
  /* GVL event tracking */
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
  int64_t ready_at_ns; /* wall time at READY */
@@ -145,11 +148,13 @@ typedef struct rperf_stats {
  size_t trigger_count;
  size_t sampling_count;
  int64_t sampling_total_ns;
+ size_t dropped_samples; /* samples lost due to allocation failure */
+ size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
  } rperf_stats_t;

  typedef struct rperf_profiler {
  int frequency;
- int mode; /* 0 = cpu, 1 = wall */
+ enum rperf_mode mode;
  _Atomic int running;
  pthread_t worker_thread; /* combined timer + aggregation */
  #if RPERF_USE_TIMER_SIGNAL
@@ -188,6 +193,7 @@ typedef struct rperf_profiler {
  * profile_inc/dec transitions 0↔1 arm/disarm the timer.
  * Modified only under GVL, so plain int is safe. */
  int profile_refcount;
+ int worker_paused; /* 1 when nanosleep worker is in paused cond_wait */
  } rperf_profiler_t;

  static rperf_profiler_t g_profiler;
@@ -218,21 +224,50 @@ rperf_profiler_mark(void *ptr)
  * If we see an old count, both old and new keys arrays have valid
  * data (old keys are kept alive in old_keys[]). */
  {
- size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
+ size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
  VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
  if (ft_keys && ft_count > 0) {
- rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
- ft_keys + ft_count);
+ rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
  }
  }
  }

+ static size_t
+ rperf_profiler_memsize(const void *ptr)
+ {
+ const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
+ size_t size = sizeof(rperf_profiler_t);
+ int i;
+
+ /* Double-buffered sample storage */
+ for (i = 0; i < 2; i++) {
+ const rperf_sample_buffer_t *buf = &prof->buffers[i];
+ size += buf->sample_capacity * sizeof(rperf_sample_t);
+ size += buf->frame_pool_capacity * sizeof(VALUE);
+ }
+
+ /* Frame table */
+ size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
+ size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
+ for (i = 0; i < prof->frame_table.old_keys_count; i++) {
+ /* old_keys entries are previous keys arrays; exact sizes unknown,
+ * but the pointer array itself is accounted for below. */
+ }
+ size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
+
+ /* Aggregation table */
+ size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
+ size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
+
+ return size;
+ }
+
  static const rb_data_type_t rperf_profiler_type = {
  .wrap_struct_name = "rperf_profiler",
  .function = {
  .dmark = rperf_profiler_mark,
  .dfree = NULL,
- .dsize = NULL,
+ .dsize = rperf_profiler_memsize,
  },
  };

@@ -259,9 +294,9 @@ rperf_wall_time_ns(void)
  /* ---- Get current thread's time based on profiler mode ---- */

  static int64_t
- rperf_current_time_ns(rperf_profiler_t *prof, rperf_thread_data_t *td)
+ rperf_current_time_ns(rperf_profiler_t *prof)
  {
- if (prof->mode == 0) {
+ if (prof->mode == RPERF_MODE_CPU) {
  return rperf_cpu_time_ns();
  } else {
  return rperf_wall_time_ns();
@@ -302,6 +337,7 @@ static int
  rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
  {
  if (buf->sample_count >= buf->sample_capacity) {
+ if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
  size_t new_cap = buf->sample_capacity * 2;
  rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
  buf->samples,
@@ -320,6 +356,7 @@ static int
  rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
  {
  while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
+ if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
  size_t new_cap = buf->frame_pool_capacity * 2;
  VALUE *new_pool = (VALUE *)realloc(
  buf->frame_pool,
@@ -340,7 +377,7 @@ rperf_frame_table_init(rperf_frame_table_t *ft)
  VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
  if (!keys) return -1;
  atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
- ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
+ ft->count = 0;
  ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
  ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
  if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
@@ -372,6 +409,7 @@ rperf_frame_table_free(rperf_frame_table_t *ft)
  static void
  rperf_frame_table_rehash(rperf_frame_table_t *ft)
  {
+ if (ft->bucket_capacity > SIZE_MAX / 2) return;
  size_t new_cap = ft->bucket_capacity * 2;
  uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
  if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -379,7 +417,7 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)

  VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
  size_t i;
- for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+ for (i = 0; i < ft->count; i++) {
  uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
  size_t idx = h % new_cap;
  while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
@@ -400,11 +438,13 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  uint32_t h = (uint32_t)(fval >> 3);
  size_t idx = h % ft->bucket_capacity;

+ size_t probes = 0;
  while (1) {
  uint32_t slot = ft->buckets[idx];
  if (slot == RPERF_FRAME_TABLE_EMPTY) break;
  if (keys[slot] == fval) return slot;
  idx = (idx + 1) % ft->bucket_capacity;
+ if (++probes >= ft->bucket_capacity) return RPERF_FRAME_TABLE_EMPTY; /* table full */
  }

  /* Insert new entry. Grow keys array if capacity is exhausted.
@@ -412,6 +452,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  * the old keys pointer. Instead, allocate new, copy, swap pointer
  * atomically, and keep old array alive until stop. */
  if (ft->count >= ft->capacity) {
+ if (ft->capacity > SIZE_MAX / 2) return RPERF_FRAME_TABLE_EMPTY;
  size_t new_cap = ft->capacity * 2;
  VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
  if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
@@ -434,7 +475,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  keys[frame_id] = fval;
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
  * so GC dmark never reads uninitialized keys[count-1]. */
- __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
+ atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
  ft->buckets[idx] = frame_id;

  /* Rehash if load factor > 0.7 */
@@ -448,7 +489,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
  /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */

  static uint32_t
- rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
+ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id, enum rperf_vm_state vm_state)
  {
  uint32_t h = 2166136261u;
  int i;
@@ -460,6 +501,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
  h *= 16777619u;
  h ^= (uint32_t)label_set_id;
  h *= 16777619u;
+ h ^= (uint32_t)vm_state;
+ h *= 16777619u;
  return h;
  }

@@ -488,6 +531,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
  static void
  rperf_agg_table_rehash(rperf_agg_table_t *at)
  {
+ if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
  size_t new_cap = at->bucket_capacity * 2;
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
  if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -512,6 +556,7 @@ static int
  rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
  {
  while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
+ if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
  size_t new_cap = at->stack_pool_capacity * 2;
  uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
  new_cap * sizeof(uint32_t));
@@ -522,36 +567,40 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
  return 0;
  }

- /* Insert or merge a stack into the aggregation table */
- static void
+ /* Insert or merge a stack into the aggregation table.
+ * Returns 0 on success, -1 on failure (table full or allocation failure). */
+ static int
  rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
  int depth, int thread_seq, int label_set_id,
- int64_t weight, uint32_t hash)
+ enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
  {
  size_t idx = hash % at->bucket_capacity;

+ size_t probes = 0;
  while (1) {
  rperf_agg_entry_t *e = &at->buckets[idx];
  if (!e->used) break;
  if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
- e->label_set_id == label_set_id &&
+ e->label_set_id == label_set_id && e->vm_state == vm_state &&
  memcmp(at->stack_pool + e->frame_start, frame_ids,
  depth * sizeof(uint32_t)) == 0) {
  /* Match — merge weight */
  e->weight += weight;
- return;
+ return 0;
  }
  idx = (idx + 1) % at->bucket_capacity;
+ if (++probes >= at->bucket_capacity) return -1; /* table full */
  }

  /* New entry — append frame_ids to stack_pool */
- if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
+ if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;

  rperf_agg_entry_t *e = &at->buckets[idx];
  e->frame_start = (uint32_t)at->stack_pool_count;
  e->depth = depth;
  e->thread_seq = thread_seq;
  e->label_set_id = label_set_id;
+ e->vm_state = vm_state;
  e->weight = weight;
  e->hash = hash;
  e->used = 1;
@@ -565,6 +614,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
  if (at->count * 10 > at->bucket_capacity * 7) {
  rperf_agg_table_rehash(at);
  }
+ return 0;
  }

  /* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
@@ -573,47 +623,46 @@ static void
  rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
  {
  size_t i;
- uint32_t temp_ids[RPERF_MAX_STACK_DEPTH + 1];
+ uint32_t temp_ids[RPERF_MAX_STACK_DEPTH];

  for (i = 0; i < buf->sample_count; i++) {
  rperf_sample_t *s = &buf->samples[i];
- int off = 0;
  uint32_t hash;
  int j;

- /* Prepend synthetic frame if needed */
- if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
- temp_ids[off++] = RPERF_SYNTHETIC_GVL_BLOCKED;
- } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
- temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
- } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
- temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
- } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
- temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
- }
+ /* Clamp depth to temp_ids[] capacity */
+ if (s->depth > RPERF_MAX_STACK_DEPTH)
+ s->depth = RPERF_MAX_STACK_DEPTH;

  /* Convert VALUE frames to frame_ids */
  int overflow = 0;
  for (j = 0; j < s->depth; j++) {
+ if (s->frame_start + j >= buf->frame_pool_count) break;
  VALUE fval = buf->frame_pool[s->frame_start + j];
  uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
  if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
- temp_ids[off + j] = fid;
+ temp_ids[j] = fid;
+ }
+ if (overflow) {
+ /* frame_table full — count remaining samples as dropped */
+ prof->stats.dropped_aggregation += buf->sample_count - i;
+ break;
  }
- if (overflow) break; /* frame_table full, stop aggregating this buffer */

- int total_depth = off + s->depth;
- hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
+ hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);

- rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
- s->thread_seq, s->label_set_id, s->weight, hash);
+ if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
+ s->thread_seq, s->label_set_id, s->vm_state,
+ s->weight, hash) < 0) {
+ prof->stats.dropped_aggregation++;
+ }
  }

  /* Reset buffer for reuse.
  * Release fence: ensure all frame_table inserts are visible (to GC dmark)
  * before frame_pool_count is cleared, so dmark always has at least one
  * source (frame_table or frame_pool) covering each VALUE. */
- __atomic_thread_fence(__ATOMIC_RELEASE);
+ atomic_thread_fence(memory_order_release);
  buf->sample_count = 0;
  buf->frame_pool_count = 0;
  }
@@ -656,7 +705,7 @@ rperf_try_swap(rperf_profiler_t *prof)
  /* Write a sample into a specific buffer. No swap check. */
  static int
  rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
- int64_t weight, int type, int thread_seq, int label_set_id)
+ int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
  {
  if (weight <= 0) return 0;
  if (rperf_ensure_sample_capacity(buf) < 0) return -1;
@@ -665,7 +714,7 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
  sample->depth = depth;
  sample->frame_start = frame_start;
  sample->weight = weight;
- sample->type = type;
+ sample->vm_state = vm_state;
  sample->thread_seq = thread_seq;
  sample->label_set_id = label_set_id;
  buf->sample_count++;
@@ -674,10 +723,11 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,

  static void
  rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
- int64_t weight, int type, int thread_seq, int label_set_id)
+ int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
  {
  rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
- rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
+ if (rperf_write_sample(buf, frame_start, depth, weight, vm_state, thread_seq, label_set_id) < 0)
+ prof->stats.dropped_samples++;
  rperf_try_swap(prof);
  }

@@ -689,8 +739,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
  {
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
  if (!td) return NULL;
- td->prev_time_ns = rperf_current_time_ns(prof, td);
- td->prev_wall_ns = rperf_wall_time_ns();
+ int64_t t = rperf_current_time_ns(prof);
+ if (t < 0) { free(td); return NULL; }
+ td->prev_time_ns = t;
  td->thread_seq = ++prof->next_thread_seq;
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
  return td;
@@ -712,7 +763,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
  is_first = 1;
  }

- int64_t time_now = rperf_current_time_ns(prof, td);
+ int64_t time_now = rperf_current_time_ns(prof);
  if (time_now < 0) return;

  /* Capture backtrace into active buffer's frame_pool */
@@ -727,13 +778,12 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
  /* Record normal sample (skip if first time — no prev_time, or if paused) */
  if (!is_first && !RPERF_PAUSED(prof)) {
  int64_t weight = time_now - td->prev_time_ns;
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
  }

  /* Save timestamp for READY/RESUMED */
  td->suspended_at_ns = wall_now;
  td->prev_time_ns = time_now;
- td->prev_wall_ns = wall_now;
  }

  static void
@@ -764,7 +814,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
  * Both samples are written directly into the same buffer before calling
  * rperf_try_swap, so that a swap triggered by the first sample cannot
  * move the second into a different buffer with a stale frame_start. */
- if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
+ if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
  rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
  size_t frame_start = buf->frame_pool_count;
@@ -776,13 +826,15 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
  /* Write both samples into the same buf, then swap-check once */
  if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
  int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
- rperf_write_sample(buf, frame_start, depth, blocked_ns,
- RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
+ if (rperf_write_sample(buf, frame_start, depth, blocked_ns,
+ RPERF_VM_STATE_GVL_BLOCKED, td->thread_seq, td->label_set_id) < 0)
+ prof->stats.dropped_samples++;
  }
  if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
  int64_t wait_ns = wall_now - td->ready_at_ns;
- rperf_write_sample(buf, frame_start, depth, wait_ns,
- RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
+ if (rperf_write_sample(buf, frame_start, depth, wait_ns,
+ RPERF_VM_STATE_GVL_WAIT, td->thread_seq, td->label_set_id) < 0)
+ prof->stats.dropped_samples++;
  }

  rperf_try_swap(prof);
@@ -790,9 +842,8 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
  skip_gvl:

  /* Reset prev times to current — next timer sample measures from resume */
- int64_t time_now = rperf_current_time_ns(prof, td);
+ int64_t time_now = rperf_current_time_ns(prof);
  if (time_now >= 0) td->prev_time_ns = time_now;
- td->prev_wall_ns = wall_now;

  /* Clear suspended state */
  td->suspended_at_ns = 0;
@@ -861,9 +912,9 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE

  int64_t wall_now = rperf_wall_time_ns();
  int64_t weight = wall_now - prof->gc.enter_ns;
- int type = (prof->gc.phase == RPERF_GC_SWEEPING)
- ? RPERF_SAMPLE_GC_SWEEPING
- : RPERF_SAMPLE_GC_MARKING;
+ enum rperf_vm_state vm_state = (prof->gc.phase == RPERF_GC_SWEEPING)
+ ? RPERF_VM_STATE_GC_SWEEPING
+ : RPERF_VM_STATE_GC_MARKING;

  /* Capture backtrace here (not at GC_ENTER) so that frame_start
  * always indexes into the current active buffer. The Ruby stack
@@ -882,24 +933,22 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
  }
  buf->frame_pool_count += depth;

- rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
+ rperf_record_sample(prof, frame_start, depth, weight, vm_state, prof->gc.thread_seq, prof->gc.label_set_id);
  prof->gc.enter_ns = 0;
  }
  }

  /* ---- Sampling callback (postponed job) — current thread only ---- */

- static void
- rperf_sample_job(void *arg)
+ /* Core sampling logic, parameterized by mode constant.
+ * Called from rperf_sample_cpu/rperf_sample_wall so the compiler
+ * can inline and eliminate mode branches at compile time. */
+ static inline void
+ rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
  {
- rperf_profiler_t *prof = (rperf_profiler_t *)arg;
-
- if (!prof->running) return;
- if (RPERF_PAUSED(prof)) return;
-
- /* Measure sampling overhead */
+ /* Measure sampling overhead (wall time — runs under GVL, no I/O) */
  struct timespec ts_start, ts_end;
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_start);
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);

  VALUE thread = rb_thread_current();

@@ -911,12 +960,11 @@ rperf_sample_job(void *arg)
  return; /* Skip first sample for this thread */
  }

- int64_t time_now = rperf_current_time_ns(prof, td);
+ int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
  if (time_now < 0) return;

  int64_t weight = time_now - td->prev_time_ns;
  td->prev_time_ns = time_now;
- td->prev_wall_ns = rperf_wall_time_ns();

  if (weight <= 0) return;

@@ -930,15 +978,35 @@ rperf_sample_job(void *arg)
  if (depth <= 0) return;
  buf->frame_pool_count += depth;

- rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);

- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
+ clock_gettime(CLOCK_MONOTONIC, &ts_end);
  prof->stats.sampling_count++;
  prof->stats.sampling_total_ns +=
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
  (ts_end.tv_nsec - ts_start.tv_nsec);
  }

+ static void
+ rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
+
+ static void
+ rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
+
+ static void
+ rperf_sample_job(void *arg)
+ {
+ rperf_profiler_t *prof = (rperf_profiler_t *)arg;
+
+ if (!prof->running) return;
+ if (RPERF_PAUSED(prof)) return;
+
+ if (prof->mode == RPERF_MODE_CPU)
+ rperf_sample_cpu(prof);
+ else
+ rperf_sample_wall(prof);
+ }
+
  /* ---- Worker thread: timer + aggregation ---- */

  #if RPERF_USE_TIMER_SIGNAL
@@ -984,7 +1052,7 @@ rperf_worker_nanosleep_func(void *arg)
  struct timespec deadline;
  long interval_ns = 1000000000L / prof->frequency;

- clock_gettime(CLOCK_REALTIME, &deadline);
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
  deadline.tv_nsec += interval_ns;
  if (deadline.tv_nsec >= 1000000000L) {
  deadline.tv_sec++;
@@ -994,10 +1062,12 @@ rperf_worker_nanosleep_func(void *arg)
  CHECKED(pthread_mutex_lock(&prof->worker_mutex));
  while (prof->running) {
  if (RPERF_PAUSED(prof)) {
- /* Paused: wait indefinitely until signaled (resume or stop) */
+ /* Paused: mark as paused so disarm can confirm, then wait */
+ prof->worker_paused = 1;
  CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+ prof->worker_paused = 0;
  /* Reset deadline on wake to avoid burst of catch-up triggers */
- clock_gettime(CLOCK_REALTIME, &deadline);
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
  deadline.tv_nsec += interval_ns;
  if (deadline.tv_nsec >= 1000000000L) {
  deadline.tv_sec++;
@@ -1068,14 +1138,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
  result = rb_hash_new();

  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
- ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
+ ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+ if (prof->stats.dropped_samples > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
+ if (prof->stats.dropped_aggregation > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
  rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
- SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
+ SIZET2NUM(prof->frame_table.count));
  rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
  SIZET2NUM(prof->agg_table.count));

@@ -1094,11 +1168,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
  {
  rperf_frame_table_t *ft = &prof->frame_table;
  VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
- rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
- for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+ for (i = 0; i < ft->count; i++) {
  rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
  }

@@ -1110,11 +1180,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)

  VALUE frames = rb_ary_new_capa(e->depth);
  for (j = 0; j < e->depth; j++) {
+ if (e->frame_start + j >= at->stack_pool_count) break;
  uint32_t fid = at->stack_pool[e->frame_start + j];
+ if (fid >= ft->count) break;
  rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
  }

- VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
+ VALUE sample = rb_ary_new_capa(5);
+ rb_ary_push(sample, frames);
+ rb_ary_push(sample, LONG2NUM(e->weight));
+ rb_ary_push(sample, INT2NUM(e->thread_seq));
+ rb_ary_push(sample, INT2NUM(e->label_set_id));
+ rb_ary_push(sample, INT2NUM(e->vm_state));
  rb_ary_push(samples_ary, sample);
  }
  }
@@ -1141,7 +1218,7 @@ static VALUE
  rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
  {
  int frequency = NUM2INT(vfreq);
- int mode = NUM2INT(vmode);
+ enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
  int aggregate = RTEST(vagg) ? 1 : 0;
  #if RPERF_USE_TIMER_SIGNAL
  int sig = NUM2INT(vsig);
@@ -1159,13 +1236,27 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
  g_profiler.stats.sampling_count = 0;
  g_profiler.stats.sampling_total_ns = 0;
  g_profiler.stats.trigger_count = 0;
+ g_profiler.stats.dropped_samples = 0;
+ g_profiler.stats.dropped_aggregation = 0;
  atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
  atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
  g_profiler.label_sets = Qnil;

  /* Initialize worker mutex/cond */
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
+ #ifdef __linux__
+ {
+ /* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
+ * system clock adjustments (NTP etc.) don't affect timer intervals. */
+ pthread_condattr_t cond_attr;
+ CHECKED(pthread_condattr_init(&cond_attr));
+ CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
+ CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
+ CHECKED(pthread_condattr_destroy(&cond_attr));
+ }
+ #else
  CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
+ #endif

  /* Initialize sample buffer(s) */
  if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
@@ -1244,6 +1335,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL

  g_profiler.running = 1;
  g_profiler.profile_refcount = RTEST(vdefer) ? 0 : 1;
+ g_profiler.worker_paused = 0;

  #if RPERF_USE_TIMER_SIGNAL
  g_profiler.timer_signal = timer_signal;
@@ -1347,9 +1439,7 @@ timer_fail:
  static VALUE
  rb_rperf_stop(VALUE self)
  {
- VALUE result, samples_ary;
- size_t i;
- int j;
+ VALUE result;

  if (!g_profiler.running) {
  return Qnil;
@@ -1416,15 +1506,22 @@ rb_rperf_stop(VALUE self)
  rperf_agg_table_free(&g_profiler.agg_table);
  } else {
  /* Raw samples path (aggregate: false) */
+ VALUE samples_ary;
+ size_t i;
+ int j;
  rperf_sample_buffer_t *buf = &g_profiler.buffers[0];

  result = rb_hash_new();
  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
- ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
+ ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+ if (g_profiler.stats.dropped_samples > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
+ if (g_profiler.stats.dropped_aggregation > 0)
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
  {
  struct timespec stop_monotonic;
@@ -1441,29 +1538,20 @@ rb_rperf_stop(VALUE self)
  samples_ary = rb_ary_new_capa((long)buf->sample_count);
  for (i = 0; i < buf->sample_count; i++) {
  rperf_sample_t *s = &buf->samples[i];
- VALUE frames = rb_ary_new_capa(s->depth + 1);
-
- /* Prepend synthetic frame at leaf position (index 0) */
- if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
- rb_ary_push(frames, syn);
- } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
- rb_ary_push(frames, syn);
- } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
- rb_ary_push(frames, syn);
- } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
- rb_ary_push(frames, syn);
- }
+ VALUE frames = rb_ary_new_capa(s->depth);

  for (j = 0; j < s->depth; j++) {
+ if (s->frame_start + j >= buf->frame_pool_count) break;
  VALUE fval = buf->frame_pool[s->frame_start + j];
  rb_ary_push(frames, rperf_resolve_frame(fval));
  }

- VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
+ VALUE sample = rb_ary_new_capa(5);
+ rb_ary_push(sample, frames);
+ rb_ary_push(sample, LONG2NUM(s->weight));
+ rb_ary_push(sample, INT2NUM(s->thread_seq));
+ rb_ary_push(sample, INT2NUM(s->label_set_id));
+ rb_ary_push(sample, INT2NUM(s->vm_state));
  rb_ary_push(samples_ary, sample);
  }
  rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
@@ -1498,6 +1586,8 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
  prof->stats.trigger_count = 0;
  prof->stats.sampling_count = 0;
  prof->stats.sampling_total_ns = 0;
+ prof->stats.dropped_samples = 0;
+ prof->stats.dropped_aggregation = 0;

  /* Reset start timestamps so next snapshot's duration_ns covers
  * only the period since this clear. */
@@ -1619,7 +1709,15 @@ rperf_disarm_timer(rperf_profiler_t *prof)
  return;
  }
  #endif
- /* nanosleep mode: worker will see RPERF_PAUSED on next iteration */
+ /* nanosleep mode: wake the worker and wait until it enters paused state */
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+ while (!prof->worker_paused) {
+ CHECKED(pthread_cond_signal(&prof->worker_cond));
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+ sched_yield();
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+ }
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
  }

  /* Helper: reset prev_time_ns for all threads (called on resume to avoid
@@ -1633,8 +1731,7 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
  VALUE thread = RARRAY_AREF(threads, i);
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
  if (td) {
- td->prev_time_ns = rperf_current_time_ns(prof, td);
- td->prev_wall_ns = rperf_wall_time_ns();
+ td->prev_time_ns = rperf_current_time_ns(prof);
  }
  }
  }
@@ -1659,6 +1756,7 @@ static VALUE
  rb_rperf_profile_dec(VALUE self)
  {
  if (!g_profiler.running) return Qfalse;
+ if (g_profiler.profile_refcount <= 0) return Qfalse;
  g_profiler.profile_refcount--;
  if (g_profiler.profile_refcount == 0) {
  rperf_disarm_timer(&g_profiler);
@@ -1673,6 +1771,12 @@ rb_rperf_running_p(VALUE self)
  return g_profiler.running ? Qtrue : Qfalse;
  }

+ static VALUE
+ rb_rperf_profiler_wrapper(VALUE self)
+ {
+ return g_profiler_wrapper;
+ }
+
  /* ---- Fork safety ---- */

  static void
@@ -1683,6 +1787,14 @@ rperf_after_fork_child(void)
  /* Mark as not running — timer doesn't exist in child */
  g_profiler.running = 0;

+ /* Re-initialize mutex/condvar — they may have been locked by the parent's
+ * worker thread at fork time and are in an undefined state in the child.
+ * POSIX says only async-signal-safe functions should be called in atfork
+ * child handlers, but pthread_mutex_init is safe on Linux/glibc/musl and
+ * this is the standard pattern (e.g., Python, Go do the same). */
+ pthread_mutex_init(&g_profiler.worker_mutex, NULL);
+ pthread_cond_init(&g_profiler.worker_cond, NULL);
+
  #if RPERF_USE_TIMER_SIGNAL
  /* timer_create timers are not inherited across fork, but pending signals may be.
  * Block the signal, drain any pending instances, then restore old handler. */
@@ -1723,6 +1835,7 @@ rperf_after_fork_child(void)
  /* Reset stats */
  g_profiler.stats.sampling_count = 0;
  g_profiler.stats.sampling_total_ns = 0;
+ g_profiler.stats.dropped_samples = 0;
  g_profiler.profile_refcount = 0;
  atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
  }
@@ -1743,6 +1856,7 @@ Init_rperf(void)
  rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
  rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
  rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
+ rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);

  memset(&g_profiler, 0, sizeof(g_profiler));
  g_profiler.label_sets = Qnil;
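
The worker-timer change in this release (the new RPERF_COND_CLOCK define and the pthread_condattr_setclock block in rb_rperf_start) comes down to one portable pattern: time the nanosleep worker's condition-variable wait against CLOCK_MONOTONIC on Linux, so NTP or manual clock adjustments cannot stretch or shrink the sampling interval, and fall back to CLOCK_REALTIME where pthread_condattr_setclock is unavailable (macOS, per the comment in the diff). The sketch below isolates that pattern outside the rperf structs; WAIT_CLOCK, cond_init_with_clock, and timed_tick are illustrative names, not part of the gem.

#include <errno.h>
#include <pthread.h>
#include <time.h>

/* Clock the condvar wait is timed against. Linux can bind the condvar to
 * CLOCK_MONOTONIC; macOS has no pthread_condattr_setclock, so the wait
 * falls back to CLOCK_REALTIME and is subject to wall-clock adjustments. */
#ifdef __linux__
#define WAIT_CLOCK CLOCK_MONOTONIC
#else
#define WAIT_CLOCK CLOCK_REALTIME
#endif

static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv;

static void
cond_init_with_clock(void)
{
#ifdef __linux__
    pthread_condattr_t attr;
    pthread_condattr_init(&attr);
    pthread_condattr_setclock(&attr, WAIT_CLOCK);
    pthread_cond_init(&cv, &attr);
    pthread_condattr_destroy(&attr);
#else
    pthread_cond_init(&cv, NULL);
#endif
}

/* Sleep for interval_ns unless the condvar is signalled first.
 * Returns 1 if the interval elapsed, 0 if woken early. */
static int
timed_tick(long interval_ns)
{
    struct timespec deadline;

    /* Absolute deadline on WAIT_CLOCK, the same clock the condvar was
     * initialized with; mixing clocks would make the timeout meaningless. */
    clock_gettime(WAIT_CLOCK, &deadline);
    deadline.tv_nsec += interval_ns;
    if (deadline.tv_nsec >= 1000000000L) {
        deadline.tv_sec++;
        deadline.tv_nsec -= 1000000000L;
    }

    pthread_mutex_lock(&mu);
    int rc = pthread_cond_timedwait(&cv, &mu, &deadline);
    pthread_mutex_unlock(&mu);
    return rc == ETIMEDOUT;
}

int main(void)
{
    cond_init_with_clock();
    /* Three ~10 ms ticks, i.e. 1000000000L / frequency with frequency = 100,
     * mirroring how rperf_worker_nanosleep_func derives its interval. */
    for (int i = 0; i < 3; i++)
        timed_tick(1000000000L / 100);
    return 0;
}

The essential point, which RPERF_COND_CLOCK encodes in the diff, is that the clock passed to clock_gettime when computing the absolute deadline must match the clock the condition variable was initialized with.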