rperf 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/rperf/rperf.c CHANGED
@@ -25,8 +25,10 @@
25
25
  #ifdef __linux__
26
26
  #define RPERF_USE_TIMER_SIGNAL 1
27
27
  #define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
28
+ #define RPERF_COND_CLOCK CLOCK_MONOTONIC
28
29
  #else
29
30
  #define RPERF_USE_TIMER_SIGNAL 0
31
+ #define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
30
32
  #endif
31
33
 
32
34
  #define RPERF_MAX_STACK_DEPTH 512
@@ -51,6 +53,11 @@ enum rperf_vm_state {
51
53
  /* ---- Data structures ---- */
52
54
 
53
55
 
56
+ enum rperf_mode {
57
+ RPERF_MODE_CPU = 0,
58
+ RPERF_MODE_WALL = 1,
59
+ };
60
+
54
61
  enum rperf_gc_phase {
55
62
  RPERF_GC_NONE = 0,
56
63
  RPERF_GC_MARKING = 1,
@@ -73,7 +80,11 @@ typedef struct rperf_sample_buffer {
73
80
  size_t sample_count;
74
81
  size_t sample_capacity;
75
82
  VALUE *frame_pool;
76
- size_t frame_pool_count;
83
+ /* _Atomic: read by GC dmark concurrently with the aggregator's clear.
84
+ * Seq-cst accesses pair with the frame_table count release-stores so
85
+ * dmark never observes the cleared pool together with a stale
86
+ * frame_table count (which would leave frames unmarked). */
87
+ _Atomic size_t frame_pool_count;
77
88
  size_t frame_pool_capacity;
78
89
  } rperf_sample_buffer_t;
79
90
 
@@ -83,7 +94,7 @@ typedef struct rperf_sample_buffer {
83
94
 
84
95
  typedef struct rperf_frame_table {
85
96
  _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
86
- size_t count; /* = next frame_id */
97
+ _Atomic(size_t) count; /* = next frame_id */
87
98
  size_t capacity;
88
99
  uint32_t *buckets; /* open addressing: stores index into keys[] */
89
100
  size_t bucket_capacity;
@@ -95,8 +106,6 @@ typedef struct rperf_frame_table {
95
106
 
96
107
  /* ---- Aggregation table: stack → weight ---- */
97
108
 
98
- #define RPERF_AGG_ENTRY_EMPTY 0
99
-
100
109
  typedef struct rperf_agg_entry {
101
110
  uint32_t frame_start; /* offset into stack_pool */
102
111
  int depth;
@@ -119,7 +128,6 @@ typedef struct rperf_agg_table {
119
128
 
120
129
  typedef struct rperf_thread_data {
121
130
  int64_t prev_time_ns;
122
- int64_t prev_wall_ns;
123
131
  /* GVL event tracking */
124
132
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
125
133
  int64_t ready_at_ns; /* wall time at READY */
@@ -139,15 +147,19 @@ typedef struct rperf_gc_state {
139
147
  /* ---- Sampling overhead stats ---- */
140
148
 
141
149
  typedef struct rperf_stats {
142
- size_t trigger_count;
150
+ /* _Atomic: incremented by the signal handler / nanosleep worker, read and
151
+ * cleared by snapshot while running (atomic size_t is async-signal-safe
152
+ * when lock-free, which it is on all supported platforms). */
153
+ _Atomic size_t trigger_count;
143
154
  size_t sampling_count;
144
155
  int64_t sampling_total_ns;
145
156
  size_t dropped_samples; /* samples lost due to allocation failure */
157
+ size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
146
158
  } rperf_stats_t;
147
159
 
148
160
  typedef struct rperf_profiler {
149
161
  int frequency;
150
- int mode; /* 0 = cpu, 1 = wall */
162
+ enum rperf_mode mode;
151
163
  _Atomic int running;
152
164
  pthread_t worker_thread; /* combined timer + aggregation */
153
165
  #if RPERF_USE_TIMER_SIGNAL
@@ -199,12 +211,17 @@ rperf_profiler_mark(void *ptr)
199
211
  {
200
212
  rperf_profiler_t *prof = (rperf_profiler_t *)ptr;
201
213
  int i;
202
- /* Mark both sample buffers' frame_pools */
214
+ /* Mark both sample buffers' frame_pools.
215
+ * Load the count once: the aggregator may clear it concurrently, and the
216
+ * pools must be read BEFORE frame_table.count below — seeing the cleared
217
+ * count (seq-cst) guarantees the corresponding frame_table inserts are
218
+ * visible, so every frame is covered by at least one mark source. */
203
219
  for (i = 0; i < 2; i++) {
204
220
  rperf_sample_buffer_t *buf = &prof->buffers[i];
205
- if (buf->frame_pool && buf->frame_pool_count > 0) {
221
+ size_t fp_count = buf->frame_pool_count;
222
+ if (buf->frame_pool && fp_count > 0) {
206
223
  rb_gc_mark_locations(buf->frame_pool,
207
- buf->frame_pool + buf->frame_pool_count);
224
+ buf->frame_pool + fp_count);
208
225
  }
209
226
  }
210
227
  /* Mark label_sets array */
@@ -217,7 +234,7 @@ rperf_profiler_mark(void *ptr)
217
234
  * If we see an old count, both old and new keys arrays have valid
218
235
  * data (old keys are kept alive in old_keys[]). */
219
236
  {
220
- size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
237
+ size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
221
238
  VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
222
239
  if (ft_keys && ft_count > 0) {
223
240
  rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
@@ -225,12 +242,40 @@ rperf_profiler_mark(void *ptr)
225
242
  }
226
243
  }
227
244
 
245
+ static size_t
246
+ rperf_profiler_memsize(const void *ptr)
247
+ {
248
+ const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
249
+ size_t size = sizeof(rperf_profiler_t);
250
+ int i;
251
+
252
+ /* Double-buffered sample storage */
253
+ for (i = 0; i < 2; i++) {
254
+ const rperf_sample_buffer_t *buf = &prof->buffers[i];
255
+ size += buf->sample_capacity * sizeof(rperf_sample_t);
256
+ size += buf->frame_pool_capacity * sizeof(VALUE);
257
+ }
258
+
259
+ /* Frame table */
260
+ size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
261
+ size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
262
+ /* old_keys entries are previous keys arrays; exact sizes unknown,
263
+ * only the pointer array itself is accounted for. */
264
+ size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
265
+
266
+ /* Aggregation table */
267
+ size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
268
+ size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
269
+
270
+ return size;
271
+ }
272
+
228
273
  static const rb_data_type_t rperf_profiler_type = {
229
274
  .wrap_struct_name = "rperf_profiler",
230
275
  .function = {
231
276
  .dmark = rperf_profiler_mark,
232
277
  .dfree = NULL,
233
- .dsize = NULL,
278
+ .dsize = rperf_profiler_memsize,
234
279
  },
235
280
  };
236
281
 
@@ -259,7 +304,7 @@ rperf_wall_time_ns(void)
259
304
  static int64_t
260
305
  rperf_current_time_ns(rperf_profiler_t *prof)
261
306
  {
262
- if (prof->mode == 0) {
307
+ if (prof->mode == RPERF_MODE_CPU) {
263
308
  return rperf_cpu_time_ns();
264
309
  } else {
265
310
  return rperf_wall_time_ns();
@@ -300,7 +345,7 @@ static int
300
345
  rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
301
346
  {
302
347
  if (buf->sample_count >= buf->sample_capacity) {
303
- if (buf->sample_capacity > SIZE_MAX / 2) return -1;
348
+ if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
304
349
  size_t new_cap = buf->sample_capacity * 2;
305
350
  rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
306
351
  buf->samples,
@@ -319,7 +364,7 @@ static int
319
364
  rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
320
365
  {
321
366
  while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
322
- if (buf->frame_pool_capacity > SIZE_MAX / 2) return -1;
367
+ if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
323
368
  size_t new_cap = buf->frame_pool_capacity * 2;
324
369
  VALUE *new_pool = (VALUE *)realloc(
325
370
  buf->frame_pool,
@@ -438,7 +483,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
438
483
  keys[frame_id] = fval;
439
484
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
440
485
  * so GC dmark never reads uninitialized keys[count-1]. */
441
- __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
486
+ atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
442
487
  ft->buckets[idx] = frame_id;
443
488
 
444
489
  /* Rehash if load factor > 0.7 */
@@ -494,7 +539,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
494
539
  static void
495
540
  rperf_agg_table_rehash(rperf_agg_table_t *at)
496
541
  {
497
- if (at->bucket_capacity > SIZE_MAX / 2) return;
542
+ if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
498
543
  size_t new_cap = at->bucket_capacity * 2;
499
544
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
500
545
  if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -519,7 +564,7 @@ static int
519
564
  rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
520
565
  {
521
566
  while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
522
- if (at->stack_pool_capacity > SIZE_MAX / 2) return -1;
567
+ if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
523
568
  size_t new_cap = at->stack_pool_capacity * 2;
524
569
  uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
525
570
  new_cap * sizeof(uint32_t));
@@ -530,8 +575,9 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
530
575
  return 0;
531
576
  }
532
577
 
533
- /* Insert or merge a stack into the aggregation table */
534
- static void
578
+ /* Insert or merge a stack into the aggregation table.
579
+ * Returns 0 on success, -1 on failure (table full or allocation failure). */
580
+ static int
535
581
  rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
536
582
  int depth, int thread_seq, int label_set_id,
537
583
  enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
@@ -548,14 +594,14 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
548
594
  depth * sizeof(uint32_t)) == 0) {
549
595
  /* Match — merge weight */
550
596
  e->weight += weight;
551
- return;
597
+ return 0;
552
598
  }
553
599
  idx = (idx + 1) % at->bucket_capacity;
554
- if (++probes >= at->bucket_capacity) return; /* table full, drop sample */
600
+ if (++probes >= at->bucket_capacity) return -1; /* table full */
555
601
  }
556
602
 
557
603
  /* New entry — append frame_ids to stack_pool */
558
- if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
604
+ if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;
559
605
 
560
606
  rperf_agg_entry_t *e = &at->buckets[idx];
561
607
  e->frame_start = (uint32_t)at->stack_pool_count;
@@ -576,6 +622,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
576
622
  if (at->count * 10 > at->bucket_capacity * 7) {
577
623
  rperf_agg_table_rehash(at);
578
624
  }
625
+ return 0;
579
626
  }
580
627
 
581
628
  /* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
@@ -598,25 +645,38 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
598
645
  /* Convert VALUE frames to frame_ids */
599
646
  int overflow = 0;
600
647
  for (j = 0; j < s->depth; j++) {
648
+ if (s->frame_start + j >= buf->frame_pool_count) {
649
+ /* Defensive: sample points past the pool — truncate the
650
+ * sample so we never hash/insert uninitialized temp_ids */
651
+ s->depth = j;
652
+ break;
653
+ }
601
654
  VALUE fval = buf->frame_pool[s->frame_start + j];
602
655
  uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
603
656
  if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
604
657
  temp_ids[j] = fid;
605
658
  }
606
- if (overflow) break; /* frame_table full, stop aggregating this buffer */
659
+ if (overflow) {
660
+ /* frame_table full — count remaining samples as dropped */
661
+ prof->stats.dropped_aggregation += buf->sample_count - i;
662
+ break;
663
+ }
664
+ if (s->depth <= 0) continue;
607
665
 
608
666
  hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
609
667
 
610
- rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
668
+ if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
611
669
  s->thread_seq, s->label_set_id, s->vm_state,
612
- s->weight, hash);
670
+ s->weight, hash) < 0) {
671
+ prof->stats.dropped_aggregation++;
672
+ }
613
673
  }
614
674
 
615
675
  /* Reset buffer for reuse.
616
676
  * Release fence: ensure all frame_table inserts are visible (to GC dmark)
617
677
  * before frame_pool_count is cleared, so dmark always has at least one
618
678
  * source (frame_table or frame_pool) covering each VALUE. */
619
- __atomic_thread_fence(__ATOMIC_RELEASE);
679
+ atomic_thread_fence(memory_order_release);
620
680
  buf->sample_count = 0;
621
681
  buf->frame_pool_count = 0;
622
682
  }
@@ -693,8 +753,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
693
753
  {
694
754
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
695
755
  if (!td) return NULL;
696
- td->prev_time_ns = rperf_current_time_ns(prof);
697
- td->prev_wall_ns = rperf_wall_time_ns();
756
+ int64_t t = rperf_current_time_ns(prof);
757
+ if (t < 0) { free(td); return NULL; }
758
+ td->prev_time_ns = t;
698
759
  td->thread_seq = ++prof->next_thread_seq;
699
760
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
700
761
  return td;
@@ -706,7 +767,8 @@ static void
706
767
  rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
707
768
  {
708
769
  /* Has GVL — safe to call Ruby APIs */
709
- int64_t wall_now = rperf_wall_time_ns();
770
+ /* suspended_at_ns is only consumed by RESUMED in wall mode */
771
+ int64_t wall_now = (prof->mode == RPERF_MODE_WALL) ? rperf_wall_time_ns() : 0;
710
772
 
711
773
  int is_first = 0;
712
774
 
@@ -719,25 +781,27 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
719
781
  int64_t time_now = rperf_current_time_ns(prof);
720
782
  if (time_now < 0) return;
721
783
 
722
- /* Capture backtrace into active buffer's frame_pool */
723
- rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
724
- if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
725
- size_t frame_start = buf->frame_pool_count;
726
- int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
727
- &buf->frame_pool[frame_start], NULL);
728
- if (depth <= 0) return;
729
- buf->frame_pool_count += depth;
730
-
731
- /* Record normal sample (skip if first time — no prev_time, or if paused) */
784
+ /* Record normal sample (skip if first time — no prev_time, or if paused).
785
+ * The backtrace is captured only when a sample is actually recorded:
786
+ * committing frames to the pool while paused would grow it without bound,
787
+ * because no aggregation runs until samples accumulate. */
732
788
  if (!is_first && !RPERF_PAUSED(prof)) {
733
- int64_t weight = time_now - td->prev_time_ns;
734
- rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
789
+ rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
790
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) >= 0) {
791
+ size_t frame_start = buf->frame_pool_count;
792
+ int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
793
+ &buf->frame_pool[frame_start], NULL);
794
+ if (depth > 0) {
795
+ buf->frame_pool_count += depth;
796
+ int64_t weight = time_now - td->prev_time_ns;
797
+ rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
798
+ }
799
+ }
735
800
  }
736
801
 
737
802
  /* Save timestamp for READY/RESUMED */
738
803
  td->suspended_at_ns = wall_now;
739
804
  td->prev_time_ns = time_now;
740
- td->prev_wall_ns = wall_now;
741
805
  }
742
806
 
743
807
  static void
@@ -768,7 +832,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
768
832
  * Both samples are written directly into the same buffer before calling
769
833
  * rperf_try_swap, so that a swap triggered by the first sample cannot
770
834
  * move the second into a different buffer with a stale frame_start. */
771
- if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
835
+ if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
772
836
  rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
773
837
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
774
838
  size_t frame_start = buf->frame_pool_count;
@@ -798,7 +862,6 @@ skip_gvl:
798
862
  /* Reset prev times to current — next timer sample measures from resume */
799
863
  int64_t time_now = rperf_current_time_ns(prof);
800
864
  if (time_now >= 0) td->prev_time_ns = time_now;
801
- td->prev_wall_ns = wall_now;
802
865
 
803
866
  /* Clear suspended state */
804
867
  td->suspended_at_ns = 0;
@@ -818,19 +881,26 @@ static void
818
881
  rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_data_t *data, void *user_data)
819
882
  {
820
883
  rperf_profiler_t *prof = (rperf_profiler_t *)user_data;
821
- if (!prof->running) return;
822
884
 
823
885
  VALUE thread = data->thread;
824
886
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
825
887
 
888
+ /* EXITED frees the thread's data even when running == 0: a thread can
889
+ * exit between stop setting running = 0 and the hook removal, and its td
890
+ * would otherwise leak (stop's Thread.list cleanup no longer sees it). */
891
+ if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED) {
892
+ rperf_handle_exited(prof, thread, td);
893
+ return;
894
+ }
895
+
896
+ if (!prof->running) return;
897
+
826
898
  if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
827
899
  rperf_handle_suspended(prof, thread, td);
828
900
  else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
829
901
  rperf_handle_ready(td);
830
902
  else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
831
903
  rperf_handle_resumed(prof, thread, td);
832
- else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
833
- rperf_handle_exited(prof, thread, td);
834
904
  }
835
905
 
836
906
  /* ---- GC event hook ---- */
@@ -895,17 +965,15 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
895
965
 
896
966
  /* ---- Sampling callback (postponed job) — current thread only ---- */
897
967
 
898
- static void
899
- rperf_sample_job(void *arg)
968
+ /* Core sampling logic, parameterized by mode constant.
969
+ * Called from rperf_sample_cpu/rperf_sample_wall so the compiler
970
+ * can inline and eliminate mode branches at compile time. */
971
+ static inline void
972
+ rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
900
973
  {
901
- rperf_profiler_t *prof = (rperf_profiler_t *)arg;
902
-
903
- if (!prof->running) return;
904
- if (RPERF_PAUSED(prof)) return;
905
-
906
- /* Measure sampling overhead */
974
+ /* Measure sampling overhead (wall time — runs under GVL, no I/O) */
907
975
  struct timespec ts_start, ts_end;
908
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_start);
976
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
909
977
 
910
978
  VALUE thread = rb_thread_current();
911
979
 
@@ -917,12 +985,11 @@ rperf_sample_job(void *arg)
917
985
  return; /* Skip first sample for this thread */
918
986
  }
919
987
 
920
- int64_t time_now = rperf_current_time_ns(prof);
988
+ int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
921
989
  if (time_now < 0) return;
922
990
 
923
991
  int64_t weight = time_now - td->prev_time_ns;
924
992
  td->prev_time_ns = time_now;
925
- td->prev_wall_ns = rperf_wall_time_ns();
926
993
 
927
994
  if (weight <= 0) return;
928
995
 
@@ -938,13 +1005,33 @@ rperf_sample_job(void *arg)
938
1005
 
939
1006
  rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
940
1007
 
941
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
1008
+ clock_gettime(CLOCK_MONOTONIC, &ts_end);
942
1009
  prof->stats.sampling_count++;
943
1010
  prof->stats.sampling_total_ns +=
944
1011
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
945
1012
  (ts_end.tv_nsec - ts_start.tv_nsec);
946
1013
  }
947
1014
 
1015
+ static void
1016
+ rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
1017
+
1018
+ static void
1019
+ rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
1020
+
1021
+ static void
1022
+ rperf_sample_job(void *arg)
1023
+ {
1024
+ rperf_profiler_t *prof = (rperf_profiler_t *)arg;
1025
+
1026
+ if (!prof->running) return;
1027
+ if (RPERF_PAUSED(prof)) return;
1028
+
1029
+ if (prof->mode == RPERF_MODE_CPU)
1030
+ rperf_sample_cpu(prof);
1031
+ else
1032
+ rperf_sample_wall(prof);
1033
+ }
1034
+
948
1035
  /* ---- Worker thread: timer + aggregation ---- */
949
1036
 
950
1037
  #if RPERF_USE_TIMER_SIGNAL
@@ -990,7 +1077,7 @@ rperf_worker_nanosleep_func(void *arg)
990
1077
  struct timespec deadline;
991
1078
  long interval_ns = 1000000000L / prof->frequency;
992
1079
 
993
- clock_gettime(CLOCK_REALTIME, &deadline);
1080
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
994
1081
  deadline.tv_nsec += interval_ns;
995
1082
  if (deadline.tv_nsec >= 1000000000L) {
996
1083
  deadline.tv_sec++;
@@ -1005,7 +1092,7 @@ rperf_worker_nanosleep_func(void *arg)
1005
1092
  CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
1006
1093
  prof->worker_paused = 0;
1007
1094
  /* Reset deadline on wake to avoid burst of catch-up triggers */
1008
- clock_gettime(CLOCK_REALTIME, &deadline);
1095
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
1009
1096
  deadline.tv_nsec += interval_ns;
1010
1097
  if (deadline.tv_nsec >= 1000000000L) {
1011
1098
  deadline.tv_sec++;
@@ -1076,13 +1163,15 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
1076
1163
  result = rb_hash_new();
1077
1164
 
1078
1165
  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1079
- ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
1166
+ ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
1080
1167
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
1081
1168
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
1082
1169
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
1083
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
1170
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LL2NUM(prof->stats.sampling_total_ns));
1084
1171
  if (prof->stats.dropped_samples > 0)
1085
1172
  rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
1173
+ if (prof->stats.dropped_aggregation > 0)
1174
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
1086
1175
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
1087
1176
  rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1088
1177
  SIZET2NUM(prof->frame_table.count));
@@ -1097,8 +1186,8 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
1097
1186
  + (int64_t)prof->start_realtime.tv_nsec;
1098
1187
  duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
1099
1188
  + ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
1100
- rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1101
- rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1189
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LL2NUM(start_ns));
1190
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LL2NUM(duration_ns));
1102
1191
  }
1103
1192
 
1104
1193
  {
@@ -1124,7 +1213,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
1124
1213
 
1125
1214
  VALUE sample = rb_ary_new_capa(5);
1126
1215
  rb_ary_push(sample, frames);
1127
- rb_ary_push(sample, LONG2NUM(e->weight));
1216
+ rb_ary_push(sample, LL2NUM(e->weight));
1128
1217
  rb_ary_push(sample, INT2NUM(e->thread_seq));
1129
1218
  rb_ary_push(sample, INT2NUM(e->label_set_id));
1130
1219
  rb_ary_push(sample, INT2NUM(e->vm_state));
@@ -1154,7 +1243,7 @@ static VALUE
1154
1243
  rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
1155
1244
  {
1156
1245
  int frequency = NUM2INT(vfreq);
1157
- int mode = NUM2INT(vmode);
1246
+ enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
1158
1247
  int aggregate = RTEST(vagg) ? 1 : 0;
1159
1248
  #if RPERF_USE_TIMER_SIGNAL
1160
1249
  int sig = NUM2INT(vsig);
@@ -1173,13 +1262,26 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
1173
1262
  g_profiler.stats.sampling_total_ns = 0;
1174
1263
  g_profiler.stats.trigger_count = 0;
1175
1264
  g_profiler.stats.dropped_samples = 0;
1265
+ g_profiler.stats.dropped_aggregation = 0;
1176
1266
  atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
1177
1267
  atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1178
1268
  g_profiler.label_sets = Qnil;
1179
1269
 
1180
1270
  /* Initialize worker mutex/cond */
1181
1271
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
1272
+ #ifdef __linux__
1273
+ {
1274
+ /* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
1275
+ * system clock adjustments (NTP etc.) don't affect timer intervals. */
1276
+ pthread_condattr_t cond_attr;
1277
+ CHECKED(pthread_condattr_init(&cond_attr));
1278
+ CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
1279
+ CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
1280
+ CHECKED(pthread_condattr_destroy(&cond_attr));
1281
+ }
1282
+ #else
1182
1283
  CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
1284
+ #endif
1183
1285
 
1184
1286
  /* Initialize sample buffer(s) */
1185
1287
  if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
@@ -1236,6 +1338,14 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
1236
1338
  /* Pre-initialize current thread's time so the first sample is not skipped */
1237
1339
  {
1238
1340
  VALUE cur_thread = rb_thread_current();
1341
+ /* A stale td can survive a fork (the atfork child handler does not
1342
+ * free the forking thread's data) — free it before creating a fresh
1343
+ * one, or it would leak on every fork + restart cycle. */
1344
+ rperf_thread_data_t *stale = (rperf_thread_data_t *)rb_internal_thread_specific_get(cur_thread, g_profiler.ts_key);
1345
+ if (stale) {
1346
+ free(stale);
1347
+ rb_internal_thread_specific_set(cur_thread, g_profiler.ts_key, NULL);
1348
+ }
1239
1349
  rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
1240
1350
  if (!td) {
1241
1351
  rb_remove_event_hook(rperf_gc_event_hook);
@@ -1300,24 +1410,34 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
1300
1410
  if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
1301
1411
  g_profiler.running = 0;
1302
1412
  sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1413
+ /* Signal under the mutex — see rb_rperf_stop for the rationale */
1414
+ CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
1303
1415
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1416
+ CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
1304
1417
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1305
1418
  goto timer_fail;
1306
1419
  }
1307
1420
 
1308
- its.it_value.tv_sec = 0;
1309
1421
  if (RPERF_PAUSED(&g_profiler)) {
1310
1422
  /* defer mode: create timer but don't arm it */
1423
+ its.it_value.tv_sec = 0;
1311
1424
  its.it_value.tv_nsec = 0;
1312
1425
  } else {
1313
- its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
1426
+ /* Split into sec/nsec: frequency 1 gives a 1s interval, and
1427
+ * tv_nsec must be < 1e9 or timer_settime fails with EINVAL */
1428
+ long interval_ns = 1000000000L / g_profiler.frequency;
1429
+ its.it_value.tv_sec = interval_ns / 1000000000L;
1430
+ its.it_value.tv_nsec = interval_ns % 1000000000L;
1314
1431
  }
1315
1432
  its.it_interval = its.it_value;
1316
1433
  if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
1317
1434
  timer_delete(g_profiler.timer_id);
1318
1435
  g_profiler.running = 0;
1319
1436
  sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
1437
+ /* Signal under the mutex — see rb_rperf_stop for the rationale */
1438
+ CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
1320
1439
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1440
+ CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
1321
1441
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1322
1442
  goto timer_fail;
1323
1443
  }
@@ -1378,10 +1498,15 @@ rb_rperf_stop(VALUE self)
1378
1498
  }
1379
1499
  #endif
1380
1500
 
1381
- /* Wake and join worker thread.
1501
+ /* Wake and join worker thread. Signal while holding worker_mutex:
1502
+ * the worker re-checks its predicate (running) with the mutex held, so
1503
+ * signaling under the mutex guarantees it either sees running == 0 or is
1504
+ * already inside cond_wait when the signal fires — no lost wakeup.
1382
1505
  * Any pending timer signals are still handled by rperf_signal_handler
1383
1506
  * (just increments trigger_count + calls rb_postponed_job_trigger). */
1507
+ CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
1384
1508
  CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1509
+ CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
1385
1510
  CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1386
1511
  CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1387
1512
  CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
@@ -1436,13 +1561,15 @@ rb_rperf_stop(VALUE self)
1436
1561
 
1437
1562
  result = rb_hash_new();
1438
1563
  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1439
- ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
1564
+ ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
1440
1565
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1441
1566
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
1442
1567
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
1443
- rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
1568
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LL2NUM(g_profiler.stats.sampling_total_ns));
1444
1569
  if (g_profiler.stats.dropped_samples > 0)
1445
1570
  rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
1571
+ if (g_profiler.stats.dropped_aggregation > 0)
1572
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
1446
1573
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
1447
1574
  {
1448
1575
  struct timespec stop_monotonic;
@@ -1452,8 +1579,8 @@ rb_rperf_stop(VALUE self)
1452
1579
  + (int64_t)g_profiler.start_realtime.tv_nsec;
1453
1580
  duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
1454
1581
  + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
1455
- rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
1456
- rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
1582
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LL2NUM(start_ns));
1583
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LL2NUM(duration_ns));
1457
1584
  }
1458
1585
 
1459
1586
  samples_ary = rb_ary_new_capa((long)buf->sample_count);
@@ -1469,7 +1596,7 @@ rb_rperf_stop(VALUE self)
1469
1596
 
1470
1597
  VALUE sample = rb_ary_new_capa(5);
1471
1598
  rb_ary_push(sample, frames);
1472
- rb_ary_push(sample, LONG2NUM(s->weight));
1599
+ rb_ary_push(sample, LL2NUM(s->weight));
1473
1600
  rb_ary_push(sample, INT2NUM(s->thread_seq));
1474
1601
  rb_ary_push(sample, INT2NUM(s->label_set_id));
1475
1602
  rb_ary_push(sample, INT2NUM(s->vm_state));
@@ -1508,6 +1635,7 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
1508
1635
  prof->stats.sampling_count = 0;
1509
1636
  prof->stats.sampling_total_ns = 0;
1510
1637
  prof->stats.dropped_samples = 0;
1638
+ prof->stats.dropped_aggregation = 0;
1511
1639
 
1512
1640
  /* Reset start timestamps so next snapshot's duration_ns covers
1513
1641
  * only the period since this clear. */
@@ -1604,10 +1732,13 @@ rperf_arm_timer(rperf_profiler_t *prof)
1604
1732
  #if RPERF_USE_TIMER_SIGNAL
1605
1733
  if (prof->timer_signal > 0) {
1606
1734
  struct itimerspec its;
1607
- its.it_value.tv_sec = 0;
1608
- its.it_value.tv_nsec = 1000000000L / prof->frequency;
1735
+ long interval_ns = 1000000000L / prof->frequency;
1736
+ its.it_value.tv_sec = interval_ns / 1000000000L;
1737
+ its.it_value.tv_nsec = interval_ns % 1000000000L;
1609
1738
  its.it_interval = its.it_value;
1610
- timer_settime(prof->timer_id, 0, &its, NULL);
1739
+ if (timer_settime(prof->timer_id, 0, &its, NULL) != 0) {
1740
+ fprintf(stderr, "rperf: timer_settime (arm) failed: %s\n", strerror(errno));
1741
+ }
1611
1742
  return;
1612
1743
  }
1613
1744
  #endif
@@ -1625,7 +1756,9 @@ rperf_disarm_timer(rperf_profiler_t *prof)
1625
1756
  if (prof->timer_signal > 0) {
1626
1757
  struct itimerspec its;
1627
1758
  memset(&its, 0, sizeof(its));
1628
- timer_settime(prof->timer_id, 0, &its, NULL);
1759
+ if (timer_settime(prof->timer_id, 0, &its, NULL) != 0) {
1760
+ fprintf(stderr, "rperf: timer_settime (disarm) failed: %s\n", strerror(errno));
1761
+ }
1629
1762
  return;
1630
1763
  }
1631
1764
  #endif
@@ -1652,7 +1785,6 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
1652
1785
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
1653
1786
  if (td) {
1654
1787
  td->prev_time_ns = rperf_current_time_ns(prof);
1655
- td->prev_wall_ns = rperf_wall_time_ns();
1656
1788
  }
1657
1789
  }
1658
1790
  }
@@ -1692,6 +1824,12 @@ rb_rperf_running_p(VALUE self)
1692
1824
  return g_profiler.running ? Qtrue : Qfalse;
1693
1825
  }
1694
1826
 
1827
+ static VALUE
1828
+ rb_rperf_profiler_wrapper(VALUE self)
1829
+ {
1830
+ return g_profiler_wrapper;
1831
+ }
1832
+
1695
1833
  /* ---- Fork safety ---- */
1696
1834
 
1697
1835
  static void
@@ -1771,6 +1909,7 @@ Init_rperf(void)
1771
1909
  rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
1772
1910
  rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
1773
1911
  rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
1912
+ rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);
1774
1913
 
1775
1914
  memset(&g_profiler, 0, sizeof(g_profiler));
1776
1915
  g_profiler.label_sets = Qnil;