rperf 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/rperf/rperf.c CHANGED
@@ -25,8 +25,10 @@
25
25
  #ifdef __linux__
26
26
  #define RPERF_USE_TIMER_SIGNAL 1
27
27
  #define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
28
+ #define RPERF_COND_CLOCK CLOCK_MONOTONIC
28
29
  #else
29
30
  #define RPERF_USE_TIMER_SIGNAL 0
31
+ #define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
30
32
  #endif
31
33
 
32
34
  #define RPERF_MAX_STACK_DEPTH 512
@@ -51,6 +53,11 @@ enum rperf_vm_state {
51
53
  /* ---- Data structures ---- */
52
54
 
53
55
 
56
/* Profiling mode: selects which clock rperf_current_time_ns() reads.
 * The numeric values are part of the interface with the caller of
 * rb_rperf_start(), which casts an integer argument to this enum. */
enum rperf_mode {
    RPERF_MODE_CPU  = 0, /* timestamps come from rperf_cpu_time_ns() */
    RPERF_MODE_WALL = 1, /* timestamps come from rperf_wall_time_ns() */
};
60
+
54
61
  enum rperf_gc_phase {
55
62
  RPERF_GC_NONE = 0,
56
63
  RPERF_GC_MARKING = 1,
@@ -83,7 +90,7 @@ typedef struct rperf_sample_buffer {
83
90
 
84
91
  typedef struct rperf_frame_table {
85
92
  _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
86
- size_t count; /* = next frame_id */
93
+ _Atomic(size_t) count; /* = next frame_id */
87
94
  size_t capacity;
88
95
  uint32_t *buckets; /* open addressing: stores index into keys[] */
89
96
  size_t bucket_capacity;
@@ -119,7 +126,6 @@ typedef struct rperf_agg_table {
119
126
 
120
127
  typedef struct rperf_thread_data {
121
128
  int64_t prev_time_ns;
122
- int64_t prev_wall_ns;
123
129
  /* GVL event tracking */
124
130
  int64_t suspended_at_ns; /* wall time at SUSPENDED */
125
131
  int64_t ready_at_ns; /* wall time at READY */
@@ -143,11 +149,12 @@ typedef struct rperf_stats {
143
149
  size_t sampling_count;
144
150
  int64_t sampling_total_ns;
145
151
  size_t dropped_samples; /* samples lost due to allocation failure */
152
+ size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
146
153
  } rperf_stats_t;
147
154
 
148
155
  typedef struct rperf_profiler {
149
156
  int frequency;
150
- int mode; /* 0 = cpu, 1 = wall */
157
+ enum rperf_mode mode;
151
158
  _Atomic int running;
152
159
  pthread_t worker_thread; /* combined timer + aggregation */
153
160
  #if RPERF_USE_TIMER_SIGNAL
@@ -217,7 +224,7 @@ rperf_profiler_mark(void *ptr)
217
224
  * If we see an old count, both old and new keys arrays have valid
218
225
  * data (old keys are kept alive in old_keys[]). */
219
226
  {
220
- size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
227
+ size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
221
228
  VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
222
229
  if (ft_keys && ft_count > 0) {
223
230
  rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
@@ -225,12 +232,42 @@ rperf_profiler_mark(void *ptr)
225
232
  }
226
233
  }
227
234
 
235
+ static size_t
236
+ rperf_profiler_memsize(const void *ptr)
237
+ {
238
+ const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
239
+ size_t size = sizeof(rperf_profiler_t);
240
+ int i;
241
+
242
+ /* Double-buffered sample storage */
243
+ for (i = 0; i < 2; i++) {
244
+ const rperf_sample_buffer_t *buf = &prof->buffers[i];
245
+ size += buf->sample_capacity * sizeof(rperf_sample_t);
246
+ size += buf->frame_pool_capacity * sizeof(VALUE);
247
+ }
248
+
249
+ /* Frame table */
250
+ size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
251
+ size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
252
+ for (i = 0; i < prof->frame_table.old_keys_count; i++) {
253
+ /* old_keys entries are previous keys arrays; exact sizes unknown,
254
+ * but the pointer array itself is accounted for below. */
255
+ }
256
+ size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
257
+
258
+ /* Aggregation table */
259
+ size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
260
+ size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
261
+
262
+ return size;
263
+ }
264
+
228
265
  static const rb_data_type_t rperf_profiler_type = {
229
266
  .wrap_struct_name = "rperf_profiler",
230
267
  .function = {
231
268
  .dmark = rperf_profiler_mark,
232
269
  .dfree = NULL,
233
- .dsize = NULL,
270
+ .dsize = rperf_profiler_memsize,
234
271
  },
235
272
  };
236
273
 
@@ -259,7 +296,7 @@ rperf_wall_time_ns(void)
259
296
  static int64_t
260
297
  rperf_current_time_ns(rperf_profiler_t *prof)
261
298
  {
262
- if (prof->mode == 0) {
299
+ if (prof->mode == RPERF_MODE_CPU) {
263
300
  return rperf_cpu_time_ns();
264
301
  } else {
265
302
  return rperf_wall_time_ns();
@@ -300,7 +337,7 @@ static int
300
337
  rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
301
338
  {
302
339
  if (buf->sample_count >= buf->sample_capacity) {
303
- if (buf->sample_capacity > SIZE_MAX / 2) return -1;
340
+ if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
304
341
  size_t new_cap = buf->sample_capacity * 2;
305
342
  rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
306
343
  buf->samples,
@@ -319,7 +356,7 @@ static int
319
356
  rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
320
357
  {
321
358
  while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
322
- if (buf->frame_pool_capacity > SIZE_MAX / 2) return -1;
359
+ if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
323
360
  size_t new_cap = buf->frame_pool_capacity * 2;
324
361
  VALUE *new_pool = (VALUE *)realloc(
325
362
  buf->frame_pool,
@@ -438,7 +475,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
438
475
  keys[frame_id] = fval;
439
476
  /* Store fence: ensure keys[frame_id] is visible before count is incremented,
440
477
  * so GC dmark never reads uninitialized keys[count-1]. */
441
- __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
478
+ atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
442
479
  ft->buckets[idx] = frame_id;
443
480
 
444
481
  /* Rehash if load factor > 0.7 */
@@ -494,7 +531,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
494
531
  static void
495
532
  rperf_agg_table_rehash(rperf_agg_table_t *at)
496
533
  {
497
- if (at->bucket_capacity > SIZE_MAX / 2) return;
534
+ if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
498
535
  size_t new_cap = at->bucket_capacity * 2;
499
536
  rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
500
537
  if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -519,7 +556,7 @@ static int
519
556
  rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
520
557
  {
521
558
  while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
522
- if (at->stack_pool_capacity > SIZE_MAX / 2) return -1;
559
+ if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
523
560
  size_t new_cap = at->stack_pool_capacity * 2;
524
561
  uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
525
562
  new_cap * sizeof(uint32_t));
@@ -530,8 +567,9 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
530
567
  return 0;
531
568
  }
532
569
 
533
- /* Insert or merge a stack into the aggregation table */
534
- static void
570
+ /* Insert or merge a stack into the aggregation table.
571
+ * Returns 0 on success, -1 on failure (table full or allocation failure). */
572
+ static int
535
573
  rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
536
574
  int depth, int thread_seq, int label_set_id,
537
575
  enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
@@ -548,14 +586,14 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
548
586
  depth * sizeof(uint32_t)) == 0) {
549
587
  /* Match — merge weight */
550
588
  e->weight += weight;
551
- return;
589
+ return 0;
552
590
  }
553
591
  idx = (idx + 1) % at->bucket_capacity;
554
- if (++probes >= at->bucket_capacity) return; /* table full, drop sample */
592
+ if (++probes >= at->bucket_capacity) return -1; /* table full */
555
593
  }
556
594
 
557
595
  /* New entry — append frame_ids to stack_pool */
558
- if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
596
+ if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;
559
597
 
560
598
  rperf_agg_entry_t *e = &at->buckets[idx];
561
599
  e->frame_start = (uint32_t)at->stack_pool_count;
@@ -576,6 +614,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
576
614
  if (at->count * 10 > at->bucket_capacity * 7) {
577
615
  rperf_agg_table_rehash(at);
578
616
  }
617
+ return 0;
579
618
  }
580
619
 
581
620
  /* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
@@ -598,25 +637,32 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
598
637
  /* Convert VALUE frames to frame_ids */
599
638
  int overflow = 0;
600
639
  for (j = 0; j < s->depth; j++) {
640
+ if (s->frame_start + j >= buf->frame_pool_count) break;
601
641
  VALUE fval = buf->frame_pool[s->frame_start + j];
602
642
  uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
603
643
  if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
604
644
  temp_ids[j] = fid;
605
645
  }
606
- if (overflow) break; /* frame_table full, stop aggregating this buffer */
646
+ if (overflow) {
647
+ /* frame_table full — count remaining samples as dropped */
648
+ prof->stats.dropped_aggregation += buf->sample_count - i;
649
+ break;
650
+ }
607
651
 
608
652
  hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
609
653
 
610
- rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
654
+ if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
611
655
  s->thread_seq, s->label_set_id, s->vm_state,
612
- s->weight, hash);
656
+ s->weight, hash) < 0) {
657
+ prof->stats.dropped_aggregation++;
658
+ }
613
659
  }
614
660
 
615
661
  /* Reset buffer for reuse.
616
662
  * Release fence: ensure all frame_table inserts are visible (to GC dmark)
617
663
  * before frame_pool_count is cleared, so dmark always has at least one
618
664
  * source (frame_table or frame_pool) covering each VALUE. */
619
- __atomic_thread_fence(__ATOMIC_RELEASE);
665
+ atomic_thread_fence(memory_order_release);
620
666
  buf->sample_count = 0;
621
667
  buf->frame_pool_count = 0;
622
668
  }
@@ -693,8 +739,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
693
739
  {
694
740
  rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
695
741
  if (!td) return NULL;
696
- td->prev_time_ns = rperf_current_time_ns(prof);
697
- td->prev_wall_ns = rperf_wall_time_ns();
742
+ int64_t t = rperf_current_time_ns(prof);
743
+ if (t < 0) { free(td); return NULL; }
744
+ td->prev_time_ns = t;
698
745
  td->thread_seq = ++prof->next_thread_seq;
699
746
  rb_internal_thread_specific_set(thread, prof->ts_key, td);
700
747
  return td;
@@ -737,7 +784,6 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
737
784
  /* Save timestamp for READY/RESUMED */
738
785
  td->suspended_at_ns = wall_now;
739
786
  td->prev_time_ns = time_now;
740
- td->prev_wall_ns = wall_now;
741
787
  }
742
788
 
743
789
  static void
@@ -768,7 +814,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
768
814
  * Both samples are written directly into the same buffer before calling
769
815
  * rperf_try_swap, so that a swap triggered by the first sample cannot
770
816
  * move the second into a different buffer with a stale frame_start. */
771
- if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
817
+ if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
772
818
  rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
773
819
  if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
774
820
  size_t frame_start = buf->frame_pool_count;
@@ -798,7 +844,6 @@ skip_gvl:
798
844
  /* Reset prev times to current — next timer sample measures from resume */
799
845
  int64_t time_now = rperf_current_time_ns(prof);
800
846
  if (time_now >= 0) td->prev_time_ns = time_now;
801
- td->prev_wall_ns = wall_now;
802
847
 
803
848
  /* Clear suspended state */
804
849
  td->suspended_at_ns = 0;
@@ -895,17 +940,15 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
895
940
 
896
941
  /* ---- Sampling callback (postponed job) — current thread only ---- */
897
942
 
898
- static void
899
- rperf_sample_job(void *arg)
943
+ /* Core sampling logic, parameterized by mode constant.
944
+ * Called from rperf_sample_cpu/rperf_sample_wall so the compiler
945
+ * can inline and eliminate mode branches at compile time. */
946
+ static inline void
947
+ rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
900
948
  {
901
- rperf_profiler_t *prof = (rperf_profiler_t *)arg;
902
-
903
- if (!prof->running) return;
904
- if (RPERF_PAUSED(prof)) return;
905
-
906
- /* Measure sampling overhead */
949
+ /* Measure sampling overhead (wall time — runs under GVL, no I/O) */
907
950
  struct timespec ts_start, ts_end;
908
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_start);
951
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
909
952
 
910
953
  VALUE thread = rb_thread_current();
911
954
 
@@ -917,12 +960,11 @@ rperf_sample_job(void *arg)
917
960
  return; /* Skip first sample for this thread */
918
961
  }
919
962
 
920
- int64_t time_now = rperf_current_time_ns(prof);
963
+ int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
921
964
  if (time_now < 0) return;
922
965
 
923
966
  int64_t weight = time_now - td->prev_time_ns;
924
967
  td->prev_time_ns = time_now;
925
- td->prev_wall_ns = rperf_wall_time_ns();
926
968
 
927
969
  if (weight <= 0) return;
928
970
 
@@ -938,13 +980,33 @@ rperf_sample_job(void *arg)
938
980
 
939
981
  rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
940
982
 
941
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
983
+ clock_gettime(CLOCK_MONOTONIC, &ts_end);
942
984
  prof->stats.sampling_count++;
943
985
  prof->stats.sampling_total_ns +=
944
986
  ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
945
987
  (ts_end.tv_nsec - ts_start.tv_nsec);
946
988
  }
947
989
 
990
+ static void
991
+ rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
992
+
993
+ static void
994
+ rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
995
+
996
+ static void
997
+ rperf_sample_job(void *arg)
998
+ {
999
+ rperf_profiler_t *prof = (rperf_profiler_t *)arg;
1000
+
1001
+ if (!prof->running) return;
1002
+ if (RPERF_PAUSED(prof)) return;
1003
+
1004
+ if (prof->mode == RPERF_MODE_CPU)
1005
+ rperf_sample_cpu(prof);
1006
+ else
1007
+ rperf_sample_wall(prof);
1008
+ }
1009
+
948
1010
  /* ---- Worker thread: timer + aggregation ---- */
949
1011
 
950
1012
  #if RPERF_USE_TIMER_SIGNAL
@@ -990,7 +1052,7 @@ rperf_worker_nanosleep_func(void *arg)
990
1052
  struct timespec deadline;
991
1053
  long interval_ns = 1000000000L / prof->frequency;
992
1054
 
993
- clock_gettime(CLOCK_REALTIME, &deadline);
1055
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
994
1056
  deadline.tv_nsec += interval_ns;
995
1057
  if (deadline.tv_nsec >= 1000000000L) {
996
1058
  deadline.tv_sec++;
@@ -1005,7 +1067,7 @@ rperf_worker_nanosleep_func(void *arg)
1005
1067
  CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
1006
1068
  prof->worker_paused = 0;
1007
1069
  /* Reset deadline on wake to avoid burst of catch-up triggers */
1008
- clock_gettime(CLOCK_REALTIME, &deadline);
1070
+ clock_gettime(RPERF_COND_CLOCK, &deadline);
1009
1071
  deadline.tv_nsec += interval_ns;
1010
1072
  if (deadline.tv_nsec >= 1000000000L) {
1011
1073
  deadline.tv_sec++;
@@ -1076,13 +1138,15 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
1076
1138
  result = rb_hash_new();
1077
1139
 
1078
1140
  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1079
- ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
1141
+ ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
1080
1142
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
1081
1143
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
1082
1144
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
1083
1145
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
1084
1146
  if (prof->stats.dropped_samples > 0)
1085
1147
  rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
1148
+ if (prof->stats.dropped_aggregation > 0)
1149
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
1086
1150
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
1087
1151
  rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1088
1152
  SIZET2NUM(prof->frame_table.count));
@@ -1154,7 +1218,7 @@ static VALUE
1154
1218
  rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
1155
1219
  {
1156
1220
  int frequency = NUM2INT(vfreq);
1157
- int mode = NUM2INT(vmode);
1221
+ enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
1158
1222
  int aggregate = RTEST(vagg) ? 1 : 0;
1159
1223
  #if RPERF_USE_TIMER_SIGNAL
1160
1224
  int sig = NUM2INT(vsig);
@@ -1173,13 +1237,26 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
1173
1237
  g_profiler.stats.sampling_total_ns = 0;
1174
1238
  g_profiler.stats.trigger_count = 0;
1175
1239
  g_profiler.stats.dropped_samples = 0;
1240
+ g_profiler.stats.dropped_aggregation = 0;
1176
1241
  atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
1177
1242
  atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
1178
1243
  g_profiler.label_sets = Qnil;
1179
1244
 
1180
1245
  /* Initialize worker mutex/cond */
1181
1246
  CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
1247
+ #ifdef __linux__
1248
+ {
1249
+ /* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
1250
+ * system clock adjustments (NTP etc.) don't affect timer intervals. */
1251
+ pthread_condattr_t cond_attr;
1252
+ CHECKED(pthread_condattr_init(&cond_attr));
1253
+ CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
1254
+ CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
1255
+ CHECKED(pthread_condattr_destroy(&cond_attr));
1256
+ }
1257
+ #else
1182
1258
  CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
1259
+ #endif
1183
1260
 
1184
1261
  /* Initialize sample buffer(s) */
1185
1262
  if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
@@ -1436,13 +1513,15 @@ rb_rperf_stop(VALUE self)
1436
1513
 
1437
1514
  result = rb_hash_new();
1438
1515
  rb_hash_aset(result, ID2SYM(rb_intern("mode")),
1439
- ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
1516
+ ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
1440
1517
  rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
1441
1518
  rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
1442
1519
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
1443
1520
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
1444
1521
  if (g_profiler.stats.dropped_samples > 0)
1445
1522
  rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
1523
+ if (g_profiler.stats.dropped_aggregation > 0)
1524
+ rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
1446
1525
  rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
1447
1526
  {
1448
1527
  struct timespec stop_monotonic;
@@ -1508,6 +1587,7 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
1508
1587
  prof->stats.sampling_count = 0;
1509
1588
  prof->stats.sampling_total_ns = 0;
1510
1589
  prof->stats.dropped_samples = 0;
1590
+ prof->stats.dropped_aggregation = 0;
1511
1591
 
1512
1592
  /* Reset start timestamps so next snapshot's duration_ns covers
1513
1593
  * only the period since this clear. */
@@ -1652,7 +1732,6 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
1652
1732
  rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
1653
1733
  if (td) {
1654
1734
  td->prev_time_ns = rperf_current_time_ns(prof);
1655
- td->prev_wall_ns = rperf_wall_time_ns();
1656
1735
  }
1657
1736
  }
1658
1737
  }
@@ -1692,6 +1771,12 @@ rb_rperf_running_p(VALUE self)
1692
1771
  return g_profiler.running ? Qtrue : Qfalse;
1693
1772
  }
1694
1773
 
1774
+ static VALUE
1775
+ rb_rperf_profiler_wrapper(VALUE self)
1776
+ {
1777
+ return g_profiler_wrapper;
1778
+ }
1779
+
1695
1780
  /* ---- Fork safety ---- */
1696
1781
 
1697
1782
  static void
@@ -1771,6 +1856,7 @@ Init_rperf(void)
1771
1856
  rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
1772
1857
  rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
1773
1858
  rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
1859
+ rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);
1774
1860
 
1775
1861
  memset(&g_profiler, 0, sizeof(g_profiler));
1776
1862
  g_profiler.label_sets = Qnil;
@@ -1,4 +1,5 @@
1
1
  require "rperf"
2
+ require "active_support/concern"
2
3
 
3
4
  module Rperf::ActiveJobMiddleware
4
5
  extend ActiveSupport::Concern
data/lib/rperf/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rperf
2
- VERSION = "0.8.0"
2
+ VERSION = "0.9.0"
3
3
  end
data/lib/rperf/viewer.rb CHANGED
@@ -3,6 +3,11 @@ require "json"
3
3
 
4
4
  # Rack middleware that serves flamegraph visualizations of rperf snapshots.
5
5
  #
6
+ # *Security note*: This middleware exposes profiling data without
7
+ # authentication. It is intended for development and staging environments.
8
+ # In production, place it behind an authenticated reverse proxy or restrict
9
+ # access by IP / VPN.
10
+ #
6
11
  # Usage:
7
12
  # require "rperf/viewer"
8
13
  # use Rperf::Viewer # mount at /rperf (default)
@@ -25,6 +30,7 @@ class Rperf::Viewer
25
30
  attr_reader :max_snapshots, :path
26
31
 
27
32
  def initialize(app, path: "/rperf", max_snapshots: 24)
33
+ raise ArgumentError, "max_snapshots must be a positive integer, got #{max_snapshots.inspect}" unless max_snapshots.is_a?(Integer) && max_snapshots > 0
28
34
  @app = app
29
35
  @path = path.chomp("/")
30
36
  @max_snapshots = max_snapshots
@@ -81,6 +87,63 @@ class Rperf::Viewer
81
87
  end
82
88
  end
83
89
 
90
# Turn aggregated samples and label sets into JSON-friendly structures.
#
# samples    - Array of [frames, weight, thread_seq, label_set_id] tuples,
#              where each frame is a pair whose second element is the label.
#              Frames arrive leaf-first and are flipped to root-first, the
#              order the flamegraph expects.
# label_sets - Array of label sets; Hash entries get their keys stringified,
#              anything else is passed through untouched.
#
# Returns [json_samples, json_label_sets].
def self.samples_to_json(samples, label_sets)
  converted_samples = samples.map do |frames, weight, thread_seq, label_set_id|
    root_first_labels = frames.reverse.map { |_, label| label }
    {
      stack: root_first_labels,
      weight: weight,
      thread_seq: thread_seq || 0,
      label_set_id: label_set_id || 0,
    }
  end

  converted_label_sets = label_sets.map do |entry|
    next entry unless entry.is_a?(Hash)

    entry.transform_keys(&:to_s)
  end

  [converted_samples, converted_label_sets]
end
107
+
108
# Generate a self-contained static HTML page with one snapshot embedded inline.
# The page still loads d3/d3-flamegraph from CDN but requires no server.
#
# data - snapshot Hash. Reads :aggregated_samples, :label_sets, :mode,
#        :frequency, :duration_ns and :sampling_count; missing samples or
#        label sets are treated as empty.
#
# Returns the HTML document as a String.
def self.render_static_html(data)
  # Time#iso8601 is provided by the "time" stdlib on Rubies where it is not core.
  require "time"

  samples = data[:aggregated_samples] || []
  label_sets = data[:label_sets] || []
  json_samples, json_label_sets = samples_to_json(samples, label_sets)

  json_snapshot = JSON.generate({
    id: 1,
    taken_at: Time.now.iso8601,
    mode: data[:mode],
    frequency: data[:frequency],
    duration_ns: data[:duration_ns],
    sampling_count: data[:sampling_count],
    samples: json_samples,
    label_sets: json_label_sets,
  })

  logo = LOGO_SVG.sub("<svg ", '<svg style="height:36px;width:auto" ')

  # Block form: the replacement is inserted verbatim.
  html = VIEWER_HTML.sub("<!-- LOGO -->") { logo }

  # Hide snapshot selector (single snapshot, no server)
  html = html.sub('<select id="sel-snapshot"', '<select id="sel-snapshot" style="display:none"')

  # Escape for safe embedding in <script>:
  #   - "</" prevents closing </script> tag injection
  #   - U+2028/U+2029 are line terminators in JS but valid in JSON
  json_safe = json_snapshot
    .gsub("</", "<\\/")
    .gsub("\u2028", "\\u2028")
    .gsub("\u2029", "\\u2029")

  # Replace dynamic loading with inline data.
  # The block form is required here: with a replacement *string*, String#sub
  # interprets backslash sequences (\\, \1, \0, ...), which would corrupt the
  # JSON escapes inside json_safe.
  inline_boot = "currentData = #{json_safe}; updateTagDropdowns(); applyAndRender();"
  html.sub("loadSnapshotList();") { inline_boot }
end
146
+
84
147
  private
85
148
 
86
149
  LOGO_SVG = begin
@@ -119,24 +182,9 @@ class Rperf::Viewer
119
182
  return [404, { "content-type" => "text/plain" }, ["Snapshot not found"]] unless entry
120
183
 
121
184
  data = entry[:data]
122
- samples = data[:aggregated_samples]
185
+ samples = data[:aggregated_samples] || []
123
186
  label_sets = data[:label_sets] || []
124
-
125
- # Convert samples to JSON-friendly format.
126
- # Stack is stored top-to-bottom (leaf first) in C; reverse to root-first for flamegraph.
127
- json_samples = samples.map do |frames, weight, thread_seq, label_set_id|
128
- {
129
- stack: frames.reverse.map { |_, label| label },
130
- weight: weight,
131
- thread_seq: thread_seq || 0,
132
- label_set_id: label_set_id || 0,
133
- }
134
- end
135
-
136
- # Convert label_sets: symbol keys to string keys for JSON
137
- json_label_sets = label_sets.map do |ls|
138
- ls.is_a?(Hash) ? ls.transform_keys(&:to_s) : ls
139
- end
187
+ json_samples, json_label_sets = self.class.samples_to_json(samples, label_sets)
140
188
 
141
189
  json_response({
142
190
  id: entry[:id],
@@ -162,6 +210,7 @@ class Rperf::Viewer
162
210
  <html lang="en">
163
211
  <head>
164
212
  <meta charset="utf-8">
213
+ <meta http-equiv="Content-Security-Policy" content="default-src 'none'; script-src 'unsafe-inline' https://cdnjs.cloudflare.com https://cdn.jsdelivr.net; style-src 'unsafe-inline' https://cdn.jsdelivr.net; connect-src 'self'; img-src data:; frame-ancestors 'none'">
165
214
  <title>rperf Viewer</title>
166
215
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/d3-flame-graph@4/dist/d3-flamegraph.css" integrity="sha384-DgAQSBzzhv8bu6Qc6Lq08THluOr+kO5qLMHt1yv8A3my7Jz2OQv6aq/WSZRYIQkG" crossorigin="anonymous">
167
216
  <style>