rperf 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +21 -0
- data/README.md +12 -10
- data/docs/help.md +106 -9
- data/exe/rperf +35 -6
- data/ext/rperf/rperf.c +129 -43
- data/lib/rperf/active_job.rb +1 -0
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf/viewer.rb +66 -17
- data/lib/rperf.rb +514 -60
- metadata +5 -4
data/ext/rperf/rperf.c
CHANGED
|
@@ -25,8 +25,10 @@
|
|
|
25
25
|
#ifdef __linux__
|
|
26
26
|
#define RPERF_USE_TIMER_SIGNAL 1
|
|
27
27
|
#define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
|
|
28
|
+
#define RPERF_COND_CLOCK CLOCK_MONOTONIC
|
|
28
29
|
#else
|
|
29
30
|
#define RPERF_USE_TIMER_SIGNAL 0
|
|
31
|
+
#define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
|
|
30
32
|
#endif
|
|
31
33
|
|
|
32
34
|
#define RPERF_MAX_STACK_DEPTH 512
|
|
@@ -51,6 +53,11 @@ enum rperf_vm_state {
|
|
|
51
53
|
/* ---- Data structures ---- */
|
|
52
54
|
|
|
53
55
|
|
|
56
|
+
enum rperf_mode {
|
|
57
|
+
RPERF_MODE_CPU = 0,
|
|
58
|
+
RPERF_MODE_WALL = 1,
|
|
59
|
+
};
|
|
60
|
+
|
|
54
61
|
enum rperf_gc_phase {
|
|
55
62
|
RPERF_GC_NONE = 0,
|
|
56
63
|
RPERF_GC_MARKING = 1,
|
|
@@ -83,7 +90,7 @@ typedef struct rperf_sample_buffer {
|
|
|
83
90
|
|
|
84
91
|
typedef struct rperf_frame_table {
|
|
85
92
|
_Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
|
|
86
|
-
size_t count;
|
|
93
|
+
_Atomic(size_t) count; /* = next frame_id */
|
|
87
94
|
size_t capacity;
|
|
88
95
|
uint32_t *buckets; /* open addressing: stores index into keys[] */
|
|
89
96
|
size_t bucket_capacity;
|
|
@@ -119,7 +126,6 @@ typedef struct rperf_agg_table {
|
|
|
119
126
|
|
|
120
127
|
typedef struct rperf_thread_data {
|
|
121
128
|
int64_t prev_time_ns;
|
|
122
|
-
int64_t prev_wall_ns;
|
|
123
129
|
/* GVL event tracking */
|
|
124
130
|
int64_t suspended_at_ns; /* wall time at SUSPENDED */
|
|
125
131
|
int64_t ready_at_ns; /* wall time at READY */
|
|
@@ -143,11 +149,12 @@ typedef struct rperf_stats {
|
|
|
143
149
|
size_t sampling_count;
|
|
144
150
|
int64_t sampling_total_ns;
|
|
145
151
|
size_t dropped_samples; /* samples lost due to allocation failure */
|
|
152
|
+
size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
|
|
146
153
|
} rperf_stats_t;
|
|
147
154
|
|
|
148
155
|
typedef struct rperf_profiler {
|
|
149
156
|
int frequency;
|
|
150
|
-
|
|
157
|
+
enum rperf_mode mode;
|
|
151
158
|
_Atomic int running;
|
|
152
159
|
pthread_t worker_thread; /* combined timer + aggregation */
|
|
153
160
|
#if RPERF_USE_TIMER_SIGNAL
|
|
@@ -217,7 +224,7 @@ rperf_profiler_mark(void *ptr)
|
|
|
217
224
|
* If we see an old count, both old and new keys arrays have valid
|
|
218
225
|
* data (old keys are kept alive in old_keys[]). */
|
|
219
226
|
{
|
|
220
|
-
size_t ft_count =
|
|
227
|
+
size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
|
|
221
228
|
VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
|
|
222
229
|
if (ft_keys && ft_count > 0) {
|
|
223
230
|
rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
|
|
@@ -225,12 +232,42 @@ rperf_profiler_mark(void *ptr)
|
|
|
225
232
|
}
|
|
226
233
|
}
|
|
227
234
|
|
|
235
|
+
static size_t
|
|
236
|
+
rperf_profiler_memsize(const void *ptr)
|
|
237
|
+
{
|
|
238
|
+
const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
|
|
239
|
+
size_t size = sizeof(rperf_profiler_t);
|
|
240
|
+
int i;
|
|
241
|
+
|
|
242
|
+
/* Double-buffered sample storage */
|
|
243
|
+
for (i = 0; i < 2; i++) {
|
|
244
|
+
const rperf_sample_buffer_t *buf = &prof->buffers[i];
|
|
245
|
+
size += buf->sample_capacity * sizeof(rperf_sample_t);
|
|
246
|
+
size += buf->frame_pool_capacity * sizeof(VALUE);
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/* Frame table */
|
|
250
|
+
size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
|
|
251
|
+
size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
|
|
252
|
+
for (i = 0; i < prof->frame_table.old_keys_count; i++) {
|
|
253
|
+
/* old_keys entries are previous keys arrays; exact sizes unknown,
|
|
254
|
+
* but the pointer array itself is accounted for below. */
|
|
255
|
+
}
|
|
256
|
+
size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
|
|
257
|
+
|
|
258
|
+
/* Aggregation table */
|
|
259
|
+
size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
|
|
260
|
+
size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
|
|
261
|
+
|
|
262
|
+
return size;
|
|
263
|
+
}
|
|
264
|
+
|
|
228
265
|
static const rb_data_type_t rperf_profiler_type = {
|
|
229
266
|
.wrap_struct_name = "rperf_profiler",
|
|
230
267
|
.function = {
|
|
231
268
|
.dmark = rperf_profiler_mark,
|
|
232
269
|
.dfree = NULL,
|
|
233
|
-
.dsize =
|
|
270
|
+
.dsize = rperf_profiler_memsize,
|
|
234
271
|
},
|
|
235
272
|
};
|
|
236
273
|
|
|
@@ -259,7 +296,7 @@ rperf_wall_time_ns(void)
|
|
|
259
296
|
static int64_t
|
|
260
297
|
rperf_current_time_ns(rperf_profiler_t *prof)
|
|
261
298
|
{
|
|
262
|
-
if (prof->mode ==
|
|
299
|
+
if (prof->mode == RPERF_MODE_CPU) {
|
|
263
300
|
return rperf_cpu_time_ns();
|
|
264
301
|
} else {
|
|
265
302
|
return rperf_wall_time_ns();
|
|
@@ -300,7 +337,7 @@ static int
|
|
|
300
337
|
rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
|
|
301
338
|
{
|
|
302
339
|
if (buf->sample_count >= buf->sample_capacity) {
|
|
303
|
-
if (buf->sample_capacity > SIZE_MAX / 2) return -1;
|
|
340
|
+
if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
|
|
304
341
|
size_t new_cap = buf->sample_capacity * 2;
|
|
305
342
|
rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
|
|
306
343
|
buf->samples,
|
|
@@ -319,7 +356,7 @@ static int
|
|
|
319
356
|
rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
|
|
320
357
|
{
|
|
321
358
|
while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
|
|
322
|
-
if (buf->frame_pool_capacity > SIZE_MAX / 2) return -1;
|
|
359
|
+
if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
|
|
323
360
|
size_t new_cap = buf->frame_pool_capacity * 2;
|
|
324
361
|
VALUE *new_pool = (VALUE *)realloc(
|
|
325
362
|
buf->frame_pool,
|
|
@@ -438,7 +475,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
|
|
|
438
475
|
keys[frame_id] = fval;
|
|
439
476
|
/* Store fence: ensure keys[frame_id] is visible before count is incremented,
|
|
440
477
|
* so GC dmark never reads uninitialized keys[count-1]. */
|
|
441
|
-
|
|
478
|
+
atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
|
|
442
479
|
ft->buckets[idx] = frame_id;
|
|
443
480
|
|
|
444
481
|
/* Rehash if load factor > 0.7 */
|
|
@@ -494,7 +531,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
|
|
|
494
531
|
static void
|
|
495
532
|
rperf_agg_table_rehash(rperf_agg_table_t *at)
|
|
496
533
|
{
|
|
497
|
-
if (at->bucket_capacity > SIZE_MAX / 2) return;
|
|
534
|
+
if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
|
|
498
535
|
size_t new_cap = at->bucket_capacity * 2;
|
|
499
536
|
rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
|
|
500
537
|
if (!new_buckets) return; /* keep using current buckets at higher load factor */
|
|
@@ -519,7 +556,7 @@ static int
|
|
|
519
556
|
rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
|
|
520
557
|
{
|
|
521
558
|
while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
|
|
522
|
-
if (at->stack_pool_capacity > SIZE_MAX / 2) return -1;
|
|
559
|
+
if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
|
|
523
560
|
size_t new_cap = at->stack_pool_capacity * 2;
|
|
524
561
|
uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
|
|
525
562
|
new_cap * sizeof(uint32_t));
|
|
@@ -530,8 +567,9 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
|
|
|
530
567
|
return 0;
|
|
531
568
|
}
|
|
532
569
|
|
|
533
|
-
/* Insert or merge a stack into the aggregation table
|
|
534
|
-
|
|
570
|
+
/* Insert or merge a stack into the aggregation table.
|
|
571
|
+
* Returns 0 on success, -1 on failure (table full or allocation failure). */
|
|
572
|
+
static int
|
|
535
573
|
rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
536
574
|
int depth, int thread_seq, int label_set_id,
|
|
537
575
|
enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
|
|
@@ -548,14 +586,14 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
|
548
586
|
depth * sizeof(uint32_t)) == 0) {
|
|
549
587
|
/* Match — merge weight */
|
|
550
588
|
e->weight += weight;
|
|
551
|
-
return;
|
|
589
|
+
return 0;
|
|
552
590
|
}
|
|
553
591
|
idx = (idx + 1) % at->bucket_capacity;
|
|
554
|
-
if (++probes >= at->bucket_capacity) return; /* table full
|
|
592
|
+
if (++probes >= at->bucket_capacity) return -1; /* table full */
|
|
555
593
|
}
|
|
556
594
|
|
|
557
595
|
/* New entry — append frame_ids to stack_pool */
|
|
558
|
-
if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
|
|
596
|
+
if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;
|
|
559
597
|
|
|
560
598
|
rperf_agg_entry_t *e = &at->buckets[idx];
|
|
561
599
|
e->frame_start = (uint32_t)at->stack_pool_count;
|
|
@@ -576,6 +614,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
|
576
614
|
if (at->count * 10 > at->bucket_capacity * 7) {
|
|
577
615
|
rperf_agg_table_rehash(at);
|
|
578
616
|
}
|
|
617
|
+
return 0;
|
|
579
618
|
}
|
|
580
619
|
|
|
581
620
|
/* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
|
|
@@ -598,25 +637,32 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
|
|
|
598
637
|
/* Convert VALUE frames to frame_ids */
|
|
599
638
|
int overflow = 0;
|
|
600
639
|
for (j = 0; j < s->depth; j++) {
|
|
640
|
+
if (s->frame_start + j >= buf->frame_pool_count) break;
|
|
601
641
|
VALUE fval = buf->frame_pool[s->frame_start + j];
|
|
602
642
|
uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
|
|
603
643
|
if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
|
|
604
644
|
temp_ids[j] = fid;
|
|
605
645
|
}
|
|
606
|
-
if (overflow)
|
|
646
|
+
if (overflow) {
|
|
647
|
+
/* frame_table full — count remaining samples as dropped */
|
|
648
|
+
prof->stats.dropped_aggregation += buf->sample_count - i;
|
|
649
|
+
break;
|
|
650
|
+
}
|
|
607
651
|
|
|
608
652
|
hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
|
|
609
653
|
|
|
610
|
-
rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
|
|
654
|
+
if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
|
|
611
655
|
s->thread_seq, s->label_set_id, s->vm_state,
|
|
612
|
-
s->weight, hash)
|
|
656
|
+
s->weight, hash) < 0) {
|
|
657
|
+
prof->stats.dropped_aggregation++;
|
|
658
|
+
}
|
|
613
659
|
}
|
|
614
660
|
|
|
615
661
|
/* Reset buffer for reuse.
|
|
616
662
|
* Release fence: ensure all frame_table inserts are visible (to GC dmark)
|
|
617
663
|
* before frame_pool_count is cleared, so dmark always has at least one
|
|
618
664
|
* source (frame_table or frame_pool) covering each VALUE. */
|
|
619
|
-
|
|
665
|
+
atomic_thread_fence(memory_order_release);
|
|
620
666
|
buf->sample_count = 0;
|
|
621
667
|
buf->frame_pool_count = 0;
|
|
622
668
|
}
|
|
@@ -693,8 +739,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
|
|
|
693
739
|
{
|
|
694
740
|
rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
|
|
695
741
|
if (!td) return NULL;
|
|
696
|
-
|
|
697
|
-
|
|
742
|
+
int64_t t = rperf_current_time_ns(prof);
|
|
743
|
+
if (t < 0) { free(td); return NULL; }
|
|
744
|
+
td->prev_time_ns = t;
|
|
698
745
|
td->thread_seq = ++prof->next_thread_seq;
|
|
699
746
|
rb_internal_thread_specific_set(thread, prof->ts_key, td);
|
|
700
747
|
return td;
|
|
@@ -737,7 +784,6 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
|
|
|
737
784
|
/* Save timestamp for READY/RESUMED */
|
|
738
785
|
td->suspended_at_ns = wall_now;
|
|
739
786
|
td->prev_time_ns = time_now;
|
|
740
|
-
td->prev_wall_ns = wall_now;
|
|
741
787
|
}
|
|
742
788
|
|
|
743
789
|
static void
|
|
@@ -768,7 +814,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
|
|
|
768
814
|
* Both samples are written directly into the same buffer before calling
|
|
769
815
|
* rperf_try_swap, so that a swap triggered by the first sample cannot
|
|
770
816
|
* move the second into a different buffer with a stale frame_start. */
|
|
771
|
-
if (prof->mode ==
|
|
817
|
+
if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
|
|
772
818
|
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
773
819
|
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
|
|
774
820
|
size_t frame_start = buf->frame_pool_count;
|
|
@@ -798,7 +844,6 @@ skip_gvl:
|
|
|
798
844
|
/* Reset prev times to current — next timer sample measures from resume */
|
|
799
845
|
int64_t time_now = rperf_current_time_ns(prof);
|
|
800
846
|
if (time_now >= 0) td->prev_time_ns = time_now;
|
|
801
|
-
td->prev_wall_ns = wall_now;
|
|
802
847
|
|
|
803
848
|
/* Clear suspended state */
|
|
804
849
|
td->suspended_at_ns = 0;
|
|
@@ -895,17 +940,15 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
|
|
|
895
940
|
|
|
896
941
|
/* ---- Sampling callback (postponed job) — current thread only ---- */
|
|
897
942
|
|
|
898
|
-
|
|
899
|
-
|
|
943
|
+
/* Core sampling logic, parameterized by mode constant.
|
|
944
|
+
* Called from rperf_sample_cpu/rperf_sample_wall so the compiler
|
|
945
|
+
* can inline and eliminate mode branches at compile time. */
|
|
946
|
+
static inline void
|
|
947
|
+
rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
|
|
900
948
|
{
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
if (!prof->running) return;
|
|
904
|
-
if (RPERF_PAUSED(prof)) return;
|
|
905
|
-
|
|
906
|
-
/* Measure sampling overhead */
|
|
949
|
+
/* Measure sampling overhead (wall time — runs under GVL, no I/O) */
|
|
907
950
|
struct timespec ts_start, ts_end;
|
|
908
|
-
clock_gettime(
|
|
951
|
+
clock_gettime(CLOCK_MONOTONIC, &ts_start);
|
|
909
952
|
|
|
910
953
|
VALUE thread = rb_thread_current();
|
|
911
954
|
|
|
@@ -917,12 +960,11 @@ rperf_sample_job(void *arg)
|
|
|
917
960
|
return; /* Skip first sample for this thread */
|
|
918
961
|
}
|
|
919
962
|
|
|
920
|
-
int64_t time_now =
|
|
963
|
+
int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
|
|
921
964
|
if (time_now < 0) return;
|
|
922
965
|
|
|
923
966
|
int64_t weight = time_now - td->prev_time_ns;
|
|
924
967
|
td->prev_time_ns = time_now;
|
|
925
|
-
td->prev_wall_ns = rperf_wall_time_ns();
|
|
926
968
|
|
|
927
969
|
if (weight <= 0) return;
|
|
928
970
|
|
|
@@ -938,13 +980,33 @@ rperf_sample_job(void *arg)
|
|
|
938
980
|
|
|
939
981
|
rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
|
|
940
982
|
|
|
941
|
-
clock_gettime(
|
|
983
|
+
clock_gettime(CLOCK_MONOTONIC, &ts_end);
|
|
942
984
|
prof->stats.sampling_count++;
|
|
943
985
|
prof->stats.sampling_total_ns +=
|
|
944
986
|
((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
|
|
945
987
|
(ts_end.tv_nsec - ts_start.tv_nsec);
|
|
946
988
|
}
|
|
947
989
|
|
|
990
|
+
static void
|
|
991
|
+
rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
|
|
992
|
+
|
|
993
|
+
static void
|
|
994
|
+
rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
|
|
995
|
+
|
|
996
|
+
static void
|
|
997
|
+
rperf_sample_job(void *arg)
|
|
998
|
+
{
|
|
999
|
+
rperf_profiler_t *prof = (rperf_profiler_t *)arg;
|
|
1000
|
+
|
|
1001
|
+
if (!prof->running) return;
|
|
1002
|
+
if (RPERF_PAUSED(prof)) return;
|
|
1003
|
+
|
|
1004
|
+
if (prof->mode == RPERF_MODE_CPU)
|
|
1005
|
+
rperf_sample_cpu(prof);
|
|
1006
|
+
else
|
|
1007
|
+
rperf_sample_wall(prof);
|
|
1008
|
+
}
|
|
1009
|
+
|
|
948
1010
|
/* ---- Worker thread: timer + aggregation ---- */
|
|
949
1011
|
|
|
950
1012
|
#if RPERF_USE_TIMER_SIGNAL
|
|
@@ -990,7 +1052,7 @@ rperf_worker_nanosleep_func(void *arg)
|
|
|
990
1052
|
struct timespec deadline;
|
|
991
1053
|
long interval_ns = 1000000000L / prof->frequency;
|
|
992
1054
|
|
|
993
|
-
clock_gettime(
|
|
1055
|
+
clock_gettime(RPERF_COND_CLOCK, &deadline);
|
|
994
1056
|
deadline.tv_nsec += interval_ns;
|
|
995
1057
|
if (deadline.tv_nsec >= 1000000000L) {
|
|
996
1058
|
deadline.tv_sec++;
|
|
@@ -1005,7 +1067,7 @@ rperf_worker_nanosleep_func(void *arg)
|
|
|
1005
1067
|
CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
|
|
1006
1068
|
prof->worker_paused = 0;
|
|
1007
1069
|
/* Reset deadline on wake to avoid burst of catch-up triggers */
|
|
1008
|
-
clock_gettime(
|
|
1070
|
+
clock_gettime(RPERF_COND_CLOCK, &deadline);
|
|
1009
1071
|
deadline.tv_nsec += interval_ns;
|
|
1010
1072
|
if (deadline.tv_nsec >= 1000000000L) {
|
|
1011
1073
|
deadline.tv_sec++;
|
|
@@ -1076,13 +1138,15 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
|
|
|
1076
1138
|
result = rb_hash_new();
|
|
1077
1139
|
|
|
1078
1140
|
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1079
|
-
ID2SYM(rb_intern(prof->mode ==
|
|
1141
|
+
ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
|
|
1080
1142
|
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
|
|
1081
1143
|
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
|
|
1082
1144
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
|
|
1083
1145
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
|
|
1084
1146
|
if (prof->stats.dropped_samples > 0)
|
|
1085
1147
|
rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
|
|
1148
|
+
if (prof->stats.dropped_aggregation > 0)
|
|
1149
|
+
rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
|
|
1086
1150
|
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
|
|
1087
1151
|
rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
|
|
1088
1152
|
SIZET2NUM(prof->frame_table.count));
|
|
@@ -1154,7 +1218,7 @@ static VALUE
|
|
|
1154
1218
|
rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
|
|
1155
1219
|
{
|
|
1156
1220
|
int frequency = NUM2INT(vfreq);
|
|
1157
|
-
|
|
1221
|
+
enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
|
|
1158
1222
|
int aggregate = RTEST(vagg) ? 1 : 0;
|
|
1159
1223
|
#if RPERF_USE_TIMER_SIGNAL
|
|
1160
1224
|
int sig = NUM2INT(vsig);
|
|
@@ -1173,13 +1237,26 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
|
|
|
1173
1237
|
g_profiler.stats.sampling_total_ns = 0;
|
|
1174
1238
|
g_profiler.stats.trigger_count = 0;
|
|
1175
1239
|
g_profiler.stats.dropped_samples = 0;
|
|
1240
|
+
g_profiler.stats.dropped_aggregation = 0;
|
|
1176
1241
|
atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
|
|
1177
1242
|
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
1178
1243
|
g_profiler.label_sets = Qnil;
|
|
1179
1244
|
|
|
1180
1245
|
/* Initialize worker mutex/cond */
|
|
1181
1246
|
CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
|
|
1247
|
+
#ifdef __linux__
|
|
1248
|
+
{
|
|
1249
|
+
/* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
|
|
1250
|
+
* system clock adjustments (NTP etc.) don't affect timer intervals. */
|
|
1251
|
+
pthread_condattr_t cond_attr;
|
|
1252
|
+
CHECKED(pthread_condattr_init(&cond_attr));
|
|
1253
|
+
CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
|
|
1254
|
+
CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
|
|
1255
|
+
CHECKED(pthread_condattr_destroy(&cond_attr));
|
|
1256
|
+
}
|
|
1257
|
+
#else
|
|
1182
1258
|
CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
|
|
1259
|
+
#endif
|
|
1183
1260
|
|
|
1184
1261
|
/* Initialize sample buffer(s) */
|
|
1185
1262
|
if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
|
|
@@ -1436,13 +1513,15 @@ rb_rperf_stop(VALUE self)
|
|
|
1436
1513
|
|
|
1437
1514
|
result = rb_hash_new();
|
|
1438
1515
|
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1439
|
-
ID2SYM(rb_intern(g_profiler.mode ==
|
|
1516
|
+
ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
|
|
1440
1517
|
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
|
|
1441
1518
|
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
|
|
1442
1519
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
|
|
1443
1520
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
|
|
1444
1521
|
if (g_profiler.stats.dropped_samples > 0)
|
|
1445
1522
|
rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
|
|
1523
|
+
if (g_profiler.stats.dropped_aggregation > 0)
|
|
1524
|
+
rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
|
|
1446
1525
|
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
|
|
1447
1526
|
{
|
|
1448
1527
|
struct timespec stop_monotonic;
|
|
@@ -1508,6 +1587,7 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
|
|
|
1508
1587
|
prof->stats.sampling_count = 0;
|
|
1509
1588
|
prof->stats.sampling_total_ns = 0;
|
|
1510
1589
|
prof->stats.dropped_samples = 0;
|
|
1590
|
+
prof->stats.dropped_aggregation = 0;
|
|
1511
1591
|
|
|
1512
1592
|
/* Reset start timestamps so next snapshot's duration_ns covers
|
|
1513
1593
|
* only the period since this clear. */
|
|
@@ -1652,7 +1732,6 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
|
|
|
1652
1732
|
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
1653
1733
|
if (td) {
|
|
1654
1734
|
td->prev_time_ns = rperf_current_time_ns(prof);
|
|
1655
|
-
td->prev_wall_ns = rperf_wall_time_ns();
|
|
1656
1735
|
}
|
|
1657
1736
|
}
|
|
1658
1737
|
}
|
|
@@ -1692,6 +1771,12 @@ rb_rperf_running_p(VALUE self)
|
|
|
1692
1771
|
return g_profiler.running ? Qtrue : Qfalse;
|
|
1693
1772
|
}
|
|
1694
1773
|
|
|
1774
|
+
static VALUE
|
|
1775
|
+
rb_rperf_profiler_wrapper(VALUE self)
|
|
1776
|
+
{
|
|
1777
|
+
return g_profiler_wrapper;
|
|
1778
|
+
}
|
|
1779
|
+
|
|
1695
1780
|
/* ---- Fork safety ---- */
|
|
1696
1781
|
|
|
1697
1782
|
static void
|
|
@@ -1771,6 +1856,7 @@ Init_rperf(void)
|
|
|
1771
1856
|
rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
|
|
1772
1857
|
rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
|
|
1773
1858
|
rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
|
|
1859
|
+
rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);
|
|
1774
1860
|
|
|
1775
1861
|
memset(&g_profiler, 0, sizeof(g_profiler));
|
|
1776
1862
|
g_profiler.label_sets = Qnil;
|
data/lib/rperf/active_job.rb
CHANGED
data/lib/rperf/version.rb
CHANGED
data/lib/rperf/viewer.rb
CHANGED
|
@@ -3,6 +3,11 @@ require "json"
|
|
|
3
3
|
|
|
4
4
|
# Rack middleware that serves flamegraph visualizations of rperf snapshots.
|
|
5
5
|
#
|
|
6
|
+
# *Security note*: This middleware exposes profiling data without
|
|
7
|
+
# authentication. It is intended for development and staging environments.
|
|
8
|
+
# In production, place it behind an authenticated reverse proxy or restrict
|
|
9
|
+
# access by IP / VPN.
|
|
10
|
+
#
|
|
6
11
|
# Usage:
|
|
7
12
|
# require "rperf/viewer"
|
|
8
13
|
# use Rperf::Viewer # mount at /rperf (default)
|
|
@@ -25,6 +30,7 @@ class Rperf::Viewer
|
|
|
25
30
|
attr_reader :max_snapshots, :path
|
|
26
31
|
|
|
27
32
|
def initialize(app, path: "/rperf", max_snapshots: 24)
|
|
33
|
+
raise ArgumentError, "max_snapshots must be a positive integer, got #{max_snapshots.inspect}" unless max_snapshots.is_a?(Integer) && max_snapshots > 0
|
|
28
34
|
@app = app
|
|
29
35
|
@path = path.chomp("/")
|
|
30
36
|
@max_snapshots = max_snapshots
|
|
@@ -81,6 +87,63 @@ class Rperf::Viewer
|
|
|
81
87
|
end
|
|
82
88
|
end
|
|
83
89
|
|
|
90
|
+
# Convert aggregated samples to JSON-friendly format.
|
|
91
|
+
# Stack is stored top-to-bottom (leaf first) in C; reverse to root-first for flamegraph.
|
|
92
|
+
# Label set keys are converted from symbols to strings for JSON.
|
|
93
|
+
def self.samples_to_json(samples, label_sets)
|
|
94
|
+
json_samples = samples.map do |frames, weight, thread_seq, label_set_id|
|
|
95
|
+
{
|
|
96
|
+
stack: frames.reverse.map { |_, label| label },
|
|
97
|
+
weight: weight,
|
|
98
|
+
thread_seq: thread_seq || 0,
|
|
99
|
+
label_set_id: label_set_id || 0,
|
|
100
|
+
}
|
|
101
|
+
end
|
|
102
|
+
json_label_sets = label_sets.map do |ls|
|
|
103
|
+
ls.is_a?(Hash) ? ls.transform_keys(&:to_s) : ls
|
|
104
|
+
end
|
|
105
|
+
[json_samples, json_label_sets]
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
# Generate a self-contained static HTML file with inline snapshot data.
|
|
109
|
+
# The HTML loads d3/d3-flamegraph from CDN but requires no server.
|
|
110
|
+
def self.render_static_html(data)
|
|
111
|
+
samples = data[:aggregated_samples] || []
|
|
112
|
+
label_sets = data[:label_sets] || []
|
|
113
|
+
json_samples, json_label_sets = samples_to_json(samples, label_sets)
|
|
114
|
+
|
|
115
|
+
json_snapshot = JSON.generate({
|
|
116
|
+
id: 1,
|
|
117
|
+
taken_at: Time.now.iso8601,
|
|
118
|
+
mode: data[:mode],
|
|
119
|
+
frequency: data[:frequency],
|
|
120
|
+
duration_ns: data[:duration_ns],
|
|
121
|
+
sampling_count: data[:sampling_count],
|
|
122
|
+
samples: json_samples,
|
|
123
|
+
label_sets: json_label_sets,
|
|
124
|
+
})
|
|
125
|
+
|
|
126
|
+
logo = LOGO_SVG.sub("<svg ", '<svg style="height:36px;width:auto" ')
|
|
127
|
+
|
|
128
|
+
html = VIEWER_HTML.sub("<!-- LOGO -->") { logo }
|
|
129
|
+
|
|
130
|
+
# Hide snapshot selector (single snapshot, no server)
|
|
131
|
+
html = html.sub('<select id="sel-snapshot"', '<select id="sel-snapshot" style="display:none"')
|
|
132
|
+
|
|
133
|
+
# Replace dynamic loading with inline data.
|
|
134
|
+
# Escape for safe embedding in <script>:
|
|
135
|
+
# - "</" prevents closing </script> tag injection
|
|
136
|
+
# - U+2028/U+2029 are line terminators in JS but valid in JSON
|
|
137
|
+
json_safe = json_snapshot
|
|
138
|
+
.gsub("</", "<\\/")
|
|
139
|
+
.gsub("\u2028", "\\u2028")
|
|
140
|
+
.gsub("\u2029", "\\u2029")
|
|
141
|
+
html = html.sub("loadSnapshotList();",
|
|
142
|
+
"currentData = #{json_safe}; updateTagDropdowns(); applyAndRender();")
|
|
143
|
+
|
|
144
|
+
html
|
|
145
|
+
end
|
|
146
|
+
|
|
84
147
|
private
|
|
85
148
|
|
|
86
149
|
LOGO_SVG = begin
|
|
@@ -119,24 +182,9 @@ class Rperf::Viewer
|
|
|
119
182
|
return [404, { "content-type" => "text/plain" }, ["Snapshot not found"]] unless entry
|
|
120
183
|
|
|
121
184
|
data = entry[:data]
|
|
122
|
-
samples = data[:aggregated_samples]
|
|
185
|
+
samples = data[:aggregated_samples] || []
|
|
123
186
|
label_sets = data[:label_sets] || []
|
|
124
|
-
|
|
125
|
-
# Convert samples to JSON-friendly format.
|
|
126
|
-
# Stack is stored top-to-bottom (leaf first) in C; reverse to root-first for flamegraph.
|
|
127
|
-
json_samples = samples.map do |frames, weight, thread_seq, label_set_id|
|
|
128
|
-
{
|
|
129
|
-
stack: frames.reverse.map { |_, label| label },
|
|
130
|
-
weight: weight,
|
|
131
|
-
thread_seq: thread_seq || 0,
|
|
132
|
-
label_set_id: label_set_id || 0,
|
|
133
|
-
}
|
|
134
|
-
end
|
|
135
|
-
|
|
136
|
-
# Convert label_sets: symbol keys to string keys for JSON
|
|
137
|
-
json_label_sets = label_sets.map do |ls|
|
|
138
|
-
ls.is_a?(Hash) ? ls.transform_keys(&:to_s) : ls
|
|
139
|
-
end
|
|
187
|
+
json_samples, json_label_sets = self.class.samples_to_json(samples, label_sets)
|
|
140
188
|
|
|
141
189
|
json_response({
|
|
142
190
|
id: entry[:id],
|
|
@@ -162,6 +210,7 @@ class Rperf::Viewer
|
|
|
162
210
|
<html lang="en">
|
|
163
211
|
<head>
|
|
164
212
|
<meta charset="utf-8">
|
|
213
|
+
<meta http-equiv="Content-Security-Policy" content="default-src 'none'; script-src 'unsafe-inline' https://cdnjs.cloudflare.com https://cdn.jsdelivr.net; style-src 'unsafe-inline' https://cdn.jsdelivr.net; connect-src 'self'; img-src data:; frame-ancestors 'none'">
|
|
165
214
|
<title>rperf Viewer</title>
|
|
166
215
|
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/d3-flame-graph@4/dist/d3-flamegraph.css" integrity="sha384-DgAQSBzzhv8bu6Qc6Lq08THluOr+kO5qLMHt1yv8A3my7Jz2OQv6aq/WSZRYIQkG" crossorigin="anonymous">
|
|
167
216
|
<style>
|