rperf 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +71 -47
- data/docs/help.md +184 -34
- data/docs/logo.svg +25 -0
- data/exe/rperf +121 -26
- data/ext/rperf/rperf.c +250 -103
- data/lib/rperf/active_job.rb +1 -1
- data/lib/rperf/rack.rb +37 -0
- data/lib/rperf/sidekiq.rb +1 -1
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf/viewer.rb +798 -0
- data/lib/rperf.rb +200 -51
- metadata +7 -5
- data/lib/rperf/middleware.rb +0 -15
data/ext/rperf/rperf.c
CHANGED
@@ -8,6 +8,7 @@
 #include <unistd.h>
 #include <signal.h>
 #include <stdatomic.h>
+#include <sched.h>
 #ifdef __linux__
 #include <sys/syscall.h>
 #endif
@@ -36,23 +37,19 @@
 #define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
 #define RPERF_AGG_TABLE_INITIAL 1024
 #define RPERF_STACK_POOL_INITIAL 4096
-
-
-
-
-
-
-
+#define RPERF_PAUSED(prof) ((prof)->profile_refcount == 0)
+
+/* VM state values (stored in samples, not as stack frames) */
+enum rperf_vm_state {
+    RPERF_VM_STATE_NORMAL = 0,
+    RPERF_VM_STATE_GVL_BLOCKED = 1,
+    RPERF_VM_STATE_GVL_WAIT = 2,
+    RPERF_VM_STATE_GC_MARKING = 3,
+    RPERF_VM_STATE_GC_SWEEPING = 4,
+};
 
 /* ---- Data structures ---- */
 
-enum rperf_sample_type {
-    RPERF_SAMPLE_NORMAL = 0,
-    RPERF_SAMPLE_GVL_BLOCKED = 1, /* off-GVL: SUSPENDED → READY */
-    RPERF_SAMPLE_GVL_WAIT = 2, /* GVL wait: READY → RESUMED */
-    RPERF_SAMPLE_GC_MARKING = 3, /* GC marking phase */
-    RPERF_SAMPLE_GC_SWEEPING = 4, /* GC sweeping phase */
-};
 
 enum rperf_gc_phase {
     RPERF_GC_NONE = 0,
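Aside: 0.8.0 moves the GVL/GC states out of synthetic stack frames and into a per-sample `vm_state` field. A consumer that still wants the old frame labels has to map the enum back to strings itself; a minimal sketch (hypothetical helper, not gem code):

```c
/* Hypothetical helper: recover the display labels the removed synthetic
 * frames used to carry, from the new per-sample vm_state. */
static const char *
rperf_vm_state_label(enum rperf_vm_state state)
{
    switch (state) {
    case RPERF_VM_STATE_GVL_BLOCKED: return "[GVL blocked]";
    case RPERF_VM_STATE_GVL_WAIT:    return "[GVL wait]";
    case RPERF_VM_STATE_GC_MARKING:  return "[GC marking]";
    case RPERF_VM_STATE_GC_SWEEPING: return "[GC sweeping]";
    case RPERF_VM_STATE_NORMAL:
    default:                         return NULL; /* plain on-CPU sample */
    }
}
```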
@@ -64,7 +61,7 @@ typedef struct rperf_sample {
     int depth;
     size_t frame_start; /* index into frame_pool */
     int64_t weight;
-
+    enum rperf_vm_state vm_state;
     int thread_seq; /* thread sequence number (1-based) */
     int label_set_id; /* label set ID (0 = no labels) */
 } rperf_sample_t;
@@ -86,7 +83,7 @@ typedef struct rperf_sample_buffer {
 
 typedef struct rperf_frame_table {
     _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
-    size_t count; /* = next frame_id
+    size_t count; /* = next frame_id */
     size_t capacity;
     uint32_t *buckets; /* open addressing: stores index into keys[] */
     size_t bucket_capacity;
@@ -102,9 +99,10 @@ typedef struct rperf_frame_table {
 
 typedef struct rperf_agg_entry {
     uint32_t frame_start; /* offset into stack_pool */
-    int depth;
+    int depth;
     int thread_seq;
     int label_set_id; /* label set ID (0 = no labels) */
+    enum rperf_vm_state vm_state;
     int64_t weight; /* accumulated */
     uint32_t hash; /* cached hash value */
     int used; /* 0 = empty, 1 = used */
@@ -144,6 +142,7 @@ typedef struct rperf_stats {
     size_t trigger_count;
     size_t sampling_count;
     int64_t sampling_total_ns;
+    size_t dropped_samples; /* samples lost due to allocation failure */
 } rperf_stats_t;
 
 typedef struct rperf_profiler {
@@ -182,6 +181,12 @@ typedef struct rperf_profiler {
     /* Label sets: Ruby Array of Hash objects, managed from Ruby side.
      * Index 0 is reserved (no labels). GC-marked via profiler_mark. */
     VALUE label_sets; /* Ruby Array or Qnil */
+    /* Profile refcount: controls timer active/paused state.
+     * start(defer:false) sets to 1, start(defer:true) sets to 0.
+     * profile_inc/dec transitions 0↔1 arm/disarm the timer.
+     * Modified only under GVL, so plain int is safe. */
+    int profile_refcount;
+    int worker_paused; /* 1 when nanosleep worker is in paused cond_wait */
 } rperf_profiler_t;
 
 static rperf_profiler_t g_profiler;
@@ -215,8 +220,7 @@ rperf_profiler_mark(void *ptr)
     size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
     VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
     if (ft_keys && ft_count > 0) {
-        rb_gc_mark_locations(ft_keys +
-            ft_keys + ft_count);
+        rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
     }
 }
 }
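Note on the marking fix above: `rb_gc_mark_locations` takes a start pointer and an end pointer and marks the VALUE slots from start up to (but not including) end, so the corrected call passes the base of the keys array and one-past-the-last key. A minimal usage sketch (hypothetical helper, assumes ruby.h):

```c
#include <ruby.h>

/* Mark `count` VALUEs starting at `keys` so the GC keeps them alive. */
static void
mark_value_array(const VALUE *keys, size_t count)
{
    if (keys && count > 0)
        rb_gc_mark_locations(keys, keys + count);
}
```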
@@ -253,7 +257,7 @@ rperf_wall_time_ns(void)
 /* ---- Get current thread's time based on profiler mode ---- */
 
 static int64_t
-rperf_current_time_ns(prof
+rperf_current_time_ns(rperf_profiler_t *prof)
 {
     if (prof->mode == 0) {
         return rperf_cpu_time_ns();
@@ -296,6 +300,7 @@ static int
 rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
 {
     if (buf->sample_count >= buf->sample_capacity) {
+        if (buf->sample_capacity > SIZE_MAX / 2) return -1;
         size_t new_cap = buf->sample_capacity * 2;
         rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
             buf->samples,
@@ -314,6 +319,7 @@ static int
 rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
 {
     while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
+        if (buf->frame_pool_capacity > SIZE_MAX / 2) return -1;
         size_t new_cap = buf->frame_pool_capacity * 2;
         VALUE *new_pool = (VALUE *)realloc(
             buf->frame_pool,
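The `SIZE_MAX / 2` checks added to these growth paths all follow the same pattern: refuse to double a capacity that would wrap size_t, and keep the old buffer when realloc fails. A standalone sketch of that pattern (hypothetical `grow_buffer` helper, not gem code):

```c
#include <stdint.h>
#include <stdlib.h>

/* Double *capacity elements of elem_size bytes; return -1 on overflow or
 * allocation failure, in which case the caller keeps the old buffer. */
static int
grow_buffer(void **buf, size_t *capacity, size_t elem_size)
{
    if (*capacity > SIZE_MAX / 2 / elem_size) return -1; /* would overflow */
    size_t new_cap = *capacity * 2;
    void *p = realloc(*buf, new_cap * elem_size);
    if (!p) return -1;
    *buf = p;
    *capacity = new_cap;
    return 0;
}
```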
@@ -334,7 +340,7 @@ rperf_frame_table_init(rperf_frame_table_t *ft)
     VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
     if (!keys) return -1;
     atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
-    ft->count =
+    ft->count = 0;
     ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
     ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
     if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
@@ -366,6 +372,7 @@ rperf_frame_table_free(rperf_frame_table_t *ft)
 static void
 rperf_frame_table_rehash(rperf_frame_table_t *ft)
 {
+    if (ft->bucket_capacity > SIZE_MAX / 2) return;
     size_t new_cap = ft->bucket_capacity * 2;
     uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
     if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -373,7 +380,7 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 
     VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     size_t i;
-    for (i =
+    for (i = 0; i < ft->count; i++) {
         uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
         size_t idx = h % new_cap;
         while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
@@ -394,11 +401,13 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
     uint32_t h = (uint32_t)(fval >> 3);
     size_t idx = h % ft->bucket_capacity;
 
+    size_t probes = 0;
     while (1) {
         uint32_t slot = ft->buckets[idx];
         if (slot == RPERF_FRAME_TABLE_EMPTY) break;
         if (keys[slot] == fval) return slot;
         idx = (idx + 1) % ft->bucket_capacity;
+        if (++probes >= ft->bucket_capacity) return RPERF_FRAME_TABLE_EMPTY; /* table full */
     }
 
     /* Insert new entry. Grow keys array if capacity is exhausted.
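The probe counter added here bounds what was previously an unbounded loop: a table whose buckets are all occupied would otherwise spin forever. A standalone sketch of the same bounded linear-probe lookup (hypothetical names, uintptr_t standing in for VALUE):

```c
#include <stddef.h>
#include <stdint.h>

#define SLOT_EMPTY UINT32_MAX

/* Return the value stored for `key`, or SLOT_EMPTY when the key is absent
 * or every bucket has already been probed (table full). */
static uint32_t
probe_bounded(const uint32_t *buckets, size_t bucket_capacity,
              const uintptr_t *keys, uintptr_t key)
{
    size_t idx = (size_t)((uint32_t)(key >> 3)) % bucket_capacity;
    size_t probes = 0;
    while (buckets[idx] != SLOT_EMPTY) {
        if (keys[buckets[idx]] == key) return buckets[idx];
        idx = (idx + 1) % bucket_capacity;
        if (++probes >= bucket_capacity) return SLOT_EMPTY; /* table full */
    }
    return SLOT_EMPTY; /* not found; idx would be the insertion slot */
}
```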
@@ -406,6 +415,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
      * the old keys pointer. Instead, allocate new, copy, swap pointer
      * atomically, and keep old array alive until stop. */
     if (ft->count >= ft->capacity) {
+        if (ft->capacity > SIZE_MAX / 2) return RPERF_FRAME_TABLE_EMPTY;
         size_t new_cap = ft->capacity * 2;
         VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
         if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
@@ -442,7 +452,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
 
 static uint32_t
-rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
+rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id, enum rperf_vm_state vm_state)
 {
     uint32_t h = 2166136261u;
     int i;
@@ -454,6 +464,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
     h *= 16777619u;
     h ^= (uint32_t)label_set_id;
     h *= 16777619u;
+    h ^= (uint32_t)vm_state;
+    h *= 16777619u;
     return h;
 }
 
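For reference, the constants above are the standard 32-bit FNV-1a offset basis and prime, applied word-wise (per frame id) rather than byte-wise, and the new vm_state parameter is simply one more value folded into the running hash so samples differing only in VM state land in different aggregation entries. A small standalone sketch of the same mixing (hypothetical names):

```c
#include <stdint.h>

/* 32-bit FNV-1a step: xor the next value in, then multiply by the FNV prime. */
static uint32_t
fnv1a_mix(uint32_t h, uint32_t value)
{
    h ^= value;
    h *= 16777619u;
    return h;
}

static uint32_t
stack_key_hash(const uint32_t *frame_ids, int len,
               uint32_t thread_seq, uint32_t label_set_id, uint32_t vm_state)
{
    uint32_t h = 2166136261u; /* FNV offset basis */
    for (int i = 0; i < len; i++) h = fnv1a_mix(h, frame_ids[i]);
    h = fnv1a_mix(h, thread_seq);
    h = fnv1a_mix(h, label_set_id);
    h = fnv1a_mix(h, vm_state);
    return h;
}
```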
@@ -482,6 +494,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
 static void
 rperf_agg_table_rehash(rperf_agg_table_t *at)
 {
+    if (at->bucket_capacity > SIZE_MAX / 2) return;
     size_t new_cap = at->bucket_capacity * 2;
     rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
     if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -506,6 +519,7 @@ static int
 rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 {
     while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
+        if (at->stack_pool_capacity > SIZE_MAX / 2) return -1;
         size_t new_cap = at->stack_pool_capacity * 2;
         uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
                                                  new_cap * sizeof(uint32_t));
@@ -520,15 +534,16 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 static void
 rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
                        int depth, int thread_seq, int label_set_id,
-                       int64_t weight, uint32_t hash)
+                       enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
 {
     size_t idx = hash % at->bucket_capacity;
 
+    size_t probes = 0;
     while (1) {
         rperf_agg_entry_t *e = &at->buckets[idx];
         if (!e->used) break;
         if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
-            e->label_set_id == label_set_id &&
+            e->label_set_id == label_set_id && e->vm_state == vm_state &&
             memcmp(at->stack_pool + e->frame_start, frame_ids,
                    depth * sizeof(uint32_t)) == 0) {
             /* Match — merge weight */
@@ -536,6 +551,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
             return;
         }
         idx = (idx + 1) % at->bucket_capacity;
+        if (++probes >= at->bucket_capacity) return; /* table full, drop sample */
     }
 
     /* New entry — append frame_ids to stack_pool */
@@ -546,6 +562,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
     e->depth = depth;
     e->thread_seq = thread_seq;
     e->label_set_id = label_set_id;
+    e->vm_state = vm_state;
     e->weight = weight;
     e->hash = hash;
     e->used = 1;
@@ -567,24 +584,16 @@ static void
 rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
 {
     size_t i;
-    uint32_t temp_ids[RPERF_MAX_STACK_DEPTH
+    uint32_t temp_ids[RPERF_MAX_STACK_DEPTH];
 
     for (i = 0; i < buf->sample_count; i++) {
         rperf_sample_t *s = &buf->samples[i];
-        int off = 0;
         uint32_t hash;
         int j;
 
-        /*
-        if (s->
-
-        } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
-        } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
-        } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
-        }
+        /* Clamp depth to temp_ids[] capacity */
+        if (s->depth > RPERF_MAX_STACK_DEPTH)
+            s->depth = RPERF_MAX_STACK_DEPTH;
 
         /* Convert VALUE frames to frame_ids */
         int overflow = 0;
@@ -592,15 +601,15 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
             VALUE fval = buf->frame_pool[s->frame_start + j];
             uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
             if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
-            temp_ids[
+            temp_ids[j] = fid;
         }
         if (overflow) break; /* frame_table full, stop aggregating this buffer */
 
-
-        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
+        hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
 
-        rperf_agg_table_insert(&prof->agg_table, temp_ids,
-            s->thread_seq, s->label_set_id, s->
+        rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
+                               s->thread_seq, s->label_set_id, s->vm_state,
+                               s->weight, hash);
     }
 
     /* Reset buffer for reuse.
@@ -650,7 +659,7 @@ rperf_try_swap(rperf_profiler_t *prof)
 /* Write a sample into a specific buffer. No swap check. */
 static int
 rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
-                   int64_t weight,
+                   int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
 {
     if (weight <= 0) return 0;
     if (rperf_ensure_sample_capacity(buf) < 0) return -1;
@@ -659,7 +668,7 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
     sample->depth = depth;
     sample->frame_start = frame_start;
     sample->weight = weight;
-    sample->
+    sample->vm_state = vm_state;
     sample->thread_seq = thread_seq;
     sample->label_set_id = label_set_id;
     buf->sample_count++;
@@ -668,10 +677,11 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
 
 static void
 rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
-                    int64_t weight,
+                    int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
 {
     rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
-    rperf_write_sample(buf, frame_start, depth, weight,
+    if (rperf_write_sample(buf, frame_start, depth, weight, vm_state, thread_seq, label_set_id) < 0)
+        prof->stats.dropped_samples++;
     rperf_try_swap(prof);
 }
 
@@ -683,7 +693,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 {
     rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
     if (!td) return NULL;
-    td->prev_time_ns = rperf_current_time_ns(prof
+    td->prev_time_ns = rperf_current_time_ns(prof);
     td->prev_wall_ns = rperf_wall_time_ns();
     td->thread_seq = ++prof->next_thread_seq;
     rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -706,7 +716,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
         is_first = 1;
     }
 
-    int64_t time_now = rperf_current_time_ns(prof
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now < 0) return;
 
     /* Capture backtrace into active buffer's frame_pool */
@@ -718,10 +728,10 @@
     if (depth <= 0) return;
     buf->frame_pool_count += depth;
 
-    /* Record normal sample (skip if first time — no prev_time) */
-    if (!is_first) {
+    /* Record normal sample (skip if first time — no prev_time, or if paused) */
+    if (!is_first && !RPERF_PAUSED(prof)) {
         int64_t weight = time_now - td->prev_time_ns;
-        rperf_record_sample(prof, frame_start, depth, weight,
+        rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
     }
 
     /* Save timestamp for READY/RESUMED */
@@ -758,7 +768,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
      * Both samples are written directly into the same buffer before calling
      * rperf_try_swap, so that a swap triggered by the first sample cannot
      * move the second into a different buffer with a stale frame_start. */
-    if (prof->mode == 1 && td->suspended_at_ns > 0) {
+    if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
        rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
        size_t frame_start = buf->frame_pool_count;
@@ -770,13 +780,15 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
         /* Write both samples into the same buf, then swap-check once */
         if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
             int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
-            rperf_write_sample(buf, frame_start, depth, blocked_ns,
-
+            if (rperf_write_sample(buf, frame_start, depth, blocked_ns,
+                                   RPERF_VM_STATE_GVL_BLOCKED, td->thread_seq, td->label_set_id) < 0)
+                prof->stats.dropped_samples++;
         }
         if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
             int64_t wait_ns = wall_now - td->ready_at_ns;
-            rperf_write_sample(buf, frame_start, depth, wait_ns,
-
+            if (rperf_write_sample(buf, frame_start, depth, wait_ns,
+                                   RPERF_VM_STATE_GVL_WAIT, td->thread_seq, td->label_set_id) < 0)
+                prof->stats.dropped_samples++;
         }
 
         rperf_try_swap(prof);
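A worked example of the two intervals this hunk measures, with illustrative numbers only:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* A thread released the GVL at t=100, became READY (runnable, waiting
     * for the GVL) at t=160, and was RESUMED at wall_now=175 (all ns). */
    int64_t suspended_at_ns = 100, ready_at_ns = 160, wall_now = 175;

    int64_t blocked_ns = ready_at_ns - suspended_at_ns; /* off-GVL work: 60 */
    int64_t wait_ns    = wall_now - ready_at_ns;        /* GVL wait: 15 */

    assert(blocked_ns == 60 && wait_ns == 15);
    return 0;
}
```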
@@ -784,7 +796,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
 skip_gvl:
 
     /* Reset prev times to current — next timer sample measures from resume */
-    int64_t time_now = rperf_current_time_ns(prof
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now >= 0) td->prev_time_ns = time_now;
     td->prev_wall_ns = wall_now;
 
@@ -851,12 +863,13 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
         if (prof->gc.enter_ns <= 0) return;
+        if (RPERF_PAUSED(prof)) { prof->gc.enter_ns = 0; return; }
 
         int64_t wall_now = rperf_wall_time_ns();
         int64_t weight = wall_now - prof->gc.enter_ns;
-
-            ?
-            :
+        enum rperf_vm_state vm_state = (prof->gc.phase == RPERF_GC_SWEEPING)
+            ? RPERF_VM_STATE_GC_SWEEPING
+            : RPERF_VM_STATE_GC_MARKING;
 
         /* Capture backtrace here (not at GC_ENTER) so that frame_start
          * always indexes into the current active buffer. The Ruby stack
@@ -875,7 +888,7 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
         }
         buf->frame_pool_count += depth;
 
-        rperf_record_sample(prof, frame_start, depth, weight,
+        rperf_record_sample(prof, frame_start, depth, weight, vm_state, prof->gc.thread_seq, prof->gc.label_set_id);
         prof->gc.enter_ns = 0;
     }
 }
@@ -888,6 +901,7 @@ rperf_sample_job(void *arg)
     rperf_profiler_t *prof = (rperf_profiler_t *)arg;
 
     if (!prof->running) return;
+    if (RPERF_PAUSED(prof)) return;
 
     /* Measure sampling overhead */
     struct timespec ts_start, ts_end;
@@ -903,7 +917,7 @@ rperf_sample_job(void *arg)
         return; /* Skip first sample for this thread */
     }
 
-    int64_t time_now = rperf_current_time_ns(prof
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now < 0) return;
 
     int64_t weight = time_now - td->prev_time_ns;
@@ -922,7 +936,7 @@ rperf_sample_job(void *arg)
     if (depth <= 0) return;
     buf->frame_pool_count += depth;
 
-    rperf_record_sample(prof, frame_start, depth, weight,
+    rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
 
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
     prof->stats.sampling_count++;
@@ -985,20 +999,34 @@ rperf_worker_nanosleep_func(void *arg)
 
     CHECKED(pthread_mutex_lock(&prof->worker_mutex));
     while (prof->running) {
-
-
-
-
-
-
-
-        rb_postponed_job_trigger(prof->pj_handle);
-        /* Advance deadline by interval */
+        if (RPERF_PAUSED(prof)) {
+            /* Paused: mark as paused so disarm can confirm, then wait */
+            prof->worker_paused = 1;
+            CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+            prof->worker_paused = 0;
+            /* Reset deadline on wake to avoid burst of catch-up triggers */
+            clock_gettime(CLOCK_REALTIME, &deadline);
             deadline.tv_nsec += interval_ns;
             if (deadline.tv_nsec >= 1000000000L) {
                 deadline.tv_sec++;
                 deadline.tv_nsec -= 1000000000L;
             }
+        } else {
+            int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
+            if (ret != 0 && ret != ETIMEDOUT) {
+                fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
+                abort();
+            }
+            if (ret == ETIMEDOUT) {
+                prof->stats.trigger_count++;
+                rb_postponed_job_trigger(prof->pj_handle);
+                /* Advance deadline by interval */
+                deadline.tv_nsec += interval_ns;
+                if (deadline.tv_nsec >= 1000000000L) {
+                    deadline.tv_sec++;
+                    deadline.tv_nsec -= 1000000000L;
+                }
+            }
         }
         rperf_try_aggregate(prof);
     }
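A condensed, standalone sketch of the worker pattern introduced above, using hypothetical names (`struct ticker`, `ticker_thread`): the thread parks in `pthread_cond_wait` while paused and ticks off an absolute deadline with `pthread_cond_timedwait` while active.

```c
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <time.h>

struct ticker {
    pthread_mutex_t mu;
    pthread_cond_t  cv;
    int  running;
    int  paused;
    long interval_ns;
};

/* Advance an absolute deadline by one interval, normalizing tv_nsec. */
static void advance(struct timespec *deadline, long interval_ns)
{
    deadline->tv_nsec += interval_ns;
    if (deadline->tv_nsec >= 1000000000L) {
        deadline->tv_sec++;
        deadline->tv_nsec -= 1000000000L;
    }
}

static void *ticker_thread(void *arg)
{
    struct ticker *t = arg;
    struct timespec deadline;
    clock_gettime(CLOCK_REALTIME, &deadline);
    advance(&deadline, t->interval_ns);

    pthread_mutex_lock(&t->mu);
    while (t->running) {
        if (t->paused) {
            /* Park until resumed; restart the deadline afterwards so we do
             * not fire a burst of catch-up ticks. */
            pthread_cond_wait(&t->cv, &t->mu);
            clock_gettime(CLOCK_REALTIME, &deadline);
            advance(&deadline, t->interval_ns);
        } else {
            int rc = pthread_cond_timedwait(&t->cv, &t->mu, &deadline);
            if (rc == ETIMEDOUT) {
                /* tick: the real worker triggers its postponed sampling job here */
                advance(&deadline, t->interval_ns);
            } else if (rc != 0) {
                abort(); /* unexpected condvar failure */
            }
        }
    }
    pthread_mutex_unlock(&t->mu);
    return NULL;
}
```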
@@ -1053,9 +1081,11 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
     rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
     rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
     rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+    if (prof->stats.dropped_samples > 0)
+        rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
     rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
     rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
-                 SIZET2NUM(prof->frame_table.count
+                 SIZET2NUM(prof->frame_table.count));
     rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
                  SIZET2NUM(prof->agg_table.count));
 
@@ -1074,11 +1104,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
     {
         rperf_frame_table_t *ft = &prof->frame_table;
         VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
-
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
-        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+        for (i = 0; i < ft->count; i++) {
            rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
         }
 
@@ -1090,11 +1116,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
 
             VALUE frames = rb_ary_new_capa(e->depth);
             for (j = 0; j < e->depth; j++) {
+                if (e->frame_start + j >= at->stack_pool_count) break;
                 uint32_t fid = at->stack_pool[e->frame_start + j];
+                if (fid >= ft->count) break;
                 rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
             }
 
-            VALUE sample =
+            VALUE sample = rb_ary_new_capa(5);
+            rb_ary_push(sample, frames);
+            rb_ary_push(sample, LONG2NUM(e->weight));
+            rb_ary_push(sample, INT2NUM(e->thread_seq));
+            rb_ary_push(sample, INT2NUM(e->label_set_id));
+            rb_ary_push(sample, INT2NUM(e->vm_state));
             rb_ary_push(samples_ary, sample);
         }
     }
@@ -1110,14 +1143,15 @@
 
 /* ---- Ruby API ---- */
 
-/* _c_start(frequency, mode, aggregate, signal)
+/* _c_start(frequency, mode, aggregate, signal, defer)
  * frequency: Integer (Hz)
  * mode: 0 = cpu, 1 = wall
  * aggregate: 0 or 1
  * signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
+ * defer: if truthy, start with timer paused (profile_refcount = 0)
  */
 static VALUE
-rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
+rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
 {
     int frequency = NUM2INT(vfreq);
     int mode = NUM2INT(vmode);
@@ -1138,6 +1172,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
     g_profiler.stats.trigger_count = 0;
+    g_profiler.stats.dropped_samples = 0;
     atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
     g_profiler.label_sets = Qnil;
@@ -1222,6 +1257,8 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     clock_gettime(CLOCK_MONOTONIC, &g_profiler.start_monotonic);
 
     g_profiler.running = 1;
+    g_profiler.profile_refcount = RTEST(vdefer) ? 0 : 1;
+    g_profiler.worker_paused = 0;
 
 #if RPERF_USE_TIMER_SIGNAL
     g_profiler.timer_signal = timer_signal;
@@ -1269,7 +1306,12 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     }
 
     its.it_value.tv_sec = 0;
-
+    if (RPERF_PAUSED(&g_profiler)) {
+        /* defer mode: create timer but don't arm it */
+        its.it_value.tv_nsec = 0;
+    } else {
+        its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
+    }
     its.it_interval = its.it_value;
     if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
         timer_delete(g_profiler.timer_id);
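The defer branch above leans on a POSIX timer property worth spelling out: `timer_settime` with an all-zero `it_value` leaves the timer created but disarmed, and a later `timer_settime` with a non-zero `it_value` (and matching `it_interval`) arms it for periodic delivery. A minimal sketch (hypothetical helpers, not gem code):

```c
#include <time.h>

/* Arm a created POSIX timer for periodic delivery every period_ns. */
static int
arm_periodic(timer_t timer_id, long period_ns)
{
    struct itimerspec its;
    its.it_value.tv_sec  = 0;
    its.it_value.tv_nsec = period_ns;    /* first expiry */
    its.it_interval      = its.it_value; /* then repeat at the same period */
    return timer_settime(timer_id, 0, &its, NULL);
}

/* Disarm it again: an all-zero it_value leaves the timer created but silent. */
static int
disarm(timer_t timer_id)
{
    struct itimerspec its = {0};
    return timer_settime(timer_id, 0, &its, NULL);
}
```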
@@ -1320,9 +1362,7 @@ timer_fail:
 static VALUE
 rb_rperf_stop(VALUE self)
 {
-    VALUE result
-    size_t i;
-    int j;
+    VALUE result;
 
     if (!g_profiler.running) {
         return Qnil;
@@ -1389,6 +1429,9 @@ rb_rperf_stop(VALUE self)
         rperf_agg_table_free(&g_profiler.agg_table);
     } else {
         /* Raw samples path (aggregate: false) */
+        VALUE samples_ary;
+        size_t i;
+        int j;
         rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
 
         result = rb_hash_new();
@@ -1398,6 +1441,8 @@ rb_rperf_stop(VALUE self)
         rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
         rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
         rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+        if (g_profiler.stats.dropped_samples > 0)
+            rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
         rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
         {
             struct timespec stop_monotonic;
@@ -1414,29 +1459,20 @@ rb_rperf_stop(VALUE self)
         samples_ary = rb_ary_new_capa((long)buf->sample_count);
         for (i = 0; i < buf->sample_count; i++) {
             rperf_sample_t *s = &buf->samples[i];
-            VALUE frames = rb_ary_new_capa(s->depth
-
-            /* Prepend synthetic frame at leaf position (index 0) */
-            if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
-                rb_ary_push(frames, syn);
-            }
+            VALUE frames = rb_ary_new_capa(s->depth);
 
             for (j = 0; j < s->depth; j++) {
+                if (s->frame_start + j >= buf->frame_pool_count) break;
                 VALUE fval = buf->frame_pool[s->frame_start + j];
                 rb_ary_push(frames, rperf_resolve_frame(fval));
             }
 
-            VALUE sample =
+            VALUE sample = rb_ary_new_capa(5);
+            rb_ary_push(sample, frames);
+            rb_ary_push(sample, LONG2NUM(s->weight));
+            rb_ary_push(sample, INT2NUM(s->thread_seq));
+            rb_ary_push(sample, INT2NUM(s->label_set_id));
+            rb_ary_push(sample, INT2NUM(s->vm_state));
             rb_ary_push(samples_ary, sample);
         }
         rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
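Raw (non-aggregated) samples are now returned as 5-element arrays: `[frames, weight, thread_seq, label_set_id, vm_state]`. A sketch of unpacking one such tuple with Ruby's C API (hypothetical consumer-side helper, not part of the gem):

```c
#include <ruby.h>

static void
unpack_raw_sample(VALUE sample)
{
    VALUE frames       = rb_ary_entry(sample, 0); /* Array of resolved frames */
    long  weight_ns    = NUM2LONG(rb_ary_entry(sample, 1));
    int   thread_seq   = NUM2INT(rb_ary_entry(sample, 2));
    int   label_set_id = NUM2INT(rb_ary_entry(sample, 3));
    int   vm_state     = NUM2INT(rb_ary_entry(sample, 4));
    (void)frames; (void)weight_ns; (void)thread_seq;
    (void)label_set_id; (void)vm_state;
}
```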
@@ -1471,6 +1507,7 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
     prof->stats.trigger_count = 0;
     prof->stats.sampling_count = 0;
     prof->stats.sampling_total_ns = 0;
+    prof->stats.dropped_samples = 0;
 
     /* Reset start timestamps so next snapshot's duration_ns covers
      * only the period since this clear. */
@@ -1558,6 +1595,103 @@ rb_rperf_get_label_sets(VALUE self)
     return g_profiler.label_sets;
 }
 
+/* ---- Profile refcount API (timer pause/resume) ---- */
+
+/* Helper: arm the timer with the configured interval */
+static void
+rperf_arm_timer(rperf_profiler_t *prof)
+{
+#if RPERF_USE_TIMER_SIGNAL
+    if (prof->timer_signal > 0) {
+        struct itimerspec its;
+        its.it_value.tv_sec = 0;
+        its.it_value.tv_nsec = 1000000000L / prof->frequency;
+        its.it_interval = its.it_value;
+        timer_settime(prof->timer_id, 0, &its, NULL);
+        return;
+    }
+#endif
+    /* nanosleep mode: signal the worker to wake from cond_wait */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    CHECKED(pthread_cond_signal(&prof->worker_cond));
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+}
+
+/* Helper: disarm the timer (stop firing) */
+static void
+rperf_disarm_timer(rperf_profiler_t *prof)
+{
+#if RPERF_USE_TIMER_SIGNAL
+    if (prof->timer_signal > 0) {
+        struct itimerspec its;
+        memset(&its, 0, sizeof(its));
+        timer_settime(prof->timer_id, 0, &its, NULL);
+        return;
+    }
+#endif
+    /* nanosleep mode: wake the worker and wait until it enters paused state */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    while (!prof->worker_paused) {
+        CHECKED(pthread_cond_signal(&prof->worker_cond));
+        CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+        sched_yield();
+        CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    }
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+}
+
+/* Helper: reset prev_time_ns for all threads (called on resume to avoid
+ * inflated weight from pause duration). Must be called with GVL held. */
+static void
+rperf_reset_thread_times(rperf_profiler_t *prof)
+{
+    VALUE threads = rb_funcall(rb_cThread, rb_intern("list"), 0);
+    long tc = RARRAY_LEN(threads);
+    for (long i = 0; i < tc; i++) {
+        VALUE thread = RARRAY_AREF(threads, i);
+        rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
+        if (td) {
+            td->prev_time_ns = rperf_current_time_ns(prof);
+            td->prev_wall_ns = rperf_wall_time_ns();
+        }
+    }
+}
+
+/* _c_profile_inc() — increment profile refcount; resume timer on 0→1.
+ * Called with GVL held. */
+static VALUE
+rb_rperf_profile_inc(VALUE self)
+{
+    if (!g_profiler.running) return Qfalse;
+    g_profiler.profile_refcount++;
+    if (g_profiler.profile_refcount == 1) {
+        rperf_reset_thread_times(&g_profiler);
+        rperf_arm_timer(&g_profiler);
+    }
+    return Qtrue;
+}
+
+/* _c_profile_dec() — decrement profile refcount; pause timer on 1→0.
+ * Called with GVL held. */
+static VALUE
+rb_rperf_profile_dec(VALUE self)
+{
+    if (!g_profiler.running) return Qfalse;
+    if (g_profiler.profile_refcount <= 0) return Qfalse;
+    g_profiler.profile_refcount--;
+    if (g_profiler.profile_refcount == 0) {
+        rperf_disarm_timer(&g_profiler);
+    }
+    return Qtrue;
+}
+
+/* _c_running?() — check if profiler is running. */
+static VALUE
+rb_rperf_running_p(VALUE self)
+{
+    return g_profiler.running ? Qtrue : Qfalse;
+}
+
 /* ---- Fork safety ---- */
 
 static void
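A sketch of the transition rule these functions enforce, assuming a caller in the same translation unit since the functions are static: only the outermost 0↔1 transition touches the timer, so nested profiled regions cost nothing extra.

```c
/* Illustrative only, not gem code. */
static void
example_nested_regions(void)
{
    rb_rperf_profile_inc(Qnil); /* 0 -> 1: reset thread times, arm timer */
    rb_rperf_profile_inc(Qnil); /* 1 -> 2: no timer change */
    rb_rperf_profile_dec(Qnil); /* 2 -> 1: no timer change */
    rb_rperf_profile_dec(Qnil); /* 1 -> 0: disarm timer */
}
```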
@@ -1568,6 +1702,14 @@ rperf_after_fork_child(void)
     /* Mark as not running — timer doesn't exist in child */
     g_profiler.running = 0;
 
+    /* Re-initialize mutex/condvar — they may have been locked by the parent's
+     * worker thread at fork time and are in an undefined state in the child.
+     * POSIX says only async-signal-safe functions should be called in atfork
+     * child handlers, but pthread_mutex_init is safe on Linux/glibc/musl and
+     * this is the standard pattern (e.g., Python, Go do the same). */
+    pthread_mutex_init(&g_profiler.worker_mutex, NULL);
+    pthread_cond_init(&g_profiler.worker_cond, NULL);
+
 #if RPERF_USE_TIMER_SIGNAL
     /* timer_create timers are not inherited across fork, but pending signals may be.
      * Block the signal, drain any pending instances, then restore old handler. */
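The comment in this hunk refers to the usual atfork pattern; a standalone sketch of registering such a child handler (hypothetical names, not the gem's actual registration site):

```c
#include <pthread.h>

static pthread_mutex_t worker_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  worker_cond  = PTHREAD_COND_INITIALIZER;

/* The child inherits the mutex/condvar in whatever state the parent's
 * threads left them; re-initializing discards any lock held at fork time. */
static void
after_fork_in_child(void)
{
    pthread_mutex_init(&worker_mutex, NULL);
    pthread_cond_init(&worker_cond, NULL);
}

static void
install_fork_handler(void)
{
    pthread_atfork(NULL, NULL, after_fork_in_child);
}
```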
@@ -1608,6 +1750,8 @@
     /* Reset stats */
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
+    g_profiler.stats.dropped_samples = 0;
+    g_profiler.profile_refcount = 0;
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
 }
 
@@ -1617,13 +1761,16 @@ void
 Init_rperf(void)
 {
     VALUE mRperf = rb_define_module("Rperf");
-    rb_define_module_function(mRperf, "_c_start", rb_rperf_start,
+    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 5);
     rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
     rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
     rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
     rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
     rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
     rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
+    rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
+    rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
+    rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
 
     memset(&g_profiler, 0, sizeof(g_profiler));
     g_profiler.label_sets = Qnil;