rperf 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +21 -0
- data/README.md +75 -49
- data/docs/help.md +255 -36
- data/docs/logo.svg +25 -0
- data/exe/rperf +154 -30
- data/ext/rperf/rperf.c +235 -121
- data/lib/rperf/active_job.rb +1 -0
- data/lib/rperf/rack.rb +25 -3
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf/viewer.rb +847 -0
- data/lib/rperf.rb +663 -92
- metadata +7 -4
data/ext/rperf/rperf.c
CHANGED
```diff
@@ -8,6 +8,7 @@
 #include <unistd.h>
 #include <signal.h>
 #include <stdatomic.h>
+#include <sched.h>
 #ifdef __linux__
 #include <sys/syscall.h>
 #endif
@@ -24,8 +25,10 @@
 #ifdef __linux__
 #define RPERF_USE_TIMER_SIGNAL 1
 #define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
+#define RPERF_COND_CLOCK CLOCK_MONOTONIC
 #else
 #define RPERF_USE_TIMER_SIGNAL 0
+#define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
 #endif
 
 #define RPERF_MAX_STACK_DEPTH 512
@@ -38,21 +41,21 @@
 #define RPERF_STACK_POOL_INITIAL 4096
 #define RPERF_PAUSED(prof) ((prof)->profile_refcount == 0)
 
-/*
-
-
-
-
-
+/* VM state values (stored in samples, not as stack frames) */
+enum rperf_vm_state {
+    RPERF_VM_STATE_NORMAL = 0,
+    RPERF_VM_STATE_GVL_BLOCKED = 1,
+    RPERF_VM_STATE_GVL_WAIT = 2,
+    RPERF_VM_STATE_GC_MARKING = 3,
+    RPERF_VM_STATE_GC_SWEEPING = 4,
+};
 
 /* ---- Data structures ---- */
 
-
-
-
-
-    RPERF_SAMPLE_GC_MARKING = 3, /* GC marking phase */
-    RPERF_SAMPLE_GC_SWEEPING = 4, /* GC sweeping phase */
+
+enum rperf_mode {
+    RPERF_MODE_CPU = 0,
+    RPERF_MODE_WALL = 1,
 };
 
 enum rperf_gc_phase {
```
```diff
@@ -65,7 +68,7 @@ typedef struct rperf_sample {
     int depth;
     size_t frame_start; /* index into frame_pool */
     int64_t weight;
-
+    enum rperf_vm_state vm_state;
     int thread_seq; /* thread sequence number (1-based) */
     int label_set_id; /* label set ID (0 = no labels) */
 } rperf_sample_t;
@@ -87,7 +90,7 @@ typedef struct rperf_sample_buffer {
 
 typedef struct rperf_frame_table {
     _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
-    size_t count;
+    _Atomic(size_t) count; /* = next frame_id */
     size_t capacity;
     uint32_t *buckets; /* open addressing: stores index into keys[] */
     size_t bucket_capacity;
@@ -103,9 +106,10 @@ typedef struct rperf_frame_table {
 
 typedef struct rperf_agg_entry {
     uint32_t frame_start; /* offset into stack_pool */
-    int depth;
+    int depth;
     int thread_seq;
     int label_set_id; /* label set ID (0 = no labels) */
+    enum rperf_vm_state vm_state;
     int64_t weight; /* accumulated */
     uint32_t hash; /* cached hash value */
     int used; /* 0 = empty, 1 = used */
@@ -122,7 +126,6 @@ typedef struct rperf_agg_table {
 
 typedef struct rperf_thread_data {
     int64_t prev_time_ns;
-    int64_t prev_wall_ns;
     /* GVL event tracking */
    int64_t suspended_at_ns; /* wall time at SUSPENDED */
    int64_t ready_at_ns; /* wall time at READY */
@@ -145,11 +148,13 @@ typedef struct rperf_stats {
     size_t trigger_count;
     size_t sampling_count;
     int64_t sampling_total_ns;
+    size_t dropped_samples; /* samples lost due to allocation failure */
+    size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
 } rperf_stats_t;
 
 typedef struct rperf_profiler {
     int frequency;
-
+    enum rperf_mode mode;
     _Atomic int running;
     pthread_t worker_thread; /* combined timer + aggregation */
 #if RPERF_USE_TIMER_SIGNAL
```
```diff
@@ -188,6 +193,7 @@ typedef struct rperf_profiler {
      * profile_inc/dec transitions 0↔1 arm/disarm the timer.
      * Modified only under GVL, so plain int is safe. */
     int profile_refcount;
+    int worker_paused; /* 1 when nanosleep worker is in paused cond_wait */
 } rperf_profiler_t;
 
 static rperf_profiler_t g_profiler;
@@ -218,21 +224,50 @@ rperf_profiler_mark(void *ptr)
      * If we see an old count, both old and new keys arrays have valid
      * data (old keys are kept alive in old_keys[]). */
     {
-        size_t ft_count =
+        size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
         VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
         if (ft_keys && ft_count > 0) {
-            rb_gc_mark_locations(ft_keys +
-                                 ft_keys + ft_count);
+            rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
        }
    }
 }
 
+static size_t
+rperf_profiler_memsize(const void *ptr)
+{
+    const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
+    size_t size = sizeof(rperf_profiler_t);
+    int i;
+
+    /* Double-buffered sample storage */
+    for (i = 0; i < 2; i++) {
+        const rperf_sample_buffer_t *buf = &prof->buffers[i];
+        size += buf->sample_capacity * sizeof(rperf_sample_t);
+        size += buf->frame_pool_capacity * sizeof(VALUE);
+    }
+
+    /* Frame table */
+    size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
+    size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
+    for (i = 0; i < prof->frame_table.old_keys_count; i++) {
+        /* old_keys entries are previous keys arrays; exact sizes unknown,
+         * but the pointer array itself is accounted for below. */
+    }
+    size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
+
+    /* Aggregation table */
+    size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
+    size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
+
+    return size;
+}
+
 static const rb_data_type_t rperf_profiler_type = {
     .wrap_struct_name = "rperf_profiler",
     .function = {
         .dmark = rperf_profiler_mark,
         .dfree = NULL,
-        .dsize =
+        .dsize = rperf_profiler_memsize,
     },
 };
 
```
```diff
@@ -259,9 +294,9 @@ rperf_wall_time_ns(void)
 /* ---- Get current thread's time based on profiler mode ---- */
 
 static int64_t
-rperf_current_time_ns(rperf_profiler_t *prof
+rperf_current_time_ns(rperf_profiler_t *prof)
 {
-    if (prof->mode ==
+    if (prof->mode == RPERF_MODE_CPU) {
         return rperf_cpu_time_ns();
     } else {
         return rperf_wall_time_ns();
@@ -302,6 +337,7 @@ static int
 rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
 {
     if (buf->sample_count >= buf->sample_capacity) {
+        if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
         size_t new_cap = buf->sample_capacity * 2;
         rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
             buf->samples,
```
```diff
@@ -320,6 +356,7 @@ static int
 rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
 {
     while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
+        if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
         size_t new_cap = buf->frame_pool_capacity * 2;
         VALUE *new_pool = (VALUE *)realloc(
             buf->frame_pool,
```
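Both growth paths above now verify that doubling cannot overflow `size_t` before calling `realloc`. A minimal standalone sketch of that guard pattern, under illustrative names that are not part of rperf:

```c
#include <stdint.h>
#include <stdlib.h>

/* Double a buffer of `elem_size`-byte elements.
 * Returns 0 on success, -1 if the doubled byte count would overflow
 * size_t or the allocation fails, mirroring the guards in the diff. */
static int grow_by_doubling(void **ptr, size_t *capacity, size_t elem_size)
{
    /* capacity * 2 * elem_size overflows iff capacity > SIZE_MAX / (2 * elem_size) */
    if (*capacity > SIZE_MAX / (2 * elem_size)) return -1;

    size_t new_cap = *capacity * 2;
    void *grown = realloc(*ptr, new_cap * elem_size);
    if (!grown) return -1;

    *ptr = grown;
    *capacity = new_cap;
    return 0;
}
```

On failure the caller keeps the old buffer; in the profiler the sample is counted in the new `dropped_samples` statistic rather than aborting.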
```diff
@@ -340,7 +377,7 @@ rperf_frame_table_init(rperf_frame_table_t *ft)
     VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
     if (!keys) return -1;
     atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
-    ft->count =
+    ft->count = 0;
     ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
     ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
     if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
@@ -372,6 +409,7 @@ rperf_frame_table_free(rperf_frame_table_t *ft)
 static void
 rperf_frame_table_rehash(rperf_frame_table_t *ft)
 {
+    if (ft->bucket_capacity > SIZE_MAX / 2) return;
     size_t new_cap = ft->bucket_capacity * 2;
     uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
     if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -379,7 +417,7 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 
     VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     size_t i;
-    for (i =
+    for (i = 0; i < ft->count; i++) {
         uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
         size_t idx = h % new_cap;
         while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
```
```diff
@@ -400,11 +438,13 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
     uint32_t h = (uint32_t)(fval >> 3);
     size_t idx = h % ft->bucket_capacity;
 
+    size_t probes = 0;
     while (1) {
         uint32_t slot = ft->buckets[idx];
         if (slot == RPERF_FRAME_TABLE_EMPTY) break;
         if (keys[slot] == fval) return slot;
         idx = (idx + 1) % ft->bucket_capacity;
+        if (++probes >= ft->bucket_capacity) return RPERF_FRAME_TABLE_EMPTY; /* table full */
     }
 
     /* Insert new entry. Grow keys array if capacity is exhausted.
```
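The probe counter added above caps linear probing at `bucket_capacity`, so a completely full table returns the `EMPTY` sentinel instead of looping forever. A self-contained sketch of the same bounded open-addressing lookup (hypothetical names, not rperf's API):

```c
#include <stdint.h>
#include <stddef.h>

#define SLOT_EMPTY UINT32_MAX

/* Linear probing that visits each bucket at most once.
 * Returns the index stored in the matching bucket, or SLOT_EMPTY if the
 * key is absent or every bucket has been probed (table full). */
static uint32_t bounded_probe_find(const uint32_t *buckets, size_t bucket_capacity,
                                   const uint64_t *keys, uint64_t key, uint32_t hash)
{
    size_t idx = hash % bucket_capacity;
    size_t probes = 0;

    while (buckets[idx] != SLOT_EMPTY) {
        if (keys[buckets[idx]] == key) return buckets[idx];
        idx = (idx + 1) % bucket_capacity;
        if (++probes >= bucket_capacity) return SLOT_EMPTY; /* wrapped all the way around */
    }
    return SLOT_EMPTY;
}
```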
```diff
@@ -412,6 +452,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
      * the old keys pointer. Instead, allocate new, copy, swap pointer
      * atomically, and keep old array alive until stop. */
     if (ft->count >= ft->capacity) {
+        if (ft->capacity > SIZE_MAX / 2) return RPERF_FRAME_TABLE_EMPTY;
         size_t new_cap = ft->capacity * 2;
         VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
         if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
```
```diff
@@ -434,7 +475,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
     keys[frame_id] = fval;
     /* Store fence: ensure keys[frame_id] is visible before count is incremented,
      * so GC dmark never reads uninitialized keys[count-1]. */
-
+    atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
     ft->buckets[idx] = frame_id;
 
     /* Rehash if load factor > 0.7 */
```
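The `memory_order_release` store above pairs with the `memory_order_acquire` load in `rperf_profiler_mark`: the new key is written first, then the count is published, so GC can never observe a count that covers an uninitialized slot. A reduced sketch of that publish/observe pattern, with illustrative types rather than the rperf structs:

```c
#include <stdatomic.h>
#include <stddef.h>

typedef struct {
    unsigned long *slots;     /* written by one thread, scanned by another */
    _Atomic size_t count;     /* number of valid entries in slots[] */
} pub_table_t;

/* Writer: fill the slot, then release the new count. */
static void pub_table_push(pub_table_t *t, unsigned long v)
{
    size_t n = atomic_load_explicit(&t->count, memory_order_relaxed);
    t->slots[n] = v;                                               /* 1. write the data    */
    atomic_store_explicit(&t->count, n + 1, memory_order_release); /* 2. publish the count */
}

/* Reader (e.g. a mark pass): acquire the count, then slots[0..n) are valid. */
static size_t pub_table_snapshot(pub_table_t *t)
{
    return atomic_load_explicit(&t->count, memory_order_acquire);
}
```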
```diff
@@ -448,7 +489,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
 
 static uint32_t
-rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
+rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id, enum rperf_vm_state vm_state)
 {
     uint32_t h = 2166136261u;
     int i;
```
```diff
@@ -460,6 +501,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
     h *= 16777619u;
     h ^= (uint32_t)label_set_id;
     h *= 16777619u;
+    h ^= (uint32_t)vm_state;
+    h *= 16777619u;
     return h;
 }
 
```
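With `vm_state` folded into the hash, two otherwise identical stacks recorded in different VM states land in different aggregation entries. The mixing is plain FNV-1a over the frame-id words followed by the scalar discriminators; a standalone sketch of that shape (names are illustrative):

```c
#include <stdint.h>
#include <stddef.h>

/* FNV-1a over 32-bit words plus three scalar discriminators, mirroring the
 * structure of the hash in the diff (thread, label set, VM state). */
static uint32_t fnv1a_words(const uint32_t *words, size_t len,
                            uint32_t thread_seq, uint32_t label_set_id, uint32_t vm_state)
{
    uint32_t h = 2166136261u;            /* FNV-1a offset basis */
    for (size_t i = 0; i < len; i++) {
        h ^= words[i];
        h *= 16777619u;                  /* FNV-1a prime */
    }
    /* each scalar gets the same xor-then-multiply step */
    h ^= thread_seq;   h *= 16777619u;
    h ^= label_set_id; h *= 16777619u;
    h ^= vm_state;     h *= 16777619u;
    return h;
}
```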
```diff
@@ -488,6 +531,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
 static void
 rperf_agg_table_rehash(rperf_agg_table_t *at)
 {
+    if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
     size_t new_cap = at->bucket_capacity * 2;
     rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
     if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -512,6 +556,7 @@ static int
 rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 {
     while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
+        if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
         size_t new_cap = at->stack_pool_capacity * 2;
         uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
             new_cap * sizeof(uint32_t));
```
```diff
@@ -522,36 +567,40 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
     return 0;
 }
 
-/* Insert or merge a stack into the aggregation table
-
+/* Insert or merge a stack into the aggregation table.
+ * Returns 0 on success, -1 on failure (table full or allocation failure). */
+static int
 rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
                        int depth, int thread_seq, int label_set_id,
-                       int64_t weight, uint32_t hash)
+                       enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
 {
     size_t idx = hash % at->bucket_capacity;
 
+    size_t probes = 0;
     while (1) {
         rperf_agg_entry_t *e = &at->buckets[idx];
         if (!e->used) break;
         if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
-            e->label_set_id == label_set_id &&
+            e->label_set_id == label_set_id && e->vm_state == vm_state &&
             memcmp(at->stack_pool + e->frame_start, frame_ids,
                    depth * sizeof(uint32_t)) == 0) {
             /* Match — merge weight */
             e->weight += weight;
-            return;
+            return 0;
         }
         idx = (idx + 1) % at->bucket_capacity;
+        if (++probes >= at->bucket_capacity) return -1; /* table full */
     }
 
     /* New entry — append frame_ids to stack_pool */
-    if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
+    if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;
 
     rperf_agg_entry_t *e = &at->buckets[idx];
     e->frame_start = (uint32_t)at->stack_pool_count;
     e->depth = depth;
     e->thread_seq = thread_seq;
     e->label_set_id = label_set_id;
+    e->vm_state = vm_state;
     e->weight = weight;
     e->hash = hash;
     e->used = 1;
@@ -565,6 +614,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
     if (at->count * 10 > at->bucket_capacity * 7) {
         rperf_agg_table_rehash(at);
     }
+    return 0;
 }
 
 /* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
```
```diff
@@ -573,47 +623,46 @@ static void
 rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
 {
     size_t i;
-    uint32_t temp_ids[RPERF_MAX_STACK_DEPTH
+    uint32_t temp_ids[RPERF_MAX_STACK_DEPTH];
 
     for (i = 0; i < buf->sample_count; i++) {
         rperf_sample_t *s = &buf->samples[i];
-        int off = 0;
         uint32_t hash;
         int j;
 
-        /*
-        if (s->
-
-        } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
-        } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
-        } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
-        }
+        /* Clamp depth to temp_ids[] capacity */
+        if (s->depth > RPERF_MAX_STACK_DEPTH)
+            s->depth = RPERF_MAX_STACK_DEPTH;
 
         /* Convert VALUE frames to frame_ids */
         int overflow = 0;
         for (j = 0; j < s->depth; j++) {
+            if (s->frame_start + j >= buf->frame_pool_count) break;
             VALUE fval = buf->frame_pool[s->frame_start + j];
             uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
             if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
-            temp_ids[
+            temp_ids[j] = fid;
+        }
+        if (overflow) {
+            /* frame_table full — count remaining samples as dropped */
+            prof->stats.dropped_aggregation += buf->sample_count - i;
+            break;
         }
-        if (overflow) break; /* frame_table full, stop aggregating this buffer */
 
-
-        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
+        hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
 
-        rperf_agg_table_insert(&prof->agg_table, temp_ids,
-                               s->thread_seq, s->label_set_id, s->
+        if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
+                                   s->thread_seq, s->label_set_id, s->vm_state,
+                                   s->weight, hash) < 0) {
+            prof->stats.dropped_aggregation++;
+        }
     }
 
     /* Reset buffer for reuse.
      * Release fence: ensure all frame_table inserts are visible (to GC dmark)
      * before frame_pool_count is cleared, so dmark always has at least one
      * source (frame_table or frame_pool) covering each VALUE. */
-
+    atomic_thread_fence(memory_order_release);
     buf->sample_count = 0;
     buf->frame_pool_count = 0;
 }
```
```diff
@@ -656,7 +705,7 @@ rperf_try_swap(rperf_profiler_t *prof)
 /* Write a sample into a specific buffer. No swap check. */
 static int
 rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
-                   int64_t weight,
+                   int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
 {
     if (weight <= 0) return 0;
     if (rperf_ensure_sample_capacity(buf) < 0) return -1;
@@ -665,7 +714,7 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
     sample->depth = depth;
     sample->frame_start = frame_start;
     sample->weight = weight;
-    sample->
+    sample->vm_state = vm_state;
     sample->thread_seq = thread_seq;
     sample->label_set_id = label_set_id;
     buf->sample_count++;
@@ -674,10 +723,11 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
 
 static void
 rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
-                    int64_t weight,
+                    int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
 {
     rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
-    rperf_write_sample(buf, frame_start, depth, weight,
+    if (rperf_write_sample(buf, frame_start, depth, weight, vm_state, thread_seq, label_set_id) < 0)
+        prof->stats.dropped_samples++;
     rperf_try_swap(prof);
 }
 
```
```diff
@@ -689,8 +739,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 {
     rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
     if (!td) return NULL;
-
-
+    int64_t t = rperf_current_time_ns(prof);
+    if (t < 0) { free(td); return NULL; }
+    td->prev_time_ns = t;
     td->thread_seq = ++prof->next_thread_seq;
     rb_internal_thread_specific_set(thread, prof->ts_key, td);
     return td;
@@ -712,7 +763,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
         is_first = 1;
     }
 
-    int64_t time_now = rperf_current_time_ns(prof
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now < 0) return;
 
     /* Capture backtrace into active buffer's frame_pool */
@@ -727,13 +778,12 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
     /* Record normal sample (skip if first time — no prev_time, or if paused) */
     if (!is_first && !RPERF_PAUSED(prof)) {
         int64_t weight = time_now - td->prev_time_ns;
-        rperf_record_sample(prof, frame_start, depth, weight,
+        rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
     }
 
     /* Save timestamp for READY/RESUMED */
     td->suspended_at_ns = wall_now;
     td->prev_time_ns = time_now;
-    td->prev_wall_ns = wall_now;
 }
 
 static void
@@ -764,7 +814,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
      * Both samples are written directly into the same buffer before calling
      * rperf_try_swap, so that a swap triggered by the first sample cannot
      * move the second into a different buffer with a stale frame_start. */
-    if (prof->mode ==
+    if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
         rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
         if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
         size_t frame_start = buf->frame_pool_count;
@@ -776,13 +826,15 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
         /* Write both samples into the same buf, then swap-check once */
         if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
             int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
-            rperf_write_sample(buf, frame_start, depth, blocked_ns,
-
+            if (rperf_write_sample(buf, frame_start, depth, blocked_ns,
+                                   RPERF_VM_STATE_GVL_BLOCKED, td->thread_seq, td->label_set_id) < 0)
+                prof->stats.dropped_samples++;
         }
         if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
             int64_t wait_ns = wall_now - td->ready_at_ns;
-            rperf_write_sample(buf, frame_start, depth, wait_ns,
-
+            if (rperf_write_sample(buf, frame_start, depth, wait_ns,
+                                   RPERF_VM_STATE_GVL_WAIT, td->thread_seq, td->label_set_id) < 0)
+                prof->stats.dropped_samples++;
         }
 
         rperf_try_swap(prof);
@@ -790,9 +842,8 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
 skip_gvl:
 
     /* Reset prev times to current — next timer sample measures from resume */
-    int64_t time_now = rperf_current_time_ns(prof
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now >= 0) td->prev_time_ns = time_now;
-    td->prev_wall_ns = wall_now;
 
     /* Clear suspended state */
     td->suspended_at_ns = 0;
```
```diff
@@ -861,9 +912,9 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
 
         int64_t wall_now = rperf_wall_time_ns();
         int64_t weight = wall_now - prof->gc.enter_ns;
-
-            ?
-            :
+        enum rperf_vm_state vm_state = (prof->gc.phase == RPERF_GC_SWEEPING)
+            ? RPERF_VM_STATE_GC_SWEEPING
+            : RPERF_VM_STATE_GC_MARKING;
 
         /* Capture backtrace here (not at GC_ENTER) so that frame_start
          * always indexes into the current active buffer. The Ruby stack
@@ -882,24 +933,22 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
         }
         buf->frame_pool_count += depth;
 
-        rperf_record_sample(prof, frame_start, depth, weight,
+        rperf_record_sample(prof, frame_start, depth, weight, vm_state, prof->gc.thread_seq, prof->gc.label_set_id);
         prof->gc.enter_ns = 0;
     }
 }
 
 /* ---- Sampling callback (postponed job) — current thread only ---- */
 
-
-
+/* Core sampling logic, parameterized by mode constant.
+ * Called from rperf_sample_cpu/rperf_sample_wall so the compiler
+ * can inline and eliminate mode branches at compile time. */
+static inline void
+rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
 {
-
-
-    if (!prof->running) return;
-    if (RPERF_PAUSED(prof)) return;
-
-    /* Measure sampling overhead */
+    /* Measure sampling overhead (wall time — runs under GVL, no I/O) */
     struct timespec ts_start, ts_end;
-    clock_gettime(
+    clock_gettime(CLOCK_MONOTONIC, &ts_start);
 
     VALUE thread = rb_thread_current();
 
```
```diff
@@ -911,12 +960,11 @@ rperf_sample_job(void *arg)
         return; /* Skip first sample for this thread */
     }
 
-    int64_t time_now =
+    int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
     if (time_now < 0) return;
 
     int64_t weight = time_now - td->prev_time_ns;
     td->prev_time_ns = time_now;
-    td->prev_wall_ns = rperf_wall_time_ns();
 
     if (weight <= 0) return;
 
@@ -930,15 +978,35 @@ rperf_sample_job(void *arg)
     if (depth <= 0) return;
     buf->frame_pool_count += depth;
 
-    rperf_record_sample(prof, frame_start, depth, weight,
+    rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
 
-    clock_gettime(
+    clock_gettime(CLOCK_MONOTONIC, &ts_end);
     prof->stats.sampling_count++;
     prof->stats.sampling_total_ns +=
         ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
         (ts_end.tv_nsec - ts_start.tv_nsec);
 }
 
+static void
+rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
+
+static void
+rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
+
+static void
+rperf_sample_job(void *arg)
+{
+    rperf_profiler_t *prof = (rperf_profiler_t *)arg;
+
+    if (!prof->running) return;
+    if (RPERF_PAUSED(prof)) return;
+
+    if (prof->mode == RPERF_MODE_CPU)
+        rperf_sample_cpu(prof);
+    else
+        rperf_sample_wall(prof);
+}
+
 /* ---- Worker thread: timer + aggregation ---- */
 
 #if RPERF_USE_TIMER_SIGNAL
```
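Splitting the sampler into `rperf_sample_cpu`/`rperf_sample_wall` wrappers around a `static inline` core is a standard way to specialize per mode without duplicating the body: each wrapper passes a compile-time constant, so the branch on `mode` folds away after inlining. A toy version of the pattern, with hypothetical names and clock stand-ins:

```c
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

enum sample_mode { MODE_CPU, MODE_WALL };

static long long now_ns(clockid_t clk)
{
    struct timespec ts;
    clock_gettime(clk, &ts);
    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Core parameterized by mode; with a constant argument the branch disappears. */
static inline long long sample_core(enum sample_mode mode)
{
    return (mode == MODE_CPU) ? now_ns(CLOCK_PROCESS_CPUTIME_ID)
                              : now_ns(CLOCK_MONOTONIC);
}

static long long sample_cpu(void)  { return sample_core(MODE_CPU); }
static long long sample_wall(void) { return sample_core(MODE_WALL); }

int main(void)
{
    printf("cpu=%lld wall=%lld\n", sample_cpu(), sample_wall());
    return 0;
}
```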
```diff
@@ -984,7 +1052,7 @@ rperf_worker_nanosleep_func(void *arg)
     struct timespec deadline;
     long interval_ns = 1000000000L / prof->frequency;
 
-    clock_gettime(
+    clock_gettime(RPERF_COND_CLOCK, &deadline);
     deadline.tv_nsec += interval_ns;
     if (deadline.tv_nsec >= 1000000000L) {
         deadline.tv_sec++;
@@ -994,10 +1062,12 @@ rperf_worker_nanosleep_func(void *arg)
     CHECKED(pthread_mutex_lock(&prof->worker_mutex));
     while (prof->running) {
         if (RPERF_PAUSED(prof)) {
-            /* Paused:
+            /* Paused: mark as paused so disarm can confirm, then wait */
+            prof->worker_paused = 1;
             CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+            prof->worker_paused = 0;
             /* Reset deadline on wake to avoid burst of catch-up triggers */
-            clock_gettime(
+            clock_gettime(RPERF_COND_CLOCK, &deadline);
             deadline.tv_nsec += interval_ns;
             if (deadline.tv_nsec >= 1000000000L) {
                 deadline.tv_sec++;
```
```diff
@@ -1068,14 +1138,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
     result = rb_hash_new();
 
     rb_hash_aset(result, ID2SYM(rb_intern("mode")),
-                 ID2SYM(rb_intern(prof->mode ==
+                 ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
     rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
     rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
     rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
     rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+    if (prof->stats.dropped_samples > 0)
+        rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
+    if (prof->stats.dropped_aggregation > 0)
+        rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
     rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
     rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
-                 SIZET2NUM(prof->frame_table.count
+                 SIZET2NUM(prof->frame_table.count));
     rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
                  SIZET2NUM(prof->agg_table.count));
 
@@ -1094,11 +1168,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
     {
         rperf_frame_table_t *ft = &prof->frame_table;
         VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
-
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
-        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+        for (i = 0; i < ft->count; i++) {
            rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
        }
 
@@ -1110,11 +1180,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
 
            VALUE frames = rb_ary_new_capa(e->depth);
            for (j = 0; j < e->depth; j++) {
+                if (e->frame_start + j >= at->stack_pool_count) break;
                uint32_t fid = at->stack_pool[e->frame_start + j];
+                if (fid >= ft->count) break;
                rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
            }
 
-            VALUE sample =
+            VALUE sample = rb_ary_new_capa(5);
+            rb_ary_push(sample, frames);
+            rb_ary_push(sample, LONG2NUM(e->weight));
+            rb_ary_push(sample, INT2NUM(e->thread_seq));
+            rb_ary_push(sample, INT2NUM(e->label_set_id));
+            rb_ary_push(sample, INT2NUM(e->vm_state));
            rb_ary_push(samples_ary, sample);
        }
    }
```
```diff
@@ -1141,7 +1218,7 @@ static VALUE
 rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
 {
     int frequency = NUM2INT(vfreq);
-
+    enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
     int aggregate = RTEST(vagg) ? 1 : 0;
 #if RPERF_USE_TIMER_SIGNAL
     int sig = NUM2INT(vsig);
@@ -1159,13 +1236,27 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
     g_profiler.stats.trigger_count = 0;
+    g_profiler.stats.dropped_samples = 0;
+    g_profiler.stats.dropped_aggregation = 0;
     atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
     g_profiler.label_sets = Qnil;
 
     /* Initialize worker mutex/cond */
     CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
+#ifdef __linux__
+    {
+        /* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
+         * system clock adjustments (NTP etc.) don't affect timer intervals. */
+        pthread_condattr_t cond_attr;
+        CHECKED(pthread_condattr_init(&cond_attr));
+        CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
+        CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
+        CHECKED(pthread_condattr_destroy(&cond_attr));
+    }
+#else
     CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
+#endif
 
     /* Initialize sample buffer(s) */
     if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
```
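The `#ifdef __linux__` branch above binds the worker's condition variable to `CLOCK_MONOTONIC`, matching the `RPERF_COND_CLOCK` define earlier in the diff, so `pthread_cond_timedwait` deadlines are immune to wall-clock steps; macOS stays on the default `CLOCK_REALTIME` because it lacks `pthread_condattr_setclock`. A minimal sketch of that initialization on its own (hypothetical helper name):

```c
#define _POSIX_C_SOURCE 200809L
#include <pthread.h>
#include <time.h>

/* Create a condition variable whose timedwait deadlines use CLOCK_MONOTONIC.
 * Returns 0 on success, a pthreads error code otherwise. Linux-specific. */
static int cond_init_monotonic(pthread_cond_t *cond)
{
    pthread_condattr_t attr;
    int rc = pthread_condattr_init(&attr);
    if (rc != 0) return rc;

    rc = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
    if (rc == 0)
        rc = pthread_cond_init(cond, &attr);

    pthread_condattr_destroy(&attr);
    return rc;
}
```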
```diff
@@ -1244,6 +1335,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
 
     g_profiler.running = 1;
     g_profiler.profile_refcount = RTEST(vdefer) ? 0 : 1;
+    g_profiler.worker_paused = 0;
 
 #if RPERF_USE_TIMER_SIGNAL
     g_profiler.timer_signal = timer_signal;
@@ -1347,9 +1439,7 @@ timer_fail:
 static VALUE
 rb_rperf_stop(VALUE self)
 {
-    VALUE result
-    size_t i;
-    int j;
+    VALUE result;
 
     if (!g_profiler.running) {
         return Qnil;
@@ -1416,15 +1506,22 @@ rb_rperf_stop(VALUE self)
         rperf_agg_table_free(&g_profiler.agg_table);
     } else {
         /* Raw samples path (aggregate: false) */
+        VALUE samples_ary;
+        size_t i;
+        int j;
         rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
 
         result = rb_hash_new();
         rb_hash_aset(result, ID2SYM(rb_intern("mode")),
-                     ID2SYM(rb_intern(g_profiler.mode ==
+                     ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
         rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
         rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
         rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
         rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+        if (g_profiler.stats.dropped_samples > 0)
+            rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
+        if (g_profiler.stats.dropped_aggregation > 0)
+            rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
         rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
         {
             struct timespec stop_monotonic;
@@ -1441,29 +1538,20 @@ rb_rperf_stop(VALUE self)
         samples_ary = rb_ary_new_capa((long)buf->sample_count);
         for (i = 0; i < buf->sample_count; i++) {
             rperf_sample_t *s = &buf->samples[i];
-            VALUE frames = rb_ary_new_capa(s->depth
-
-            /* Prepend synthetic frame at leaf position (index 0) */
-            if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
-                rb_ary_push(frames, syn);
-            }
+            VALUE frames = rb_ary_new_capa(s->depth);
 
             for (j = 0; j < s->depth; j++) {
+                if (s->frame_start + j >= buf->frame_pool_count) break;
                 VALUE fval = buf->frame_pool[s->frame_start + j];
                 rb_ary_push(frames, rperf_resolve_frame(fval));
             }
 
-            VALUE sample =
+            VALUE sample = rb_ary_new_capa(5);
+            rb_ary_push(sample, frames);
+            rb_ary_push(sample, LONG2NUM(s->weight));
+            rb_ary_push(sample, INT2NUM(s->thread_seq));
+            rb_ary_push(sample, INT2NUM(s->label_set_id));
+            rb_ary_push(sample, INT2NUM(s->vm_state));
             rb_ary_push(samples_ary, sample);
         }
         rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
```
```diff
@@ -1498,6 +1586,8 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
     prof->stats.trigger_count = 0;
     prof->stats.sampling_count = 0;
     prof->stats.sampling_total_ns = 0;
+    prof->stats.dropped_samples = 0;
+    prof->stats.dropped_aggregation = 0;
 
     /* Reset start timestamps so next snapshot's duration_ns covers
      * only the period since this clear. */
@@ -1619,7 +1709,15 @@ rperf_disarm_timer(rperf_profiler_t *prof)
         return;
     }
 #endif
-    /* nanosleep mode: worker
+    /* nanosleep mode: wake the worker and wait until it enters paused state */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    while (!prof->worker_paused) {
+        CHECKED(pthread_cond_signal(&prof->worker_cond));
+        CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+        sched_yield();
+        CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    }
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
 }
 
 /* Helper: reset prev_time_ns for all threads (called on resume to avoid
```
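`rperf_disarm_timer` now blocks until the nanosleep worker has actually parked: it signals the condition variable, drops the mutex, yields, and re-checks `worker_paused` until the worker confirms. A self-contained sketch of that controller/worker handshake under hypothetical names; the `should_pause` flag stands in for rperf's refcount-based `RPERF_PAUSED` check:

```c
#define _POSIX_C_SOURCE 200809L
#include <pthread.h>
#include <sched.h>

typedef struct {
    pthread_mutex_t mu;
    pthread_cond_t  cv;
    int should_pause;   /* set by the controller */
    int paused;         /* set by the worker once it has parked */
    int running;
} worker_ctl_t;

/* Worker side: park in cond_wait while pausing is requested. */
static void worker_check_pause(worker_ctl_t *c)
{
    pthread_mutex_lock(&c->mu);
    while (c->running && c->should_pause) {
        c->paused = 1;                   /* visible to the controller under mu */
        pthread_cond_wait(&c->cv, &c->mu);
        c->paused = 0;
    }
    pthread_mutex_unlock(&c->mu);
}

/* Controller side: returns only after the worker has confirmed it parked. */
static void pause_worker(worker_ctl_t *c)
{
    pthread_mutex_lock(&c->mu);
    c->should_pause = 1;
    while (!c->paused) {
        pthread_cond_signal(&c->cv);     /* wake the worker out of its timed sleep */
        pthread_mutex_unlock(&c->mu);
        sched_yield();                   /* let the worker run and park */
        pthread_mutex_lock(&c->mu);
    }
    pthread_mutex_unlock(&c->mu);
}
```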
```diff
@@ -1633,8 +1731,7 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
         VALUE thread = RARRAY_AREF(threads, i);
         rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
         if (td) {
-            td->prev_time_ns = rperf_current_time_ns(prof
-            td->prev_wall_ns = rperf_wall_time_ns();
+            td->prev_time_ns = rperf_current_time_ns(prof);
         }
     }
 }
@@ -1659,6 +1756,7 @@ static VALUE
 rb_rperf_profile_dec(VALUE self)
 {
     if (!g_profiler.running) return Qfalse;
+    if (g_profiler.profile_refcount <= 0) return Qfalse;
     g_profiler.profile_refcount--;
     if (g_profiler.profile_refcount == 0) {
         rperf_disarm_timer(&g_profiler);
@@ -1673,6 +1771,12 @@ rb_rperf_running_p(VALUE self)
     return g_profiler.running ? Qtrue : Qfalse;
 }
 
+static VALUE
+rb_rperf_profiler_wrapper(VALUE self)
+{
+    return g_profiler_wrapper;
+}
+
 /* ---- Fork safety ---- */
 
 static void
@@ -1683,6 +1787,14 @@ rperf_after_fork_child(void)
     /* Mark as not running — timer doesn't exist in child */
     g_profiler.running = 0;
 
+    /* Re-initialize mutex/condvar — they may have been locked by the parent's
+     * worker thread at fork time and are in an undefined state in the child.
+     * POSIX says only async-signal-safe functions should be called in atfork
+     * child handlers, but pthread_mutex_init is safe on Linux/glibc/musl and
+     * this is the standard pattern (e.g., Python, Go do the same). */
+    pthread_mutex_init(&g_profiler.worker_mutex, NULL);
+    pthread_cond_init(&g_profiler.worker_cond, NULL);
+
 #if RPERF_USE_TIMER_SIGNAL
     /* timer_create timers are not inherited across fork, but pending signals may be.
      * Block the signal, drain any pending instances, then restore old handler. */
```
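The child-side re-initialization above is the usual fork-safety pattern for libraries that own a background thread: whatever lock the parent's worker held at `fork()` time is unusable in the child, so the child-side handler simply recreates the mutex and condvar. A minimal sketch of registering such a handler with `pthread_atfork` (hypothetical globals, not rperf's):

```c
#define _POSIX_C_SOURCE 200809L
#include <pthread.h>

static pthread_mutex_t g_worker_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  g_worker_cond  = PTHREAD_COND_INITIALIZER;

/* Runs in the child immediately after fork(): recreate sync objects that the
 * parent's worker thread may have been holding. */
static void on_fork_child(void)
{
    pthread_mutex_init(&g_worker_mutex, NULL);
    pthread_cond_init(&g_worker_cond, NULL);
}

static void install_fork_safety(void)
{
    /* no prepare/parent hooks needed for this pattern */
    pthread_atfork(NULL, NULL, on_fork_child);
}
```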
```diff
@@ -1723,6 +1835,7 @@ rperf_after_fork_child(void)
     /* Reset stats */
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
+    g_profiler.stats.dropped_samples = 0;
     g_profiler.profile_refcount = 0;
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
 }
@@ -1743,6 +1856,7 @@ Init_rperf(void)
     rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
     rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
     rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
+    rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);
 
     memset(&g_profiler, 0, sizeof(g_profiler));
     g_profiler.label_sets = Qnil;
```