rperf 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +69 -28
- data/docs/help.md +149 -7
- data/exe/rperf +33 -8
- data/ext/rperf/rperf.c +547 -264
- data/lib/rperf/active_job.rb +13 -0
- data/lib/rperf/middleware.rb +15 -0
- data/lib/rperf/sidekiq.rb +9 -0
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf.rb +145 -18
- metadata +4 -1
data/ext/rperf/rperf.c
CHANGED
@@ -7,13 +7,19 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <signal.h>
-#include <
+#include <stdatomic.h>
 #ifdef __linux__
 #include <sys/syscall.h>
 #endif

-/* Checked pthread wrappers —
-#define CHECKED(call) do {
+/* Checked pthread wrappers — always active regardless of NDEBUG */
+#define CHECKED(call) do { \
+    int _r = (call); \
+    if (_r != 0) { \
+        fprintf(stderr, "rperf: %s failed: %s\n", #call, strerror(_r)); \
+        abort(); \
+    } \
+} while (0)

 #ifdef __linux__
 #define RPERF_USE_TIMER_SIGNAL 1
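The rebuilt CHECKED macro fails loudly on any nonzero pthread return code and, unlike an assert-based wrapper, survives NDEBUG builds. A minimal stand-alone sketch of the same pattern (illustrative only, not rperf's API):

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Same shape as the macro above: evaluate once, print which call failed
 * (via #call stringification) and the errno-style reason, then abort. */
#define CHECKED(call) do { \
    int _r = (call); \
    if (_r != 0) { \
        fprintf(stderr, "%s failed: %s\n", #call, strerror(_r)); \
        abort(); \
    } \
} while (0)

int main(void)
{
    pthread_mutex_t m;
    CHECKED(pthread_mutex_init(&m, NULL));
    CHECKED(pthread_mutex_lock(&m));
    CHECKED(pthread_mutex_unlock(&m));
    CHECKED(pthread_mutex_destroy(&m));
    return 0;
}
```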
@@ -26,7 +32,8 @@
 #define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
 #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
 #define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
-#define RPERF_FRAME_TABLE_INITIAL
+#define RPERF_FRAME_TABLE_INITIAL 4096
+#define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
 #define RPERF_AGG_TABLE_INITIAL 1024
 #define RPERF_STACK_POOL_INITIAL 4096

@@ -59,6 +66,7 @@ typedef struct rperf_sample {
     int64_t weight;
     int type;         /* rperf_sample_type */
     int thread_seq;   /* thread sequence number (1-based) */
+    int label_set_id; /* label set ID (0 = no labels) */
 } rperf_sample_t;

 /* ---- Sample buffer (double-buffered) ---- */
@@ -77,11 +85,15 @@ typedef struct rperf_sample_buffer {
 #define RPERF_FRAME_TABLE_EMPTY UINT32_MAX

 typedef struct rperf_frame_table {
-    VALUE *keys;
+    _Atomic(VALUE *) keys;  /* unique VALUE array (GC mark target) */
     size_t count;           /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
     size_t capacity;
     uint32_t *buckets;      /* open addressing: stores index into keys[] */
     size_t bucket_capacity;
+    /* Old keys arrays kept alive for GC dmark safety until stop */
+    VALUE **old_keys;
+    int old_keys_count;
+    int old_keys_capacity;
 } rperf_frame_table_t;

 /* ---- Aggregation table: stack → weight ---- */
@@ -92,6 +104,7 @@ typedef struct rperf_agg_entry {
     uint32_t frame_start; /* offset into stack_pool */
     int depth;            /* includes synthetic frame */
     int thread_seq;
+    int label_set_id;     /* label set ID (0 = no labels) */
     int64_t weight;       /* accumulated */
     uint32_t hash;        /* cached hash value */
     int used;             /* 0 = empty, 1 = used */
@@ -107,54 +120,68 @@ typedef struct rperf_agg_table {
 } rperf_agg_table_t;

 typedef struct rperf_thread_data {
-    int64_t
+    int64_t prev_time_ns;
     int64_t prev_wall_ns;
     /* GVL event tracking */
     int64_t suspended_at_ns;      /* wall time at SUSPENDED */
     int64_t ready_at_ns;          /* wall time at READY */
-    size_t suspended_frame_start; /* saved stack in frame_pool */
-    int suspended_frame_depth;    /* saved stack depth */
     int thread_seq;               /* thread sequence number (1-based) */
+    int label_set_id;             /* current label set ID (0 = no labels) */
 } rperf_thread_data_t;

+/* ---- GC tracking state ---- */
+
+typedef struct rperf_gc_state {
+    int phase;         /* rperf_gc_phase */
+    int64_t enter_ns;  /* wall time at GC_ENTER */
+    int thread_seq;    /* thread_seq at GC_ENTER */
+    int label_set_id;  /* label_set_id at GC_ENTER */
+} rperf_gc_state_t;
+
+/* ---- Sampling overhead stats ---- */
+
+typedef struct rperf_stats {
+    size_t trigger_count;
+    size_t sampling_count;
+    int64_t sampling_total_ns;
+} rperf_stats_t;
+
 typedef struct rperf_profiler {
     int frequency;
     int mode; /* 0 = cpu, 1 = wall */
-
+    _Atomic int running;
     pthread_t worker_thread; /* combined timer + aggregation */
 #if RPERF_USE_TIMER_SIGNAL
     timer_t timer_id;
     int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
-
+    _Atomic pid_t worker_tid;       /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
+    struct sigaction old_sigaction; /* saved handler to restore on stop */
 #endif
     rb_postponed_job_handle_t pj_handle;
     int aggregate; /* 1 = aggregate samples, 0 = raw */
     /* Double-buffered sample storage (only buffers[0] used when !aggregate) */
     rperf_sample_buffer_t buffers[2];
-    int active_idx;
+    _Atomic int active_idx; /* 0 or 1 */
     /* Aggregation (only used when aggregate=1) */
     rperf_frame_table_t frame_table;
     rperf_agg_table_t agg_table;
-
+    _Atomic int swap_ready; /* 1 = standby buffer ready for aggregation */
     pthread_mutex_t worker_mutex;
     pthread_cond_t worker_cond;
    rb_internal_thread_specific_key_t ts_key;
     rb_internal_thread_event_hook_t *thread_hook;
     /* GC tracking */
-
-    int64_t gc_enter_ns;   /* wall time at GC_ENTER */
-    size_t gc_frame_start; /* saved stack at GC_ENTER */
-    int gc_frame_depth;    /* saved stack depth */
-    int gc_thread_seq;     /* thread_seq at GC_ENTER */
+    rperf_gc_state_t gc;
     /* Timing metadata for pprof */
     struct timespec start_realtime;  /* CLOCK_REALTIME at start */
     struct timespec start_monotonic; /* CLOCK_MONOTONIC at start */
     /* Thread sequence counter */
     int next_thread_seq;
     /* Sampling overhead stats */
-
-
-
+    rperf_stats_t stats;
+    /* Label sets: Ruby Array of Hash objects, managed from Ruby side.
+     * Index 0 is reserved (no labels). GC-marked via profiler_mark. */
+    VALUE label_sets; /* Ruby Array or Qnil */
 } rperf_profiler_t;

 static rperf_profiler_t g_profiler;
@@ -175,10 +202,22 @@ rperf_profiler_mark(void *ptr)
                               buf->frame_pool + buf->frame_pool_count);
         }
     }
-    /* Mark
-    if (prof->
-
-
+    /* Mark label_sets array */
+    if (prof->label_sets != Qnil) {
+        rb_gc_mark(prof->label_sets);
+    }
+    /* Mark frame_table keys (unique frame VALUEs).
+     * Acquire count to synchronize with the release-store in insert,
+     * ensuring we see the keys pointer that is valid for [0, count).
+     * If we see an old count, both old and new keys arrays have valid
+     * data (old keys are kept alive in old_keys[]). */
+    {
+        size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
+        VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
+        if (ft_keys && ft_count > 0) {
+            rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
+                                 ft_keys + ft_count);
+        }
     }
 }

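The dmark comment above describes a release/acquire handshake: the sampler publishes a slot before bumping count with a release store, and the GC reader acquires count before dereferencing keys, so it never scans an unpublished slot. A hypothetical stand-alone reduction of that pattern (all names invented; the old array is deliberately leaked where rperf instead defers the free until stop):

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    _Atomic(long *) items;  /* swapped wholesale on growth */
    _Atomic size_t count;   /* published with release */
    size_t capacity;
} grow_array_t;

static void ga_append(grow_array_t *ga, long v)
{
    long *items = atomic_load_explicit(&ga->items, memory_order_relaxed);
    size_t n = atomic_load_explicit(&ga->count, memory_order_relaxed);
    if (n == ga->capacity) {              /* grow: copy, then swap pointer */
        size_t cap = ga->capacity * 2;
        long *fresh = malloc(cap * sizeof(long));
        memcpy(fresh, items, n * sizeof(long));
        atomic_store_explicit(&ga->items, fresh, memory_order_release);
        ga->capacity = cap;               /* old array leaked here for the sketch */
        items = fresh;
    }
    items[n] = v;
    /* Release: the slot write above is visible before the new count is. */
    atomic_store_explicit(&ga->count, n + 1, memory_order_release);
}

static long ga_sum_snapshot(grow_array_t *ga)   /* stands in for GC dmark */
{
    size_t n = atomic_load_explicit(&ga->count, memory_order_acquire);
    long *items = atomic_load_explicit(&ga->items, memory_order_acquire);
    long sum = 0;
    for (size_t i = 0; i < n; i++) sum += items[i];  /* never reads past n */
    return sum;
}

int main(void)
{
    grow_array_t ga = { .capacity = 4 };
    atomic_store(&ga.items, (long *)malloc(4 * sizeof(long)));
    for (long i = 1; i <= 10; i++) ga_append(&ga, i);
    printf("%ld\n", ga_sum_snapshot(&ga));  /* 55 */
    return 0;
}
```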
@@ -288,21 +327,38 @@ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)

 /* ---- Frame table operations (all malloc-based, no GVL needed) ---- */

-static
+static int
 rperf_frame_table_init(rperf_frame_table_t *ft)
 {
     ft->capacity = RPERF_FRAME_TABLE_INITIAL;
-
+    VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
+    if (!keys) return -1;
+    atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
     ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
     ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
     ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
+    if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
     memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
+    ft->old_keys_count = 0;
+    ft->old_keys_capacity = RPERF_FRAME_TABLE_OLD_KEYS_INITIAL;
+    ft->old_keys = (VALUE **)malloc(ft->old_keys_capacity * sizeof(VALUE *));
+    if (!ft->old_keys) {
+        free(ft->buckets);
+        free(keys);
+        atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed);
+        return -1;
+    }
+    return 0;
 }

 static void
 rperf_frame_table_free(rperf_frame_table_t *ft)
 {
-
+    int i;
+    for (i = 0; i < ft->old_keys_count; i++)
+        free(ft->old_keys[i]);
+    free(ft->old_keys);
+    free(atomic_load_explicit(&ft->keys, memory_order_relaxed));
     free(ft->buckets);
     memset(ft, 0, sizeof(*ft));
 }
@@ -312,11 +368,13 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 {
     size_t new_cap = ft->bucket_capacity * 2;
     uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
+    if (!new_buckets) return; /* keep using current buckets at higher load factor */
     memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));

+    VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     size_t i;
     for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
-        uint32_t h = (uint32_t)(
+        uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
         size_t idx = h % new_cap;
         while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
             idx = (idx + 1) % new_cap;
@@ -332,25 +390,42 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 static uint32_t
 rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 {
+    VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     uint32_t h = (uint32_t)(fval >> 3);
     size_t idx = h % ft->bucket_capacity;

     while (1) {
         uint32_t slot = ft->buckets[idx];
         if (slot == RPERF_FRAME_TABLE_EMPTY) break;
-        if (
+        if (keys[slot] == fval) return slot;
         idx = (idx + 1) % ft->bucket_capacity;
     }

-    /* Insert new entry.
-     *
-     *
+    /* Insert new entry. Grow keys array if capacity is exhausted.
+     * Cannot realloc in-place because GC dmark may concurrently read
+     * the old keys pointer. Instead, allocate new, copy, swap pointer
+     * atomically, and keep old array alive until stop. */
     if (ft->count >= ft->capacity) {
-
+        size_t new_cap = ft->capacity * 2;
+        VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
+        if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
+        memcpy(new_keys, keys, ft->capacity * sizeof(VALUE));
+        /* Save old keys for deferred free (GC dmark safety) */
+        if (ft->old_keys_count >= ft->old_keys_capacity) {
+            int new_old_cap = ft->old_keys_capacity * 2;
+            VALUE **new_old = (VALUE **)realloc(ft->old_keys, new_old_cap * sizeof(VALUE *));
+            if (!new_old) { free(new_keys); return RPERF_FRAME_TABLE_EMPTY; }
+            ft->old_keys = new_old;
+            ft->old_keys_capacity = new_old_cap;
+        }
+        ft->old_keys[ft->old_keys_count++] = keys;
+        keys = new_keys;
+        atomic_store_explicit(&ft->keys, new_keys, memory_order_release);
+        ft->capacity = new_cap;
     }

     uint32_t frame_id = (uint32_t)ft->count;
-
+    keys[frame_id] = fval;
     /* Store fence: ensure keys[frame_id] is visible before count is incremented,
      * so GC dmark never reads uninitialized keys[count-1]. */
     __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
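Frame interning above is open addressing with linear probing over a buckets array of ids, with the VALUE's low tag/alignment bits shifted out before hashing. A toy interner in the same shape (hypothetical names; growth, rehash, and error handling trimmed, capacity assumed fixed):

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EMPTY UINT32_MAX

typedef struct {
    uintptr_t *keys;    /* id -> key */
    uint32_t *buckets;  /* hash slot -> id */
    size_t count, bucket_cap;
} intern_t;

static uint32_t intern(intern_t *t, uintptr_t key)
{
    size_t idx = (size_t)(key >> 3) % t->bucket_cap;  /* drop alignment bits */
    while (t->buckets[idx] != EMPTY) {
        if (t->keys[t->buckets[idx]] == key) return t->buckets[idx];
        idx = (idx + 1) % t->bucket_cap;              /* linear probe */
    }
    uint32_t id = (uint32_t)t->count++;
    t->keys[id] = key;
    t->buckets[idx] = id;
    return id;
}

int main(void)
{
    intern_t t = { .count = 0, .bucket_cap = 64 };
    t.keys = calloc(64, sizeof(uintptr_t));
    t.buckets = malloc(64 * sizeof(uint32_t));
    memset(t.buckets, 0xFF, 64 * sizeof(uint32_t));   /* all slots EMPTY */

    int x, y;
    printf("%u\n", intern(&t, (uintptr_t)&x)); /* 0 */
    printf("%u\n", intern(&t, (uintptr_t)&y)); /* 1 */
    printf("%u\n", intern(&t, (uintptr_t)&x)); /* 0 again: ids are stable */
    return 0;
}
```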
@@ -367,7 +442,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */

 static uint32_t
-rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
+rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
 {
     uint32_t h = 2166136261u;
     int i;
@@ -377,18 +452,23 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
     }
     h ^= (uint32_t)thread_seq;
     h *= 16777619u;
+    h ^= (uint32_t)label_set_id;
+    h *= 16777619u;
     return h;
 }

-static
+static int
 rperf_agg_table_init(rperf_agg_table_t *at)
 {
     at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
     at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
+    if (!at->buckets) return -1;
     at->count = 0;
     at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
     at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
+    if (!at->stack_pool) { free(at->buckets); at->buckets = NULL; return -1; }
     at->stack_pool_count = 0;
+    return 0;
 }

 static void
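The hash is FNV-1a run over 32-bit frame ids, now folding in both thread_seq and label_set_id so that identical stacks with different labels land in different aggregation buckets. A stand-alone sketch of the word-wise variant:

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
{
    uint32_t h = 2166136261u;           /* FNV offset basis */
    for (int i = 0; i < len; i++) {
        h ^= data[i];
        h *= 16777619u;                 /* FNV prime */
    }
    h ^= (uint32_t)thread_seq;   h *= 16777619u;
    h ^= (uint32_t)label_set_id; h *= 16777619u;
    return h;
}

int main(void)
{
    uint32_t stack[] = { 4, 17, 23 };
    /* Same stack, same thread, different label set -> different hash,
     * so labeled samples aggregate separately. */
    printf("%u\n", fnv1a_u32(stack, 3, 1, 0));
    printf("%u\n", fnv1a_u32(stack, 3, 1, 5));
    return 0;
}
```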
@@ -404,6 +484,7 @@ rperf_agg_table_rehash(rperf_agg_table_t *at)
 {
     size_t new_cap = at->bucket_capacity * 2;
     rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
+    if (!new_buckets) return; /* keep using current buckets at higher load factor */

     size_t i;
     for (i = 0; i < at->bucket_capacity; i++) {
@@ -438,7 +519,8 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 /* Insert or merge a stack into the aggregation table */
 static void
 rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
-                       int depth, int thread_seq,
+                       int depth, int thread_seq, int label_set_id,
+                       int64_t weight, uint32_t hash)
 {
     size_t idx = hash % at->bucket_capacity;

@@ -446,6 +528,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
         rperf_agg_entry_t *e = &at->buckets[idx];
         if (!e->used) break;
         if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
+            e->label_set_id == label_set_id &&
             memcmp(at->stack_pool + e->frame_start, frame_ids,
                    depth * sizeof(uint32_t)) == 0) {
             /* Match — merge weight */
@@ -462,6 +545,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
     e->frame_start = (uint32_t)at->stack_pool_count;
     e->depth = depth;
     e->thread_seq = thread_seq;
+    e->label_set_id = label_set_id;
     e->weight = weight;
     e->hash = hash;
     e->used = 1;
@@ -513,10 +597,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
         if (overflow) break; /* frame_table full, stop aggregating this buffer */

         int total_depth = off + s->depth;
-        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
+        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);

         rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
-                               s->thread_seq, s->weight, hash);
+                               s->thread_seq, s->label_set_id, s->weight, hash);
     }

     /* Reset buffer for reuse.
@@ -535,10 +619,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
 static void
 rperf_try_aggregate(rperf_profiler_t *prof)
 {
-    if (!prof->aggregate || !prof->swap_ready) return;
-    int standby_idx = prof->active_idx ^ 1;
+    if (!prof->aggregate || !atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return;
+    int standby_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire) ^ 1;
     rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
-    prof->swap_ready
+    atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
 }

 /* ---- Record a sample ---- */
@@ -547,25 +631,29 @@ static void
 rperf_try_swap(rperf_profiler_t *prof)
 {
     if (!prof->aggregate) return;
-
+    int idx = atomic_load_explicit(&prof->active_idx, memory_order_relaxed);
+    rperf_sample_buffer_t *buf = &prof->buffers[idx];
     if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
-    if (prof->swap_ready) return; /* standby still being aggregated */
+    if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return; /* standby still being aggregated */

-    /* Swap active buffer */
-    prof->active_idx
-    prof->swap_ready = 1;
+    /* Swap active buffer: release ensures buffer writes are visible to worker */
+    atomic_store_explicit(&prof->active_idx, idx ^ 1, memory_order_release);

-    /*
+    /* Set swap_ready under mutex and signal, preventing lost wakeup:
+     * the worker checks swap_ready while holding the same mutex. */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    atomic_store_explicit(&prof->swap_ready, 1, memory_order_release);
     CHECKED(pthread_cond_signal(&prof->worker_cond));
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
 }

-
-
-
+/* Write a sample into a specific buffer. No swap check. */
+static int
+rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
+                   int64_t weight, int type, int thread_seq, int label_set_id)
 {
-    if (weight <= 0) return;
-
-    if (rperf_ensure_sample_capacity(buf) < 0) return;
+    if (weight <= 0) return 0;
+    if (rperf_ensure_sample_capacity(buf) < 0) return -1;

     rperf_sample_t *sample = &buf->samples[buf->sample_count];
     sample->depth = depth;
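Setting swap_ready and signaling under worker_mutex closes the classic lost-wakeup window: the worker re-checks the flag while holding the same mutex, so the signal cannot fire between its predicate check and pthread_cond_wait. The handshake in isolation, as a hypothetical stand-alone producer/consumer (not rperf code):

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static _Atomic int ready = 0;

static void *consumer(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&mu);
    /* Re-check the predicate on every wakeup (spurious wakeups happen). */
    while (!atomic_load_explicit(&ready, memory_order_acquire))
        pthread_cond_wait(&cv, &mu);
    pthread_mutex_unlock(&mu);
    puts("consumer: woke with work ready");
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, consumer, NULL);

    /* Store + signal under the mutex: the consumer cannot be between its
     * predicate check and pthread_cond_wait when the signal fires. */
    pthread_mutex_lock(&mu);
    atomic_store_explicit(&ready, 1, memory_order_release);
    pthread_cond_signal(&cv);
    pthread_mutex_unlock(&mu);

    pthread_join(t, NULL);
    return 0;
}
```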
@@ -573,8 +661,17 @@ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
     sample->weight = weight;
     sample->type = type;
     sample->thread_seq = thread_seq;
+    sample->label_set_id = label_set_id;
     buf->sample_count++;
+    return 0;
+}

+static void
+rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
+                    int64_t weight, int type, int thread_seq, int label_set_id)
+{
+    rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
+    rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
     rperf_try_swap(prof);
 }

@@ -586,7 +683,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 {
     rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
     if (!td) return NULL;
-    td->
+    td->prev_time_ns = rperf_current_time_ns(prof, td);
     td->prev_wall_ns = rperf_wall_time_ns();
     td->thread_seq = ++prof->next_thread_seq;
     rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -596,12 +693,11 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 /* ---- Thread event hooks ---- */

 static void
-rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
 {
     /* Has GVL — safe to call Ruby APIs */
     int64_t wall_now = rperf_wall_time_ns();

-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
     int is_first = 0;

     if (td == NULL) {
@@ -614,7 +710,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
     if (time_now < 0) return;

     /* Capture backtrace into active buffer's frame_pool */
-    rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
+    rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
     if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
     size_t frame_start = buf->frame_pool_count;
     int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
@@ -624,34 +720,29 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)

     /* Record normal sample (skip if first time — no prev_time) */
     if (!is_first) {
-        int64_t weight = time_now - td->
-        rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
+        int64_t weight = time_now - td->prev_time_ns;
+        rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
     }

-    /* Save
+    /* Save timestamp for READY/RESUMED */
     td->suspended_at_ns = wall_now;
-    td->
-    td->suspended_frame_depth = depth;
-    td->prev_cpu_ns = time_now;
+    td->prev_time_ns = time_now;
     td->prev_wall_ns = wall_now;
 }

 static void
-rperf_handle_ready(
+rperf_handle_ready(rperf_thread_data_t *td)
 {
     /* May NOT have GVL — only simple C operations allowed */
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
     if (!td) return;

     td->ready_at_ns = rperf_wall_time_ns();
 }

 static void
-rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
 {
     /* Has GVL */
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
-
     if (td == NULL) {
         td = rperf_thread_data_create(prof, thread);
         if (!td) return;
@@ -659,36 +750,52 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)

     int64_t wall_now = rperf_wall_time_ns();

-    /* Record GVL blocked/wait samples (wall mode only)
-
+    /* Record GVL blocked/wait samples (wall mode only).
+     * Capture backtrace here (not at SUSPENDED) so that frame_start always
+     * indexes into the current active buffer, avoiding mismatch after a
+     * double-buffer swap. The Ruby stack is unchanged while off-GVL.
+     *
+     * Both samples are written directly into the same buffer before calling
+     * rperf_try_swap, so that a swap triggered by the first sample cannot
+     * move the second into a different buffer with a stale frame_start. */
+    if (prof->mode == 1 && td->suspended_at_ns > 0) {
+        rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
+        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
+        size_t frame_start = buf->frame_pool_count;
+        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
+                                      &buf->frame_pool[frame_start], NULL);
+        if (depth <= 0) goto skip_gvl;
+        buf->frame_pool_count += depth;
+
+        /* Write both samples into the same buf, then swap-check once */
         if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
             int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
-
-
-            RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
+            rperf_write_sample(buf, frame_start, depth, blocked_ns,
+                               RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
         }
         if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
             int64_t wait_ns = wall_now - td->ready_at_ns;
-
-
-            RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
+            rperf_write_sample(buf, frame_start, depth, wait_ns,
+                               RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
         }
+
+        rperf_try_swap(prof);
     }
+skip_gvl:

     /* Reset prev times to current — next timer sample measures from resume */
     int64_t time_now = rperf_current_time_ns(prof, td);
-    if (time_now >= 0) td->
+    if (time_now >= 0) td->prev_time_ns = time_now;
     td->prev_wall_ns = wall_now;

     /* Clear suspended state */
-    td->
+    td->suspended_at_ns = 0;
     td->ready_at_ns = 0;
 }

 static void
-rperf_handle_exited(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_exited(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
 {
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
     if (td) {
         free(td);
         rb_internal_thread_specific_set(thread, prof->ts_key, NULL);
@@ -702,15 +809,16 @@ rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_da
     if (!prof->running) return;

     VALUE thread = data->thread;
+    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);

     if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
-        rperf_handle_suspended(prof, thread);
+        rperf_handle_suspended(prof, thread, td);
     else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
-        rperf_handle_ready(
+        rperf_handle_ready(td);
     else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
-        rperf_handle_resumed(prof, thread);
+        rperf_handle_resumed(prof, thread, td);
     else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
-        rperf_handle_exited(prof, thread);
+        rperf_handle_exited(prof, thread, td);
 }

 /* ---- GC event hook ---- */
@@ -722,50 +830,53 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
     if (!prof->running) return;

     if (event & RUBY_INTERNAL_EVENT_GC_START) {
-        prof->
+        prof->gc.phase = RPERF_GC_MARKING;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
-        prof->
+        prof->gc.phase = RPERF_GC_SWEEPING;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
-        prof->
+        prof->gc.phase = RPERF_GC_NONE;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
-        /*
-
-
-        rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
-        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
-        size_t frame_start = buf->frame_pool_count;
-        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
-                                      &buf->frame_pool[frame_start], NULL);
-        if (depth <= 0) {
-            prof->gc_frame_depth = 0;
-            return;
-        }
-        buf->frame_pool_count += depth;
-        prof->gc_frame_start = frame_start;
-        prof->gc_frame_depth = depth;
-
-        /* Save thread_seq for the GC_EXIT sample */
+        /* Save timestamp, thread_seq, and label_set_id; backtrace is captured at GC_EXIT
+         * to avoid buffer mismatch after a double-buffer swap. */
+        prof->gc.enter_ns = rperf_wall_time_ns();
         {
             VALUE thread = rb_thread_current();
             rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
-            prof->
+            prof->gc.thread_seq = td ? td->thread_seq : 0;
+            prof->gc.label_set_id = td ? td->label_set_id : 0;
         }
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
-        if (prof->
+        if (prof->gc.enter_ns <= 0) return;

         int64_t wall_now = rperf_wall_time_ns();
-        int64_t weight = wall_now - prof->
-        int type = (prof->
+        int64_t weight = wall_now - prof->gc.enter_ns;
+        int type = (prof->gc.phase == RPERF_GC_SWEEPING)
                        ? RPERF_SAMPLE_GC_SWEEPING
                        : RPERF_SAMPLE_GC_MARKING;

-
-
-
+        /* Capture backtrace here (not at GC_ENTER) so that frame_start
+         * always indexes into the current active buffer. The Ruby stack
+         * is unchanged during GC. */
+        rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
+        if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) {
+            prof->gc.enter_ns = 0;
+            return;
+        }
+        size_t frame_start = buf->frame_pool_count;
+        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
+                                      &buf->frame_pool[frame_start], NULL);
+        if (depth <= 0) {
+            prof->gc.enter_ns = 0;
+            return;
+        }
+        buf->frame_pool_count += depth;
+
+        rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
+        prof->gc.enter_ns = 0;
     }
 }

@@ -795,14 +906,14 @@ rperf_sample_job(void *arg)
     int64_t time_now = rperf_current_time_ns(prof, td);
     if (time_now < 0) return;

-    int64_t weight = time_now - td->
-    td->
+    int64_t weight = time_now - td->prev_time_ns;
+    td->prev_time_ns = time_now;
     td->prev_wall_ns = rperf_wall_time_ns();

     if (weight <= 0) return;

     /* Capture backtrace and record sample */
-    rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
+    rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
     if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;

     size_t frame_start = buf->frame_pool_count;
@@ -811,11 +922,11 @@ rperf_sample_job(void *arg)
     if (depth <= 0) return;
     buf->frame_pool_count += depth;

-    rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
+    rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);

     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
-    prof->sampling_count++;
-    prof->sampling_total_ns +=
+    prof->stats.sampling_count++;
+    prof->stats.sampling_total_ns +=
         ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
         (ts_end.tv_nsec - ts_start.tv_nsec);
 }
@@ -826,7 +937,7 @@ rperf_sample_job(void *arg)
 static void
 rperf_signal_handler(int sig)
 {
-    g_profiler.trigger_count++;
+    g_profiler.stats.trigger_count++;
     rb_postponed_job_trigger(g_profiler.pj_handle);
 }

@@ -845,7 +956,8 @@ rperf_worker_signal_func(void *arg)
     CHECKED(pthread_cond_signal(&prof->worker_cond));

     while (prof->running) {
-
+        while (prof->running && !atomic_load_explicit(&prof->swap_ready, memory_order_acquire))
+            CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
         rperf_try_aggregate(prof);
     }
     CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
@@ -874,9 +986,12 @@ rperf_worker_nanosleep_func(void *arg)
     CHECKED(pthread_mutex_lock(&prof->worker_mutex));
     while (prof->running) {
         int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
-
+        if (ret != 0 && ret != ETIMEDOUT) {
+            fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
+            abort();
+        }
         if (ret == ETIMEDOUT) {
-            prof->trigger_count++;
+            prof->stats.trigger_count++;
            rb_postponed_job_trigger(prof->pj_handle);
             /* Advance deadline by interval */
             deadline.tv_nsec += interval_ns;
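The nanosleep-mode worker ticks by waiting on an absolute deadline and advancing it one interval per ETIMEDOUT, so time spent in the handler does not accumulate as drift. A minimal stand-alone loop of the same shape (10Hz assumed; no profiler wiring):

```c
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
    pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    const long interval_ns = 100 * 1000 * 1000; /* 100ms -> 10Hz */

    struct timespec deadline;
    clock_gettime(CLOCK_REALTIME, &deadline);  /* default condvar clock */

    pthread_mutex_lock(&mu);
    for (int ticks = 0; ticks < 5; ) {
        int ret = pthread_cond_timedwait(&cv, &mu, &deadline);
        if (ret == ETIMEDOUT) {
            printf("tick %d\n", ++ticks);
            /* Advance the absolute deadline; handler time adds no drift. */
            deadline.tv_nsec += interval_ns;
            while (deadline.tv_nsec >= 1000000000L) {
                deadline.tv_nsec -= 1000000000L;
                deadline.tv_sec++;
            }
        }
        /* ret == 0 would be a condvar wakeup (e.g. a shutdown request). */
    }
    pthread_mutex_unlock(&mu);
    return 0;
}
```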
@@ -900,66 +1015,117 @@ rperf_resolve_frame(VALUE fval)
     VALUE label = rb_profile_frame_full_label(fval);

     if (NIL_P(path)) path = rb_str_new_lit("<C method>");
-
-    if (NIL_P(path)) path = rb_str_new_cstr("");
     if (NIL_P(label)) label = rb_str_new_cstr("");

     return rb_ary_new3(2, path, label);
 }

-/* ----
+/* ---- Shared helpers for stop/snapshot ---- */

+/* Flush pending sample buffers into agg_table.
+ * Caller must ensure no concurrent access (worker joined or mutex held). */
+static void
+rperf_flush_buffers(rperf_profiler_t *prof)
+{
+    int cur_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire);
+    if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) {
+        int standby_idx = cur_idx ^ 1;
+        rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
+        atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
+    }
+    rperf_aggregate_buffer(prof, &prof->buffers[cur_idx]);
+}
+
+/* Build result hash from aggregated data (agg_table + frame_table).
+ * Does NOT free any resources. Caller must hold GVL. */
 static VALUE
-
+rperf_build_aggregated_result(rperf_profiler_t *prof)
 {
-    VALUE
-
-    int
-    int aggregate = 1; /* default: aggregate */
-#if RPERF_USE_TIMER_SIGNAL
-    int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
-#endif
+    VALUE result, samples_ary;
+    size_t i;
+    int j;

-
-
-
-
-
-
-
-
-
-
-
-
+    result = rb_hash_new();
+
+    rb_hash_aset(result, ID2SYM(rb_intern("mode")),
+                 ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
+    rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
+    rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
+    rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
+    rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+    rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
+    rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
+                 SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
+    rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
+                 SIZET2NUM(prof->agg_table.count));
+
+    {
+        struct timespec now_monotonic;
+        int64_t start_ns, duration_ns;
+        clock_gettime(CLOCK_MONOTONIC, &now_monotonic);
+        start_ns = (int64_t)prof->start_realtime.tv_sec * 1000000000LL
+                 + (int64_t)prof->start_realtime.tv_nsec;
+        duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
+                    + ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
+        rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
+        rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
+    }
+
+    {
+        rperf_frame_table_t *ft = &prof->frame_table;
+        VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
+        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+            rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
         }
-
-
-
-
-
-
-
-
-
+
+        rperf_agg_table_t *at = &prof->agg_table;
+        samples_ary = rb_ary_new();
+        for (i = 0; i < at->bucket_capacity; i++) {
+            rperf_agg_entry_t *e = &at->buckets[i];
+            if (!e->used) continue;
+
+            VALUE frames = rb_ary_new_capa(e->depth);
+            for (j = 0; j < e->depth; j++) {
+                uint32_t fid = at->stack_pool[e->frame_start + j];
+                rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
             }
-
-
-
-
-
-
-
-
+
+            VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
+            rb_ary_push(samples_ary, sample);
         }
+    }
+
+    rb_hash_aset(result, ID2SYM(rb_intern("aggregated_samples")), samples_ary);
+
+    if (prof->label_sets != Qnil) {
+        rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), prof->label_sets);
+    }
+
+    return result;
+}
+
+/* ---- Ruby API ---- */
+
+/* _c_start(frequency, mode, aggregate, signal)
+ *   frequency: Integer (Hz)
+ *   mode: 0 = cpu, 1 = wall
+ *   aggregate: 0 or 1
+ *   signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
+ */
+static VALUE
+rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
+{
+    int frequency = NUM2INT(vfreq);
+    int mode = NUM2INT(vmode);
+    int aggregate = RTEST(vagg) ? 1 : 0;
 #if RPERF_USE_TIMER_SIGNAL
-
-
-    if (RTEST(vsig)) {
-        timer_signal = NUM2INT(vsig);
-        if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
-            rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
-                     SIGRTMIN, SIGRTMAX);
-        }
-    } else {
-        /* signal: false or signal: 0 → use nanosleep thread */
-        timer_signal = 0;
-    }
-    }
+    int sig = NUM2INT(vsig);
+    int timer_signal = (sig < 0) ? RPERF_TIMER_SIGNAL_DEFAULT : sig;
 #endif
-}

     if (g_profiler.running) {
         rb_raise(rb_eRuntimeError, "Rperf is already running");
@@ -969,11 +1135,12 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     g_profiler.mode = mode;
     g_profiler.aggregate = aggregate;
     g_profiler.next_thread_seq = 0;
-    g_profiler.sampling_count = 0;
-    g_profiler.sampling_total_ns = 0;
-    g_profiler.trigger_count = 0;
-    g_profiler.active_idx
-    g_profiler.swap_ready
+    g_profiler.stats.sampling_count = 0;
+    g_profiler.stats.sampling_total_ns = 0;
+    g_profiler.stats.trigger_count = 0;
+    atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
+    atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
+    g_profiler.label_sets = Qnil;

     /* Initialize worker mutex/cond */
     CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
@@ -994,13 +1161,26 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     }

     /* Initialize aggregation structures */
-    rperf_frame_table_init(&g_profiler.frame_table)
-
+    if (rperf_frame_table_init(&g_profiler.frame_table) < 0) {
+        rperf_sample_buffer_free(&g_profiler.buffers[0]);
+        rperf_sample_buffer_free(&g_profiler.buffers[1]);
+        CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
+        CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
+        rb_raise(rb_eNoMemError, "rperf: failed to allocate frame table");
+    }
+    if (rperf_agg_table_init(&g_profiler.agg_table) < 0) {
+        rperf_frame_table_free(&g_profiler.frame_table);
+        rperf_sample_buffer_free(&g_profiler.buffers[0]);
+        rperf_sample_buffer_free(&g_profiler.buffers[1]);
+        CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
+        CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
+        rb_raise(rb_eNoMemError, "rperf: failed to allocate aggregation table");
+    }
     }

     /* Register GC event hook */
-    g_profiler.
-    g_profiler.
+    g_profiler.gc.phase = RPERF_GC_NONE;
+    g_profiler.gc.enter_ns = 0;
     rb_add_event_hook(rperf_gc_event_hook,
                       RUBY_INTERNAL_EVENT_GC_START |
                       RUBY_INTERNAL_EVENT_GC_END_MARK |
@@ -1023,6 +1203,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     VALUE cur_thread = rb_thread_current();
     rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
     if (!td) {
+        rb_remove_event_hook(rperf_gc_event_hook);
         rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
         g_profiler.thread_hook = NULL;
         if (g_profiler.aggregate) {
@@ -1053,14 +1234,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     memset(&sa, 0, sizeof(sa));
     sa.sa_handler = rperf_signal_handler;
     sa.sa_flags = SA_RESTART;
-    sigaction(g_profiler.timer_signal, &sa,
+    if (sigaction(g_profiler.timer_signal, &sa, &g_profiler.old_sigaction) != 0) {
+        g_profiler.running = 0;
+        goto timer_fail;
+    }

     /* Start worker thread first to get its kernel TID */
     g_profiler.worker_tid = 0;
     if (pthread_create(&g_profiler.worker_thread, NULL,
                        rperf_worker_signal_func, &g_profiler) != 0) {
         g_profiler.running = 0;
-
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
         goto timer_fail;
     }

@@ -1078,7 +1262,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     sev._sigev_un._tid = g_profiler.worker_tid;
     if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
         g_profiler.running = 0;
-
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
         CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
         CHECKED(pthread_join(g_profiler.worker_thread, NULL));
         goto timer_fail;
@@ -1087,7 +1271,14 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
     its.it_value.tv_sec = 0;
     its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
     its.it_interval = its.it_value;
-    timer_settime(g_profiler.timer_id, 0, &its, NULL)
+    if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
+        timer_delete(g_profiler.timer_id);
+        g_profiler.running = 0;
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
+        CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
+        CHECKED(pthread_join(g_profiler.worker_thread, NULL));
+        goto timer_fail;
+    }
     } else
 #endif
     {
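The timer_create/timer_settime sequence with full error unwinding above follows the usual POSIX interval-timer lifecycle. A stripped-down sketch using plain SIGEV_SIGNAL delivery (rperf instead targets its worker thread via SIGEV_THREAD_ID); Linux-flavored, may need -lrt on older glibc:

```c
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t ticks = 0;

static void on_tick(int sig) { (void)sig; ticks++; }

int main(void)
{
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = on_tick;
    sa.sa_flags = SA_RESTART;
    if (sigaction(SIGRTMIN, &sa, NULL) != 0) { perror("sigaction"); return 1; }

    struct sigevent sev;
    memset(&sev, 0, sizeof(sev));
    sev.sigev_notify = SIGEV_SIGNAL;
    sev.sigev_signo = SIGRTMIN;

    timer_t tid;
    if (timer_create(CLOCK_MONOTONIC, &sev, &tid) != 0) { perror("timer_create"); return 1; }

    struct itimerspec its;
    memset(&its, 0, sizeof(its));
    its.it_value.tv_nsec = 1000000000L / 99;  /* ~99Hz, a typical profiler rate */
    its.it_interval = its.it_value;
    if (timer_settime(tid, 0, &its, NULL) != 0) { perror("timer_settime"); return 1; }

    while (ticks < 99)
        pause();               /* returns each time the handler runs */

    timer_delete(tid);         /* stop the signal source before teardown */
    printf("received %d ticks\n", (int)ticks);
    return 0;
}
```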
@@ -1109,6 +1300,7 @@ timer_fail:
             rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
         }
     }
+    rb_remove_event_hook(rperf_gc_event_hook);
     rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
     g_profiler.thread_hook = NULL;
     if (g_profiler.aggregate) {
@@ -1139,17 +1331,28 @@ rb_rperf_stop(VALUE self)
     g_profiler.running = 0;
 #if RPERF_USE_TIMER_SIGNAL
     if (g_profiler.timer_signal > 0) {
+        /* Delete timer first to stop generating new signals.
+         * Do NOT restore signal handler yet — the worker thread may still have
+         * pending timer signals. rperf_signal_handler handles them harmlessly. */
         timer_delete(g_profiler.timer_id);
-        signal(g_profiler.timer_signal, SIG_IGN);
     }
 #endif

-    /* Wake and join worker thread
+    /* Wake and join worker thread.
+     * Any pending timer signals are still handled by rperf_signal_handler
+     * (just increments trigger_count + calls rb_postponed_job_trigger). */
     CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
     CHECKED(pthread_join(g_profiler.worker_thread, NULL));
     CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
     CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));

+#if RPERF_USE_TIMER_SIGNAL
+    if (g_profiler.timer_signal > 0) {
+        /* Worker thread is gone — safe to restore old signal handler now. */
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
+    }
+#endif
+
     if (g_profiler.thread_hook) {
         rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
         g_profiler.thread_hook = NULL;
@@ -1159,13 +1362,8 @@ rb_rperf_stop(VALUE self)
     rb_remove_event_hook(rperf_gc_event_hook);

     if (g_profiler.aggregate) {
-        /*
-
-        int standby_idx = g_profiler.active_idx ^ 1;
-        rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
-        g_profiler.swap_ready = 0;
-        }
-        rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
+        /* Worker thread is joined; no concurrent access. */
+        rperf_flush_buffers(&g_profiler);
     }

     /* Clean up thread-specific data for all live threads */
@@ -1183,72 +1381,8 @@ rb_rperf_stop(VALUE self)
         }
     }

-    /* Build result hash */
-    result = rb_hash_new();
-
-    /* mode */
-    rb_hash_aset(result, ID2SYM(rb_intern("mode")),
-                 ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
-
-    /* frequency */
-    rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
-
-    /* trigger_count, sampling_count, sampling_time_ns */
-    rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
-    rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
-    rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
-
-    /* aggregation stats */
-    if (g_profiler.aggregate) {
-        rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
-                     SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
-        rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
-                     SIZET2NUM(g_profiler.agg_table.count));
-    }
-
-    /* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
-    {
-        struct timespec stop_monotonic;
-        int64_t start_ns, duration_ns;
-        clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
-        start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
-                 + (int64_t)g_profiler.start_realtime.tv_nsec;
-        duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
-                    + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
-        rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
-        rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
-    }
-
     if (g_profiler.aggregate) {
-
-         * Use a Ruby array for resolved frames so GC protects them. */
-        rperf_frame_table_t *ft = &g_profiler.frame_table;
-        VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
-        /* Synthetic frames */
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
-        /* Real frames */
-        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
-            rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
-        }
-
-        rperf_agg_table_t *at = &g_profiler.agg_table;
-        samples_ary = rb_ary_new();
-        for (i = 0; i < at->bucket_capacity; i++) {
-            rperf_agg_entry_t *e = &at->buckets[i];
-            if (!e->used) continue;
-
-            VALUE frames = rb_ary_new_capa(e->depth);
-            for (j = 0; j < e->depth; j++) {
-                uint32_t fid = at->stack_pool[e->frame_start + j];
-                rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
-            }
-
-            VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
-            rb_ary_push(samples_ary, sample);
-        }
+        result = rperf_build_aggregated_result(&g_profiler);

         rperf_sample_buffer_free(&g_profiler.buffers[1]);
         rperf_frame_table_free(&g_profiler.frame_table);
@@ -1256,6 +1390,27 @@ rb_rperf_stop(VALUE self)
     } else {
         /* Raw samples path (aggregate: false) */
         rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
+
+        result = rb_hash_new();
+        rb_hash_aset(result, ID2SYM(rb_intern("mode")),
+                     ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
+        rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
+        rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
+        rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
+        rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+        rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
+        {
+            struct timespec stop_monotonic;
+            int64_t start_ns, duration_ns;
+            clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
+            start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
+                     + (int64_t)g_profiler.start_realtime.tv_nsec;
+            duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
+                        + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
+            rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
+            rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
+        }
+
         samples_ary = rb_ary_new_capa((long)buf->sample_count);
         for (i = 0; i < buf->sample_count; i++) {
             rperf_sample_t *s = &buf->samples[i];
@@ -1281,11 +1436,14 @@ rb_rperf_stop(VALUE self)
                 rb_ary_push(frames, rperf_resolve_frame(fval));
             }

-            VALUE sample = rb_ary_new3(
+            VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
             rb_ary_push(samples_ary, sample);
         }
+        rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
+        if (g_profiler.label_sets != Qnil) {
+            rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), g_profiler.label_sets);
+        }
     }
-    rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);

     /* Cleanup */
     rperf_sample_buffer_free(&g_profiler.buffers[0]);
@@ -1293,6 +1451,113 @@ rb_rperf_stop(VALUE self)
     return result;
 }
 
+/* ---- Snapshot: read aggregated data without stopping ---- */
+
+/* Clear aggregated data for the next interval.
+ * Caller must hold GVL + worker_mutex.
+ * Keeps allocations intact for reuse. Does NOT touch frame_table
+ * (frame IDs must stay stable — dmark may be iterating keys outside GVL,
+ * and existing threads reference frame IDs via their thread_data). */
+static void
+rperf_clear_aggregated_data(rperf_profiler_t *prof)
+{
+    /* Clear agg_table entries (keep allocation) */
+    memset(prof->agg_table.buckets, 0,
+           prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t));
+    prof->agg_table.count = 0;
+    prof->agg_table.stack_pool_count = 0;
+
+    /* Reset stats */
+    prof->stats.trigger_count = 0;
+    prof->stats.sampling_count = 0;
+    prof->stats.sampling_total_ns = 0;
+
+    /* Reset start timestamps so next snapshot's duration_ns covers
+     * only the period since this clear. */
+    clock_gettime(CLOCK_REALTIME, &prof->start_realtime);
+    clock_gettime(CLOCK_MONOTONIC, &prof->start_monotonic);
+}
+
+static VALUE
+rb_rperf_snapshot(VALUE self, VALUE vclear)
+{
+    VALUE result;
+
+    if (!g_profiler.running) {
+        return Qnil;
+    }
+
+    if (!g_profiler.aggregate) {
+        rb_raise(rb_eRuntimeError, "snapshot requires aggregate mode (aggregate: true)");
+    }
+
+    /* GVL is held → no postponed jobs fire → no new samples written.
+     * Lock worker_mutex to pause worker thread's aggregation. */
+    CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
+    rperf_flush_buffers(&g_profiler);
+
+    /* Build result while mutex is held. If clear is requested, we must
+     * also clear under the same lock to avoid a window where the worker
+     * could aggregate into the table between build and clear. */
+    result = rperf_build_aggregated_result(&g_profiler);
+
+    if (RTEST(vclear)) {
+        rperf_clear_aggregated_data(&g_profiler);
+    }
+
+    CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
+
+    return result;
+}
+
+/* ---- Label API ---- */
+
+/* _c_set_label(label_set_id) — set current thread's label_set_id.
+ * Called from Ruby with GVL held. */
+static VALUE
+rb_rperf_set_label(VALUE self, VALUE vid)
+{
+    if (!g_profiler.running) return vid;
+
+    int label_set_id = NUM2INT(vid);
+    VALUE thread = rb_thread_current();
+    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
+    if (td == NULL) {
+        td = rperf_thread_data_create(&g_profiler, thread);
+        if (!td) rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
+    }
+    td->label_set_id = label_set_id;
+    return vid;
+}
+
+/* _c_get_label() — get current thread's label_set_id.
+ * Returns 0 if not profiling or thread not yet seen. */
+static VALUE
+rb_rperf_get_label(VALUE self)
+{
+    if (!g_profiler.running) return INT2FIX(0);
+
+    VALUE thread = rb_thread_current();
+    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
+    if (td == NULL) return INT2FIX(0);
+    return INT2NUM(td->label_set_id);
+}
+
+/* _c_set_label_sets(ary) — store label_sets Ruby Array for result building */
+static VALUE
+rb_rperf_set_label_sets(VALUE self, VALUE ary)
+{
+    g_profiler.label_sets = ary;
+    return ary;
+}
+
+/* _c_get_label_sets() — get label_sets Ruby Array */
+static VALUE
+rb_rperf_get_label_sets(VALUE self)
+{
+    return g_profiler.label_sets;
+}
+
 /* ---- Fork safety ---- */
 
 static void
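rb_rperf_snapshot builds the result and, when clearing is requested, resets the aggregation state inside a single worker_mutex critical section, closing the window in which the worker could aggregate samples that a separate clear would then silently drop. The same read-then-reset shape, reduced to a self-contained pthread sketch (illustrative names, not gem code):

/* Read-then-reset under one lock acquisition (illustrative, not gem code). */
#include <pthread.h>
#include <stdint.h>

typedef struct {
    pthread_mutex_t mu;
    int64_t total_weight;   /* stand-in for the aggregation table */
} agg_t;

/* Producer side: accumulate under the lock. */
static void agg_add(agg_t *a, int64_t w) {
    pthread_mutex_lock(&a->mu);
    a->total_weight += w;
    pthread_mutex_unlock(&a->mu);
}

/* Consumer side: snapshot, optionally clearing for the next interval.
 * Because the read and the reset share one critical section, no update
 * can land between them and be lost. */
static int64_t agg_snapshot(agg_t *a, int clear) {
    pthread_mutex_lock(&a->mu);
    int64_t out = a->total_weight;   /* "build result" */
    if (clear)
        a->total_weight = 0;         /* "clear" in the same critical section */
    pthread_mutex_unlock(&a->mu);
    return out;
}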
@@ -1304,9 +1569,20 @@ rperf_after_fork_child(void)
     g_profiler.running = 0;
 
 #if RPERF_USE_TIMER_SIGNAL
-    /* timer_create timers are not inherited across fork
+    /* timer_create timers are not inherited across fork, but pending signals may be.
+     * Block the signal, drain any pending instances, then restore old handler. */
     if (g_profiler.timer_signal > 0) {
-
+        sigset_t block_set, old_set;
+        struct timespec zero_ts = {0, 0};
+
+        sigemptyset(&block_set);
+        sigaddset(&block_set, g_profiler.timer_signal);
+        pthread_sigmask(SIG_BLOCK, &block_set, &old_set);
+
+        while (sigtimedwait(&block_set, NULL, &zero_ts) > 0) {}
+
+        sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
+        pthread_sigmask(SIG_SETMASK, &old_set, NULL);
     }
 #endif
 
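The fork handler's block / drain / restore sequence is a general POSIX idiom for discarding signals that were already pending when the child was created: blocking keeps new instances queued rather than delivered, a zero-timeout sigtimedwait consumes whatever is queued, and only then are the old disposition and mask restored. A self-contained sketch of just that sequence (not gem code):

/* Drain any pending instances of signo, then restore the previous
 * handler and signal mask (illustrative, not gem code). */
#include <signal.h>
#include <time.h>

static void drain_signal(int signo, const struct sigaction *old_sa) {
    sigset_t set, old_set;
    struct timespec zero = {0, 0};

    sigemptyset(&set);
    sigaddset(&set, signo);
    pthread_sigmask(SIG_BLOCK, &set, &old_set);  /* queueing continues, delivery stops */

    /* Consume everything already pending; returns -1 with EAGAIN when empty. */
    while (sigtimedwait(&set, NULL, &zero) > 0) {}

    sigaction(signo, old_sa, NULL);              /* restore previous handler */
    pthread_sigmask(SIG_SETMASK, &old_set, NULL);
}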
@@ -1326,12 +1602,13 @@ rperf_after_fork_child(void)
     }
 
     /* Reset GC state */
-    g_profiler.
+    g_profiler.gc.phase = 0;
+    g_profiler.gc.enter_ns = 0;
 
     /* Reset stats */
-    g_profiler.sampling_count = 0;
-    g_profiler.sampling_total_ns = 0;
-    g_profiler.swap_ready
+    g_profiler.stats.sampling_count = 0;
+    g_profiler.stats.sampling_total_ns = 0;
+    atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
 }
 
 /* ---- Init ---- */
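swap_ready is reset with a relaxed atomic store, which is sufficient here because the fork child is single-threaded at this point, so there is no concurrent reader to order against. A small C11 sketch of a flag with that lifecycle; the release/acquire pairing on the hot path is an assumption for illustration, since those call sites are outside this hunk:

/* C11 atomic flag sketch (illustrative, not gem code). */
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int swap_ready;

/* Hot path (assumed shape): publish with release, observe with acquire. */
void signal_ready(void)  { atomic_store_explicit(&swap_ready, 1, memory_order_release); }
bool check_ready(void)   { return atomic_load_explicit(&swap_ready, memory_order_acquire); }

/* Post-fork reset: relaxed is enough while only one thread exists. */
void reset_single_threaded(void) { atomic_store_explicit(&swap_ready, 0, memory_order_relaxed); }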
@@ -1340,10 +1617,16 @@ void
 Init_rperf(void)
 {
     VALUE mRperf = rb_define_module("Rperf");
-    rb_define_module_function(mRperf, "_c_start", rb_rperf_start,
+    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
     rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
+    rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
+    rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
+    rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
+    rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
+    rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
 
     memset(&g_profiler, 0, sizeof(g_profiler));
+    g_profiler.label_sets = Qnil;
     g_profiler.pj_handle = rb_postponed_job_preregister(0, rperf_sample_job, &g_profiler);
     g_profiler.ts_key = rb_internal_thread_specific_key_create();
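The arity passed to rb_define_module_function fixes the C signature: arity 4 for _c_start means four VALUE parameters after self, and arity 1 for _c_snapshot matches rb_rperf_snapshot(VALUE self, VALUE vclear) above. A minimal sketch using a hypothetical Demo module (not gem code):

/* Module-function arity sketch (illustrative, not gem code). */
#include <ruby.h>

static VALUE demo_echo(VALUE self, VALUE arg)   /* arity 1: self + one VALUE */
{
    return arg;
}

void Init_demo(void)
{
    VALUE mDemo = rb_define_module("Demo");     /* callable as Demo.echo(x) */
    rb_define_module_function(mDemo, "echo", demo_echo, 1);
}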