rperf 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +21 -0
- data/README.md +26 -15
- data/docs/help.md +284 -18
- data/exe/rperf +278 -55
- data/ext/rperf/rperf.c +220 -81
- data/lib/rperf/active_job.rb +1 -0
- data/lib/rperf/meta.rb +343 -0
- data/lib/rperf/rack.rb +7 -2
- data/lib/rperf/table.rb +156 -0
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf/viewer/viewer.html +1148 -0
- data/lib/rperf/viewer.rb +158 -661
- data/lib/rperf.rb +682 -89
- metadata +8 -4
data/ext/rperf/rperf.c
CHANGED
|
@@ -25,8 +25,10 @@
|
|
|
25
25
|
#ifdef __linux__
|
|
26
26
|
#define RPERF_USE_TIMER_SIGNAL 1
|
|
27
27
|
#define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
|
|
28
|
+
#define RPERF_COND_CLOCK CLOCK_MONOTONIC
|
|
28
29
|
#else
|
|
29
30
|
#define RPERF_USE_TIMER_SIGNAL 0
|
|
31
|
+
#define RPERF_COND_CLOCK CLOCK_REALTIME /* macOS lacks pthread_condattr_setclock */
|
|
30
32
|
#endif
|
|
31
33
|
|
|
32
34
|
#define RPERF_MAX_STACK_DEPTH 512
|
|
@@ -51,6 +53,11 @@ enum rperf_vm_state {
|
|
|
51
53
|
/* ---- Data structures ---- */
|
|
52
54
|
|
|
53
55
|
|
|
56
|
+
enum rperf_mode {
|
|
57
|
+
RPERF_MODE_CPU = 0,
|
|
58
|
+
RPERF_MODE_WALL = 1,
|
|
59
|
+
};
|
|
60
|
+
|
|
54
61
|
enum rperf_gc_phase {
|
|
55
62
|
RPERF_GC_NONE = 0,
|
|
56
63
|
RPERF_GC_MARKING = 1,
|
|
@@ -73,7 +80,11 @@ typedef struct rperf_sample_buffer {
|
|
|
73
80
|
size_t sample_count;
|
|
74
81
|
size_t sample_capacity;
|
|
75
82
|
VALUE *frame_pool;
|
|
76
|
-
|
|
83
|
+
/* _Atomic: read by GC dmark concurrently with the aggregator's clear.
|
|
84
|
+
* Seq-cst accesses pair with the frame_table count release-stores so
|
|
85
|
+
* dmark never observes the cleared pool together with a stale
|
|
86
|
+
* frame_table count (which would leave frames unmarked). */
|
|
87
|
+
_Atomic size_t frame_pool_count;
|
|
77
88
|
size_t frame_pool_capacity;
|
|
78
89
|
} rperf_sample_buffer_t;
|
|
79
90
|
|
|
@@ -83,7 +94,7 @@ typedef struct rperf_sample_buffer {
|
|
|
83
94
|
|
|
84
95
|
typedef struct rperf_frame_table {
|
|
85
96
|
_Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
|
|
86
|
-
size_t count;
|
|
97
|
+
_Atomic(size_t) count; /* = next frame_id */
|
|
87
98
|
size_t capacity;
|
|
88
99
|
uint32_t *buckets; /* open addressing: stores index into keys[] */
|
|
89
100
|
size_t bucket_capacity;
|
|
@@ -95,8 +106,6 @@ typedef struct rperf_frame_table {
|
|
|
95
106
|
|
|
96
107
|
/* ---- Aggregation table: stack → weight ---- */
|
|
97
108
|
|
|
98
|
-
#define RPERF_AGG_ENTRY_EMPTY 0
|
|
99
|
-
|
|
100
109
|
typedef struct rperf_agg_entry {
|
|
101
110
|
uint32_t frame_start; /* offset into stack_pool */
|
|
102
111
|
int depth;
|
|
@@ -119,7 +128,6 @@ typedef struct rperf_agg_table {
|
|
|
119
128
|
|
|
120
129
|
typedef struct rperf_thread_data {
|
|
121
130
|
int64_t prev_time_ns;
|
|
122
|
-
int64_t prev_wall_ns;
|
|
123
131
|
/* GVL event tracking */
|
|
124
132
|
int64_t suspended_at_ns; /* wall time at SUSPENDED */
|
|
125
133
|
int64_t ready_at_ns; /* wall time at READY */
|
|
@@ -139,15 +147,19 @@ typedef struct rperf_gc_state {
|
|
|
139
147
|
/* ---- Sampling overhead stats ---- */
|
|
140
148
|
|
|
141
149
|
typedef struct rperf_stats {
|
|
142
|
-
|
|
150
|
+
/* _Atomic: incremented by the signal handler / nanosleep worker, read and
|
|
151
|
+
* cleared by snapshot while running (atomic size_t is async-signal-safe
|
|
152
|
+
* when lock-free, which it is on all supported platforms). */
|
|
153
|
+
_Atomic size_t trigger_count;
|
|
143
154
|
size_t sampling_count;
|
|
144
155
|
int64_t sampling_total_ns;
|
|
145
156
|
size_t dropped_samples; /* samples lost due to allocation failure */
|
|
157
|
+
size_t dropped_aggregation; /* samples lost during aggregation (frame_table/agg_table full) */
|
|
146
158
|
} rperf_stats_t;
|
|
147
159
|
|
|
148
160
|
typedef struct rperf_profiler {
|
|
149
161
|
int frequency;
|
|
150
|
-
|
|
162
|
+
enum rperf_mode mode;
|
|
151
163
|
_Atomic int running;
|
|
152
164
|
pthread_t worker_thread; /* combined timer + aggregation */
|
|
153
165
|
#if RPERF_USE_TIMER_SIGNAL
|
|
@@ -199,12 +211,17 @@ rperf_profiler_mark(void *ptr)
|
|
|
199
211
|
{
|
|
200
212
|
rperf_profiler_t *prof = (rperf_profiler_t *)ptr;
|
|
201
213
|
int i;
|
|
202
|
-
/* Mark both sample buffers' frame_pools
|
|
214
|
+
/* Mark both sample buffers' frame_pools.
|
|
215
|
+
* Load the count once: the aggregator may clear it concurrently, and the
|
|
216
|
+
* pools must be read BEFORE frame_table.count below — seeing the cleared
|
|
217
|
+
* count (seq-cst) guarantees the corresponding frame_table inserts are
|
|
218
|
+
* visible, so every frame is covered by at least one mark source. */
|
|
203
219
|
for (i = 0; i < 2; i++) {
|
|
204
220
|
rperf_sample_buffer_t *buf = &prof->buffers[i];
|
|
205
|
-
|
|
221
|
+
size_t fp_count = buf->frame_pool_count;
|
|
222
|
+
if (buf->frame_pool && fp_count > 0) {
|
|
206
223
|
rb_gc_mark_locations(buf->frame_pool,
|
|
207
|
-
buf->frame_pool +
|
|
224
|
+
buf->frame_pool + fp_count);
|
|
208
225
|
}
|
|
209
226
|
}
|
|
210
227
|
/* Mark label_sets array */
|
|
@@ -217,7 +234,7 @@ rperf_profiler_mark(void *ptr)
|
|
|
217
234
|
* If we see an old count, both old and new keys arrays have valid
|
|
218
235
|
* data (old keys are kept alive in old_keys[]). */
|
|
219
236
|
{
|
|
220
|
-
size_t ft_count =
|
|
237
|
+
size_t ft_count = atomic_load_explicit(&prof->frame_table.count, memory_order_acquire);
|
|
221
238
|
VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
|
|
222
239
|
if (ft_keys && ft_count > 0) {
|
|
223
240
|
rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
|
|
@@ -225,12 +242,40 @@ rperf_profiler_mark(void *ptr)
|
|
|
225
242
|
}
|
|
226
243
|
}
|
|
227
244
|
|
|
245
|
+
static size_t
|
|
246
|
+
rperf_profiler_memsize(const void *ptr)
|
|
247
|
+
{
|
|
248
|
+
const rperf_profiler_t *prof = (const rperf_profiler_t *)ptr;
|
|
249
|
+
size_t size = sizeof(rperf_profiler_t);
|
|
250
|
+
int i;
|
|
251
|
+
|
|
252
|
+
/* Double-buffered sample storage */
|
|
253
|
+
for (i = 0; i < 2; i++) {
|
|
254
|
+
const rperf_sample_buffer_t *buf = &prof->buffers[i];
|
|
255
|
+
size += buf->sample_capacity * sizeof(rperf_sample_t);
|
|
256
|
+
size += buf->frame_pool_capacity * sizeof(VALUE);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/* Frame table */
|
|
260
|
+
size += prof->frame_table.capacity * sizeof(VALUE); /* keys */
|
|
261
|
+
size += prof->frame_table.bucket_capacity * sizeof(uint32_t); /* buckets */
|
|
262
|
+
/* old_keys entries are previous keys arrays; exact sizes unknown,
|
|
263
|
+
* only the pointer array itself is accounted for. */
|
|
264
|
+
size += prof->frame_table.old_keys_capacity * sizeof(VALUE *); /* old_keys */
|
|
265
|
+
|
|
266
|
+
/* Aggregation table */
|
|
267
|
+
size += prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t);
|
|
268
|
+
size += prof->agg_table.stack_pool_capacity * sizeof(uint32_t);
|
|
269
|
+
|
|
270
|
+
return size;
|
|
271
|
+
}
|
|
272
|
+
|
|
228
273
|
static const rb_data_type_t rperf_profiler_type = {
|
|
229
274
|
.wrap_struct_name = "rperf_profiler",
|
|
230
275
|
.function = {
|
|
231
276
|
.dmark = rperf_profiler_mark,
|
|
232
277
|
.dfree = NULL,
|
|
233
|
-
.dsize =
|
|
278
|
+
.dsize = rperf_profiler_memsize,
|
|
234
279
|
},
|
|
235
280
|
};
|
|
236
281
|
|
|
@@ -259,7 +304,7 @@ rperf_wall_time_ns(void)
|
|
|
259
304
|
static int64_t
|
|
260
305
|
rperf_current_time_ns(rperf_profiler_t *prof)
|
|
261
306
|
{
|
|
262
|
-
if (prof->mode ==
|
|
307
|
+
if (prof->mode == RPERF_MODE_CPU) {
|
|
263
308
|
return rperf_cpu_time_ns();
|
|
264
309
|
} else {
|
|
265
310
|
return rperf_wall_time_ns();
|
|
@@ -300,7 +345,7 @@ static int
|
|
|
300
345
|
rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
|
|
301
346
|
{
|
|
302
347
|
if (buf->sample_count >= buf->sample_capacity) {
|
|
303
|
-
if (buf->sample_capacity > SIZE_MAX / 2) return -1;
|
|
348
|
+
if (buf->sample_capacity > SIZE_MAX / (2 * sizeof(rperf_sample_t))) return -1;
|
|
304
349
|
size_t new_cap = buf->sample_capacity * 2;
|
|
305
350
|
rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
|
|
306
351
|
buf->samples,
|
|
@@ -319,7 +364,7 @@ static int
|
|
|
319
364
|
rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
|
|
320
365
|
{
|
|
321
366
|
while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
|
|
322
|
-
if (buf->frame_pool_capacity > SIZE_MAX / 2) return -1;
|
|
367
|
+
if (buf->frame_pool_capacity > SIZE_MAX / (2 * sizeof(VALUE))) return -1;
|
|
323
368
|
size_t new_cap = buf->frame_pool_capacity * 2;
|
|
324
369
|
VALUE *new_pool = (VALUE *)realloc(
|
|
325
370
|
buf->frame_pool,
|
|
@@ -438,7 +483,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
|
|
|
438
483
|
keys[frame_id] = fval;
|
|
439
484
|
/* Store fence: ensure keys[frame_id] is visible before count is incremented,
|
|
440
485
|
* so GC dmark never reads uninitialized keys[count-1]. */
|
|
441
|
-
|
|
486
|
+
atomic_store_explicit(&ft->count, ft->count + 1, memory_order_release);
|
|
442
487
|
ft->buckets[idx] = frame_id;
|
|
443
488
|
|
|
444
489
|
/* Rehash if load factor > 0.7 */
|
|
@@ -494,7 +539,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
|
|
|
494
539
|
static void
|
|
495
540
|
rperf_agg_table_rehash(rperf_agg_table_t *at)
|
|
496
541
|
{
|
|
497
|
-
if (at->bucket_capacity > SIZE_MAX / 2) return;
|
|
542
|
+
if (at->bucket_capacity > SIZE_MAX / (2 * sizeof(rperf_agg_entry_t))) return;
|
|
498
543
|
size_t new_cap = at->bucket_capacity * 2;
|
|
499
544
|
rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
|
|
500
545
|
if (!new_buckets) return; /* keep using current buckets at higher load factor */
|
|
@@ -519,7 +564,7 @@ static int
|
|
|
519
564
|
rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
|
|
520
565
|
{
|
|
521
566
|
while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
|
|
522
|
-
if (at->stack_pool_capacity > SIZE_MAX / 2) return -1;
|
|
567
|
+
if (at->stack_pool_capacity > SIZE_MAX / (2 * sizeof(uint32_t))) return -1;
|
|
523
568
|
size_t new_cap = at->stack_pool_capacity * 2;
|
|
524
569
|
uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
|
|
525
570
|
new_cap * sizeof(uint32_t));
|
|
@@ -530,8 +575,9 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
|
|
|
530
575
|
return 0;
|
|
531
576
|
}
|
|
532
577
|
|
|
533
|
-
/* Insert or merge a stack into the aggregation table
|
|
534
|
-
|
|
578
|
+
/* Insert or merge a stack into the aggregation table.
|
|
579
|
+
* Returns 0 on success, -1 on failure (table full or allocation failure). */
|
|
580
|
+
static int
|
|
535
581
|
rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
536
582
|
int depth, int thread_seq, int label_set_id,
|
|
537
583
|
enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
|
|
@@ -548,14 +594,14 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
|
548
594
|
depth * sizeof(uint32_t)) == 0) {
|
|
549
595
|
/* Match — merge weight */
|
|
550
596
|
e->weight += weight;
|
|
551
|
-
return;
|
|
597
|
+
return 0;
|
|
552
598
|
}
|
|
553
599
|
idx = (idx + 1) % at->bucket_capacity;
|
|
554
|
-
if (++probes >= at->bucket_capacity) return; /* table full
|
|
600
|
+
if (++probes >= at->bucket_capacity) return -1; /* table full */
|
|
555
601
|
}
|
|
556
602
|
|
|
557
603
|
/* New entry — append frame_ids to stack_pool */
|
|
558
|
-
if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
|
|
604
|
+
if (rperf_agg_ensure_stack_pool(at, depth) < 0) return -1;
|
|
559
605
|
|
|
560
606
|
rperf_agg_entry_t *e = &at->buckets[idx];
|
|
561
607
|
e->frame_start = (uint32_t)at->stack_pool_count;
|
|
@@ -576,6 +622,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
|
576
622
|
if (at->count * 10 > at->bucket_capacity * 7) {
|
|
577
623
|
rperf_agg_table_rehash(at);
|
|
578
624
|
}
|
|
625
|
+
return 0;
|
|
579
626
|
}
|
|
580
627
|
|
|
581
628
|
/* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
|
|
@@ -598,25 +645,38 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
|
|
|
598
645
|
/* Convert VALUE frames to frame_ids */
|
|
599
646
|
int overflow = 0;
|
|
600
647
|
for (j = 0; j < s->depth; j++) {
|
|
648
|
+
if (s->frame_start + j >= buf->frame_pool_count) {
|
|
649
|
+
/* Defensive: sample points past the pool — truncate the
|
|
650
|
+
* sample so we never hash/insert uninitialized temp_ids */
|
|
651
|
+
s->depth = j;
|
|
652
|
+
break;
|
|
653
|
+
}
|
|
601
654
|
VALUE fval = buf->frame_pool[s->frame_start + j];
|
|
602
655
|
uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
|
|
603
656
|
if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
|
|
604
657
|
temp_ids[j] = fid;
|
|
605
658
|
}
|
|
606
|
-
if (overflow)
|
|
659
|
+
if (overflow) {
|
|
660
|
+
/* frame_table full — count remaining samples as dropped */
|
|
661
|
+
prof->stats.dropped_aggregation += buf->sample_count - i;
|
|
662
|
+
break;
|
|
663
|
+
}
|
|
664
|
+
if (s->depth <= 0) continue;
|
|
607
665
|
|
|
608
666
|
hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
|
|
609
667
|
|
|
610
|
-
rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
|
|
668
|
+
if (rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
|
|
611
669
|
s->thread_seq, s->label_set_id, s->vm_state,
|
|
612
|
-
s->weight, hash)
|
|
670
|
+
s->weight, hash) < 0) {
|
|
671
|
+
prof->stats.dropped_aggregation++;
|
|
672
|
+
}
|
|
613
673
|
}
|
|
614
674
|
|
|
615
675
|
/* Reset buffer for reuse.
|
|
616
676
|
* Release fence: ensure all frame_table inserts are visible (to GC dmark)
|
|
617
677
|
* before frame_pool_count is cleared, so dmark always has at least one
|
|
618
678
|
* source (frame_table or frame_pool) covering each VALUE. */
|
|
619
|
-
|
|
679
|
+
atomic_thread_fence(memory_order_release);
|
|
620
680
|
buf->sample_count = 0;
|
|
621
681
|
buf->frame_pool_count = 0;
|
|
622
682
|
}
|
|
@@ -693,8 +753,9 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
|
|
|
693
753
|
{
|
|
694
754
|
rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
|
|
695
755
|
if (!td) return NULL;
|
|
696
|
-
|
|
697
|
-
|
|
756
|
+
int64_t t = rperf_current_time_ns(prof);
|
|
757
|
+
if (t < 0) { free(td); return NULL; }
|
|
758
|
+
td->prev_time_ns = t;
|
|
698
759
|
td->thread_seq = ++prof->next_thread_seq;
|
|
699
760
|
rb_internal_thread_specific_set(thread, prof->ts_key, td);
|
|
700
761
|
return td;
|
|
@@ -706,7 +767,8 @@ static void
|
|
|
706
767
|
rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
|
|
707
768
|
{
|
|
708
769
|
/* Has GVL — safe to call Ruby APIs */
|
|
709
|
-
|
|
770
|
+
/* suspended_at_ns is only consumed by RESUMED in wall mode */
|
|
771
|
+
int64_t wall_now = (prof->mode == RPERF_MODE_WALL) ? rperf_wall_time_ns() : 0;
|
|
710
772
|
|
|
711
773
|
int is_first = 0;
|
|
712
774
|
|
|
@@ -719,25 +781,27 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
|
|
|
719
781
|
int64_t time_now = rperf_current_time_ns(prof);
|
|
720
782
|
if (time_now < 0) return;
|
|
721
783
|
|
|
722
|
-
/*
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
727
|
-
&buf->frame_pool[frame_start], NULL);
|
|
728
|
-
if (depth <= 0) return;
|
|
729
|
-
buf->frame_pool_count += depth;
|
|
730
|
-
|
|
731
|
-
/* Record normal sample (skip if first time — no prev_time, or if paused) */
|
|
784
|
+
/* Record normal sample (skip if first time — no prev_time, or if paused).
|
|
785
|
+
* The backtrace is captured only when a sample is actually recorded:
|
|
786
|
+
* committing frames to the pool while paused would grow it without bound,
|
|
787
|
+
* because no aggregation runs until samples accumulate. */
|
|
732
788
|
if (!is_first && !RPERF_PAUSED(prof)) {
|
|
733
|
-
|
|
734
|
-
|
|
789
|
+
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
790
|
+
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) >= 0) {
|
|
791
|
+
size_t frame_start = buf->frame_pool_count;
|
|
792
|
+
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
793
|
+
&buf->frame_pool[frame_start], NULL);
|
|
794
|
+
if (depth > 0) {
|
|
795
|
+
buf->frame_pool_count += depth;
|
|
796
|
+
int64_t weight = time_now - td->prev_time_ns;
|
|
797
|
+
rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
|
|
798
|
+
}
|
|
799
|
+
}
|
|
735
800
|
}
|
|
736
801
|
|
|
737
802
|
/* Save timestamp for READY/RESUMED */
|
|
738
803
|
td->suspended_at_ns = wall_now;
|
|
739
804
|
td->prev_time_ns = time_now;
|
|
740
|
-
td->prev_wall_ns = wall_now;
|
|
741
805
|
}
|
|
742
806
|
|
|
743
807
|
static void
|
|
@@ -768,7 +832,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
|
|
|
768
832
|
* Both samples are written directly into the same buffer before calling
|
|
769
833
|
* rperf_try_swap, so that a swap triggered by the first sample cannot
|
|
770
834
|
* move the second into a different buffer with a stale frame_start. */
|
|
771
|
-
if (prof->mode ==
|
|
835
|
+
if (prof->mode == RPERF_MODE_WALL && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
|
|
772
836
|
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
773
837
|
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
|
|
774
838
|
size_t frame_start = buf->frame_pool_count;
|
|
@@ -798,7 +862,6 @@ skip_gvl:
|
|
|
798
862
|
/* Reset prev times to current — next timer sample measures from resume */
|
|
799
863
|
int64_t time_now = rperf_current_time_ns(prof);
|
|
800
864
|
if (time_now >= 0) td->prev_time_ns = time_now;
|
|
801
|
-
td->prev_wall_ns = wall_now;
|
|
802
865
|
|
|
803
866
|
/* Clear suspended state */
|
|
804
867
|
td->suspended_at_ns = 0;
|
|
@@ -818,19 +881,26 @@ static void
|
|
|
818
881
|
rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_data_t *data, void *user_data)
|
|
819
882
|
{
|
|
820
883
|
rperf_profiler_t *prof = (rperf_profiler_t *)user_data;
|
|
821
|
-
if (!prof->running) return;
|
|
822
884
|
|
|
823
885
|
VALUE thread = data->thread;
|
|
824
886
|
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
825
887
|
|
|
888
|
+
/* EXITED frees the thread's data even when running == 0: a thread can
|
|
889
|
+
* exit between stop setting running = 0 and the hook removal, and its td
|
|
890
|
+
* would otherwise leak (stop's Thread.list cleanup no longer sees it). */
|
|
891
|
+
if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED) {
|
|
892
|
+
rperf_handle_exited(prof, thread, td);
|
|
893
|
+
return;
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
if (!prof->running) return;
|
|
897
|
+
|
|
826
898
|
if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
|
|
827
899
|
rperf_handle_suspended(prof, thread, td);
|
|
828
900
|
else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
|
|
829
901
|
rperf_handle_ready(td);
|
|
830
902
|
else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
|
|
831
903
|
rperf_handle_resumed(prof, thread, td);
|
|
832
|
-
else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
|
|
833
|
-
rperf_handle_exited(prof, thread, td);
|
|
834
904
|
}
|
|
835
905
|
|
|
836
906
|
/* ---- GC event hook ---- */
|
|
@@ -895,17 +965,15 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
|
|
|
895
965
|
|
|
896
966
|
/* ---- Sampling callback (postponed job) — current thread only ---- */
|
|
897
967
|
|
|
898
|
-
|
|
899
|
-
|
|
968
|
+
/* Core sampling logic, parameterized by mode constant.
|
|
969
|
+
* Called from rperf_sample_cpu/rperf_sample_wall so the compiler
|
|
970
|
+
* can inline and eliminate mode branches at compile time. */
|
|
971
|
+
static inline void
|
|
972
|
+
rperf_sample_core(rperf_profiler_t *prof, enum rperf_mode mode)
|
|
900
973
|
{
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
if (!prof->running) return;
|
|
904
|
-
if (RPERF_PAUSED(prof)) return;
|
|
905
|
-
|
|
906
|
-
/* Measure sampling overhead */
|
|
974
|
+
/* Measure sampling overhead (wall time — runs under GVL, no I/O) */
|
|
907
975
|
struct timespec ts_start, ts_end;
|
|
908
|
-
clock_gettime(
|
|
976
|
+
clock_gettime(CLOCK_MONOTONIC, &ts_start);
|
|
909
977
|
|
|
910
978
|
VALUE thread = rb_thread_current();
|
|
911
979
|
|
|
@@ -917,12 +985,11 @@ rperf_sample_job(void *arg)
|
|
|
917
985
|
return; /* Skip first sample for this thread */
|
|
918
986
|
}
|
|
919
987
|
|
|
920
|
-
int64_t time_now =
|
|
988
|
+
int64_t time_now = (mode == RPERF_MODE_CPU) ? rperf_cpu_time_ns() : rperf_wall_time_ns();
|
|
921
989
|
if (time_now < 0) return;
|
|
922
990
|
|
|
923
991
|
int64_t weight = time_now - td->prev_time_ns;
|
|
924
992
|
td->prev_time_ns = time_now;
|
|
925
|
-
td->prev_wall_ns = rperf_wall_time_ns();
|
|
926
993
|
|
|
927
994
|
if (weight <= 0) return;
|
|
928
995
|
|
|
@@ -938,13 +1005,33 @@ rperf_sample_job(void *arg)
|
|
|
938
1005
|
|
|
939
1006
|
rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
|
|
940
1007
|
|
|
941
|
-
clock_gettime(
|
|
1008
|
+
clock_gettime(CLOCK_MONOTONIC, &ts_end);
|
|
942
1009
|
prof->stats.sampling_count++;
|
|
943
1010
|
prof->stats.sampling_total_ns +=
|
|
944
1011
|
((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
|
|
945
1012
|
(ts_end.tv_nsec - ts_start.tv_nsec);
|
|
946
1013
|
}
|
|
947
1014
|
|
|
1015
|
+
static void
|
|
1016
|
+
rperf_sample_cpu(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_CPU); }
|
|
1017
|
+
|
|
1018
|
+
static void
|
|
1019
|
+
rperf_sample_wall(rperf_profiler_t *prof) { rperf_sample_core(prof, RPERF_MODE_WALL); }
|
|
1020
|
+
|
|
1021
|
+
static void
|
|
1022
|
+
rperf_sample_job(void *arg)
|
|
1023
|
+
{
|
|
1024
|
+
rperf_profiler_t *prof = (rperf_profiler_t *)arg;
|
|
1025
|
+
|
|
1026
|
+
if (!prof->running) return;
|
|
1027
|
+
if (RPERF_PAUSED(prof)) return;
|
|
1028
|
+
|
|
1029
|
+
if (prof->mode == RPERF_MODE_CPU)
|
|
1030
|
+
rperf_sample_cpu(prof);
|
|
1031
|
+
else
|
|
1032
|
+
rperf_sample_wall(prof);
|
|
1033
|
+
}
|
|
1034
|
+
|
|
948
1035
|
/* ---- Worker thread: timer + aggregation ---- */
|
|
949
1036
|
|
|
950
1037
|
#if RPERF_USE_TIMER_SIGNAL
|
|
@@ -990,7 +1077,7 @@ rperf_worker_nanosleep_func(void *arg)
|
|
|
990
1077
|
struct timespec deadline;
|
|
991
1078
|
long interval_ns = 1000000000L / prof->frequency;
|
|
992
1079
|
|
|
993
|
-
clock_gettime(
|
|
1080
|
+
clock_gettime(RPERF_COND_CLOCK, &deadline);
|
|
994
1081
|
deadline.tv_nsec += interval_ns;
|
|
995
1082
|
if (deadline.tv_nsec >= 1000000000L) {
|
|
996
1083
|
deadline.tv_sec++;
|
|
@@ -1005,7 +1092,7 @@ rperf_worker_nanosleep_func(void *arg)
|
|
|
1005
1092
|
CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
|
|
1006
1093
|
prof->worker_paused = 0;
|
|
1007
1094
|
/* Reset deadline on wake to avoid burst of catch-up triggers */
|
|
1008
|
-
clock_gettime(
|
|
1095
|
+
clock_gettime(RPERF_COND_CLOCK, &deadline);
|
|
1009
1096
|
deadline.tv_nsec += interval_ns;
|
|
1010
1097
|
if (deadline.tv_nsec >= 1000000000L) {
|
|
1011
1098
|
deadline.tv_sec++;
|
|
@@ -1076,13 +1163,15 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
|
|
|
1076
1163
|
result = rb_hash_new();
|
|
1077
1164
|
|
|
1078
1165
|
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1079
|
-
ID2SYM(rb_intern(prof->mode ==
|
|
1166
|
+
ID2SYM(rb_intern(prof->mode == RPERF_MODE_WALL ? "wall" : "cpu")));
|
|
1080
1167
|
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
|
|
1081
1168
|
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
|
|
1082
1169
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
|
|
1083
|
-
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")),
|
|
1170
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LL2NUM(prof->stats.sampling_total_ns));
|
|
1084
1171
|
if (prof->stats.dropped_samples > 0)
|
|
1085
1172
|
rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
|
|
1173
|
+
if (prof->stats.dropped_aggregation > 0)
|
|
1174
|
+
rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(prof->stats.dropped_aggregation));
|
|
1086
1175
|
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
|
|
1087
1176
|
rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
|
|
1088
1177
|
SIZET2NUM(prof->frame_table.count));
|
|
@@ -1097,8 +1186,8 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
|
|
|
1097
1186
|
+ (int64_t)prof->start_realtime.tv_nsec;
|
|
1098
1187
|
duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
|
|
1099
1188
|
+ ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
|
|
1100
|
-
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")),
|
|
1101
|
-
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")),
|
|
1189
|
+
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LL2NUM(start_ns));
|
|
1190
|
+
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LL2NUM(duration_ns));
|
|
1102
1191
|
}
|
|
1103
1192
|
|
|
1104
1193
|
{
|
|
@@ -1124,7 +1213,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
|
|
|
1124
1213
|
|
|
1125
1214
|
VALUE sample = rb_ary_new_capa(5);
|
|
1126
1215
|
rb_ary_push(sample, frames);
|
|
1127
|
-
rb_ary_push(sample,
|
|
1216
|
+
rb_ary_push(sample, LL2NUM(e->weight));
|
|
1128
1217
|
rb_ary_push(sample, INT2NUM(e->thread_seq));
|
|
1129
1218
|
rb_ary_push(sample, INT2NUM(e->label_set_id));
|
|
1130
1219
|
rb_ary_push(sample, INT2NUM(e->vm_state));
|
|
@@ -1154,7 +1243,7 @@ static VALUE
|
|
|
1154
1243
|
rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
|
|
1155
1244
|
{
|
|
1156
1245
|
int frequency = NUM2INT(vfreq);
|
|
1157
|
-
|
|
1246
|
+
enum rperf_mode mode = (enum rperf_mode)NUM2INT(vmode);
|
|
1158
1247
|
int aggregate = RTEST(vagg) ? 1 : 0;
|
|
1159
1248
|
#if RPERF_USE_TIMER_SIGNAL
|
|
1160
1249
|
int sig = NUM2INT(vsig);
|
|
@@ -1173,13 +1262,26 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
|
|
|
1173
1262
|
g_profiler.stats.sampling_total_ns = 0;
|
|
1174
1263
|
g_profiler.stats.trigger_count = 0;
|
|
1175
1264
|
g_profiler.stats.dropped_samples = 0;
|
|
1265
|
+
g_profiler.stats.dropped_aggregation = 0;
|
|
1176
1266
|
atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
|
|
1177
1267
|
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
1178
1268
|
g_profiler.label_sets = Qnil;
|
|
1179
1269
|
|
|
1180
1270
|
/* Initialize worker mutex/cond */
|
|
1181
1271
|
CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
|
|
1272
|
+
#ifdef __linux__
|
|
1273
|
+
{
|
|
1274
|
+
/* Use CLOCK_MONOTONIC for pthread_cond_timedwait so that
|
|
1275
|
+
* system clock adjustments (NTP etc.) don't affect timer intervals. */
|
|
1276
|
+
pthread_condattr_t cond_attr;
|
|
1277
|
+
CHECKED(pthread_condattr_init(&cond_attr));
|
|
1278
|
+
CHECKED(pthread_condattr_setclock(&cond_attr, CLOCK_MONOTONIC));
|
|
1279
|
+
CHECKED(pthread_cond_init(&g_profiler.worker_cond, &cond_attr));
|
|
1280
|
+
CHECKED(pthread_condattr_destroy(&cond_attr));
|
|
1281
|
+
}
|
|
1282
|
+
#else
|
|
1182
1283
|
CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
|
|
1284
|
+
#endif
|
|
1183
1285
|
|
|
1184
1286
|
/* Initialize sample buffer(s) */
|
|
1185
1287
|
if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
|
|
@@ -1236,6 +1338,14 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
|
|
|
1236
1338
|
/* Pre-initialize current thread's time so the first sample is not skipped */
|
|
1237
1339
|
{
|
|
1238
1340
|
VALUE cur_thread = rb_thread_current();
|
|
1341
|
+
/* A stale td can survive a fork (the atfork child handler does not
|
|
1342
|
+
* free the forking thread's data) — free it before creating a fresh
|
|
1343
|
+
* one, or it would leak on every fork + restart cycle. */
|
|
1344
|
+
rperf_thread_data_t *stale = (rperf_thread_data_t *)rb_internal_thread_specific_get(cur_thread, g_profiler.ts_key);
|
|
1345
|
+
if (stale) {
|
|
1346
|
+
free(stale);
|
|
1347
|
+
rb_internal_thread_specific_set(cur_thread, g_profiler.ts_key, NULL);
|
|
1348
|
+
}
|
|
1239
1349
|
rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
|
|
1240
1350
|
if (!td) {
|
|
1241
1351
|
rb_remove_event_hook(rperf_gc_event_hook);
|
|
@@ -1300,24 +1410,34 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VAL
|
|
|
1300
1410
|
if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
|
|
1301
1411
|
g_profiler.running = 0;
|
|
1302
1412
|
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1413
|
+
/* Signal under the mutex — see rb_rperf_stop for the rationale */
|
|
1414
|
+
CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
|
|
1303
1415
|
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1416
|
+
CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
|
|
1304
1417
|
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1305
1418
|
goto timer_fail;
|
|
1306
1419
|
}
|
|
1307
1420
|
|
|
1308
|
-
its.it_value.tv_sec = 0;
|
|
1309
1421
|
if (RPERF_PAUSED(&g_profiler)) {
|
|
1310
1422
|
/* defer mode: create timer but don't arm it */
|
|
1423
|
+
its.it_value.tv_sec = 0;
|
|
1311
1424
|
its.it_value.tv_nsec = 0;
|
|
1312
1425
|
} else {
|
|
1313
|
-
|
|
1426
|
+
/* Split into sec/nsec: frequency 1 gives a 1s interval, and
|
|
1427
|
+
* tv_nsec must be < 1e9 or timer_settime fails with EINVAL */
|
|
1428
|
+
long interval_ns = 1000000000L / g_profiler.frequency;
|
|
1429
|
+
its.it_value.tv_sec = interval_ns / 1000000000L;
|
|
1430
|
+
its.it_value.tv_nsec = interval_ns % 1000000000L;
|
|
1314
1431
|
}
|
|
1315
1432
|
its.it_interval = its.it_value;
|
|
1316
1433
|
if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
|
|
1317
1434
|
timer_delete(g_profiler.timer_id);
|
|
1318
1435
|
g_profiler.running = 0;
|
|
1319
1436
|
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1437
|
+
/* Signal under the mutex — see rb_rperf_stop for the rationale */
|
|
1438
|
+
CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
|
|
1320
1439
|
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1440
|
+
CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
|
|
1321
1441
|
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1322
1442
|
goto timer_fail;
|
|
1323
1443
|
}
|
|
@@ -1378,10 +1498,15 @@ rb_rperf_stop(VALUE self)
|
|
|
1378
1498
|
}
|
|
1379
1499
|
#endif
|
|
1380
1500
|
|
|
1381
|
-
/* Wake and join worker thread.
|
|
1501
|
+
/* Wake and join worker thread. Signal while holding worker_mutex:
|
|
1502
|
+
* the worker re-checks its predicate (running) with the mutex held, so
|
|
1503
|
+
* signaling under the mutex guarantees it either sees running == 0 or is
|
|
1504
|
+
* already inside cond_wait when the signal fires — no lost wakeup.
|
|
1382
1505
|
* Any pending timer signals are still handled by rperf_signal_handler
|
|
1383
1506
|
* (just increments trigger_count + calls rb_postponed_job_trigger). */
|
|
1507
|
+
CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
|
|
1384
1508
|
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1509
|
+
CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
|
|
1385
1510
|
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1386
1511
|
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1387
1512
|
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
@@ -1436,13 +1561,15 @@ rb_rperf_stop(VALUE self)
|
|
|
1436
1561
|
|
|
1437
1562
|
result = rb_hash_new();
|
|
1438
1563
|
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1439
|
-
ID2SYM(rb_intern(g_profiler.mode ==
|
|
1564
|
+
ID2SYM(rb_intern(g_profiler.mode == RPERF_MODE_WALL ? "wall" : "cpu")));
|
|
1440
1565
|
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
|
|
1441
1566
|
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
|
|
1442
1567
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
|
|
1443
|
-
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")),
|
|
1568
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LL2NUM(g_profiler.stats.sampling_total_ns));
|
|
1444
1569
|
if (g_profiler.stats.dropped_samples > 0)
|
|
1445
1570
|
rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
|
|
1571
|
+
if (g_profiler.stats.dropped_aggregation > 0)
|
|
1572
|
+
rb_hash_aset(result, ID2SYM(rb_intern("dropped_aggregation")), SIZET2NUM(g_profiler.stats.dropped_aggregation));
|
|
1446
1573
|
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
|
|
1447
1574
|
{
|
|
1448
1575
|
struct timespec stop_monotonic;
|
|
@@ -1452,8 +1579,8 @@ rb_rperf_stop(VALUE self)
|
|
|
1452
1579
|
+ (int64_t)g_profiler.start_realtime.tv_nsec;
|
|
1453
1580
|
duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
|
|
1454
1581
|
+ ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
|
|
1455
|
-
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")),
|
|
1456
|
-
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")),
|
|
1582
|
+
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LL2NUM(start_ns));
|
|
1583
|
+
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LL2NUM(duration_ns));
|
|
1457
1584
|
}
|
|
1458
1585
|
|
|
1459
1586
|
samples_ary = rb_ary_new_capa((long)buf->sample_count);
|
|
@@ -1469,7 +1596,7 @@ rb_rperf_stop(VALUE self)
|
|
|
1469
1596
|
|
|
1470
1597
|
VALUE sample = rb_ary_new_capa(5);
|
|
1471
1598
|
rb_ary_push(sample, frames);
|
|
1472
|
-
rb_ary_push(sample,
|
|
1599
|
+
rb_ary_push(sample, LL2NUM(s->weight));
|
|
1473
1600
|
rb_ary_push(sample, INT2NUM(s->thread_seq));
|
|
1474
1601
|
rb_ary_push(sample, INT2NUM(s->label_set_id));
|
|
1475
1602
|
rb_ary_push(sample, INT2NUM(s->vm_state));
|
|
@@ -1508,6 +1635,7 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
|
|
|
1508
1635
|
prof->stats.sampling_count = 0;
|
|
1509
1636
|
prof->stats.sampling_total_ns = 0;
|
|
1510
1637
|
prof->stats.dropped_samples = 0;
|
|
1638
|
+
prof->stats.dropped_aggregation = 0;
|
|
1511
1639
|
|
|
1512
1640
|
/* Reset start timestamps so next snapshot's duration_ns covers
|
|
1513
1641
|
* only the period since this clear. */
|
|
@@ -1604,10 +1732,13 @@ rperf_arm_timer(rperf_profiler_t *prof)
|
|
|
1604
1732
|
#if RPERF_USE_TIMER_SIGNAL
|
|
1605
1733
|
if (prof->timer_signal > 0) {
|
|
1606
1734
|
struct itimerspec its;
|
|
1607
|
-
|
|
1608
|
-
its.it_value.
|
|
1735
|
+
long interval_ns = 1000000000L / prof->frequency;
|
|
1736
|
+
its.it_value.tv_sec = interval_ns / 1000000000L;
|
|
1737
|
+
its.it_value.tv_nsec = interval_ns % 1000000000L;
|
|
1609
1738
|
its.it_interval = its.it_value;
|
|
1610
|
-
timer_settime(prof->timer_id, 0, &its, NULL)
|
|
1739
|
+
if (timer_settime(prof->timer_id, 0, &its, NULL) != 0) {
|
|
1740
|
+
fprintf(stderr, "rperf: timer_settime (arm) failed: %s\n", strerror(errno));
|
|
1741
|
+
}
|
|
1611
1742
|
return;
|
|
1612
1743
|
}
|
|
1613
1744
|
#endif
|
|
@@ -1625,7 +1756,9 @@ rperf_disarm_timer(rperf_profiler_t *prof)
|
|
|
1625
1756
|
if (prof->timer_signal > 0) {
|
|
1626
1757
|
struct itimerspec its;
|
|
1627
1758
|
memset(&its, 0, sizeof(its));
|
|
1628
|
-
timer_settime(prof->timer_id, 0, &its, NULL)
|
|
1759
|
+
if (timer_settime(prof->timer_id, 0, &its, NULL) != 0) {
|
|
1760
|
+
fprintf(stderr, "rperf: timer_settime (disarm) failed: %s\n", strerror(errno));
|
|
1761
|
+
}
|
|
1629
1762
|
return;
|
|
1630
1763
|
}
|
|
1631
1764
|
#endif
|
|
@@ -1652,7 +1785,6 @@ rperf_reset_thread_times(rperf_profiler_t *prof)
|
|
|
1652
1785
|
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
1653
1786
|
if (td) {
|
|
1654
1787
|
td->prev_time_ns = rperf_current_time_ns(prof);
|
|
1655
|
-
td->prev_wall_ns = rperf_wall_time_ns();
|
|
1656
1788
|
}
|
|
1657
1789
|
}
|
|
1658
1790
|
}
|
|
@@ -1692,6 +1824,12 @@ rb_rperf_running_p(VALUE self)
|
|
|
1692
1824
|
return g_profiler.running ? Qtrue : Qfalse;
|
|
1693
1825
|
}
|
|
1694
1826
|
|
|
1827
|
+
static VALUE
|
|
1828
|
+
rb_rperf_profiler_wrapper(VALUE self)
|
|
1829
|
+
{
|
|
1830
|
+
return g_profiler_wrapper;
|
|
1831
|
+
}
|
|
1832
|
+
|
|
1695
1833
|
/* ---- Fork safety ---- */
|
|
1696
1834
|
|
|
1697
1835
|
static void
|
|
@@ -1771,6 +1909,7 @@ Init_rperf(void)
|
|
|
1771
1909
|
rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
|
|
1772
1910
|
rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
|
|
1773
1911
|
rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
|
|
1912
|
+
rb_define_module_function(mRperf, "_c_profiler_wrapper", rb_rperf_profiler_wrapper, 0);
|
|
1774
1913
|
|
|
1775
1914
|
memset(&g_profiler, 0, sizeof(g_profiler));
|
|
1776
1915
|
g_profiler.label_sets = Qnil;
|