rperf 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/rperf/rperf.c ADDED
@@ -0,0 +1,834 @@
1
+ #include <ruby.h>
2
+ #include <ruby/debug.h>
3
+ #include <ruby/thread.h>
4
+ #include <pthread.h>
5
+ #include <time.h>
6
+ #include <string.h>
7
+ #include <stdlib.h>
8
+ #include <unistd.h>
9
+ #include <signal.h>
10
+
11
+ #ifdef __linux__
12
+ #define RPERF_USE_TIMER_SIGNAL 1
13
+ #define RPERF_TIMER_SIGNAL_DEFAULT (SIGRTMIN + 8)
14
+ #else
15
+ #define RPERF_USE_TIMER_SIGNAL 0
16
+ #endif
17
+
18
+ #define RPERF_MAX_STACK_DEPTH 512
19
+ #define RPERF_INITIAL_SAMPLES 1024
20
+ #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
21
+
22
+ /* ---- Data structures ---- */
23
+
24
/* Category tag stored on each sample; drives the synthetic leaf frame
 * prepended when samples are exported by rb_rperf_stop. */
enum rperf_sample_type {
    RPERF_SAMPLE_NORMAL = 0,      /* timer/suspend sample of running Ruby code */
    RPERF_SAMPLE_GVL_BLOCKED = 1, /* off-GVL: SUSPENDED → READY */
    RPERF_SAMPLE_GVL_WAIT = 2,    /* GVL wait: READY → RESUMED */
    RPERF_SAMPLE_GC_MARKING = 3,  /* GC marking phase */
    RPERF_SAMPLE_GC_SWEEPING = 4, /* GC sweeping phase */
};
31
+
32
/* Current GC phase, maintained by rperf_gc_event_hook from the
 * GC_START / GC_END_MARK / GC_END_SWEEP internal events. */
enum rperf_gc_phase {
    RPERF_GC_NONE = 0,     /* no GC in progress */
    RPERF_GC_MARKING = 1,  /* between GC_START and GC_END_MARK */
    RPERF_GC_SWEEPING = 2, /* between GC_END_MARK and GC_END_SWEEP */
};
37
+
38
/* One recorded sample: a slice of frame_pool plus a weight in nanoseconds. */
typedef struct rperf_sample {
    int depth;          /* number of frames captured by rb_profile_frames */
    size_t frame_start; /* index into frame_pool */
    int64_t weight;     /* nanoseconds attributed to this stack */
    int type;           /* rperf_sample_type */
    int thread_seq;     /* thread sequence number (1-based) */
} rperf_sample_t;
45
+
46
/* Per-thread profiling state, attached via rb_internal_thread_specific_set
 * and freed on thread exit (rperf_handle_exited) or at stop. */
typedef struct rperf_thread_data {
    int64_t prev_cpu_ns;          /* last sample point; holds CPU or wall ns depending on mode */
    int64_t prev_wall_ns;         /* last sample point, wall clock */
    /* GVL event tracking */
    int64_t suspended_at_ns;      /* wall time at SUSPENDED */
    int64_t ready_at_ns;          /* wall time at READY */
    size_t suspended_frame_start; /* saved stack in frame_pool */
    int suspended_frame_depth;    /* saved stack depth (0 = no stack saved) */
    int thread_seq;               /* thread sequence number (1-based) */
} rperf_thread_data_t;
56
+
57
/* Whole-process profiler state. A single static instance (g_profiler)
 * exists; only one profiling session may run at a time. */
typedef struct rperf_profiler {
    int frequency;           /* samples per second (1..1000000) */
    int mode;                /* 0 = cpu, 1 = wall */
    volatile int running;    /* set by start, cleared by stop/fork child */
    pthread_t timer_thread;  /* nanosleep fallback timer thread */
#if RPERF_USE_TIMER_SIGNAL
    timer_t timer_id;
    int timer_signal;        /* >0: use timer signal, 0: use nanosleep thread */
#endif
    rb_postponed_job_handle_t pj_handle; /* preregistered sampling job */
    rperf_sample_t *samples;
    size_t sample_count;
    size_t sample_capacity;
    VALUE *frame_pool;       /* raw frame VALUEs from rb_profile_frames; marked via rperf_profiler_mark */
    size_t frame_pool_count;
    size_t frame_pool_capacity;
    rb_internal_thread_specific_key_t ts_key; /* key for rperf_thread_data_t */
    rb_internal_thread_event_hook_t *thread_hook;
    /* GC tracking */
    int gc_phase;            /* rperf_gc_phase */
    int64_t gc_enter_ns;     /* wall time at GC_ENTER */
    size_t gc_frame_start;   /* saved stack at GC_ENTER */
    int gc_frame_depth;      /* saved stack depth (0 = no pending GC sample) */
    int gc_thread_seq;       /* thread_seq at GC_ENTER */
    /* Timing metadata for pprof */
    struct timespec start_realtime;  /* CLOCK_REALTIME at start */
    struct timespec start_monotonic; /* CLOCK_MONOTONIC at start */
    /* Thread sequence counter */
    int next_thread_seq;
    /* Sampling overhead stats */
    size_t trigger_count;    /* timer fires */
    size_t sampling_count;   /* samples actually recorded by the job */
    int64_t sampling_total_ns; /* thread-CPU time spent inside rperf_sample_job */
} rperf_profiler_t;
91
+
92
/* Singleton profiler state; start/stop operate on this instance. */
static rperf_profiler_t g_profiler;
/* TypedData wrapper over g_profiler, registered with the GC so that
 * the VALUEs in frame_pool are marked (see rperf_profiler_mark). */
static VALUE g_profiler_wrapper = Qnil;
94
+
95
+ /* ---- TypedData for GC marking of frame_pool ---- */
96
+
97
+ static void
98
+ rperf_profiler_mark(void *ptr)
99
+ {
100
+ rperf_profiler_t *prof = (rperf_profiler_t *)ptr;
101
+ if (prof->frame_pool && prof->frame_pool_count > 0) {
102
+ rb_gc_mark_locations(prof->frame_pool, prof->frame_pool + prof->frame_pool_count);
103
+ }
104
+ }
105
+
106
/* TypedData type for g_profiler_wrapper. Only dmark is needed:
 * g_profiler has static storage, so there is nothing to free or size. */
static const rb_data_type_t rperf_profiler_type = {
    .wrap_struct_name = "rperf_profiler",
    .function = {
        .dmark = rperf_profiler_mark,
        .dfree = NULL, /* static storage; never freed */
        .dsize = NULL,
    },
};
114
+
115
+ /* ---- CPU time ---- */
116
+
117
+ static int64_t
118
+ rperf_cpu_time_ns(void)
119
+ {
120
+ struct timespec ts;
121
+ if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) return -1;
122
+ return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
123
+ }
124
+
125
+ /* ---- Wall time ---- */
126
+
127
+ static int64_t
128
+ rperf_wall_time_ns(void)
129
+ {
130
+ struct timespec ts;
131
+ clock_gettime(CLOCK_MONOTONIC, &ts);
132
+ return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
133
+ }
134
+
135
+ /* ---- Get current thread's time based on profiler mode ---- */
136
+
137
+ static int64_t
138
+ rperf_current_time_ns(rperf_profiler_t *prof, rperf_thread_data_t *td)
139
+ {
140
+ if (prof->mode == 0) {
141
+ return rperf_cpu_time_ns();
142
+ } else {
143
+ return rperf_wall_time_ns();
144
+ }
145
+ }
146
+
147
+ /* ---- Sample buffer ---- */
148
+
149
+ /* Returns 0 on success, -1 on allocation failure */
150
+ static int
151
+ rperf_ensure_sample_capacity(rperf_profiler_t *prof)
152
+ {
153
+ if (prof->sample_count >= prof->sample_capacity) {
154
+ size_t new_cap = prof->sample_capacity * 2;
155
+ rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
156
+ prof->samples,
157
+ new_cap * sizeof(rperf_sample_t));
158
+ if (!new_samples) return -1;
159
+ prof->samples = new_samples;
160
+ prof->sample_capacity = new_cap;
161
+ }
162
+ return 0;
163
+ }
164
+
165
+ /* ---- Frame pool ---- */
166
+
167
+ /* Ensure frame_pool has room for `needed` more entries. Returns 0 on success. */
168
+ static int
169
+ rperf_ensure_frame_pool_capacity(rperf_profiler_t *prof, int needed)
170
+ {
171
+ while (prof->frame_pool_count + (size_t)needed > prof->frame_pool_capacity) {
172
+ size_t new_cap = prof->frame_pool_capacity * 2;
173
+ VALUE *new_pool = (VALUE *)realloc(
174
+ prof->frame_pool,
175
+ new_cap * sizeof(VALUE));
176
+ if (!new_pool) return -1;
177
+ prof->frame_pool = new_pool;
178
+ prof->frame_pool_capacity = new_cap;
179
+ }
180
+ return 0;
181
+ }
182
+
183
+ /* ---- Record a sample ---- */
184
+
185
+ static void
186
+ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
187
+ int64_t weight, int type, int thread_seq)
188
+ {
189
+ if (weight <= 0) return;
190
+ if (rperf_ensure_sample_capacity(prof) < 0) return;
191
+
192
+ rperf_sample_t *sample = &prof->samples[prof->sample_count];
193
+ sample->depth = depth;
194
+ sample->frame_start = frame_start;
195
+ sample->weight = weight;
196
+ sample->type = type;
197
+ sample->thread_seq = thread_seq;
198
+ prof->sample_count++;
199
+ }
200
+
201
+ /* ---- Thread data initialization ---- */
202
+
203
+ /* Create and initialize per-thread data. Must be called on the target thread. */
204
+ static rperf_thread_data_t *
205
+ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
206
+ {
207
+ rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
208
+ if (!td) return NULL;
209
+ td->prev_cpu_ns = rperf_current_time_ns(prof, td);
210
+ td->prev_wall_ns = rperf_wall_time_ns();
211
+ td->thread_seq = ++prof->next_thread_seq;
212
+ rb_internal_thread_specific_set(thread, prof->ts_key, td);
213
+ return td;
214
+ }
215
+
216
+ /* ---- Thread event hooks ---- */
217
+
218
/* SUSPENDED hook: the thread is about to release the GVL.
 * Runs on the target thread while it still holds the GVL, so Ruby
 * APIs (rb_profile_frames, allocation-free helpers) are safe here.
 * Records the time elapsed since the last sample point as a normal
 * sample, then saves the current stack + timestamp so READY/RESUMED
 * can attribute GVL-blocked/GVL-wait time to it. */
static void
rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
{
    /* Has GVL — safe to call Ruby APIs */
    int64_t wall_now = rperf_wall_time_ns();

    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
    int is_first = 0;

    if (td == NULL) {
        td = rperf_thread_data_create(prof, thread);
        if (!td) return; /* OOM: skip this event entirely */
        is_first = 1;
    }

    int64_t time_now = rperf_current_time_ns(prof, td);
    if (time_now < 0) return;

    /* Capture backtrace into frame_pool */
    if (rperf_ensure_frame_pool_capacity(prof, RPERF_MAX_STACK_DEPTH) < 0) return;
    size_t frame_start = prof->frame_pool_count;
    int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
                                  &prof->frame_pool[frame_start], NULL);
    if (depth <= 0) return;
    prof->frame_pool_count += depth;

    /* Record normal sample (skip if first time — no prev_time baseline yet) */
    if (!is_first) {
        int64_t weight = time_now - td->prev_cpu_ns;
        rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
    }

    /* Save stack and timestamp for READY/RESUMED */
    td->suspended_at_ns = wall_now;
    td->suspended_frame_start = frame_start;
    td->suspended_frame_depth = depth;
    td->prev_cpu_ns = time_now;  /* CPU or wall ns depending on mode */
    td->prev_wall_ns = wall_now;
}
257
+
258
+ static void
259
+ rperf_handle_ready(rperf_profiler_t *prof, VALUE thread)
260
+ {
261
+ /* May NOT have GVL — only simple C operations allowed */
262
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
263
+ if (!td) return;
264
+
265
+ td->ready_at_ns = rperf_wall_time_ns();
266
+ }
267
+
268
/* RESUMED hook: the thread has just re-acquired the GVL.
 * Converts the timestamps captured at SUSPENDED/READY into a
 * GVL-blocked sample (time off the GVL) and a GVL-wait sample (time
 * runnable but waiting for the lock), both attributed to the stack
 * saved at SUSPENDED, then resets the per-thread baselines. */
static void
rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
{
    /* Has GVL */
    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);

    if (td == NULL) {
        td = rperf_thread_data_create(prof, thread);
        if (!td) return;
    }

    int64_t wall_now = rperf_wall_time_ns();

    /* Record GVL blocked/wait samples (wall mode only — cpu mode does
     * not attribute off-CPU time). Requires a stack saved at SUSPENDED. */
    if (prof->mode == 1 && td->suspended_frame_depth > 0) {
        if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
            int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
            rperf_record_sample(prof, td->suspended_frame_start,
                                td->suspended_frame_depth, blocked_ns,
                                RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
        }
        if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
            int64_t wait_ns = wall_now - td->ready_at_ns;
            rperf_record_sample(prof, td->suspended_frame_start,
                                td->suspended_frame_depth, wait_ns,
                                RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
        }
    }

    /* Reset prev times to current — next timer sample measures from resume */
    int64_t time_now = rperf_current_time_ns(prof, td);
    if (time_now >= 0) td->prev_cpu_ns = time_now;
    td->prev_wall_ns = wall_now;

    /* Clear suspended state */
    td->suspended_frame_depth = 0;
    td->ready_at_ns = 0;
}
306
+
307
+ static void
308
+ rperf_handle_exited(rperf_profiler_t *prof, VALUE thread)
309
+ {
310
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
311
+ if (td) {
312
+ free(td);
313
+ rb_internal_thread_specific_set(thread, prof->ts_key, NULL);
314
+ }
315
+ }
316
+
317
+ static void
318
+ rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_data_t *data, void *user_data)
319
+ {
320
+ rperf_profiler_t *prof = (rperf_profiler_t *)user_data;
321
+ if (!prof->running) return;
322
+
323
+ VALUE thread = data->thread;
324
+
325
+ if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
326
+ rperf_handle_suspended(prof, thread);
327
+ else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
328
+ rperf_handle_ready(prof, thread);
329
+ else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
330
+ rperf_handle_resumed(prof, thread);
331
+ else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
332
+ rperf_handle_exited(prof, thread);
333
+ }
334
+
335
+ /* ---- GC event hook ---- */
336
+
337
/* GC event hook: tracks the active GC phase and records one sample per
 * GC pause (GC_ENTER..GC_EXIT), attributed to the Ruby stack captured
 * at entry. Registered via rb_add_event_hook in rb_rperf_start; the
 * data/self/id/klass arguments are unused for internal GC events. */
static void
rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE klass)
{
    rperf_profiler_t *prof = &g_profiler;
    if (!prof->running) return;

    if (event & RUBY_INTERNAL_EVENT_GC_START) {
        prof->gc_phase = RPERF_GC_MARKING;
    }
    else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
        /* Marking done; sweeping follows. */
        prof->gc_phase = RPERF_GC_SWEEPING;
    }
    else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
        prof->gc_phase = RPERF_GC_NONE;
    }
    else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
        /* Capture backtrace and timestamp at GC entry */
        prof->gc_enter_ns = rperf_wall_time_ns();

        if (rperf_ensure_frame_pool_capacity(prof, RPERF_MAX_STACK_DEPTH) < 0) return;
        size_t frame_start = prof->frame_pool_count;
        int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
                                      &prof->frame_pool[frame_start], NULL);
        if (depth <= 0) {
            /* No usable stack: GC_EXIT sees depth 0 and skips the sample. */
            prof->gc_frame_depth = 0;
            return;
        }
        prof->frame_pool_count += depth;
        prof->gc_frame_start = frame_start;
        prof->gc_frame_depth = depth;

        /* Save thread_seq for the GC_EXIT sample */
        {
            VALUE thread = rb_thread_current();
            rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
            prof->gc_thread_seq = td ? td->thread_seq : 0; /* 0 = unknown thread */
        }
    }
    else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
        if (prof->gc_frame_depth <= 0) return;

        int64_t wall_now = rperf_wall_time_ns();
        int64_t weight = wall_now - prof->gc_enter_ns;
        /* Attribute the whole pause to whichever phase is active at exit. */
        int type = (prof->gc_phase == RPERF_GC_SWEEPING)
            ? RPERF_SAMPLE_GC_SWEEPING
            : RPERF_SAMPLE_GC_MARKING;

        rperf_record_sample(prof, prof->gc_frame_start,
                            prof->gc_frame_depth, weight, type, prof->gc_thread_seq);
        prof->gc_frame_depth = 0;
    }
}
389
+
390
+ /* ---- Sampling callback (postponed job) — current thread only ---- */
391
+
392
/* Postponed-job callback, executed on whichever Ruby thread services
 * the job after the timer fires. Samples ONLY the current thread:
 * measures time elapsed since this thread's previous sample point,
 * captures its backtrace, and records one weighted normal sample.
 * Also accumulates the job's own thread-CPU cost into the overhead
 * stats. */
static void
rperf_sample_job(void *arg)
{
    rperf_profiler_t *prof = (rperf_profiler_t *)arg;

    if (!prof->running) return;

    /* Measure sampling overhead (thread CPU time spent in this job) */
    struct timespec ts_start, ts_end;
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_start);

    VALUE thread = rb_thread_current();

    /* Get/create per-thread data */
    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
    if (td == NULL) {
        td = rperf_thread_data_create(prof, thread);
        if (!td) return;
        /* Unconditional: a freshly created td has no previous sample
         * point to diff against, so this tick records nothing. */
        return; /* Skip first sample for this thread */
    }

    int64_t time_now = rperf_current_time_ns(prof, td);
    if (time_now < 0) return;

    /* prev_cpu_ns holds CPU or wall ns depending on mode. */
    int64_t weight = time_now - td->prev_cpu_ns;
    td->prev_cpu_ns = time_now;
    td->prev_wall_ns = rperf_wall_time_ns();

    if (weight <= 0) return;

    /* Capture backtrace and record sample */
    if (rperf_ensure_frame_pool_capacity(prof, RPERF_MAX_STACK_DEPTH) < 0) return;

    size_t frame_start = prof->frame_pool_count;
    int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
                                  &prof->frame_pool[frame_start], NULL);
    if (depth <= 0) return;
    prof->frame_pool_count += depth;

    rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);

    /* Overhead stats only count samples that were actually recorded
     * (the early returns above skip this bookkeeping). */
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
    prof->sampling_count++;
    prof->sampling_total_ns +=
        ((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
        (ts_end.tv_nsec - ts_start.tv_nsec);
}
439
+
440
+ /* ---- Timer ---- */
441
+
442
#if RPERF_USE_TIMER_SIGNAL
/* POSIX timer signal handler: fires at the sampling frequency and
 * schedules the sampling job. NOTE(review): relies on
 * rb_postponed_job_trigger being callable from a signal handler —
 * confirm against the Ruby version in use. The unsynchronized
 * trigger_count++ is a best-effort stats counter and may lose
 * increments; presumably acceptable. */
static void
rperf_signal_handler(int sig)
{
    g_profiler.trigger_count++;
    rb_postponed_job_trigger(g_profiler.pj_handle);
}
#endif
450
+
451
+ static void *
452
+ rperf_timer_func(void *arg)
453
+ {
454
+ rperf_profiler_t *prof = (rperf_profiler_t *)arg;
455
+ struct timespec interval;
456
+ interval.tv_sec = 0;
457
+ interval.tv_nsec = 1000000000L / prof->frequency;
458
+
459
+ while (prof->running) {
460
+ prof->trigger_count++;
461
+ rb_postponed_job_trigger(prof->pj_handle);
462
+ nanosleep(&interval, NULL);
463
+ }
464
+ return NULL;
465
+ }
466
+
467
+ /* ---- Resolve frame VALUE to [path, label] Ruby strings ---- */
468
+
469
+ static VALUE
470
+ rperf_resolve_frame(VALUE fval)
471
+ {
472
+ VALUE path = rb_profile_frame_path(fval);
473
+ VALUE label = rb_profile_frame_full_label(fval);
474
+
475
+ if (NIL_P(path)) path = rb_str_new_lit("<C method>");
476
+
477
+ if (NIL_P(path)) path = rb_str_new_cstr("");
478
+ if (NIL_P(label)) label = rb_str_new_cstr("");
479
+
480
+ return rb_ary_new3(2, path, label);
481
+ }
482
+
483
+ /* ---- Ruby API ---- */
484
+
485
+ static VALUE
486
+ rb_rperf_start(int argc, VALUE *argv, VALUE self)
487
+ {
488
+ VALUE opts;
489
+ int frequency = 1000;
490
+ int mode = 0; /* 0 = cpu, 1 = wall */
491
+ #if RPERF_USE_TIMER_SIGNAL
492
+ int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
493
+ #endif
494
+
495
+ rb_scan_args(argc, argv, ":", &opts);
496
+ if (!NIL_P(opts)) {
497
+ VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
498
+ if (!NIL_P(vfreq)) {
499
+ frequency = NUM2INT(vfreq);
500
+ if (frequency <= 0 || frequency > 1000000) {
501
+ rb_raise(rb_eArgError, "frequency must be between 1 and 1000000");
502
+ }
503
+ }
504
+ VALUE vmode = rb_hash_aref(opts, ID2SYM(rb_intern("mode")));
505
+ if (!NIL_P(vmode)) {
506
+ ID mode_id = SYM2ID(vmode);
507
+ if (mode_id == rb_intern("cpu")) {
508
+ mode = 0;
509
+ } else if (mode_id == rb_intern("wall")) {
510
+ mode = 1;
511
+ } else {
512
+ rb_raise(rb_eArgError, "mode must be :cpu or :wall");
513
+ }
514
+ }
515
+ #if RPERF_USE_TIMER_SIGNAL
516
+ VALUE vsig = rb_hash_aref(opts, ID2SYM(rb_intern("signal")));
517
+ if (!NIL_P(vsig)) {
518
+ if (RTEST(vsig)) {
519
+ timer_signal = NUM2INT(vsig);
520
+ if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
521
+ rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
522
+ SIGRTMIN, SIGRTMAX);
523
+ }
524
+ } else {
525
+ /* signal: false or signal: 0 → use nanosleep thread */
526
+ timer_signal = 0;
527
+ }
528
+ }
529
+ #endif
530
+ }
531
+
532
+ if (g_profiler.running) {
533
+ rb_raise(rb_eRuntimeError, "Rperf is already running");
534
+ }
535
+
536
+ g_profiler.frequency = frequency;
537
+ g_profiler.mode = mode;
538
+ g_profiler.sample_count = 0;
539
+ g_profiler.next_thread_seq = 0;
540
+ g_profiler.sampling_count = 0;
541
+ g_profiler.sampling_total_ns = 0;
542
+ g_profiler.sample_capacity = RPERF_INITIAL_SAMPLES;
543
+ g_profiler.samples = (rperf_sample_t *)calloc(
544
+ g_profiler.sample_capacity, sizeof(rperf_sample_t));
545
+ if (!g_profiler.samples) {
546
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate sample buffer");
547
+ }
548
+
549
+ g_profiler.frame_pool_count = 0;
550
+ g_profiler.frame_pool_capacity = RPERF_INITIAL_FRAME_POOL;
551
+ g_profiler.frame_pool = (VALUE *)calloc(
552
+ g_profiler.frame_pool_capacity, sizeof(VALUE));
553
+ if (!g_profiler.frame_pool) {
554
+ free(g_profiler.samples);
555
+ g_profiler.samples = NULL;
556
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate frame pool");
557
+ }
558
+
559
+ /* Register GC event hook */
560
+ g_profiler.gc_phase = RPERF_GC_NONE;
561
+ g_profiler.gc_frame_depth = 0;
562
+ rb_add_event_hook(rperf_gc_event_hook,
563
+ RUBY_INTERNAL_EVENT_GC_START |
564
+ RUBY_INTERNAL_EVENT_GC_END_MARK |
565
+ RUBY_INTERNAL_EVENT_GC_END_SWEEP |
566
+ RUBY_INTERNAL_EVENT_GC_ENTER |
567
+ RUBY_INTERNAL_EVENT_GC_EXIT,
568
+ Qnil);
569
+
570
+ /* Register thread event hook for all events */
571
+ g_profiler.thread_hook = rb_internal_thread_add_event_hook(
572
+ rperf_thread_event_hook,
573
+ RUBY_INTERNAL_THREAD_EVENT_EXITED |
574
+ RUBY_INTERNAL_THREAD_EVENT_SUSPENDED |
575
+ RUBY_INTERNAL_THREAD_EVENT_READY |
576
+ RUBY_INTERNAL_THREAD_EVENT_RESUMED,
577
+ &g_profiler);
578
+
579
+ /* Pre-initialize current thread's time so the first sample is not skipped */
580
+ {
581
+ VALUE cur_thread = rb_thread_current();
582
+ rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
583
+ if (!td) {
584
+ free(g_profiler.samples);
585
+ g_profiler.samples = NULL;
586
+ free(g_profiler.frame_pool);
587
+ g_profiler.frame_pool = NULL;
588
+ rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
589
+ g_profiler.thread_hook = NULL;
590
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
591
+ }
592
+ }
593
+
594
+ clock_gettime(CLOCK_REALTIME, &g_profiler.start_realtime);
595
+ clock_gettime(CLOCK_MONOTONIC, &g_profiler.start_monotonic);
596
+
597
+ g_profiler.running = 1;
598
+
599
+ #if RPERF_USE_TIMER_SIGNAL
600
+ g_profiler.timer_signal = timer_signal;
601
+
602
+ if (timer_signal > 0) {
603
+ struct sigaction sa;
604
+ struct sigevent sev;
605
+ struct itimerspec its;
606
+
607
+ memset(&sa, 0, sizeof(sa));
608
+ sa.sa_handler = rperf_signal_handler;
609
+ sa.sa_flags = SA_RESTART;
610
+ sigaction(g_profiler.timer_signal, &sa, NULL);
611
+
612
+ memset(&sev, 0, sizeof(sev));
613
+ sev.sigev_notify = SIGEV_SIGNAL;
614
+ sev.sigev_signo = g_profiler.timer_signal;
615
+ if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
616
+ g_profiler.running = 0;
617
+ signal(g_profiler.timer_signal, SIG_DFL);
618
+ goto timer_fail;
619
+ }
620
+
621
+ its.it_value.tv_sec = 0;
622
+ its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
623
+ its.it_interval = its.it_value;
624
+ timer_settime(g_profiler.timer_id, 0, &its, NULL);
625
+ } else
626
+ #endif
627
+ {
628
+ if (pthread_create(&g_profiler.timer_thread, NULL, rperf_timer_func, &g_profiler) != 0) {
629
+ g_profiler.running = 0;
630
+ goto timer_fail;
631
+ }
632
+ }
633
+
634
+ if (0) {
635
+ timer_fail:
636
+ {
637
+ VALUE cur = rb_thread_current();
638
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(cur, g_profiler.ts_key);
639
+ if (td) {
640
+ free(td);
641
+ rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
642
+ }
643
+ }
644
+ rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
645
+ g_profiler.thread_hook = NULL;
646
+ free(g_profiler.samples);
647
+ g_profiler.samples = NULL;
648
+ free(g_profiler.frame_pool);
649
+ g_profiler.frame_pool = NULL;
650
+ rb_raise(rb_eRuntimeError, "rperf: failed to create timer");
651
+ }
652
+
653
+ return Qtrue;
654
+ }
655
+
656
+ static VALUE
657
+ rb_rperf_stop(VALUE self)
658
+ {
659
+ VALUE result, samples_ary;
660
+ size_t i;
661
+ int j;
662
+
663
+ if (!g_profiler.running) {
664
+ return Qnil;
665
+ }
666
+
667
+ g_profiler.running = 0;
668
+ #if RPERF_USE_TIMER_SIGNAL
669
+ if (g_profiler.timer_signal > 0) {
670
+ timer_delete(g_profiler.timer_id);
671
+ signal(g_profiler.timer_signal, SIG_DFL);
672
+ } else
673
+ #endif
674
+ {
675
+ pthread_join(g_profiler.timer_thread, NULL);
676
+ }
677
+
678
+ if (g_profiler.thread_hook) {
679
+ rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
680
+ g_profiler.thread_hook = NULL;
681
+ }
682
+
683
+ /* Remove GC event hook */
684
+ rb_remove_event_hook(rperf_gc_event_hook);
685
+
686
+ /* Clean up thread-specific data for all live threads */
687
+ {
688
+ VALUE threads = rb_funcall(rb_cThread, rb_intern("list"), 0);
689
+ long tc = RARRAY_LEN(threads);
690
+ long ti;
691
+ for (ti = 0; ti < tc; ti++) {
692
+ VALUE thread = RARRAY_AREF(threads, ti);
693
+ rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
694
+ if (td) {
695
+ free(td);
696
+ rb_internal_thread_specific_set(thread, g_profiler.ts_key, NULL);
697
+ }
698
+ }
699
+ }
700
+
701
+ /* Build result hash */
702
+ result = rb_hash_new();
703
+
704
+ /* mode */
705
+ rb_hash_aset(result, ID2SYM(rb_intern("mode")),
706
+ ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
707
+
708
+ /* frequency */
709
+ rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
710
+
711
+ /* trigger_count, sampling_count, sampling_time_ns */
712
+ rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
713
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
714
+ rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
715
+
716
+ /* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
717
+ {
718
+ struct timespec stop_monotonic;
719
+ int64_t start_ns, duration_ns;
720
+ clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
721
+ start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
722
+ + (int64_t)g_profiler.start_realtime.tv_nsec;
723
+ duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
724
+ + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
725
+ rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
726
+ rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
727
+ }
728
+
729
+ /* samples: array of [frames_array, weight]
730
+ * Each frame is [path_string, label_string]
731
+ * GVL blocked/wait samples get synthetic frame prepended (leaf position) */
732
+ samples_ary = rb_ary_new_capa((long)g_profiler.sample_count);
733
+ for (i = 0; i < g_profiler.sample_count; i++) {
734
+ rperf_sample_t *s = &g_profiler.samples[i];
735
+ VALUE frames = rb_ary_new_capa(s->depth + 1);
736
+
737
+ /* Prepend synthetic frame at leaf position (index 0) */
738
+ if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
739
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
740
+ rb_ary_push(frames, syn);
741
+ } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
742
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
743
+ rb_ary_push(frames, syn);
744
+ } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
745
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
746
+ rb_ary_push(frames, syn);
747
+ } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
748
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
749
+ rb_ary_push(frames, syn);
750
+ }
751
+
752
+ for (j = 0; j < s->depth; j++) {
753
+ VALUE fval = g_profiler.frame_pool[s->frame_start + j];
754
+ rb_ary_push(frames, rperf_resolve_frame(fval));
755
+ }
756
+
757
+ VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
758
+ rb_ary_push(samples_ary, sample);
759
+ }
760
+ rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
761
+
762
+ /* Cleanup */
763
+ free(g_profiler.samples);
764
+ g_profiler.samples = NULL;
765
+ free(g_profiler.frame_pool);
766
+ g_profiler.frame_pool = NULL;
767
+ g_profiler.frame_pool_count = 0;
768
+
769
+ return result;
770
+ }
771
+
772
+ /* ---- Fork safety ---- */
773
+
774
/* pthread_atfork child handler: the timer (signal timer or nanosleep
 * thread) does not exist in the child, so silently stop profiling and
 * drop all state inherited from the parent.
 * NOTE(review): this calls Ruby hook-removal APIs from the atfork
 * child handler — confirm these are safe to call at this point in the
 * child for the Ruby versions supported. */
static void
rperf_after_fork_child(void)
{
    if (!g_profiler.running) return;

    /* Mark as not running — timer doesn't exist in child */
    g_profiler.running = 0;

#if RPERF_USE_TIMER_SIGNAL
    /* timer_create timers are not inherited across fork; reset signal handler */
    if (g_profiler.timer_signal > 0) {
        signal(g_profiler.timer_signal, SIG_DFL);
    }
#endif

    /* Remove hooks so they don't fire with stale state */
    if (g_profiler.thread_hook) {
        rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
        g_profiler.thread_hook = NULL;
    }
    rb_remove_event_hook(rperf_gc_event_hook);

    /* Free sample buffer and frame pool — these hold parent's data.
     * Capacities are zeroed too; the ensure_* helpers must therefore
     * tolerate a zero capacity on any later restart. */
    free(g_profiler.samples);
    g_profiler.samples = NULL;
    g_profiler.sample_count = 0;
    g_profiler.sample_capacity = 0;

    free(g_profiler.frame_pool);
    g_profiler.frame_pool = NULL;
    g_profiler.frame_pool_count = 0;
    g_profiler.frame_pool_capacity = 0;

    /* Reset GC state */
    g_profiler.gc_phase = 0;

    /* Reset stats */
    g_profiler.sampling_count = 0;
    g_profiler.sampling_total_ns = 0;
}
814
+
815
+ /* ---- Init ---- */
816
+
817
/* Extension entry point: defines Rperf._c_start/_c_stop and sets up
 * process-lifetime state (preregistered postponed job, thread-specific
 * key, GC wrapper for frame_pool marking, fork handler). */
void
Init_rperf(void)
{
    VALUE mRperf = rb_define_module("Rperf");
    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, -1);
    rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);

    memset(&g_profiler, 0, sizeof(g_profiler));
    /* NOTE(review): the handle is not checked against
     * POSTPONED_JOB_HANDLE_INVALID — confirm preregistration cannot
     * fail this early in process life. */
    g_profiler.pj_handle = rb_postponed_job_preregister(0, rperf_sample_job, &g_profiler);
    g_profiler.ts_key = rb_internal_thread_specific_key_create();

    /* TypedData wrapper for GC marking of frame_pool */
    g_profiler_wrapper = TypedData_Wrap_Struct(rb_cObject, &rperf_profiler_type, &g_profiler);
    rb_gc_register_address(&g_profiler_wrapper);

    /* Fork safety: silently stop profiling in child process */
    pthread_atfork(NULL, NULL, rperf_after_fork_child);
}