rperf 0.6.0 → 0.8.0

This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
data/ext/rperf/rperf.c CHANGED
```diff
@@ -8,6 +8,7 @@
 #include <unistd.h>
 #include <signal.h>
 #include <stdatomic.h>
+#include <sched.h>
 #ifdef __linux__
 #include <sys/syscall.h>
 #endif
@@ -36,23 +37,19 @@
 #define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
 #define RPERF_AGG_TABLE_INITIAL 1024
 #define RPERF_STACK_POOL_INITIAL 4096
-
-/* Synthetic frame IDs (reserved in frame_table, 0-based) */
-#define RPERF_SYNTHETIC_GVL_BLOCKED 0
-#define RPERF_SYNTHETIC_GVL_WAIT 1
-#define RPERF_SYNTHETIC_GC_MARKING 2
-#define RPERF_SYNTHETIC_GC_SWEEPING 3
-#define RPERF_SYNTHETIC_COUNT 4
+#define RPERF_PAUSED(prof) ((prof)->profile_refcount == 0)
+
+/* VM state values (stored in samples, not as stack frames) */
+enum rperf_vm_state {
+    RPERF_VM_STATE_NORMAL = 0,
+    RPERF_VM_STATE_GVL_BLOCKED = 1,
+    RPERF_VM_STATE_GVL_WAIT = 2,
+    RPERF_VM_STATE_GC_MARKING = 3,
+    RPERF_VM_STATE_GC_SWEEPING = 4,
+};
 
 /* ---- Data structures ---- */
 
-enum rperf_sample_type {
-    RPERF_SAMPLE_NORMAL = 0,
-    RPERF_SAMPLE_GVL_BLOCKED = 1, /* off-GVL: SUSPENDED → READY */
-    RPERF_SAMPLE_GVL_WAIT = 2,    /* GVL wait: READY → RESUMED */
-    RPERF_SAMPLE_GC_MARKING = 3,  /* GC marking phase */
-    RPERF_SAMPLE_GC_SWEEPING = 4, /* GC sweeping phase */
-};
 
 enum rperf_gc_phase {
     RPERF_GC_NONE = 0,
```
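In 0.6.0, GVL and GC states were encoded as synthetic frames prepended to each stack; 0.8.0 instead stores the state as a per-sample enum and appends it to each emitted sample tuple (see the `vm_state` element added to sample arrays further down). A consumer that wants the old leaf labels back can map the enum itself; a minimal sketch, with a hypothetical helper name:

```c
/* Hypothetical consumer-side helper: maps the 0.8.0 vm_state value (the
 * last element of each sample tuple) back to the labels that 0.6.0 used
 * to inject as synthetic leaf frames. */
static const char *
vm_state_label(int vm_state)
{
    switch (vm_state) {
    case 1:  return "[GVL blocked]";  /* RPERF_VM_STATE_GVL_BLOCKED */
    case 2:  return "[GVL wait]";     /* RPERF_VM_STATE_GVL_WAIT */
    case 3:  return "[GC marking]";   /* RPERF_VM_STATE_GC_MARKING */
    case 4:  return "[GC sweeping]";  /* RPERF_VM_STATE_GC_SWEEPING */
    default: return NULL;             /* RPERF_VM_STATE_NORMAL: no extra frame */
    }
}
```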
```diff
@@ -64,7 +61,7 @@ typedef struct rperf_sample {
     int depth;
     size_t frame_start; /* index into frame_pool */
     int64_t weight;
-    int type; /* rperf_sample_type */
+    enum rperf_vm_state vm_state;
     int thread_seq;   /* thread sequence number (1-based) */
     int label_set_id; /* label set ID (0 = no labels) */
 } rperf_sample_t;
@@ -86,7 +83,7 @@ typedef struct rperf_sample_buffer {
 
 typedef struct rperf_frame_table {
     _Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
-    size_t count;          /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
+    size_t count;          /* = next frame_id */
     size_t capacity;
     uint32_t *buckets;     /* open addressing: stores index into keys[] */
     size_t bucket_capacity;
@@ -102,9 +99,10 @@ typedef struct rperf_frame_table {
 
 typedef struct rperf_agg_entry {
     uint32_t frame_start; /* offset into stack_pool */
-    int depth;            /* includes synthetic frame */
+    int depth;
     int thread_seq;
     int label_set_id;     /* label set ID (0 = no labels) */
+    enum rperf_vm_state vm_state;
     int64_t weight;       /* accumulated */
     uint32_t hash;        /* cached hash value */
     int used;             /* 0 = empty, 1 = used */
@@ -144,6 +142,7 @@ typedef struct rperf_stats {
     size_t trigger_count;
     size_t sampling_count;
     int64_t sampling_total_ns;
+    size_t dropped_samples; /* samples lost due to allocation failure */
 } rperf_stats_t;
 
 typedef struct rperf_profiler {
@@ -182,6 +181,12 @@ typedef struct rperf_profiler {
     /* Label sets: Ruby Array of Hash objects, managed from Ruby side.
      * Index 0 is reserved (no labels). GC-marked via profiler_mark. */
     VALUE label_sets; /* Ruby Array or Qnil */
+    /* Profile refcount: controls timer active/paused state.
+     * start(defer:false) sets to 1, start(defer:true) sets to 0.
+     * profile_inc/dec transitions 0↔1 arm/disarm the timer.
+     * Modified only under GVL, so plain int is safe. */
+    int profile_refcount;
+    int worker_paused; /* 1 when nanosleep worker is in paused cond_wait */
 } rperf_profiler_t;
 
 static rperf_profiler_t g_profiler;
@@ -215,8 +220,7 @@ rperf_profiler_mark(void *ptr)
         size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
         VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
         if (ft_keys && ft_count > 0) {
-            rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
-                                 ft_keys + ft_count);
+            rb_gc_mark_locations(ft_keys, ft_keys + ft_count);
         }
     }
 }
@@ -253,7 +257,7 @@ rperf_wall_time_ns(void)
 /* ---- Get current thread's time based on profiler mode ---- */
 
 static int64_t
-rperf_current_time_ns(rperf_profiler_t *prof, rperf_thread_data_t *td)
+rperf_current_time_ns(rperf_profiler_t *prof)
 {
     if (prof->mode == 0) {
         return rperf_cpu_time_ns();
@@ -296,6 +300,7 @@ static int
 rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
 {
     if (buf->sample_count >= buf->sample_capacity) {
+        if (buf->sample_capacity > SIZE_MAX / 2) return -1;
         size_t new_cap = buf->sample_capacity * 2;
         rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
             buf->samples,
```
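Every doubling path in this release (sample buffer, frame pool, both hash tables, stack pool) gains the same guard: once the capacity exceeds `SIZE_MAX / 2`, `capacity * 2` would wrap around and `realloc` would be handed a tiny size. A standalone sketch of the pattern, with a hypothetical helper name; note the diff checks only the element count, while dividing by the element size as well (done here) additionally protects the byte-count multiply:

```c
#include <stdint.h>
#include <stdlib.h>

/* Grow an array geometrically without risking size_t overflow.
 * Returns 0 on success, -1 if doubling would wrap or realloc fails. */
static int
grow_array(void **items, size_t *capacity, size_t elem_size)
{
    if (*capacity > SIZE_MAX / 2 / elem_size)
        return -1;                 /* 2 * capacity * elem_size would overflow */
    size_t new_cap = *capacity * 2;
    void *p = realloc(*items, new_cap * elem_size);
    if (!p)
        return -1;                 /* old block stays valid on failure */
    *items = p;
    *capacity = new_cap;
    return 0;
}
```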
```diff
@@ -314,6 +319,7 @@ static int
 rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
 {
     while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
+        if (buf->frame_pool_capacity > SIZE_MAX / 2) return -1;
         size_t new_cap = buf->frame_pool_capacity * 2;
         VALUE *new_pool = (VALUE *)realloc(
             buf->frame_pool,
@@ -334,7 +340,7 @@ rperf_frame_table_init(rperf_frame_table_t *ft)
     VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
     if (!keys) return -1;
     atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
-    ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
+    ft->count = 0;
     ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
     ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
     if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
@@ -366,6 +372,7 @@ rperf_frame_table_free(rperf_frame_table_t *ft)
 static void
 rperf_frame_table_rehash(rperf_frame_table_t *ft)
 {
+    if (ft->bucket_capacity > SIZE_MAX / 2) return;
     size_t new_cap = ft->bucket_capacity * 2;
     uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
     if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -373,7 +380,7 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
 
     VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
     size_t i;
-    for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+    for (i = 0; i < ft->count; i++) {
         uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
         size_t idx = h % new_cap;
         while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
@@ -394,11 +401,13 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
     uint32_t h = (uint32_t)(fval >> 3);
     size_t idx = h % ft->bucket_capacity;
 
+    size_t probes = 0;
     while (1) {
         uint32_t slot = ft->buckets[idx];
         if (slot == RPERF_FRAME_TABLE_EMPTY) break;
         if (keys[slot] == fval) return slot;
         idx = (idx + 1) % ft->bucket_capacity;
+        if (++probes >= ft->bucket_capacity) return RPERF_FRAME_TABLE_EMPTY; /* table full */
     }
 
     /* Insert new entry. Grow keys array if capacity is exhausted.
```
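The insert loops previously assumed linear probing would always reach an empty bucket. Since a rehash can be skipped after a failed `malloc` (see `rperf_frame_table_rehash` above), a completely full table would make the sampling path spin forever; the new probe counter caps the scan at one full pass. The shape of the fix, isolated into a standalone sketch with hypothetical names:

```c
#include <stddef.h>
#include <stdint.h>

#define SLOT_EMPTY UINT32_MAX

/* Bounded linear probing: returns the bucket index holding `key`, or the
 * first empty bucket where it could be inserted, or `cap` when every
 * bucket is occupied (caller then drops the entry instead of looping). */
static size_t
bounded_probe(const uint32_t *buckets, const uint64_t *keys,
              size_t cap, uint32_t hash, uint64_t key)
{
    size_t idx = hash % cap;
    for (size_t probes = 0; probes < cap; probes++) {
        uint32_t slot = buckets[idx];
        if (slot == SLOT_EMPTY || keys[slot] == key)
            return idx;            /* hit, or a free slot for insertion */
        idx = (idx + 1) % cap;     /* linear probe to the next bucket */
    }
    return cap;                    /* table full */
}
```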
```diff
@@ -406,6 +415,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
      * the old keys pointer. Instead, allocate new, copy, swap pointer
      * atomically, and keep old array alive until stop. */
     if (ft->count >= ft->capacity) {
+        if (ft->capacity > SIZE_MAX / 2) return RPERF_FRAME_TABLE_EMPTY;
         size_t new_cap = ft->capacity * 2;
         VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
         if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
@@ -442,7 +452,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
 
 static uint32_t
-rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
+rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id, enum rperf_vm_state vm_state)
 {
     uint32_t h = 2166136261u;
     int i;
@@ -454,6 +464,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
     h *= 16777619u;
     h ^= (uint32_t)label_set_id;
     h *= 16777619u;
+    h ^= (uint32_t)vm_state;
+    h *= 16777619u;
     return h;
 }
 
```
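The aggregation key hash is 32-bit FNV-1a: 2166136261 is the standard offset basis and 16777619 the FNV prime, applied word-wise rather than byte-wise, as the visible tail does for `thread_seq` and `label_set_id`. 0.8.0 simply folds `vm_state` into the same chain, so two otherwise identical stacks in different VM states land in different aggregation entries. The whole function, reconstructed under the assumption that the unchanged middle follows the same word-wise step:

```c
#include <stdint.h>

/* Word-wise FNV-1a over the frame-id stack plus per-sample metadata.
 * Reconstruction for illustration; only the tail is visible in the hunk. */
static uint32_t
fnv1a_u32(const uint32_t *data, int len,
          int thread_seq, int label_set_id, int vm_state)
{
    uint32_t h = 2166136261u;                      /* FNV-1a offset basis */
    for (int i = 0; i < len; i++) {
        h ^= data[i];
        h *= 16777619u;                            /* FNV prime */
    }
    h ^= (uint32_t)thread_seq;   h *= 16777619u;
    h ^= (uint32_t)label_set_id; h *= 16777619u;
    h ^= (uint32_t)vm_state;     h *= 16777619u;   /* new in 0.8.0 */
    return h;
}
```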
```diff
@@ -482,6 +494,7 @@ rperf_agg_table_free(rperf_agg_table_t *at)
 static void
 rperf_agg_table_rehash(rperf_agg_table_t *at)
 {
+    if (at->bucket_capacity > SIZE_MAX / 2) return;
     size_t new_cap = at->bucket_capacity * 2;
     rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
     if (!new_buckets) return; /* keep using current buckets at higher load factor */
@@ -506,6 +519,7 @@ static int
 rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 {
     while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
+        if (at->stack_pool_capacity > SIZE_MAX / 2) return -1;
         size_t new_cap = at->stack_pool_capacity * 2;
         uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
                                                  new_cap * sizeof(uint32_t));
@@ -520,15 +534,16 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 static void
 rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
                        int depth, int thread_seq, int label_set_id,
-                       int64_t weight, uint32_t hash)
+                       enum rperf_vm_state vm_state, int64_t weight, uint32_t hash)
 {
     size_t idx = hash % at->bucket_capacity;
 
+    size_t probes = 0;
     while (1) {
         rperf_agg_entry_t *e = &at->buckets[idx];
         if (!e->used) break;
         if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
-            e->label_set_id == label_set_id &&
+            e->label_set_id == label_set_id && e->vm_state == vm_state &&
             memcmp(at->stack_pool + e->frame_start, frame_ids,
                    depth * sizeof(uint32_t)) == 0) {
             /* Match — merge weight */
@@ -536,6 +551,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
             return;
         }
         idx = (idx + 1) % at->bucket_capacity;
+        if (++probes >= at->bucket_capacity) return; /* table full, drop sample */
     }
 
     /* New entry — append frame_ids to stack_pool */
@@ -546,6 +562,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
     e->depth = depth;
     e->thread_seq = thread_seq;
     e->label_set_id = label_set_id;
+    e->vm_state = vm_state;
     e->weight = weight;
     e->hash = hash;
     e->used = 1;
@@ -567,24 +584,16 @@ static void
 rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
 {
     size_t i;
-    uint32_t temp_ids[RPERF_MAX_STACK_DEPTH + 1];
+    uint32_t temp_ids[RPERF_MAX_STACK_DEPTH];
 
     for (i = 0; i < buf->sample_count; i++) {
         rperf_sample_t *s = &buf->samples[i];
-        int off = 0;
         uint32_t hash;
         int j;
 
-        /* Prepend synthetic frame if needed */
-        if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GVL_BLOCKED;
-        } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
-        } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
-        } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
-            temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
-        }
+        /* Clamp depth to temp_ids[] capacity */
+        if (s->depth > RPERF_MAX_STACK_DEPTH)
+            s->depth = RPERF_MAX_STACK_DEPTH;
 
         /* Convert VALUE frames to frame_ids */
         int overflow = 0;
@@ -592,15 +601,15 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
             VALUE fval = buf->frame_pool[s->frame_start + j];
             uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
             if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
-            temp_ids[off + j] = fid;
+            temp_ids[j] = fid;
         }
         if (overflow) break; /* frame_table full, stop aggregating this buffer */
 
-        int total_depth = off + s->depth;
-        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
+        hash = rperf_fnv1a_u32(temp_ids, s->depth, s->thread_seq, s->label_set_id, s->vm_state);
 
-        rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
-                               s->thread_seq, s->label_set_id, s->weight, hash);
+        rperf_agg_table_insert(&prof->agg_table, temp_ids, s->depth,
+                               s->thread_seq, s->label_set_id, s->vm_state,
+                               s->weight, hash);
     }
 
     /* Reset buffer for reuse.
@@ -650,7 +659,7 @@ rperf_try_swap(rperf_profiler_t *prof)
 /* Write a sample into a specific buffer. No swap check. */
 static int
 rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
-                   int64_t weight, int type, int thread_seq, int label_set_id)
+                   int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
 {
     if (weight <= 0) return 0;
     if (rperf_ensure_sample_capacity(buf) < 0) return -1;
@@ -659,7 +668,7 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
     sample->depth = depth;
     sample->frame_start = frame_start;
     sample->weight = weight;
-    sample->type = type;
+    sample->vm_state = vm_state;
     sample->thread_seq = thread_seq;
     sample->label_set_id = label_set_id;
     buf->sample_count++;
@@ -668,10 +677,11 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
 
 static void
 rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
-                    int64_t weight, int type, int thread_seq, int label_set_id)
+                    int64_t weight, enum rperf_vm_state vm_state, int thread_seq, int label_set_id)
 {
     rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
-    rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
+    if (rperf_write_sample(buf, frame_start, depth, weight, vm_state, thread_seq, label_set_id) < 0)
+        prof->stats.dropped_samples++;
     rperf_try_swap(prof);
 }
 
@@ -683,7 +693,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 {
     rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
     if (!td) return NULL;
-    td->prev_time_ns = rperf_current_time_ns(prof, td);
+    td->prev_time_ns = rperf_current_time_ns(prof);
     td->prev_wall_ns = rperf_wall_time_ns();
     td->thread_seq = ++prof->next_thread_seq;
     rb_internal_thread_specific_set(thread, prof->ts_key, td);
@@ -706,7 +716,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
         is_first = 1;
     }
 
-    int64_t time_now = rperf_current_time_ns(prof, td);
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now < 0) return;
 
     /* Capture backtrace into active buffer's frame_pool */
@@ -718,10 +728,10 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t
     if (depth <= 0) return;
     buf->frame_pool_count += depth;
 
-    /* Record normal sample (skip if first time — no prev_time) */
-    if (!is_first) {
+    /* Record normal sample (skip if first time — no prev_time, or if paused) */
+    if (!is_first && !RPERF_PAUSED(prof)) {
         int64_t weight = time_now - td->prev_time_ns;
-        rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
+        rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
     }
 
     /* Save timestamp for READY/RESUMED */
@@ -758,7 +768,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
      * Both samples are written directly into the same buffer before calling
      * rperf_try_swap, so that a swap triggered by the first sample cannot
      * move the second into a different buffer with a stale frame_start. */
-    if (prof->mode == 1 && td->suspended_at_ns > 0) {
+    if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
         rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
         if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
         size_t frame_start = buf->frame_pool_count;
@@ -770,13 +780,15 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
         /* Write both samples into the same buf, then swap-check once */
         if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
             int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
-            rperf_write_sample(buf, frame_start, depth, blocked_ns,
-                               RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
+            if (rperf_write_sample(buf, frame_start, depth, blocked_ns,
+                                   RPERF_VM_STATE_GVL_BLOCKED, td->thread_seq, td->label_set_id) < 0)
+                prof->stats.dropped_samples++;
         }
         if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
             int64_t wait_ns = wall_now - td->ready_at_ns;
-            rperf_write_sample(buf, frame_start, depth, wait_ns,
-                               RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
+            if (rperf_write_sample(buf, frame_start, depth, wait_ns,
+                                   RPERF_VM_STATE_GVL_WAIT, td->thread_seq, td->label_set_id) < 0)
+                prof->stats.dropped_samples++;
         }
 
         rperf_try_swap(prof);
@@ -784,7 +796,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *
 skip_gvl:
 
     /* Reset prev times to current — next timer sample measures from resume */
-    int64_t time_now = rperf_current_time_ns(prof, td);
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now >= 0) td->prev_time_ns = time_now;
     td->prev_wall_ns = wall_now;
 
@@ -851,12 +863,13 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
         if (prof->gc.enter_ns <= 0) return;
+        if (RPERF_PAUSED(prof)) { prof->gc.enter_ns = 0; return; }
 
         int64_t wall_now = rperf_wall_time_ns();
         int64_t weight = wall_now - prof->gc.enter_ns;
-        int type = (prof->gc.phase == RPERF_GC_SWEEPING)
-            ? RPERF_SAMPLE_GC_SWEEPING
-            : RPERF_SAMPLE_GC_MARKING;
+        enum rperf_vm_state vm_state = (prof->gc.phase == RPERF_GC_SWEEPING)
+            ? RPERF_VM_STATE_GC_SWEEPING
+            : RPERF_VM_STATE_GC_MARKING;
 
         /* Capture backtrace here (not at GC_ENTER) so that frame_start
          * always indexes into the current active buffer. The Ruby stack
@@ -875,7 +888,7 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
         }
         buf->frame_pool_count += depth;
 
-        rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
+        rperf_record_sample(prof, frame_start, depth, weight, vm_state, prof->gc.thread_seq, prof->gc.label_set_id);
         prof->gc.enter_ns = 0;
     }
 }
@@ -888,6 +901,7 @@ rperf_sample_job(void *arg)
     rperf_profiler_t *prof = (rperf_profiler_t *)arg;
 
     if (!prof->running) return;
+    if (RPERF_PAUSED(prof)) return;
 
     /* Measure sampling overhead */
     struct timespec ts_start, ts_end;
@@ -903,7 +917,7 @@ rperf_sample_job(void *arg)
         return; /* Skip first sample for this thread */
     }
 
-    int64_t time_now = rperf_current_time_ns(prof, td);
+    int64_t time_now = rperf_current_time_ns(prof);
     if (time_now < 0) return;
 
     int64_t weight = time_now - td->prev_time_ns;
@@ -922,7 +936,7 @@ rperf_sample_job(void *arg)
     if (depth <= 0) return;
    buf->frame_pool_count += depth;
 
-    rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
+    rperf_record_sample(prof, frame_start, depth, weight, RPERF_VM_STATE_NORMAL, td->thread_seq, td->label_set_id);
 
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
     prof->stats.sampling_count++;
```
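`rperf_sample_job` never runs in the timer context itself: the signal handler or nanosleep worker only calls `rb_postponed_job_trigger`, and CRuby executes the job later on a Ruby thread that holds the GVL, where walking the VM stack is safe. A minimal sketch of that preregister/trigger pairing (Ruby 3.3+ API; the registration code is outside these hunks, so the wiring below is illustrative):

```c
#include <ruby.h>
#include <ruby/debug.h>

static void
sample_job(void *arg)
{
    /* Runs with the GVL held; safe to inspect threads and backtraces. */
}

static rb_postponed_job_handle_t pj_handle;

static void
register_job(void *profiler)
{
    /* Preregister once while holding the GVL... */
    pj_handle = rb_postponed_job_preregister(0, sample_job, profiler);
}

static void
on_timer_tick(void)
{
    /* ...then the async context only ever touches the handle. */
    rb_postponed_job_trigger(pj_handle);
}
```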
```diff
@@ -985,20 +999,34 @@ rperf_worker_nanosleep_func(void *arg)
 
     CHECKED(pthread_mutex_lock(&prof->worker_mutex));
     while (prof->running) {
-        int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
-        if (ret != 0 && ret != ETIMEDOUT) {
-            fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
-            abort();
-        }
-        if (ret == ETIMEDOUT) {
-            prof->stats.trigger_count++;
-            rb_postponed_job_trigger(prof->pj_handle);
-            /* Advance deadline by interval */
+        if (RPERF_PAUSED(prof)) {
+            /* Paused: mark as paused so disarm can confirm, then wait */
+            prof->worker_paused = 1;
+            CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+            prof->worker_paused = 0;
+            /* Reset deadline on wake to avoid burst of catch-up triggers */
+            clock_gettime(CLOCK_REALTIME, &deadline);
             deadline.tv_nsec += interval_ns;
             if (deadline.tv_nsec >= 1000000000L) {
                 deadline.tv_sec++;
                 deadline.tv_nsec -= 1000000000L;
             }
+        } else {
+            int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
+            if (ret != 0 && ret != ETIMEDOUT) {
+                fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
+                abort();
+            }
+            if (ret == ETIMEDOUT) {
+                prof->stats.trigger_count++;
+                rb_postponed_job_trigger(prof->pj_handle);
+                /* Advance deadline by interval */
+                deadline.tv_nsec += interval_ns;
+                if (deadline.tv_nsec >= 1000000000L) {
+                    deadline.tv_sec++;
+                    deadline.tv_nsec -= 1000000000L;
+                }
+            }
         }
         rperf_try_aggregate(prof);
     }
```
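Two details of the paused branch are easy to miss. First, `worker_paused` is the handshake that lets `rperf_disarm_timer` (added later in this diff) confirm the worker has actually parked before returning. Second, the deadline must be recomputed on wake: `pthread_cond_timedwait` takes an absolute CLOCK_REALTIME deadline, so after a 10 s pause with a 10 ms interval, a stale deadline would time out immediately about a thousand times in a row and fire a burst of catch-up triggers. A condensed model of the loop, with `tick()` standing in for `rb_postponed_job_trigger`:

```c
#include <errno.h>
#include <pthread.h>
#include <time.h>

static void
advance(struct timespec *t, long interval_ns)
{
    t->tv_nsec += interval_ns;
    if (t->tv_nsec >= 1000000000L) { t->tv_sec++; t->tv_nsec -= 1000000000L; }
}

/* All flags are read while holding `mu`, matching the diff's locking. */
static void
worker_loop(pthread_mutex_t *mu, pthread_cond_t *cv, const int *running,
            const int *paused, int *paused_ack, long interval_ns,
            void (*tick)(void))
{
    struct timespec deadline;
    clock_gettime(CLOCK_REALTIME, &deadline);
    advance(&deadline, interval_ns);

    pthread_mutex_lock(mu);
    while (*running) {
        if (*paused) {
            *paused_ack = 1;                 /* lets the disarmer confirm the park */
            pthread_cond_wait(cv, mu);       /* releases mu while parked */
            *paused_ack = 0;
            clock_gettime(CLOCK_REALTIME, &deadline);
            advance(&deadline, interval_ns); /* fresh deadline: no catch-up burst */
        } else if (pthread_cond_timedwait(cv, mu, &deadline) == ETIMEDOUT) {
            tick();
            advance(&deadline, interval_ns);
        }
    }
    pthread_mutex_unlock(mu);
}
```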
```diff
@@ -1053,9 +1081,11 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
     rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
     rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
     rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+    if (prof->stats.dropped_samples > 0)
+        rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(prof->stats.dropped_samples));
     rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
     rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
-                 SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
+                 SIZET2NUM(prof->frame_table.count));
     rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
                  SIZET2NUM(prof->agg_table.count));
 
@@ -1074,11 +1104,7 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
     {
         rperf_frame_table_t *ft = &prof->frame_table;
         VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
-        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+        for (i = 0; i < ft->count; i++) {
             rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
         }
 
@@ -1090,11 +1116,18 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
 
         VALUE frames = rb_ary_new_capa(e->depth);
         for (j = 0; j < e->depth; j++) {
+            if (e->frame_start + j >= at->stack_pool_count) break;
             uint32_t fid = at->stack_pool[e->frame_start + j];
+            if (fid >= ft->count) break;
             rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
         }
 
-        VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
+        VALUE sample = rb_ary_new_capa(5);
+        rb_ary_push(sample, frames);
+        rb_ary_push(sample, LONG2NUM(e->weight));
+        rb_ary_push(sample, INT2NUM(e->thread_seq));
+        rb_ary_push(sample, INT2NUM(e->label_set_id));
+        rb_ary_push(sample, INT2NUM(e->vm_state));
         rb_ary_push(samples_ary, sample);
     }
 }
@@ -1110,14 +1143,15 @@ rperf_build_aggregated_result(rperf_profiler_t *prof)
 
 /* ---- Ruby API ---- */
 
-/* _c_start(frequency, mode, aggregate, signal)
+/* _c_start(frequency, mode, aggregate, signal, defer)
  * frequency: Integer (Hz)
  * mode: 0 = cpu, 1 = wall
 * aggregate: 0 or 1
  * signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
+ * defer: if truthy, start with timer paused (profile_refcount = 0)
  */
 static VALUE
-rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
+rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
 {
     int frequency = NUM2INT(vfreq);
     int mode = NUM2INT(vmode);
@@ -1138,6 +1172,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
     g_profiler.stats.trigger_count = 0;
+    g_profiler.stats.dropped_samples = 0;
     atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
     g_profiler.label_sets = Qnil;
@@ -1222,6 +1257,8 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     clock_gettime(CLOCK_MONOTONIC, &g_profiler.start_monotonic);
 
     g_profiler.running = 1;
+    g_profiler.profile_refcount = RTEST(vdefer) ? 0 : 1;
+    g_profiler.worker_paused = 0;
 
 #if RPERF_USE_TIMER_SIGNAL
     g_profiler.timer_signal = timer_signal;
@@ -1269,7 +1306,12 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     }
 
     its.it_value.tv_sec = 0;
-    its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
+    if (RPERF_PAUSED(&g_profiler)) {
+        /* defer mode: create timer but don't arm it */
+        its.it_value.tv_nsec = 0;
+    } else {
+        its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
+    }
     its.it_interval = its.it_value;
     if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
         timer_delete(g_profiler.timer_id);
```
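With `defer: true` the POSIX timer is created normally but left disarmed: per POSIX, `timer_settime` with a zeroed `it_value` disarms a timer, and a later call with a nonzero value (`rperf_arm_timer`, further down) arms the same timer without recreating it. Both directions in one illustrative helper, assuming a timer already created with `timer_create`:

```c
#include <string.h>
#include <time.h>

/* Arm a periodic POSIX timer at `hz` Hz, or disarm it with hz == 0.
 * timer_settime() with it_value zeroed disarms the timer per POSIX.
 * Assumes 0 < hz < 1e9 so the computed tv_nsec stays nonzero. */
static int
set_timer_rate(timer_t timer, long hz)
{
    struct itimerspec its;
    memset(&its, 0, sizeof(its));
    if (hz > 0) {
        its.it_value.tv_nsec = 1000000000L / hz;
        its.it_interval = its.it_value;   /* re-fires every interval */
    }
    return timer_settime(timer, 0, &its, NULL);
}
```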
```diff
@@ -1320,9 +1362,7 @@ timer_fail:
 static VALUE
 rb_rperf_stop(VALUE self)
 {
-    VALUE result, samples_ary;
-    size_t i;
-    int j;
+    VALUE result;
 
     if (!g_profiler.running) {
         return Qnil;
@@ -1389,6 +1429,9 @@ rb_rperf_stop(VALUE self)
         rperf_agg_table_free(&g_profiler.agg_table);
     } else {
         /* Raw samples path (aggregate: false) */
+        VALUE samples_ary;
+        size_t i;
+        int j;
         rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
 
         result = rb_hash_new();
@@ -1398,6 +1441,8 @@ rb_rperf_stop(VALUE self)
         rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
         rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
         rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+        if (g_profiler.stats.dropped_samples > 0)
+            rb_hash_aset(result, ID2SYM(rb_intern("dropped_samples")), SIZET2NUM(g_profiler.stats.dropped_samples));
         rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
         {
             struct timespec stop_monotonic;
@@ -1414,29 +1459,20 @@ rb_rperf_stop(VALUE self)
         samples_ary = rb_ary_new_capa((long)buf->sample_count);
         for (i = 0; i < buf->sample_count; i++) {
             rperf_sample_t *s = &buf->samples[i];
-            VALUE frames = rb_ary_new_capa(s->depth + 1);
-
-            /* Prepend synthetic frame at leaf position (index 0) */
-            if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
-                rb_ary_push(frames, syn);
-            } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
-                VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
-                rb_ary_push(frames, syn);
-            }
+            VALUE frames = rb_ary_new_capa(s->depth);
 
             for (j = 0; j < s->depth; j++) {
+                if (s->frame_start + j >= buf->frame_pool_count) break;
                 VALUE fval = buf->frame_pool[s->frame_start + j];
                 rb_ary_push(frames, rperf_resolve_frame(fval));
             }
 
-            VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
+            VALUE sample = rb_ary_new_capa(5);
+            rb_ary_push(sample, frames);
+            rb_ary_push(sample, LONG2NUM(s->weight));
+            rb_ary_push(sample, INT2NUM(s->thread_seq));
+            rb_ary_push(sample, INT2NUM(s->label_set_id));
+            rb_ary_push(sample, INT2NUM(s->vm_state));
             rb_ary_push(samples_ary, sample);
         }
         rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
@@ -1471,6 +1507,7 @@ rperf_clear_aggregated_data(rperf_profiler_t *prof)
     prof->stats.trigger_count = 0;
     prof->stats.sampling_count = 0;
     prof->stats.sampling_total_ns = 0;
+    prof->stats.dropped_samples = 0;
 
     /* Reset start timestamps so next snapshot's duration_ns covers
      * only the period since this clear. */
@@ -1558,6 +1595,103 @@ rb_rperf_get_label_sets(VALUE self)
     return g_profiler.label_sets;
 }
 
+/* ---- Profile refcount API (timer pause/resume) ---- */
+
+/* Helper: arm the timer with the configured interval */
+static void
+rperf_arm_timer(rperf_profiler_t *prof)
+{
+#if RPERF_USE_TIMER_SIGNAL
+    if (prof->timer_signal > 0) {
+        struct itimerspec its;
+        its.it_value.tv_sec = 0;
+        its.it_value.tv_nsec = 1000000000L / prof->frequency;
+        its.it_interval = its.it_value;
+        timer_settime(prof->timer_id, 0, &its, NULL);
+        return;
+    }
+#endif
+    /* nanosleep mode: signal the worker to wake from cond_wait */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    CHECKED(pthread_cond_signal(&prof->worker_cond));
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+}
+
+/* Helper: disarm the timer (stop firing) */
+static void
+rperf_disarm_timer(rperf_profiler_t *prof)
+{
+#if RPERF_USE_TIMER_SIGNAL
+    if (prof->timer_signal > 0) {
+        struct itimerspec its;
+        memset(&its, 0, sizeof(its));
+        timer_settime(prof->timer_id, 0, &its, NULL);
+        return;
+    }
+#endif
+    /* nanosleep mode: wake the worker and wait until it enters paused state */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    while (!prof->worker_paused) {
+        CHECKED(pthread_cond_signal(&prof->worker_cond));
+        CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+        sched_yield();
+        CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    }
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+}
+
+/* Helper: reset prev_time_ns for all threads (called on resume to avoid
+ * inflated weight from pause duration). Must be called with GVL held. */
+static void
+rperf_reset_thread_times(rperf_profiler_t *prof)
+{
+    VALUE threads = rb_funcall(rb_cThread, rb_intern("list"), 0);
+    long tc = RARRAY_LEN(threads);
+    for (long i = 0; i < tc; i++) {
+        VALUE thread = RARRAY_AREF(threads, i);
+        rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
+        if (td) {
+            td->prev_time_ns = rperf_current_time_ns(prof);
+            td->prev_wall_ns = rperf_wall_time_ns();
+        }
+    }
+}
+
+/* _c_profile_inc() — increment profile refcount; resume timer on 0→1.
+ * Called with GVL held. */
+static VALUE
+rb_rperf_profile_inc(VALUE self)
+{
+    if (!g_profiler.running) return Qfalse;
+    g_profiler.profile_refcount++;
+    if (g_profiler.profile_refcount == 1) {
+        rperf_reset_thread_times(&g_profiler);
+        rperf_arm_timer(&g_profiler);
+    }
+    return Qtrue;
+}
+
+/* _c_profile_dec() — decrement profile refcount; pause timer on 1→0.
+ * Called with GVL held. */
+static VALUE
+rb_rperf_profile_dec(VALUE self)
+{
+    if (!g_profiler.running) return Qfalse;
+    if (g_profiler.profile_refcount <= 0) return Qfalse;
+    g_profiler.profile_refcount--;
+    if (g_profiler.profile_refcount == 0) {
+        rperf_disarm_timer(&g_profiler);
+    }
+    return Qtrue;
+}
+
+/* _c_running?() — check if profiler is running. */
+static VALUE
+rb_rperf_running_p(VALUE self)
+{
+    return g_profiler.running ? Qtrue : Qfalse;
+}
+
 /* ---- Fork safety ---- */
 
 static void
```
```diff
@@ -1568,6 +1702,14 @@ rperf_after_fork_child(void)
     /* Mark as not running — timer doesn't exist in child */
     g_profiler.running = 0;
 
+    /* Re-initialize mutex/condvar — they may have been locked by the parent's
+     * worker thread at fork time and are in an undefined state in the child.
+     * POSIX says only async-signal-safe functions should be called in atfork
+     * child handlers, but pthread_mutex_init is safe on Linux/glibc/musl and
+     * this is the standard pattern (e.g., Python, Go do the same). */
+    pthread_mutex_init(&g_profiler.worker_mutex, NULL);
+    pthread_cond_init(&g_profiler.worker_cond, NULL);
+
 #if RPERF_USE_TIMER_SIGNAL
     /* timer_create timers are not inherited across fork, but pending signals may be.
      * Block the signal, drain any pending instances, then restore old handler. */
```
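The registration of this handler is outside the hunk, but the standard wiring is `pthread_atfork` with only a child handler: the forked child contains just the forking thread, so a mutex the parent's worker held at fork time would stay locked forever unless re-initialized. An illustrative sketch:

```c
#include <pthread.h>

static pthread_mutex_t worker_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  worker_cond  = PTHREAD_COND_INITIALIZER;

static void
after_fork_child(void)
{
    /* Only the forking thread survives fork(); locks held by any other
     * parent thread are permanently stuck in the child. Re-initializing
     * gives the child usable synchronization objects again. */
    pthread_mutex_init(&worker_mutex, NULL);
    pthread_cond_init(&worker_cond, NULL);
}

static void
install_fork_handlers(void)
{
    /* prepare/parent hooks unused; only the child needs repair */
    pthread_atfork(NULL, NULL, after_fork_child);
}
```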
```diff
@@ -1608,6 +1750,8 @@ rperf_after_fork_child(void)
     /* Reset stats */
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
+    g_profiler.stats.dropped_samples = 0;
+    g_profiler.profile_refcount = 0;
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
 }
@@ -1617,13 +1761,16 @@ void
 Init_rperf(void)
 {
     VALUE mRperf = rb_define_module("Rperf");
-    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
+    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 5);
     rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
     rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
     rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
     rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
     rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
     rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
+    rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
+    rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
+    rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
 
     memset(&g_profiler, 0, sizeof(g_profiler));
     g_profiler.label_sets = Qnil;
```