rperf 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +69 -28
- data/docs/help.md +153 -6
- data/exe/rperf +1 -1
- data/ext/rperf/rperf.c +406 -121
- data/lib/rperf/active_job.rb +13 -0
- data/lib/rperf/rack.rb +15 -0
- data/lib/rperf/sidekiq.rb +9 -0
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf.rb +141 -12
- metadata +4 -1
data/ext/rperf/rperf.c
CHANGED
@@ -36,6 +36,7 @@
 #define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
 #define RPERF_AGG_TABLE_INITIAL 1024
 #define RPERF_STACK_POOL_INITIAL 4096
+#define RPERF_PAUSED(prof) ((prof)->profile_refcount == 0)
 
 /* Synthetic frame IDs (reserved in frame_table, 0-based) */
 #define RPERF_SYNTHETIC_GVL_BLOCKED 0
@@ -66,6 +67,7 @@ typedef struct rperf_sample {
     int64_t weight;
     int type;          /* rperf_sample_type */
     int thread_seq;    /* thread sequence number (1-based) */
+    int label_set_id;  /* label set ID (0 = no labels) */
 } rperf_sample_t;
 
 /* ---- Sample buffer (double-buffered) ---- */
@@ -103,6 +105,7 @@ typedef struct rperf_agg_entry {
     uint32_t frame_start;  /* offset into stack_pool */
     int depth;             /* includes synthetic frame */
     int thread_seq;
+    int label_set_id;      /* label set ID (0 = no labels) */
     int64_t weight;        /* accumulated */
     uint32_t hash;         /* cached hash value */
     int used;              /* 0 = empty, 1 = used */
@@ -124,6 +127,7 @@ typedef struct rperf_thread_data {
     int64_t suspended_at_ns;  /* wall time at SUSPENDED */
     int64_t ready_at_ns;      /* wall time at READY */
     int thread_seq;           /* thread sequence number (1-based) */
+    int label_set_id;         /* current label set ID (0 = no labels) */
 } rperf_thread_data_t;
 
 /* ---- GC tracking state ---- */
@@ -132,6 +136,7 @@ typedef struct rperf_gc_state {
     int phase;          /* rperf_gc_phase */
     int64_t enter_ns;   /* wall time at GC_ENTER */
     int thread_seq;     /* thread_seq at GC_ENTER */
+    int label_set_id;   /* label_set_id at GC_ENTER */
 } rperf_gc_state_t;
 
 /* ---- Sampling overhead stats ---- */
@@ -175,6 +180,14 @@ typedef struct rperf_profiler {
     int next_thread_seq;
     /* Sampling overhead stats */
     rperf_stats_t stats;
+    /* Label sets: Ruby Array of Hash objects, managed from Ruby side.
+     * Index 0 is reserved (no labels). GC-marked via profiler_mark. */
+    VALUE label_sets; /* Ruby Array or Qnil */
+    /* Profile refcount: controls timer active/paused state.
+     * start(defer:false) sets to 1, start(defer:true) sets to 0.
+     * profile_inc/dec transitions 0↔1 arm/disarm the timer.
+     * Modified only under GVL, so plain int is safe. */
+    int profile_refcount;
 } rperf_profiler_t;
 
 static rperf_profiler_t g_profiler;
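
To make the pause/resume semantics of the new field concrete, here is a minimal standalone sketch of how `profile_refcount` gates sampling, using only the rules stated in the struct comment above (the `profiler_sketch` type and `main` driver are illustrative, not part of the gem):

    #include <stdio.h>

    typedef struct { int profile_refcount; } profiler_sketch;
    #define PAUSED(p) ((p)->profile_refcount == 0)

    int main(void) {
        profiler_sketch p = { .profile_refcount = 0 }; /* start(defer: true) */
        printf("paused=%d\n", PAUSED(&p));             /* 1: timer not armed */
        p.profile_refcount++;                          /* inc: 0 -> 1 arms the timer */
        p.profile_refcount++;                          /* nested inc: no transition */
        p.profile_refcount--;                          /* still 1, still sampling */
        printf("paused=%d\n", PAUSED(&p));             /* 0 */
        p.profile_refcount--;                          /* dec: 1 -> 0 disarms the timer */
        printf("paused=%d\n", PAUSED(&p));             /* 1 */
        return 0;
    }

Because the counter only changes under the GVL, a plain `int` suffices; the interesting transitions are exactly 0↔1.
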
@@ -195,6 +208,10 @@ rperf_profiler_mark(void *ptr)
                                   buf->frame_pool + buf->frame_pool_count);
         }
     }
+    /* Mark label_sets array */
+    if (prof->label_sets != Qnil) {
+        rb_gc_mark(prof->label_sets);
+    }
     /* Mark frame_table keys (unique frame VALUEs).
      * Acquire count to synchronize with the release-store in insert,
      * ensuring we see the keys pointer that is valid for [0, count).
@@ -431,7 +448,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
 /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
 
 static uint32_t
-rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
+rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
 {
     uint32_t h = 2166136261u;
     int i;
@@ -441,6 +458,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
     }
     h ^= (uint32_t)thread_seq;
     h *= 16777619u;
+    h ^= (uint32_t)label_set_id;
+    h *= 16777619u;
     return h;
 }
 
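
For reference, the hash as it stands after this change reads as one self-contained function: FNV-1a over the 32-bit frame IDs, with `thread_seq` and now `label_set_id` folded in as two extra words, so identical stacks from different threads or label sets hash to distinct aggregation entries. The loop body is not shown in the hunk, so the sketch assumes the standard xor-then-multiply step:

    #include <stdint.h>

    static uint32_t
    fnv1a_u32_sketch(const uint32_t *data, int len, int thread_seq, int label_set_id)
    {
        uint32_t h = 2166136261u;        /* FNV offset basis */
        for (int i = 0; i < len; i++) {
            h ^= data[i];                /* assumed: standard FNV-1a step */
            h *= 16777619u;              /* FNV prime */
        }
        h ^= (uint32_t)thread_seq;
        h *= 16777619u;
        h ^= (uint32_t)label_set_id;     /* new in 0.7.0 */
        h *= 16777619u;
        return h;
    }
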
@@ -506,7 +525,8 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
 /* Insert or merge a stack into the aggregation table */
 static void
 rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
-                       int depth, int thread_seq, int64_t weight, uint32_t hash)
+                       int depth, int thread_seq, int label_set_id,
+                       int64_t weight, uint32_t hash)
 {
     size_t idx = hash % at->bucket_capacity;
 
@@ -514,6 +534,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
         rperf_agg_entry_t *e = &at->buckets[idx];
         if (!e->used) break;
         if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
+            e->label_set_id == label_set_id &&
             memcmp(at->stack_pool + e->frame_start, frame_ids,
                    depth * sizeof(uint32_t)) == 0) {
             /* Match — merge weight */
@@ -530,6 +551,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
     e->frame_start = (uint32_t)at->stack_pool_count;
     e->depth = depth;
     e->thread_seq = thread_seq;
+    e->label_set_id = label_set_id;
     e->weight = weight;
     e->hash = hash;
     e->used = 1;
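
The equality test the two hunks above extend is the heart of the open-addressing table: a bucket matches only if the cached hash, depth, thread_seq, and now label_set_id all agree and the frame IDs compare equal; anything else keeps probing. A compact restatement of just the match predicate, with types abbreviated from the structs earlier in this diff:

    #include <stdint.h>
    #include <string.h>

    typedef struct {
        uint32_t frame_start;  /* offset into stack_pool */
        int depth, thread_seq, label_set_id;
        int64_t weight;
        uint32_t hash;
        int used;
    } agg_entry_sketch;

    static int
    entry_matches(const agg_entry_sketch *e, const uint32_t *stack_pool,
                  const uint32_t *frame_ids, int depth,
                  int thread_seq, int label_set_id, uint32_t hash)
    {
        return e->hash == hash && e->depth == depth &&
               e->thread_seq == thread_seq &&
               e->label_set_id == label_set_id &&   /* new in 0.7.0 */
               memcmp(stack_pool + e->frame_start, frame_ids,
                      depth * sizeof(uint32_t)) == 0;
    }

Comparing the cached hash first keeps the common miss cheap; the memcmp only runs when everything else already agrees.
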
@@ -581,10 +603,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
         if (overflow) break; /* frame_table full, stop aggregating this buffer */
 
         int total_depth = off + s->depth;
-        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
+        hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
 
         rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
-                               s->thread_seq, s->weight, hash);
+                               s->thread_seq, s->label_set_id, s->weight, hash);
     }
 
     /* Reset buffer for reuse.
@@ -634,7 +656,7 @@ rperf_try_swap(rperf_profiler_t *prof)
 /* Write a sample into a specific buffer. No swap check. */
 static int
 rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
-                   int64_t weight, int type, int thread_seq)
+                   int64_t weight, int type, int thread_seq, int label_set_id)
 {
     if (weight <= 0) return 0;
     if (rperf_ensure_sample_capacity(buf) < 0) return -1;
@@ -645,16 +667,17 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
     sample->weight = weight;
     sample->type = type;
     sample->thread_seq = thread_seq;
+    sample->label_set_id = label_set_id;
     buf->sample_count++;
     return 0;
 }
 
 static void
 rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
-                    int64_t weight, int type, int thread_seq)
+                    int64_t weight, int type, int thread_seq, int label_set_id)
 {
     rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
-    rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq);
+    rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
     rperf_try_swap(prof);
 }
 
@@ -676,12 +699,11 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
 /* ---- Thread event hooks ---- */
 
 static void
-rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
 {
     /* Has GVL — safe to call Ruby APIs */
     int64_t wall_now = rperf_wall_time_ns();
 
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
     int is_first = 0;
 
     if (td == NULL) {
@@ -702,10 +724,10 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
     if (depth <= 0) return;
     buf->frame_pool_count += depth;
 
-    /* Record normal sample (skip if first time — no prev_time) */
-    if (!is_first) {
+    /* Record normal sample (skip if first time — no prev_time, or if paused) */
+    if (!is_first && !RPERF_PAUSED(prof)) {
         int64_t weight = time_now - td->prev_time_ns;
-        rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
+        rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
     }
 
     /* Save timestamp for READY/RESUMED */
@@ -715,21 +737,18 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
 }
 
 static void
-rperf_handle_ready(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_ready(rperf_thread_data_t *td)
 {
     /* May NOT have GVL — only simple C operations allowed */
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
     if (!td) return;
 
     td->ready_at_ns = rperf_wall_time_ns();
 }
 
 static void
-rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
 {
     /* Has GVL */
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
-
     if (td == NULL) {
         td = rperf_thread_data_create(prof, thread);
         if (!td) return;
@@ -745,7 +764,7 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
      * Both samples are written directly into the same buffer before calling
      * rperf_try_swap, so that a swap triggered by the first sample cannot
      * move the second into a different buffer with a stale frame_start. */
-    if (prof->mode == 1 && td->suspended_at_ns > 0) {
+    if (prof->mode == 1 && td->suspended_at_ns > 0 && !RPERF_PAUSED(prof)) {
         rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
         if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
         size_t frame_start = buf->frame_pool_count;
@@ -758,12 +777,12 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
         if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
             int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
             rperf_write_sample(buf, frame_start, depth, blocked_ns,
-                               RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
+                               RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
         }
         if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
             int64_t wait_ns = wall_now - td->ready_at_ns;
             rperf_write_sample(buf, frame_start, depth, wait_ns,
-                               RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
+                               RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
         }
 
         rperf_try_swap(prof);
@@ -781,9 +800,8 @@ skip_gvl:
 }
 
 static void
-rperf_handle_exited(rperf_profiler_t *prof, VALUE thread)
+rperf_handle_exited(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
 {
-    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
     if (td) {
         free(td);
         rb_internal_thread_specific_set(thread, prof->ts_key, NULL);
@@ -797,15 +815,16 @@ rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_da
     if (!prof->running) return;
 
     VALUE thread = data->thread;
+    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
 
     if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
-        rperf_handle_suspended(prof, thread);
+        rperf_handle_suspended(prof, thread, td);
     else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
-        rperf_handle_ready(prof, thread);
+        rperf_handle_ready(td);
     else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
-        rperf_handle_resumed(prof, thread);
+        rperf_handle_resumed(prof, thread, td);
    else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
-        rperf_handle_exited(prof, thread);
+        rperf_handle_exited(prof, thread, td);
 }
 
 /* ---- GC event hook ---- */
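
For context on where this dispatcher plugs in: hooks with this signature are registered through Ruby's internal thread event API (available since Ruby 3.2, with `data->thread` on the event payload since 3.3). The registration itself is outside this diff, so treat the following as a sketch of the assumed surrounding code rather than rperf's exact setup:

    #include <ruby/ruby.h>
    #include <ruby/thread.h>

    static void
    on_thread_event(rb_event_flag_t event,
                    const rb_internal_thread_event_data_t *data, void *user_data)
    {
        VALUE thread = data->thread; /* valid for READY/RESUMED/SUSPENDED/EXITED */
        (void)thread; (void)user_data;
    }

    static rb_internal_thread_event_hook_t *hook;

    void
    install_hook(void)
    {
        hook = rb_internal_thread_add_event_hook(on_thread_event,
                   RUBY_INTERNAL_THREAD_EVENT_READY |
                   RUBY_INTERNAL_THREAD_EVENT_RESUMED |
                   RUBY_INTERNAL_THREAD_EVENT_SUSPENDED |
                   RUBY_INTERNAL_THREAD_EVENT_EXITED,
                   NULL);
    }

Hoisting the `rb_internal_thread_specific_get` lookup into the dispatcher, as this hunk does, means each handler receives `td` instead of repeating the lookup.
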
@@ -826,17 +845,19 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
         prof->gc.phase = RPERF_GC_NONE;
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
-        /* Save timestamp and thread_seq; backtrace is captured at GC_EXIT
+        /* Save timestamp, thread_seq, and label_set_id; backtrace is captured at GC_EXIT
          * to avoid buffer mismatch after a double-buffer swap. */
         prof->gc.enter_ns = rperf_wall_time_ns();
         {
             VALUE thread = rb_thread_current();
             rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
             prof->gc.thread_seq = td ? td->thread_seq : 0;
+            prof->gc.label_set_id = td ? td->label_set_id : 0;
         }
     }
     else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
         if (prof->gc.enter_ns <= 0) return;
+        if (RPERF_PAUSED(prof)) { prof->gc.enter_ns = 0; return; }
 
         int64_t wall_now = rperf_wall_time_ns();
         int64_t weight = wall_now - prof->gc.enter_ns;
@@ -861,7 +882,7 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
         }
         buf->frame_pool_count += depth;
 
-        rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq);
+        rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
         prof->gc.enter_ns = 0;
     }
 }
@@ -874,6 +895,7 @@ rperf_sample_job(void *arg)
     rperf_profiler_t *prof = (rperf_profiler_t *)arg;
 
     if (!prof->running) return;
+    if (RPERF_PAUSED(prof)) return;
 
     /* Measure sampling overhead */
     struct timespec ts_start, ts_end;
@@ -908,7 +930,7 @@
     if (depth <= 0) return;
     buf->frame_pool_count += depth;
 
-    rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
+    rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
 
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
     prof->stats.sampling_count++;
@@ -971,20 +993,32 @@ rperf_worker_nanosleep_func(void *arg)
 
     CHECKED(pthread_mutex_lock(&prof->worker_mutex));
     while (prof->running) {
-        int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
-        if (ret != 0 && ret != ETIMEDOUT) {
-            fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
-            abort();
-        }
-        if (ret == ETIMEDOUT) {
-            prof->stats.trigger_count++;
-            rb_postponed_job_trigger(prof->pj_handle);
-            /* Advance deadline by interval */
+        if (RPERF_PAUSED(prof)) {
+            /* Paused: wait indefinitely until signaled (resume or stop) */
+            CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
+            /* Reset deadline on wake to avoid burst of catch-up triggers */
+            clock_gettime(CLOCK_REALTIME, &deadline);
             deadline.tv_nsec += interval_ns;
             if (deadline.tv_nsec >= 1000000000L) {
                 deadline.tv_sec++;
                 deadline.tv_nsec -= 1000000000L;
             }
+        } else {
+            int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
+            if (ret != 0 && ret != ETIMEDOUT) {
+                fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
+                abort();
+            }
+            if (ret == ETIMEDOUT) {
+                prof->stats.trigger_count++;
+                rb_postponed_job_trigger(prof->pj_handle);
+                /* Advance deadline by interval */
+                deadline.tv_nsec += interval_ns;
+                if (deadline.tv_nsec >= 1000000000L) {
+                    deadline.tv_sec++;
+                    deadline.tv_nsec -= 1000000000L;
+                }
+            }
         }
         rperf_try_aggregate(prof);
     }
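
The deadline handling in both branches follows the usual drift-free pattern: keep an absolute CLOCK_REALTIME deadline (the clock pthread_cond_timedwait uses by default) and advance it by the interval instead of sleeping a relative amount, so time spent triggering and aggregating does not accumulate as drift. The normalization step, restated as a standalone helper:

    #include <time.h>

    /* Advance an absolute deadline by interval_ns, keeping tv_nsec in range.
     * Matches the arithmetic in the hunk above; assumes interval_ns < 1e9,
     * which holds for any sampling frequency of at least 1 Hz. */
    static void
    advance_deadline(struct timespec *deadline, long interval_ns)
    {
        deadline->tv_nsec += interval_ns;
        if (deadline->tv_nsec >= 1000000000L) {
            deadline->tv_sec++;
            deadline->tv_nsec -= 1000000000L;
        }
    }

Resetting the deadline on wake from the paused branch is what prevents a stale deadline from producing an immediate burst of ETIMEDOUT catch-up triggers after a long pause.
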
@@ -1006,16 +1040,105 @@ rperf_resolve_frame(VALUE fval)
     return rb_ary_new3(2, path, label);
 }
 
+/* ---- Shared helpers for stop/snapshot ---- */
+
+/* Flush pending sample buffers into agg_table.
+ * Caller must ensure no concurrent access (worker joined or mutex held). */
+static void
+rperf_flush_buffers(rperf_profiler_t *prof)
+{
+    int cur_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire);
+    if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) {
+        int standby_idx = cur_idx ^ 1;
+        rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
+        atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
+    }
+    rperf_aggregate_buffer(prof, &prof->buffers[cur_idx]);
+}
+
+/* Build result hash from aggregated data (agg_table + frame_table).
+ * Does NOT free any resources. Caller must hold GVL. */
+static VALUE
+rperf_build_aggregated_result(rperf_profiler_t *prof)
+{
+    VALUE result, samples_ary;
+    size_t i;
+    int j;
+
+    result = rb_hash_new();
+
+    rb_hash_aset(result, ID2SYM(rb_intern("mode")),
+                 ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
+    rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
+    rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
+    rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
+    rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
+    rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
+    rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
+                 SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
+    rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
+                 SIZET2NUM(prof->agg_table.count));
+
+    {
+        struct timespec now_monotonic;
+        int64_t start_ns, duration_ns;
+        clock_gettime(CLOCK_MONOTONIC, &now_monotonic);
+        start_ns = (int64_t)prof->start_realtime.tv_sec * 1000000000LL
+                 + (int64_t)prof->start_realtime.tv_nsec;
+        duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
+                    + ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
+        rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
+        rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
+    }
+
+    {
+        rperf_frame_table_t *ft = &prof->frame_table;
+        VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
+        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
+        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
+            rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
+        }
+
+        rperf_agg_table_t *at = &prof->agg_table;
+        samples_ary = rb_ary_new();
+        for (i = 0; i < at->bucket_capacity; i++) {
+            rperf_agg_entry_t *e = &at->buckets[i];
+            if (!e->used) continue;
+
+            VALUE frames = rb_ary_new_capa(e->depth);
+            for (j = 0; j < e->depth; j++) {
+                uint32_t fid = at->stack_pool[e->frame_start + j];
+                rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
+            }
+
+            VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
+            rb_ary_push(samples_ary, sample);
+        }
+    }
+
+    rb_hash_aset(result, ID2SYM(rb_intern("aggregated_samples")), samples_ary);
+
+    if (prof->label_sets != Qnil) {
+        rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), prof->label_sets);
+    }
+
+    return result;
+}
+
 /* ---- Ruby API ---- */
 
-/* _c_start(frequency, mode, aggregate, signal)
+/* _c_start(frequency, mode, aggregate, signal, defer)
  * frequency: Integer (Hz)
  * mode: 0 = cpu, 1 = wall
  * aggregate: 0 or 1
  * signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
+ * defer: if truthy, start with timer paused (profile_refcount = 0)
  */
 static VALUE
-rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
+rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig, VALUE vdefer)
 {
     int frequency = NUM2INT(vfreq);
     int mode = NUM2INT(vmode);
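
One detail worth noting in the new rperf_build_aggregated_result (previous hunk): start_time_ns is an epoch timestamp taken once from CLOCK_REALTIME, while duration_ns is a CLOCK_MONOTONIC delta, so NTP steps or manual clock changes during a profile cannot distort the measured duration. The conversion it performs inline, written out as standalone helpers:

    #include <stdint.h>
    #include <time.h>

    static int64_t
    timespec_ns(struct timespec ts)
    {
        return (int64_t)ts.tv_sec * 1000000000LL + (int64_t)ts.tv_nsec;
    }

    /* duration = monotonic_now - monotonic_start; immune to wall-clock jumps */
    static int64_t
    elapsed_ns(struct timespec start_monotonic)
    {
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC, &now);
        return timespec_ns(now) - timespec_ns(start_monotonic);
    }
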
@@ -1038,6 +1161,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     g_profiler.stats.trigger_count = 0;
     atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
+    g_profiler.label_sets = Qnil;
 
     /* Initialize worker mutex/cond */
     CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
@@ -1119,6 +1243,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     clock_gettime(CLOCK_MONOTONIC, &g_profiler.start_monotonic);
 
     g_profiler.running = 1;
+    g_profiler.profile_refcount = RTEST(vdefer) ? 0 : 1;
 
 #if RPERF_USE_TIMER_SIGNAL
     g_profiler.timer_signal = timer_signal;
@@ -1166,7 +1291,12 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
     }
 
     its.it_value.tv_sec = 0;
-    its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
+    if (RPERF_PAUSED(&g_profiler)) {
+        /* defer mode: create timer but don't arm it */
+        its.it_value.tv_nsec = 0;
+    } else {
+        its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
+    }
     its.it_interval = its.it_value;
     if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
         timer_delete(g_profiler.timer_id);
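
The defer branch leans on standard timer_settime semantics: an it_value of all zeros leaves a created POSIX timer disarmed, and a later timer_settime with a nonzero it_value (plus it_interval = it_value for periodic firing) arms it. Both calls, sketched as one helper (link with -lrt on older glibc):

    #include <string.h>
    #include <time.h>

    /* period_ns == 0 disarms the timer; nonzero arms periodic firing.
     * Assumes period_ns < 1e9, i.e. a sampling frequency of at least 1 Hz. */
    static int
    set_timer_period(timer_t tid, long period_ns)
    {
        struct itimerspec its;
        memset(&its, 0, sizeof(its));
        its.it_value.tv_nsec = period_ns;
        its.it_interval = its.it_value;
        return timer_settime(tid, 0, &its, NULL);
    }

This is the same mechanism the rperf_arm_timer/rperf_disarm_timer helpers added later in this diff use for the 0↔1 refcount transitions.
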
@@ -1259,15 +1389,8 @@ rb_rperf_stop(VALUE self)
     rb_remove_event_hook(rperf_gc_event_hook);
 
     if (g_profiler.aggregate) {
-        /* Worker thread is joined; no concurrent access */
-
-        /* Aggregate remaining samples from both buffers */
-        if (atomic_load_explicit(&g_profiler.swap_ready, memory_order_relaxed)) {
-            int standby_idx = cur_idx ^ 1;
-            rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
-            atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
-        }
-        rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[cur_idx]);
+        /* Worker thread is joined; no concurrent access. */
+        rperf_flush_buffers(&g_profiler);
     }
 
     /* Clean up thread-specific data for all live threads */
@@ -1285,73 +1408,8 @@
         }
     }
 
-    /* Build result hash */
-    result = rb_hash_new();
-
-    /* mode */
-    rb_hash_aset(result, ID2SYM(rb_intern("mode")),
-                 ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
-
-    /* frequency */
-    rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
-
-    /* trigger_count, sampling_count, sampling_time_ns, detected_thread_count */
-    rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
-    rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
-    rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
-    rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
-
-    /* aggregation stats */
-    if (g_profiler.aggregate) {
-        rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
-                     SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
-        rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
-                     SIZET2NUM(g_profiler.agg_table.count));
-    }
-
-    /* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
-    {
-        struct timespec stop_monotonic;
-        int64_t start_ns, duration_ns;
-        clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
-        start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
-                 + (int64_t)g_profiler.start_realtime.tv_nsec;
-        duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
-                    + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
-        rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
-        rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
-    }
-
     if (g_profiler.aggregate) {
-        /* Resolve frames under the GVL.
-         * Use a Ruby array for resolved frames so GC protects them. */
-        rperf_frame_table_t *ft = &g_profiler.frame_table;
-        VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
-        /* Synthetic frames */
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
-        rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
-        /* Real frames */
-        for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
-            rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
-        }
-
-        rperf_agg_table_t *at = &g_profiler.agg_table;
-        samples_ary = rb_ary_new();
-        for (i = 0; i < at->bucket_capacity; i++) {
-            rperf_agg_entry_t *e = &at->buckets[i];
-            if (!e->used) continue;
-
-            VALUE frames = rb_ary_new_capa(e->depth);
-            for (j = 0; j < e->depth; j++) {
-                uint32_t fid = at->stack_pool[e->frame_start + j];
-                rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
-            }
-
-            VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
-            rb_ary_push(samples_ary, sample);
-        }
+        result = rperf_build_aggregated_result(&g_profiler);
 
         rperf_sample_buffer_free(&g_profiler.buffers[1]);
         rperf_frame_table_free(&g_profiler.frame_table);
@@ -1359,6 +1417,27 @@
     } else {
         /* Raw samples path (aggregate: false) */
         rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
+
+        result = rb_hash_new();
+        rb_hash_aset(result, ID2SYM(rb_intern("mode")),
+                     ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
+        rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
+        rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
+        rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
+        rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
+        rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
+        {
+            struct timespec stop_monotonic;
+            int64_t start_ns, duration_ns;
+            clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
+            start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
+                     + (int64_t)g_profiler.start_realtime.tv_nsec;
+            duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
+                        + ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
+            rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
+            rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
+        }
+
         samples_ary = rb_ary_new_capa((long)buf->sample_count);
         for (i = 0; i < buf->sample_count; i++) {
             rperf_sample_t *s = &buf->samples[i];
@@ -1384,13 +1463,14 @@
                 rb_ary_push(frames, rperf_resolve_frame(fval));
             }
 
-            VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
+            VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
             rb_ary_push(samples_ary, sample);
         }
+        rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
+        if (g_profiler.label_sets != Qnil) {
+            rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), g_profiler.label_sets);
+        }
     }
-    rb_hash_aset(result,
-                 ID2SYM(rb_intern(g_profiler.aggregate ? "aggregated_samples" : "raw_samples")),
-                 samples_ary);
 
     /* Cleanup */
     rperf_sample_buffer_free(&g_profiler.buffers[0]);
@@ -1398,6 +1478,201 @@
     return result;
 }
 
+/* ---- Snapshot: read aggregated data without stopping ---- */
+
+/* Clear aggregated data for the next interval.
+ * Caller must hold GVL + worker_mutex.
+ * Keeps allocations intact for reuse. Does NOT touch frame_table
+ * (frame IDs must stay stable — dmark may be iterating keys outside GVL,
+ * and existing threads reference frame IDs via their thread_data). */
+static void
+rperf_clear_aggregated_data(rperf_profiler_t *prof)
+{
+    /* Clear agg_table entries (keep allocation) */
+    memset(prof->agg_table.buckets, 0,
+           prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t));
+    prof->agg_table.count = 0;
+    prof->agg_table.stack_pool_count = 0;
+
+    /* Reset stats */
+    prof->stats.trigger_count = 0;
+    prof->stats.sampling_count = 0;
+    prof->stats.sampling_total_ns = 0;
+
+    /* Reset start timestamps so next snapshot's duration_ns covers
+     * only the period since this clear. */
+    clock_gettime(CLOCK_REALTIME, &prof->start_realtime);
+    clock_gettime(CLOCK_MONOTONIC, &prof->start_monotonic);
+}
+
+static VALUE
+rb_rperf_snapshot(VALUE self, VALUE vclear)
+{
+    VALUE result;
+
+    if (!g_profiler.running) {
+        return Qnil;
+    }
+
+    if (!g_profiler.aggregate) {
+        rb_raise(rb_eRuntimeError, "snapshot requires aggregate mode (aggregate: true)");
+    }
+
+    /* GVL is held → no postponed jobs fire → no new samples written.
+     * Lock worker_mutex to pause worker thread's aggregation. */
+    CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
+    rperf_flush_buffers(&g_profiler);
+
+    /* Build result while mutex is held. If clear is requested, we must
+     * also clear under the same lock to avoid a window where the worker
+     * could aggregate into the table between build and clear. */
+    result = rperf_build_aggregated_result(&g_profiler);
+
+    if (RTEST(vclear)) {
+        rperf_clear_aggregated_data(&g_profiler);
+    }
+
+    CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
+
+    return result;
+}
+
+/* ---- Label API ---- */
+
+/* _c_set_label(label_set_id) — set current thread's label_set_id.
+ * Called from Ruby with GVL held. */
+static VALUE
+rb_rperf_set_label(VALUE self, VALUE vid)
+{
+    if (!g_profiler.running) return vid;
+
+    int label_set_id = NUM2INT(vid);
+    VALUE thread = rb_thread_current();
+    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
+    if (td == NULL) {
+        td = rperf_thread_data_create(&g_profiler, thread);
+        if (!td) rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
+    }
+    td->label_set_id = label_set_id;
+    return vid;
+}
+
+/* _c_get_label() — get current thread's label_set_id.
+ * Returns 0 if not profiling or thread not yet seen. */
+static VALUE
+rb_rperf_get_label(VALUE self)
+{
+    if (!g_profiler.running) return INT2FIX(0);
+
+    VALUE thread = rb_thread_current();
+    rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
+    if (td == NULL) return INT2FIX(0);
+    return INT2NUM(td->label_set_id);
+}
+
+/* _c_set_label_sets(ary) — store label_sets Ruby Array for result building */
+static VALUE
+rb_rperf_set_label_sets(VALUE self, VALUE ary)
+{
+    g_profiler.label_sets = ary;
+    return ary;
+}
+
+/* _c_get_label_sets() — get label_sets Ruby Array */
+static VALUE
+rb_rperf_get_label_sets(VALUE self)
+{
+    return g_profiler.label_sets;
+}
+
+/* ---- Profile refcount API (timer pause/resume) ---- */
+
+/* Helper: arm the timer with the configured interval */
+static void
+rperf_arm_timer(rperf_profiler_t *prof)
+{
+#if RPERF_USE_TIMER_SIGNAL
+    if (prof->timer_signal > 0) {
+        struct itimerspec its;
+        its.it_value.tv_sec = 0;
+        its.it_value.tv_nsec = 1000000000L / prof->frequency;
+        its.it_interval = its.it_value;
+        timer_settime(prof->timer_id, 0, &its, NULL);
+        return;
+    }
+#endif
+    /* nanosleep mode: signal the worker to wake from cond_wait */
+    CHECKED(pthread_mutex_lock(&prof->worker_mutex));
+    CHECKED(pthread_cond_signal(&prof->worker_cond));
+    CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
+}
+
+/* Helper: disarm the timer (stop firing) */
+static void
+rperf_disarm_timer(rperf_profiler_t *prof)
+{
+#if RPERF_USE_TIMER_SIGNAL
+    if (prof->timer_signal > 0) {
+        struct itimerspec its;
+        memset(&its, 0, sizeof(its));
+        timer_settime(prof->timer_id, 0, &its, NULL);
+        return;
+    }
+#endif
+    /* nanosleep mode: worker will see RPERF_PAUSED on next iteration */
+}
+
+/* Helper: reset prev_time_ns for all threads (called on resume to avoid
+ * inflated weight from pause duration). Must be called with GVL held. */
+static void
+rperf_reset_thread_times(rperf_profiler_t *prof)
+{
+    VALUE threads = rb_funcall(rb_cThread, rb_intern("list"), 0);
+    long tc = RARRAY_LEN(threads);
+    for (long i = 0; i < tc; i++) {
+        VALUE thread = RARRAY_AREF(threads, i);
+        rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
+        if (td) {
+            td->prev_time_ns = rperf_current_time_ns(prof, td);
+            td->prev_wall_ns = rperf_wall_time_ns();
+        }
+    }
+}
+
+/* _c_profile_inc() — increment profile refcount; resume timer on 0→1.
+ * Called with GVL held. */
+static VALUE
+rb_rperf_profile_inc(VALUE self)
+{
+    if (!g_profiler.running) return Qfalse;
+    g_profiler.profile_refcount++;
+    if (g_profiler.profile_refcount == 1) {
+        rperf_reset_thread_times(&g_profiler);
+        rperf_arm_timer(&g_profiler);
+    }
+    return Qtrue;
+}
+
+/* _c_profile_dec() — decrement profile refcount; pause timer on 1→0.
+ * Called with GVL held. */
+static VALUE
+rb_rperf_profile_dec(VALUE self)
+{
+    if (!g_profiler.running) return Qfalse;
+    g_profiler.profile_refcount--;
+    if (g_profiler.profile_refcount == 0) {
+        rperf_disarm_timer(&g_profiler);
+    }
+    return Qtrue;
+}
+
+/* _c_running?() — check if profiler is running. */
+static VALUE
+rb_rperf_running_p(VALUE self)
+{
+    return g_profiler.running ? Qtrue : Qfalse;
+}
+
 /* ---- Fork safety ---- */
 
 static void
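
On the consumer side, every emitted sample now carries a label_set_id that indexes into the label_sets Array included in the result hash, with index 0 reserved for "no labels" (per the struct comment earlier in this diff). A sketch of that lookup as extension-style C; the helper name is illustrative, not part of the gem:

    #include <ruby/ruby.h>

    /* Map a sample's label_set_id back to its Hash of labels (or Qnil). */
    static VALUE
    labels_for_sample(VALUE label_sets, int label_set_id)
    {
        if (NIL_P(label_sets) || label_set_id == 0) return Qnil;
        return rb_ary_entry(label_sets, label_set_id);
    }
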
@@ -1448,6 +1723,7 @@ rperf_after_fork_child(void)
     /* Reset stats */
     g_profiler.stats.sampling_count = 0;
     g_profiler.stats.sampling_total_ns = 0;
+    g_profiler.profile_refcount = 0;
     atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
 }
 
@@ -1457,10 +1733,19 @@ void
 Init_rperf(void)
 {
     VALUE mRperf = rb_define_module("Rperf");
-    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
+    rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 5);
     rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
+    rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
+    rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
+    rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
+    rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
+    rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
+    rb_define_module_function(mRperf, "_c_profile_inc", rb_rperf_profile_inc, 0);
+    rb_define_module_function(mRperf, "_c_profile_dec", rb_rperf_profile_dec, 0);
+    rb_define_module_function(mRperf, "_c_running?", rb_rperf_running_p, 0);
 
     memset(&g_profiler, 0, sizeof(g_profiler));
+    g_profiler.label_sets = Qnil;
     g_profiler.pj_handle = rb_postponed_job_preregister(0, rperf_sample_job, &g_profiler);
     g_profiler.ts_key = rb_internal_thread_specific_key_create();
 