rperf 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a5e10797e7670051bb82e49f32a80bac5371c9bd7652809ece4894a7d508c4bf
4
- data.tar.gz: b577d93730398a5b91ab89df80e0cec422300839ec3d9879043b711285d4c4c2
3
+ metadata.gz: ab923fe1fc0a0d6928941271cdffc979012af73d6d0bd0aa5c5d43a95e9451c2
4
+ data.tar.gz: 74a0200ec71ae3743d2b99d578df0b484d23dea57285385209e23b0748a95564
5
5
  SHA512:
6
- metadata.gz: 2b5eb6e2125e2155937af009e084b43ff4ea4a5599a4b9d2015f3d6cd13a86f6644ecf05b58a383867853bb87e2017a4097d0f9c34662622dfaafba21efdd98c
7
- data.tar.gz: e3585af44f4cfbb5bace10a7ea127d801035006123a45a54aa1c2b095aeb93f8b98a41041f39cda62a5bda7e16ce4ee5acb6a88290224db5b3162e08c969f6da
6
+ metadata.gz: b2d95c3e58fd883efebfcad8506a5249dee8c7322fb53e75a25afcd5050bbb1885fb620eef18a86e74fa54cb542ba83b1761f13630746862c619477e022b09db
7
+ data.tar.gz: ee4236170102e0be1cd13749389a29679ba7361c4c77db98ed708e9b509e8e1c28ba3b47a2d8a4b704ada2fe5a11859bc61c0476d8f061fbd43703168232f5f6
data/docs/help.md CHANGED
@@ -19,22 +19,27 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
19
19
  -f, --frequency HZ Sampling frequency in Hz (default: 1000)
20
20
  -m, --mode MODE cpu or wall (default: cpu)
21
21
  --format FORMAT pprof, collapsed, or text (default: auto from extension)
22
+ -p, --print Print text profile to stdout
23
+ (same as --format=text --output=/dev/stdout)
22
24
  --signal VALUE Timer signal (Linux only): signal number, or 'false'
23
25
  for nanosleep thread (default: auto)
24
26
  -v, --verbose Print sampling statistics to stderr
25
27
 
26
28
  ### stat: Run command and print performance summary to stderr.
27
29
 
28
- Always uses wall mode. No file output by default.
30
+ Uses wall mode by default. No file output by default.
29
31
 
30
32
  -o, --output PATH Also save profile to file (default: none)
31
33
  -f, --frequency HZ Sampling frequency in Hz (default: 1000)
34
+ -m, --mode MODE cpu or wall (default: wall)
35
+ --report Include flat/cumulative profile tables in output
32
36
  --signal VALUE Timer signal (Linux only): signal number, or 'false'
33
37
  for nanosleep thread (default: auto)
34
38
  -v, --verbose Print additional sampling statistics
35
39
 
36
40
  Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
37
- GVL wait, GC marking, GC sweeping), and top 5 hot functions.
41
+ GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
42
+ Use --report to add flat and cumulative top-50 function tables.
38
43
 
39
44
  ### report: Open pprof profile with go tool pprof. Requires Go.
40
45
 
@@ -58,7 +63,9 @@ Default (no flag): opens diff in browser.
58
63
  rperf record -m wall -f 500 -o profile.pb.gz ruby server.rb
59
64
  rperf record -o profile.collapsed ruby app.rb
60
65
  rperf record -o profile.txt ruby app.rb
66
+ rperf record -p ruby app.rb
61
67
  rperf stat ruby app.rb
68
+ rperf stat --report ruby app.rb
62
69
  rperf stat -o profile.pb.gz ruby app.rb
63
70
  rperf report
64
71
  rperf report --top profile.pb.gz
@@ -168,14 +175,14 @@ Example output:
168
175
  Total: 1523.4ms (cpu)
169
176
  Samples: 4820, Frequency: 500Hz
170
177
 
171
- Flat:
172
- 820.3ms 53.8% Array#each (app/models/user.rb)
173
- 312.1ms 20.5% JSON.parse (lib/json/parser.rb)
174
- ...
178
+ Flat:
179
+ 820.3 ms 53.8% Array#each (app/models/user.rb)
180
+ 312.1 ms 20.5% JSON.parse (lib/json/parser.rb)
181
+ ...
175
182
 
176
- Cumulative:
177
- 1401.2ms 92.0% UsersController#index (app/controllers/users_controller.rb)
178
- ...
183
+ Cumulative:
184
+ 1,401.2 ms 92.0% UsersController#index (app/controllers/users_controller.rb)
185
+ ...
179
186
 
180
187
  ### Format auto-detection
181
188
 
@@ -281,6 +288,7 @@ Used internally by the CLI to pass options to the auto-started profiler:
281
288
  RPERF_VERBOSE=1 Print statistics
282
289
  RPERF_SIGNAL=N|false Timer signal number or 'false' for nanosleep (Linux only)
283
290
  RPERF_STAT=1 Enable stat mode (used by rperf stat)
291
+ RPERF_STAT_REPORT=1 Include profile tables in stat output
284
292
 
285
293
  ## TIPS
286
294
 
data/exe/rperf CHANGED
@@ -134,9 +134,14 @@ mode = (subcommand == "stat") ? "wall" : "cpu"
134
134
  format = nil
135
135
  signal = nil
136
136
  verbose = false
137
+ aggregate = true
138
+ stat_report = false
137
139
 
138
140
  parser = OptionParser.new do |opts|
139
- opts.banner = USAGE
141
+ opts.banner = case subcommand
142
+ when "record" then "Usage: rperf record [options] command [args...]"
143
+ when "stat" then "Usage: rperf stat [options] command [args...]"
144
+ end
140
145
 
141
146
  opts.on("-o", "--output PATH", "Output file#{subcommand == 'stat' ? ' (default: none)' : ' (default: rperf.data)'}") do |v|
142
147
  output = v
@@ -146,21 +151,37 @@ parser = OptionParser.new do |opts|
146
151
  frequency = v
147
152
  end
148
153
 
149
- if subcommand == "record"
150
- opts.on("-m", "--mode MODE", %w[cpu wall], "Profiling mode: cpu or wall (default: cpu)") do |v|
151
- mode = v
152
- end
154
+ default_mode = (subcommand == "stat") ? "wall" : "cpu"
155
+ opts.on("-m", "--mode MODE", %w[cpu wall], "Profiling mode: cpu or wall (default: #{default_mode})") do |v|
156
+ mode = v
157
+ end
153
158
 
159
+ if subcommand == "record"
154
160
  opts.on("--format FORMAT", %w[pprof collapsed text],
155
161
  "Output format: pprof, collapsed, or text (default: auto from extension)") do |v|
156
162
  format = v
157
163
  end
164
+
165
+ opts.on("-p", "--print", "Print text profile to stdout (same as --format=text --output=/dev/stdout)") do
166
+ format = "text"
167
+ output = "/dev/stdout"
168
+ end
158
169
  end
159
170
 
160
171
  opts.on("--signal VALUE", "Timer signal (Linux only): signal number, or 'false' for nanosleep thread") do |v|
161
172
  signal = (v == "false") ? "false" : v
162
173
  end
163
174
 
175
+ opts.on("--no-aggregate", "Disable sample aggregation (keep raw samples)") do
176
+ aggregate = false
177
+ end
178
+
179
+ if subcommand == "stat"
180
+ opts.on("--report", "Include flat/cumulative profile tables in output") do
181
+ stat_report = true
182
+ end
183
+ end
184
+
164
185
  opts.on("-v", "--verbose", "Print sampling statistics to stderr") do
165
186
  verbose = true
166
187
  end
@@ -198,10 +219,12 @@ ENV["RPERF_MODE"] = mode
198
219
  ENV["RPERF_FORMAT"] = format if format
199
220
  ENV["RPERF_VERBOSE"] = "1" if verbose
200
221
  ENV["RPERF_SIGNAL"] = signal if signal
222
+ ENV["RPERF_AGGREGATE"] = "0" unless aggregate
201
223
 
202
224
  if subcommand == "stat"
203
225
  ENV["RPERF_STAT"] = "1"
204
226
  ENV["RPERF_STAT_COMMAND"] = ARGV.join(" ")
227
+ ENV["RPERF_STAT_REPORT"] = "1" if stat_report
205
228
  end
206
229
 
207
230
  exec(*ARGV)
data/ext/rperf/rperf.c CHANGED
@@ -7,6 +7,13 @@
7
7
  #include <stdlib.h>
8
8
  #include <unistd.h>
9
9
  #include <signal.h>
10
+ #include <assert.h>
11
+ #ifdef __linux__
12
+ #include <sys/syscall.h>
13
+ #endif
14
+
15
+ /* Checked pthread wrappers — assert on unexpected errors */
16
+ #define CHECKED(call) do { int _r = (call); assert(_r == 0 && #call); (void)_r; } while (0)
10
17
 
11
18
  #ifdef __linux__
12
19
  #define RPERF_USE_TIMER_SIGNAL 1
@@ -16,8 +23,19 @@
16
23
  #endif
17
24
 
18
25
  #define RPERF_MAX_STACK_DEPTH 512
19
- #define RPERF_INITIAL_SAMPLES 1024
26
+ #define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
20
27
  #define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
28
+ #define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
29
+ #define RPERF_FRAME_TABLE_INITIAL 65536 /* pre-allocate to avoid realloc race with GC dmark */
30
+ #define RPERF_AGG_TABLE_INITIAL 1024
31
+ #define RPERF_STACK_POOL_INITIAL 4096
32
+
33
+ /* Synthetic frame IDs (reserved in frame_table, 0-based) */
34
+ #define RPERF_SYNTHETIC_GVL_BLOCKED 0
35
+ #define RPERF_SYNTHETIC_GVL_WAIT 1
36
+ #define RPERF_SYNTHETIC_GC_MARKING 2
37
+ #define RPERF_SYNTHETIC_GC_SWEEPING 3
38
+ #define RPERF_SYNTHETIC_COUNT 4
21
39
 
22
40
  /* ---- Data structures ---- */
23
41
 
@@ -43,6 +61,51 @@ typedef struct rperf_sample {
43
61
  int thread_seq; /* thread sequence number (1-based) */
44
62
  } rperf_sample_t;
45
63
 
64
+ /* ---- Sample buffer (double-buffered) ---- */
65
+
66
+ typedef struct rperf_sample_buffer {
67
+ rperf_sample_t *samples;
68
+ size_t sample_count;
69
+ size_t sample_capacity;
70
+ VALUE *frame_pool;
71
+ size_t frame_pool_count;
72
+ size_t frame_pool_capacity;
73
+ } rperf_sample_buffer_t;
74
+
75
+ /* ---- Frame table: VALUE → uint32_t frame_id ---- */
76
+
77
+ #define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
78
+
79
+ typedef struct rperf_frame_table {
80
+ VALUE *keys; /* unique VALUE array (GC mark target) */
81
+ size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
82
+ size_t capacity;
83
+ uint32_t *buckets; /* open addressing: stores index into keys[] */
84
+ size_t bucket_capacity;
85
+ } rperf_frame_table_t;
86
+
87
+ /* ---- Aggregation table: stack → weight ---- */
88
+
89
+ #define RPERF_AGG_ENTRY_EMPTY 0
90
+
91
/* One aggregated stack: a run of frame_ids in the table's stack_pool plus
 * the weight accumulated for every sample that had exactly that stack. */
typedef struct rperf_agg_entry {
    uint32_t frame_start; /* index of the first frame_id in stack_pool */
    int depth;            /* number of frame_ids, counting the synthetic frame if any */
    int thread_seq;       /* thread sequence number the stack belongs to */
    int64_t weight;       /* sum of the merged samples' weights */
    uint32_t hash;        /* hash of (frame_ids, thread_seq), kept for rehashing and fast compare */
    int used;             /* 1 when the slot holds an entry, 0 when it is free */
} rperf_agg_entry_t;
99
+
100
+ typedef struct rperf_agg_table {
101
+ rperf_agg_entry_t *buckets;
102
+ size_t bucket_capacity;
103
+ size_t count;
104
+ uint32_t *stack_pool; /* frame_id sequences stored contiguously */
105
+ size_t stack_pool_count;
106
+ size_t stack_pool_capacity;
107
+ } rperf_agg_table_t;
108
+
46
109
  typedef struct rperf_thread_data {
47
110
  int64_t prev_cpu_ns;
48
111
  int64_t prev_wall_ns;
@@ -58,18 +121,23 @@ typedef struct rperf_profiler {
58
121
  int frequency;
59
122
  int mode; /* 0 = cpu, 1 = wall */
60
123
  volatile int running;
61
- pthread_t timer_thread;
124
+ pthread_t worker_thread; /* combined timer + aggregation */
62
125
  #if RPERF_USE_TIMER_SIGNAL
63
126
  timer_t timer_id;
64
127
  int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
128
+ volatile pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
65
129
  #endif
66
130
  rb_postponed_job_handle_t pj_handle;
67
- rperf_sample_t *samples;
68
- size_t sample_count;
69
- size_t sample_capacity;
70
- VALUE *frame_pool; /* raw frame VALUEs from rb_profile_frames */
71
- size_t frame_pool_count;
72
- size_t frame_pool_capacity;
131
+ int aggregate; /* 1 = aggregate samples, 0 = raw */
132
+ /* Double-buffered sample storage (only buffers[0] used when !aggregate) */
133
+ rperf_sample_buffer_t buffers[2];
134
+ int active_idx; /* 0 or 1 */
135
+ /* Aggregation (only used when aggregate=1) */
136
+ rperf_frame_table_t frame_table;
137
+ rperf_agg_table_t agg_table;
138
+ volatile int swap_ready; /* 1 = standby buffer ready for aggregation */
139
+ pthread_mutex_t worker_mutex;
140
+ pthread_cond_t worker_cond;
73
141
  rb_internal_thread_specific_key_t ts_key;
74
142
  rb_internal_thread_event_hook_t *thread_hook;
75
143
  /* GC tracking */
@@ -98,8 +166,19 @@ static void
98
166
  rperf_profiler_mark(void *ptr)
99
167
  {
100
168
  rperf_profiler_t *prof = (rperf_profiler_t *)ptr;
101
- if (prof->frame_pool && prof->frame_pool_count > 0) {
102
- rb_gc_mark_locations(prof->frame_pool, prof->frame_pool + prof->frame_pool_count);
169
+ int i;
170
+ /* Mark both sample buffers' frame_pools */
171
+ for (i = 0; i < 2; i++) {
172
+ rperf_sample_buffer_t *buf = &prof->buffers[i];
173
+ if (buf->frame_pool && buf->frame_pool_count > 0) {
174
+ rb_gc_mark_locations(buf->frame_pool,
175
+ buf->frame_pool + buf->frame_pool_count);
176
+ }
177
+ }
178
+ /* Mark frame_table keys (unique frame VALUEs) */
179
+ if (prof->frame_table.keys && prof->frame_table.count > 0) {
180
+ rb_gc_mark_locations(prof->frame_table.keys + RPERF_SYNTHETIC_COUNT,
181
+ prof->frame_table.keys + prof->frame_table.count);
103
182
  }
104
183
  }
105
184
 
@@ -146,18 +225,45 @@ rperf_current_time_ns(rperf_profiler_t *prof, rperf_thread_data_t *td)
146
225
 
147
226
  /* ---- Sample buffer ---- */
148
227
 
228
+ static int
229
+ rperf_sample_buffer_init(rperf_sample_buffer_t *buf)
230
+ {
231
+ buf->sample_count = 0;
232
+ buf->sample_capacity = RPERF_INITIAL_SAMPLES;
233
+ buf->samples = (rperf_sample_t *)calloc(buf->sample_capacity, sizeof(rperf_sample_t));
234
+ if (!buf->samples) return -1;
235
+
236
+ buf->frame_pool_count = 0;
237
+ buf->frame_pool_capacity = RPERF_INITIAL_FRAME_POOL;
238
+ buf->frame_pool = (VALUE *)calloc(buf->frame_pool_capacity, sizeof(VALUE));
239
+ if (!buf->frame_pool) {
240
+ free(buf->samples);
241
+ buf->samples = NULL;
242
+ return -1;
243
+ }
244
+ return 0;
245
+ }
246
+
247
+ static void
248
+ rperf_sample_buffer_free(rperf_sample_buffer_t *buf)
249
+ {
250
+ free(buf->samples);
251
+ free(buf->frame_pool);
252
+ memset(buf, 0, sizeof(*buf));
253
+ }
254
+
149
255
  /* Returns 0 on success, -1 on allocation failure */
150
256
  static int
151
- rperf_ensure_sample_capacity(rperf_profiler_t *prof)
257
+ rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
152
258
  {
153
- if (prof->sample_count >= prof->sample_capacity) {
154
- size_t new_cap = prof->sample_capacity * 2;
259
+ if (buf->sample_count >= buf->sample_capacity) {
260
+ size_t new_cap = buf->sample_capacity * 2;
155
261
  rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
156
- prof->samples,
262
+ buf->samples,
157
263
  new_cap * sizeof(rperf_sample_t));
158
264
  if (!new_samples) return -1;
159
- prof->samples = new_samples;
160
- prof->sample_capacity = new_cap;
265
+ buf->samples = new_samples;
266
+ buf->sample_capacity = new_cap;
161
267
  }
162
268
  return 0;
163
269
  }
@@ -166,36 +272,310 @@ rperf_ensure_sample_capacity(rperf_profiler_t *prof)
166
272
 
167
273
  /* Ensure frame_pool has room for `needed` more entries. Returns 0 on success. */
168
274
  static int
169
- rperf_ensure_frame_pool_capacity(rperf_profiler_t *prof, int needed)
275
+ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
170
276
  {
171
- while (prof->frame_pool_count + (size_t)needed > prof->frame_pool_capacity) {
172
- size_t new_cap = prof->frame_pool_capacity * 2;
277
+ while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
278
+ size_t new_cap = buf->frame_pool_capacity * 2;
173
279
  VALUE *new_pool = (VALUE *)realloc(
174
- prof->frame_pool,
280
+ buf->frame_pool,
175
281
  new_cap * sizeof(VALUE));
176
282
  if (!new_pool) return -1;
177
- prof->frame_pool = new_pool;
178
- prof->frame_pool_capacity = new_cap;
283
+ buf->frame_pool = new_pool;
284
+ buf->frame_pool_capacity = new_cap;
285
+ }
286
+ return 0;
287
+ }
288
+
289
+ /* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
290
+
291
+ static void
292
+ rperf_frame_table_init(rperf_frame_table_t *ft)
293
+ {
294
+ ft->capacity = RPERF_FRAME_TABLE_INITIAL;
295
+ ft->keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
296
+ ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
297
+ ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
298
+ ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
299
+ memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
300
+ }
301
+
302
+ static void
303
+ rperf_frame_table_free(rperf_frame_table_t *ft)
304
+ {
305
+ free(ft->keys);
306
+ free(ft->buckets);
307
+ memset(ft, 0, sizeof(*ft));
308
+ }
309
+
310
+ static void
311
+ rperf_frame_table_rehash(rperf_frame_table_t *ft)
312
+ {
313
+ size_t new_cap = ft->bucket_capacity * 2;
314
+ uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
315
+ memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
316
+
317
+ size_t i;
318
+ for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
319
+ uint32_t h = (uint32_t)(ft->keys[i] >> 3); /* shift out tag bits */
320
+ size_t idx = h % new_cap;
321
+ while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
322
+ idx = (idx + 1) % new_cap;
323
+ new_buckets[idx] = (uint32_t)i;
324
+ }
325
+
326
+ free(ft->buckets);
327
+ ft->buckets = new_buckets;
328
+ ft->bucket_capacity = new_cap;
329
+ }
330
+
331
+ /* Returns frame_id for the given VALUE, inserting if new */
332
+ static uint32_t
333
+ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
334
+ {
335
+ uint32_t h = (uint32_t)(fval >> 3);
336
+ size_t idx = h % ft->bucket_capacity;
337
+
338
+ while (1) {
339
+ uint32_t slot = ft->buckets[idx];
340
+ if (slot == RPERF_FRAME_TABLE_EMPTY) break;
341
+ if (ft->keys[slot] == fval) return slot;
342
+ idx = (idx + 1) % ft->bucket_capacity;
343
+ }
344
+
345
+ /* Insert new entry.
346
+ * keys array is pre-allocated and never realloc'd to avoid race with GC dmark.
347
+ * If capacity is exhausted, return EMPTY to signal aggregation should stop. */
348
+ if (ft->count >= ft->capacity) {
349
+ return RPERF_FRAME_TABLE_EMPTY;
350
+ }
351
+
352
+ uint32_t frame_id = (uint32_t)ft->count;
353
+ ft->keys[frame_id] = fval;
354
+ /* Store fence: ensure keys[frame_id] is visible before count is incremented,
355
+ * so GC dmark never reads uninitialized keys[count-1]. */
356
+ __atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
357
+ ft->buckets[idx] = frame_id;
358
+
359
+ /* Rehash if load factor > 0.7 */
360
+ if (ft->count * 10 > ft->bucket_capacity * 7) {
361
+ rperf_frame_table_rehash(ft);
362
+ }
363
+
364
+ return frame_id;
365
+ }
366
+
367
+ /* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
368
+
369
/* FNV-1a style 32-bit hash over `len` uint32_t words followed by
 * `thread_seq`. Each input is XORed into the state as a whole 32-bit word
 * and the state is then multiplied by the FNV prime.
 * `data` is not dereferenced when `len` is 0 or negative. */
static uint32_t
rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
{
    const uint32_t prime = 16777619u;
    uint32_t acc = 2166136261u; /* FNV offset basis */
    int pos = 0;

    while (pos < len) {
        acc = (acc ^ data[pos]) * prime;
        pos++;
    }
    return (acc ^ (uint32_t)thread_seq) * prime;
}
382
+
383
+ static void
384
+ rperf_agg_table_init(rperf_agg_table_t *at)
385
+ {
386
+ at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
387
+ at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
388
+ at->count = 0;
389
+ at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
390
+ at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
391
+ at->stack_pool_count = 0;
392
+ }
393
+
394
+ static void
395
+ rperf_agg_table_free(rperf_agg_table_t *at)
396
+ {
397
+ free(at->buckets);
398
+ free(at->stack_pool);
399
+ memset(at, 0, sizeof(*at));
400
+ }
401
+
402
+ static void
403
+ rperf_agg_table_rehash(rperf_agg_table_t *at)
404
+ {
405
+ size_t new_cap = at->bucket_capacity * 2;
406
+ rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
407
+
408
+ size_t i;
409
+ for (i = 0; i < at->bucket_capacity; i++) {
410
+ if (!at->buckets[i].used) continue;
411
+ rperf_agg_entry_t *e = &at->buckets[i];
412
+ size_t idx = e->hash % new_cap;
413
+ while (new_buckets[idx].used)
414
+ idx = (idx + 1) % new_cap;
415
+ new_buckets[idx] = *e;
416
+ }
417
+
418
+ free(at->buckets);
419
+ at->buckets = new_buckets;
420
+ at->bucket_capacity = new_cap;
421
+ }
422
+
423
+ /* Ensure stack_pool has room for `needed` more entries */
424
+ static int
425
+ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
426
+ {
427
+ while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
428
+ size_t new_cap = at->stack_pool_capacity * 2;
429
+ uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
430
+ new_cap * sizeof(uint32_t));
431
+ if (!new_pool) return -1;
432
+ at->stack_pool = new_pool;
433
+ at->stack_pool_capacity = new_cap;
179
434
  }
180
435
  return 0;
181
436
  }
182
437
 
438
+ /* Insert or merge a stack into the aggregation table */
439
+ static void
440
+ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
441
+ int depth, int thread_seq, int64_t weight, uint32_t hash)
442
+ {
443
+ size_t idx = hash % at->bucket_capacity;
444
+
445
+ while (1) {
446
+ rperf_agg_entry_t *e = &at->buckets[idx];
447
+ if (!e->used) break;
448
+ if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
449
+ memcmp(at->stack_pool + e->frame_start, frame_ids,
450
+ depth * sizeof(uint32_t)) == 0) {
451
+ /* Match — merge weight */
452
+ e->weight += weight;
453
+ return;
454
+ }
455
+ idx = (idx + 1) % at->bucket_capacity;
456
+ }
457
+
458
+ /* New entry — append frame_ids to stack_pool */
459
+ if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
460
+
461
+ rperf_agg_entry_t *e = &at->buckets[idx];
462
+ e->frame_start = (uint32_t)at->stack_pool_count;
463
+ e->depth = depth;
464
+ e->thread_seq = thread_seq;
465
+ e->weight = weight;
466
+ e->hash = hash;
467
+ e->used = 1;
468
+
469
+ memcpy(at->stack_pool + at->stack_pool_count, frame_ids,
470
+ depth * sizeof(uint32_t));
471
+ at->stack_pool_count += depth;
472
+ at->count++;
473
+
474
+ /* Rehash if load factor > 0.7 */
475
+ if (at->count * 10 > at->bucket_capacity * 7) {
476
+ rperf_agg_table_rehash(at);
477
+ }
478
+ }
479
+
480
+ /* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
481
+
482
+ static void
483
+ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
484
+ {
485
+ size_t i;
486
+ uint32_t temp_ids[RPERF_MAX_STACK_DEPTH + 1];
487
+
488
+ for (i = 0; i < buf->sample_count; i++) {
489
+ rperf_sample_t *s = &buf->samples[i];
490
+ int off = 0;
491
+ uint32_t hash;
492
+ int j;
493
+
494
+ /* Prepend synthetic frame if needed */
495
+ if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
496
+ temp_ids[off++] = RPERF_SYNTHETIC_GVL_BLOCKED;
497
+ } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
498
+ temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
499
+ } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
500
+ temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
501
+ } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
502
+ temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
503
+ }
504
+
505
+ /* Convert VALUE frames to frame_ids */
506
+ int overflow = 0;
507
+ for (j = 0; j < s->depth; j++) {
508
+ VALUE fval = buf->frame_pool[s->frame_start + j];
509
+ uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
510
+ if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
511
+ temp_ids[off + j] = fid;
512
+ }
513
+ if (overflow) break; /* frame_table full, stop aggregating this buffer */
514
+
515
+ int total_depth = off + s->depth;
516
+ hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
517
+
518
+ rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
519
+ s->thread_seq, s->weight, hash);
520
+ }
521
+
522
+ /* Reset buffer for reuse.
523
+ * Release fence: ensure all frame_table inserts are visible (to GC dmark)
524
+ * before frame_pool_count is cleared, so dmark always has at least one
525
+ * source (frame_table or frame_pool) covering each VALUE. */
526
+ __atomic_thread_fence(__ATOMIC_RELEASE);
527
+ buf->sample_count = 0;
528
+ buf->frame_pool_count = 0;
529
+ }
530
+
531
+ /* ---- Aggregation thread ---- */
532
+
533
+ /* Try to aggregate the standby buffer if swap_ready is set.
534
+ * Called from worker thread (with or without worker_mutex held). */
535
+ static void
536
+ rperf_try_aggregate(rperf_profiler_t *prof)
537
+ {
538
+ if (!prof->aggregate || !prof->swap_ready) return;
539
+ int standby_idx = prof->active_idx ^ 1;
540
+ rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
541
+ prof->swap_ready = 0;
542
+ }
543
+
183
544
  /* ---- Record a sample ---- */
184
545
 
546
+ static void
547
+ rperf_try_swap(rperf_profiler_t *prof)
548
+ {
549
+ if (!prof->aggregate) return;
550
+ rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
551
+ if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
552
+ if (prof->swap_ready) return; /* standby still being aggregated */
553
+
554
+ /* Swap active buffer */
555
+ prof->active_idx ^= 1;
556
+ prof->swap_ready = 1;
557
+
558
+ /* Wake worker thread */
559
+ CHECKED(pthread_cond_signal(&prof->worker_cond));
560
+ }
561
+
185
562
  static void
186
563
  rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
187
564
  int64_t weight, int type, int thread_seq)
188
565
  {
189
566
  if (weight <= 0) return;
190
- if (rperf_ensure_sample_capacity(prof) < 0) return;
567
+ rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
568
+ if (rperf_ensure_sample_capacity(buf) < 0) return;
191
569
 
192
- rperf_sample_t *sample = &prof->samples[prof->sample_count];
570
+ rperf_sample_t *sample = &buf->samples[buf->sample_count];
193
571
  sample->depth = depth;
194
572
  sample->frame_start = frame_start;
195
573
  sample->weight = weight;
196
574
  sample->type = type;
197
575
  sample->thread_seq = thread_seq;
198
- prof->sample_count++;
576
+ buf->sample_count++;
577
+
578
+ rperf_try_swap(prof);
199
579
  }
200
580
 
201
581
  /* ---- Thread data initialization ---- */
@@ -233,13 +613,14 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
233
613
  int64_t time_now = rperf_current_time_ns(prof, td);
234
614
  if (time_now < 0) return;
235
615
 
236
- /* Capture backtrace into frame_pool */
237
- if (rperf_ensure_frame_pool_capacity(prof, RPERF_MAX_STACK_DEPTH) < 0) return;
238
- size_t frame_start = prof->frame_pool_count;
616
+ /* Capture backtrace into active buffer's frame_pool */
617
+ rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
618
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
619
+ size_t frame_start = buf->frame_pool_count;
239
620
  int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
240
- &prof->frame_pool[frame_start], NULL);
621
+ &buf->frame_pool[frame_start], NULL);
241
622
  if (depth <= 0) return;
242
- prof->frame_pool_count += depth;
623
+ buf->frame_pool_count += depth;
243
624
 
244
625
  /* Record normal sample (skip if first time — no prev_time) */
245
626
  if (!is_first) {
@@ -353,15 +734,16 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
353
734
  /* Capture backtrace and timestamp at GC entry */
354
735
  prof->gc_enter_ns = rperf_wall_time_ns();
355
736
 
356
- if (rperf_ensure_frame_pool_capacity(prof, RPERF_MAX_STACK_DEPTH) < 0) return;
357
- size_t frame_start = prof->frame_pool_count;
737
+ rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
738
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
739
+ size_t frame_start = buf->frame_pool_count;
358
740
  int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
359
- &prof->frame_pool[frame_start], NULL);
741
+ &buf->frame_pool[frame_start], NULL);
360
742
  if (depth <= 0) {
361
743
  prof->gc_frame_depth = 0;
362
744
  return;
363
745
  }
364
- prof->frame_pool_count += depth;
746
+ buf->frame_pool_count += depth;
365
747
  prof->gc_frame_start = frame_start;
366
748
  prof->gc_frame_depth = depth;
367
749
 
@@ -420,13 +802,14 @@ rperf_sample_job(void *arg)
420
802
  if (weight <= 0) return;
421
803
 
422
804
  /* Capture backtrace and record sample */
423
- if (rperf_ensure_frame_pool_capacity(prof, RPERF_MAX_STACK_DEPTH) < 0) return;
805
+ rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
806
+ if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
424
807
 
425
- size_t frame_start = prof->frame_pool_count;
808
+ size_t frame_start = buf->frame_pool_count;
426
809
  int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
427
- &prof->frame_pool[frame_start], NULL);
810
+ &buf->frame_pool[frame_start], NULL);
428
811
  if (depth <= 0) return;
429
- prof->frame_pool_count += depth;
812
+ buf->frame_pool_count += depth;
430
813
 
431
814
  rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
432
815
 
@@ -437,7 +820,7 @@ rperf_sample_job(void *arg)
437
820
  (ts_end.tv_nsec - ts_start.tv_nsec);
438
821
  }
439
822
 
440
- /* ---- Timer ---- */
823
+ /* ---- Worker thread: timer + aggregation ---- */
441
824
 
442
825
  #if RPERF_USE_TIMER_SIGNAL
443
826
  static void
@@ -446,21 +829,65 @@ rperf_signal_handler(int sig)
446
829
  g_profiler.trigger_count++;
447
830
  rb_postponed_job_trigger(g_profiler.pj_handle);
448
831
  }
832
+
833
+ /* Worker thread for signal mode: aggregation only.
834
+ * Timer signals are directed to this thread via SIGEV_THREAD_ID,
835
+ * and handled by the sigaction handler (rperf_signal_handler).
836
+ * This ensures the timer signal does not interrupt other threads. */
837
+ static void *
838
+ rperf_worker_signal_func(void *arg)
839
+ {
840
+ rperf_profiler_t *prof = (rperf_profiler_t *)arg;
841
+
842
+ /* Publish our kernel TID so start() can use it for SIGEV_THREAD_ID */
843
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
844
+ prof->worker_tid = (pid_t)syscall(SYS_gettid);
845
+ CHECKED(pthread_cond_signal(&prof->worker_cond));
846
+
847
+ while (prof->running) {
848
+ CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
849
+ rperf_try_aggregate(prof);
850
+ }
851
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
852
+ return NULL;
853
+ }
449
854
  #endif
450
855
 
856
+ /* Worker thread for nanosleep mode: timer + aggregation.
857
+ * Uses pthread_cond_timedwait with absolute deadline.
858
+ * Timeout → trigger + advance deadline.
859
+ * Signal (swap_ready) → aggregate only, keep same deadline. */
451
860
  static void *
452
- rperf_timer_func(void *arg)
861
+ rperf_worker_nanosleep_func(void *arg)
453
862
  {
454
863
  rperf_profiler_t *prof = (rperf_profiler_t *)arg;
455
- struct timespec interval;
456
- interval.tv_sec = 0;
457
- interval.tv_nsec = 1000000000L / prof->frequency;
864
+ struct timespec deadline;
865
+ long interval_ns = 1000000000L / prof->frequency;
866
+
867
+ clock_gettime(CLOCK_REALTIME, &deadline);
868
+ deadline.tv_nsec += interval_ns;
869
+ if (deadline.tv_nsec >= 1000000000L) {
870
+ deadline.tv_sec++;
871
+ deadline.tv_nsec -= 1000000000L;
872
+ }
458
873
 
874
+ CHECKED(pthread_mutex_lock(&prof->worker_mutex));
459
875
  while (prof->running) {
460
- prof->trigger_count++;
461
- rb_postponed_job_trigger(prof->pj_handle);
462
- nanosleep(&interval, NULL);
876
+ int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
877
+ assert(ret == 0 || ret == ETIMEDOUT);
878
+ if (ret == ETIMEDOUT) {
879
+ prof->trigger_count++;
880
+ rb_postponed_job_trigger(prof->pj_handle);
881
+ /* Advance deadline by interval */
882
+ deadline.tv_nsec += interval_ns;
883
+ if (deadline.tv_nsec >= 1000000000L) {
884
+ deadline.tv_sec++;
885
+ deadline.tv_nsec -= 1000000000L;
886
+ }
887
+ }
888
+ rperf_try_aggregate(prof);
463
889
  }
890
+ CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
464
891
  return NULL;
465
892
  }
466
893
 
@@ -488,12 +915,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
488
915
  VALUE opts;
489
916
  int frequency = 1000;
490
917
  int mode = 0; /* 0 = cpu, 1 = wall */
918
+ int aggregate = 1; /* default: aggregate */
491
919
  #if RPERF_USE_TIMER_SIGNAL
492
920
  int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
493
921
  #endif
494
922
 
495
923
  rb_scan_args(argc, argv, ":", &opts);
496
924
  if (!NIL_P(opts)) {
925
+ VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
926
+ if (!NIL_P(vagg)) {
927
+ aggregate = RTEST(vagg) ? 1 : 0;
928
+ }
497
929
  VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
498
930
  if (!NIL_P(vfreq)) {
499
931
  frequency = NUM2INT(vfreq);
@@ -535,25 +967,35 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
535
967
 
536
968
  g_profiler.frequency = frequency;
537
969
  g_profiler.mode = mode;
538
- g_profiler.sample_count = 0;
970
+ g_profiler.aggregate = aggregate;
539
971
  g_profiler.next_thread_seq = 0;
540
972
  g_profiler.sampling_count = 0;
541
973
  g_profiler.sampling_total_ns = 0;
542
- g_profiler.sample_capacity = RPERF_INITIAL_SAMPLES;
543
- g_profiler.samples = (rperf_sample_t *)calloc(
544
- g_profiler.sample_capacity, sizeof(rperf_sample_t));
545
- if (!g_profiler.samples) {
546
- rb_raise(rb_eNoMemError, "rperf: failed to allocate sample buffer");
974
+ g_profiler.trigger_count = 0;
975
+ g_profiler.active_idx = 0;
976
+ g_profiler.swap_ready = 0;
977
+
978
+ /* Initialize worker mutex/cond */
979
+ CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
980
+ CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
981
+
982
+ /* Initialize sample buffer(s) */
983
+ if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
984
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
985
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
986
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate sample buffer 0");
547
987
  }
988
+ if (aggregate) {
989
+ if (rperf_sample_buffer_init(&g_profiler.buffers[1]) < 0) {
990
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
991
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
992
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
993
+ rb_raise(rb_eNoMemError, "rperf: failed to allocate sample buffer 1");
994
+ }
548
995
 
549
- g_profiler.frame_pool_count = 0;
550
- g_profiler.frame_pool_capacity = RPERF_INITIAL_FRAME_POOL;
551
- g_profiler.frame_pool = (VALUE *)calloc(
552
- g_profiler.frame_pool_capacity, sizeof(VALUE));
553
- if (!g_profiler.frame_pool) {
554
- free(g_profiler.samples);
555
- g_profiler.samples = NULL;
556
- rb_raise(rb_eNoMemError, "rperf: failed to allocate frame pool");
996
+ /* Initialize aggregation structures */
997
+ rperf_frame_table_init(&g_profiler.frame_table);
998
+ rperf_agg_table_init(&g_profiler.agg_table);
557
999
  }
558
1000
 
559
1001
  /* Register GC event hook */
@@ -581,12 +1023,16 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
581
1023
  VALUE cur_thread = rb_thread_current();
582
1024
  rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
583
1025
  if (!td) {
584
- free(g_profiler.samples);
585
- g_profiler.samples = NULL;
586
- free(g_profiler.frame_pool);
587
- g_profiler.frame_pool = NULL;
588
1026
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
589
1027
  g_profiler.thread_hook = NULL;
1028
+ if (g_profiler.aggregate) {
1029
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1030
+ rperf_frame_table_free(&g_profiler.frame_table);
1031
+ rperf_agg_table_free(&g_profiler.agg_table);
1032
+ }
1033
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1034
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1035
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
590
1036
  rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
591
1037
  }
592
1038
  }
@@ -609,12 +1055,32 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
609
1055
  sa.sa_flags = SA_RESTART;
610
1056
  sigaction(g_profiler.timer_signal, &sa, NULL);
611
1057
 
1058
+ /* Start worker thread first to get its kernel TID */
1059
+ g_profiler.worker_tid = 0;
1060
+ if (pthread_create(&g_profiler.worker_thread, NULL,
1061
+ rperf_worker_signal_func, &g_profiler) != 0) {
1062
+ g_profiler.running = 0;
1063
+ signal(g_profiler.timer_signal, SIG_DFL);
1064
+ goto timer_fail;
1065
+ }
1066
+
1067
+ /* Wait for worker thread to publish its TID */
1068
+ CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
1069
+ while (g_profiler.worker_tid == 0) {
1070
+ CHECKED(pthread_cond_wait(&g_profiler.worker_cond, &g_profiler.worker_mutex));
1071
+ }
1072
+ CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
1073
+
1074
+ /* Create timer targeting the worker thread via SIGEV_THREAD_ID */
612
1075
  memset(&sev, 0, sizeof(sev));
613
- sev.sigev_notify = SIGEV_SIGNAL;
1076
+ sev.sigev_notify = SIGEV_THREAD_ID;
614
1077
  sev.sigev_signo = g_profiler.timer_signal;
1078
+ sev._sigev_un._tid = g_profiler.worker_tid;
615
1079
  if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
616
1080
  g_profiler.running = 0;
617
1081
  signal(g_profiler.timer_signal, SIG_DFL);
1082
+ CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1083
+ CHECKED(pthread_join(g_profiler.worker_thread, NULL));
618
1084
  goto timer_fail;
619
1085
  }
620
1086
 
@@ -625,7 +1091,9 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
625
1091
  } else
626
1092
  #endif
627
1093
  {
628
- if (pthread_create(&g_profiler.timer_thread, NULL, rperf_timer_func, &g_profiler) != 0) {
1094
+ /* Start worker thread (timer via timedwait + aggregation) */
1095
+ if (pthread_create(&g_profiler.worker_thread, NULL,
1096
+ rperf_worker_nanosleep_func, &g_profiler) != 0) {
629
1097
  g_profiler.running = 0;
630
1098
  goto timer_fail;
631
1099
  }
@@ -643,10 +1111,14 @@ timer_fail:
643
1111
  }
644
1112
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
645
1113
  g_profiler.thread_hook = NULL;
646
- free(g_profiler.samples);
647
- g_profiler.samples = NULL;
648
- free(g_profiler.frame_pool);
649
- g_profiler.frame_pool = NULL;
1114
+ if (g_profiler.aggregate) {
1115
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1116
+ rperf_frame_table_free(&g_profiler.frame_table);
1117
+ rperf_agg_table_free(&g_profiler.agg_table);
1118
+ }
1119
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1120
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1121
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
650
1122
  rb_raise(rb_eRuntimeError, "rperf: failed to create timer");
651
1123
  }
652
1124
 
@@ -668,12 +1140,15 @@ rb_rperf_stop(VALUE self)
668
1140
  #if RPERF_USE_TIMER_SIGNAL
669
1141
  if (g_profiler.timer_signal > 0) {
670
1142
  timer_delete(g_profiler.timer_id);
671
- signal(g_profiler.timer_signal, SIG_DFL);
672
- } else
673
- #endif
674
- {
675
- pthread_join(g_profiler.timer_thread, NULL);
1143
+ signal(g_profiler.timer_signal, SIG_IGN);
676
1144
  }
1145
+ #endif
1146
+
1147
+ /* Wake and join worker thread */
1148
+ CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
1149
+ CHECKED(pthread_join(g_profiler.worker_thread, NULL));
1150
+ CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
1151
+ CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
677
1152
 
678
1153
  if (g_profiler.thread_hook) {
679
1154
  rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
@@ -683,6 +1158,16 @@ rb_rperf_stop(VALUE self)
683
1158
  /* Remove GC event hook */
684
1159
  rb_remove_event_hook(rperf_gc_event_hook);
685
1160
 
1161
+ if (g_profiler.aggregate) {
1162
+ /* Aggregate remaining samples from both buffers */
1163
+ if (g_profiler.swap_ready) {
1164
+ int standby_idx = g_profiler.active_idx ^ 1;
1165
+ rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
1166
+ g_profiler.swap_ready = 0;
1167
+ }
1168
+ rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
1169
+ }
1170
+
686
1171
  /* Clean up thread-specific data for all live threads */
687
1172
  {
688
1173
  VALUE threads = rb_funcall(rb_cThread, rb_intern("list"), 0);
@@ -713,6 +1198,14 @@ rb_rperf_stop(VALUE self)
713
1198
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
714
1199
  rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
715
1200
 
1201
+ /* aggregation stats */
1202
+ if (g_profiler.aggregate) {
1203
+ rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
1204
+ SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
1205
+ rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
1206
+ SIZET2NUM(g_profiler.agg_table.count));
1207
+ }
1208
+
716
1209
  /* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
717
1210
  {
718
1211
  struct timespec stop_monotonic;
@@ -726,45 +1219,76 @@ rb_rperf_stop(VALUE self)
726
1219
  rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
727
1220
  }
728
1221
 
729
- /* samples: array of [frames_array, weight]
730
- * Each frame is [path_string, label_string]
731
- * GVL blocked/wait samples get synthetic frame prepended (leaf position) */
732
- samples_ary = rb_ary_new_capa((long)g_profiler.sample_count);
733
- for (i = 0; i < g_profiler.sample_count; i++) {
734
- rperf_sample_t *s = &g_profiler.samples[i];
735
- VALUE frames = rb_ary_new_capa(s->depth + 1);
736
-
737
- /* Prepend synthetic frame at leaf position (index 0) */
738
- if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
739
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
740
- rb_ary_push(frames, syn);
741
- } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
742
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
743
- rb_ary_push(frames, syn);
744
- } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
745
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
746
- rb_ary_push(frames, syn);
747
- } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
748
- VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
749
- rb_ary_push(frames, syn);
1222
+ if (g_profiler.aggregate) {
1223
+ /* Build samples from aggregation table.
1224
+ * Use a Ruby array for resolved frames so GC protects them. */
1225
+ rperf_frame_table_t *ft = &g_profiler.frame_table;
1226
+ VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
1227
+ /* Synthetic frames */
1228
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
1229
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
1230
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
1231
+ rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
1232
+ /* Real frames */
1233
+ for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
1234
+ rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
750
1235
  }
751
1236
 
752
- for (j = 0; j < s->depth; j++) {
753
- VALUE fval = g_profiler.frame_pool[s->frame_start + j];
754
- rb_ary_push(frames, rperf_resolve_frame(fval));
1237
+ rperf_agg_table_t *at = &g_profiler.agg_table;
1238
+ samples_ary = rb_ary_new();
1239
+ for (i = 0; i < at->bucket_capacity; i++) {
1240
+ rperf_agg_entry_t *e = &at->buckets[i];
1241
+ if (!e->used) continue;
1242
+
1243
+ VALUE frames = rb_ary_new_capa(e->depth);
1244
+ for (j = 0; j < e->depth; j++) {
1245
+ uint32_t fid = at->stack_pool[e->frame_start + j];
1246
+ rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
1247
+ }
1248
+
1249
+ VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
1250
+ rb_ary_push(samples_ary, sample);
755
1251
  }
756
1252
 
757
- VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
758
- rb_ary_push(samples_ary, sample);
1253
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1254
+ rperf_frame_table_free(&g_profiler.frame_table);
1255
+ rperf_agg_table_free(&g_profiler.agg_table);
1256
+ } else {
1257
+ /* Raw samples path (aggregate: false) */
1258
+ rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
1259
+ samples_ary = rb_ary_new_capa((long)buf->sample_count);
1260
+ for (i = 0; i < buf->sample_count; i++) {
1261
+ rperf_sample_t *s = &buf->samples[i];
1262
+ VALUE frames = rb_ary_new_capa(s->depth + 1);
1263
+
1264
+ /* Prepend synthetic frame at leaf position (index 0) */
1265
+ if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
1266
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
1267
+ rb_ary_push(frames, syn);
1268
+ } else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
1269
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
1270
+ rb_ary_push(frames, syn);
1271
+ } else if (s->type == RPERF_SAMPLE_GC_MARKING) {
1272
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
1273
+ rb_ary_push(frames, syn);
1274
+ } else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
1275
+ VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
1276
+ rb_ary_push(frames, syn);
1277
+ }
1278
+
1279
+ for (j = 0; j < s->depth; j++) {
1280
+ VALUE fval = buf->frame_pool[s->frame_start + j];
1281
+ rb_ary_push(frames, rperf_resolve_frame(fval));
1282
+ }
1283
+
1284
+ VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
1285
+ rb_ary_push(samples_ary, sample);
1286
+ }
759
1287
  }
760
1288
  rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
761
1289
 
762
1290
  /* Cleanup */
763
- free(g_profiler.samples);
764
- g_profiler.samples = NULL;
765
- free(g_profiler.frame_pool);
766
- g_profiler.frame_pool = NULL;
767
- g_profiler.frame_pool_count = 0;
1291
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
768
1292
 
769
1293
  return result;
770
1294
  }
@@ -793,16 +1317,13 @@ rperf_after_fork_child(void)
793
1317
  }
794
1318
  rb_remove_event_hook(rperf_gc_event_hook);
795
1319
 
796
- /* Free sample buffer and frame pool — these hold parent's data */
797
- free(g_profiler.samples);
798
- g_profiler.samples = NULL;
799
- g_profiler.sample_count = 0;
800
- g_profiler.sample_capacity = 0;
801
-
802
- free(g_profiler.frame_pool);
803
- g_profiler.frame_pool = NULL;
804
- g_profiler.frame_pool_count = 0;
805
- g_profiler.frame_pool_capacity = 0;
1320
+ /* Free sample buffers, frame table, and agg table — these hold parent's data */
1321
+ rperf_sample_buffer_free(&g_profiler.buffers[0]);
1322
+ if (g_profiler.aggregate) {
1323
+ rperf_sample_buffer_free(&g_profiler.buffers[1]);
1324
+ rperf_frame_table_free(&g_profiler.frame_table);
1325
+ rperf_agg_table_free(&g_profiler.agg_table);
1326
+ }
806
1327
 
807
1328
  /* Reset GC state */
808
1329
  g_profiler.gc_phase = 0;
@@ -810,6 +1331,7 @@ rperf_after_fork_child(void)
810
1331
  /* Reset stats */
811
1332
  g_profiler.sampling_count = 0;
812
1333
  g_profiler.sampling_total_ns = 0;
1334
+ g_profiler.swap_ready = 0;
813
1335
  }
814
1336
 
815
1337
  /* ---- Init ---- */
@@ -830,5 +1352,5 @@ Init_rperf(void)
830
1352
  rb_gc_register_address(&g_profiler_wrapper);
831
1353
 
832
1354
  /* Fork safety: silently stop profiling in child process */
833
- pthread_atfork(NULL, NULL, rperf_after_fork_child);
1355
+ CHECKED(pthread_atfork(NULL, NULL, rperf_after_fork_child));
834
1356
  }
data/lib/rperf/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rperf
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/rperf.rb CHANGED
@@ -17,21 +17,19 @@ module Rperf
17
17
  @output = nil
18
18
  @stat = false
19
19
  @stat_start_mono = nil
20
- STAT_TOP_N = 5
21
- SYNTHETIC_LABELS = %w[[GVL\ blocked] [GVL\ wait] [GC\ marking] [GC\ sweeping]].freeze
22
20
 
23
21
  # Starts profiling.
24
22
  # format: :pprof, :collapsed, or :text. nil = auto-detect from output extension
25
23
  # .collapsed → collapsed stacks (FlameGraph / speedscope compatible)
26
24
  # .txt → text report (human/AI readable flat + cumulative table)
27
25
  # otherwise (.pb.gz etc) → pprof protobuf (gzip compressed)
28
- def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil)
26
+ def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil, aggregate: true)
29
27
  @verbose = verbose || ENV["RPERF_VERBOSE"] == "1"
30
28
  @output = output
31
29
  @format = format
32
30
  @stat = stat
33
31
  @stat_start_mono = Process.clock_gettime(Process::CLOCK_MONOTONIC) if @stat
34
- c_opts = { frequency: frequency, mode: mode }
32
+ c_opts = { frequency: frequency, mode: mode, aggregate: aggregate }
35
33
  c_opts[:signal] = signal unless signal.nil?
36
34
  _c_start(**c_opts)
37
35
 
@@ -104,7 +102,7 @@ module Rperf
104
102
  def self.print_stats(data)
105
103
  count = data[:sampling_count] || 0
106
104
  total_ns = data[:sampling_time_ns] || 0
107
- samples = data[:samples]&.size || 0
105
+ sample_count = data[:sampling_count] || 0
108
106
  mode = data[:mode] || :cpu
109
107
  frequency = data[:frequency] || 0
110
108
 
@@ -113,7 +111,7 @@ module Rperf
113
111
 
114
112
  $stderr.puts "[rperf] mode=#{mode} frequency=#{frequency}Hz"
115
113
  $stderr.puts "[rperf] sampling: #{count} calls, #{format("%.2f", total_ms)}ms total, #{format("%.1f", avg_us)}us/call avg"
116
- $stderr.puts "[rperf] samples recorded: #{samples}"
114
+ $stderr.puts "[rperf] samples recorded: #{sample_count}"
117
115
 
118
116
  print_top(data)
119
117
  end
@@ -202,7 +200,7 @@ module Rperf
202
200
  print_stat_breakdown(breakdown, total_weight)
203
201
  print_stat_runtime_info
204
202
  print_stat_system_info
205
- print_stat_top(samples_raw, total_weight)
203
+ print_stat_report(data) if ENV["RPERF_STAT_REPORT"] == "1"
206
204
  print_stat_footer(samples_raw, real_ns, data)
207
205
  end
208
206
 
@@ -291,37 +289,20 @@ module Rperf
291
289
  end
292
290
  private_class_method :print_stat_system_info
293
291
 
294
- def self.print_stat_top(samples_raw, total_weight)
295
- flat = Hash.new(0)
296
- samples_raw.each do |frames, weight|
297
- leaf = frames.first
298
- if leaf
299
- _, label = leaf
300
- next if SYNTHETIC_LABELS.include?(label)
301
- flat[[label, leaf[0]]] += weight
302
- end
303
- end
304
-
305
- return if flat.empty?
306
292
 
307
- top = flat.sort_by { |_, w| -w }.first(STAT_TOP_N)
293
+ def self.print_stat_report(data)
308
294
  $stderr.puts
309
- $stderr.puts " Top #{top.size} by flat:"
310
- top.each do |key, weight|
311
- label, path = key
312
- pct = total_weight > 0 ? weight * 100.0 / total_weight : 0.0
313
- loc = path.empty? ? "" : " (#{path})"
314
- $stderr.puts STAT_PCT_LINE.call(format_ms(weight), "ms", pct, "#{label}#{loc}")
315
- end
295
+ $stderr.puts Text.encode(data, header: false)
316
296
  end
317
- private_class_method :print_stat_top
297
+ private_class_method :print_stat_report
318
298
 
319
299
  def self.print_stat_footer(samples_raw, real_ns, data)
320
- unique_stacks = samples_raw.map { |frames, _| frames }.uniq.size
300
+ triggers = data[:trigger_count] || 0
321
301
  overhead_pct = real_ns > 0 ? (data[:sampling_time_ns] || 0) * 100.0 / real_ns : 0.0
322
302
  $stderr.puts
323
- $stderr.puts format(" %d samples (%d unique stacks), %.1f%% profiler overhead",
324
- samples_raw.size, unique_stacks, overhead_pct)
303
+ samples = data[:sampling_count] || samples_raw.size
304
+ $stderr.puts format(" %d samples / %d triggers, %.1f%% profiler overhead",
305
+ samples, triggers, overhead_pct)
325
306
  end
326
307
  private_class_method :print_stat_footer
327
308
 
@@ -393,11 +374,13 @@ module Rperf
393
374
  when "false" then false
394
375
  else ENV["RPERF_SIGNAL"].to_i
395
376
  end
377
+ _rperf_aggregate = ENV["RPERF_AGGREGATE"] != "0"
396
378
  _rperf_start_opts = { frequency: (ENV["RPERF_FREQUENCY"] || 1000).to_i, mode: _rperf_mode,
397
379
  output: _rperf_stat ? ENV["RPERF_OUTPUT"] : (ENV["RPERF_OUTPUT"] || "rperf.data"),
398
380
  verbose: ENV["RPERF_VERBOSE"] == "1",
399
381
  format: _rperf_format,
400
- stat: _rperf_stat }
382
+ stat: _rperf_stat,
383
+ aggregate: _rperf_aggregate }
401
384
  _rperf_start_opts[:signal] = _rperf_signal unless _rperf_signal.nil?
402
385
  start(**_rperf_start_opts)
403
386
  at_exit { stop }
@@ -407,7 +390,7 @@ module Rperf
407
390
  module Text
408
391
  module_function
409
392
 
410
- def encode(data, top_n: 50)
393
+ def encode(data, top_n: 50, header: true)
411
394
  samples_raw = data[:samples]
412
395
  mode = data[:mode] || :cpu
413
396
  frequency = data[:frequency] || 0
@@ -417,10 +400,13 @@ module Rperf
417
400
  result = Rperf.send(:compute_flat_cum, samples_raw)
418
401
 
419
402
  out = String.new
420
- total_ms = result[:total_weight] / 1_000_000.0
421
- out << "Total: #{"%.1f" % total_ms}ms (#{mode})\n"
422
- out << "Samples: #{samples_raw.size}, Frequency: #{frequency}Hz\n"
423
- out << "\n"
403
+ if header
404
+ total_ms = result[:total_weight] / 1_000_000.0
405
+ out << "Total: #{"%.1f" % total_ms}ms (#{mode})\n"
406
+ sample_count = data[:sampling_count] || samples_raw.size
407
+ out << "Samples: #{sample_count}, Frequency: #{frequency}Hz\n"
408
+ out << "\n"
409
+ end
424
410
  out << format_table("Flat", result[:flat], result[:total_weight], top_n)
425
411
  out << "\n"
426
412
  out << format_table("Cumulative", result[:cum], result[:total_weight], top_n)
@@ -430,13 +416,12 @@ module Rperf
430
416
  def format_table(title, table, total_weight, top_n)
431
417
  sorted = table.sort_by { |_, w| -w }.first(top_n)
432
418
  out = String.new
433
- out << "#{title}:\n"
419
+ out << " #{title}:\n"
434
420
  sorted.each do |key, weight|
435
421
  label, path = key
436
- ms = weight / 1_000_000.0
437
422
  pct = total_weight > 0 ? weight * 100.0 / total_weight : 0.0
438
423
  loc = path.empty? ? "" : " (#{path})"
439
- out << (" %8.1fms %5.1f%% %s%s\n" % [ms, pct, label, loc])
424
+ out << format(" %14s ms %5.1f%% %s%s\n", Rperf.send(:format_ms, weight), pct, label, loc)
440
425
  end
441
426
  out
442
427
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rperf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Sasada