rperf 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/docs/help.md +17 -9
- data/exe/rperf +28 -5
- data/ext/rperf/rperf.c +639 -117
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf.rb +25 -40
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ab923fe1fc0a0d6928941271cdffc979012af73d6d0bd0aa5c5d43a95e9451c2
|
|
4
|
+
data.tar.gz: 74a0200ec71ae3743d2b99d578df0b484d23dea57285385209e23b0748a95564
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b2d95c3e58fd883efebfcad8506a5249dee8c7322fb53e75a25afcd5050bbb1885fb620eef18a86e74fa54cb542ba83b1761f13630746862c619477e022b09db
|
|
7
|
+
data.tar.gz: ee4236170102e0be1cd13749389a29679ba7361c4c77db98ed708e9b509e8e1c28ba3b47a2d8a4b704ada2fe5a11859bc61c0476d8f061fbd43703168232f5f6
|
data/docs/help.md
CHANGED
|
@@ -19,22 +19,27 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
|
|
|
19
19
|
-f, --frequency HZ Sampling frequency in Hz (default: 1000)
|
|
20
20
|
-m, --mode MODE cpu or wall (default: cpu)
|
|
21
21
|
--format FORMAT pprof, collapsed, or text (default: auto from extension)
|
|
22
|
+
-p, --print Print text profile to stdout
|
|
23
|
+
(same as --format=text --output=/dev/stdout)
|
|
22
24
|
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
23
25
|
for nanosleep thread (default: auto)
|
|
24
26
|
-v, --verbose Print sampling statistics to stderr
|
|
25
27
|
|
|
26
28
|
### stat: Run command and print performance summary to stderr.
|
|
27
29
|
|
|
28
|
-
|
|
30
|
+
Uses wall mode by default. No file output by default.
|
|
29
31
|
|
|
30
32
|
-o, --output PATH Also save profile to file (default: none)
|
|
31
33
|
-f, --frequency HZ Sampling frequency in Hz (default: 1000)
|
|
34
|
+
-m, --mode MODE cpu or wall (default: wall)
|
|
35
|
+
--report Include flat/cumulative profile tables in output
|
|
32
36
|
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
33
37
|
for nanosleep thread (default: auto)
|
|
34
38
|
-v, --verbose Print additional sampling statistics
|
|
35
39
|
|
|
36
40
|
Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
|
|
37
|
-
GVL wait, GC marking, GC sweeping),
|
|
41
|
+
GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
|
|
42
|
+
Use --report to add flat and cumulative top-50 function tables.
|
|
38
43
|
|
|
39
44
|
### report: Open pprof profile with go tool pprof. Requires Go.
|
|
40
45
|
|
|
@@ -58,7 +63,9 @@ Default (no flag): opens diff in browser.
|
|
|
58
63
|
rperf record -m wall -f 500 -o profile.pb.gz ruby server.rb
|
|
59
64
|
rperf record -o profile.collapsed ruby app.rb
|
|
60
65
|
rperf record -o profile.txt ruby app.rb
|
|
66
|
+
rperf record -p ruby app.rb
|
|
61
67
|
rperf stat ruby app.rb
|
|
68
|
+
rperf stat --report ruby app.rb
|
|
62
69
|
rperf stat -o profile.pb.gz ruby app.rb
|
|
63
70
|
rperf report
|
|
64
71
|
rperf report --top profile.pb.gz
|
|
@@ -168,14 +175,14 @@ Example output:
|
|
|
168
175
|
Total: 1523.4ms (cpu)
|
|
169
176
|
Samples: 4820, Frequency: 500Hz
|
|
170
177
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
178
|
+
Flat:
|
|
179
|
+
820.3 ms 53.8% Array#each (app/models/user.rb)
|
|
180
|
+
312.1 ms 20.5% JSON.parse (lib/json/parser.rb)
|
|
181
|
+
...
|
|
175
182
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
183
|
+
Cumulative:
|
|
184
|
+
1,401.2 ms 92.0% UsersController#index (app/controllers/users_controller.rb)
|
|
185
|
+
...
|
|
179
186
|
|
|
180
187
|
### Format auto-detection
|
|
181
188
|
|
|
@@ -281,6 +288,7 @@ Used internally by the CLI to pass options to the auto-started profiler:
|
|
|
281
288
|
RPERF_VERBOSE=1 Print statistics
|
|
282
289
|
RPERF_SIGNAL=N|false Timer signal number or 'false' for nanosleep (Linux only)
|
|
283
290
|
RPERF_STAT=1 Enable stat mode (used by rperf stat)
|
|
291
|
+
RPERF_STAT_REPORT=1 Include profile tables in stat output
|
|
284
292
|
|
|
285
293
|
## TIPS
|
|
286
294
|
|
data/exe/rperf
CHANGED
|
@@ -134,9 +134,14 @@ mode = (subcommand == "stat") ? "wall" : "cpu"
|
|
|
134
134
|
format = nil
|
|
135
135
|
signal = nil
|
|
136
136
|
verbose = false
|
|
137
|
+
aggregate = true
|
|
138
|
+
stat_report = false
|
|
137
139
|
|
|
138
140
|
parser = OptionParser.new do |opts|
|
|
139
|
-
opts.banner =
|
|
141
|
+
opts.banner = case subcommand
|
|
142
|
+
when "record" then "Usage: rperf record [options] command [args...]"
|
|
143
|
+
when "stat" then "Usage: rperf stat [options] command [args...]"
|
|
144
|
+
end
|
|
140
145
|
|
|
141
146
|
opts.on("-o", "--output PATH", "Output file#{subcommand == 'stat' ? ' (default: none)' : ' (default: rperf.data)'}") do |v|
|
|
142
147
|
output = v
|
|
@@ -146,21 +151,37 @@ parser = OptionParser.new do |opts|
|
|
|
146
151
|
frequency = v
|
|
147
152
|
end
|
|
148
153
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
154
|
+
default_mode = (subcommand == "stat") ? "wall" : "cpu"
|
|
155
|
+
opts.on("-m", "--mode MODE", %w[cpu wall], "Profiling mode: cpu or wall (default: #{default_mode})") do |v|
|
|
156
|
+
mode = v
|
|
157
|
+
end
|
|
153
158
|
|
|
159
|
+
if subcommand == "record"
|
|
154
160
|
opts.on("--format FORMAT", %w[pprof collapsed text],
|
|
155
161
|
"Output format: pprof, collapsed, or text (default: auto from extension)") do |v|
|
|
156
162
|
format = v
|
|
157
163
|
end
|
|
164
|
+
|
|
165
|
+
opts.on("-p", "--print", "Print text profile to stdout (same as --format=text --output=/dev/stdout)") do
|
|
166
|
+
format = "text"
|
|
167
|
+
output = "/dev/stdout"
|
|
168
|
+
end
|
|
158
169
|
end
|
|
159
170
|
|
|
160
171
|
opts.on("--signal VALUE", "Timer signal (Linux only): signal number, or 'false' for nanosleep thread") do |v|
|
|
161
172
|
signal = (v == "false") ? "false" : v
|
|
162
173
|
end
|
|
163
174
|
|
|
175
|
+
opts.on("--no-aggregate", "Disable sample aggregation (keep raw samples)") do
|
|
176
|
+
aggregate = false
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
if subcommand == "stat"
|
|
180
|
+
opts.on("--report", "Include flat/cumulative profile tables in output") do
|
|
181
|
+
stat_report = true
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
164
185
|
opts.on("-v", "--verbose", "Print sampling statistics to stderr") do
|
|
165
186
|
verbose = true
|
|
166
187
|
end
|
|
@@ -198,10 +219,12 @@ ENV["RPERF_MODE"] = mode
|
|
|
198
219
|
ENV["RPERF_FORMAT"] = format if format
|
|
199
220
|
ENV["RPERF_VERBOSE"] = "1" if verbose
|
|
200
221
|
ENV["RPERF_SIGNAL"] = signal if signal
|
|
222
|
+
ENV["RPERF_AGGREGATE"] = "0" unless aggregate
|
|
201
223
|
|
|
202
224
|
if subcommand == "stat"
|
|
203
225
|
ENV["RPERF_STAT"] = "1"
|
|
204
226
|
ENV["RPERF_STAT_COMMAND"] = ARGV.join(" ")
|
|
227
|
+
ENV["RPERF_STAT_REPORT"] = "1" if stat_report
|
|
205
228
|
end
|
|
206
229
|
|
|
207
230
|
exec(*ARGV)
|
data/ext/rperf/rperf.c
CHANGED
|
@@ -7,6 +7,13 @@
|
|
|
7
7
|
#include <stdlib.h>
|
|
8
8
|
#include <unistd.h>
|
|
9
9
|
#include <signal.h>
|
|
10
|
+
#include <assert.h>
|
|
11
|
+
#ifdef __linux__
|
|
12
|
+
#include <sys/syscall.h>
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
/* Checked pthread wrappers — assert on unexpected errors */
|
|
16
|
+
#define CHECKED(call) do { int _r = (call); assert(_r == 0 && #call); (void)_r; } while (0)
|
|
10
17
|
|
|
11
18
|
#ifdef __linux__
|
|
12
19
|
#define RPERF_USE_TIMER_SIGNAL 1
|
|
@@ -16,8 +23,19 @@
|
|
|
16
23
|
#endif
|
|
17
24
|
|
|
18
25
|
#define RPERF_MAX_STACK_DEPTH 512
|
|
19
|
-
#define RPERF_INITIAL_SAMPLES
|
|
26
|
+
#define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
|
|
20
27
|
#define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
|
|
28
|
+
#define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
|
|
29
|
+
#define RPERF_FRAME_TABLE_INITIAL 65536 /* pre-allocate to avoid realloc race with GC dmark */
|
|
30
|
+
#define RPERF_AGG_TABLE_INITIAL 1024
|
|
31
|
+
#define RPERF_STACK_POOL_INITIAL 4096
|
|
32
|
+
|
|
33
|
+
/* Synthetic frame IDs (reserved in frame_table, 0-based) */
|
|
34
|
+
#define RPERF_SYNTHETIC_GVL_BLOCKED 0
|
|
35
|
+
#define RPERF_SYNTHETIC_GVL_WAIT 1
|
|
36
|
+
#define RPERF_SYNTHETIC_GC_MARKING 2
|
|
37
|
+
#define RPERF_SYNTHETIC_GC_SWEEPING 3
|
|
38
|
+
#define RPERF_SYNTHETIC_COUNT 4
|
|
21
39
|
|
|
22
40
|
/* ---- Data structures ---- */
|
|
23
41
|
|
|
@@ -43,6 +61,51 @@ typedef struct rperf_sample {
|
|
|
43
61
|
int thread_seq; /* thread sequence number (1-based) */
|
|
44
62
|
} rperf_sample_t;
|
|
45
63
|
|
|
64
|
+
/* ---- Sample buffer (double-buffered) ---- */
|
|
65
|
+
|
|
66
|
+
typedef struct rperf_sample_buffer {
|
|
67
|
+
rperf_sample_t *samples;
|
|
68
|
+
size_t sample_count;
|
|
69
|
+
size_t sample_capacity;
|
|
70
|
+
VALUE *frame_pool;
|
|
71
|
+
size_t frame_pool_count;
|
|
72
|
+
size_t frame_pool_capacity;
|
|
73
|
+
} rperf_sample_buffer_t;
|
|
74
|
+
|
|
75
|
+
/* ---- Frame table: VALUE → uint32_t frame_id ---- */
|
|
76
|
+
|
|
77
|
+
#define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
|
|
78
|
+
|
|
79
|
+
typedef struct rperf_frame_table {
|
|
80
|
+
VALUE *keys; /* unique VALUE array (GC mark target) */
|
|
81
|
+
size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
|
|
82
|
+
size_t capacity;
|
|
83
|
+
uint32_t *buckets; /* open addressing: stores index into keys[] */
|
|
84
|
+
size_t bucket_capacity;
|
|
85
|
+
} rperf_frame_table_t;
|
|
86
|
+
|
|
87
|
+
/* ---- Aggregation table: stack → weight ---- */
|
|
88
|
+
|
|
89
|
+
#define RPERF_AGG_ENTRY_EMPTY 0
|
|
90
|
+
|
|
91
|
+
typedef struct rperf_agg_entry {
|
|
92
|
+
uint32_t frame_start; /* offset into stack_pool */
|
|
93
|
+
int depth; /* includes synthetic frame */
|
|
94
|
+
int thread_seq;
|
|
95
|
+
int64_t weight; /* accumulated */
|
|
96
|
+
uint32_t hash; /* cached hash value */
|
|
97
|
+
int used; /* 0 = empty, 1 = used */
|
|
98
|
+
} rperf_agg_entry_t;
|
|
99
|
+
|
|
100
|
+
typedef struct rperf_agg_table {
|
|
101
|
+
rperf_agg_entry_t *buckets;
|
|
102
|
+
size_t bucket_capacity;
|
|
103
|
+
size_t count;
|
|
104
|
+
uint32_t *stack_pool; /* frame_id sequences stored contiguously */
|
|
105
|
+
size_t stack_pool_count;
|
|
106
|
+
size_t stack_pool_capacity;
|
|
107
|
+
} rperf_agg_table_t;
|
|
108
|
+
|
|
46
109
|
typedef struct rperf_thread_data {
|
|
47
110
|
int64_t prev_cpu_ns;
|
|
48
111
|
int64_t prev_wall_ns;
|
|
@@ -58,18 +121,23 @@ typedef struct rperf_profiler {
|
|
|
58
121
|
int frequency;
|
|
59
122
|
int mode; /* 0 = cpu, 1 = wall */
|
|
60
123
|
volatile int running;
|
|
61
|
-
pthread_t
|
|
124
|
+
pthread_t worker_thread; /* combined timer + aggregation */
|
|
62
125
|
#if RPERF_USE_TIMER_SIGNAL
|
|
63
126
|
timer_t timer_id;
|
|
64
127
|
int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
|
|
128
|
+
volatile pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
|
|
65
129
|
#endif
|
|
66
130
|
rb_postponed_job_handle_t pj_handle;
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
131
|
+
int aggregate; /* 1 = aggregate samples, 0 = raw */
|
|
132
|
+
/* Double-buffered sample storage (only buffers[0] used when !aggregate) */
|
|
133
|
+
rperf_sample_buffer_t buffers[2];
|
|
134
|
+
int active_idx; /* 0 or 1 */
|
|
135
|
+
/* Aggregation (only used when aggregate=1) */
|
|
136
|
+
rperf_frame_table_t frame_table;
|
|
137
|
+
rperf_agg_table_t agg_table;
|
|
138
|
+
volatile int swap_ready; /* 1 = standby buffer ready for aggregation */
|
|
139
|
+
pthread_mutex_t worker_mutex;
|
|
140
|
+
pthread_cond_t worker_cond;
|
|
73
141
|
rb_internal_thread_specific_key_t ts_key;
|
|
74
142
|
rb_internal_thread_event_hook_t *thread_hook;
|
|
75
143
|
/* GC tracking */
|
|
@@ -98,8 +166,19 @@ static void
|
|
|
98
166
|
rperf_profiler_mark(void *ptr)
|
|
99
167
|
{
|
|
100
168
|
rperf_profiler_t *prof = (rperf_profiler_t *)ptr;
|
|
101
|
-
|
|
102
|
-
|
|
169
|
+
int i;
|
|
170
|
+
/* Mark both sample buffers' frame_pools */
|
|
171
|
+
for (i = 0; i < 2; i++) {
|
|
172
|
+
rperf_sample_buffer_t *buf = &prof->buffers[i];
|
|
173
|
+
if (buf->frame_pool && buf->frame_pool_count > 0) {
|
|
174
|
+
rb_gc_mark_locations(buf->frame_pool,
|
|
175
|
+
buf->frame_pool + buf->frame_pool_count);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
/* Mark frame_table keys (unique frame VALUEs) */
|
|
179
|
+
if (prof->frame_table.keys && prof->frame_table.count > 0) {
|
|
180
|
+
rb_gc_mark_locations(prof->frame_table.keys + RPERF_SYNTHETIC_COUNT,
|
|
181
|
+
prof->frame_table.keys + prof->frame_table.count);
|
|
103
182
|
}
|
|
104
183
|
}
|
|
105
184
|
|
|
@@ -146,18 +225,45 @@ rperf_current_time_ns(rperf_profiler_t *prof, rperf_thread_data_t *td)
|
|
|
146
225
|
|
|
147
226
|
/* ---- Sample buffer ---- */
|
|
148
227
|
|
|
228
|
+
static int
|
|
229
|
+
rperf_sample_buffer_init(rperf_sample_buffer_t *buf)
|
|
230
|
+
{
|
|
231
|
+
buf->sample_count = 0;
|
|
232
|
+
buf->sample_capacity = RPERF_INITIAL_SAMPLES;
|
|
233
|
+
buf->samples = (rperf_sample_t *)calloc(buf->sample_capacity, sizeof(rperf_sample_t));
|
|
234
|
+
if (!buf->samples) return -1;
|
|
235
|
+
|
|
236
|
+
buf->frame_pool_count = 0;
|
|
237
|
+
buf->frame_pool_capacity = RPERF_INITIAL_FRAME_POOL;
|
|
238
|
+
buf->frame_pool = (VALUE *)calloc(buf->frame_pool_capacity, sizeof(VALUE));
|
|
239
|
+
if (!buf->frame_pool) {
|
|
240
|
+
free(buf->samples);
|
|
241
|
+
buf->samples = NULL;
|
|
242
|
+
return -1;
|
|
243
|
+
}
|
|
244
|
+
return 0;
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
static void
|
|
248
|
+
rperf_sample_buffer_free(rperf_sample_buffer_t *buf)
|
|
249
|
+
{
|
|
250
|
+
free(buf->samples);
|
|
251
|
+
free(buf->frame_pool);
|
|
252
|
+
memset(buf, 0, sizeof(*buf));
|
|
253
|
+
}
|
|
254
|
+
|
|
149
255
|
/* Returns 0 on success, -1 on allocation failure */
|
|
150
256
|
static int
|
|
151
|
-
rperf_ensure_sample_capacity(
|
|
257
|
+
rperf_ensure_sample_capacity(rperf_sample_buffer_t *buf)
|
|
152
258
|
{
|
|
153
|
-
if (
|
|
154
|
-
size_t new_cap =
|
|
259
|
+
if (buf->sample_count >= buf->sample_capacity) {
|
|
260
|
+
size_t new_cap = buf->sample_capacity * 2;
|
|
155
261
|
rperf_sample_t *new_samples = (rperf_sample_t *)realloc(
|
|
156
|
-
|
|
262
|
+
buf->samples,
|
|
157
263
|
new_cap * sizeof(rperf_sample_t));
|
|
158
264
|
if (!new_samples) return -1;
|
|
159
|
-
|
|
160
|
-
|
|
265
|
+
buf->samples = new_samples;
|
|
266
|
+
buf->sample_capacity = new_cap;
|
|
161
267
|
}
|
|
162
268
|
return 0;
|
|
163
269
|
}
|
|
@@ -166,36 +272,310 @@ rperf_ensure_sample_capacity(rperf_profiler_t *prof)
|
|
|
166
272
|
|
|
167
273
|
/* Ensure frame_pool has room for `needed` more entries. Returns 0 on success. */
|
|
168
274
|
static int
|
|
169
|
-
rperf_ensure_frame_pool_capacity(
|
|
275
|
+
rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
|
|
170
276
|
{
|
|
171
|
-
while (
|
|
172
|
-
size_t new_cap =
|
|
277
|
+
while (buf->frame_pool_count + (size_t)needed > buf->frame_pool_capacity) {
|
|
278
|
+
size_t new_cap = buf->frame_pool_capacity * 2;
|
|
173
279
|
VALUE *new_pool = (VALUE *)realloc(
|
|
174
|
-
|
|
280
|
+
buf->frame_pool,
|
|
175
281
|
new_cap * sizeof(VALUE));
|
|
176
282
|
if (!new_pool) return -1;
|
|
177
|
-
|
|
178
|
-
|
|
283
|
+
buf->frame_pool = new_pool;
|
|
284
|
+
buf->frame_pool_capacity = new_cap;
|
|
285
|
+
}
|
|
286
|
+
return 0;
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
/* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
|
|
290
|
+
|
|
291
|
+
static void
|
|
292
|
+
rperf_frame_table_init(rperf_frame_table_t *ft)
|
|
293
|
+
{
|
|
294
|
+
ft->capacity = RPERF_FRAME_TABLE_INITIAL;
|
|
295
|
+
ft->keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
|
|
296
|
+
ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
|
|
297
|
+
ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
|
|
298
|
+
ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
|
|
299
|
+
memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
static void
|
|
303
|
+
rperf_frame_table_free(rperf_frame_table_t *ft)
|
|
304
|
+
{
|
|
305
|
+
free(ft->keys);
|
|
306
|
+
free(ft->buckets);
|
|
307
|
+
memset(ft, 0, sizeof(*ft));
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
static void
|
|
311
|
+
rperf_frame_table_rehash(rperf_frame_table_t *ft)
|
|
312
|
+
{
|
|
313
|
+
size_t new_cap = ft->bucket_capacity * 2;
|
|
314
|
+
uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
|
|
315
|
+
memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
|
|
316
|
+
|
|
317
|
+
size_t i;
|
|
318
|
+
for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
|
|
319
|
+
uint32_t h = (uint32_t)(ft->keys[i] >> 3); /* shift out tag bits */
|
|
320
|
+
size_t idx = h % new_cap;
|
|
321
|
+
while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
|
|
322
|
+
idx = (idx + 1) % new_cap;
|
|
323
|
+
new_buckets[idx] = (uint32_t)i;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
free(ft->buckets);
|
|
327
|
+
ft->buckets = new_buckets;
|
|
328
|
+
ft->bucket_capacity = new_cap;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/* Returns frame_id for the given VALUE, inserting if new */
|
|
332
|
+
static uint32_t
|
|
333
|
+
rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
|
|
334
|
+
{
|
|
335
|
+
uint32_t h = (uint32_t)(fval >> 3);
|
|
336
|
+
size_t idx = h % ft->bucket_capacity;
|
|
337
|
+
|
|
338
|
+
while (1) {
|
|
339
|
+
uint32_t slot = ft->buckets[idx];
|
|
340
|
+
if (slot == RPERF_FRAME_TABLE_EMPTY) break;
|
|
341
|
+
if (ft->keys[slot] == fval) return slot;
|
|
342
|
+
idx = (idx + 1) % ft->bucket_capacity;
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
/* Insert new entry.
|
|
346
|
+
* keys array is pre-allocated and never realloc'd to avoid race with GC dmark.
|
|
347
|
+
* If capacity is exhausted, return EMPTY to signal aggregation should stop. */
|
|
348
|
+
if (ft->count >= ft->capacity) {
|
|
349
|
+
return RPERF_FRAME_TABLE_EMPTY;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
uint32_t frame_id = (uint32_t)ft->count;
|
|
353
|
+
ft->keys[frame_id] = fval;
|
|
354
|
+
/* Store fence: ensure keys[frame_id] is visible before count is incremented,
|
|
355
|
+
* so GC dmark never reads uninitialized keys[count-1]. */
|
|
356
|
+
__atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
|
|
357
|
+
ft->buckets[idx] = frame_id;
|
|
358
|
+
|
|
359
|
+
/* Rehash if load factor > 0.7 */
|
|
360
|
+
if (ft->count * 10 > ft->bucket_capacity * 7) {
|
|
361
|
+
rperf_frame_table_rehash(ft);
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
return frame_id;
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
/* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
|
|
368
|
+
|
|
369
|
+
static uint32_t
|
|
370
|
+
rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
|
|
371
|
+
{
|
|
372
|
+
uint32_t h = 2166136261u;
|
|
373
|
+
int i;
|
|
374
|
+
for (i = 0; i < len; i++) {
|
|
375
|
+
h ^= data[i];
|
|
376
|
+
h *= 16777619u;
|
|
377
|
+
}
|
|
378
|
+
h ^= (uint32_t)thread_seq;
|
|
379
|
+
h *= 16777619u;
|
|
380
|
+
return h;
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
static void
|
|
384
|
+
rperf_agg_table_init(rperf_agg_table_t *at)
|
|
385
|
+
{
|
|
386
|
+
at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
|
|
387
|
+
at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
|
|
388
|
+
at->count = 0;
|
|
389
|
+
at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
|
|
390
|
+
at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
|
|
391
|
+
at->stack_pool_count = 0;
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
static void
|
|
395
|
+
rperf_agg_table_free(rperf_agg_table_t *at)
|
|
396
|
+
{
|
|
397
|
+
free(at->buckets);
|
|
398
|
+
free(at->stack_pool);
|
|
399
|
+
memset(at, 0, sizeof(*at));
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
static void
|
|
403
|
+
rperf_agg_table_rehash(rperf_agg_table_t *at)
|
|
404
|
+
{
|
|
405
|
+
size_t new_cap = at->bucket_capacity * 2;
|
|
406
|
+
rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
|
|
407
|
+
|
|
408
|
+
size_t i;
|
|
409
|
+
for (i = 0; i < at->bucket_capacity; i++) {
|
|
410
|
+
if (!at->buckets[i].used) continue;
|
|
411
|
+
rperf_agg_entry_t *e = &at->buckets[i];
|
|
412
|
+
size_t idx = e->hash % new_cap;
|
|
413
|
+
while (new_buckets[idx].used)
|
|
414
|
+
idx = (idx + 1) % new_cap;
|
|
415
|
+
new_buckets[idx] = *e;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
free(at->buckets);
|
|
419
|
+
at->buckets = new_buckets;
|
|
420
|
+
at->bucket_capacity = new_cap;
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
/* Ensure stack_pool has room for `needed` more entries */
|
|
424
|
+
static int
|
|
425
|
+
rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
|
|
426
|
+
{
|
|
427
|
+
while (at->stack_pool_count + (size_t)needed > at->stack_pool_capacity) {
|
|
428
|
+
size_t new_cap = at->stack_pool_capacity * 2;
|
|
429
|
+
uint32_t *new_pool = (uint32_t *)realloc(at->stack_pool,
|
|
430
|
+
new_cap * sizeof(uint32_t));
|
|
431
|
+
if (!new_pool) return -1;
|
|
432
|
+
at->stack_pool = new_pool;
|
|
433
|
+
at->stack_pool_capacity = new_cap;
|
|
179
434
|
}
|
|
180
435
|
return 0;
|
|
181
436
|
}
|
|
182
437
|
|
|
438
|
+
/* Insert or merge a stack into the aggregation table */
|
|
439
|
+
static void
|
|
440
|
+
rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
441
|
+
int depth, int thread_seq, int64_t weight, uint32_t hash)
|
|
442
|
+
{
|
|
443
|
+
size_t idx = hash % at->bucket_capacity;
|
|
444
|
+
|
|
445
|
+
while (1) {
|
|
446
|
+
rperf_agg_entry_t *e = &at->buckets[idx];
|
|
447
|
+
if (!e->used) break;
|
|
448
|
+
if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
|
|
449
|
+
memcmp(at->stack_pool + e->frame_start, frame_ids,
|
|
450
|
+
depth * sizeof(uint32_t)) == 0) {
|
|
451
|
+
/* Match — merge weight */
|
|
452
|
+
e->weight += weight;
|
|
453
|
+
return;
|
|
454
|
+
}
|
|
455
|
+
idx = (idx + 1) % at->bucket_capacity;
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
/* New entry — append frame_ids to stack_pool */
|
|
459
|
+
if (rperf_agg_ensure_stack_pool(at, depth) < 0) return;
|
|
460
|
+
|
|
461
|
+
rperf_agg_entry_t *e = &at->buckets[idx];
|
|
462
|
+
e->frame_start = (uint32_t)at->stack_pool_count;
|
|
463
|
+
e->depth = depth;
|
|
464
|
+
e->thread_seq = thread_seq;
|
|
465
|
+
e->weight = weight;
|
|
466
|
+
e->hash = hash;
|
|
467
|
+
e->used = 1;
|
|
468
|
+
|
|
469
|
+
memcpy(at->stack_pool + at->stack_pool_count, frame_ids,
|
|
470
|
+
depth * sizeof(uint32_t));
|
|
471
|
+
at->stack_pool_count += depth;
|
|
472
|
+
at->count++;
|
|
473
|
+
|
|
474
|
+
/* Rehash if load factor > 0.7 */
|
|
475
|
+
if (at->count * 10 > at->bucket_capacity * 7) {
|
|
476
|
+
rperf_agg_table_rehash(at);
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
/* ---- Aggregation: process a sample buffer into frame_table + agg_table ---- */
|
|
481
|
+
|
|
482
|
+
static void
|
|
483
|
+
rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
|
|
484
|
+
{
|
|
485
|
+
size_t i;
|
|
486
|
+
uint32_t temp_ids[RPERF_MAX_STACK_DEPTH + 1];
|
|
487
|
+
|
|
488
|
+
for (i = 0; i < buf->sample_count; i++) {
|
|
489
|
+
rperf_sample_t *s = &buf->samples[i];
|
|
490
|
+
int off = 0;
|
|
491
|
+
uint32_t hash;
|
|
492
|
+
int j;
|
|
493
|
+
|
|
494
|
+
/* Prepend synthetic frame if needed */
|
|
495
|
+
if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
|
|
496
|
+
temp_ids[off++] = RPERF_SYNTHETIC_GVL_BLOCKED;
|
|
497
|
+
} else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
|
|
498
|
+
temp_ids[off++] = RPERF_SYNTHETIC_GVL_WAIT;
|
|
499
|
+
} else if (s->type == RPERF_SAMPLE_GC_MARKING) {
|
|
500
|
+
temp_ids[off++] = RPERF_SYNTHETIC_GC_MARKING;
|
|
501
|
+
} else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
|
|
502
|
+
temp_ids[off++] = RPERF_SYNTHETIC_GC_SWEEPING;
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
/* Convert VALUE frames to frame_ids */
|
|
506
|
+
int overflow = 0;
|
|
507
|
+
for (j = 0; j < s->depth; j++) {
|
|
508
|
+
VALUE fval = buf->frame_pool[s->frame_start + j];
|
|
509
|
+
uint32_t fid = rperf_frame_table_insert(&prof->frame_table, fval);
|
|
510
|
+
if (fid == RPERF_FRAME_TABLE_EMPTY) { overflow = 1; break; }
|
|
511
|
+
temp_ids[off + j] = fid;
|
|
512
|
+
}
|
|
513
|
+
if (overflow) break; /* frame_table full, stop aggregating this buffer */
|
|
514
|
+
|
|
515
|
+
int total_depth = off + s->depth;
|
|
516
|
+
hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
|
|
517
|
+
|
|
518
|
+
rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
|
|
519
|
+
s->thread_seq, s->weight, hash);
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
/* Reset buffer for reuse.
|
|
523
|
+
* Release fence: ensure all frame_table inserts are visible (to GC dmark)
|
|
524
|
+
* before frame_pool_count is cleared, so dmark always has at least one
|
|
525
|
+
* source (frame_table or frame_pool) covering each VALUE. */
|
|
526
|
+
__atomic_thread_fence(__ATOMIC_RELEASE);
|
|
527
|
+
buf->sample_count = 0;
|
|
528
|
+
buf->frame_pool_count = 0;
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
/* ---- Aggregation thread ---- */
|
|
532
|
+
|
|
533
|
+
/* Try to aggregate the standby buffer if swap_ready is set.
|
|
534
|
+
* Called from worker thread (with or without worker_mutex held). */
|
|
535
|
+
static void
|
|
536
|
+
rperf_try_aggregate(rperf_profiler_t *prof)
|
|
537
|
+
{
|
|
538
|
+
if (!prof->aggregate || !prof->swap_ready) return;
|
|
539
|
+
int standby_idx = prof->active_idx ^ 1;
|
|
540
|
+
rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
|
|
541
|
+
prof->swap_ready = 0;
|
|
542
|
+
}
|
|
543
|
+
|
|
183
544
|
/* ---- Record a sample ---- */
|
|
184
545
|
|
|
546
|
+
static void
|
|
547
|
+
rperf_try_swap(rperf_profiler_t *prof)
|
|
548
|
+
{
|
|
549
|
+
if (!prof->aggregate) return;
|
|
550
|
+
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
551
|
+
if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
|
|
552
|
+
if (prof->swap_ready) return; /* standby still being aggregated */
|
|
553
|
+
|
|
554
|
+
/* Swap active buffer */
|
|
555
|
+
prof->active_idx ^= 1;
|
|
556
|
+
prof->swap_ready = 1;
|
|
557
|
+
|
|
558
|
+
/* Wake worker thread */
|
|
559
|
+
CHECKED(pthread_cond_signal(&prof->worker_cond));
|
|
560
|
+
}
|
|
561
|
+
|
|
185
562
|
static void
|
|
186
563
|
rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
|
|
187
564
|
int64_t weight, int type, int thread_seq)
|
|
188
565
|
{
|
|
189
566
|
if (weight <= 0) return;
|
|
190
|
-
|
|
567
|
+
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
568
|
+
if (rperf_ensure_sample_capacity(buf) < 0) return;
|
|
191
569
|
|
|
192
|
-
rperf_sample_t *sample = &
|
|
570
|
+
rperf_sample_t *sample = &buf->samples[buf->sample_count];
|
|
193
571
|
sample->depth = depth;
|
|
194
572
|
sample->frame_start = frame_start;
|
|
195
573
|
sample->weight = weight;
|
|
196
574
|
sample->type = type;
|
|
197
575
|
sample->thread_seq = thread_seq;
|
|
198
|
-
|
|
576
|
+
buf->sample_count++;
|
|
577
|
+
|
|
578
|
+
rperf_try_swap(prof);
|
|
199
579
|
}
|
|
200
580
|
|
|
201
581
|
/* ---- Thread data initialization ---- */
|
|
@@ -233,13 +613,14 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
|
|
|
233
613
|
int64_t time_now = rperf_current_time_ns(prof, td);
|
|
234
614
|
if (time_now < 0) return;
|
|
235
615
|
|
|
236
|
-
/* Capture backtrace into frame_pool */
|
|
237
|
-
|
|
238
|
-
|
|
616
|
+
/* Capture backtrace into active buffer's frame_pool */
|
|
617
|
+
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
618
|
+
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
|
|
619
|
+
size_t frame_start = buf->frame_pool_count;
|
|
239
620
|
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
240
|
-
&
|
|
621
|
+
&buf->frame_pool[frame_start], NULL);
|
|
241
622
|
if (depth <= 0) return;
|
|
242
|
-
|
|
623
|
+
buf->frame_pool_count += depth;
|
|
243
624
|
|
|
244
625
|
/* Record normal sample (skip if first time — no prev_time) */
|
|
245
626
|
if (!is_first) {
|
|
@@ -353,15 +734,16 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
|
|
|
353
734
|
/* Capture backtrace and timestamp at GC entry */
|
|
354
735
|
prof->gc_enter_ns = rperf_wall_time_ns();
|
|
355
736
|
|
|
356
|
-
|
|
357
|
-
|
|
737
|
+
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
738
|
+
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
|
|
739
|
+
size_t frame_start = buf->frame_pool_count;
|
|
358
740
|
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
359
|
-
&
|
|
741
|
+
&buf->frame_pool[frame_start], NULL);
|
|
360
742
|
if (depth <= 0) {
|
|
361
743
|
prof->gc_frame_depth = 0;
|
|
362
744
|
return;
|
|
363
745
|
}
|
|
364
|
-
|
|
746
|
+
buf->frame_pool_count += depth;
|
|
365
747
|
prof->gc_frame_start = frame_start;
|
|
366
748
|
prof->gc_frame_depth = depth;
|
|
367
749
|
|
|
@@ -420,13 +802,14 @@ rperf_sample_job(void *arg)
|
|
|
420
802
|
if (weight <= 0) return;
|
|
421
803
|
|
|
422
804
|
/* Capture backtrace and record sample */
|
|
423
|
-
|
|
805
|
+
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
806
|
+
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
|
|
424
807
|
|
|
425
|
-
size_t frame_start =
|
|
808
|
+
size_t frame_start = buf->frame_pool_count;
|
|
426
809
|
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
427
|
-
&
|
|
810
|
+
&buf->frame_pool[frame_start], NULL);
|
|
428
811
|
if (depth <= 0) return;
|
|
429
|
-
|
|
812
|
+
buf->frame_pool_count += depth;
|
|
430
813
|
|
|
431
814
|
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
|
|
432
815
|
|
|
@@ -437,7 +820,7 @@ rperf_sample_job(void *arg)
|
|
|
437
820
|
(ts_end.tv_nsec - ts_start.tv_nsec);
|
|
438
821
|
}
|
|
439
822
|
|
|
440
|
-
/* ----
|
|
823
|
+
/* ---- Worker thread: timer + aggregation ---- */
|
|
441
824
|
|
|
442
825
|
#if RPERF_USE_TIMER_SIGNAL
|
|
443
826
|
static void
|
|
@@ -446,21 +829,65 @@ rperf_signal_handler(int sig)
|
|
|
446
829
|
g_profiler.trigger_count++;
|
|
447
830
|
rb_postponed_job_trigger(g_profiler.pj_handle);
|
|
448
831
|
}
|
|
832
|
+
|
|
833
|
+
/* Worker thread for signal mode: aggregation only.
|
|
834
|
+
* Timer signals are directed to this thread via SIGEV_THREAD_ID,
|
|
835
|
+
* and handled by the sigaction handler (rperf_signal_handler).
|
|
836
|
+
* This ensures the timer signal does not interrupt other threads. */
|
|
837
|
+
static void *
|
|
838
|
+
rperf_worker_signal_func(void *arg)
|
|
839
|
+
{
|
|
840
|
+
rperf_profiler_t *prof = (rperf_profiler_t *)arg;
|
|
841
|
+
|
|
842
|
+
/* Publish our kernel TID so start() can use it for SIGEV_THREAD_ID */
|
|
843
|
+
CHECKED(pthread_mutex_lock(&prof->worker_mutex));
|
|
844
|
+
prof->worker_tid = (pid_t)syscall(SYS_gettid);
|
|
845
|
+
CHECKED(pthread_cond_signal(&prof->worker_cond));
|
|
846
|
+
|
|
847
|
+
while (prof->running) {
|
|
848
|
+
CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
|
|
849
|
+
rperf_try_aggregate(prof);
|
|
850
|
+
}
|
|
851
|
+
CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
|
|
852
|
+
return NULL;
|
|
853
|
+
}
|
|
449
854
|
#endif
|
|
450
855
|
|
|
856
|
+
/* Worker thread for nanosleep mode: timer + aggregation.
|
|
857
|
+
* Uses pthread_cond_timedwait with absolute deadline.
|
|
858
|
+
* Timeout → trigger + advance deadline.
|
|
859
|
+
* Signal (swap_ready) → aggregate only, keep same deadline. */
|
|
451
860
|
static void *
|
|
452
|
-
|
|
861
|
+
rperf_worker_nanosleep_func(void *arg)
|
|
453
862
|
{
|
|
454
863
|
rperf_profiler_t *prof = (rperf_profiler_t *)arg;
|
|
455
|
-
struct timespec
|
|
456
|
-
|
|
457
|
-
|
|
864
|
+
struct timespec deadline;
|
|
865
|
+
long interval_ns = 1000000000L / prof->frequency;
|
|
866
|
+
|
|
867
|
+
clock_gettime(CLOCK_REALTIME, &deadline);
|
|
868
|
+
deadline.tv_nsec += interval_ns;
|
|
869
|
+
if (deadline.tv_nsec >= 1000000000L) {
|
|
870
|
+
deadline.tv_sec++;
|
|
871
|
+
deadline.tv_nsec -= 1000000000L;
|
|
872
|
+
}
|
|
458
873
|
|
|
874
|
+
CHECKED(pthread_mutex_lock(&prof->worker_mutex));
|
|
459
875
|
while (prof->running) {
|
|
460
|
-
prof->
|
|
461
|
-
|
|
462
|
-
|
|
876
|
+
int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
|
|
877
|
+
assert(ret == 0 || ret == ETIMEDOUT);
|
|
878
|
+
if (ret == ETIMEDOUT) {
|
|
879
|
+
prof->trigger_count++;
|
|
880
|
+
rb_postponed_job_trigger(prof->pj_handle);
|
|
881
|
+
/* Advance deadline by interval */
|
|
882
|
+
deadline.tv_nsec += interval_ns;
|
|
883
|
+
if (deadline.tv_nsec >= 1000000000L) {
|
|
884
|
+
deadline.tv_sec++;
|
|
885
|
+
deadline.tv_nsec -= 1000000000L;
|
|
886
|
+
}
|
|
887
|
+
}
|
|
888
|
+
rperf_try_aggregate(prof);
|
|
463
889
|
}
|
|
890
|
+
CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
|
|
464
891
|
return NULL;
|
|
465
892
|
}
|
|
466
893
|
|
|
@@ -488,12 +915,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
488
915
|
VALUE opts;
|
|
489
916
|
int frequency = 1000;
|
|
490
917
|
int mode = 0; /* 0 = cpu, 1 = wall */
|
|
918
|
+
int aggregate = 1; /* default: aggregate */
|
|
491
919
|
#if RPERF_USE_TIMER_SIGNAL
|
|
492
920
|
int timer_signal = RPERF_TIMER_SIGNAL_DEFAULT;
|
|
493
921
|
#endif
|
|
494
922
|
|
|
495
923
|
rb_scan_args(argc, argv, ":", &opts);
|
|
496
924
|
if (!NIL_P(opts)) {
|
|
925
|
+
VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
|
|
926
|
+
if (!NIL_P(vagg)) {
|
|
927
|
+
aggregate = RTEST(vagg) ? 1 : 0;
|
|
928
|
+
}
|
|
497
929
|
VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
|
|
498
930
|
if (!NIL_P(vfreq)) {
|
|
499
931
|
frequency = NUM2INT(vfreq);
|
|
@@ -535,25 +967,35 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
535
967
|
|
|
536
968
|
g_profiler.frequency = frequency;
|
|
537
969
|
g_profiler.mode = mode;
|
|
538
|
-
g_profiler.
|
|
970
|
+
g_profiler.aggregate = aggregate;
|
|
539
971
|
g_profiler.next_thread_seq = 0;
|
|
540
972
|
g_profiler.sampling_count = 0;
|
|
541
973
|
g_profiler.sampling_total_ns = 0;
|
|
542
|
-
g_profiler.
|
|
543
|
-
g_profiler.
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
974
|
+
g_profiler.trigger_count = 0;
|
|
975
|
+
g_profiler.active_idx = 0;
|
|
976
|
+
g_profiler.swap_ready = 0;
|
|
977
|
+
|
|
978
|
+
/* Initialize worker mutex/cond */
|
|
979
|
+
CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
|
|
980
|
+
CHECKED(pthread_cond_init(&g_profiler.worker_cond, NULL));
|
|
981
|
+
|
|
982
|
+
/* Initialize sample buffer(s) */
|
|
983
|
+
if (rperf_sample_buffer_init(&g_profiler.buffers[0]) < 0) {
|
|
984
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
985
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
986
|
+
rb_raise(rb_eNoMemError, "rperf: failed to allocate sample buffer 0");
|
|
547
987
|
}
|
|
988
|
+
if (aggregate) {
|
|
989
|
+
if (rperf_sample_buffer_init(&g_profiler.buffers[1]) < 0) {
|
|
990
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
991
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
992
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
993
|
+
rb_raise(rb_eNoMemError, "rperf: failed to allocate sample buffer 1");
|
|
994
|
+
}
|
|
548
995
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
g_profiler.frame_pool_capacity, sizeof(VALUE));
|
|
553
|
-
if (!g_profiler.frame_pool) {
|
|
554
|
-
free(g_profiler.samples);
|
|
555
|
-
g_profiler.samples = NULL;
|
|
556
|
-
rb_raise(rb_eNoMemError, "rperf: failed to allocate frame pool");
|
|
996
|
+
/* Initialize aggregation structures */
|
|
997
|
+
rperf_frame_table_init(&g_profiler.frame_table);
|
|
998
|
+
rperf_agg_table_init(&g_profiler.agg_table);
|
|
557
999
|
}
|
|
558
1000
|
|
|
559
1001
|
/* Register GC event hook */
|
|
@@ -581,12 +1023,16 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
581
1023
|
VALUE cur_thread = rb_thread_current();
|
|
582
1024
|
rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
|
|
583
1025
|
if (!td) {
|
|
584
|
-
free(g_profiler.samples);
|
|
585
|
-
g_profiler.samples = NULL;
|
|
586
|
-
free(g_profiler.frame_pool);
|
|
587
|
-
g_profiler.frame_pool = NULL;
|
|
588
1026
|
rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
|
|
589
1027
|
g_profiler.thread_hook = NULL;
|
|
1028
|
+
if (g_profiler.aggregate) {
|
|
1029
|
+
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1030
|
+
rperf_frame_table_free(&g_profiler.frame_table);
|
|
1031
|
+
rperf_agg_table_free(&g_profiler.agg_table);
|
|
1032
|
+
}
|
|
1033
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
1034
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1035
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
590
1036
|
rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
|
|
591
1037
|
}
|
|
592
1038
|
}
|
|
@@ -609,12 +1055,32 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
609
1055
|
sa.sa_flags = SA_RESTART;
|
|
610
1056
|
sigaction(g_profiler.timer_signal, &sa, NULL);
|
|
611
1057
|
|
|
1058
|
+
/* Start worker thread first to get its kernel TID */
|
|
1059
|
+
g_profiler.worker_tid = 0;
|
|
1060
|
+
if (pthread_create(&g_profiler.worker_thread, NULL,
|
|
1061
|
+
rperf_worker_signal_func, &g_profiler) != 0) {
|
|
1062
|
+
g_profiler.running = 0;
|
|
1063
|
+
signal(g_profiler.timer_signal, SIG_DFL);
|
|
1064
|
+
goto timer_fail;
|
|
1065
|
+
}
|
|
1066
|
+
|
|
1067
|
+
/* Wait for worker thread to publish its TID */
|
|
1068
|
+
CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
|
|
1069
|
+
while (g_profiler.worker_tid == 0) {
|
|
1070
|
+
CHECKED(pthread_cond_wait(&g_profiler.worker_cond, &g_profiler.worker_mutex));
|
|
1071
|
+
}
|
|
1072
|
+
CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
|
|
1073
|
+
|
|
1074
|
+
/* Create timer targeting the worker thread via SIGEV_THREAD_ID */
|
|
612
1075
|
memset(&sev, 0, sizeof(sev));
|
|
613
|
-
sev.sigev_notify =
|
|
1076
|
+
sev.sigev_notify = SIGEV_THREAD_ID;
|
|
614
1077
|
sev.sigev_signo = g_profiler.timer_signal;
|
|
1078
|
+
sev._sigev_un._tid = g_profiler.worker_tid;
|
|
615
1079
|
if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
|
|
616
1080
|
g_profiler.running = 0;
|
|
617
1081
|
signal(g_profiler.timer_signal, SIG_DFL);
|
|
1082
|
+
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1083
|
+
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
618
1084
|
goto timer_fail;
|
|
619
1085
|
}
|
|
620
1086
|
|
|
@@ -625,7 +1091,9 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
625
1091
|
} else
|
|
626
1092
|
#endif
|
|
627
1093
|
{
|
|
628
|
-
|
|
1094
|
+
/* Start worker thread (timer via timedwait + aggregation) */
|
|
1095
|
+
if (pthread_create(&g_profiler.worker_thread, NULL,
|
|
1096
|
+
rperf_worker_nanosleep_func, &g_profiler) != 0) {
|
|
629
1097
|
g_profiler.running = 0;
|
|
630
1098
|
goto timer_fail;
|
|
631
1099
|
}
|
|
@@ -643,10 +1111,14 @@ timer_fail:
|
|
|
643
1111
|
}
|
|
644
1112
|
rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
|
|
645
1113
|
g_profiler.thread_hook = NULL;
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
1114
|
+
if (g_profiler.aggregate) {
|
|
1115
|
+
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1116
|
+
rperf_frame_table_free(&g_profiler.frame_table);
|
|
1117
|
+
rperf_agg_table_free(&g_profiler.agg_table);
|
|
1118
|
+
}
|
|
1119
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
1120
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1121
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
650
1122
|
rb_raise(rb_eRuntimeError, "rperf: failed to create timer");
|
|
651
1123
|
}
|
|
652
1124
|
|
|
@@ -668,12 +1140,15 @@ rb_rperf_stop(VALUE self)
|
|
|
668
1140
|
#if RPERF_USE_TIMER_SIGNAL
|
|
669
1141
|
if (g_profiler.timer_signal > 0) {
|
|
670
1142
|
timer_delete(g_profiler.timer_id);
|
|
671
|
-
signal(g_profiler.timer_signal,
|
|
672
|
-
} else
|
|
673
|
-
#endif
|
|
674
|
-
{
|
|
675
|
-
pthread_join(g_profiler.timer_thread, NULL);
|
|
1143
|
+
signal(g_profiler.timer_signal, SIG_IGN);
|
|
676
1144
|
}
|
|
1145
|
+
#endif
|
|
1146
|
+
|
|
1147
|
+
/* Wake and join worker thread */
|
|
1148
|
+
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1149
|
+
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1150
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1151
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
677
1152
|
|
|
678
1153
|
if (g_profiler.thread_hook) {
|
|
679
1154
|
rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
|
|
@@ -683,6 +1158,16 @@ rb_rperf_stop(VALUE self)
|
|
|
683
1158
|
/* Remove GC event hook */
|
|
684
1159
|
rb_remove_event_hook(rperf_gc_event_hook);
|
|
685
1160
|
|
|
1161
|
+
if (g_profiler.aggregate) {
|
|
1162
|
+
/* Aggregate remaining samples from both buffers */
|
|
1163
|
+
if (g_profiler.swap_ready) {
|
|
1164
|
+
int standby_idx = g_profiler.active_idx ^ 1;
|
|
1165
|
+
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
|
|
1166
|
+
g_profiler.swap_ready = 0;
|
|
1167
|
+
}
|
|
1168
|
+
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[g_profiler.active_idx]);
|
|
1169
|
+
}
|
|
1170
|
+
|
|
686
1171
|
/* Clean up thread-specific data for all live threads */
|
|
687
1172
|
{
|
|
688
1173
|
VALUE threads = rb_funcall(rb_cThread, rb_intern("list"), 0);
|
|
@@ -713,6 +1198,14 @@ rb_rperf_stop(VALUE self)
|
|
|
713
1198
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
|
|
714
1199
|
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
|
|
715
1200
|
|
|
1201
|
+
/* aggregation stats */
|
|
1202
|
+
if (g_profiler.aggregate) {
|
|
1203
|
+
rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
|
|
1204
|
+
SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
|
|
1205
|
+
rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
|
|
1206
|
+
SIZET2NUM(g_profiler.agg_table.count));
|
|
1207
|
+
}
|
|
1208
|
+
|
|
716
1209
|
/* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
|
|
717
1210
|
{
|
|
718
1211
|
struct timespec stop_monotonic;
|
|
@@ -726,45 +1219,76 @@ rb_rperf_stop(VALUE self)
|
|
|
726
1219
|
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
|
|
727
1220
|
}
|
|
728
1221
|
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
|
|
743
|
-
rb_ary_push(frames, syn);
|
|
744
|
-
} else if (s->type == RPERF_SAMPLE_GC_MARKING) {
|
|
745
|
-
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
|
|
746
|
-
rb_ary_push(frames, syn);
|
|
747
|
-
} else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
|
|
748
|
-
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
|
|
749
|
-
rb_ary_push(frames, syn);
|
|
1222
|
+
if (g_profiler.aggregate) {
|
|
1223
|
+
/* Build samples from aggregation table.
|
|
1224
|
+
* Use a Ruby array for resolved frames so GC protects them. */
|
|
1225
|
+
rperf_frame_table_t *ft = &g_profiler.frame_table;
|
|
1226
|
+
VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
|
|
1227
|
+
/* Synthetic frames */
|
|
1228
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
|
|
1229
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
|
|
1230
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
|
|
1231
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
|
|
1232
|
+
/* Real frames */
|
|
1233
|
+
for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
|
|
1234
|
+
rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
|
|
750
1235
|
}
|
|
751
1236
|
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
1237
|
+
rperf_agg_table_t *at = &g_profiler.agg_table;
|
|
1238
|
+
samples_ary = rb_ary_new();
|
|
1239
|
+
for (i = 0; i < at->bucket_capacity; i++) {
|
|
1240
|
+
rperf_agg_entry_t *e = &at->buckets[i];
|
|
1241
|
+
if (!e->used) continue;
|
|
1242
|
+
|
|
1243
|
+
VALUE frames = rb_ary_new_capa(e->depth);
|
|
1244
|
+
for (j = 0; j < e->depth; j++) {
|
|
1245
|
+
uint32_t fid = at->stack_pool[e->frame_start + j];
|
|
1246
|
+
rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
|
|
1247
|
+
}
|
|
1248
|
+
|
|
1249
|
+
VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
|
|
1250
|
+
rb_ary_push(samples_ary, sample);
|
|
755
1251
|
}
|
|
756
1252
|
|
|
757
|
-
|
|
758
|
-
|
|
1253
|
+
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1254
|
+
rperf_frame_table_free(&g_profiler.frame_table);
|
|
1255
|
+
rperf_agg_table_free(&g_profiler.agg_table);
|
|
1256
|
+
} else {
|
|
1257
|
+
/* Raw samples path (aggregate: false) */
|
|
1258
|
+
rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
|
|
1259
|
+
samples_ary = rb_ary_new_capa((long)buf->sample_count);
|
|
1260
|
+
for (i = 0; i < buf->sample_count; i++) {
|
|
1261
|
+
rperf_sample_t *s = &buf->samples[i];
|
|
1262
|
+
VALUE frames = rb_ary_new_capa(s->depth + 1);
|
|
1263
|
+
|
|
1264
|
+
/* Prepend synthetic frame at leaf position (index 0) */
|
|
1265
|
+
if (s->type == RPERF_SAMPLE_GVL_BLOCKED) {
|
|
1266
|
+
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]"));
|
|
1267
|
+
rb_ary_push(frames, syn);
|
|
1268
|
+
} else if (s->type == RPERF_SAMPLE_GVL_WAIT) {
|
|
1269
|
+
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]"));
|
|
1270
|
+
rb_ary_push(frames, syn);
|
|
1271
|
+
} else if (s->type == RPERF_SAMPLE_GC_MARKING) {
|
|
1272
|
+
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]"));
|
|
1273
|
+
rb_ary_push(frames, syn);
|
|
1274
|
+
} else if (s->type == RPERF_SAMPLE_GC_SWEEPING) {
|
|
1275
|
+
VALUE syn = rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]"));
|
|
1276
|
+
rb_ary_push(frames, syn);
|
|
1277
|
+
}
|
|
1278
|
+
|
|
1279
|
+
for (j = 0; j < s->depth; j++) {
|
|
1280
|
+
VALUE fval = buf->frame_pool[s->frame_start + j];
|
|
1281
|
+
rb_ary_push(frames, rperf_resolve_frame(fval));
|
|
1282
|
+
}
|
|
1283
|
+
|
|
1284
|
+
VALUE sample = rb_ary_new3(3, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq));
|
|
1285
|
+
rb_ary_push(samples_ary, sample);
|
|
1286
|
+
}
|
|
759
1287
|
}
|
|
760
1288
|
rb_hash_aset(result, ID2SYM(rb_intern("samples")), samples_ary);
|
|
761
1289
|
|
|
762
1290
|
/* Cleanup */
|
|
763
|
-
|
|
764
|
-
g_profiler.samples = NULL;
|
|
765
|
-
free(g_profiler.frame_pool);
|
|
766
|
-
g_profiler.frame_pool = NULL;
|
|
767
|
-
g_profiler.frame_pool_count = 0;
|
|
1291
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
768
1292
|
|
|
769
1293
|
return result;
|
|
770
1294
|
}
|
|
@@ -793,16 +1317,13 @@ rperf_after_fork_child(void)
|
|
|
793
1317
|
}
|
|
794
1318
|
rb_remove_event_hook(rperf_gc_event_hook);
|
|
795
1319
|
|
|
796
|
-
/* Free sample
|
|
797
|
-
|
|
798
|
-
g_profiler.
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
g_profiler.frame_pool = NULL;
|
|
804
|
-
g_profiler.frame_pool_count = 0;
|
|
805
|
-
g_profiler.frame_pool_capacity = 0;
|
|
1320
|
+
/* Free sample buffers, frame table, and agg table — these hold parent's data */
|
|
1321
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
1322
|
+
if (g_profiler.aggregate) {
|
|
1323
|
+
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1324
|
+
rperf_frame_table_free(&g_profiler.frame_table);
|
|
1325
|
+
rperf_agg_table_free(&g_profiler.agg_table);
|
|
1326
|
+
}
|
|
806
1327
|
|
|
807
1328
|
/* Reset GC state */
|
|
808
1329
|
g_profiler.gc_phase = 0;
|
|
@@ -810,6 +1331,7 @@ rperf_after_fork_child(void)
|
|
|
810
1331
|
/* Reset stats */
|
|
811
1332
|
g_profiler.sampling_count = 0;
|
|
812
1333
|
g_profiler.sampling_total_ns = 0;
|
|
1334
|
+
g_profiler.swap_ready = 0;
|
|
813
1335
|
}
|
|
814
1336
|
|
|
815
1337
|
/* ---- Init ---- */
|
|
@@ -830,5 +1352,5 @@ Init_rperf(void)
|
|
|
830
1352
|
rb_gc_register_address(&g_profiler_wrapper);
|
|
831
1353
|
|
|
832
1354
|
/* Fork safety: silently stop profiling in child process */
|
|
833
|
-
pthread_atfork(NULL, NULL, rperf_after_fork_child);
|
|
1355
|
+
CHECKED(pthread_atfork(NULL, NULL, rperf_after_fork_child));
|
|
834
1356
|
}
|
data/lib/rperf/version.rb
CHANGED
data/lib/rperf.rb
CHANGED
|
@@ -17,21 +17,19 @@ module Rperf
|
|
|
17
17
|
@output = nil
|
|
18
18
|
@stat = false
|
|
19
19
|
@stat_start_mono = nil
|
|
20
|
-
STAT_TOP_N = 5
|
|
21
|
-
SYNTHETIC_LABELS = %w[[GVL\ blocked] [GVL\ wait] [GC\ marking] [GC\ sweeping]].freeze
|
|
22
20
|
|
|
23
21
|
# Starts profiling.
|
|
24
22
|
# format: :pprof, :collapsed, or :text. nil = auto-detect from output extension
|
|
25
23
|
# .collapsed → collapsed stacks (FlameGraph / speedscope compatible)
|
|
26
24
|
# .txt → text report (human/AI readable flat + cumulative table)
|
|
27
25
|
# otherwise (.pb.gz etc) → pprof protobuf (gzip compressed)
|
|
28
|
-
def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil)
|
|
26
|
+
def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil, aggregate: true)
|
|
29
27
|
@verbose = verbose || ENV["RPERF_VERBOSE"] == "1"
|
|
30
28
|
@output = output
|
|
31
29
|
@format = format
|
|
32
30
|
@stat = stat
|
|
33
31
|
@stat_start_mono = Process.clock_gettime(Process::CLOCK_MONOTONIC) if @stat
|
|
34
|
-
c_opts = { frequency: frequency, mode: mode }
|
|
32
|
+
c_opts = { frequency: frequency, mode: mode, aggregate: aggregate }
|
|
35
33
|
c_opts[:signal] = signal unless signal.nil?
|
|
36
34
|
_c_start(**c_opts)
|
|
37
35
|
|
|
@@ -104,7 +102,7 @@ module Rperf
|
|
|
104
102
|
def self.print_stats(data)
|
|
105
103
|
count = data[:sampling_count] || 0
|
|
106
104
|
total_ns = data[:sampling_time_ns] || 0
|
|
107
|
-
|
|
105
|
+
sample_count = data[:sampling_count] || 0
|
|
108
106
|
mode = data[:mode] || :cpu
|
|
109
107
|
frequency = data[:frequency] || 0
|
|
110
108
|
|
|
@@ -113,7 +111,7 @@ module Rperf
|
|
|
113
111
|
|
|
114
112
|
$stderr.puts "[rperf] mode=#{mode} frequency=#{frequency}Hz"
|
|
115
113
|
$stderr.puts "[rperf] sampling: #{count} calls, #{format("%.2f", total_ms)}ms total, #{format("%.1f", avg_us)}us/call avg"
|
|
116
|
-
$stderr.puts "[rperf] samples recorded: #{
|
|
114
|
+
$stderr.puts "[rperf] samples recorded: #{sample_count}"
|
|
117
115
|
|
|
118
116
|
print_top(data)
|
|
119
117
|
end
|
|
@@ -202,7 +200,7 @@ module Rperf
|
|
|
202
200
|
print_stat_breakdown(breakdown, total_weight)
|
|
203
201
|
print_stat_runtime_info
|
|
204
202
|
print_stat_system_info
|
|
205
|
-
|
|
203
|
+
print_stat_report(data) if ENV["RPERF_STAT_REPORT"] == "1"
|
|
206
204
|
print_stat_footer(samples_raw, real_ns, data)
|
|
207
205
|
end
|
|
208
206
|
|
|
@@ -291,37 +289,20 @@ module Rperf
|
|
|
291
289
|
end
|
|
292
290
|
private_class_method :print_stat_system_info
|
|
293
291
|
|
|
294
|
-
def self.print_stat_top(samples_raw, total_weight)
|
|
295
|
-
flat = Hash.new(0)
|
|
296
|
-
samples_raw.each do |frames, weight|
|
|
297
|
-
leaf = frames.first
|
|
298
|
-
if leaf
|
|
299
|
-
_, label = leaf
|
|
300
|
-
next if SYNTHETIC_LABELS.include?(label)
|
|
301
|
-
flat[[label, leaf[0]]] += weight
|
|
302
|
-
end
|
|
303
|
-
end
|
|
304
|
-
|
|
305
|
-
return if flat.empty?
|
|
306
292
|
|
|
307
|
-
|
|
293
|
+
def self.print_stat_report(data)
|
|
308
294
|
$stderr.puts
|
|
309
|
-
$stderr.puts
|
|
310
|
-
top.each do |key, weight|
|
|
311
|
-
label, path = key
|
|
312
|
-
pct = total_weight > 0 ? weight * 100.0 / total_weight : 0.0
|
|
313
|
-
loc = path.empty? ? "" : " (#{path})"
|
|
314
|
-
$stderr.puts STAT_PCT_LINE.call(format_ms(weight), "ms", pct, "#{label}#{loc}")
|
|
315
|
-
end
|
|
295
|
+
$stderr.puts Text.encode(data, header: false)
|
|
316
296
|
end
|
|
317
|
-
private_class_method :
|
|
297
|
+
private_class_method :print_stat_report
|
|
318
298
|
|
|
319
299
|
def self.print_stat_footer(samples_raw, real_ns, data)
|
|
320
|
-
|
|
300
|
+
triggers = data[:trigger_count] || 0
|
|
321
301
|
overhead_pct = real_ns > 0 ? (data[:sampling_time_ns] || 0) * 100.0 / real_ns : 0.0
|
|
322
302
|
$stderr.puts
|
|
323
|
-
|
|
324
|
-
|
|
303
|
+
samples = data[:sampling_count] || samples_raw.size
|
|
304
|
+
$stderr.puts format(" %d samples / %d triggers, %.1f%% profiler overhead",
|
|
305
|
+
samples, triggers, overhead_pct)
|
|
325
306
|
end
|
|
326
307
|
private_class_method :print_stat_footer
|
|
327
308
|
|
|
@@ -393,11 +374,13 @@ module Rperf
|
|
|
393
374
|
when "false" then false
|
|
394
375
|
else ENV["RPERF_SIGNAL"].to_i
|
|
395
376
|
end
|
|
377
|
+
_rperf_aggregate = ENV["RPERF_AGGREGATE"] != "0"
|
|
396
378
|
_rperf_start_opts = { frequency: (ENV["RPERF_FREQUENCY"] || 1000).to_i, mode: _rperf_mode,
|
|
397
379
|
output: _rperf_stat ? ENV["RPERF_OUTPUT"] : (ENV["RPERF_OUTPUT"] || "rperf.data"),
|
|
398
380
|
verbose: ENV["RPERF_VERBOSE"] == "1",
|
|
399
381
|
format: _rperf_format,
|
|
400
|
-
stat: _rperf_stat
|
|
382
|
+
stat: _rperf_stat,
|
|
383
|
+
aggregate: _rperf_aggregate }
|
|
401
384
|
_rperf_start_opts[:signal] = _rperf_signal unless _rperf_signal.nil?
|
|
402
385
|
start(**_rperf_start_opts)
|
|
403
386
|
at_exit { stop }
|
|
@@ -407,7 +390,7 @@ module Rperf
|
|
|
407
390
|
module Text
|
|
408
391
|
module_function
|
|
409
392
|
|
|
410
|
-
def encode(data, top_n: 50)
|
|
393
|
+
def encode(data, top_n: 50, header: true)
|
|
411
394
|
samples_raw = data[:samples]
|
|
412
395
|
mode = data[:mode] || :cpu
|
|
413
396
|
frequency = data[:frequency] || 0
|
|
@@ -417,10 +400,13 @@ module Rperf
|
|
|
417
400
|
result = Rperf.send(:compute_flat_cum, samples_raw)
|
|
418
401
|
|
|
419
402
|
out = String.new
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
403
|
+
if header
|
|
404
|
+
total_ms = result[:total_weight] / 1_000_000.0
|
|
405
|
+
out << "Total: #{"%.1f" % total_ms}ms (#{mode})\n"
|
|
406
|
+
sample_count = data[:sampling_count] || samples_raw.size
|
|
407
|
+
out << "Samples: #{sample_count}, Frequency: #{frequency}Hz\n"
|
|
408
|
+
out << "\n"
|
|
409
|
+
end
|
|
424
410
|
out << format_table("Flat", result[:flat], result[:total_weight], top_n)
|
|
425
411
|
out << "\n"
|
|
426
412
|
out << format_table("Cumulative", result[:cum], result[:total_weight], top_n)
|
|
@@ -430,13 +416,12 @@ module Rperf
|
|
|
430
416
|
def format_table(title, table, total_weight, top_n)
|
|
431
417
|
sorted = table.sort_by { |_, w| -w }.first(top_n)
|
|
432
418
|
out = String.new
|
|
433
|
-
out << "#{title}:\n"
|
|
419
|
+
out << " #{title}:\n"
|
|
434
420
|
sorted.each do |key, weight|
|
|
435
421
|
label, path = key
|
|
436
|
-
ms = weight / 1_000_000.0
|
|
437
422
|
pct = total_weight > 0 ? weight * 100.0 / total_weight : 0.0
|
|
438
423
|
loc = path.empty? ? "" : " (#{path})"
|
|
439
|
-
out << (" %
|
|
424
|
+
out << format(" %14s ms %5.1f%% %s%s\n", Rperf.send(:format_ms, weight), pct, label, loc)
|
|
440
425
|
end
|
|
441
426
|
out
|
|
442
427
|
end
|