rperf 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +21 -0
- data/README.md +75 -49
- data/docs/help.md +255 -36
- data/docs/logo.svg +25 -0
- data/exe/rperf +154 -30
- data/ext/rperf/rperf.c +235 -121
- data/lib/rperf/active_job.rb +1 -0
- data/lib/rperf/rack.rb +25 -3
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf/viewer.rb +847 -0
- data/lib/rperf.rb +663 -92
- metadata +7 -4
data/docs/help.md
CHANGED
|
@@ -12,18 +12,22 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
|
|
|
12
12
|
rperf stat [options] command [args...]
|
|
13
13
|
rperf exec [options] command [args...]
|
|
14
14
|
rperf report [options] [file]
|
|
15
|
+
rperf diff [options] base target
|
|
15
16
|
rperf help
|
|
17
|
+
rperf -v / --version
|
|
16
18
|
|
|
17
19
|
### record: Profile and save to file.
|
|
18
20
|
|
|
19
|
-
-o, --output PATH Output file (default: rperf.
|
|
21
|
+
-o, --output PATH Output file (default: rperf.json.gz)
|
|
20
22
|
-f, --frequency HZ Sampling frequency in Hz (default: 1000)
|
|
21
23
|
-m, --mode MODE cpu or wall (default: cpu)
|
|
22
|
-
--format FORMAT pprof, collapsed, or text (default: auto from extension)
|
|
24
|
+
--format FORMAT json, pprof, collapsed, or text (default: auto from extension)
|
|
23
25
|
-p, --print Print text profile to stdout
|
|
24
26
|
(same as --format=text --output=/dev/stdout)
|
|
25
27
|
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
26
28
|
for nanosleep thread (default: auto)
|
|
29
|
+
--no-inherit Do not profile forked/spawned child processes
|
|
30
|
+
--no-aggregate Disable C-level sample aggregation (raw per-sample data)
|
|
27
31
|
-v, --verbose Print sampling statistics to stderr
|
|
28
32
|
|
|
29
33
|
### stat: Run command and print performance summary to stderr.
|
|
@@ -36,12 +40,20 @@ Uses wall mode by default. No file output by default.
|
|
|
36
40
|
--report Include flat/cumulative profile tables in output
|
|
37
41
|
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
38
42
|
for nanosleep thread (default: auto)
|
|
43
|
+
--no-inherit Do not profile forked/spawned child processes
|
|
44
|
+
--no-aggregate Disable C-level sample aggregation (raw per-sample data)
|
|
39
45
|
-v, --verbose Print additional sampling statistics
|
|
40
46
|
|
|
41
47
|
Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
|
|
42
48
|
GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
|
|
49
|
+
Lines are prefixed: `[Rperf]` for sampling-derived data, `[Ruby ]` for
|
|
50
|
+
runtime info, `[OS ]` for OS-level info.
|
|
43
51
|
Use --report to add flat and cumulative top-50 function tables.
|
|
44
52
|
|
|
53
|
+
When child processes are profiled (default), the stat output shows
|
|
54
|
+
aggregated data from all processes and includes a "Ruby processes profiled"
|
|
55
|
+
count. Use --no-inherit to disable child process tracking.
|
|
56
|
+
|
|
45
57
|
### exec: Run command and print full profile report to stderr.
|
|
46
58
|
|
|
47
59
|
Like `stat --report`. Uses wall mode by default. No file output by default.
|
|
@@ -51,26 +63,81 @@ Like `stat --report`. Uses wall mode by default. No file output by default.
|
|
|
51
63
|
-m, --mode MODE cpu or wall (default: wall)
|
|
52
64
|
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
53
65
|
for nanosleep thread (default: auto)
|
|
66
|
+
--no-inherit Do not profile forked/spawned child processes
|
|
67
|
+
--no-aggregate Disable C-level sample aggregation (raw per-sample data)
|
|
54
68
|
-v, --verbose Print additional sampling statistics
|
|
55
69
|
|
|
56
70
|
Shows: user/sys/real time, time breakdown, GC/memory/OS stats, profiler overhead,
|
|
57
71
|
and flat/cumulative top-50 function tables.
|
|
58
72
|
|
|
59
|
-
### report: Open
|
|
73
|
+
### report: Open a profile in the rperf viewer or go tool pprof.
|
|
60
74
|
|
|
61
75
|
--top Print top functions by flat time
|
|
62
76
|
--text Print text report
|
|
77
|
+
--html Output static HTML viewer to stdout
|
|
63
78
|
|
|
64
79
|
Default (no flag): opens interactive web UI in browser.
|
|
65
|
-
Default file: rperf.
|
|
80
|
+
Default file: rperf.json.gz
|
|
81
|
+
|
|
82
|
+
`--html` writes an HTML page to stdout with profile data embedded inline.
|
|
83
|
+
No server is needed — open it directly in a browser. d3 and
|
|
84
|
+
d3-flame-graph are loaded from CDN, so an internet connection is
|
|
85
|
+
required on first viewing. Useful for sharing or hosting on static
|
|
86
|
+
sites (e.g., GitHub Pages).
|
|
87
|
+
|
|
88
|
+
rperf report --html profile.json.gz > report.html
|
|
89
|
+
|
|
90
|
+
### diff: Compare two profiles (target - base). Requires Go.
|
|
66
91
|
|
|
67
|
-
|
|
92
|
+
Accepts `.json.gz` (auto-converted to pprof) or `.pb.gz` files.
|
|
68
93
|
|
|
69
94
|
--top Print top functions by diff
|
|
70
95
|
--text Print text diff report
|
|
71
96
|
|
|
72
97
|
Default (no flag): opens diff in browser.
|
|
73
98
|
|
|
99
|
+
### Multi-process profiling
|
|
100
|
+
|
|
101
|
+
By default, rperf profiles forked and spawned Ruby child processes.
|
|
102
|
+
Profiles from all processes are merged into a single output. Each child
|
|
103
|
+
process's samples are tagged with a `%pid` label for per-process filtering.
|
|
104
|
+
|
|
105
|
+
# Profile a preforking server (Unicorn, Puma, etc.)
|
|
106
|
+
rperf stat -m wall bundle exec unicorn
|
|
107
|
+
rperf record -m wall -o profile.json.gz bundle exec unicorn
|
|
108
|
+
|
|
109
|
+
# Profile with fork
|
|
110
|
+
rperf stat ruby -e '4.times { fork { work } }; Process.waitall'
|
|
111
|
+
|
|
112
|
+
# Disable child process tracking
|
|
113
|
+
rperf stat --no-inherit ruby app.rb
|
|
114
|
+
|
|
115
|
+
How it works:
|
|
116
|
+
|
|
117
|
+
- On fork: `Process._fork` hook restarts profiling in the child and sets
|
|
118
|
+
a `%pid` label. When the child exits, its profile is saved to a
|
|
119
|
+
temporary session directory.
|
|
120
|
+
- On spawn/system: The spawned Ruby process inherits `RUBYLIB` (pointing
|
|
121
|
+
to rperf's lib directory) and `RUBYOPT=-rrperf`, plus `RPERF_SESSION_DIR`.
|
|
122
|
+
It auto-starts profiling and writes its profile to the session directory.
|
|
123
|
+
- When the root process exits, it aggregates all profiles from the
|
|
124
|
+
session directory into a single output (stat report or file).
|
|
125
|
+
- The session directory is cleaned up after aggregation.
|
|
126
|
+
|
|
127
|
+
Limitations:
|
|
128
|
+
|
|
129
|
+
- Daemon children (Process.daemon) that outlive the parent will have
|
|
130
|
+
their profiles lost, since the parent aggregates and cleans up the
|
|
131
|
+
session directory at exit.
|
|
132
|
+
- Cross-process snapshots (Rperf.snapshot) are not supported; snapshots
|
|
133
|
+
only cover the current process.
|
|
134
|
+
- Only Ruby child processes are profiled; non-Ruby children (shell
|
|
135
|
+
scripts, Python, etc.) are not affected.
|
|
136
|
+
- Child processes that use rperf independently (Rperf.start in their
|
|
137
|
+
own code) will conflict with the inherited auto-start session.
|
|
138
|
+
Such programs should clear RPERF_ENABLED from their environment
|
|
139
|
+
before requiring rperf.
|
|
140
|
+
|
|
74
141
|
### Examples
|
|
75
142
|
|
|
76
143
|
rperf record ruby app.rb
|
|
@@ -82,6 +149,8 @@ Default (no flag): opens diff in browser.
|
|
|
82
149
|
rperf stat ruby app.rb
|
|
83
150
|
rperf stat --report ruby app.rb
|
|
84
151
|
rperf stat -o profile.pb.gz ruby app.rb
|
|
152
|
+
rperf stat -m wall bundle exec unicorn
|
|
153
|
+
rperf stat --no-inherit ruby app.rb
|
|
85
154
|
rperf exec ruby app.rb
|
|
86
155
|
rperf exec -m cpu ruby app.rb
|
|
87
156
|
rperf report
|
|
@@ -112,12 +181,17 @@ Rperf.save("profile.txt", data)
|
|
|
112
181
|
|
|
113
182
|
### Rperf.start parameters
|
|
114
183
|
|
|
115
|
-
frequency: Sampling frequency in Hz (Integer, default: 1000)
|
|
184
|
+
frequency: Sampling frequency in Hz (Integer, 1..10000, default: 1000)
|
|
116
185
|
mode: :cpu or :wall (Symbol, default: :cpu)
|
|
117
186
|
output: File path to write on stop (String or nil)
|
|
118
187
|
verbose: Print statistics to stderr (true/false, default: false)
|
|
119
|
-
format: :pprof, :collapsed, :text, or nil for auto-detect (Symbol or nil)
|
|
188
|
+
format: :json, :pprof, :collapsed, :text, or nil for auto-detect (Symbol or nil)
|
|
120
189
|
defer: Start with timer paused; use Rperf.profile to activate (default: false)
|
|
190
|
+
inherit: Child process tracking: :fork (default), true (fork+spawn), false (none)
|
|
191
|
+
Note: CLI defaults to true (--no-inherit to disable)
|
|
192
|
+
signal: Timer signal (Linux only): nil (default, auto), false (use nanosleep),
|
|
193
|
+
or a signal number (Integer)
|
|
194
|
+
aggregate: Aggregate samples in C (default: true). false returns raw per-sample data
|
|
121
195
|
|
|
122
196
|
### Rperf.stop return value
|
|
123
197
|
|
|
@@ -126,7 +200,8 @@ nil if profiler was not running; otherwise a Hash:
|
|
|
126
200
|
```ruby
|
|
127
201
|
{ mode: :cpu, # or :wall
|
|
128
202
|
frequency: 500,
|
|
129
|
-
|
|
203
|
+
trigger_count: 1300, # number of timer triggers
|
|
204
|
+
sampling_count: 1234, # number of timer callbacks (may differ from trigger_count)
|
|
130
205
|
sampling_time_ns: 56789,
|
|
131
206
|
detected_thread_count: 4, # threads seen during profiling
|
|
132
207
|
start_time_ns: 17740..., # CLOCK_REALTIME epoch nanos
|
|
@@ -151,7 +226,9 @@ Only works in aggregate mode (the default). Returns nil if not profiling.
|
|
|
151
226
|
|
|
152
227
|
When `clear: true` is given, resets aggregated data after taking the snapshot.
|
|
153
228
|
This enables interval-based profiling where each snapshot covers only the
|
|
154
|
-
period since the last clear.
|
|
229
|
+
period since the last clear. Note: the frame table is intentionally retained
|
|
230
|
+
(frame IDs must stay stable for GC safety and thread data consistency), so
|
|
231
|
+
`unique_frames` may accumulate across intervals.
|
|
155
232
|
|
|
156
233
|
```ruby
|
|
157
234
|
Rperf.start(frequency: 1000)
|
|
@@ -215,13 +292,19 @@ In pprof output, use labels for filtering and grouping:
|
|
|
215
292
|
|
|
216
293
|
With `defer: true`, the profiler infrastructure is set up but the sampling
|
|
217
294
|
timer does not start. Use `Rperf.profile` to activate the timer for specific
|
|
218
|
-
sections. Outside `profile` blocks, overhead is zero.
|
|
295
|
+
sections. Outside `profile` blocks, the timer is disarmed and overhead is zero.
|
|
296
|
+
|
|
297
|
+
Note: the timer is process-wide, not per-thread. While a `profile` block is
|
|
298
|
+
active on one thread, other threads running at the same time will also be
|
|
299
|
+
sampled. Their samples carry their own labels (not the calling thread's labels),
|
|
300
|
+
so they can be distinguished in the profile. This design is intentional: it
|
|
301
|
+
provides complete visibility into what the process was doing during profiled
|
|
302
|
+
sections, including GVL contention and background work.
|
|
219
303
|
|
|
220
304
|
### Rperf.profile(**labels, &block)
|
|
221
305
|
|
|
222
|
-
Activates the sampling timer for the block duration and applies labels
|
|
223
|
-
Designed for use with `start(defer: true)
|
|
224
|
-
code paths.
|
|
306
|
+
Activates the sampling timer for the block duration and applies labels to
|
|
307
|
+
the current thread. Designed for use with `start(defer: true)`.
|
|
225
308
|
|
|
226
309
|
```ruby
|
|
227
310
|
Rperf.start(defer: true, mode: :wall)
|
|
@@ -242,9 +325,21 @@ running). Raises `RuntimeError` if not started, `ArgumentError` without block.
|
|
|
242
325
|
|
|
243
326
|
Returns the current thread's labels as a Hash. Empty hash if none set.
|
|
244
327
|
|
|
328
|
+
### Rperf.load(path)
|
|
329
|
+
|
|
330
|
+
Loads a `.json.gz` or `.json` profile file (saved by `rperf record` or `Rperf.save`)
|
|
331
|
+
and returns the parsed data hash (same format as `Rperf.stop` / `Rperf.snapshot`).
|
|
332
|
+
Gzip is auto-detected by magic bytes, so both compressed and plain files work.
|
|
333
|
+
Warns to stderr if the file was saved by a different rperf version.
|
|
334
|
+
|
|
335
|
+
```ruby
|
|
336
|
+
data = Rperf.load("rperf.json.gz") # gzip compressed
|
|
337
|
+
data = Rperf.load("profile.json") # plain text JSON
|
|
338
|
+
```
|
|
339
|
+
|
|
245
340
|
### Rperf.save(path, data, format: nil)
|
|
246
341
|
|
|
247
|
-
Writes data to path. format: :pprof, :collapsed, or :text.
|
|
342
|
+
Writes data to path. format: :json, :pprof, :collapsed, or :text.
|
|
248
343
|
nil auto-detects from extension.
|
|
249
344
|
|
|
250
345
|
### Rperf::RackMiddleware (Rack)
|
|
@@ -284,6 +379,88 @@ Sidekiq.configure_server do |config|
|
|
|
284
379
|
end
|
|
285
380
|
```
|
|
286
381
|
|
|
382
|
+
### Rperf::Viewer (Rack middleware)
|
|
383
|
+
|
|
384
|
+
In-browser profiling UI with flamegraph, top table, and tag breakdown.
|
|
385
|
+
Requires `require "rperf/viewer"`.
|
|
386
|
+
|
|
387
|
+
**Security note**: Rperf::Viewer has no built-in authentication and exposes
|
|
388
|
+
profiling data (including stack traces and label values) to anyone who can
|
|
389
|
+
reach the endpoint. In production, always restrict access using your
|
|
390
|
+
framework's authentication — see "Access control" below. The UI loads
|
|
391
|
+
d3.js and d3-flame-graph from CDNs (cdnjs.cloudflare.com, cdn.jsdelivr.net).
|
|
392
|
+
|
|
393
|
+
```ruby
|
|
394
|
+
# config.ru or Rails config
|
|
395
|
+
require "rperf/viewer"
|
|
396
|
+
use Rperf::Viewer # mount at /rperf/ (default)
|
|
397
|
+
use Rperf::Viewer, path: "/profiler" # custom mount path
|
|
398
|
+
use Rperf::Viewer, max_snapshots: 12 # keep fewer snapshots (default: 24)
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
Take snapshots via `Rperf::Viewer.instance.take_snapshot!` or
|
|
402
|
+
`Rperf::Viewer.instance.add_snapshot(data)`.
|
|
403
|
+
|
|
404
|
+
#### Typical setup with RackMiddleware and periodic snapshots
|
|
405
|
+
|
|
406
|
+
```ruby
|
|
407
|
+
require "rperf/viewer"
|
|
408
|
+
require "rperf/rack"
|
|
409
|
+
|
|
410
|
+
Rperf.start(mode: :wall, frequency: 999, defer: true)
|
|
411
|
+
use Rperf::Viewer
|
|
412
|
+
use Rperf::RackMiddleware
|
|
413
|
+
run MyApp
|
|
414
|
+
|
|
415
|
+
# Take a snapshot every 60 minutes in a background thread
|
|
416
|
+
Thread.new do
|
|
417
|
+
loop do
|
|
418
|
+
sleep 60 * 60
|
|
419
|
+
Rperf::Viewer.instance&.take_snapshot!
|
|
420
|
+
end
|
|
421
|
+
end
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
Visit `/rperf/` in a browser. Snapshots accumulate automatically
|
|
425
|
+
(up to `max_snapshots`, oldest are discarded). You can also trigger
|
|
426
|
+
a snapshot manually via an endpoint or console:
|
|
427
|
+
|
|
428
|
+
```ruby
|
|
429
|
+
Rperf::Viewer.instance.take_snapshot!
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
#### Access control
|
|
433
|
+
|
|
434
|
+
Rperf::Viewer has no built-in authentication. Restrict access using
|
|
435
|
+
your framework's existing mechanisms:
|
|
436
|
+
|
|
437
|
+
```ruby
|
|
438
|
+
# Rails: route constraint (e.g., admin-only)
|
|
439
|
+
# config/routes.rb
|
|
440
|
+
require "rperf/viewer"
|
|
441
|
+
constraints ->(req) { req.session[:admin] } do
|
|
442
|
+
mount Rperf::Viewer.new(nil, path: ""), at: "/rperf"
|
|
443
|
+
end
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
#### UI tabs
|
|
447
|
+
|
|
448
|
+
- **Flamegraph** — Interactive flamegraph (d3-flame-graph). Click to zoom.
|
|
449
|
+
- **Top** — Flat/cumulative weight table. Click column headers to sort.
|
|
450
|
+
- **Tags** — Label key/value breakdown with weight bars. Click a row to
|
|
451
|
+
set tagfocus and switch to Flamegraph.
|
|
452
|
+
|
|
453
|
+
#### Filtering controls
|
|
454
|
+
|
|
455
|
+
- **tagfocus** — Regex matched against label values. Press Enter to apply.
|
|
456
|
+
- **tagignore** — Dropdown checkboxes. Select `key = (none)` to exclude
|
|
457
|
+
samples without that key (e.g., background threads without `endpoint`).
|
|
458
|
+
- **tagroot** — Dropdown checkboxes for label keys. Checked keys are
|
|
459
|
+
prepended as root frames (e.g., `[endpoint: GET /users]`).
|
|
460
|
+
- **tagleaf** — Same as tagroot but appended as leaf frames.
|
|
461
|
+
|
|
462
|
+
Tag keys are sorted alphabetically (`%`-prefixed VM state keys appear first).
|
|
463
|
+
|
|
287
464
|
## PROFILING MODES
|
|
288
465
|
|
|
289
466
|
- **cpu** — Measures per-thread CPU time via Linux thread clock.
|
|
@@ -293,15 +470,26 @@ end
|
|
|
293
470
|
- **wall** — Measures wall-clock time (CLOCK_MONOTONIC).
|
|
294
471
|
Use for: finding where wall time goes, including I/O, sleep, GVL
|
|
295
472
|
contention, and off-CPU waits.
|
|
296
|
-
Includes
|
|
473
|
+
Includes VM state labels (see below).
|
|
297
474
|
|
|
298
475
|
## OUTPUT FORMATS
|
|
299
476
|
|
|
300
|
-
###
|
|
477
|
+
### json (default) — rperf native format
|
|
478
|
+
|
|
479
|
+
JSON representation of the internal data hash
|
|
480
|
+
(the same hash returned by `Rperf.stop` / `Rperf.snapshot` — see
|
|
481
|
+
"Rperf.stop return value" above for the full structure).
|
|
482
|
+
Preserves all data including labels, VM state, thread info, and statistics.
|
|
483
|
+
Readable by non-Ruby tools (Python, jq, etc.).
|
|
484
|
+
Extension convention: `.json.gz` (gzip-compressed, default) or `.json` (plain text).
|
|
485
|
+
View with: `rperf report` (opens rperf viewer in browser, no Go required).
|
|
486
|
+
Load programmatically: `data = Rperf.load("rperf.json.gz")`
|
|
487
|
+
|
|
488
|
+
### pprof
|
|
301
489
|
|
|
302
490
|
Gzip-compressed protobuf. Standard pprof format.
|
|
303
491
|
Extension convention: `.pb.gz`
|
|
304
|
-
View with: `go tool pprof`, pprof-rs, or
|
|
492
|
+
View with: `go tool pprof`, pprof-rs, speedscope, or `rperf report` (requires Go).
|
|
305
493
|
|
|
306
494
|
Embedded metadata:
|
|
307
495
|
|
|
@@ -358,27 +546,45 @@ Example output:
|
|
|
358
546
|
|
|
359
547
|
Format is auto-detected from the output file extension:
|
|
360
548
|
|
|
361
|
-
.
|
|
362
|
-
.
|
|
363
|
-
|
|
549
|
+
.json.gz → json (rperf native, gzip compressed, default)
|
|
550
|
+
.json → json (plain text, readable by jq)
|
|
551
|
+
.pb.gz → pprof
|
|
552
|
+
.collapsed → collapsed
|
|
553
|
+
.txt → text
|
|
364
554
|
|
|
365
555
|
The `--format` flag (CLI) or `format:` parameter (API) overrides auto-detect.
|
|
366
556
|
|
|
367
|
-
##
|
|
557
|
+
## VM STATE LABELS
|
|
368
558
|
|
|
369
|
-
|
|
559
|
+
rperf tracks GVL and GC states as **labels** (tags) on samples, not as
|
|
560
|
+
stack frames. The C extension records a VM state per sample, and the Ruby
|
|
561
|
+
layer merges it into the sample's label set using reserved keys `%GVL`
|
|
562
|
+
and `%GC`.
|
|
370
563
|
|
|
371
|
-
|
|
564
|
+
In wall mode, GVL state labels are recorded:
|
|
565
|
+
|
|
566
|
+
- **%GVL=blocked** — The thread was off-GVL (I/O, sleep, C extension
|
|
372
567
|
releasing GVL). Attributed to the stack at SUSPENDED.
|
|
373
|
-
-
|
|
568
|
+
- **%GVL=wait** — The thread was waiting to reacquire the GVL after
|
|
374
569
|
becoming ready. Indicates GVL contention. Attributed to the same stack.
|
|
375
570
|
|
|
376
|
-
In both modes, GC
|
|
571
|
+
In both modes, GC state labels are recorded:
|
|
572
|
+
|
|
573
|
+
- **%GC=mark** — Time spent in GC marking phase (wall time).
|
|
574
|
+
- **%GC=sweep** — Time spent in GC sweeping phase (wall time).
|
|
575
|
+
|
|
576
|
+
These labels appear in `label_sets` (e.g., `{:"%GVL" => "blocked"}`,
|
|
577
|
+
`{:"%GC" => "mark"}`) and are written into pprof sample labels.
|
|
578
|
+
|
|
579
|
+
To add VM state as frames in flamegraphs, use pprof tag options:
|
|
377
580
|
|
|
378
|
-
|
|
379
|
-
|
|
581
|
+
go tool pprof -tagleaf=%GVL profile.pb.gz
|
|
582
|
+
go tool pprof -tagroot=%GC profile.pb.gz
|
|
380
583
|
|
|
381
|
-
|
|
584
|
+
To filter by VM state:
|
|
585
|
+
|
|
586
|
+
go tool pprof -tagfocus=%GVL=blocked profile.pb.gz
|
|
587
|
+
go tool pprof -tagfocus=%GC=mark profile.pb.gz
|
|
382
588
|
|
|
383
589
|
## INTERPRETING RESULTS
|
|
384
590
|
|
|
@@ -403,20 +609,24 @@ To convert: 1,000,000 ns = 1 ms, 1,000,000,000 ns = 1 s.
|
|
|
403
609
|
**Problem: slow request / high latency**
|
|
404
610
|
- Mode: wall
|
|
405
611
|
- Look for: functions with high cum wall time.
|
|
406
|
-
- If
|
|
407
|
-
|
|
612
|
+
- If %GVL=blocked is dominant → I/O or sleep is the bottleneck.
|
|
613
|
+
Filter: `go tool pprof -tagfocus=%GVL=blocked profile.pb.gz`
|
|
614
|
+
- If %GVL=wait is dominant → GVL contention; reduce GVL-holding work
|
|
408
615
|
or move work to Ractors / child processes.
|
|
616
|
+
Filter: `go tool pprof -tagfocus=%GVL=wait profile.pb.gz`
|
|
409
617
|
|
|
410
618
|
**Problem: GC pauses**
|
|
411
619
|
- Mode: cpu or wall
|
|
412
|
-
- Look for:
|
|
413
|
-
|
|
414
|
-
- High
|
|
620
|
+
- Look for: samples with %GC=mark and %GC=sweep labels.
|
|
621
|
+
Filter: `go tool pprof -tagfocus=%GC profile.pb.gz`
|
|
622
|
+
- High %GC=mark → too many live objects; reduce allocations.
|
|
623
|
+
- High %GC=sweep → too many short-lived objects; reuse or pool.
|
|
415
624
|
|
|
416
625
|
**Problem: multithreaded app slower than expected**
|
|
417
626
|
- Mode: wall
|
|
418
|
-
- Look for:
|
|
419
|
-
|
|
627
|
+
- Look for: samples with %GVL=wait label across threads.
|
|
628
|
+
Filter: `go tool pprof -tagfocus=%GVL=wait profile.pb.gz`
|
|
629
|
+
- High %GVL=wait means threads are serialized on the GVL.
|
|
420
630
|
|
|
421
631
|
## READING COLLAPSED STACKS PROGRAMMATICALLY
|
|
422
632
|
|
|
@@ -454,11 +664,20 @@ Used internally by the CLI to pass options to the auto-started profiler:
|
|
|
454
664
|
RPERF_OUTPUT=path Output file path
|
|
455
665
|
RPERF_FREQUENCY=hz Sampling frequency
|
|
456
666
|
RPERF_MODE=cpu|wall Profiling mode
|
|
457
|
-
RPERF_FORMAT=fmt pprof, collapsed, or text
|
|
667
|
+
RPERF_FORMAT=fmt json, pprof, collapsed, or text
|
|
458
668
|
RPERF_VERBOSE=1 Print statistics
|
|
459
669
|
RPERF_SIGNAL=N|false Timer signal number or 'false' for nanosleep (Linux only)
|
|
460
670
|
RPERF_STAT=1 Enable stat mode (used by rperf stat)
|
|
461
671
|
RPERF_STAT_REPORT=1 Include profile tables in stat output
|
|
672
|
+
RPERF_AGGREGATE=0 Disable C-level sample aggregation (raw mode)
|
|
673
|
+
RPERF_DEFER=1 Start with timer paused; use Rperf.profile to activate
|
|
674
|
+
RPERF_TMPDIR=path Base directory for session directories (overrides default tmpdir)
|
|
675
|
+
|
|
676
|
+
Internal variables (set automatically by the CLI — not for manual use):
|
|
677
|
+
|
|
678
|
+
RPERF_SESSION_DIR=path Session directory for multi-process profiling
|
|
679
|
+
RPERF_ROOT_PROCESS=pid Marks the root aggregating process
|
|
680
|
+
RPERF_STAT_COMMAND=str Command string displayed in stat output
|
|
462
681
|
|
|
463
682
|
## TIPS
|
|
464
683
|
|
data/docs/logo.svg
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
<svg viewBox="0 0 280 160" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<!-- Gauge arc -->
|
|
3
|
+
<path d="M70,100 A70,70 0 0,1 210,100" fill="none" stroke="#cc342d" stroke-width="3"/>
|
|
4
|
+
|
|
5
|
+
<!-- Tick marks -->
|
|
6
|
+
<g stroke="#cc342d" stroke-width="1.5">
|
|
7
|
+
<line x1="76" y1="68" x2="82" y2="74"/>
|
|
8
|
+
<line x1="100" y1="42" x2="104" y2="50"/>
|
|
9
|
+
<line x1="140" y1="32" x2="140" y2="40"/>
|
|
10
|
+
<line x1="180" y1="42" x2="176" y2="50"/>
|
|
11
|
+
<line x1="204" y1="68" x2="198" y2="74"/>
|
|
12
|
+
</g>
|
|
13
|
+
|
|
14
|
+
<!-- Ruby gem (below 12 o'clock tick) -->
|
|
15
|
+
<g transform="translate(140,52)">
|
|
16
|
+
<polygon points="-5,-5.5 5,-5.5 7.5,-1.5 0,6.5 -7.5,-1.5" fill="none" stroke="#cc342d" stroke-width="0.9"/>
|
|
17
|
+
</g>
|
|
18
|
+
|
|
19
|
+
<!-- Needle (-30%) -->
|
|
20
|
+
<line x1="140" y1="100" x2="172" y2="69" stroke="#8a7038" stroke-width="2.5" stroke-linecap="round"/>
|
|
21
|
+
<circle cx="140" cy="100" r="4" fill="#8a7038"/>
|
|
22
|
+
|
|
23
|
+
<!-- "rperf" text -->
|
|
24
|
+
<text x="140" y="148" text-anchor="middle" font-family="'Space Mono', monospace" font-size="24" font-weight="700" fill="#8a7038" letter-spacing="3">rperf</text>
|
|
25
|
+
</svg>
|