rperf 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +71 -47
- data/docs/help.md +154 -32
- data/docs/logo.svg +25 -0
- data/exe/rperf +121 -26
- data/ext/rperf/rperf.c +117 -89
- data/lib/rperf/rack.rb +25 -3
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf/viewer.rb +798 -0
- data/lib/rperf.rb +166 -49
- metadata +6 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 520b1f5fd883bd68232c2b714aa1a66dbf811a3f3d2b2b54c212233f4a97d1c4
|
|
4
|
+
data.tar.gz: 11f5a6f52444abebc28a41055726eab246d623457c85001ba737fcb790b20cea
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 88c50af83f66f569739bd37377cc2dfc51f7bc6970243cb14bf5b2a7defc9ab6fee60f0e43f85e30f5acf0a315c2774ff0fd88b36c6ed913a5e9d6ef4aa63352
|
|
7
|
+
data.tar.gz: 63359d42f26529e726ef070f335e726b62bef23b0c897092b6010385ae91d01790039da35e300e734c4dfebb796c8b687759bdce395e13fcb778dca89c0318a1
|
data/README.md
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
</p>
|
|
18
18
|
|
|
19
19
|
<p align="center">
|
|
20
|
-
|
|
20
|
+
Built-in flamegraph viewer · CPU mode & wall mode (GVL + GC tracking)
|
|
21
21
|
</p>
|
|
22
22
|
|
|
23
23
|
<p align="center">
|
|
@@ -29,29 +29,34 @@
|
|
|
29
29
|
## See It in Action
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
|
-
$ gem install rperf
|
|
33
32
|
$ rperf exec ruby fib.rb
|
|
34
33
|
|
|
35
34
|
Performance stats for 'ruby fib.rb':
|
|
36
35
|
|
|
37
|
-
2,
|
|
38
|
-
|
|
39
|
-
2,
|
|
36
|
+
2,023.3 ms user
|
|
37
|
+
4.3 ms sys
|
|
38
|
+
2,001.8 ms real
|
|
40
39
|
|
|
41
|
-
2,
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
40
|
+
2,000.3 ms 100.0% [Rperf] CPU execution
|
|
41
|
+
3.0 ms [Ruby ] GC time (4 count: 2 minor, 2 major)
|
|
42
|
+
48,741 [Ruby ] allocated objects
|
|
43
|
+
27,034 [Ruby ] freed objects
|
|
44
|
+
1 [Ruby ] detected threads
|
|
45
|
+
16 MB [OS ] peak memory (maxrss)
|
|
46
|
+
5,784 [OS ] page faults (5,783 minor, 1 major)
|
|
47
|
+
22 [OS ] context switches (13 voluntary, 9 involuntary)
|
|
48
|
+
0 MB [OS ] disk I/O (0 MB read, 0 MB write)
|
|
46
49
|
|
|
47
50
|
Flat:
|
|
48
|
-
|
|
51
|
+
1,998.4 ms 99.9% Object#fibonacci (fib.rb)
|
|
52
|
+
1.9 ms 0.1% Module#method_added (<C method>)
|
|
49
53
|
|
|
50
54
|
Cumulative:
|
|
51
|
-
2,
|
|
52
|
-
|
|
55
|
+
2,000.3 ms 100.0% <main> (fib.rb)
|
|
56
|
+
1,998.4 ms 99.9% Object#fibonacci (fib.rb)
|
|
57
|
+
1.9 ms 0.1% Module#method_added (<C method>)
|
|
53
58
|
|
|
54
|
-
|
|
59
|
+
1999 samples / 1999 triggers, 0.1% profiler overhead
|
|
55
60
|
```
|
|
56
61
|
|
|
57
62
|
## Quick Start
|
|
@@ -60,17 +65,16 @@ $ rperf exec ruby fib.rb
|
|
|
60
65
|
# Performance summary (wall mode, prints to stderr)
|
|
61
66
|
rperf stat ruby app.rb
|
|
62
67
|
|
|
63
|
-
# Record a
|
|
64
|
-
rperf record ruby app.rb
|
|
65
|
-
rperf record -m wall
|
|
68
|
+
# Record a profile to file
|
|
69
|
+
rperf record ruby app.rb # → rperf.json.gz (cpu mode, default)
|
|
70
|
+
rperf record -m wall ruby server.rb # wall mode
|
|
66
71
|
|
|
67
|
-
# View results (
|
|
68
|
-
rperf report
|
|
69
|
-
rperf report --top profile.
|
|
72
|
+
# View results in browser (no external tools needed)
|
|
73
|
+
rperf report # open rperf.json.gz in viewer
|
|
74
|
+
rperf report --top profile.json.gz # print top functions to terminal
|
|
70
75
|
|
|
71
|
-
# Compare two profiles
|
|
72
|
-
rperf diff before.
|
|
73
|
-
rperf diff --top before.pb.gz after.pb.gz # print diff to terminal
|
|
76
|
+
# Compare two profiles (requires Go)
|
|
77
|
+
rperf diff before.json.gz after.json.gz # open diff in browser
|
|
74
78
|
```
|
|
75
79
|
|
|
76
80
|
### Ruby API
|
|
@@ -79,7 +83,7 @@ rperf diff --top before.pb.gz after.pb.gz # print diff to terminal
|
|
|
79
83
|
require "rperf"
|
|
80
84
|
|
|
81
85
|
# Block form — profiles and saves to file
|
|
82
|
-
Rperf.start(output: "profile.
|
|
86
|
+
Rperf.start(output: "profile.json.gz", frequency: 500, mode: :cpu) do
|
|
83
87
|
# code to profile
|
|
84
88
|
end
|
|
85
89
|
|
|
@@ -87,18 +91,37 @@ end
|
|
|
87
91
|
Rperf.start(frequency: 1000, mode: :wall)
|
|
88
92
|
# ...
|
|
89
93
|
data = Rperf.stop
|
|
90
|
-
Rperf.save("profile.
|
|
94
|
+
Rperf.save("profile.json.gz", data)
|
|
91
95
|
```
|
|
92
96
|
|
|
97
|
+
### In-browser Viewer
|
|
98
|
+
|
|
99
|
+
```ruby
|
|
100
|
+
# config.ru
|
|
101
|
+
require "rperf/viewer"
|
|
102
|
+
require "rperf/rack"
|
|
103
|
+
|
|
104
|
+
Rperf.start(mode: :wall, defer: true)
|
|
105
|
+
use Rperf::Viewer # visit /rperf/ for flamegraph UI
|
|
106
|
+
use Rperf::RackMiddleware # labels each request
|
|
107
|
+
run MyApp
|
|
108
|
+
|
|
109
|
+
# Snapshot every 60 minutes
|
|
110
|
+
Thread.new { loop { sleep 3600; Rperf::Viewer.instance&.take_snapshot! } }
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
> **Note:** `Rperf::Viewer` has no built-in authentication. In production, restrict access with your framework's auth mechanisms (e.g., route constraints in Rails). See the [manual](https://ko1.github.io/rperf/docs/manual/) for examples.
|
|
114
|
+
|
|
93
115
|
### Environment Variables
|
|
94
116
|
|
|
95
117
|
Profile without code changes (e.g., Rails):
|
|
96
118
|
|
|
97
119
|
```bash
|
|
98
|
-
RPERF_ENABLED=1 RPERF_MODE=wall
|
|
120
|
+
RPERF_ENABLED=1 RPERF_MODE=wall ruby app.rb # → rperf.json.gz
|
|
121
|
+
rperf report # open in viewer
|
|
99
122
|
```
|
|
100
123
|
|
|
101
|
-
Run `rperf help` for full documentation, or see the [online manual](https://ko1.github.io/rperf/).
|
|
124
|
+
Run `rperf help` for full documentation, or see the [online manual](https://ko1.github.io/rperf/docs/manual/).
|
|
102
125
|
|
|
103
126
|
## Subcommands
|
|
104
127
|
|
|
@@ -106,11 +129,11 @@ Inspired by Linux `perf` — familiar subcommand interface for profiling workflo
|
|
|
106
129
|
|
|
107
130
|
| Command | Description |
|
|
108
131
|
|---------|-------------|
|
|
109
|
-
| `rperf record` | Profile a command and save to file |
|
|
132
|
+
| `rperf record` | Profile a command and save to file (default: `.json.gz`) |
|
|
110
133
|
| `rperf stat` | Profile a command and print summary to stderr |
|
|
111
134
|
| `rperf exec` | Profile a command and print full report to stderr |
|
|
112
|
-
| `rperf report` | Open
|
|
113
|
-
| `rperf diff` | Compare two
|
|
135
|
+
| `rperf report` | Open viewer for `.json.gz`; wraps `go tool pprof` for `.pb.gz` (requires Go) |
|
|
136
|
+
| `rperf diff` | Compare two profiles (requires Go) |
|
|
114
137
|
| `rperf help` | Show full reference documentation |
|
|
115
138
|
|
|
116
139
|
## How It Works
|
|
@@ -147,23 +170,23 @@ If a safepoint is delayed, the sample carries proportionally more weight. The to
|
|
|
147
170
|
|
|
148
171
|
Use `cpu` to find what consumes CPU. Use `wall` to find what makes things slow (I/O, GVL contention, GC).
|
|
149
172
|
|
|
150
|
-
###
|
|
173
|
+
### GVL and GC Labels (GVL labels: wall mode only)
|
|
151
174
|
|
|
152
|
-
rperf hooks GVL and GC events to attribute non-CPU time:
|
|
175
|
+
rperf hooks GVL and GC events to attribute non-CPU time. These are recorded as labels on samples rather than synthetic stack frames:
|
|
153
176
|
|
|
154
|
-
|
|
|
177
|
+
| Label | Meaning |
|
|
155
178
|
|-------|---------|
|
|
156
|
-
|
|
|
157
|
-
|
|
|
158
|
-
|
|
|
159
|
-
|
|
|
179
|
+
| `%GVL: blocked` | Off-GVL time (I/O, sleep, C extension releasing GVL) |
|
|
180
|
+
| `%GVL: wait` | Waiting to reacquire the GVL (contention) |
|
|
181
|
+
| `%GC: mark` | Time in GC mark phase |
|
|
182
|
+
| `%GC: sweep` | Time in GC sweep phase |
|
|
160
183
|
|
|
161
184
|
## Why rperf?
|
|
162
185
|
|
|
163
186
|
- **Accurate despite safepoints** — Safepoint sampling is *safer* (no async-signal-safety issues), but normally *inaccurate*. rperf compensates with real time-delta weights, so profiles faithfully reflect where time is actually spent.
|
|
164
|
-
- **See the whole picture** (wall mode) — GVL contention, off-GVL I/O, GC marking/sweeping — all attributed to the call stacks responsible, via
|
|
165
|
-
- **
|
|
166
|
-
- **
|
|
187
|
+
- **See the whole picture** (wall mode) — GVL contention, off-GVL I/O, GC marking/sweeping — all attributed to the call stacks responsible, via sample labels.
|
|
188
|
+
- **Built-in viewer** — Flamegraph, Top, Tags tabs with interactive tag filtering. No external tools needed to analyze profiles.
|
|
189
|
+
- **Low overhead** — Signal-based timer on Linux (no extra thread). ~1–5 µs per sample.
|
|
167
190
|
- **Zero code changes** — Profile any Ruby program via CLI or environment variables. Drop-in for Rails, too.
|
|
168
191
|
- **`perf`-like CLI** — `record`, `stat`, `report`, `diff` — if you know Linux perf, you already know rperf.
|
|
169
192
|
|
|
@@ -172,19 +195,20 @@ rperf hooks GVL and GC events to attribute non-CPU time:
|
|
|
172
195
|
- **Method-level only** — no line-level granularity.
|
|
173
196
|
- **Ruby >= 3.4.0** — uses recent VM internals (postponed jobs, thread event hooks).
|
|
174
197
|
- **POSIX only** — Linux, macOS. No Windows.
|
|
175
|
-
- **No fork
|
|
198
|
+
- **No fork following** — profiling stops in fork(2) child processes (the child can start a new session).
|
|
176
199
|
|
|
177
200
|
|
|
178
201
|
## Output Formats
|
|
179
202
|
|
|
180
|
-
| Format | Extension |
|
|
181
|
-
|
|
182
|
-
|
|
|
183
|
-
|
|
|
184
|
-
|
|
|
203
|
+
| Format | Extension | Viewer |
|
|
204
|
+
|--------|-----------|--------|
|
|
205
|
+
| JSON (default) | `.json.gz` | `rperf report` (built-in viewer), `Rperf.load`, any JSON tool |
|
|
206
|
+
| pprof | `.pb.gz` | `go tool pprof` (requires Go), speedscope |
|
|
207
|
+
| collapsed | `.collapsed` | FlameGraph, speedscope |
|
|
208
|
+
| text | `.txt` | any text viewer |
|
|
185
209
|
|
|
186
210
|
Format is auto-detected from extension, or set explicitly with `--format`.
|
|
187
211
|
|
|
188
212
|
## License
|
|
189
213
|
|
|
190
|
-
MIT
|
|
214
|
+
MIT
|
data/docs/help.md
CHANGED
|
@@ -16,10 +16,10 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
|
|
|
16
16
|
|
|
17
17
|
### record: Profile and save to file.
|
|
18
18
|
|
|
19
|
-
-o, --output PATH Output file (default: rperf.
|
|
19
|
+
-o, --output PATH Output file (default: rperf.json.gz)
|
|
20
20
|
-f, --frequency HZ Sampling frequency in Hz (default: 1000)
|
|
21
21
|
-m, --mode MODE cpu or wall (default: cpu)
|
|
22
|
-
--format FORMAT pprof, collapsed, or text (default: auto from extension)
|
|
22
|
+
--format FORMAT json, pprof, collapsed, or text (default: auto from extension)
|
|
23
23
|
-p, --print Print text profile to stdout
|
|
24
24
|
(same as --format=text --output=/dev/stdout)
|
|
25
25
|
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
@@ -40,6 +40,8 @@ Uses wall mode by default. No file output by default.
|
|
|
40
40
|
|
|
41
41
|
Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
|
|
42
42
|
GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
|
|
43
|
+
Lines are prefixed: `[Rperf]` for sampling-derived data, `[Ruby ]` for
|
|
44
|
+
runtime info, `[OS ]` for OS-level info.
|
|
43
45
|
Use --report to add flat and cumulative top-50 function tables.
|
|
44
46
|
|
|
45
47
|
### exec: Run command and print full profile report to stderr.
|
|
@@ -56,13 +58,13 @@ Like `stat --report`. Uses wall mode by default. No file output by default.
|
|
|
56
58
|
Shows: user/sys/real time, time breakdown, GC/memory/OS stats, profiler overhead,
|
|
57
59
|
and flat/cumulative top-50 function tables.
|
|
58
60
|
|
|
59
|
-
### report: Open
|
|
61
|
+
### report: Open the built-in viewer (.json.gz) or go tool pprof (.pb.gz, requires Go).
|
|
60
62
|
|
|
61
63
|
--top Print top functions by flat time
|
|
62
64
|
--text Print text report
|
|
63
65
|
|
|
64
66
|
Default (no flag): opens interactive web UI in browser.
|
|
65
|
-
Default file: rperf.
|
|
67
|
+
Default file: rperf.json.gz
|
|
66
68
|
|
|
67
69
|
### diff: Compare two pprof profiles (target - base). Requires Go.
|
|
68
70
|
|
|
@@ -116,7 +118,7 @@ Rperf.save("profile.txt", data)
|
|
|
116
118
|
mode: :cpu or :wall (Symbol, default: :cpu)
|
|
117
119
|
output: File path to write on stop (String or nil)
|
|
118
120
|
verbose: Print statistics to stderr (true/false, default: false)
|
|
119
|
-
format: :pprof, :collapsed, :text, or nil for auto-detect (Symbol or nil)
|
|
121
|
+
format: :json, :pprof, :collapsed, :text, or nil for auto-detect (Symbol or nil)
|
|
120
122
|
defer: Start with timer paused; use Rperf.profile to activate (default: false)
|
|
121
123
|
|
|
122
124
|
### Rperf.stop return value
|
|
@@ -215,13 +217,19 @@ In pprof output, use labels for filtering and grouping:
|
|
|
215
217
|
|
|
216
218
|
With `defer: true`, the profiler infrastructure is set up but the sampling
|
|
217
219
|
timer does not start. Use `Rperf.profile` to activate the timer for specific
|
|
218
|
-
sections. Outside `profile` blocks, overhead is zero.
|
|
220
|
+
sections. Outside `profile` blocks, the timer is disarmed and overhead is zero.
|
|
221
|
+
|
|
222
|
+
Note: the timer is process-wide, not per-thread. While a `profile` block is
|
|
223
|
+
active on one thread, other threads running at the same time will also be
|
|
224
|
+
sampled. Their samples carry their own labels (not the calling thread's labels),
|
|
225
|
+
so they can be distinguished in the profile. This design is intentional: it
|
|
226
|
+
provides complete visibility into what the process was doing during profiled
|
|
227
|
+
sections, including GVL contention and background work.
|
|
219
228
|
|
|
220
229
|
### Rperf.profile(**labels, &block)
|
|
221
230
|
|
|
222
|
-
Activates the sampling timer for the block duration and applies labels
|
|
223
|
-
Designed for use with `start(defer: true)
|
|
224
|
-
code paths.
|
|
231
|
+
Activates the sampling timer for the block duration and applies labels to
|
|
232
|
+
the current thread. Designed for use with `start(defer: true)`.
|
|
225
233
|
|
|
226
234
|
```ruby
|
|
227
235
|
Rperf.start(defer: true, mode: :wall)
|
|
@@ -244,7 +252,7 @@ Returns the current thread's labels as a Hash. Empty hash if none set.
|
|
|
244
252
|
|
|
245
253
|
### Rperf.save(path, data, format: nil)
|
|
246
254
|
|
|
247
|
-
Writes data to path. format: :pprof, :collapsed, or :text.
|
|
255
|
+
Writes data to path. format: :json, :pprof, :collapsed, or :text.
|
|
248
256
|
nil auto-detects from extension.
|
|
249
257
|
|
|
250
258
|
### Rperf::RackMiddleware (Rack)
|
|
@@ -284,6 +292,88 @@ Sidekiq.configure_server do |config|
|
|
|
284
292
|
end
|
|
285
293
|
```
|
|
286
294
|
|
|
295
|
+
### Rperf::Viewer (Rack middleware)
|
|
296
|
+
|
|
297
|
+
In-browser profiling UI with flamegraph, top table, and tag breakdown.
|
|
298
|
+
Requires `require "rperf/viewer"`.
|
|
299
|
+
|
|
300
|
+
**Security note**: Rperf::Viewer has no built-in authentication and exposes
|
|
301
|
+
profiling data (including stack traces and label values) to anyone who can
|
|
302
|
+
reach the endpoint. In production, always restrict access using your
|
|
303
|
+
framework's authentication — see "Access control" below. The UI loads
|
|
304
|
+
d3.js and d3-flame-graph from CDNs (cdnjs.cloudflare.com, cdn.jsdelivr.net).
|
|
305
|
+
|
|
306
|
+
```ruby
|
|
307
|
+
# config.ru or Rails config
|
|
308
|
+
require "rperf/viewer"
|
|
309
|
+
use Rperf::Viewer # mount at /rperf/ (default)
|
|
310
|
+
use Rperf::Viewer, path: "/profiler" # custom mount path
|
|
311
|
+
use Rperf::Viewer, max_snapshots: 12 # keep fewer snapshots (default: 24)
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
Take snapshots via `Rperf::Viewer.instance.take_snapshot!` or
|
|
315
|
+
`Rperf::Viewer.instance.add_snapshot(data)`.
|
|
316
|
+
|
|
317
|
+
#### Typical setup with RackMiddleware and periodic snapshots
|
|
318
|
+
|
|
319
|
+
```ruby
|
|
320
|
+
require "rperf/viewer"
|
|
321
|
+
require "rperf/rack"
|
|
322
|
+
|
|
323
|
+
Rperf.start(mode: :wall, frequency: 999, defer: true)
|
|
324
|
+
use Rperf::Viewer
|
|
325
|
+
use Rperf::RackMiddleware
|
|
326
|
+
run MyApp
|
|
327
|
+
|
|
328
|
+
# Take a snapshot every 60 minutes in a background thread
|
|
329
|
+
Thread.new do
|
|
330
|
+
loop do
|
|
331
|
+
sleep 60 * 60
|
|
332
|
+
Rperf::Viewer.instance&.take_snapshot!
|
|
333
|
+
end
|
|
334
|
+
end
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
Visit `/rperf/` in a browser. Snapshots accumulate automatically
|
|
338
|
+
(up to `max_snapshots`, oldest are discarded). You can also trigger
|
|
339
|
+
a snapshot manually via an endpoint or console:
|
|
340
|
+
|
|
341
|
+
```ruby
|
|
342
|
+
Rperf::Viewer.instance.take_snapshot!
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
#### Access control
|
|
346
|
+
|
|
347
|
+
Rperf::Viewer has no built-in authentication. Restrict access using
|
|
348
|
+
your framework's existing mechanisms:
|
|
349
|
+
|
|
350
|
+
```ruby
|
|
351
|
+
# Rails: route constraint (e.g., admin-only)
|
|
352
|
+
# config/routes.rb
|
|
353
|
+
require "rperf/viewer"
|
|
354
|
+
constraints ->(req) { req.session[:admin] } do
|
|
355
|
+
mount Rperf::Viewer.new(nil, path: ""), at: "/rperf"
|
|
356
|
+
end
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
#### UI tabs
|
|
360
|
+
|
|
361
|
+
- **Flamegraph** — Interactive flamegraph (d3-flame-graph). Click to zoom.
|
|
362
|
+
- **Top** — Flat/cumulative weight table. Click column headers to sort.
|
|
363
|
+
- **Tags** — Label key/value breakdown with weight bars. Click a row to
|
|
364
|
+
set tagfocus and switch to Flamegraph.
|
|
365
|
+
|
|
366
|
+
#### Filtering controls
|
|
367
|
+
|
|
368
|
+
- **tagfocus** — Regex matched against label values. Press Enter to apply.
|
|
369
|
+
- **tagignore** — Dropdown checkboxes. Select `key = (none)` to exclude
|
|
370
|
+
samples without that key (e.g., background threads without `endpoint`).
|
|
371
|
+
- **tagroot** — Dropdown checkboxes for label keys. Checked keys are
|
|
372
|
+
prepended as root frames (e.g., `[endpoint: GET /users]`).
|
|
373
|
+
- **tagleaf** — Same as tagroot but appended as leaf frames.
|
|
374
|
+
|
|
375
|
+
Tag keys are sorted alphabetically (`%`-prefixed VM state keys appear first).
|
|
376
|
+
|
|
287
377
|
## PROFILING MODES
|
|
288
378
|
|
|
289
379
|
- **cpu** — Measures per-thread CPU time via Linux thread clock.
|
|
@@ -293,15 +383,26 @@ end
|
|
|
293
383
|
- **wall** — Measures wall-clock time (CLOCK_MONOTONIC).
|
|
294
384
|
Use for: finding where wall time goes, including I/O, sleep, GVL
|
|
295
385
|
contention, and off-CPU waits.
|
|
296
|
-
Includes
|
|
386
|
+
Includes VM state labels (see below).
|
|
297
387
|
|
|
298
388
|
## OUTPUT FORMATS
|
|
299
389
|
|
|
300
|
-
###
|
|
390
|
+
### json (default) — rperf native format
|
|
391
|
+
|
|
392
|
+
Gzip-compressed JSON representation of the internal data hash
|
|
393
|
+
(the same hash returned by `Rperf.stop` / `Rperf.snapshot` — see
|
|
394
|
+
"Return value" above for the full structure).
|
|
395
|
+
Preserves all data including labels, VM state, thread info, and statistics.
|
|
396
|
+
Readable by non-Ruby tools (Python, jq, etc.).
|
|
397
|
+
Extension convention: `.json.gz`
|
|
398
|
+
View with: `rperf report` (opens rperf viewer in browser, no Go required).
|
|
399
|
+
Load programmatically: `data = Rperf.load("rperf.json.gz")`
|
|
400
|
+
|
|
401
|
+
### pprof
|
|
301
402
|
|
|
302
403
|
Gzip-compressed protobuf. Standard pprof format.
|
|
303
404
|
Extension convention: `.pb.gz`
|
|
304
|
-
View with: `go tool pprof`, pprof-rs, or
|
|
405
|
+
View with: `go tool pprof`, pprof-rs, speedscope, or `rperf report` (requires Go).
|
|
305
406
|
|
|
306
407
|
Embedded metadata:
|
|
307
408
|
|
|
@@ -358,27 +459,44 @@ Example output:
|
|
|
358
459
|
|
|
359
460
|
Format is auto-detected from the output file extension:
|
|
360
461
|
|
|
361
|
-
.
|
|
362
|
-
.
|
|
363
|
-
|
|
462
|
+
.json.gz → json (rperf native, default)
|
|
463
|
+
.pb.gz → pprof
|
|
464
|
+
.collapsed → collapsed
|
|
465
|
+
.txt → text
|
|
364
466
|
|
|
365
467
|
The `--format` flag (CLI) or `format:` parameter (API) overrides auto-detect.
|
|
366
468
|
|
|
367
|
-
##
|
|
469
|
+
## VM STATE LABELS
|
|
470
|
+
|
|
471
|
+
rperf tracks GVL and GC states as **labels** (tags) on samples, not as
|
|
472
|
+
stack frames. The C extension records a VM state per sample, and the Ruby
|
|
473
|
+
layer merges it into the sample's label set using reserved keys `%GVL`
|
|
474
|
+
and `%GC`.
|
|
368
475
|
|
|
369
|
-
In wall mode,
|
|
476
|
+
In wall mode, GVL state labels are recorded:
|
|
370
477
|
|
|
371
|
-
-
|
|
478
|
+
- **%GVL=blocked** — The thread was off-GVL (I/O, sleep, C extension
|
|
372
479
|
releasing GVL). Attributed to the stack captured at the SUSPENDED thread event.
|
|
373
|
-
-
|
|
480
|
+
- **%GVL=wait** — The thread was waiting to reacquire the GVL after
|
|
374
481
|
becoming ready. Indicates GVL contention. Same stack.
|
|
375
482
|
|
|
376
|
-
In both modes, GC
|
|
483
|
+
In both modes, GC state labels are recorded:
|
|
484
|
+
|
|
485
|
+
- **%GC=mark** — Time spent in GC marking phase (wall time).
|
|
486
|
+
- **%GC=sweep** — Time spent in GC sweeping phase (wall time).
|
|
487
|
+
|
|
488
|
+
These labels appear in `label_sets` (e.g., `{"%GVL" => "blocked"}`,
|
|
489
|
+
`{"%GC" => "mark"}`) and are written into pprof sample labels.
|
|
490
|
+
|
|
491
|
+
To add VM state as frames in flamegraphs, use pprof tag options:
|
|
492
|
+
|
|
493
|
+
go tool pprof -tagleaf=%GVL profile.pb.gz
|
|
494
|
+
go tool pprof -tagroot=%GC profile.pb.gz
|
|
377
495
|
|
|
378
|
-
|
|
379
|
-
- **[GC sweeping]** — Time spent in GC sweeping phase (wall time).
|
|
496
|
+
To filter by VM state:
|
|
380
497
|
|
|
381
|
-
|
|
498
|
+
go tool pprof -tagfocus=%GVL=blocked profile.pb.gz
|
|
499
|
+
go tool pprof -tagfocus=%GC=mark profile.pb.gz
|
|
382
500
|
|
|
383
501
|
## INTERPRETING RESULTS
|
|
384
502
|
|
|
@@ -403,20 +521,24 @@ To convert: 1,000,000 ns = 1 ms, 1,000,000,000 ns = 1 s.
|
|
|
403
521
|
**Problem: slow request / high latency**
|
|
404
522
|
- Mode: wall
|
|
405
523
|
- Look for: functions with high cum wall time.
|
|
406
|
-
- If
|
|
407
|
-
|
|
524
|
+
- If %GVL=blocked is dominant → I/O or sleep is the bottleneck.
|
|
525
|
+
Filter: `go tool pprof -tagfocus=%GVL=blocked profile.pb.gz`
|
|
526
|
+
- If %GVL=wait is dominant → GVL contention; reduce GVL-holding work
|
|
408
527
|
or move work to Ractors / child processes.
|
|
528
|
+
Filter: `go tool pprof -tagfocus=%GVL=wait profile.pb.gz`
|
|
409
529
|
|
|
410
530
|
**Problem: GC pauses**
|
|
411
531
|
- Mode: cpu or wall
|
|
412
|
-
- Look for:
|
|
413
|
-
|
|
414
|
-
- High
|
|
532
|
+
- Look for: samples with %GC=mark and %GC=sweep labels.
|
|
533
|
+
Filter: `go tool pprof -tagfocus=%GC profile.pb.gz`
|
|
534
|
+
- High %GC=mark → too many live objects; reduce allocations.
|
|
535
|
+
- High %GC=sweep → too many short-lived objects; reuse or pool.
|
|
415
536
|
|
|
416
537
|
**Problem: multithreaded app slower than expected**
|
|
417
538
|
- Mode: wall
|
|
418
|
-
- Look for:
|
|
419
|
-
|
|
539
|
+
- Look for: samples with %GVL=wait label across threads.
|
|
540
|
+
Filter: `go tool pprof -tagfocus=%GVL=wait profile.pb.gz`
|
|
541
|
+
- High %GVL=wait means threads are serialized on the GVL.
|
|
420
542
|
|
|
421
543
|
## READING COLLAPSED STACKS PROGRAMMATICALLY
|
|
422
544
|
|
|
@@ -454,7 +576,7 @@ Used internally by the CLI to pass options to the auto-started profiler:
|
|
|
454
576
|
RPERF_OUTPUT=path Output file path
|
|
455
577
|
RPERF_FREQUENCY=hz Sampling frequency
|
|
456
578
|
RPERF_MODE=cpu|wall Profiling mode
|
|
457
|
-
RPERF_FORMAT=fmt pprof, collapsed, or text
|
|
579
|
+
RPERF_FORMAT=fmt json, pprof, collapsed, or text
|
|
458
580
|
RPERF_VERBOSE=1 Print statistics
|
|
459
581
|
RPERF_SIGNAL=N|false Timer signal number or 'false' for nanosleep (Linux only)
|
|
460
582
|
RPERF_STAT=1 Enable stat mode (used by rperf stat)
|
data/docs/logo.svg
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
<svg viewBox="0 0 280 160" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<!-- Gauge arc -->
|
|
3
|
+
<path d="M70,100 A70,70 0 0,1 210,100" fill="none" stroke="#cc342d" stroke-width="3"/>
|
|
4
|
+
|
|
5
|
+
<!-- Tick marks -->
|
|
6
|
+
<g stroke="#cc342d" stroke-width="1.5">
|
|
7
|
+
<line x1="76" y1="68" x2="82" y2="74"/>
|
|
8
|
+
<line x1="100" y1="42" x2="104" y2="50"/>
|
|
9
|
+
<line x1="140" y1="32" x2="140" y2="40"/>
|
|
10
|
+
<line x1="180" y1="42" x2="176" y2="50"/>
|
|
11
|
+
<line x1="204" y1="68" x2="198" y2="74"/>
|
|
12
|
+
</g>
|
|
13
|
+
|
|
14
|
+
<!-- Ruby gem (below 12 o'clock tick) -->
|
|
15
|
+
<g transform="translate(140,52)">
|
|
16
|
+
<polygon points="-5,-5.5 5,-5.5 7.5,-1.5 0,6.5 -7.5,-1.5" fill="none" stroke="#cc342d" stroke-width="0.9"/>
|
|
17
|
+
</g>
|
|
18
|
+
|
|
19
|
+
<!-- Needle (-30%) -->
|
|
20
|
+
<line x1="140" y1="100" x2="172" y2="69" stroke="#8a7038" stroke-width="2.5" stroke-linecap="round"/>
|
|
21
|
+
<circle cx="140" cy="100" r="4" fill="#8a7038"/>
|
|
22
|
+
|
|
23
|
+
<!-- "rperf" text -->
|
|
24
|
+
<text x="140" y="148" text-anchor="middle" font-family="'Space Mono', monospace" font-size="24" font-weight="700" fill="#8a7038" letter-spacing="3">rperf</text>
|
|
25
|
+
</svg>
|