rperf 0.5.0 → 0.6.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +69 -28
- data/docs/help.md +125 -6
- data/exe/rperf +1 -1
- data/ext/rperf/rperf.c +271 -105
- data/lib/rperf/active_job.rb +13 -0
- data/lib/rperf/middleware.rb +15 -0
- data/lib/rperf/sidekiq.rb +9 -0
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf.rb +107 -10
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 497392cfda8e82d1c37aadd0953b4c73b6bfb09870e6c612c1fd5fced0e3d24f
|
|
4
|
+
data.tar.gz: 6960be209fc3d4aac0f268378c5b7e1399027da0c5b7f498bcb4be0662012d62
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '09fc32b7577ac9544a846c86c37a7ad11e9de00a27bbb0bbd25cbc2fcabe04e74741c64f9fb3cfe1a9663145e058215272a247766c7b8106218eda80cbcd838f'
|
|
7
|
+
data.tar.gz: 9d13e685c5a293c4d9033376509bf4b5c762a5f5155a2d5dd6e838d5a55dc79b9ef7d9521a5bcf65eac88a683f0e20cd7e5dac2680134aa565c709eb48452e40
|
data/README.md
CHANGED
|
@@ -2,25 +2,66 @@
|
|
|
2
2
|
<img src="docs/logo.svg" alt="rperf logo" width="260">
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
<h1 align="center">rperf</h1>
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Know where your Ruby spends its time — accurately.</strong><br>
|
|
9
|
+
A sampling profiler that corrects safepoint bias using real time deltas.
|
|
10
|
+
</p>
|
|
8
11
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
12
|
+
<p align="center">
|
|
13
|
+
<a href="https://rubygems.org/gems/rperf"><img src="https://img.shields.io/gem/v/rperf.svg" alt="Gem Version"></a>
|
|
14
|
+
<img src="https://img.shields.io/badge/Ruby-%3E%3D%203.4.0-cc342d" alt="Ruby >= 3.4.0">
|
|
15
|
+
<a href="https://ko1.github.io/rperf/docs/manual/"><img src="https://img.shields.io/badge/docs-manual-blue" alt="Manual"></a>
|
|
16
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="MIT License">
|
|
17
|
+
</p>
|
|
13
18
|
|
|
14
|
-
|
|
19
|
+
<p align="center">
|
|
20
|
+
pprof / collapsed stacks / text report · CPU mode & wall mode (GVL + GC tracking)
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<p align="center">
|
|
24
|
+
<a href='https://ko1.github.io/rperf/'>Web site</a>,
|
|
25
|
+
<a href='https://ko1.github.io/rperf/docs/manual/'>Online manual</a>,
|
|
26
|
+
<a href='https://github.com/ko1/rperf'>GitHub repository</a>
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
## See It in Action
|
|
15
30
|
|
|
16
31
|
```bash
|
|
17
|
-
gem install rperf
|
|
32
|
+
$ gem install rperf
|
|
33
|
+
$ rperf exec ruby fib.rb
|
|
18
34
|
|
|
35
|
+
Performance stats for 'ruby fib.rb':
|
|
36
|
+
|
|
37
|
+
2,326.0 ms user
|
|
38
|
+
64.5 ms sys
|
|
39
|
+
2,035.5 ms real
|
|
40
|
+
|
|
41
|
+
2,034.2 ms 100.0% CPU execution
|
|
42
|
+
1 [Ruby] detected threads
|
|
43
|
+
7.0 ms [Ruby] GC time (7 count: 5 minor, 2 major)
|
|
44
|
+
106,078 [Ruby] allocated objects
|
|
45
|
+
22 MB [OS] peak memory (maxrss)
|
|
46
|
+
|
|
47
|
+
Flat:
|
|
48
|
+
2,034.2 ms 100.0% Object#fibonacci (fib.rb)
|
|
49
|
+
|
|
50
|
+
Cumulative:
|
|
51
|
+
2,034.2 ms 100.0% Object#fibonacci (fib.rb)
|
|
52
|
+
2,034.2 ms 100.0% <main> (fib.rb)
|
|
53
|
+
|
|
54
|
+
2034 samples / 2034 triggers, 0.1% profiler overhead
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
```bash
|
|
19
60
|
# Performance summary (wall mode, prints to stderr)
|
|
20
61
|
rperf stat ruby app.rb
|
|
21
62
|
|
|
22
|
-
#
|
|
23
|
-
rperf record ruby app.rb # → rperf.data (
|
|
63
|
+
# Record a pprof profile to file
|
|
64
|
+
rperf record ruby app.rb # → rperf.data (cpu mode)
|
|
24
65
|
rperf record -m wall -o profile.pb.gz ruby server.rb # wall mode, custom output
|
|
25
66
|
|
|
26
67
|
# View results (report/diff require Go: https://go.dev/dl/)
|
|
@@ -67,19 +108,20 @@ Inspired by Linux `perf` — familiar subcommand interface for profiling workflo
|
|
|
67
108
|
|---------|-------------|
|
|
68
109
|
| `rperf record` | Profile a command and save to file |
|
|
69
110
|
| `rperf stat` | Profile a command and print summary to stderr |
|
|
111
|
+
| `rperf exec` | Profile a command and print full report to stderr |
|
|
70
112
|
| `rperf report` | Open pprof profile with `go tool pprof` (requires Go) |
|
|
71
113
|
| `rperf diff` | Compare two pprof profiles (requires Go) |
|
|
72
114
|
| `rperf help` | Show full reference documentation |
|
|
73
115
|
|
|
74
116
|
## How It Works
|
|
75
117
|
|
|
76
|
-
### The
|
|
118
|
+
### The Challenge: Safepoint Sampling
|
|
77
119
|
|
|
78
|
-
Ruby
|
|
120
|
+
Most Ruby profilers (e.g., stackprof) use signal handlers to capture stack traces at the exact moment the timer fires. rperf takes a different approach — it samples at **safepoints** (VM checkpoints), which is safer (no async-signal-safety concerns, reliable access to VM state) but means the sample timing can be delayed. Without correction, this delay would skew the results.
|
|
79
121
|
|
|
80
|
-
### The
|
|
122
|
+
### The Fix: Weight = Real Time
|
|
81
123
|
|
|
82
|
-
rperf uses **time
|
|
124
|
+
rperf uses **actual elapsed time as sample weights** — so delayed samples carry proportionally more weight, and the profile matches reality:
|
|
83
125
|
|
|
84
126
|
```
|
|
85
127
|
Timer (signal or thread) VM thread (postponed job)
|
|
@@ -116,23 +158,22 @@ rperf hooks GVL and GC events to attribute non-CPU time:
|
|
|
116
158
|
| `[GC marking]` | Time in GC mark phase |
|
|
117
159
|
| `[GC sweeping]` | Time in GC sweep phase |
|
|
118
160
|
|
|
119
|
-
##
|
|
161
|
+
## Why rperf?
|
|
120
162
|
|
|
121
|
-
|
|
163
|
+
- **Accurate despite safepoints** — Safepoint sampling is *safer* (no async-signal-safety issues), but normally *inaccurate*. rperf compensates with real time-delta weights, so profiles faithfully reflect where time is actually spent.
|
|
164
|
+
- **See the whole picture** (wall mode) — GVL contention, off-GVL I/O, GC marking/sweeping — all attributed to the call stacks responsible, via synthetic frames.
|
|
165
|
+
- **Low overhead** — Signal-based timer on Linux (no extra thread). ~1–5 µs per sample.
|
|
166
|
+
- **pprof compatible** — Works with `go tool pprof`, speedscope, and other standard tools out of the box.
|
|
167
|
+
- **Zero code changes** — Profile any Ruby program via CLI or environment variables. Drop-in for Rails, too.
|
|
168
|
+
- **`perf`-like CLI** — `record`, `stat`, `report`, `diff` — if you know Linux perf, you already know rperf.
|
|
122
169
|
|
|
123
|
-
|
|
124
|
-
- **GVL & GC visibility** (wall mode): Attributes off-GVL time, GVL contention, and GC phases to the responsible call stacks with synthetic frames.
|
|
125
|
-
- **Low overhead**: No extra thread on Linux (signal-based timer). Sampling overhead is ~1-5 us per sample.
|
|
126
|
-
- **pprof compatible**: Output works with `go tool pprof`, speedscope, and other standard tools.
|
|
127
|
-
- **No code changes required**: Profile any Ruby program via CLI (`rperf stat ruby app.rb`) or environment variables (`RPERF_ENABLED=1`).
|
|
128
|
-
- **perf-like CLI**: Familiar subcommand interface — `record`, `stat`, `report`, `diff` — inspired by Linux perf.
|
|
170
|
+
### Limitations
|
|
129
171
|
|
|
130
|
-
|
|
172
|
+
- **Method-level only** — no line-level granularity.
|
|
173
|
+
- **Ruby >= 3.4.0** — uses recent VM internals (postponed jobs, thread event hooks).
|
|
174
|
+
- **POSIX only** — Linux, macOS. No Windows.
|
|
175
|
+
- **No fork support** — profiling does not follow fork(2) child processes.
|
|
131
176
|
|
|
132
|
-
- **Method-level only**: Profiles at the method level, not the line level. You can see which method is slow, but not which line within it.
|
|
133
|
-
- **Ruby >= 3.4.0**: Requires recent Ruby for the internal APIs used (postponed jobs, thread event hooks).
|
|
134
|
-
- **POSIX only**: Linux, macOS, etc. No Windows support.
|
|
135
|
-
- **Safepoint sampling**: Cannot sample inside C extensions or during long-running C calls that don't reach a safepoint. Time spent there is attributed to the next sample.
|
|
136
177
|
|
|
137
178
|
## Output Formats
|
|
138
179
|
|
|
@@ -146,4 +187,4 @@ Format is auto-detected from extension, or set explicitly with `--format`.
|
|
|
146
187
|
|
|
147
188
|
## License
|
|
148
189
|
|
|
149
|
-
MIT
|
|
190
|
+
MIT
|
data/docs/help.md
CHANGED
|
@@ -130,22 +130,132 @@ nil if profiler was not running; otherwise a Hash:
|
|
|
130
130
|
detected_thread_count: 4, # threads seen during profiling
|
|
131
131
|
start_time_ns: 17740..., # CLOCK_REALTIME epoch nanos
|
|
132
132
|
duration_ns: 10000000, # profiling duration in nanos
|
|
133
|
-
aggregated_samples: [
|
|
134
|
-
[frames, weight, seq],
|
|
135
|
-
...
|
|
136
|
-
],
|
|
133
|
+
aggregated_samples: [ # when aggregate: true (default)
|
|
134
|
+
[frames, weight, seq, label_set_id], # frames: [[path, label], ...] deepest-first
|
|
135
|
+
... # weight: Integer (nanoseconds, merged per unique stack)
|
|
136
|
+
], # seq: Integer (thread sequence, 1-based)
|
|
137
|
+
# label_set_id: Integer (0 = no labels)
|
|
138
|
+
label_sets: [{}, {request: "abc"}, ...], # label set table (index = label_set_id)
|
|
137
139
|
# --- OR ---
|
|
138
|
-
raw_samples: [
|
|
139
|
-
[frames, weight, seq], #
|
|
140
|
+
raw_samples: [ # when aggregate: false
|
|
141
|
+
[frames, weight, seq, label_set_id], # one entry per timer sample (not merged)
|
|
140
142
|
...
|
|
141
143
|
] }
|
|
142
144
|
```
|
|
143
145
|
|
|
146
|
+
### Rperf.snapshot(clear: false)
|
|
147
|
+
|
|
148
|
+
Returns a snapshot of the current profiling data without stopping.
|
|
149
|
+
Only works in aggregate mode (the default). Returns nil if not profiling.
|
|
150
|
+
|
|
151
|
+
When `clear: true` is given, resets aggregated data after taking the snapshot.
|
|
152
|
+
This enables interval-based profiling where each snapshot covers only the
|
|
153
|
+
period since the last clear.
|
|
154
|
+
|
|
155
|
+
```ruby
|
|
156
|
+
Rperf.start(frequency: 1000)
|
|
157
|
+
# ... work ...
|
|
158
|
+
snap = Rperf.snapshot # read data without stopping
|
|
159
|
+
Rperf.save("snap.pb.gz", snap)
|
|
160
|
+
# ... more work ...
|
|
161
|
+
data = Rperf.stop
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Interval-based usage:
|
|
165
|
+
|
|
166
|
+
```ruby
|
|
167
|
+
Rperf.start(frequency: 1000)
|
|
168
|
+
loop do
|
|
169
|
+
sleep 10
|
|
170
|
+
snap = Rperf.snapshot(clear: true) # each snapshot covers the last 10s
|
|
171
|
+
Rperf.save("profile-#{Time.now.to_i}.pb.gz", snap)
|
|
172
|
+
end
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Rperf.label(**labels, &block)
|
|
176
|
+
|
|
177
|
+
Attaches key-value labels to the current thread's samples. Labels appear
|
|
178
|
+
in pprof sample labels, enabling per-context filtering (e.g., per-request).
|
|
179
|
+
If profiling is not running, labels are silently ignored (no error).
|
|
180
|
+
|
|
181
|
+
```ruby
|
|
182
|
+
# Block form — labels are restored when the block exits
|
|
183
|
+
Rperf.label(request: "abc-123", endpoint: "/api/users") do
|
|
184
|
+
handle_request # samples inside get these labels
|
|
185
|
+
end
|
|
186
|
+
# labels are restored to previous state here
|
|
187
|
+
|
|
188
|
+
# Without block — labels persist until changed
|
|
189
|
+
Rperf.label(request: "abc-123")
|
|
190
|
+
|
|
191
|
+
# Merge — new labels merge with existing ones
|
|
192
|
+
Rperf.label(phase: "db") # adds phase, keeps request
|
|
193
|
+
|
|
194
|
+
# Delete a key — set value to nil
|
|
195
|
+
Rperf.label(request: nil) # removes request key
|
|
196
|
+
|
|
197
|
+
# Nested blocks — each block restores its entry state
|
|
198
|
+
Rperf.label(request: "abc") do
|
|
199
|
+
Rperf.label(phase: "db") do
|
|
200
|
+
Rperf.labels #=> {request: "abc", phase: "db"}
|
|
201
|
+
end
|
|
202
|
+
Rperf.labels #=> {request: "abc"}
|
|
203
|
+
end
|
|
204
|
+
Rperf.labels #=> {}
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
In pprof output, use labels for filtering and grouping:
|
|
208
|
+
|
|
209
|
+
go tool pprof -tagfocus=request=abc-123 profile.pb.gz
|
|
210
|
+
go tool pprof -tagroot=request profile.pb.gz
|
|
211
|
+
go tool pprof -tagleaf=request profile.pb.gz
|
|
212
|
+
|
|
213
|
+
### Rperf.labels
|
|
214
|
+
|
|
215
|
+
Returns the current thread's labels as a Hash. Empty hash if none set.
|
|
216
|
+
|
|
144
217
|
### Rperf.save(path, data, format: nil)
|
|
145
218
|
|
|
146
219
|
Writes data to path. format: :pprof, :collapsed, or :text.
|
|
147
220
|
nil auto-detects from extension.
|
|
148
221
|
|
|
222
|
+
### Rperf::Middleware (Rack)
|
|
223
|
+
|
|
224
|
+
Labels samples with the request endpoint. Requires `require "rperf/middleware"`.
|
|
225
|
+
|
|
226
|
+
```ruby
|
|
227
|
+
# Rails
|
|
228
|
+
Rails.application.config.middleware.use Rperf::Middleware
|
|
229
|
+
|
|
230
|
+
# Sinatra
|
|
231
|
+
use Rperf::Middleware
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
The middleware only sets labels — start profiling separately.
|
|
235
|
+
Option: `label_key:` (default: `:endpoint`).
|
|
236
|
+
|
|
237
|
+
### Rperf::ActiveJobMiddleware
|
|
238
|
+
|
|
239
|
+
Labels samples with the job class name. Requires `require "rperf/active_job"`.
|
|
240
|
+
|
|
241
|
+
```ruby
|
|
242
|
+
class ApplicationJob < ActiveJob::Base
|
|
243
|
+
include Rperf::ActiveJobMiddleware
|
|
244
|
+
end
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Rperf::SidekiqMiddleware
|
|
248
|
+
|
|
249
|
+
Labels samples with the worker class name. Requires `require "rperf/sidekiq"`.
|
|
250
|
+
|
|
251
|
+
```ruby
|
|
252
|
+
Sidekiq.configure_server do |config|
|
|
253
|
+
config.server_middleware do |chain|
|
|
254
|
+
chain.add Rperf::SidekiqMiddleware
|
|
255
|
+
end
|
|
256
|
+
end
|
|
257
|
+
```
|
|
258
|
+
|
|
149
259
|
## PROFILING MODES
|
|
150
260
|
|
|
151
261
|
- **cpu** — Measures per-thread CPU time via Linux thread clock.
|
|
@@ -175,11 +285,20 @@ Embedded metadata:
|
|
|
175
285
|
Sample labels:
|
|
176
286
|
|
|
177
287
|
thread_seq thread sequence number (1-based, assigned per profiling session)
|
|
288
|
+
<user labels> custom key-value labels set via Rperf.label()
|
|
178
289
|
|
|
179
290
|
View comments: `go tool pprof -comments profile.pb.gz`
|
|
180
291
|
|
|
181
292
|
Group by thread: `go tool pprof -tagroot=thread_seq profile.pb.gz`
|
|
182
293
|
|
|
294
|
+
Filter by label: `go tool pprof -tagfocus=request=abc-123 profile.pb.gz`
|
|
295
|
+
|
|
296
|
+
Group by label (root): `go tool pprof -tagroot=request profile.pb.gz`
|
|
297
|
+
|
|
298
|
+
Group by label (leaf): `go tool pprof -tagleaf=request profile.pb.gz`
|
|
299
|
+
|
|
300
|
+
Exclude by label: `go tool pprof -tagignore=request=healthcheck profile.pb.gz`
|
|
301
|
+
|
|
183
302
|
### collapsed
|
|
184
303
|
|
|
185
304
|
Plain text. One line per unique stack: `frame1;frame2;...;leaf weight`
|
data/exe/rperf
CHANGED
|
@@ -80,7 +80,7 @@ USAGE = "Usage: rperf record [options] command [args...]\n" \
|
|
|
80
80
|
# Handle top-level flags before subcommand parsing
|
|
81
81
|
case ARGV.first
|
|
82
82
|
when "-v", "--version"
|
|
83
|
-
|
|
83
|
+
require_relative "../lib/rperf"
|
|
84
84
|
puts "rperf #{Rperf::VERSION}"
|
|
85
85
|
exit
|
|
86
86
|
when "-h", "--help"
|
data/ext/rperf/rperf.c
CHANGED
|
@@ -66,6 +66,7 @@ typedef struct rperf_sample {
|
|
|
66
66
|
int64_t weight;
|
|
67
67
|
int type; /* rperf_sample_type */
|
|
68
68
|
int thread_seq; /* thread sequence number (1-based) */
|
|
69
|
+
int label_set_id; /* label set ID (0 = no labels) */
|
|
69
70
|
} rperf_sample_t;
|
|
70
71
|
|
|
71
72
|
/* ---- Sample buffer (double-buffered) ---- */
|
|
@@ -103,6 +104,7 @@ typedef struct rperf_agg_entry {
|
|
|
103
104
|
uint32_t frame_start; /* offset into stack_pool */
|
|
104
105
|
int depth; /* includes synthetic frame */
|
|
105
106
|
int thread_seq;
|
|
107
|
+
int label_set_id; /* label set ID (0 = no labels) */
|
|
106
108
|
int64_t weight; /* accumulated */
|
|
107
109
|
uint32_t hash; /* cached hash value */
|
|
108
110
|
int used; /* 0 = empty, 1 = used */
|
|
@@ -124,6 +126,7 @@ typedef struct rperf_thread_data {
|
|
|
124
126
|
int64_t suspended_at_ns; /* wall time at SUSPENDED */
|
|
125
127
|
int64_t ready_at_ns; /* wall time at READY */
|
|
126
128
|
int thread_seq; /* thread sequence number (1-based) */
|
|
129
|
+
int label_set_id; /* current label set ID (0 = no labels) */
|
|
127
130
|
} rperf_thread_data_t;
|
|
128
131
|
|
|
129
132
|
/* ---- GC tracking state ---- */
|
|
@@ -132,6 +135,7 @@ typedef struct rperf_gc_state {
|
|
|
132
135
|
int phase; /* rperf_gc_phase */
|
|
133
136
|
int64_t enter_ns; /* wall time at GC_ENTER */
|
|
134
137
|
int thread_seq; /* thread_seq at GC_ENTER */
|
|
138
|
+
int label_set_id; /* label_set_id at GC_ENTER */
|
|
135
139
|
} rperf_gc_state_t;
|
|
136
140
|
|
|
137
141
|
/* ---- Sampling overhead stats ---- */
|
|
@@ -175,6 +179,9 @@ typedef struct rperf_profiler {
|
|
|
175
179
|
int next_thread_seq;
|
|
176
180
|
/* Sampling overhead stats */
|
|
177
181
|
rperf_stats_t stats;
|
|
182
|
+
/* Label sets: Ruby Array of Hash objects, managed from Ruby side.
|
|
183
|
+
* Index 0 is reserved (no labels). GC-marked via profiler_mark. */
|
|
184
|
+
VALUE label_sets; /* Ruby Array or Qnil */
|
|
178
185
|
} rperf_profiler_t;
|
|
179
186
|
|
|
180
187
|
static rperf_profiler_t g_profiler;
|
|
@@ -195,6 +202,10 @@ rperf_profiler_mark(void *ptr)
|
|
|
195
202
|
buf->frame_pool + buf->frame_pool_count);
|
|
196
203
|
}
|
|
197
204
|
}
|
|
205
|
+
/* Mark label_sets array */
|
|
206
|
+
if (prof->label_sets != Qnil) {
|
|
207
|
+
rb_gc_mark(prof->label_sets);
|
|
208
|
+
}
|
|
198
209
|
/* Mark frame_table keys (unique frame VALUEs).
|
|
199
210
|
* Acquire count to synchronize with the release-store in insert,
|
|
200
211
|
* ensuring we see the keys pointer that is valid for [0, count).
|
|
@@ -431,7 +442,7 @@ rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
|
|
|
431
442
|
/* ---- Aggregation table operations (all malloc-based, no GVL needed) ---- */
|
|
432
443
|
|
|
433
444
|
static uint32_t
|
|
434
|
-
rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
|
|
445
|
+
rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq, int label_set_id)
|
|
435
446
|
{
|
|
436
447
|
uint32_t h = 2166136261u;
|
|
437
448
|
int i;
|
|
@@ -441,6 +452,8 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
|
|
|
441
452
|
}
|
|
442
453
|
h ^= (uint32_t)thread_seq;
|
|
443
454
|
h *= 16777619u;
|
|
455
|
+
h ^= (uint32_t)label_set_id;
|
|
456
|
+
h *= 16777619u;
|
|
444
457
|
return h;
|
|
445
458
|
}
|
|
446
459
|
|
|
@@ -506,7 +519,8 @@ rperf_agg_ensure_stack_pool(rperf_agg_table_t *at, int needed)
|
|
|
506
519
|
/* Insert or merge a stack into the aggregation table */
|
|
507
520
|
static void
|
|
508
521
|
rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
509
|
-
int depth, int thread_seq,
|
|
522
|
+
int depth, int thread_seq, int label_set_id,
|
|
523
|
+
int64_t weight, uint32_t hash)
|
|
510
524
|
{
|
|
511
525
|
size_t idx = hash % at->bucket_capacity;
|
|
512
526
|
|
|
@@ -514,6 +528,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
|
514
528
|
rperf_agg_entry_t *e = &at->buckets[idx];
|
|
515
529
|
if (!e->used) break;
|
|
516
530
|
if (e->hash == hash && e->depth == depth && e->thread_seq == thread_seq &&
|
|
531
|
+
e->label_set_id == label_set_id &&
|
|
517
532
|
memcmp(at->stack_pool + e->frame_start, frame_ids,
|
|
518
533
|
depth * sizeof(uint32_t)) == 0) {
|
|
519
534
|
/* Match — merge weight */
|
|
@@ -530,6 +545,7 @@ rperf_agg_table_insert(rperf_agg_table_t *at, const uint32_t *frame_ids,
|
|
|
530
545
|
e->frame_start = (uint32_t)at->stack_pool_count;
|
|
531
546
|
e->depth = depth;
|
|
532
547
|
e->thread_seq = thread_seq;
|
|
548
|
+
e->label_set_id = label_set_id;
|
|
533
549
|
e->weight = weight;
|
|
534
550
|
e->hash = hash;
|
|
535
551
|
e->used = 1;
|
|
@@ -581,10 +597,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
|
|
|
581
597
|
if (overflow) break; /* frame_table full, stop aggregating this buffer */
|
|
582
598
|
|
|
583
599
|
int total_depth = off + s->depth;
|
|
584
|
-
hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq);
|
|
600
|
+
hash = rperf_fnv1a_u32(temp_ids, total_depth, s->thread_seq, s->label_set_id);
|
|
585
601
|
|
|
586
602
|
rperf_agg_table_insert(&prof->agg_table, temp_ids, total_depth,
|
|
587
|
-
s->thread_seq, s->weight, hash);
|
|
603
|
+
s->thread_seq, s->label_set_id, s->weight, hash);
|
|
588
604
|
}
|
|
589
605
|
|
|
590
606
|
/* Reset buffer for reuse.
|
|
@@ -634,7 +650,7 @@ rperf_try_swap(rperf_profiler_t *prof)
|
|
|
634
650
|
/* Write a sample into a specific buffer. No swap check. */
|
|
635
651
|
static int
|
|
636
652
|
rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
|
|
637
|
-
int64_t weight, int type, int thread_seq)
|
|
653
|
+
int64_t weight, int type, int thread_seq, int label_set_id)
|
|
638
654
|
{
|
|
639
655
|
if (weight <= 0) return 0;
|
|
640
656
|
if (rperf_ensure_sample_capacity(buf) < 0) return -1;
|
|
@@ -645,16 +661,17 @@ rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
|
|
|
645
661
|
sample->weight = weight;
|
|
646
662
|
sample->type = type;
|
|
647
663
|
sample->thread_seq = thread_seq;
|
|
664
|
+
sample->label_set_id = label_set_id;
|
|
648
665
|
buf->sample_count++;
|
|
649
666
|
return 0;
|
|
650
667
|
}
|
|
651
668
|
|
|
652
669
|
static void
|
|
653
670
|
rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
|
|
654
|
-
int64_t weight, int type, int thread_seq)
|
|
671
|
+
int64_t weight, int type, int thread_seq, int label_set_id)
|
|
655
672
|
{
|
|
656
673
|
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
657
|
-
rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq);
|
|
674
|
+
rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq, label_set_id);
|
|
658
675
|
rperf_try_swap(prof);
|
|
659
676
|
}
|
|
660
677
|
|
|
@@ -676,12 +693,11 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
|
|
|
676
693
|
/* ---- Thread event hooks ---- */
|
|
677
694
|
|
|
678
695
|
static void
|
|
679
|
-
rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
|
|
696
|
+
rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
|
|
680
697
|
{
|
|
681
698
|
/* Has GVL — safe to call Ruby APIs */
|
|
682
699
|
int64_t wall_now = rperf_wall_time_ns();
|
|
683
700
|
|
|
684
|
-
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
685
701
|
int is_first = 0;
|
|
686
702
|
|
|
687
703
|
if (td == NULL) {
|
|
@@ -705,7 +721,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
|
|
|
705
721
|
/* Record normal sample (skip if first time — no prev_time) */
|
|
706
722
|
if (!is_first) {
|
|
707
723
|
int64_t weight = time_now - td->prev_time_ns;
|
|
708
|
-
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
|
|
724
|
+
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
|
|
709
725
|
}
|
|
710
726
|
|
|
711
727
|
/* Save timestamp for READY/RESUMED */
|
|
@@ -715,21 +731,18 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
|
|
|
715
731
|
}
|
|
716
732
|
|
|
717
733
|
static void
|
|
718
|
-
rperf_handle_ready(
|
|
734
|
+
rperf_handle_ready(rperf_thread_data_t *td)
|
|
719
735
|
{
|
|
720
736
|
/* May NOT have GVL — only simple C operations allowed */
|
|
721
|
-
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
722
737
|
if (!td) return;
|
|
723
738
|
|
|
724
739
|
td->ready_at_ns = rperf_wall_time_ns();
|
|
725
740
|
}
|
|
726
741
|
|
|
727
742
|
static void
|
|
728
|
-
rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
|
|
743
|
+
rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
|
|
729
744
|
{
|
|
730
745
|
/* Has GVL */
|
|
731
|
-
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
732
|
-
|
|
733
746
|
if (td == NULL) {
|
|
734
747
|
td = rperf_thread_data_create(prof, thread);
|
|
735
748
|
if (!td) return;
|
|
@@ -758,12 +771,12 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
|
|
|
758
771
|
if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
|
|
759
772
|
int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
|
|
760
773
|
rperf_write_sample(buf, frame_start, depth, blocked_ns,
|
|
761
|
-
RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
|
|
774
|
+
RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq, td->label_set_id);
|
|
762
775
|
}
|
|
763
776
|
if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
|
|
764
777
|
int64_t wait_ns = wall_now - td->ready_at_ns;
|
|
765
778
|
rperf_write_sample(buf, frame_start, depth, wait_ns,
|
|
766
|
-
RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
|
|
779
|
+
RPERF_SAMPLE_GVL_WAIT, td->thread_seq, td->label_set_id);
|
|
767
780
|
}
|
|
768
781
|
|
|
769
782
|
rperf_try_swap(prof);
|
|
@@ -781,9 +794,8 @@ skip_gvl:
|
|
|
781
794
|
}
|
|
782
795
|
|
|
783
796
|
static void
|
|
784
|
-
rperf_handle_exited(rperf_profiler_t *prof, VALUE thread)
|
|
797
|
+
rperf_handle_exited(rperf_profiler_t *prof, VALUE thread, rperf_thread_data_t *td)
|
|
785
798
|
{
|
|
786
|
-
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
787
799
|
if (td) {
|
|
788
800
|
free(td);
|
|
789
801
|
rb_internal_thread_specific_set(thread, prof->ts_key, NULL);
|
|
@@ -797,15 +809,16 @@ rperf_thread_event_hook(rb_event_flag_t event, const rb_internal_thread_event_da
|
|
|
797
809
|
if (!prof->running) return;
|
|
798
810
|
|
|
799
811
|
VALUE thread = data->thread;
|
|
812
|
+
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
800
813
|
|
|
801
814
|
if (event & RUBY_INTERNAL_THREAD_EVENT_SUSPENDED)
|
|
802
|
-
rperf_handle_suspended(prof, thread);
|
|
815
|
+
rperf_handle_suspended(prof, thread, td);
|
|
803
816
|
else if (event & RUBY_INTERNAL_THREAD_EVENT_READY)
|
|
804
|
-
rperf_handle_ready(
|
|
817
|
+
rperf_handle_ready(td);
|
|
805
818
|
else if (event & RUBY_INTERNAL_THREAD_EVENT_RESUMED)
|
|
806
|
-
rperf_handle_resumed(prof, thread);
|
|
819
|
+
rperf_handle_resumed(prof, thread, td);
|
|
807
820
|
else if (event & RUBY_INTERNAL_THREAD_EVENT_EXITED)
|
|
808
|
-
rperf_handle_exited(prof, thread);
|
|
821
|
+
rperf_handle_exited(prof, thread, td);
|
|
809
822
|
}
|
|
810
823
|
|
|
811
824
|
/* ---- GC event hook ---- */
|
|
@@ -826,13 +839,14 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
|
|
|
826
839
|
prof->gc.phase = RPERF_GC_NONE;
|
|
827
840
|
}
|
|
828
841
|
else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
|
|
829
|
-
/* Save timestamp and
|
|
842
|
+
/* Save timestamp, thread_seq, and label_set_id; backtrace is captured at GC_EXIT
|
|
830
843
|
* to avoid buffer mismatch after a double-buffer swap. */
|
|
831
844
|
prof->gc.enter_ns = rperf_wall_time_ns();
|
|
832
845
|
{
|
|
833
846
|
VALUE thread = rb_thread_current();
|
|
834
847
|
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
835
848
|
prof->gc.thread_seq = td ? td->thread_seq : 0;
|
|
849
|
+
prof->gc.label_set_id = td ? td->label_set_id : 0;
|
|
836
850
|
}
|
|
837
851
|
}
|
|
838
852
|
else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
|
|
@@ -861,7 +875,7 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
|
|
|
861
875
|
}
|
|
862
876
|
buf->frame_pool_count += depth;
|
|
863
877
|
|
|
864
|
-
rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq);
|
|
878
|
+
rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq, prof->gc.label_set_id);
|
|
865
879
|
prof->gc.enter_ns = 0;
|
|
866
880
|
}
|
|
867
881
|
}
|
|
@@ -908,7 +922,7 @@ rperf_sample_job(void *arg)
|
|
|
908
922
|
if (depth <= 0) return;
|
|
909
923
|
buf->frame_pool_count += depth;
|
|
910
924
|
|
|
911
|
-
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
|
|
925
|
+
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq, td->label_set_id);
|
|
912
926
|
|
|
913
927
|
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
|
|
914
928
|
prof->stats.sampling_count++;
|
|
@@ -1006,6 +1020,94 @@ rperf_resolve_frame(VALUE fval)
|
|
|
1006
1020
|
return rb_ary_new3(2, path, label);
|
|
1007
1021
|
}
|
|
1008
1022
|
|
|
1023
|
+
/* ---- Shared helpers for stop/snapshot ---- */
|
|
1024
|
+
|
|
1025
|
+
/* Flush pending sample buffers into agg_table.
|
|
1026
|
+
* Caller must ensure no concurrent access (worker joined or mutex held). */
|
|
1027
|
+
static void
|
|
1028
|
+
rperf_flush_buffers(rperf_profiler_t *prof)
|
|
1029
|
+
{
|
|
1030
|
+
int cur_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire);
|
|
1031
|
+
if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) {
|
|
1032
|
+
int standby_idx = cur_idx ^ 1;
|
|
1033
|
+
rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
|
|
1034
|
+
atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
|
|
1035
|
+
}
|
|
1036
|
+
rperf_aggregate_buffer(prof, &prof->buffers[cur_idx]);
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1039
|
+
/* Build result hash from aggregated data (agg_table + frame_table).
|
|
1040
|
+
* Does NOT free any resources. Caller must hold GVL. */
|
|
1041
|
+
static VALUE
|
|
1042
|
+
rperf_build_aggregated_result(rperf_profiler_t *prof)
|
|
1043
|
+
{
|
|
1044
|
+
VALUE result, samples_ary;
|
|
1045
|
+
size_t i;
|
|
1046
|
+
int j;
|
|
1047
|
+
|
|
1048
|
+
result = rb_hash_new();
|
|
1049
|
+
|
|
1050
|
+
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1051
|
+
ID2SYM(rb_intern(prof->mode == 1 ? "wall" : "cpu")));
|
|
1052
|
+
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(prof->frequency));
|
|
1053
|
+
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(prof->stats.trigger_count));
|
|
1054
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(prof->stats.sampling_count));
|
|
1055
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(prof->stats.sampling_total_ns));
|
|
1056
|
+
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(prof->next_thread_seq));
|
|
1057
|
+
rb_hash_aset(result, ID2SYM(rb_intern("unique_frames")),
|
|
1058
|
+
SIZET2NUM(prof->frame_table.count - RPERF_SYNTHETIC_COUNT));
|
|
1059
|
+
rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
|
|
1060
|
+
SIZET2NUM(prof->agg_table.count));
|
|
1061
|
+
|
|
1062
|
+
{
|
|
1063
|
+
struct timespec now_monotonic;
|
|
1064
|
+
int64_t start_ns, duration_ns;
|
|
1065
|
+
clock_gettime(CLOCK_MONOTONIC, &now_monotonic);
|
|
1066
|
+
start_ns = (int64_t)prof->start_realtime.tv_sec * 1000000000LL
|
|
1067
|
+
+ (int64_t)prof->start_realtime.tv_nsec;
|
|
1068
|
+
duration_ns = ((int64_t)now_monotonic.tv_sec - (int64_t)prof->start_monotonic.tv_sec) * 1000000000LL
|
|
1069
|
+
+ ((int64_t)now_monotonic.tv_nsec - (int64_t)prof->start_monotonic.tv_nsec);
|
|
1070
|
+
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
|
|
1071
|
+
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
|
|
1072
|
+
}
|
|
1073
|
+
|
|
1074
|
+
{
|
|
1075
|
+
rperf_frame_table_t *ft = &prof->frame_table;
|
|
1076
|
+
VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
|
|
1077
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
|
|
1078
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
|
|
1079
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
|
|
1080
|
+
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
|
|
1081
|
+
for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
|
|
1082
|
+
rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
|
|
1083
|
+
}
|
|
1084
|
+
|
|
1085
|
+
rperf_agg_table_t *at = &prof->agg_table;
|
|
1086
|
+
samples_ary = rb_ary_new();
|
|
1087
|
+
for (i = 0; i < at->bucket_capacity; i++) {
|
|
1088
|
+
rperf_agg_entry_t *e = &at->buckets[i];
|
|
1089
|
+
if (!e->used) continue;
|
|
1090
|
+
|
|
1091
|
+
VALUE frames = rb_ary_new_capa(e->depth);
|
|
1092
|
+
for (j = 0; j < e->depth; j++) {
|
|
1093
|
+
uint32_t fid = at->stack_pool[e->frame_start + j];
|
|
1094
|
+
rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
|
|
1095
|
+
}
|
|
1096
|
+
|
|
1097
|
+
VALUE sample = rb_ary_new3(4, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq), INT2NUM(e->label_set_id));
|
|
1098
|
+
rb_ary_push(samples_ary, sample);
|
|
1099
|
+
}
|
|
1100
|
+
}
|
|
1101
|
+
|
|
1102
|
+
rb_hash_aset(result, ID2SYM(rb_intern("aggregated_samples")), samples_ary);
|
|
1103
|
+
|
|
1104
|
+
if (prof->label_sets != Qnil) {
|
|
1105
|
+
rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), prof->label_sets);
|
|
1106
|
+
}
|
|
1107
|
+
|
|
1108
|
+
return result;
|
|
1109
|
+
}
|
|
1110
|
+
|
|
1009
1111
|
/* ---- Ruby API ---- */
|
|
1010
1112
|
|
|
1011
1113
|
/* _c_start(frequency, mode, aggregate, signal)
|
|
@@ -1038,6 +1140,7 @@ rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
|
|
|
1038
1140
|
g_profiler.stats.trigger_count = 0;
|
|
1039
1141
|
atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
|
|
1040
1142
|
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
1143
|
+
g_profiler.label_sets = Qnil;
|
|
1041
1144
|
|
|
1042
1145
|
/* Initialize worker mutex/cond */
|
|
1043
1146
|
CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
|
|
@@ -1259,15 +1362,8 @@ rb_rperf_stop(VALUE self)
|
|
|
1259
1362
|
rb_remove_event_hook(rperf_gc_event_hook);
|
|
1260
1363
|
|
|
1261
1364
|
if (g_profiler.aggregate) {
|
|
1262
|
-
/* Worker thread is joined; no concurrent access
|
|
1263
|
-
|
|
1264
|
-
/* Aggregate remaining samples from both buffers */
|
|
1265
|
-
if (atomic_load_explicit(&g_profiler.swap_ready, memory_order_relaxed)) {
|
|
1266
|
-
int standby_idx = cur_idx ^ 1;
|
|
1267
|
-
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
|
|
1268
|
-
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
1269
|
-
}
|
|
1270
|
-
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[cur_idx]);
|
|
1365
|
+
/* Worker thread is joined; no concurrent access. */
|
|
1366
|
+
rperf_flush_buffers(&g_profiler);
|
|
1271
1367
|
}
|
|
1272
1368
|
|
|
1273
1369
|
/* Clean up thread-specific data for all live threads */
|
|
@@ -1285,73 +1381,8 @@ rb_rperf_stop(VALUE self)
|
|
|
1285
1381
|
}
|
|
1286
1382
|
}
|
|
1287
1383
|
|
|
1288
|
-
/* Build result hash */
|
|
1289
|
-
result = rb_hash_new();
|
|
1290
|
-
|
|
1291
|
-
/* mode */
|
|
1292
|
-
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1293
|
-
ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
|
|
1294
|
-
|
|
1295
|
-
/* frequency */
|
|
1296
|
-
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
|
|
1297
|
-
|
|
1298
|
-
/* trigger_count, sampling_count, sampling_time_ns, detected_thread_count */
|
|
1299
|
-
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
|
|
1300
|
-
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
|
|
1301
|
-
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
|
|
1302
|
-
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
|
|
1303
|
-
|
|
1304
|
-
/* aggregation stats */
|
|
1305
1384
|
if (g_profiler.aggregate) {
|
|
1306
|
-
|
|
1307
|
-
SIZET2NUM(g_profiler.frame_table.count - RPERF_SYNTHETIC_COUNT));
|
|
1308
|
-
rb_hash_aset(result, ID2SYM(rb_intern("unique_stacks")),
|
|
1309
|
-
SIZET2NUM(g_profiler.agg_table.count));
|
|
1310
|
-
}
|
|
1311
|
-
|
|
1312
|
-
/* start_time_ns (CLOCK_REALTIME epoch nanos), duration_ns (CLOCK_MONOTONIC delta) */
|
|
1313
|
-
{
|
|
1314
|
-
struct timespec stop_monotonic;
|
|
1315
|
-
int64_t start_ns, duration_ns;
|
|
1316
|
-
clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
|
|
1317
|
-
start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
|
|
1318
|
-
+ (int64_t)g_profiler.start_realtime.tv_nsec;
|
|
1319
|
-
duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
|
|
1320
|
-
+ ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
|
|
1321
|
-
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
|
|
1322
|
-
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
|
|
1323
|
-
}
|
|
1324
|
-
|
|
1325
|
-
if (g_profiler.aggregate) {
|
|
1326
|
-
/* Build samples from aggregation table.
|
|
1327
|
-
* Use a Ruby array for resolved frames so GC protects them. */
|
|
1328
|
-
rperf_frame_table_t *ft = &g_profiler.frame_table;
|
|
1329
|
-
VALUE resolved_ary = rb_ary_new_capa((long)ft->count);
|
|
1330
|
-
/* Synthetic frames */
|
|
1331
|
-
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL blocked]")));
|
|
1332
|
-
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GVL>"), rb_str_new_lit("[GVL wait]")));
|
|
1333
|
-
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC marking]")));
|
|
1334
|
-
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
|
|
1335
|
-
/* Real frames */
|
|
1336
|
-
for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
|
|
1337
|
-
rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
|
|
1338
|
-
}
|
|
1339
|
-
|
|
1340
|
-
rperf_agg_table_t *at = &g_profiler.agg_table;
|
|
1341
|
-
samples_ary = rb_ary_new();
|
|
1342
|
-
for (i = 0; i < at->bucket_capacity; i++) {
|
|
1343
|
-
rperf_agg_entry_t *e = &at->buckets[i];
|
|
1344
|
-
if (!e->used) continue;
|
|
1345
|
-
|
|
1346
|
-
VALUE frames = rb_ary_new_capa(e->depth);
|
|
1347
|
-
for (j = 0; j < e->depth; j++) {
|
|
1348
|
-
uint32_t fid = at->stack_pool[e->frame_start + j];
|
|
1349
|
-
rb_ary_push(frames, RARRAY_AREF(resolved_ary, fid));
|
|
1350
|
-
}
|
|
1351
|
-
|
|
1352
|
-
VALUE sample = rb_ary_new3(3, frames, LONG2NUM(e->weight), INT2NUM(e->thread_seq));
|
|
1353
|
-
rb_ary_push(samples_ary, sample);
|
|
1354
|
-
}
|
|
1385
|
+
result = rperf_build_aggregated_result(&g_profiler);
|
|
1355
1386
|
|
|
1356
1387
|
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1357
1388
|
rperf_frame_table_free(&g_profiler.frame_table);
|
|
@@ -1359,6 +1390,27 @@ rb_rperf_stop(VALUE self)
|
|
|
1359
1390
|
} else {
|
|
1360
1391
|
/* Raw samples path (aggregate: false) */
|
|
1361
1392
|
rperf_sample_buffer_t *buf = &g_profiler.buffers[0];
|
|
1393
|
+
|
|
1394
|
+
result = rb_hash_new();
|
|
1395
|
+
rb_hash_aset(result, ID2SYM(rb_intern("mode")),
|
|
1396
|
+
ID2SYM(rb_intern(g_profiler.mode == 1 ? "wall" : "cpu")));
|
|
1397
|
+
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
|
|
1398
|
+
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
|
|
1399
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
|
|
1400
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
|
|
1401
|
+
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
|
|
1402
|
+
{
|
|
1403
|
+
struct timespec stop_monotonic;
|
|
1404
|
+
int64_t start_ns, duration_ns;
|
|
1405
|
+
clock_gettime(CLOCK_MONOTONIC, &stop_monotonic);
|
|
1406
|
+
start_ns = (int64_t)g_profiler.start_realtime.tv_sec * 1000000000LL
|
|
1407
|
+
+ (int64_t)g_profiler.start_realtime.tv_nsec;
|
|
1408
|
+
duration_ns = ((int64_t)stop_monotonic.tv_sec - (int64_t)g_profiler.start_monotonic.tv_sec) * 1000000000LL
|
|
1409
|
+
+ ((int64_t)stop_monotonic.tv_nsec - (int64_t)g_profiler.start_monotonic.tv_nsec);
|
|
1410
|
+
rb_hash_aset(result, ID2SYM(rb_intern("start_time_ns")), LONG2NUM(start_ns));
|
|
1411
|
+
rb_hash_aset(result, ID2SYM(rb_intern("duration_ns")), LONG2NUM(duration_ns));
|
|
1412
|
+
}
|
|
1413
|
+
|
|
1362
1414
|
samples_ary = rb_ary_new_capa((long)buf->sample_count);
|
|
1363
1415
|
for (i = 0; i < buf->sample_count; i++) {
|
|
1364
1416
|
rperf_sample_t *s = &buf->samples[i];
|
|
@@ -1384,13 +1436,14 @@ rb_rperf_stop(VALUE self)
|
|
|
1384
1436
|
rb_ary_push(frames, rperf_resolve_frame(fval));
|
|
1385
1437
|
}
|
|
1386
1438
|
|
|
1387
|
-
VALUE sample = rb_ary_new3(
|
|
1439
|
+
VALUE sample = rb_ary_new3(4, frames, LONG2NUM(s->weight), INT2NUM(s->thread_seq), INT2NUM(s->label_set_id));
|
|
1388
1440
|
rb_ary_push(samples_ary, sample);
|
|
1389
1441
|
}
|
|
1442
|
+
rb_hash_aset(result, ID2SYM(rb_intern("raw_samples")), samples_ary);
|
|
1443
|
+
if (g_profiler.label_sets != Qnil) {
|
|
1444
|
+
rb_hash_aset(result, ID2SYM(rb_intern("label_sets")), g_profiler.label_sets);
|
|
1445
|
+
}
|
|
1390
1446
|
}
|
|
1391
|
-
rb_hash_aset(result,
|
|
1392
|
-
ID2SYM(rb_intern(g_profiler.aggregate ? "aggregated_samples" : "raw_samples")),
|
|
1393
|
-
samples_ary);
|
|
1394
1447
|
|
|
1395
1448
|
/* Cleanup */
|
|
1396
1449
|
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
@@ -1398,6 +1451,113 @@ rb_rperf_stop(VALUE self)
|
|
|
1398
1451
|
return result;
|
|
1399
1452
|
}
|
|
1400
1453
|
|
|
1454
|
+
/* ---- Snapshot: read aggregated data without stopping ---- */
|
|
1455
|
+
|
|
1456
|
+
/* Clear aggregated data for the next interval.
|
|
1457
|
+
* Caller must hold GVL + worker_mutex.
|
|
1458
|
+
* Keeps allocations intact for reuse. Does NOT touch frame_table
|
|
1459
|
+
* (frame IDs must stay stable — dmark may be iterating keys outside GVL,
|
|
1460
|
+
* and existing threads reference frame IDs via their thread_data). */
|
|
1461
|
+
static void
|
|
1462
|
+
rperf_clear_aggregated_data(rperf_profiler_t *prof)
|
|
1463
|
+
{
|
|
1464
|
+
/* Clear agg_table entries (keep allocation) */
|
|
1465
|
+
memset(prof->agg_table.buckets, 0,
|
|
1466
|
+
prof->agg_table.bucket_capacity * sizeof(rperf_agg_entry_t));
|
|
1467
|
+
prof->agg_table.count = 0;
|
|
1468
|
+
prof->agg_table.stack_pool_count = 0;
|
|
1469
|
+
|
|
1470
|
+
/* Reset stats */
|
|
1471
|
+
prof->stats.trigger_count = 0;
|
|
1472
|
+
prof->stats.sampling_count = 0;
|
|
1473
|
+
prof->stats.sampling_total_ns = 0;
|
|
1474
|
+
|
|
1475
|
+
/* Reset start timestamps so next snapshot's duration_ns covers
|
|
1476
|
+
* only the period since this clear. */
|
|
1477
|
+
clock_gettime(CLOCK_REALTIME, &prof->start_realtime);
|
|
1478
|
+
clock_gettime(CLOCK_MONOTONIC, &prof->start_monotonic);
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1481
|
+
static VALUE
|
|
1482
|
+
rb_rperf_snapshot(VALUE self, VALUE vclear)
|
|
1483
|
+
{
|
|
1484
|
+
VALUE result;
|
|
1485
|
+
|
|
1486
|
+
if (!g_profiler.running) {
|
|
1487
|
+
return Qnil;
|
|
1488
|
+
}
|
|
1489
|
+
|
|
1490
|
+
if (!g_profiler.aggregate) {
|
|
1491
|
+
rb_raise(rb_eRuntimeError, "snapshot requires aggregate mode (aggregate: true)");
|
|
1492
|
+
}
|
|
1493
|
+
|
|
1494
|
+
/* GVL is held → no postponed jobs fire → no new samples written.
|
|
1495
|
+
* Lock worker_mutex to pause worker thread's aggregation. */
|
|
1496
|
+
CHECKED(pthread_mutex_lock(&g_profiler.worker_mutex));
|
|
1497
|
+
rperf_flush_buffers(&g_profiler);
|
|
1498
|
+
|
|
1499
|
+
/* Build result while mutex is held. If clear is requested, we must
|
|
1500
|
+
* also clear under the same lock to avoid a window where the worker
|
|
1501
|
+
* could aggregate into the table between build and clear. */
|
|
1502
|
+
result = rperf_build_aggregated_result(&g_profiler);
|
|
1503
|
+
|
|
1504
|
+
if (RTEST(vclear)) {
|
|
1505
|
+
rperf_clear_aggregated_data(&g_profiler);
|
|
1506
|
+
}
|
|
1507
|
+
|
|
1508
|
+
CHECKED(pthread_mutex_unlock(&g_profiler.worker_mutex));
|
|
1509
|
+
|
|
1510
|
+
return result;
|
|
1511
|
+
}
|
|
1512
|
+
|
|
1513
|
+
/* ---- Label API ---- */
|
|
1514
|
+
|
|
1515
|
+
/* _c_set_label(label_set_id) — set current thread's label_set_id.
|
|
1516
|
+
* Called from Ruby with GVL held. */
|
|
1517
|
+
static VALUE
|
|
1518
|
+
rb_rperf_set_label(VALUE self, VALUE vid)
|
|
1519
|
+
{
|
|
1520
|
+
if (!g_profiler.running) return vid;
|
|
1521
|
+
|
|
1522
|
+
int label_set_id = NUM2INT(vid);
|
|
1523
|
+
VALUE thread = rb_thread_current();
|
|
1524
|
+
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
|
|
1525
|
+
if (td == NULL) {
|
|
1526
|
+
td = rperf_thread_data_create(&g_profiler, thread);
|
|
1527
|
+
if (!td) rb_raise(rb_eNoMemError, "rperf: failed to allocate thread data");
|
|
1528
|
+
}
|
|
1529
|
+
td->label_set_id = label_set_id;
|
|
1530
|
+
return vid;
|
|
1531
|
+
}
|
|
1532
|
+
|
|
1533
|
+
/* _c_get_label() — get current thread's label_set_id.
|
|
1534
|
+
* Returns 0 if not profiling or thread not yet seen. */
|
|
1535
|
+
static VALUE
|
|
1536
|
+
rb_rperf_get_label(VALUE self)
|
|
1537
|
+
{
|
|
1538
|
+
if (!g_profiler.running) return INT2FIX(0);
|
|
1539
|
+
|
|
1540
|
+
VALUE thread = rb_thread_current();
|
|
1541
|
+
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, g_profiler.ts_key);
|
|
1542
|
+
if (td == NULL) return INT2FIX(0);
|
|
1543
|
+
return INT2NUM(td->label_set_id);
|
|
1544
|
+
}
|
|
1545
|
+
|
|
1546
|
+
/* _c_set_label_sets(ary) — store label_sets Ruby Array for result building */
|
|
1547
|
+
static VALUE
|
|
1548
|
+
rb_rperf_set_label_sets(VALUE self, VALUE ary)
|
|
1549
|
+
{
|
|
1550
|
+
g_profiler.label_sets = ary;
|
|
1551
|
+
return ary;
|
|
1552
|
+
}
|
|
1553
|
+
|
|
1554
|
+
/* _c_get_label_sets() — get label_sets Ruby Array */
|
|
1555
|
+
static VALUE
|
|
1556
|
+
rb_rperf_get_label_sets(VALUE self)
|
|
1557
|
+
{
|
|
1558
|
+
return g_profiler.label_sets;
|
|
1559
|
+
}
|
|
1560
|
+
|
|
1401
1561
|
/* ---- Fork safety ---- */
|
|
1402
1562
|
|
|
1403
1563
|
static void
|
|
@@ -1459,8 +1619,14 @@ Init_rperf(void)
|
|
|
1459
1619
|
VALUE mRperf = rb_define_module("Rperf");
|
|
1460
1620
|
rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
|
|
1461
1621
|
rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
|
|
1622
|
+
rb_define_module_function(mRperf, "_c_snapshot", rb_rperf_snapshot, 1);
|
|
1623
|
+
rb_define_module_function(mRperf, "_c_set_label", rb_rperf_set_label, 1);
|
|
1624
|
+
rb_define_module_function(mRperf, "_c_get_label", rb_rperf_get_label, 0);
|
|
1625
|
+
rb_define_module_function(mRperf, "_c_set_label_sets", rb_rperf_set_label_sets, 1);
|
|
1626
|
+
rb_define_module_function(mRperf, "_c_get_label_sets", rb_rperf_get_label_sets, 0);
|
|
1462
1627
|
|
|
1463
1628
|
memset(&g_profiler, 0, sizeof(g_profiler));
|
|
1629
|
+
g_profiler.label_sets = Qnil;
|
|
1464
1630
|
g_profiler.pj_handle = rb_postponed_job_preregister(0, rperf_sample_job, &g_profiler);
|
|
1465
1631
|
g_profiler.ts_key = rb_internal_thread_specific_key_create();
|
|
1466
1632
|
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require "rperf"
|
|
2
|
+
|
|
3
|
+
class Rperf::Middleware
|
|
4
|
+
def initialize(app, label_key: :endpoint)
|
|
5
|
+
@app = app
|
|
6
|
+
@label_key = label_key
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def call(env)
|
|
10
|
+
endpoint = "#{env["REQUEST_METHOD"]} #{env["PATH_INFO"]}"
|
|
11
|
+
Rperf.label(@label_key => endpoint) do
|
|
12
|
+
@app.call(env)
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
data/lib/rperf/version.rb
CHANGED
data/lib/rperf.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
require_relative "rperf/version"
|
|
2
2
|
require "zlib"
|
|
3
3
|
require "stringio"
|
|
4
4
|
|
|
@@ -42,6 +42,8 @@ module Rperf
|
|
|
42
42
|
@format = format
|
|
43
43
|
@stat = stat
|
|
44
44
|
@stat_start_mono = Process.clock_gettime(Process::CLOCK_MONOTONIC) if @stat
|
|
45
|
+
@label_set_table = nil
|
|
46
|
+
@label_set_index = nil
|
|
45
47
|
_c_start(frequency, c_mode, aggregate, c_signal)
|
|
46
48
|
|
|
47
49
|
if block_given?
|
|
@@ -61,15 +63,15 @@ module Rperf
|
|
|
61
63
|
# :aggregated_samples. Build aggregated view so encoders always work.
|
|
62
64
|
if data[:raw_samples] && !data[:aggregated_samples]
|
|
63
65
|
merged = {}
|
|
64
|
-
data[:raw_samples].each do |frames, weight, thread_seq|
|
|
65
|
-
key = [frames, thread_seq || 0]
|
|
66
|
+
data[:raw_samples].each do |frames, weight, thread_seq, label_set_id|
|
|
67
|
+
key = [frames, thread_seq || 0, label_set_id || 0]
|
|
66
68
|
if merged.key?(key)
|
|
67
69
|
merged[key] += weight
|
|
68
70
|
else
|
|
69
71
|
merged[key] = weight
|
|
70
72
|
end
|
|
71
73
|
end
|
|
72
|
-
data[:aggregated_samples] = merged.map { |(frames, ts), w| [frames, w, ts] }
|
|
74
|
+
data[:aggregated_samples] = merged.map { |(frames, ts, lsi), w| [frames, w, ts, lsi] }
|
|
73
75
|
end
|
|
74
76
|
|
|
75
77
|
print_stats(data) if @verbose
|
|
@@ -84,6 +86,77 @@ module Rperf
|
|
|
84
86
|
data
|
|
85
87
|
end
|
|
86
88
|
|
|
89
|
+
# Returns a snapshot of the current profiling data without stopping.
|
|
90
|
+
# Only works in aggregate mode (the default). Returns nil if not profiling.
|
|
91
|
+
# The returned data has the same format as stop's return value and can be
|
|
92
|
+
# passed to save(), PProf.encode(), Collapsed.encode(), or Text.encode().
|
|
93
|
+
#
|
|
94
|
+
# +clear:+ if true, resets aggregated data after taking the snapshot.
|
|
95
|
+
# This allows interval-based profiling where each snapshot covers only
|
|
96
|
+
# the period since the last clear.
|
|
97
|
+
def self.snapshot(clear: false)
|
|
98
|
+
_c_snapshot(clear)
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Label set management for per-context profiling.
|
|
102
|
+
# Label sets are stored as an Array of Hashes, indexed by label_set_id.
|
|
103
|
+
# Index 0 is reserved (no labels).
|
|
104
|
+
|
|
105
|
+
@label_set_table = nil # Array of frozen Hash
|
|
106
|
+
@label_set_index = nil # Hash → id (for dedup)
|
|
107
|
+
|
|
108
|
+
def self._init_label_sets
|
|
109
|
+
@label_set_table = [{}] # id 0 = no labels
|
|
110
|
+
@label_set_index = { {} => 0 }
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def self._intern_label_set(hash)
|
|
114
|
+
frozen = hash.frozen? ? hash : hash.freeze
|
|
115
|
+
@label_set_index[frozen] ||= begin
|
|
116
|
+
id = @label_set_table.size
|
|
117
|
+
@label_set_table << frozen
|
|
118
|
+
_c_set_label_sets(@label_set_table)
|
|
119
|
+
id
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Sets labels on the current thread for profiling annotation.
|
|
124
|
+
# With a block: restores previous labels when the block exits.
|
|
125
|
+
# Without a block: sets labels persistently on the current thread.
|
|
126
|
+
# Labels are key-value pairs written into pprof sample labels.
|
|
127
|
+
#
|
|
128
|
+
# Rperf.label(request: "abc") { handle_request }
|
|
129
|
+
# Rperf.label(request: "abc") # persistent set
|
|
130
|
+
#
|
|
131
|
+
# Values of nil remove that key. Existing labels are merged.
|
|
132
|
+
def self.label(**kw, &block)
|
|
133
|
+
_init_label_sets unless @label_set_table
|
|
134
|
+
|
|
135
|
+
cur_id = _c_get_label
|
|
136
|
+
cur_labels = @label_set_table[cur_id] || {}
|
|
137
|
+
|
|
138
|
+
new_labels = cur_labels.merge(kw).reject { |_, v| v.nil? }
|
|
139
|
+
new_id = _intern_label_set(new_labels)
|
|
140
|
+
_c_set_label(new_id)
|
|
141
|
+
|
|
142
|
+
if block
|
|
143
|
+
begin
|
|
144
|
+
yield
|
|
145
|
+
ensure
|
|
146
|
+
_c_set_label(cur_id)
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Returns the current thread's labels as a Hash.
|
|
152
|
+
# Returns an empty Hash if no labels are set or profiling is not running.
|
|
153
|
+
def self.labels
|
|
154
|
+
return {} unless @label_set_table
|
|
155
|
+
cur_id = _c_get_label
|
|
156
|
+
@label_set_table[cur_id] || {}
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
|
|
87
160
|
# Saves profiling data to a file.
|
|
88
161
|
# format: :pprof, :collapsed, or :text. nil = auto-detect from path extension
|
|
89
162
|
# .collapsed → collapsed stacks (FlameGraph / speedscope compatible)
|
|
@@ -498,17 +571,30 @@ module Rperf
|
|
|
498
571
|
end
|
|
499
572
|
}
|
|
500
573
|
|
|
501
|
-
# Convert string frames to index frames and merge identical stacks per thread
|
|
574
|
+
# Convert string frames to index frames and merge identical stacks per thread/label
|
|
502
575
|
merged = Hash.new(0)
|
|
503
576
|
thread_seq_key = intern.("thread_seq")
|
|
504
|
-
|
|
505
|
-
|
|
577
|
+
label_sets = data[:label_sets] # Array of Hash (may be nil)
|
|
578
|
+
samples_raw.each do |frames, weight, thread_seq, label_set_id|
|
|
579
|
+
key = [frames.map { |path, label| [intern.(path), intern.(label)] }, thread_seq || 0, label_set_id || 0]
|
|
506
580
|
merged[key] += weight
|
|
507
581
|
end
|
|
508
582
|
merged = merged.to_a
|
|
509
583
|
|
|
584
|
+
# Intern label set keys/values for pprof labels
|
|
585
|
+
label_key_indices = {} # String key → string_table index
|
|
586
|
+
if label_sets
|
|
587
|
+
label_sets.each do |ls|
|
|
588
|
+
ls.each do |k, v|
|
|
589
|
+
sk = k.to_s
|
|
590
|
+
label_key_indices[sk] ||= intern.(sk)
|
|
591
|
+
intern.(v.to_s) # ensure value is interned
|
|
592
|
+
end
|
|
593
|
+
end
|
|
594
|
+
end
|
|
595
|
+
|
|
510
596
|
# Build location/function tables
|
|
511
|
-
locations, functions = build_tables(merged.map { |(frames, _), w| [frames, w] })
|
|
597
|
+
locations, functions = build_tables(merged.map { |(frames, _, _), w| [frames, w] })
|
|
512
598
|
|
|
513
599
|
# Intern type label and unit
|
|
514
600
|
type_label = mode == :wall ? "wall" : "cpu"
|
|
@@ -521,8 +607,8 @@ module Rperf
|
|
|
521
607
|
# field 1: sample_type (repeated ValueType)
|
|
522
608
|
buf << encode_message(1, encode_value_type(type_idx, ns_idx))
|
|
523
609
|
|
|
524
|
-
# field 2: sample (repeated Sample) with thread_seq
|
|
525
|
-
merged.each do |(frames, thread_seq), weight|
|
|
610
|
+
# field 2: sample (repeated Sample) with thread_seq + user labels
|
|
611
|
+
merged.each do |(frames, thread_seq, label_set_id), weight|
|
|
526
612
|
sample_buf = "".b
|
|
527
613
|
loc_ids = frames.map { |f| locations[f] }
|
|
528
614
|
sample_buf << encode_packed_uint64(1, loc_ids)
|
|
@@ -533,6 +619,17 @@ module Rperf
|
|
|
533
619
|
label_buf << encode_int64(3, thread_seq) # num
|
|
534
620
|
sample_buf << encode_message(3, label_buf)
|
|
535
621
|
end
|
|
622
|
+
if label_sets && label_set_id && label_set_id > 0
|
|
623
|
+
ls = label_sets[label_set_id]
|
|
624
|
+
if ls
|
|
625
|
+
ls.each do |k, v|
|
|
626
|
+
label_buf = "".b
|
|
627
|
+
label_buf << encode_int64(1, label_key_indices[k.to_s]) # key
|
|
628
|
+
label_buf << encode_int64(2, string_index[v.to_s]) # str
|
|
629
|
+
sample_buf << encode_message(3, label_buf)
|
|
630
|
+
end
|
|
631
|
+
end
|
|
632
|
+
end
|
|
536
633
|
buf << encode_message(2, sample_buf)
|
|
537
634
|
end
|
|
538
635
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rperf
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Koichi Sasada
|
|
@@ -52,6 +52,9 @@ files:
|
|
|
52
52
|
- ext/rperf/extconf.rb
|
|
53
53
|
- ext/rperf/rperf.c
|
|
54
54
|
- lib/rperf.rb
|
|
55
|
+
- lib/rperf/active_job.rb
|
|
56
|
+
- lib/rperf/middleware.rb
|
|
57
|
+
- lib/rperf/sidekiq.rb
|
|
55
58
|
- lib/rperf/version.rb
|
|
56
59
|
homepage: https://github.com/ko1/rperf
|
|
57
60
|
licenses:
|