rperf 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/docs/help.md +30 -7
- data/exe/rperf +32 -7
- data/ext/rperf/rperf.c +301 -184
- data/lib/rperf/version.rb +1 -1
- data/lib/rperf.rb +41 -11
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3413c4c6ed0cdc0897428bf01fc0fec17a4d14f1c2883e9e5afa0cff110247dc
|
|
4
|
+
data.tar.gz: '097b06203ce4648a860f2816635d6dfac52f8e5987aa381653cec874d52abf7c'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 37065071f049a27eb1bab9f859ed39499022489a19aa8ecd91b3dc35cb6052ffb6b2fbc02c67ea46a94e8dba7644f2b23760d72d2dda7b998ccf3c61c304e225
|
|
7
|
+
data.tar.gz: 686ab430d58e5dd5163ae65a2bd330a76e57cf0dd72e7eac2b7c61621a03007bd724cac20b1e452766870ff33de325855e199bc3d873d004344f7b26b9b6614f
|
data/docs/help.md
CHANGED
|
@@ -10,6 +10,7 @@ POSIX systems (Linux, macOS). Requires Ruby >= 3.4.0.
|
|
|
10
10
|
|
|
11
11
|
rperf record [options] command [args...]
|
|
12
12
|
rperf stat [options] command [args...]
|
|
13
|
+
rperf exec [options] command [args...]
|
|
13
14
|
rperf report [options] [file]
|
|
14
15
|
rperf help
|
|
15
16
|
|
|
@@ -41,6 +42,20 @@ Shows: user/sys/real time, time breakdown (CPU execution, GVL blocked,
|
|
|
41
42
|
GVL wait, GC marking, GC sweeping), GC/memory/OS stats, and profiler overhead.
|
|
42
43
|
Use --report to add flat and cumulative top-50 function tables.
|
|
43
44
|
|
|
45
|
+
### exec: Run command and print full profile report to stderr.
|
|
46
|
+
|
|
47
|
+
Like `stat --report`. Uses wall mode by default. No file output by default.
|
|
48
|
+
|
|
49
|
+
-o, --output PATH Also save profile to file (default: none)
|
|
50
|
+
-f, --frequency HZ Sampling frequency in Hz (default: 1000)
|
|
51
|
+
-m, --mode MODE cpu or wall (default: wall)
|
|
52
|
+
--signal VALUE Timer signal (Linux only): signal number, or 'false'
|
|
53
|
+
for nanosleep thread (default: auto)
|
|
54
|
+
-v, --verbose Print additional sampling statistics
|
|
55
|
+
|
|
56
|
+
Shows: user/sys/real time, time breakdown, GC/memory/OS stats, profiler overhead,
|
|
57
|
+
and flat/cumulative top-50 function tables.
|
|
58
|
+
|
|
44
59
|
### report: Open pprof profile with go tool pprof. Requires Go.
|
|
45
60
|
|
|
46
61
|
--top Print top functions by flat time
|
|
@@ -67,6 +82,8 @@ Default (no flag): opens diff in browser.
|
|
|
67
82
|
rperf stat ruby app.rb
|
|
68
83
|
rperf stat --report ruby app.rb
|
|
69
84
|
rperf stat -o profile.pb.gz ruby app.rb
|
|
85
|
+
rperf exec ruby app.rb
|
|
86
|
+
rperf exec -m cpu ruby app.rb
|
|
70
87
|
rperf report
|
|
71
88
|
rperf report --top profile.pb.gz
|
|
72
89
|
rperf diff before.pb.gz after.pb.gz
|
|
@@ -106,16 +123,22 @@ Rperf.save("profile.txt", data)
|
|
|
106
123
|
nil if profiler was not running; otherwise a Hash:
|
|
107
124
|
|
|
108
125
|
```ruby
|
|
109
|
-
{ mode: :cpu,
|
|
126
|
+
{ mode: :cpu, # or :wall
|
|
110
127
|
frequency: 500,
|
|
111
128
|
sampling_count: 1234,
|
|
112
129
|
sampling_time_ns: 56789,
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
130
|
+
detected_thread_count: 4, # threads seen during profiling
|
|
131
|
+
start_time_ns: 17740..., # CLOCK_REALTIME epoch nanos
|
|
132
|
+
duration_ns: 10000000, # profiling duration in nanos
|
|
133
|
+
aggregated_samples: [ # when aggregate: true (default)
|
|
134
|
+
[frames, weight, seq], # frames: [[path, label], ...] deepest-first
|
|
135
|
+
... # weight: Integer (nanoseconds, merged per unique stack)
|
|
136
|
+
], # seq: Integer (thread sequence, 1-based)
|
|
137
|
+
# --- OR ---
|
|
138
|
+
raw_samples: [ # when aggregate: false
|
|
139
|
+
[frames, weight, seq], # one entry per timer sample (not merged)
|
|
140
|
+
...
|
|
141
|
+
] }
|
|
119
142
|
```
|
|
120
143
|
|
|
121
144
|
### Rperf.save(path, data, format: nil)
|
data/exe/rperf
CHANGED
|
@@ -72,6 +72,7 @@ HELP_TEXT = File.read(File.expand_path("../docs/help.md", __dir__))
|
|
|
72
72
|
|
|
73
73
|
USAGE = "Usage: rperf record [options] command [args...]\n" \
|
|
74
74
|
" rperf stat [options] command [args...]\n" \
|
|
75
|
+
" rperf exec [options] command [args...]\n" \
|
|
75
76
|
" rperf report [options] [file]\n" \
|
|
76
77
|
" rperf diff [options] base.pb.gz target.pb.gz\n" \
|
|
77
78
|
" rperf help\n"
|
|
@@ -120,7 +121,7 @@ when "diff"
|
|
|
120
121
|
else exec("go", "tool", "pprof", "-http=localhost:#{find_available_port}", "-diff_base=#{base_file}", target_file)
|
|
121
122
|
end
|
|
122
123
|
end
|
|
123
|
-
when "record", "stat"
|
|
124
|
+
when "record", "stat", "exec"
|
|
124
125
|
# continue below
|
|
125
126
|
else
|
|
126
127
|
$stderr.puts "Unknown subcommand: #{subcommand.inspect}" if subcommand
|
|
@@ -128,22 +129,23 @@ else
|
|
|
128
129
|
exit 1
|
|
129
130
|
end
|
|
130
131
|
|
|
131
|
-
output = (subcommand == "
|
|
132
|
+
output = (subcommand == "record") ? "rperf.data" : nil
|
|
132
133
|
frequency = 1000
|
|
133
|
-
mode = (subcommand == "
|
|
134
|
+
mode = (subcommand == "record") ? "cpu" : "wall"
|
|
134
135
|
format = nil
|
|
135
136
|
signal = nil
|
|
136
137
|
verbose = false
|
|
137
138
|
aggregate = true
|
|
138
|
-
stat_report =
|
|
139
|
+
stat_report = (subcommand == "exec")
|
|
139
140
|
|
|
140
141
|
parser = OptionParser.new do |opts|
|
|
141
142
|
opts.banner = case subcommand
|
|
142
143
|
when "record" then "Usage: rperf record [options] command [args...]"
|
|
143
144
|
when "stat" then "Usage: rperf stat [options] command [args...]"
|
|
145
|
+
when "exec" then "Usage: rperf exec [options] command [args...]"
|
|
144
146
|
end
|
|
145
147
|
|
|
146
|
-
opts.on("-o", "--output PATH", "Output file#{subcommand == '
|
|
148
|
+
opts.on("-o", "--output PATH", "Output file#{subcommand == 'record' ? ' (default: rperf.data)' : ' (default: none)'}") do |v|
|
|
147
149
|
output = v
|
|
148
150
|
end
|
|
149
151
|
|
|
@@ -151,7 +153,7 @@ parser = OptionParser.new do |opts|
|
|
|
151
153
|
frequency = v
|
|
152
154
|
end
|
|
153
155
|
|
|
154
|
-
default_mode = (subcommand == "
|
|
156
|
+
default_mode = (subcommand == "record") ? "cpu" : "wall"
|
|
155
157
|
opts.on("-m", "--mode MODE", %w[cpu wall], "Profiling mode: cpu or wall (default: #{default_mode})") do |v|
|
|
156
158
|
mode = v
|
|
157
159
|
end
|
|
@@ -208,6 +210,29 @@ if ARGV.empty?
|
|
|
208
210
|
exit 1
|
|
209
211
|
end
|
|
210
212
|
|
|
213
|
+
if frequency <= 0
|
|
214
|
+
$stderr.puts "Error: frequency must be a positive integer (got #{frequency})"
|
|
215
|
+
exit 1
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
if frequency > 10_000
|
|
219
|
+
$stderr.puts "Error: frequency must be <= 10000 (10KHz), got #{frequency}"
|
|
220
|
+
exit 1
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
if signal && signal != "false"
|
|
224
|
+
unless RUBY_PLATFORM =~ /linux/
|
|
225
|
+
$stderr.puts "Error: signal mode is only supported on Linux"
|
|
226
|
+
exit 1
|
|
227
|
+
end
|
|
228
|
+
sig_num = signal.to_i
|
|
229
|
+
uncatchable = [Signal.list["KILL"], Signal.list["STOP"]].compact
|
|
230
|
+
if uncatchable.include?(sig_num)
|
|
231
|
+
$stderr.puts "Error: signal #{sig_num} (#{Signal.signame(sig_num)}) cannot be caught; use a different signal"
|
|
232
|
+
exit 1
|
|
233
|
+
end
|
|
234
|
+
end
|
|
235
|
+
|
|
211
236
|
# Add lib dir to RUBYLIB so -rrperf can find the extension
|
|
212
237
|
lib_dir = File.expand_path("../lib", __dir__)
|
|
213
238
|
ENV["RUBYLIB"] = [lib_dir, ENV["RUBYLIB"]].compact.join(File::PATH_SEPARATOR)
|
|
@@ -221,7 +246,7 @@ ENV["RPERF_VERBOSE"] = "1" if verbose
|
|
|
221
246
|
ENV["RPERF_SIGNAL"] = signal if signal
|
|
222
247
|
ENV["RPERF_AGGREGATE"] = "0" unless aggregate
|
|
223
248
|
|
|
224
|
-
if subcommand == "stat"
|
|
249
|
+
if subcommand == "stat" || subcommand == "exec"
|
|
225
250
|
ENV["RPERF_STAT"] = "1"
|
|
226
251
|
ENV["RPERF_STAT_COMMAND"] = ARGV.join(" ")
|
|
227
252
|
ENV["RPERF_STAT_REPORT"] = "1" if stat_report
|
data/ext/rperf/rperf.c
CHANGED
|
@@ -7,13 +7,19 @@
|
|
|
7
7
|
#include <stdlib.h>
|
|
8
8
|
#include <unistd.h>
|
|
9
9
|
#include <signal.h>
|
|
10
|
-
#include <
|
|
10
|
+
#include <stdatomic.h>
|
|
11
11
|
#ifdef __linux__
|
|
12
12
|
#include <sys/syscall.h>
|
|
13
13
|
#endif
|
|
14
14
|
|
|
15
|
-
/* Checked pthread wrappers —
|
|
16
|
-
#define CHECKED(call) do {
|
|
15
|
+
/* Checked pthread wrappers — always active regardless of NDEBUG */
|
|
16
|
+
#define CHECKED(call) do { \
|
|
17
|
+
int _r = (call); \
|
|
18
|
+
if (_r != 0) { \
|
|
19
|
+
fprintf(stderr, "rperf: %s failed: %s\n", #call, strerror(_r)); \
|
|
20
|
+
abort(); \
|
|
21
|
+
} \
|
|
22
|
+
} while (0)
|
|
17
23
|
|
|
18
24
|
#ifdef __linux__
|
|
19
25
|
#define RPERF_USE_TIMER_SIGNAL 1
|
|
@@ -26,7 +32,8 @@
|
|
|
26
32
|
#define RPERF_INITIAL_SAMPLES 16384 /* >= AGG_THRESHOLD to avoid realloc before first aggregation */
|
|
27
33
|
#define RPERF_INITIAL_FRAME_POOL (1024 * 1024 / sizeof(VALUE)) /* ~1MB */
|
|
28
34
|
#define RPERF_AGG_THRESHOLD 10000 /* aggregate every N samples */
|
|
29
|
-
#define RPERF_FRAME_TABLE_INITIAL
|
|
35
|
+
#define RPERF_FRAME_TABLE_INITIAL 4096
|
|
36
|
+
#define RPERF_FRAME_TABLE_OLD_KEYS_INITIAL 16
|
|
30
37
|
#define RPERF_AGG_TABLE_INITIAL 1024
|
|
31
38
|
#define RPERF_STACK_POOL_INITIAL 4096
|
|
32
39
|
|
|
@@ -77,11 +84,15 @@ typedef struct rperf_sample_buffer {
|
|
|
77
84
|
#define RPERF_FRAME_TABLE_EMPTY UINT32_MAX
|
|
78
85
|
|
|
79
86
|
typedef struct rperf_frame_table {
|
|
80
|
-
VALUE *keys;
|
|
87
|
+
_Atomic(VALUE *) keys; /* unique VALUE array (GC mark target) */
|
|
81
88
|
size_t count; /* = next frame_id (starts after RPERF_SYNTHETIC_COUNT) */
|
|
82
89
|
size_t capacity;
|
|
83
90
|
uint32_t *buckets; /* open addressing: stores index into keys[] */
|
|
84
91
|
size_t bucket_capacity;
|
|
92
|
+
/* Old keys arrays kept alive for GC dmark safety until stop */
|
|
93
|
+
VALUE **old_keys;
|
|
94
|
+
int old_keys_count;
|
|
95
|
+
int old_keys_capacity;
|
|
85
96
|
} rperf_frame_table_t;
|
|
86
97
|
|
|
87
98
|
/* ---- Aggregation table: stack → weight ---- */
|
|
@@ -107,54 +118,63 @@ typedef struct rperf_agg_table {
|
|
|
107
118
|
} rperf_agg_table_t;
|
|
108
119
|
|
|
109
120
|
typedef struct rperf_thread_data {
|
|
110
|
-
int64_t
|
|
121
|
+
int64_t prev_time_ns;
|
|
111
122
|
int64_t prev_wall_ns;
|
|
112
123
|
/* GVL event tracking */
|
|
113
124
|
int64_t suspended_at_ns; /* wall time at SUSPENDED */
|
|
114
125
|
int64_t ready_at_ns; /* wall time at READY */
|
|
115
|
-
size_t suspended_frame_start; /* saved stack in frame_pool */
|
|
116
|
-
int suspended_frame_depth; /* saved stack depth */
|
|
117
126
|
int thread_seq; /* thread sequence number (1-based) */
|
|
118
127
|
} rperf_thread_data_t;
|
|
119
128
|
|
|
129
|
+
/* ---- GC tracking state ---- */
|
|
130
|
+
|
|
131
|
+
typedef struct rperf_gc_state {
|
|
132
|
+
int phase; /* rperf_gc_phase */
|
|
133
|
+
int64_t enter_ns; /* wall time at GC_ENTER */
|
|
134
|
+
int thread_seq; /* thread_seq at GC_ENTER */
|
|
135
|
+
} rperf_gc_state_t;
|
|
136
|
+
|
|
137
|
+
/* ---- Sampling overhead stats ---- */
|
|
138
|
+
|
|
139
|
+
typedef struct rperf_stats {
|
|
140
|
+
size_t trigger_count;
|
|
141
|
+
size_t sampling_count;
|
|
142
|
+
int64_t sampling_total_ns;
|
|
143
|
+
} rperf_stats_t;
|
|
144
|
+
|
|
120
145
|
typedef struct rperf_profiler {
|
|
121
146
|
int frequency;
|
|
122
147
|
int mode; /* 0 = cpu, 1 = wall */
|
|
123
|
-
|
|
148
|
+
_Atomic int running;
|
|
124
149
|
pthread_t worker_thread; /* combined timer + aggregation */
|
|
125
150
|
#if RPERF_USE_TIMER_SIGNAL
|
|
126
151
|
timer_t timer_id;
|
|
127
152
|
int timer_signal; /* >0: use timer signal, 0: use nanosleep thread */
|
|
128
|
-
|
|
153
|
+
_Atomic pid_t worker_tid; /* kernel TID of worker thread (for SIGEV_THREAD_ID) */
|
|
154
|
+
struct sigaction old_sigaction; /* saved handler to restore on stop */
|
|
129
155
|
#endif
|
|
130
156
|
rb_postponed_job_handle_t pj_handle;
|
|
131
157
|
int aggregate; /* 1 = aggregate samples, 0 = raw */
|
|
132
158
|
/* Double-buffered sample storage (only buffers[0] used when !aggregate) */
|
|
133
159
|
rperf_sample_buffer_t buffers[2];
|
|
134
|
-
int active_idx;
|
|
160
|
+
_Atomic int active_idx; /* 0 or 1 */
|
|
135
161
|
/* Aggregation (only used when aggregate=1) */
|
|
136
162
|
rperf_frame_table_t frame_table;
|
|
137
163
|
rperf_agg_table_t agg_table;
|
|
138
|
-
|
|
164
|
+
_Atomic int swap_ready; /* 1 = standby buffer ready for aggregation */
|
|
139
165
|
pthread_mutex_t worker_mutex;
|
|
140
166
|
pthread_cond_t worker_cond;
|
|
141
167
|
rb_internal_thread_specific_key_t ts_key;
|
|
142
168
|
rb_internal_thread_event_hook_t *thread_hook;
|
|
143
169
|
/* GC tracking */
|
|
144
|
-
|
|
145
|
-
int64_t gc_enter_ns; /* wall time at GC_ENTER */
|
|
146
|
-
size_t gc_frame_start; /* saved stack at GC_ENTER */
|
|
147
|
-
int gc_frame_depth; /* saved stack depth */
|
|
148
|
-
int gc_thread_seq; /* thread_seq at GC_ENTER */
|
|
170
|
+
rperf_gc_state_t gc;
|
|
149
171
|
/* Timing metadata for pprof */
|
|
150
172
|
struct timespec start_realtime; /* CLOCK_REALTIME at start */
|
|
151
173
|
struct timespec start_monotonic; /* CLOCK_MONOTONIC at start */
|
|
152
174
|
/* Thread sequence counter */
|
|
153
175
|
int next_thread_seq;
|
|
154
176
|
/* Sampling overhead stats */
|
|
155
|
-
|
|
156
|
-
size_t sampling_count;
|
|
157
|
-
int64_t sampling_total_ns;
|
|
177
|
+
rperf_stats_t stats;
|
|
158
178
|
} rperf_profiler_t;
|
|
159
179
|
|
|
160
180
|
static rperf_profiler_t g_profiler;
|
|
@@ -175,10 +195,18 @@ rperf_profiler_mark(void *ptr)
|
|
|
175
195
|
buf->frame_pool + buf->frame_pool_count);
|
|
176
196
|
}
|
|
177
197
|
}
|
|
178
|
-
/* Mark frame_table keys (unique frame VALUEs)
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
198
|
+
/* Mark frame_table keys (unique frame VALUEs).
|
|
199
|
+
* Acquire count to synchronize with the release-store in insert,
|
|
200
|
+
* ensuring we see the keys pointer that is valid for [0, count).
|
|
201
|
+
* If we see an old count, both old and new keys arrays have valid
|
|
202
|
+
* data (old keys are kept alive in old_keys[]). */
|
|
203
|
+
{
|
|
204
|
+
size_t ft_count = __atomic_load_n(&prof->frame_table.count, __ATOMIC_ACQUIRE);
|
|
205
|
+
VALUE *ft_keys = atomic_load_explicit(&prof->frame_table.keys, memory_order_acquire);
|
|
206
|
+
if (ft_keys && ft_count > 0) {
|
|
207
|
+
rb_gc_mark_locations(ft_keys + RPERF_SYNTHETIC_COUNT,
|
|
208
|
+
ft_keys + ft_count);
|
|
209
|
+
}
|
|
182
210
|
}
|
|
183
211
|
}
|
|
184
212
|
|
|
@@ -288,21 +316,38 @@ rperf_ensure_frame_pool_capacity(rperf_sample_buffer_t *buf, int needed)
|
|
|
288
316
|
|
|
289
317
|
/* ---- Frame table operations (all malloc-based, no GVL needed) ---- */
|
|
290
318
|
|
|
291
|
-
static
|
|
319
|
+
static int
|
|
292
320
|
rperf_frame_table_init(rperf_frame_table_t *ft)
|
|
293
321
|
{
|
|
294
322
|
ft->capacity = RPERF_FRAME_TABLE_INITIAL;
|
|
295
|
-
|
|
323
|
+
VALUE *keys = (VALUE *)calloc(ft->capacity, sizeof(VALUE));
|
|
324
|
+
if (!keys) return -1;
|
|
325
|
+
atomic_store_explicit(&ft->keys, keys, memory_order_relaxed);
|
|
296
326
|
ft->count = RPERF_SYNTHETIC_COUNT; /* reserve slots for synthetic frames */
|
|
297
327
|
ft->bucket_capacity = RPERF_FRAME_TABLE_INITIAL * 2;
|
|
298
328
|
ft->buckets = (uint32_t *)malloc(ft->bucket_capacity * sizeof(uint32_t));
|
|
329
|
+
if (!ft->buckets) { free(keys); atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed); return -1; }
|
|
299
330
|
memset(ft->buckets, 0xFF, ft->bucket_capacity * sizeof(uint32_t)); /* EMPTY */
|
|
331
|
+
ft->old_keys_count = 0;
|
|
332
|
+
ft->old_keys_capacity = RPERF_FRAME_TABLE_OLD_KEYS_INITIAL;
|
|
333
|
+
ft->old_keys = (VALUE **)malloc(ft->old_keys_capacity * sizeof(VALUE *));
|
|
334
|
+
if (!ft->old_keys) {
|
|
335
|
+
free(ft->buckets);
|
|
336
|
+
free(keys);
|
|
337
|
+
atomic_store_explicit(&ft->keys, NULL, memory_order_relaxed);
|
|
338
|
+
return -1;
|
|
339
|
+
}
|
|
340
|
+
return 0;
|
|
300
341
|
}
|
|
301
342
|
|
|
302
343
|
static void
|
|
303
344
|
rperf_frame_table_free(rperf_frame_table_t *ft)
|
|
304
345
|
{
|
|
305
|
-
|
|
346
|
+
int i;
|
|
347
|
+
for (i = 0; i < ft->old_keys_count; i++)
|
|
348
|
+
free(ft->old_keys[i]);
|
|
349
|
+
free(ft->old_keys);
|
|
350
|
+
free(atomic_load_explicit(&ft->keys, memory_order_relaxed));
|
|
306
351
|
free(ft->buckets);
|
|
307
352
|
memset(ft, 0, sizeof(*ft));
|
|
308
353
|
}
|
|
@@ -312,11 +357,13 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
|
|
|
312
357
|
{
|
|
313
358
|
size_t new_cap = ft->bucket_capacity * 2;
|
|
314
359
|
uint32_t *new_buckets = (uint32_t *)malloc(new_cap * sizeof(uint32_t));
|
|
360
|
+
if (!new_buckets) return; /* keep using current buckets at higher load factor */
|
|
315
361
|
memset(new_buckets, 0xFF, new_cap * sizeof(uint32_t));
|
|
316
362
|
|
|
363
|
+
VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
|
|
317
364
|
size_t i;
|
|
318
365
|
for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
|
|
319
|
-
uint32_t h = (uint32_t)(
|
|
366
|
+
uint32_t h = (uint32_t)(keys[i] >> 3); /* shift out tag bits */
|
|
320
367
|
size_t idx = h % new_cap;
|
|
321
368
|
while (new_buckets[idx] != RPERF_FRAME_TABLE_EMPTY)
|
|
322
369
|
idx = (idx + 1) % new_cap;
|
|
@@ -332,25 +379,42 @@ rperf_frame_table_rehash(rperf_frame_table_t *ft)
|
|
|
332
379
|
static uint32_t
|
|
333
380
|
rperf_frame_table_insert(rperf_frame_table_t *ft, VALUE fval)
|
|
334
381
|
{
|
|
382
|
+
VALUE *keys = atomic_load_explicit(&ft->keys, memory_order_relaxed);
|
|
335
383
|
uint32_t h = (uint32_t)(fval >> 3);
|
|
336
384
|
size_t idx = h % ft->bucket_capacity;
|
|
337
385
|
|
|
338
386
|
while (1) {
|
|
339
387
|
uint32_t slot = ft->buckets[idx];
|
|
340
388
|
if (slot == RPERF_FRAME_TABLE_EMPTY) break;
|
|
341
|
-
if (
|
|
389
|
+
if (keys[slot] == fval) return slot;
|
|
342
390
|
idx = (idx + 1) % ft->bucket_capacity;
|
|
343
391
|
}
|
|
344
392
|
|
|
345
|
-
/* Insert new entry.
|
|
346
|
-
*
|
|
347
|
-
*
|
|
393
|
+
/* Insert new entry. Grow keys array if capacity is exhausted.
|
|
394
|
+
* Cannot realloc in-place because GC dmark may concurrently read
|
|
395
|
+
* the old keys pointer. Instead, allocate new, copy, swap pointer
|
|
396
|
+
* atomically, and keep old array alive until stop. */
|
|
348
397
|
if (ft->count >= ft->capacity) {
|
|
349
|
-
|
|
398
|
+
size_t new_cap = ft->capacity * 2;
|
|
399
|
+
VALUE *new_keys = (VALUE *)calloc(new_cap, sizeof(VALUE));
|
|
400
|
+
if (!new_keys) return RPERF_FRAME_TABLE_EMPTY;
|
|
401
|
+
memcpy(new_keys, keys, ft->capacity * sizeof(VALUE));
|
|
402
|
+
/* Save old keys for deferred free (GC dmark safety) */
|
|
403
|
+
if (ft->old_keys_count >= ft->old_keys_capacity) {
|
|
404
|
+
int new_old_cap = ft->old_keys_capacity * 2;
|
|
405
|
+
VALUE **new_old = (VALUE **)realloc(ft->old_keys, new_old_cap * sizeof(VALUE *));
|
|
406
|
+
if (!new_old) { free(new_keys); return RPERF_FRAME_TABLE_EMPTY; }
|
|
407
|
+
ft->old_keys = new_old;
|
|
408
|
+
ft->old_keys_capacity = new_old_cap;
|
|
409
|
+
}
|
|
410
|
+
ft->old_keys[ft->old_keys_count++] = keys;
|
|
411
|
+
keys = new_keys;
|
|
412
|
+
atomic_store_explicit(&ft->keys, new_keys, memory_order_release);
|
|
413
|
+
ft->capacity = new_cap;
|
|
350
414
|
}
|
|
351
415
|
|
|
352
416
|
uint32_t frame_id = (uint32_t)ft->count;
|
|
353
|
-
|
|
417
|
+
keys[frame_id] = fval;
|
|
354
418
|
/* Store fence: ensure keys[frame_id] is visible before count is incremented,
|
|
355
419
|
* so GC dmark never reads uninitialized keys[count-1]. */
|
|
356
420
|
__atomic_store_n(&ft->count, ft->count + 1, __ATOMIC_RELEASE);
|
|
@@ -380,15 +444,18 @@ rperf_fnv1a_u32(const uint32_t *data, int len, int thread_seq)
|
|
|
380
444
|
return h;
|
|
381
445
|
}
|
|
382
446
|
|
|
383
|
-
static
|
|
447
|
+
static int
|
|
384
448
|
rperf_agg_table_init(rperf_agg_table_t *at)
|
|
385
449
|
{
|
|
386
450
|
at->bucket_capacity = RPERF_AGG_TABLE_INITIAL * 2;
|
|
387
451
|
at->buckets = (rperf_agg_entry_t *)calloc(at->bucket_capacity, sizeof(rperf_agg_entry_t));
|
|
452
|
+
if (!at->buckets) return -1;
|
|
388
453
|
at->count = 0;
|
|
389
454
|
at->stack_pool_capacity = RPERF_STACK_POOL_INITIAL;
|
|
390
455
|
at->stack_pool = (uint32_t *)malloc(at->stack_pool_capacity * sizeof(uint32_t));
|
|
456
|
+
if (!at->stack_pool) { free(at->buckets); at->buckets = NULL; return -1; }
|
|
391
457
|
at->stack_pool_count = 0;
|
|
458
|
+
return 0;
|
|
392
459
|
}
|
|
393
460
|
|
|
394
461
|
static void
|
|
@@ -404,6 +471,7 @@ rperf_agg_table_rehash(rperf_agg_table_t *at)
|
|
|
404
471
|
{
|
|
405
472
|
size_t new_cap = at->bucket_capacity * 2;
|
|
406
473
|
rperf_agg_entry_t *new_buckets = (rperf_agg_entry_t *)calloc(new_cap, sizeof(rperf_agg_entry_t));
|
|
474
|
+
if (!new_buckets) return; /* keep using current buckets at higher load factor */
|
|
407
475
|
|
|
408
476
|
size_t i;
|
|
409
477
|
for (i = 0; i < at->bucket_capacity; i++) {
|
|
@@ -535,10 +603,10 @@ rperf_aggregate_buffer(rperf_profiler_t *prof, rperf_sample_buffer_t *buf)
|
|
|
535
603
|
static void
|
|
536
604
|
rperf_try_aggregate(rperf_profiler_t *prof)
|
|
537
605
|
{
|
|
538
|
-
if (!prof->aggregate || !prof->swap_ready) return;
|
|
539
|
-
int standby_idx = prof->active_idx ^ 1;
|
|
606
|
+
if (!prof->aggregate || !atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return;
|
|
607
|
+
int standby_idx = atomic_load_explicit(&prof->active_idx, memory_order_acquire) ^ 1;
|
|
540
608
|
rperf_aggregate_buffer(prof, &prof->buffers[standby_idx]);
|
|
541
|
-
prof->swap_ready
|
|
609
|
+
atomic_store_explicit(&prof->swap_ready, 0, memory_order_release);
|
|
542
610
|
}
|
|
543
611
|
|
|
544
612
|
/* ---- Record a sample ---- */
|
|
@@ -547,25 +615,29 @@ static void
|
|
|
547
615
|
rperf_try_swap(rperf_profiler_t *prof)
|
|
548
616
|
{
|
|
549
617
|
if (!prof->aggregate) return;
|
|
550
|
-
|
|
618
|
+
int idx = atomic_load_explicit(&prof->active_idx, memory_order_relaxed);
|
|
619
|
+
rperf_sample_buffer_t *buf = &prof->buffers[idx];
|
|
551
620
|
if (buf->sample_count < RPERF_AGG_THRESHOLD) return;
|
|
552
|
-
if (prof->swap_ready) return; /* standby still being aggregated */
|
|
621
|
+
if (atomic_load_explicit(&prof->swap_ready, memory_order_acquire)) return; /* standby still being aggregated */
|
|
553
622
|
|
|
554
|
-
/* Swap active buffer */
|
|
555
|
-
prof->active_idx
|
|
556
|
-
prof->swap_ready = 1;
|
|
623
|
+
/* Swap active buffer: release ensures buffer writes are visible to worker */
|
|
624
|
+
atomic_store_explicit(&prof->active_idx, idx ^ 1, memory_order_release);
|
|
557
625
|
|
|
558
|
-
/*
|
|
626
|
+
/* Set swap_ready under mutex and signal, preventing lost wakeup:
|
|
627
|
+
* the worker checks swap_ready while holding the same mutex. */
|
|
628
|
+
CHECKED(pthread_mutex_lock(&prof->worker_mutex));
|
|
629
|
+
atomic_store_explicit(&prof->swap_ready, 1, memory_order_release);
|
|
559
630
|
CHECKED(pthread_cond_signal(&prof->worker_cond));
|
|
631
|
+
CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
|
|
560
632
|
}
|
|
561
633
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
634
|
+
/* Write a sample into a specific buffer. No swap check. */
|
|
635
|
+
static int
|
|
636
|
+
rperf_write_sample(rperf_sample_buffer_t *buf, size_t frame_start, int depth,
|
|
637
|
+
int64_t weight, int type, int thread_seq)
|
|
565
638
|
{
|
|
566
|
-
if (weight <= 0) return;
|
|
567
|
-
|
|
568
|
-
if (rperf_ensure_sample_capacity(buf) < 0) return;
|
|
639
|
+
if (weight <= 0) return 0;
|
|
640
|
+
if (rperf_ensure_sample_capacity(buf) < 0) return -1;
|
|
569
641
|
|
|
570
642
|
rperf_sample_t *sample = &buf->samples[buf->sample_count];
|
|
571
643
|
sample->depth = depth;
|
|
@@ -574,7 +646,15 @@ rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
|
|
|
574
646
|
sample->type = type;
|
|
575
647
|
sample->thread_seq = thread_seq;
|
|
576
648
|
buf->sample_count++;
|
|
649
|
+
return 0;
|
|
650
|
+
}
|
|
577
651
|
|
|
652
|
+
static void
|
|
653
|
+
rperf_record_sample(rperf_profiler_t *prof, size_t frame_start, int depth,
|
|
654
|
+
int64_t weight, int type, int thread_seq)
|
|
655
|
+
{
|
|
656
|
+
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
657
|
+
rperf_write_sample(buf, frame_start, depth, weight, type, thread_seq);
|
|
578
658
|
rperf_try_swap(prof);
|
|
579
659
|
}
|
|
580
660
|
|
|
@@ -586,7 +666,7 @@ rperf_thread_data_create(rperf_profiler_t *prof, VALUE thread)
|
|
|
586
666
|
{
|
|
587
667
|
rperf_thread_data_t *td = (rperf_thread_data_t *)calloc(1, sizeof(rperf_thread_data_t));
|
|
588
668
|
if (!td) return NULL;
|
|
589
|
-
td->
|
|
669
|
+
td->prev_time_ns = rperf_current_time_ns(prof, td);
|
|
590
670
|
td->prev_wall_ns = rperf_wall_time_ns();
|
|
591
671
|
td->thread_seq = ++prof->next_thread_seq;
|
|
592
672
|
rb_internal_thread_specific_set(thread, prof->ts_key, td);
|
|
@@ -614,7 +694,7 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
|
|
|
614
694
|
if (time_now < 0) return;
|
|
615
695
|
|
|
616
696
|
/* Capture backtrace into active buffer's frame_pool */
|
|
617
|
-
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
697
|
+
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
618
698
|
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
|
|
619
699
|
size_t frame_start = buf->frame_pool_count;
|
|
620
700
|
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
@@ -624,15 +704,13 @@ rperf_handle_suspended(rperf_profiler_t *prof, VALUE thread)
|
|
|
624
704
|
|
|
625
705
|
/* Record normal sample (skip if first time — no prev_time) */
|
|
626
706
|
if (!is_first) {
|
|
627
|
-
int64_t weight = time_now - td->
|
|
707
|
+
int64_t weight = time_now - td->prev_time_ns;
|
|
628
708
|
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
|
|
629
709
|
}
|
|
630
710
|
|
|
631
|
-
/* Save
|
|
711
|
+
/* Save timestamp for READY/RESUMED */
|
|
632
712
|
td->suspended_at_ns = wall_now;
|
|
633
|
-
td->
|
|
634
|
-
td->suspended_frame_depth = depth;
|
|
635
|
-
td->prev_cpu_ns = time_now;
|
|
713
|
+
td->prev_time_ns = time_now;
|
|
636
714
|
td->prev_wall_ns = wall_now;
|
|
637
715
|
}
|
|
638
716
|
|
|
@@ -659,29 +737,46 @@ rperf_handle_resumed(rperf_profiler_t *prof, VALUE thread)
|
|
|
659
737
|
|
|
660
738
|
int64_t wall_now = rperf_wall_time_ns();
|
|
661
739
|
|
|
662
|
-
/* Record GVL blocked/wait samples (wall mode only)
|
|
663
|
-
|
|
740
|
+
/* Record GVL blocked/wait samples (wall mode only).
|
|
741
|
+
* Capture backtrace here (not at SUSPENDED) so that frame_start always
|
|
742
|
+
* indexes into the current active buffer, avoiding mismatch after a
|
|
743
|
+
* double-buffer swap. The Ruby stack is unchanged while off-GVL.
|
|
744
|
+
*
|
|
745
|
+
* Both samples are written directly into the same buffer before calling
|
|
746
|
+
* rperf_try_swap, so that a swap triggered by the first sample cannot
|
|
747
|
+
* move the second into a different buffer with a stale frame_start. */
|
|
748
|
+
if (prof->mode == 1 && td->suspended_at_ns > 0) {
|
|
749
|
+
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
750
|
+
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) goto skip_gvl;
|
|
751
|
+
size_t frame_start = buf->frame_pool_count;
|
|
752
|
+
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
753
|
+
&buf->frame_pool[frame_start], NULL);
|
|
754
|
+
if (depth <= 0) goto skip_gvl;
|
|
755
|
+
buf->frame_pool_count += depth;
|
|
756
|
+
|
|
757
|
+
/* Write both samples into the same buf, then swap-check once */
|
|
664
758
|
if (td->ready_at_ns > 0 && td->ready_at_ns > td->suspended_at_ns) {
|
|
665
759
|
int64_t blocked_ns = td->ready_at_ns - td->suspended_at_ns;
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
|
|
760
|
+
rperf_write_sample(buf, frame_start, depth, blocked_ns,
|
|
761
|
+
RPERF_SAMPLE_GVL_BLOCKED, td->thread_seq);
|
|
669
762
|
}
|
|
670
763
|
if (td->ready_at_ns > 0 && wall_now > td->ready_at_ns) {
|
|
671
764
|
int64_t wait_ns = wall_now - td->ready_at_ns;
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
|
|
765
|
+
rperf_write_sample(buf, frame_start, depth, wait_ns,
|
|
766
|
+
RPERF_SAMPLE_GVL_WAIT, td->thread_seq);
|
|
675
767
|
}
|
|
768
|
+
|
|
769
|
+
rperf_try_swap(prof);
|
|
676
770
|
}
|
|
771
|
+
skip_gvl:
|
|
677
772
|
|
|
678
773
|
/* Reset prev times to current — next timer sample measures from resume */
|
|
679
774
|
int64_t time_now = rperf_current_time_ns(prof, td);
|
|
680
|
-
if (time_now >= 0) td->
|
|
775
|
+
if (time_now >= 0) td->prev_time_ns = time_now;
|
|
681
776
|
td->prev_wall_ns = wall_now;
|
|
682
777
|
|
|
683
778
|
/* Clear suspended state */
|
|
684
|
-
td->
|
|
779
|
+
td->suspended_at_ns = 0;
|
|
685
780
|
td->ready_at_ns = 0;
|
|
686
781
|
}
|
|
687
782
|
|
|
@@ -722,50 +817,52 @@ rperf_gc_event_hook(rb_event_flag_t event, VALUE data, VALUE self, ID id, VALUE
|
|
|
722
817
|
if (!prof->running) return;
|
|
723
818
|
|
|
724
819
|
if (event & RUBY_INTERNAL_EVENT_GC_START) {
|
|
725
|
-
prof->
|
|
820
|
+
prof->gc.phase = RPERF_GC_MARKING;
|
|
726
821
|
}
|
|
727
822
|
else if (event & RUBY_INTERNAL_EVENT_GC_END_MARK) {
|
|
728
|
-
prof->
|
|
823
|
+
prof->gc.phase = RPERF_GC_SWEEPING;
|
|
729
824
|
}
|
|
730
825
|
else if (event & RUBY_INTERNAL_EVENT_GC_END_SWEEP) {
|
|
731
|
-
prof->
|
|
826
|
+
prof->gc.phase = RPERF_GC_NONE;
|
|
732
827
|
}
|
|
733
828
|
else if (event & RUBY_INTERNAL_EVENT_GC_ENTER) {
|
|
734
|
-
/*
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
738
|
-
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
|
|
739
|
-
size_t frame_start = buf->frame_pool_count;
|
|
740
|
-
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
741
|
-
&buf->frame_pool[frame_start], NULL);
|
|
742
|
-
if (depth <= 0) {
|
|
743
|
-
prof->gc_frame_depth = 0;
|
|
744
|
-
return;
|
|
745
|
-
}
|
|
746
|
-
buf->frame_pool_count += depth;
|
|
747
|
-
prof->gc_frame_start = frame_start;
|
|
748
|
-
prof->gc_frame_depth = depth;
|
|
749
|
-
|
|
750
|
-
/* Save thread_seq for the GC_EXIT sample */
|
|
829
|
+
/* Save timestamp and thread_seq; backtrace is captured at GC_EXIT
|
|
830
|
+
* to avoid buffer mismatch after a double-buffer swap. */
|
|
831
|
+
prof->gc.enter_ns = rperf_wall_time_ns();
|
|
751
832
|
{
|
|
752
833
|
VALUE thread = rb_thread_current();
|
|
753
834
|
rperf_thread_data_t *td = (rperf_thread_data_t *)rb_internal_thread_specific_get(thread, prof->ts_key);
|
|
754
|
-
prof->
|
|
835
|
+
prof->gc.thread_seq = td ? td->thread_seq : 0;
|
|
755
836
|
}
|
|
756
837
|
}
|
|
757
838
|
else if (event & RUBY_INTERNAL_EVENT_GC_EXIT) {
|
|
758
|
-
if (prof->
|
|
839
|
+
if (prof->gc.enter_ns <= 0) return;
|
|
759
840
|
|
|
760
841
|
int64_t wall_now = rperf_wall_time_ns();
|
|
761
|
-
int64_t weight = wall_now - prof->
|
|
762
|
-
int type = (prof->
|
|
842
|
+
int64_t weight = wall_now - prof->gc.enter_ns;
|
|
843
|
+
int type = (prof->gc.phase == RPERF_GC_SWEEPING)
|
|
763
844
|
? RPERF_SAMPLE_GC_SWEEPING
|
|
764
845
|
: RPERF_SAMPLE_GC_MARKING;
|
|
765
846
|
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
847
|
+
/* Capture backtrace here (not at GC_ENTER) so that frame_start
|
|
848
|
+
* always indexes into the current active buffer. The Ruby stack
|
|
849
|
+
* is unchanged during GC. */
|
|
850
|
+
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
851
|
+
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) {
|
|
852
|
+
prof->gc.enter_ns = 0;
|
|
853
|
+
return;
|
|
854
|
+
}
|
|
855
|
+
size_t frame_start = buf->frame_pool_count;
|
|
856
|
+
int depth = rb_profile_frames(0, RPERF_MAX_STACK_DEPTH,
|
|
857
|
+
&buf->frame_pool[frame_start], NULL);
|
|
858
|
+
if (depth <= 0) {
|
|
859
|
+
prof->gc.enter_ns = 0;
|
|
860
|
+
return;
|
|
861
|
+
}
|
|
862
|
+
buf->frame_pool_count += depth;
|
|
863
|
+
|
|
864
|
+
rperf_record_sample(prof, frame_start, depth, weight, type, prof->gc.thread_seq);
|
|
865
|
+
prof->gc.enter_ns = 0;
|
|
769
866
|
}
|
|
770
867
|
}
|
|
771
868
|
|
|
@@ -795,14 +892,14 @@ rperf_sample_job(void *arg)
|
|
|
795
892
|
int64_t time_now = rperf_current_time_ns(prof, td);
|
|
796
893
|
if (time_now < 0) return;
|
|
797
894
|
|
|
798
|
-
int64_t weight = time_now - td->
|
|
799
|
-
td->
|
|
895
|
+
int64_t weight = time_now - td->prev_time_ns;
|
|
896
|
+
td->prev_time_ns = time_now;
|
|
800
897
|
td->prev_wall_ns = rperf_wall_time_ns();
|
|
801
898
|
|
|
802
899
|
if (weight <= 0) return;
|
|
803
900
|
|
|
804
901
|
/* Capture backtrace and record sample */
|
|
805
|
-
rperf_sample_buffer_t *buf = &prof->buffers[prof->active_idx];
|
|
902
|
+
rperf_sample_buffer_t *buf = &prof->buffers[atomic_load_explicit(&prof->active_idx, memory_order_relaxed)];
|
|
806
903
|
if (rperf_ensure_frame_pool_capacity(buf, RPERF_MAX_STACK_DEPTH) < 0) return;
|
|
807
904
|
|
|
808
905
|
size_t frame_start = buf->frame_pool_count;
|
|
@@ -814,8 +911,8 @@ rperf_sample_job(void *arg)
|
|
|
814
911
|
rperf_record_sample(prof, frame_start, depth, weight, RPERF_SAMPLE_NORMAL, td->thread_seq);
|
|
815
912
|
|
|
816
913
|
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts_end);
|
|
817
|
-
prof->sampling_count++;
|
|
818
|
-
prof->sampling_total_ns +=
|
|
914
|
+
prof->stats.sampling_count++;
|
|
915
|
+
prof->stats.sampling_total_ns +=
|
|
819
916
|
((int64_t)ts_end.tv_sec - ts_start.tv_sec) * 1000000000LL +
|
|
820
917
|
(ts_end.tv_nsec - ts_start.tv_nsec);
|
|
821
918
|
}
|
|
@@ -826,7 +923,7 @@ rperf_sample_job(void *arg)
|
|
|
826
923
|
static void
|
|
827
924
|
rperf_signal_handler(int sig)
|
|
828
925
|
{
|
|
829
|
-
g_profiler.trigger_count++;
|
|
926
|
+
g_profiler.stats.trigger_count++;
|
|
830
927
|
rb_postponed_job_trigger(g_profiler.pj_handle);
|
|
831
928
|
}
|
|
832
929
|
|
|
@@ -845,7 +942,8 @@ rperf_worker_signal_func(void *arg)
|
|
|
845
942
|
CHECKED(pthread_cond_signal(&prof->worker_cond));
|
|
846
943
|
|
|
847
944
|
while (prof->running) {
|
|
848
|
-
|
|
945
|
+
while (prof->running && !atomic_load_explicit(&prof->swap_ready, memory_order_acquire))
|
|
946
|
+
CHECKED(pthread_cond_wait(&prof->worker_cond, &prof->worker_mutex));
|
|
849
947
|
rperf_try_aggregate(prof);
|
|
850
948
|
}
|
|
851
949
|
CHECKED(pthread_mutex_unlock(&prof->worker_mutex));
|
|
@@ -874,9 +972,12 @@ rperf_worker_nanosleep_func(void *arg)
|
|
|
874
972
|
CHECKED(pthread_mutex_lock(&prof->worker_mutex));
|
|
875
973
|
while (prof->running) {
|
|
876
974
|
int ret = pthread_cond_timedwait(&prof->worker_cond, &prof->worker_mutex, &deadline);
|
|
877
|
-
|
|
975
|
+
if (ret != 0 && ret != ETIMEDOUT) {
|
|
976
|
+
fprintf(stderr, "rperf: pthread_cond_timedwait failed: %s\n", strerror(ret));
|
|
977
|
+
abort();
|
|
978
|
+
}
|
|
878
979
|
if (ret == ETIMEDOUT) {
|
|
879
|
-
prof->trigger_count++;
|
|
980
|
+
prof->stats.trigger_count++;
|
|
880
981
|
rb_postponed_job_trigger(prof->pj_handle);
|
|
881
982
|
/* Advance deadline by interval */
|
|
882
983
|
deadline.tv_nsec += interval_ns;
|
|
@@ -900,8 +1001,6 @@ rperf_resolve_frame(VALUE fval)
|
|
|
900
1001
|
VALUE label = rb_profile_frame_full_label(fval);
|
|
901
1002
|
|
|
902
1003
|
if (NIL_P(path)) path = rb_str_new_lit("<C method>");
|
|
903
|
-
|
|
904
|
-
if (NIL_P(path)) path = rb_str_new_cstr("");
|
|
905
1004
|
if (NIL_P(label)) label = rb_str_new_cstr("");
|
|
906
1005
|
|
|
907
1006
|
return rb_ary_new3(2, path, label);
|
|
@@ -909,58 +1008,23 @@ rperf_resolve_frame(VALUE fval)
|
|
|
909
1008
|
|
|
910
1009
|
/* ---- Ruby API ---- */
|
|
911
1010
|
|
|
1011
|
+
/* _c_start(frequency, mode, aggregate, signal)
|
|
1012
|
+
* frequency: Integer (Hz)
|
|
1013
|
+
* mode: 0 = cpu, 1 = wall
|
|
1014
|
+
* aggregate: 0 or 1
|
|
1015
|
+
* signal: Integer (RT signal number, 0 = nanosleep, -1 = default)
|
|
1016
|
+
*/
|
|
912
1017
|
static VALUE
|
|
913
|
-
rb_rperf_start(
|
|
1018
|
+
rb_rperf_start(VALUE self, VALUE vfreq, VALUE vmode, VALUE vagg, VALUE vsig)
|
|
914
1019
|
{
|
|
915
|
-
|
|
916
|
-
int
|
|
917
|
-
int
|
|
918
|
-
int aggregate = 1; /* default: aggregate */
|
|
1020
|
+
int frequency = NUM2INT(vfreq);
|
|
1021
|
+
int mode = NUM2INT(vmode);
|
|
1022
|
+
int aggregate = RTEST(vagg) ? 1 : 0;
|
|
919
1023
|
#if RPERF_USE_TIMER_SIGNAL
|
|
920
|
-
int
|
|
1024
|
+
int sig = NUM2INT(vsig);
|
|
1025
|
+
int timer_signal = (sig < 0) ? RPERF_TIMER_SIGNAL_DEFAULT : sig;
|
|
921
1026
|
#endif
|
|
922
1027
|
|
|
923
|
-
rb_scan_args(argc, argv, ":", &opts);
|
|
924
|
-
if (!NIL_P(opts)) {
|
|
925
|
-
VALUE vagg = rb_hash_aref(opts, ID2SYM(rb_intern("aggregate")));
|
|
926
|
-
if (!NIL_P(vagg)) {
|
|
927
|
-
aggregate = RTEST(vagg) ? 1 : 0;
|
|
928
|
-
}
|
|
929
|
-
VALUE vfreq = rb_hash_aref(opts, ID2SYM(rb_intern("frequency")));
|
|
930
|
-
if (!NIL_P(vfreq)) {
|
|
931
|
-
frequency = NUM2INT(vfreq);
|
|
932
|
-
if (frequency <= 0 || frequency > 1000000) {
|
|
933
|
-
rb_raise(rb_eArgError, "frequency must be between 1 and 1000000");
|
|
934
|
-
}
|
|
935
|
-
}
|
|
936
|
-
VALUE vmode = rb_hash_aref(opts, ID2SYM(rb_intern("mode")));
|
|
937
|
-
if (!NIL_P(vmode)) {
|
|
938
|
-
ID mode_id = SYM2ID(vmode);
|
|
939
|
-
if (mode_id == rb_intern("cpu")) {
|
|
940
|
-
mode = 0;
|
|
941
|
-
} else if (mode_id == rb_intern("wall")) {
|
|
942
|
-
mode = 1;
|
|
943
|
-
} else {
|
|
944
|
-
rb_raise(rb_eArgError, "mode must be :cpu or :wall");
|
|
945
|
-
}
|
|
946
|
-
}
|
|
947
|
-
#if RPERF_USE_TIMER_SIGNAL
|
|
948
|
-
VALUE vsig = rb_hash_aref(opts, ID2SYM(rb_intern("signal")));
|
|
949
|
-
if (!NIL_P(vsig)) {
|
|
950
|
-
if (RTEST(vsig)) {
|
|
951
|
-
timer_signal = NUM2INT(vsig);
|
|
952
|
-
if (timer_signal < SIGRTMIN || timer_signal > SIGRTMAX) {
|
|
953
|
-
rb_raise(rb_eArgError, "signal must be between SIGRTMIN(%d) and SIGRTMAX(%d)",
|
|
954
|
-
SIGRTMIN, SIGRTMAX);
|
|
955
|
-
}
|
|
956
|
-
} else {
|
|
957
|
-
/* signal: false or signal: 0 → use nanosleep thread */
|
|
958
|
-
timer_signal = 0;
|
|
959
|
-
}
|
|
960
|
-
}
|
|
961
|
-
#endif
|
|
962
|
-
}
|
|
963
|
-
|
|
964
1028
|
if (g_profiler.running) {
|
|
965
1029
|
rb_raise(rb_eRuntimeError, "Rperf is already running");
|
|
966
1030
|
}
|
|
@@ -969,11 +1033,11 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
969
1033
|
g_profiler.mode = mode;
|
|
970
1034
|
g_profiler.aggregate = aggregate;
|
|
971
1035
|
g_profiler.next_thread_seq = 0;
|
|
972
|
-
g_profiler.sampling_count = 0;
|
|
973
|
-
g_profiler.sampling_total_ns = 0;
|
|
974
|
-
g_profiler.trigger_count = 0;
|
|
975
|
-
g_profiler.active_idx
|
|
976
|
-
g_profiler.swap_ready
|
|
1036
|
+
g_profiler.stats.sampling_count = 0;
|
|
1037
|
+
g_profiler.stats.sampling_total_ns = 0;
|
|
1038
|
+
g_profiler.stats.trigger_count = 0;
|
|
1039
|
+
atomic_store_explicit(&g_profiler.active_idx, 0, memory_order_relaxed);
|
|
1040
|
+
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
977
1041
|
|
|
978
1042
|
/* Initialize worker mutex/cond */
|
|
979
1043
|
CHECKED(pthread_mutex_init(&g_profiler.worker_mutex, NULL));
|
|
@@ -994,13 +1058,26 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
994
1058
|
}
|
|
995
1059
|
|
|
996
1060
|
/* Initialize aggregation structures */
|
|
997
|
-
rperf_frame_table_init(&g_profiler.frame_table)
|
|
998
|
-
|
|
1061
|
+
if (rperf_frame_table_init(&g_profiler.frame_table) < 0) {
|
|
1062
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
1063
|
+
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1064
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1065
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
1066
|
+
rb_raise(rb_eNoMemError, "rperf: failed to allocate frame table");
|
|
1067
|
+
}
|
|
1068
|
+
if (rperf_agg_table_init(&g_profiler.agg_table) < 0) {
|
|
1069
|
+
rperf_frame_table_free(&g_profiler.frame_table);
|
|
1070
|
+
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
1071
|
+
rperf_sample_buffer_free(&g_profiler.buffers[1]);
|
|
1072
|
+
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1073
|
+
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
1074
|
+
rb_raise(rb_eNoMemError, "rperf: failed to allocate aggregation table");
|
|
1075
|
+
}
|
|
999
1076
|
}
|
|
1000
1077
|
|
|
1001
1078
|
/* Register GC event hook */
|
|
1002
|
-
g_profiler.
|
|
1003
|
-
g_profiler.
|
|
1079
|
+
g_profiler.gc.phase = RPERF_GC_NONE;
|
|
1080
|
+
g_profiler.gc.enter_ns = 0;
|
|
1004
1081
|
rb_add_event_hook(rperf_gc_event_hook,
|
|
1005
1082
|
RUBY_INTERNAL_EVENT_GC_START |
|
|
1006
1083
|
RUBY_INTERNAL_EVENT_GC_END_MARK |
|
|
@@ -1023,6 +1100,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
1023
1100
|
VALUE cur_thread = rb_thread_current();
|
|
1024
1101
|
rperf_thread_data_t *td = rperf_thread_data_create(&g_profiler, cur_thread);
|
|
1025
1102
|
if (!td) {
|
|
1103
|
+
rb_remove_event_hook(rperf_gc_event_hook);
|
|
1026
1104
|
rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
|
|
1027
1105
|
g_profiler.thread_hook = NULL;
|
|
1028
1106
|
if (g_profiler.aggregate) {
|
|
@@ -1053,14 +1131,17 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
1053
1131
|
memset(&sa, 0, sizeof(sa));
|
|
1054
1132
|
sa.sa_handler = rperf_signal_handler;
|
|
1055
1133
|
sa.sa_flags = SA_RESTART;
|
|
1056
|
-
sigaction(g_profiler.timer_signal, &sa,
|
|
1134
|
+
if (sigaction(g_profiler.timer_signal, &sa, &g_profiler.old_sigaction) != 0) {
|
|
1135
|
+
g_profiler.running = 0;
|
|
1136
|
+
goto timer_fail;
|
|
1137
|
+
}
|
|
1057
1138
|
|
|
1058
1139
|
/* Start worker thread first to get its kernel TID */
|
|
1059
1140
|
g_profiler.worker_tid = 0;
|
|
1060
1141
|
if (pthread_create(&g_profiler.worker_thread, NULL,
|
|
1061
1142
|
rperf_worker_signal_func, &g_profiler) != 0) {
|
|
1062
1143
|
g_profiler.running = 0;
|
|
1063
|
-
|
|
1144
|
+
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1064
1145
|
goto timer_fail;
|
|
1065
1146
|
}
|
|
1066
1147
|
|
|
@@ -1078,7 +1159,7 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
1078
1159
|
sev._sigev_un._tid = g_profiler.worker_tid;
|
|
1079
1160
|
if (timer_create(CLOCK_MONOTONIC, &sev, &g_profiler.timer_id) != 0) {
|
|
1080
1161
|
g_profiler.running = 0;
|
|
1081
|
-
|
|
1162
|
+
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1082
1163
|
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1083
1164
|
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1084
1165
|
goto timer_fail;
|
|
@@ -1087,7 +1168,14 @@ rb_rperf_start(int argc, VALUE *argv, VALUE self)
|
|
|
1087
1168
|
its.it_value.tv_sec = 0;
|
|
1088
1169
|
its.it_value.tv_nsec = 1000000000L / g_profiler.frequency;
|
|
1089
1170
|
its.it_interval = its.it_value;
|
|
1090
|
-
timer_settime(g_profiler.timer_id, 0, &its, NULL)
|
|
1171
|
+
if (timer_settime(g_profiler.timer_id, 0, &its, NULL) != 0) {
|
|
1172
|
+
timer_delete(g_profiler.timer_id);
|
|
1173
|
+
g_profiler.running = 0;
|
|
1174
|
+
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1175
|
+
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1176
|
+
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1177
|
+
goto timer_fail;
|
|
1178
|
+
}
|
|
1091
1179
|
} else
|
|
1092
1180
|
#endif
|
|
1093
1181
|
{
|
|
@@ -1109,6 +1197,7 @@ timer_fail:
|
|
|
1109
1197
|
rb_internal_thread_specific_set(cur, g_profiler.ts_key, NULL);
|
|
1110
1198
|
}
|
|
1111
1199
|
}
|
|
1200
|
+
rb_remove_event_hook(rperf_gc_event_hook);
|
|
1112
1201
|
rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
|
|
1113
1202
|
g_profiler.thread_hook = NULL;
|
|
1114
1203
|
if (g_profiler.aggregate) {
|
|
@@ -1139,17 +1228,28 @@ rb_rperf_stop(VALUE self)
|
|
|
1139
1228
|
g_profiler.running = 0;
|
|
1140
1229
|
#if RPERF_USE_TIMER_SIGNAL
|
|
1141
1230
|
if (g_profiler.timer_signal > 0) {
|
|
1231
|
+
/* Delete timer first to stop generating new signals.
|
|
1232
|
+
* Do NOT restore signal handler yet — the worker thread may still have
|
|
1233
|
+
* pending timer signals. rperf_signal_handler handles them harmlessly. */
|
|
1142
1234
|
timer_delete(g_profiler.timer_id);
|
|
1143
|
-
signal(g_profiler.timer_signal, SIG_IGN);
|
|
1144
1235
|
}
|
|
1145
1236
|
#endif
|
|
1146
1237
|
|
|
1147
|
-
/* Wake and join worker thread
|
|
1238
|
+
/* Wake and join worker thread.
|
|
1239
|
+
* Any pending timer signals are still handled by rperf_signal_handler
|
|
1240
|
+
* (just increments trigger_count + calls rb_postponed_job_trigger). */
|
|
1148
1241
|
CHECKED(pthread_cond_signal(&g_profiler.worker_cond));
|
|
1149
1242
|
CHECKED(pthread_join(g_profiler.worker_thread, NULL));
|
|
1150
1243
|
CHECKED(pthread_mutex_destroy(&g_profiler.worker_mutex));
|
|
1151
1244
|
CHECKED(pthread_cond_destroy(&g_profiler.worker_cond));
|
|
1152
1245
|
|
|
1246
|
+
#if RPERF_USE_TIMER_SIGNAL
|
|
1247
|
+
if (g_profiler.timer_signal > 0) {
|
|
1248
|
+
/* Worker thread is gone — safe to restore old signal handler now. */
|
|
1249
|
+
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1250
|
+
}
|
|
1251
|
+
#endif
|
|
1252
|
+
|
|
1153
1253
|
if (g_profiler.thread_hook) {
|
|
1154
1254
|
rb_internal_thread_remove_event_hook(g_profiler.thread_hook);
|
|
1155
1255
|
g_profiler.thread_hook = NULL;
|
|
@@ -1159,13 +1259,15 @@ rb_rperf_stop(VALUE self)
|
|
|
1159
1259
|
rb_remove_event_hook(rperf_gc_event_hook);
|
|
1160
1260
|
|
|
1161
1261
|
if (g_profiler.aggregate) {
|
|
1262
|
+
/* Worker thread is joined; no concurrent access to these atomics. */
|
|
1263
|
+
int cur_idx = atomic_load_explicit(&g_profiler.active_idx, memory_order_relaxed);
|
|
1162
1264
|
/* Aggregate remaining samples from both buffers */
|
|
1163
|
-
if (g_profiler.swap_ready) {
|
|
1164
|
-
int standby_idx =
|
|
1265
|
+
if (atomic_load_explicit(&g_profiler.swap_ready, memory_order_relaxed)) {
|
|
1266
|
+
int standby_idx = cur_idx ^ 1;
|
|
1165
1267
|
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[standby_idx]);
|
|
1166
|
-
g_profiler.swap_ready
|
|
1268
|
+
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
1167
1269
|
}
|
|
1168
|
-
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[
|
|
1270
|
+
rperf_aggregate_buffer(&g_profiler, &g_profiler.buffers[cur_idx]);
|
|
1169
1271
|
}
|
|
1170
1272
|
|
|
1171
1273
|
/* Clean up thread-specific data for all live threads */
|
|
@@ -1193,10 +1295,11 @@ rb_rperf_stop(VALUE self)
|
|
|
1193
1295
|
/* frequency */
|
|
1194
1296
|
rb_hash_aset(result, ID2SYM(rb_intern("frequency")), INT2NUM(g_profiler.frequency));
|
|
1195
1297
|
|
|
1196
|
-
/* trigger_count, sampling_count, sampling_time_ns */
|
|
1197
|
-
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.trigger_count));
|
|
1198
|
-
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.sampling_count));
|
|
1199
|
-
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.sampling_total_ns));
|
|
1298
|
+
/* trigger_count, sampling_count, sampling_time_ns, detected_thread_count */
|
|
1299
|
+
rb_hash_aset(result, ID2SYM(rb_intern("trigger_count")), SIZET2NUM(g_profiler.stats.trigger_count));
|
|
1300
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_count")), SIZET2NUM(g_profiler.stats.sampling_count));
|
|
1301
|
+
rb_hash_aset(result, ID2SYM(rb_intern("sampling_time_ns")), LONG2NUM(g_profiler.stats.sampling_total_ns));
|
|
1302
|
+
rb_hash_aset(result, ID2SYM(rb_intern("detected_thread_count")), INT2NUM(g_profiler.next_thread_seq));
|
|
1200
1303
|
|
|
1201
1304
|
/* aggregation stats */
|
|
1202
1305
|
if (g_profiler.aggregate) {
|
|
@@ -1231,7 +1334,7 @@ rb_rperf_stop(VALUE self)
|
|
|
1231
1334
|
rb_ary_push(resolved_ary, rb_ary_new3(2, rb_str_new_lit("<GC>"), rb_str_new_lit("[GC sweeping]")));
|
|
1232
1335
|
/* Real frames */
|
|
1233
1336
|
for (i = RPERF_SYNTHETIC_COUNT; i < ft->count; i++) {
|
|
1234
|
-
rb_ary_push(resolved_ary, rperf_resolve_frame(ft->keys[i]));
|
|
1337
|
+
rb_ary_push(resolved_ary, rperf_resolve_frame(atomic_load_explicit(&ft->keys, memory_order_relaxed)[i]));
|
|
1235
1338
|
}
|
|
1236
1339
|
|
|
1237
1340
|
rperf_agg_table_t *at = &g_profiler.agg_table;
|
|
@@ -1285,7 +1388,9 @@ rb_rperf_stop(VALUE self)
|
|
|
1285
1388
|
rb_ary_push(samples_ary, sample);
|
|
1286
1389
|
}
|
|
1287
1390
|
}
|
|
1288
|
-
rb_hash_aset(result,
|
|
1391
|
+
rb_hash_aset(result,
|
|
1392
|
+
ID2SYM(rb_intern(g_profiler.aggregate ? "aggregated_samples" : "raw_samples")),
|
|
1393
|
+
samples_ary);
|
|
1289
1394
|
|
|
1290
1395
|
/* Cleanup */
|
|
1291
1396
|
rperf_sample_buffer_free(&g_profiler.buffers[0]);
|
|
@@ -1304,9 +1409,20 @@ rperf_after_fork_child(void)
|
|
|
1304
1409
|
g_profiler.running = 0;
|
|
1305
1410
|
|
|
1306
1411
|
#if RPERF_USE_TIMER_SIGNAL
|
|
1307
|
-
/* timer_create timers are not inherited across fork
|
|
1412
|
+
/* timer_create timers are not inherited across fork, but pending signals may be.
|
|
1413
|
+
* Block the signal, drain any pending instances, then restore old handler. */
|
|
1308
1414
|
if (g_profiler.timer_signal > 0) {
|
|
1309
|
-
|
|
1415
|
+
sigset_t block_set, old_set;
|
|
1416
|
+
struct timespec zero_ts = {0, 0};
|
|
1417
|
+
|
|
1418
|
+
sigemptyset(&block_set);
|
|
1419
|
+
sigaddset(&block_set, g_profiler.timer_signal);
|
|
1420
|
+
pthread_sigmask(SIG_BLOCK, &block_set, &old_set);
|
|
1421
|
+
|
|
1422
|
+
while (sigtimedwait(&block_set, NULL, &zero_ts) > 0) {}
|
|
1423
|
+
|
|
1424
|
+
sigaction(g_profiler.timer_signal, &g_profiler.old_sigaction, NULL);
|
|
1425
|
+
pthread_sigmask(SIG_SETMASK, &old_set, NULL);
|
|
1310
1426
|
}
|
|
1311
1427
|
#endif
|
|
1312
1428
|
|
|
@@ -1326,12 +1442,13 @@ rperf_after_fork_child(void)
|
|
|
1326
1442
|
}
|
|
1327
1443
|
|
|
1328
1444
|
/* Reset GC state */
|
|
1329
|
-
g_profiler.
|
|
1445
|
+
g_profiler.gc.phase = 0;
|
|
1446
|
+
g_profiler.gc.enter_ns = 0;
|
|
1330
1447
|
|
|
1331
1448
|
/* Reset stats */
|
|
1332
|
-
g_profiler.sampling_count = 0;
|
|
1333
|
-
g_profiler.sampling_total_ns = 0;
|
|
1334
|
-
g_profiler.swap_ready
|
|
1449
|
+
g_profiler.stats.sampling_count = 0;
|
|
1450
|
+
g_profiler.stats.sampling_total_ns = 0;
|
|
1451
|
+
atomic_store_explicit(&g_profiler.swap_ready, 0, memory_order_relaxed);
|
|
1335
1452
|
}
|
|
1336
1453
|
|
|
1337
1454
|
/* ---- Init ---- */
|
|
@@ -1340,7 +1457,7 @@ void
|
|
|
1340
1457
|
Init_rperf(void)
|
|
1341
1458
|
{
|
|
1342
1459
|
VALUE mRperf = rb_define_module("Rperf");
|
|
1343
|
-
rb_define_module_function(mRperf, "_c_start", rb_rperf_start,
|
|
1460
|
+
rb_define_module_function(mRperf, "_c_start", rb_rperf_start, 4);
|
|
1344
1461
|
rb_define_module_function(mRperf, "_c_stop", rb_rperf_stop, 0);
|
|
1345
1462
|
|
|
1346
1463
|
memset(&g_profiler, 0, sizeof(g_profiler));
|
data/lib/rperf/version.rb
CHANGED
data/lib/rperf.rb
CHANGED
|
@@ -24,14 +24,25 @@ module Rperf
|
|
|
24
24
|
# .txt → text report (human/AI readable flat + cumulative table)
|
|
25
25
|
# otherwise (.pb.gz etc) → pprof protobuf (gzip compressed)
|
|
26
26
|
def self.start(frequency: 1000, mode: :cpu, output: nil, verbose: false, format: nil, stat: false, signal: nil, aggregate: true)
|
|
27
|
+
raise ArgumentError, "frequency must be a positive integer (got #{frequency.inspect})" unless frequency.is_a?(Integer) && frequency > 0
|
|
28
|
+
raise ArgumentError, "frequency must be <= 10000 (10KHz), got #{frequency}" if frequency > 10_000
|
|
29
|
+
raise ArgumentError, "mode must be :cpu or :wall, got #{mode.inspect}" unless %i[cpu wall].include?(mode)
|
|
30
|
+
c_mode = mode == :cpu ? 0 : 1
|
|
31
|
+
c_signal = signal.nil? ? -1 : (signal ? signal.to_i : 0)
|
|
32
|
+
if c_signal > 0
|
|
33
|
+
raise ArgumentError, "signal mode is only supported on Linux" unless RUBY_PLATFORM =~ /linux/
|
|
34
|
+
uncatchable = [Signal.list["KILL"], Signal.list["STOP"]].compact
|
|
35
|
+
if uncatchable.include?(c_signal)
|
|
36
|
+
name = Signal.signame(c_signal) rescue c_signal.to_s
|
|
37
|
+
raise ArgumentError, "signal #{c_signal} (#{name}) cannot be caught; use a different signal"
|
|
38
|
+
end
|
|
39
|
+
end
|
|
27
40
|
@verbose = verbose || ENV["RPERF_VERBOSE"] == "1"
|
|
28
41
|
@output = output
|
|
29
42
|
@format = format
|
|
30
43
|
@stat = stat
|
|
31
44
|
@stat_start_mono = Process.clock_gettime(Process::CLOCK_MONOTONIC) if @stat
|
|
32
|
-
|
|
33
|
-
c_opts[:signal] = signal unless signal.nil?
|
|
34
|
-
_c_start(**c_opts)
|
|
45
|
+
_c_start(frequency, c_mode, aggregate, c_signal)
|
|
35
46
|
|
|
36
47
|
if block_given?
|
|
37
48
|
begin
|
|
@@ -46,6 +57,21 @@ module Rperf
|
|
|
46
57
|
data = _c_stop
|
|
47
58
|
return unless data
|
|
48
59
|
|
|
60
|
+
# When aggregate: false, C extension returns :raw_samples but not
|
|
61
|
+
# :aggregated_samples. Build aggregated view so encoders always work.
|
|
62
|
+
if data[:raw_samples] && !data[:aggregated_samples]
|
|
63
|
+
merged = {}
|
|
64
|
+
data[:raw_samples].each do |frames, weight, thread_seq|
|
|
65
|
+
key = [frames, thread_seq || 0]
|
|
66
|
+
if merged.key?(key)
|
|
67
|
+
merged[key] += weight
|
|
68
|
+
else
|
|
69
|
+
merged[key] = weight
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
data[:aggregated_samples] = merged.map { |(frames, ts), w| [frames, w, ts] }
|
|
73
|
+
end
|
|
74
|
+
|
|
49
75
|
print_stats(data) if @verbose
|
|
50
76
|
print_stat(data) if @stat
|
|
51
77
|
|
|
@@ -148,7 +174,7 @@ module Rperf
|
|
|
148
174
|
|
|
149
175
|
# Samples from C are now [[path_str, label_str], ...], weight]
|
|
150
176
|
def self.print_top(data)
|
|
151
|
-
samples_raw = data[:
|
|
177
|
+
samples_raw = data[:aggregated_samples]
|
|
152
178
|
return if !samples_raw || samples_raw.empty?
|
|
153
179
|
|
|
154
180
|
result = compute_flat_cum(samples_raw)
|
|
@@ -180,7 +206,7 @@ module Rperf
|
|
|
180
206
|
private_constant :STAT_PCT_LINE, :STAT_LINE
|
|
181
207
|
|
|
182
208
|
def self.print_stat(data)
|
|
183
|
-
samples_raw = data[:
|
|
209
|
+
samples_raw = data[:aggregated_samples] || []
|
|
184
210
|
real_ns = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - @stat_start_mono) * 1_000_000_000).to_i
|
|
185
211
|
times = Process.times
|
|
186
212
|
user_ns = (times.utime * 1_000_000_000).to_i
|
|
@@ -198,7 +224,7 @@ module Rperf
|
|
|
198
224
|
if samples_raw.size > 0
|
|
199
225
|
breakdown, total_weight = compute_stat_breakdown(samples_raw)
|
|
200
226
|
print_stat_breakdown(breakdown, total_weight)
|
|
201
|
-
print_stat_runtime_info
|
|
227
|
+
print_stat_runtime_info(data)
|
|
202
228
|
print_stat_system_info
|
|
203
229
|
print_stat_report(data) if ENV["RPERF_STAT_REPORT"] == "1"
|
|
204
230
|
print_stat_footer(samples_raw, real_ns, data)
|
|
@@ -246,7 +272,9 @@ module Rperf
|
|
|
246
272
|
end
|
|
247
273
|
private_class_method :print_stat_breakdown
|
|
248
274
|
|
|
249
|
-
def self.print_stat_runtime_info
|
|
275
|
+
def self.print_stat_runtime_info(data)
|
|
276
|
+
thread_count = data[:detected_thread_count] || 0
|
|
277
|
+
$stderr.puts STAT_LINE.call(format_integer(thread_count), " ", "[Ruby] detected threads") if thread_count > 0
|
|
250
278
|
gc = GC.stat
|
|
251
279
|
$stderr.puts STAT_LINE.call(format_ms(gc[:time] * 1_000_000), "ms",
|
|
252
280
|
"[Ruby] GC time (%s count: %s minor, %s major)" % [
|
|
@@ -391,7 +419,7 @@ module Rperf
|
|
|
391
419
|
module_function
|
|
392
420
|
|
|
393
421
|
def encode(data, top_n: 50, header: true)
|
|
394
|
-
samples_raw = data[:
|
|
422
|
+
samples_raw = data[:aggregated_samples]
|
|
395
423
|
mode = data[:mode] || :cpu
|
|
396
424
|
frequency = data[:frequency] || 0
|
|
397
425
|
|
|
@@ -433,8 +461,10 @@ module Rperf
|
|
|
433
461
|
module_function
|
|
434
462
|
|
|
435
463
|
def encode(data)
|
|
464
|
+
samples = data[:aggregated_samples]
|
|
465
|
+
return "" if !samples || samples.empty?
|
|
436
466
|
merged = Hash.new(0)
|
|
437
|
-
|
|
467
|
+
samples.each do |frames, weight|
|
|
438
468
|
key = frames.reverse.map { |_, label| label }.join(";")
|
|
439
469
|
merged[key] += weight
|
|
440
470
|
end
|
|
@@ -451,7 +481,7 @@ module Rperf
|
|
|
451
481
|
module_function
|
|
452
482
|
|
|
453
483
|
def encode(data)
|
|
454
|
-
samples_raw = data[:
|
|
484
|
+
samples_raw = data[:aggregated_samples]
|
|
455
485
|
frequency = data[:frequency]
|
|
456
486
|
interval_ns = 1_000_000_000 / frequency
|
|
457
487
|
mode = data[:mode] || :cpu
|
|
@@ -537,7 +567,7 @@ module Rperf
|
|
|
537
567
|
|
|
538
568
|
# field 6: string_table (repeated string)
|
|
539
569
|
string_table.each do |s|
|
|
540
|
-
buf << encode_bytes(6, s.encode("UTF-8"))
|
|
570
|
+
buf << encode_bytes(6, s.encode("UTF-8", invalid: :replace, undef: :replace))
|
|
541
571
|
end
|
|
542
572
|
|
|
543
573
|
# field 9: time_nanos (int64)
|