hyperion-rb 2.16.3 → 2.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ //! `IORING_REGISTER_PBUF_RING` (Linux 5.19+) — kernel-managed receive
2
+ //! buffer pool. The ring registers N buffers of M bytes each; the
3
+ //! kernel hands back a buffer-id in each recv CQE. Caller borrows the
4
+ //! buffer (zero-copy view), consumes the bytes, then `release`s the
5
+ //! buffer-id back to the kernel so it can be refilled.
6
+ //!
7
+ //! Plan #2 (io_uring hot-path roadmap), Task 2.1.2. Linux-only;
8
+ //! non-Linux builds see `stub_impl` which always returns ENOSYS from
9
+ //! `new()` so the caller can fall through to the accept4 path cleanly.
10
+ //!
11
+ //! ## Memory layout
12
+ //!
13
+ //! PBUF_RING uses a single contiguous, **page-aligned** memory region as
14
+ //! a producer/consumer ring of `io_uring_buf` entries (16 bytes each:
15
+ //! `addr:u64 | len:u32 | bid:u16 | resv:u16`). The kernel treats
16
+ //! `ring[0].resv` as the tail counter it polls to discover newly-released
17
+ //! buffers; userspace increments it (with Release ordering) after writing
18
+ //! the entry's addr/len/bid. The actual receive data lands in a
19
+ //! separate `backing` allocation whose slices are pointed to by the ring
20
+ //! entries.
21
+ //!
22
+ //! The ring and backing allocations are kept alive by this struct.
23
+ //! The kernel's registration holds a reference to the *ring* memory; if
24
+ //! this BufferRing is dropped while the buffer ring is still registered,
25
+ //! the kernel keeps pointers into freed memory — the caller (Task
26
+ //! 2.1.3's `HotpathRing`) is responsible for drop ordering.
27
+
28
+ #[cfg(target_os = "linux")]
29
+ mod linux_impl {
30
+ use io_uring::{types::BufRingEntry, IoUring, squeue, cqueue};
31
+ use std::alloc::{alloc_zeroed, dealloc, Layout};
32
+ use std::sync::atomic::{AtomicU16, Ordering};
33
+
34
+ /// Kernel-managed receive buffer pool for one io_uring instance.
35
+ ///
36
+ /// `group_id` (`bgid`) identifies the pool; recv SQEs reference it so
37
+ /// the kernel knows which pool to pull a buffer from and return the
38
+ /// buffer-id in `cqe.flags >> IORING_CQE_BUFFER_SHIFT`.
39
+ pub struct BufferRing {
40
+ /// Buffer group id passed to recv SQEs and to `register_buf_ring`.
41
+ pub group_id: u16,
42
+ /// Number of buffers in the ring. Must be a power of two; the
43
+ /// kernel enforces `ring_entries <= 32768`.
44
+ pub n_bufs: u16,
45
+ /// Size of each individual receive buffer in bytes.
46
+ pub buf_size: u32,
47
+
48
+ /// Page-aligned ring memory: N `BufRingEntry` (16 bytes each).
49
+ /// The kernel reads from this to discover available buffers.
50
+ /// Must stay pinned until the ring is unregistered.
51
+ ring_ptr: *mut BufRingEntry,
52
+ ring_layout: Layout,
53
+
54
+ /// Backing storage for the actual receive data. Slice `buf_id`
55
+ /// starts at `buf_id as usize * buf_size as usize`.
56
+ backing_ptr: *mut u8,
57
+ backing_layout: Layout,
58
+
59
+ /// Shadow of the tail counter. The authoritative tail lives at
60
+ /// `ring[0].resv` — this mirror lets `release` compute the slot
61
+ /// index without re-reading the (volatile) kernel-shared field.
62
+ /// AtomicU16 for forward-compatibility with a future SQPOLL path
63
+ /// that might race; under the GVL today a Cell<u16> would suffice.
64
+ tail: AtomicU16,
65
+ }
66
+
67
+ // SAFETY: BufferRing owns its raw allocations and the ring_ptr /
68
+ // backing_ptr are not shared across threads (one ring per worker
69
+ // process; the GVL is held during every call into this struct).
70
+ unsafe impl Send for BufferRing {}
71
+
72
+ impl BufferRing {
73
+ /// Allocate the ring and backing memory, then register the buffer
74
+ /// ring with the kernel via `IORING_REGISTER_PBUF_RING`.
75
+ ///
76
+ /// Returns `Err` with the OS errno on kernel rejection (e.g.
77
+ /// `EINVAL` if `n_bufs` is not a power of two or exceeds 32768,
78
+ /// `ENOSYS` on kernels < 5.19, `EPERM` under seccomp, etc.).
79
+ ///
80
+ /// # Panics
81
+ ///
82
+ /// Panics if `n_bufs == 0` or `buf_size == 0` (programming error).
83
+ pub fn new(
84
+ ring: &mut IoUring<squeue::Entry, cqueue::Entry>,
85
+ group_id: u16,
86
+ n_bufs: u16,
87
+ buf_size: u32,
88
+ ) -> std::io::Result<Self> {
89
+ assert!(n_bufs > 0, "n_bufs must be > 0");
90
+ assert!(buf_size > 0, "buf_size must be > 0");
91
+
92
+ // --- Allocate the page-aligned ring entries ---
93
+ //
94
+ // The kernel requires the ring base address to be page-aligned.
95
+ // Each BufRingEntry is 16 bytes (size_of::<io_uring_buf>()).
96
+ let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as usize };
97
+ let ring_bytes = (n_bufs as usize) * std::mem::size_of::<BufRingEntry>();
98
+ // Round up to a full page so the allocation is page-aligned.
99
+ let ring_alloc_bytes = round_up(ring_bytes, page_size);
100
+ let ring_layout = Layout::from_size_align(ring_alloc_bytes, page_size)
101
+ .map_err(|_| std::io::Error::from_raw_os_error(libc::EINVAL))?;
102
+
103
+ // SAFETY: layout has non-zero size and valid alignment.
104
+ let ring_ptr = unsafe { alloc_zeroed(ring_layout) as *mut BufRingEntry };
105
+ if ring_ptr.is_null() {
106
+ return Err(std::io::Error::from_raw_os_error(libc::ENOMEM));
107
+ }
108
+
109
+ // --- Allocate the backing receive buffers ---
110
+ let backing_bytes = (n_bufs as usize) * (buf_size as usize);
111
+ // 64-byte alignment keeps each buffer on a cache line boundary.
112
+ let backing_layout = Layout::from_size_align(backing_bytes, 64)
113
+ .map_err(|_| {
114
+ unsafe { dealloc(ring_ptr as *mut u8, ring_layout) };
115
+ std::io::Error::from_raw_os_error(libc::EINVAL)
116
+ })?;
117
+ let backing_ptr = unsafe { alloc_zeroed(backing_layout) };
118
+ if backing_ptr.is_null() {
119
+ unsafe { dealloc(ring_ptr as *mut u8, ring_layout) };
120
+ return Err(std::io::Error::from_raw_os_error(libc::ENOMEM));
121
+ }
122
+
123
+ // --- Populate the ring entries before registration ---
124
+ //
125
+ // We must fill addr/len/bid for all N slots and set the initial
126
+ // tail (in ring[0].resv) to N so the kernel sees all buffers as
127
+ // available immediately after registration.
128
+ for i in 0..n_bufs {
129
+ let buf_offset = (i as usize) * (buf_size as usize);
130
+ // SAFETY: ring_ptr is valid for n_bufs entries.
131
+ let entry = unsafe { &mut *ring_ptr.add(i as usize) };
132
+ entry.set_addr(unsafe { backing_ptr.add(buf_offset) } as u64);
133
+ entry.set_len(buf_size);
134
+ entry.set_bid(i);
135
+ }
136
+ // Write the initial tail into ring[0].resv. The kernel begins
137
+ // reading from tail=0, so setting tail=n_bufs makes all N
138
+ // buffers available (the ring wraps modulo n_bufs).
139
+ // SAFETY: ring_ptr is valid; BufRingEntry::tail returns a pointer
140
+ // into the first entry's resv field.
141
+ unsafe {
142
+ let tail_ptr = BufRingEntry::tail(ring_ptr) as *mut u16;
143
+ tail_ptr.write_volatile(n_bufs);
144
+ }
145
+
146
+ // --- Register with the kernel ---
147
+ //
148
+ // io-uring 0.6.4: `Submitter::register_buf_ring(ring_addr, ring_entries, bgid)`.
149
+ // The kernel holds the registration until `unregister_buf_ring` or ring close.
150
+ unsafe {
151
+ ring.submitter()
152
+ .register_buf_ring(ring_ptr as u64, n_bufs, group_id)
153
+ .map_err(|e| {
154
+ // Free allocations on registration failure.
155
+ dealloc(backing_ptr, backing_layout);
156
+ dealloc(ring_ptr as *mut u8, ring_layout);
157
+ e
158
+ })?;
159
+ }
160
+
161
+ Ok(BufferRing {
162
+ group_id,
163
+ n_bufs,
164
+ buf_size,
165
+ ring_ptr,
166
+ ring_layout,
167
+ backing_ptr,
168
+ backing_layout,
169
+ // Mirror of the tail we just wrote.
170
+ tail: AtomicU16::new(n_bufs),
171
+ })
172
+ }
173
+
174
+ /// Borrow a read-only view into the kernel-filled buffer `buf_id`.
175
+ ///
176
+ /// The slice is valid until the next `release(buf_id)` call — the
177
+ /// kernel may overwrite the memory the moment the buffer is released.
178
+ /// Callers **must not** hold the slice across fiber yield points or
179
+ /// after calling `release`.
180
+ ///
181
+ /// # Safety
182
+ ///
183
+ /// - `buf_id` must be a valid id returned by a recv CQE on this ring.
184
+ /// - `len` must be `<= buf_size` (the kernel writes at most `buf_size`
185
+ /// bytes).
186
+ pub unsafe fn borrow(&self, buf_id: u16, len: usize) -> &[u8] {
187
+ debug_assert!((buf_id as usize) < (self.n_bufs as usize));
188
+ debug_assert!(len <= self.buf_size as usize);
189
+ let offset = (buf_id as usize) * (self.buf_size as usize);
190
+ // SAFETY: backing_ptr is valid for the full backing allocation;
191
+ // offset is within range by the invariants above.
192
+ std::slice::from_raw_parts(self.backing_ptr.add(offset), len)
193
+ }
194
+
195
+ /// Release `buf_id` back to the kernel.
196
+ ///
197
+ /// Re-writes the ring entry (addr/len/bid) and increments the tail
198
+ /// counter. No syscall is required — the kernel polls the tail in
199
+ /// shared memory.
200
+ pub fn release(&self, buf_id: u16) {
201
+ // Shadow tail is purely local state under the GVL; Relaxed is
202
+ // sufficient. The cross-domain ordering with the kernel is
203
+ // enforced by the explicit Release fence below before the
204
+ // tail-pointer store.
205
+ let shadow_tail = self.tail.fetch_add(1, Ordering::Relaxed);
206
+ let slot = (shadow_tail as usize) & (self.n_bufs as usize - 1);
207
+
208
+ // Re-publish the buffer at the slot.
209
+ let buf_offset = (buf_id as usize) * (self.buf_size as usize);
210
+ // SAFETY: ring_ptr is valid; slot < n_bufs by the mask above.
211
+ unsafe {
212
+ let entry = &mut *self.ring_ptr.add(slot);
213
+ entry.set_addr(self.backing_ptr.add(buf_offset) as u64);
214
+ entry.set_len(self.buf_size);
215
+ entry.set_bid(buf_id);
216
+ }
217
+
218
+ // Store-Release barrier: the slot writes above MUST be visible
219
+ // to the kernel before the tail increment is. write_volatile
220
+ // alone is not a barrier on ARM (DMB ST is needed); on x86 TSO
221
+ // makes this redundant but the fence is free there. Without
222
+ // this fence, ARM kernels could observe the tail increment
223
+ // before the slot writes and pick up stale buffer pointers.
224
+ // Mirrors liburing's io_uring_buf_ring_advance which uses
225
+ // smp_store_release on the tail.
226
+ std::sync::atomic::fence(Ordering::Release);
227
+ // SAFETY: ring_ptr is valid; tail() points to ring[0].resv.
228
+ unsafe {
229
+ let tail_ptr = BufRingEntry::tail(self.ring_ptr) as *mut u16;
230
+ // wrapping_add handles u16 overflow correctly (the kernel
231
+ // also uses wrapping arithmetic on this counter).
232
+ tail_ptr.write_volatile(shadow_tail.wrapping_add(1));
233
+ }
234
+ }
235
+
236
+ /// Accessors for callers that need read-only metadata.
237
+ pub fn group_id(&self) -> u16 { self.group_id }
238
+ pub fn n_bufs(&self) -> u16 { self.n_bufs }
239
+ pub fn buf_size(&self) -> u32 { self.buf_size }
240
+ }
241
+
242
+ impl Drop for BufferRing {
243
+ fn drop(&mut self) {
244
+ // CRITICAL CONTRACT for HotpathRing (Task 2.1.3):
245
+ //
246
+ // Before this Drop runs, the owner MUST have called
247
+ // `ring.submitter().unregister_buf_ring(self.group_id)` on the
248
+ // associated IoUring, OR have dropped the IoUring (which closes
249
+ // the ring fd and tears down the registration kernel-side).
250
+ //
251
+ // Otherwise the kernel retains a registration pointing to the
252
+ // memory we are about to free, and the next multishot recv CQE
253
+ // can write into freed userspace memory — a kernel-side
254
+ // use-after-free, NOT a benign leak.
255
+ //
256
+ // HotpathRing's own Drop impl must enforce the order:
257
+ // 1. unregister_buf_ring(group_id)
258
+ // 2. drop(BufferRing) ← this Drop runs here
259
+ // 3. drop(IoUring)
260
+ //
261
+ // SAFETY: backing_ptr / ring_ptr / *_layout are valid; the
262
+ // owner has guaranteed (per contract above) that the kernel
263
+ // is no longer accessing this memory.
264
+ unsafe { dealloc(self.backing_ptr, self.backing_layout) };
265
+ unsafe { dealloc(self.ring_ptr as *mut u8, self.ring_layout) };
266
+ }
267
+ }
268
+
269
/// Smallest multiple of `align` that is `>= n`.
///
/// `align` must be a power of two; the result is meaningless otherwise.
#[inline]
fn round_up(n: usize, align: usize) -> usize {
    // Bump `n` past the next boundary, then clear the low bits.
    let bumped = n + align - 1;
    let low_bits = align - 1;
    bumped & !low_bits
}
275
+ }
276
+
277
+ // ===== Non-Linux stub =====
278
+ //
279
+ // On Darwin / BSD the entire io-uring dep is gated out; we compile a
280
+ // zero-cost stub that always returns ENOSYS from `new()`. The Ruby
281
+ // caller probes with `IOUring.supported?` before reaching this code
282
+ // in practice, but the stub ensures the macOS cdylib links cleanly.
283
+
284
+ #[cfg(not(target_os = "linux"))]
285
mod stub_impl {
    /// `ENOSYS` as numbered on the compilation target.
    ///
    /// Errno numbers are not portable: Linux uses 38, but on Darwin and
    /// the BSD family 38 is `ENOTSOCK` and `ENOSYS` is 78. This module is
    /// only built for non-Linux targets, so a hard-coded 38 would report
    /// "Socket operation on non-socket" there. Targets not listed keep
    /// the historical value 38.
    const ENOSYS: i32 = if cfg!(any(
        target_os = "macos",
        target_os = "ios",
        target_os = "freebsd",
        target_os = "openbsd",
        target_os = "netbsd",
        target_os = "dragonfly"
    )) {
        78
    } else {
        38
    };

    /// Non-Linux stub — never instantiated in practice.
    pub struct BufferRing {
        pub group_id: u16,
        pub n_bufs: u16,
        pub buf_size: u32,
    }

    impl BufferRing {
        /// Always fails with `ENOSYS`: buffer rings need io_uring, which
        /// only exists on Linux. All arguments are ignored.
        pub fn new(
            _ring: &mut (),
            _group_id: u16,
            _n_bufs: u16,
            _buf_size: u32,
        ) -> std::io::Result<Self> {
            Err(std::io::Error::from_raw_os_error(ENOSYS))
        }

        /// Returns an empty slice; there is no backing memory in the stub.
        ///
        /// # Safety
        ///
        /// No requirements. The function is `unsafe` only to mirror the
        /// Linux signature; it is never called on non-Linux because
        /// `new()` always errors first.
        pub unsafe fn borrow(&self, _buf_id: u16, _len: usize) -> &[u8] {
            &[]
        }

        /// No-op counterpart of the Linux `release`.
        pub fn release(&self, _buf_id: u16) {}

        /// Buffer group id stored in the stub.
        pub fn group_id(&self) -> u16 { self.group_id }
        /// Buffer count stored in the stub.
        pub fn n_bufs(&self) -> u16 { self.n_bufs }
        /// Buffer size stored in the stub.
        pub fn buf_size(&self) -> u32 { self.buf_size }
    }
}
315
+
316
+ #[cfg(target_os = "linux")]
317
+ pub use linux_impl::BufferRing;
318
+ #[cfg(not(target_os = "linux"))]
319
+ pub use stub_impl::BufferRing;