hyperion-rb 2.10.1 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,710 @@
1
+ /* ----------------------------------------------------------------------
2
+ * io_uring_loop.c — 2.12-D — io_uring-driven C accept loop.
3
+ *
4
+ * Sibling translation unit to page_cache.c's 2.12-C `accept4` loop.
5
+ * Same wire contract (handle_static-only routes, plain TCP, lifecycle
6
+ * + handoff Ruby callbacks); the only difference is *how* the kernel
7
+ * I/O is driven.
8
+ *
9
+ * Design
10
+ * ------
11
+ * Single ring per `run_static_io_uring_loop` invocation. Operator
12
+ * boots one worker per CPU; each worker calls into this loop and gets
13
+ * its own ring. No ring is shared across threads or across fork — fits
14
+ * the same model `Hyperion::IOUring` already established for the
15
+ * Rust-cdylib path.
16
+ *
17
+ * On entry we:
18
+ * 1. Probe `liburing` at runtime (`io_uring_queue_init`). On failure
19
+ * we return `:unavailable` so the Ruby caller falls through to
20
+ * the 2.12-C `accept4` path. Probe failure is the dominant path
21
+ * on locked-down containers (seccomp blocks `io_uring_setup`),
22
+ * old kernels, and the like.
23
+ * 2. Submit a multishot ACCEPT SQE on the listener fd. Multishot
24
+ * delivers one CQE per accepted connection without re-arming.
25
+ * 3. Drain CQEs in a loop. For each completion we advance the
26
+ * connection's state machine: ACCEPT -> RECV -> WRITE -> CLOSE.
27
+ * 4. After draining, submit any newly-armed SQEs and park on
28
+ * `io_uring_submit_and_wait(1)`. The kernel batches the I/O, so
29
+ * the worker thread does **one** `io_uring_enter` per N CQEs in
30
+ * steady state instead of N×3 syscalls (accept + recv + write).
31
+ *
32
+ * Per-connection state lives in a `hyp_iu_conn_t` allocated on the
33
+ * heap. `user_data` on each SQE is `(uintptr_t)conn | tag` where
34
+ * `tag` is one of the HYP_IU_OP_* low-bit markers. The allocation
35
+ * strategy is intentionally simple: we don't pool — `malloc`/`free`
36
+ * per connection is far cheaper than the 3 syscalls we're saving
37
+ * (and the kernel's own SQE pool is the real win).
38
+ *
39
+ * GVL
40
+ * ---
41
+ * The loop runs INSIDE `rb_thread_call_without_gvl` for `submit_and_wait`,
42
+ * and re-acquires the GVL whenever it has to call into Ruby (the
43
+ * lifecycle and handoff callbacks). The `pc_internal_*` snapshot
44
+ * helper takes the pthread mutex but touches no Ruby state, so it
45
+ * is safe to call without the GVL. The hot path (no hooks, no
46
+ * handoffs) stays without the GVL for the entire
47
+ * `submit_and_wait` cycle.
48
+ *
49
+ * Build gating
50
+ * ------------
51
+ * The whole io_uring code path lives behind `#ifdef HAVE_LIBURING`.
52
+ * On macOS / hosts without `liburing-dev` the file compiles down to
53
+ * the stub init that registers `run_static_io_uring_loop` returning
54
+ * `:unavailable`. The stub keeps the Ruby surface stable across
55
+ * platforms — specs that check for the method's existence pass on
56
+ * Darwin too; only the body is gated.
57
+ *
58
+ * 2.12-D — initial drop.
59
+ * ---------------------------------------------------------------------- */
60
+
61
+ #include <ruby.h>
62
+ #include <ruby/thread.h>
63
+
64
+ #include <errno.h>
65
+ #include <stddef.h>
66
+ #include <stdint.h>
67
+ #include <stdlib.h>
68
+ #include <string.h>
69
+ #include <unistd.h>
70
+ #include <fcntl.h>
71
+ #include <sys/socket.h>
72
+
73
+ #include "page_cache_internal.h"
74
+
75
+ /* Ruby identifiers we cache at init time. */
76
+ static VALUE hyp_iu_sym_unavailable = Qnil;
77
+ static VALUE hyp_iu_sym_crashed = Qnil;
78
+
79
+ #if defined(__linux__) && defined(HAVE_LIBURING)
80
+
81
+ #include <liburing.h>
82
+
83
+ /* Per-worker ring state. Single instance per `run_static_io_uring_loop`
84
+ * invocation — the ring is freed on return. */
85
+ typedef struct hyp_iu_loop_s {
86
+ struct io_uring ring;
87
+ int listen_fd;
88
+ long served;
89
+ /* Counters operators don't see directly but specs use to confirm
90
+ * we walked the io_uring code path (vs. a fall-through). */
91
+ long accepts;
92
+ long handoffs;
93
+ long closes;
94
+ /* Set when the loop should drain remaining CQEs and exit. The
95
+ * shared `pc_internal_stop_requested()` flag flips this on
96
+ * `PageCache.stop_accept_loop`. */
97
+ int stopping;
98
+ /* Bound on outstanding connections — avoids unbounded heap growth
99
+ * under a SYN flood. The kernel will keep accepting onto its own
100
+ * accept queue; we just stop pulling them off until headroom
101
+ * frees up. */
102
+ int inflight;
103
+ int max_inflight;
104
+ } hyp_iu_loop_t;
105
+
106
+ /* Connection-level state. One alloc per accepted fd; freed on close
107
+ * completion. */
108
+ typedef enum {
109
+ HYP_IU_OP_ACCEPT = 0x1,
110
+ HYP_IU_OP_RECV = 0x2,
111
+ HYP_IU_OP_WRITE = 0x3,
112
+ HYP_IU_OP_CLOSE = 0x4
113
+ } hyp_iu_op_t;
114
+
115
+ #define HYP_IU_OP_MASK 0x7u
116
+ #define HYP_IU_OP_SHIFT 0u
117
+
118
+ typedef struct hyp_iu_conn_s {
119
+ int fd;
120
+ /* Read buffer. Header section is bounded by PC_INTERNAL_MAX_HEADER_BYTES;
121
+ * we allocate a single 8 KiB chunk eagerly (matches the 2.12-C
122
+ * loop's HYP_CL_READ_CHUNK + the typical request shape) and grow
123
+ * up to the cap on header straddle. */
124
+ char *rbuf;
125
+ size_t rcap;
126
+ size_t roff;
127
+ /* Response snapshot. Owned by the conn; freed in CLOSE handler
128
+ * after the WRITE completes. */
129
+ char *wbuf;
130
+ size_t wlen;
131
+ size_t wsent;
132
+ /* Method/path offsets within rbuf — kept across stages so the
133
+ * lifecycle callback fires with the right strings even after the
134
+ * write completes. */
135
+ size_t method_off, method_len;
136
+ size_t path_off, path_len;
137
+ /* Whether the request asked for `Connection: close`. We honour
138
+ * keep-alive in steady state; close-request shortens the
139
+ * connection lifetime. */
140
+ int keep_alive;
141
+ int handed_off;
142
+ } hyp_iu_conn_t;
143
+
144
+ #define HYP_IU_RBUF_INITIAL 8192
145
+ #define HYP_IU_DEFAULT_DEPTH 256
146
+ #define HYP_IU_DEFAULT_MAX_INFLIGHT 4096
147
+
148
+ /* Pack/unpack `(conn_ptr, op_tag)` into `user_data`. `conn` pointers are
149
+ * malloc'd, so the low 3 bits are zero — we steal them for the op tag.
150
+ * On exotic allocators where this isn't safe, swap to a dedicated tag
151
+ * field on the conn struct + per-op tag table; for glibc / musl /
152
+ * jemalloc this packing is sound (alignof(max_align_t) >= 8). */
153
+ static inline uint64_t hyp_iu_pack_ud(hyp_iu_conn_t *c, hyp_iu_op_t op) {
154
+ return ((uint64_t)(uintptr_t)c) | ((uint64_t)op & HYP_IU_OP_MASK);
155
+ }
156
+ static inline hyp_iu_conn_t *hyp_iu_unpack_conn(uint64_t ud) {
157
+ return (hyp_iu_conn_t *)(uintptr_t)(ud & ~(uint64_t)HYP_IU_OP_MASK);
158
+ }
159
+ static inline hyp_iu_op_t hyp_iu_unpack_op(uint64_t ud) {
160
+ return (hyp_iu_op_t)(ud & HYP_IU_OP_MASK);
161
+ }
162
+
163
+ /* Allocate a connection-state struct. NULL on OOM. */
164
+ static hyp_iu_conn_t *hyp_iu_conn_new(int fd) {
165
+ hyp_iu_conn_t *c = (hyp_iu_conn_t *)calloc(1, sizeof(*c));
166
+ if (c == NULL) {
167
+ return NULL;
168
+ }
169
+ c->fd = fd;
170
+ c->rbuf = (char *)malloc(HYP_IU_RBUF_INITIAL);
171
+ if (c->rbuf == NULL) {
172
+ free(c);
173
+ return NULL;
174
+ }
175
+ c->rcap = HYP_IU_RBUF_INITIAL;
176
+ c->keep_alive = 1;
177
+ return c;
178
+ }
179
+
180
+ static void hyp_iu_conn_free(hyp_iu_conn_t *c) {
181
+ if (c == NULL) return;
182
+ free(c->rbuf);
183
+ free(c->wbuf);
184
+ free(c);
185
+ }
186
+
187
+ /* Submit a CLOSE op for a fd. The fd is closed via io_uring rather than
188
+ * a direct close(2) so we collapse one more syscall into the ring's
189
+ * `submit_and_wait` cycle. The CLOSE completion's only responsibility
190
+ * is to free the conn struct. */
191
+ static void hyp_iu_submit_close(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
192
+ struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
193
+ if (sqe == NULL) {
194
+ /* SQ full — fall back to direct close + free here. The conn
195
+ * struct is freed inline; we DO NOT touch the ring so the
196
+ * caller's submit cycle stays clean. */
197
+ if (c->fd >= 0) close(c->fd);
198
+ hyp_iu_conn_free(c);
199
+ L->inflight--;
200
+ L->closes++;
201
+ return;
202
+ }
203
+ io_uring_prep_close(sqe, c->fd);
204
+ io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(c, HYP_IU_OP_CLOSE));
205
+ }
206
+
207
+ /* Submit a RECV onto the conn's read buffer. Reads into the tail of
208
+ * the buffer (rbuf + roff), up to rcap - roff bytes. */
209
+ static int hyp_iu_submit_recv(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
210
+ struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
211
+ if (sqe == NULL) {
212
+ return -1;
213
+ }
214
+ io_uring_prep_recv(sqe, c->fd, c->rbuf + c->roff, c->rcap - c->roff, 0);
215
+ io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(c, HYP_IU_OP_RECV));
216
+ return 0;
217
+ }
218
+
219
+ /* Submit a WRITE for the prepared response snapshot. */
220
+ static int hyp_iu_submit_write(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
221
+ struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
222
+ if (sqe == NULL) {
223
+ return -1;
224
+ }
225
+ io_uring_prep_send(sqe, c->fd, c->wbuf + c->wsent, c->wlen - c->wsent, 0);
226
+ io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(c, HYP_IU_OP_WRITE));
227
+ return 0;
228
+ }
229
+
230
+ /* Submit the multishot ACCEPT on the listener. Multishot keeps
231
+ * delivering CQEs until the kernel disarms it (error or -ENOBUFS);
232
+ * the drain loop re-arms whenever IORING_CQE_F_MORE is clear. */
233
+ static int hyp_iu_submit_accept(hyp_iu_loop_t *L) {
234
+ struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
235
+ if (sqe == NULL) {
236
+ return -1;
237
+ }
238
+ io_uring_prep_multishot_accept(sqe, L->listen_fd, NULL, NULL, 0);
239
+ /* user_data: special tag (no conn pointer) — accept ops have no
240
+ * conn-state until the completion delivers the new fd. We use a
241
+ * dedicated tag-only encoding: zero conn pointer + ACCEPT tag. */
242
+ io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(NULL, HYP_IU_OP_ACCEPT));
243
+ return 0;
244
+ }
245
+
246
+ /* Process the request currently buffered in `c->rbuf[0..c->roff]`. On
247
+ * a static-cache hit, snapshots the response into `c->wbuf` and arms a
248
+ * WRITE. On any miss, hands the connection off to Ruby (closing the fd
249
+ * locally would be wrong — Ruby owns it from that point on).
250
+ *
251
+ * Returns 1 if the conn is still owned by this loop (a RECV or WRITE
252
+ * was armed and a completion will follow), 0 if the conn was handed
253
+ * off / closed (caller must NOT touch it further — `c` is invalid). */
254
+ static int hyp_iu_dispatch_request(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
255
+ long eoh = pc_internal_find_eoh(c->rbuf, c->roff);
256
+ if (eoh < 0) {
257
+ /* Need more bytes. Re-arm RECV; the buffer may need to grow first
258
+ * (handled just below, bounded by PC_INTERNAL_MAX_HEADER_BYTES). */
259
+ if (c->roff >= c->rcap) {
260
+ if (c->rcap >= PC_INTERNAL_MAX_HEADER_BYTES) {
261
+ /* Header section exceeds cap — hand off to Ruby. */
262
+ pc_internal_handoff(c->fd, c->rbuf, c->roff);
263
+ c->handed_off = 1;
264
+ hyp_iu_conn_free(c);
265
+ L->handoffs++;
266
+ L->inflight--;
267
+ return 0;
268
+ }
269
+ size_t new_cap = c->rcap * 2;
270
+ if (new_cap > PC_INTERNAL_MAX_HEADER_BYTES) {
271
+ new_cap = PC_INTERNAL_MAX_HEADER_BYTES;
272
+ }
273
+ char *grown = (char *)realloc(c->rbuf, new_cap);
274
+ if (grown == NULL) {
275
+ /* OOM — close the connection (best we can do). */
276
+ hyp_iu_submit_close(L, c);
277
+ return 0;
278
+ }
279
+ c->rbuf = grown;
280
+ c->rcap = new_cap;
281
+ }
282
+ if (hyp_iu_submit_recv(L, c) < 0) {
283
+ /* SQ full — we can't re-arm the RECV, so give up on this
284
+ * connection and close it. */
285
+ hyp_iu_submit_close(L, c);
286
+ return 0;
287
+ }
288
+ return 1; /* a RECV was armed, not a WRITE, but the contract is
289
+ * the same: the conn is still owned by us. */
290
+ }
291
+
292
+ long req_line_end = pc_internal_parse_request_line(
293
+ c->rbuf, (size_t)eoh,
294
+ &c->method_off, &c->method_len,
295
+ &c->path_off, &c->path_len);
296
+ if (req_line_end < 0) {
297
+ pc_internal_handoff(c->fd, c->rbuf, c->roff);
298
+ c->handed_off = 1;
299
+ hyp_iu_conn_free(c);
300
+ L->handoffs++;
301
+ L->inflight--;
302
+ return 0;
303
+ }
304
+
305
+ int connection_close = 0;
306
+ int has_body = 0;
307
+ int upgrade_seen = 0;
308
+ int hdr_ok = pc_internal_scan_headers(c->rbuf, (size_t)req_line_end,
309
+ (size_t)eoh, &connection_close,
310
+ &has_body, &upgrade_seen);
311
+ if (hdr_ok != 0 || has_body || upgrade_seen) {
312
+ pc_internal_handoff(c->fd, c->rbuf, c->roff);
313
+ c->handed_off = 1;
314
+ hyp_iu_conn_free(c);
315
+ L->handoffs++;
316
+ L->inflight--;
317
+ return 0;
318
+ }
319
+
320
+ pc_internal_method_t kind = pc_internal_classify_method(
321
+ c->rbuf + c->method_off, c->method_len);
322
+ if (kind == PC_INTERNAL_METHOD_OTHER) {
323
+ pc_internal_handoff(c->fd, c->rbuf, c->roff);
324
+ c->handed_off = 1;
325
+ hyp_iu_conn_free(c);
326
+ L->handoffs++;
327
+ L->inflight--;
328
+ return 0;
329
+ }
330
+
331
+ size_t snap_len = 0;
332
+ char *snap = pc_internal_snapshot_response(
333
+ c->rbuf + c->path_off, c->path_len, kind, &snap_len);
334
+ if (snap == NULL) {
335
+ pc_internal_handoff(c->fd, c->rbuf, c->roff);
336
+ c->handed_off = 1;
337
+ hyp_iu_conn_free(c);
338
+ L->handoffs++;
339
+ L->inflight--;
340
+ return 0;
341
+ }
342
+
343
+ c->wbuf = snap;
344
+ c->wlen = snap_len;
345
+ c->wsent = 0;
346
+ c->keep_alive = connection_close ? 0 : 1;
347
+ /* We don't carry pipelining across the CQE boundary today: the
348
+ * typical wrk shape closes per connection or pipelines one or two
349
+ * requests at most. Left as a 2.13 follow-up — stash the request
350
+ * boundary and shift `rbuf` by `eoh` after the write completes so a
351
+ * queued pipelined request can be parsed without an extra RECV. */
352
+ (void)eoh; /* boundary deliberately unused until that follow-up;
353
+ * the cast just documents the intent. */
354
+
355
+ if (hyp_iu_submit_write(L, c) < 0) {
356
+ free(c->wbuf); c->wbuf = NULL;
357
+ hyp_iu_submit_close(L, c);
358
+ return 0;
359
+ }
360
+ return 1;
361
+ }
362
+
363
+ /* Lifecycle hook firing: needs the GVL because the registered Ruby
364
+ * callback runs on it. Wrap with `rb_thread_call_with_gvl` only when
365
+ * the gate is on; the no-hook hot path skips the round-trip entirely. */
366
+ typedef struct {
367
+ const char *method;
368
+ size_t mlen;
369
+ const char *path;
370
+ size_t plen;
371
+ } hyp_iu_hook_args_t;
372
+
373
+ static void *hyp_iu_fire_lifecycle_with_gvl(void *raw) {
374
+ hyp_iu_hook_args_t *a = (hyp_iu_hook_args_t *)raw;
375
+ pc_internal_fire_lifecycle(a->method, a->mlen, a->path, a->plen);
376
+ return NULL;
377
+ }
378
+
379
+ /* Drain ready CQEs. Called inside the without-GVL region; the
380
+ * lifecycle-hook path re-acquires the GVL only for the duration of
381
+ * the hook. Returns 1 if the listener was closed (graceful exit
382
+ * signal), 0 otherwise. */
383
+ static int hyp_iu_drain_cqes(hyp_iu_loop_t *L) {
384
+ struct io_uring_cqe *cqe;
385
+ unsigned head;
386
+ int processed = 0;
387
+ int listener_closed = 0;
388
+
389
+ io_uring_for_each_cqe(&L->ring, head, cqe) {
390
+ processed++;
391
+ uint64_t ud = cqe->user_data;
392
+ hyp_iu_op_t op = hyp_iu_unpack_op(ud);
393
+ hyp_iu_conn_t *c = hyp_iu_unpack_conn(ud);
394
+ int res = cqe->res;
395
+
396
+ switch (op) {
397
+ case HYP_IU_OP_ACCEPT: {
398
+ if (res < 0) {
399
+ if (res == -ENOBUFS || res == -EAGAIN) {
400
+ /* Multishot was disarmed by the kernel — re-arm. */
401
+ (void)hyp_iu_submit_accept(L);
402
+ break;
403
+ }
404
+ if (res == -ECANCELED || res == -EBADF || res == -EINVAL) {
405
+ /* Listener closed — graceful exit. */
406
+ listener_closed = 1;
407
+ break;
408
+ }
409
+ /* Other accept errors — re-arm and keep going. The
410
+ * 2.12-C path treated ECONNABORTED, EINTR, etc. as
411
+ * transient too. */
412
+ (void)hyp_iu_submit_accept(L);
413
+ break;
414
+ }
415
+ /* Successful accept: res is the new fd. If the kernel has dropped
416
+ * the multishot arm (IORING_CQE_F_MORE clear), re-arm it up front so
417
+ * the early-exit paths below can't leave the listener un-armed. */
418
+ int cfd = res;
419
+ if (!(cqe->flags & IORING_CQE_F_MORE)) {
420
+ (void)hyp_iu_submit_accept(L);
421
+ }
422
+ if (L->inflight >= L->max_inflight) {
423
+ /* Backpressure: shed the connection rather than let the heap
424
+ * grow without bound. The kernel keeps queueing completed
425
+ * connections in the listen backlog; we resume draining once
426
+ * headroom frees up. */
427
+ close(cfd);
428
+ break;
429
+ }
430
+ pc_internal_apply_tcp_nodelay(cfd);
431
+ hyp_iu_conn_t *nc = hyp_iu_conn_new(cfd);
432
+ if (nc == NULL) {
433
+ close(cfd);
434
+ break;
435
+ }
436
+ L->inflight++;
437
+ L->accepts++;
438
+ if (hyp_iu_submit_recv(L, nc) < 0) {
439
+ /* SQ full — give up on this connection and close it. */
440
+ hyp_iu_submit_close(L, nc);
441
+ }
442
+ break;
443
+ }
444
+
445
+ case HYP_IU_OP_RECV: {
446
+ if (c == NULL) break;
447
+ if (res <= 0) {
448
+ /* res == 0 -> peer closed cleanly. res < 0 -> error
449
+ * (-ECONNRESET, -EPIPE, etc.). Either way we close. */
450
+ hyp_iu_submit_close(L, c);
451
+ break;
452
+ }
453
+ c->roff += (size_t)res;
454
+ (void)hyp_iu_dispatch_request(L, c);
455
+ break;
456
+ }
457
+
458
+ case HYP_IU_OP_WRITE: {
459
+ if (c == NULL) break;
460
+ if (res <= 0) {
461
+ /* Write failed (peer gone, EPIPE). Close. */
462
+ free(c->wbuf); c->wbuf = NULL;
463
+ hyp_iu_submit_close(L, c);
464
+ break;
465
+ }
466
+ c->wsent += (size_t)res;
467
+ if (c->wsent < c->wlen) {
468
+ /* Short write — re-arm. Rare on loopback, common on
469
+ * congested links. */
470
+ if (hyp_iu_submit_write(L, c) < 0) {
471
+ free(c->wbuf); c->wbuf = NULL;
472
+ hyp_iu_submit_close(L, c);
473
+ }
474
+ break;
475
+ }
476
+ /* Full response written. Lifecycle hook fires here so
477
+ * observers see a finished request. */
478
+ L->served++;
479
+ /* 2.12-E — feed the per-process atomic so the SO_REUSEPORT
480
+ * audit harness can scrape `c_loop_requests_total` without
481
+ * caring whether the worker landed on the accept4 or
482
+ * io_uring path. Lock-free; no GVL re-acquisition. */
483
+ pc_internal_tick_request();
484
+ if (pc_internal_lifecycle_active()) {
485
+ hyp_iu_hook_args_t args = {
486
+ .method = c->rbuf + c->method_off, .mlen = c->method_len,
487
+ .path = c->rbuf + c->path_off, .plen = c->path_len
488
+ };
489
+ rb_thread_call_with_gvl(hyp_iu_fire_lifecycle_with_gvl, &args);
490
+ }
491
+ free(c->wbuf); c->wbuf = NULL;
492
+ c->wlen = 0; c->wsent = 0;
493
+
494
+ if (!c->keep_alive || L->stopping) {
495
+ hyp_iu_submit_close(L, c);
496
+ break;
497
+ }
498
+ /* Keep-alive: reset for the next request on this fd.
499
+ * The previous request occupied bytes [0..eoh) in rbuf;
500
+ * any pipelined bytes after that boundary aren't carried
501
+ * across today (we re-RECV from offset 0). Clearing is
502
+ * cheap and correct — pipelining is a 2.13 follow-up. */
503
+ c->roff = 0;
504
+ c->method_off = c->method_len = 0;
505
+ c->path_off = c->path_len = 0;
506
+ if (hyp_iu_submit_recv(L, c) < 0) {
507
+ hyp_iu_submit_close(L, c);
508
+ }
509
+ break;
510
+ }
511
+
512
+ case HYP_IU_OP_CLOSE: {
513
+ (void)res; /* close errors are advisory; nothing to do */
514
+ if (c != NULL) {
515
+ hyp_iu_conn_free(c);
516
+ }
517
+ L->inflight--;
518
+ L->closes++;
519
+ break;
520
+ }
521
+
522
+ default:
523
+ /* Should not happen — defensive no-op. */
524
+ break;
525
+ }
526
+ }
527
+ if (processed > 0) {
528
+ io_uring_cq_advance(&L->ring, processed);
529
+ }
530
+ return listener_closed;
531
+ }
532
+
533
+ /* No-GVL inner loop. Runs until the listener closes, the stop flag
534
+ * fires, or an unrecoverable error happens. */
535
+ typedef struct {
536
+ hyp_iu_loop_t *L;
537
+ int err;
538
+ /* 1 if the listener closed gracefully; 0 if an error tore us out. */
539
+ int graceful;
540
+ } hyp_iu_run_args_t;
541
+
542
+ static void *hyp_iu_run_blocking(void *raw) {
543
+ hyp_iu_run_args_t *a = (hyp_iu_run_args_t *)raw;
544
+ hyp_iu_loop_t *L = a->L;
545
+ a->err = 0;
546
+ a->graceful = 0;
547
+
548
+ /* Initial ACCEPT submission. */
549
+ if (hyp_iu_submit_accept(L) < 0) {
550
+ a->err = ENOMEM;
551
+ return NULL;
552
+ }
553
+
554
+ /* Bounded wait: the stop flag is flipped from another Ruby thread
555
+ * on `PageCache.stop_accept_loop`; the listener-close from
556
+ * `Server#stop` also delivers an -ECANCELED CQE on the multishot
557
+ * ACCEPT, but the kernel may park us in `submit_and_wait` past
558
+ * that point. The 250 ms cap means we wake every quarter-second
559
+ * to re-check the stop flag — well below the human-perceptible
560
+ * "did graceful shutdown finish?" threshold and small enough that
561
+ * a smoke spec's `t.join(5)` always succeeds. The timeout is
562
+ * idle-only: under load the ring stays full and we drain CQEs as
563
+ * fast as the kernel produces them. */
564
+ struct __kernel_timespec stop_check_ts;
565
+ stop_check_ts.tv_sec = 0;
566
+ stop_check_ts.tv_nsec = 250 * 1000 * 1000; /* 250 ms */
567
+
568
+ for (;;) {
569
+ if (pc_internal_stop_requested()) {
570
+ L->stopping = 1;
571
+ }
572
+ if (L->stopping && L->inflight == 0) {
573
+ a->graceful = 1;
574
+ return NULL;
575
+ }
576
+
577
+ /* `cqe_ptr` MUST be a valid pointer in liburing 2.5 — passing
578
+ * NULL segfaults inside the helper (it deref's it to write the
579
+ * first ready CQE). We don't actually use the ptr (we drain
580
+ * via `io_uring_for_each_cqe` below), but supplying a stack
581
+ * slot keeps the helper happy. */
582
+ struct io_uring_cqe *first_cqe = NULL;
583
+ int ret = io_uring_submit_and_wait_timeout(&L->ring, &first_cqe, 1,
584
+ &stop_check_ts, NULL);
585
+ (void)first_cqe;
586
+ if (ret < 0) {
587
+ if (ret == -EINTR || ret == -ETIME) {
588
+ /* Timeout: drain whatever's ready and re-check stop. */
589
+ (void)hyp_iu_drain_cqes(L);
590
+ continue;
591
+ }
592
+ if (L->stopping && L->inflight == 0) {
593
+ a->graceful = 1;
594
+ return NULL;
595
+ }
596
+ a->err = -ret;
597
+ return NULL;
598
+ }
599
+
600
+ int listener_closed = hyp_iu_drain_cqes(L);
601
+ if (listener_closed) {
602
+ L->stopping = 1;
603
+ }
604
+ if (L->stopping && L->inflight == 0) {
605
+ a->graceful = 1;
606
+ return NULL;
607
+ }
608
+ }
609
+ }
610
+
611
+ /* Public Ruby surface: PageCache.run_static_io_uring_loop(listen_fd) -> Integer | :crashed | :unavailable */
612
+ static VALUE rb_pc_run_static_io_uring_loop(VALUE self, VALUE rb_listen_fd) {
613
+ (void)self;
614
+ int listen_fd = NUM2INT(rb_listen_fd);
615
+ if (listen_fd < 0) {
616
+ rb_raise(rb_eArgError, "listen_fd must be >= 0");
617
+ }
618
+
619
+ /* Reset the stop flag so a previous invocation's
620
+ * `stop_accept_loop` doesn't immediately tear us down. The 2.12-C
621
+ * loop does the same dance at entry — see `rb_pc_run_static_accept_loop`. */
622
+ pc_internal_reset_stop();
623
+ /* 2.12-E — reset the per-process served-request counter on entry so
624
+ * `c_loop_requests_total` reflects THIS loop's served count. Mirrors
625
+ * the 2.12-C accept4 entry path. */
626
+ pc_internal_reset_requests_served();
627
+
628
+ /* Clear O_NONBLOCK on the listener — io_uring drives accept itself
629
+ * and we want the kernel to park us in the ring rather than spin
630
+ * on EAGAIN. The 2.12-C path does the same trick on its accept(2)
631
+ * fd. */
632
+ int flags = fcntl(listen_fd, F_GETFL, 0);
633
+ if (flags >= 0 && (flags & O_NONBLOCK)) {
634
+ (void)fcntl(listen_fd, F_SETFL, flags & ~O_NONBLOCK);
635
+ }
636
+
637
+ hyp_iu_loop_t L;
638
+ memset(&L, 0, sizeof(L));
639
+ L.listen_fd = listen_fd;
640
+ L.max_inflight = HYP_IU_DEFAULT_MAX_INFLIGHT;
641
+
642
+ /* Probe at boot: io_uring_queue_init returns 0 on success or a
643
+ * negative errno on failure (seccomp / kernel-too-old / out-of-mem).
644
+ * The Ruby caller treats `:unavailable` as "fall through to the
645
+ * 2.12-C accept4 path"; the operator sees nothing scary in the
646
+ * boot log unless they explicitly set HYPERION_IO_URING_ACCEPT=1
647
+ * AND the probe failed (caller is responsible for warning then). */
648
+ int rc = io_uring_queue_init(HYP_IU_DEFAULT_DEPTH, &L.ring, 0);
649
+ if (rc < 0) {
650
+ return hyp_iu_sym_unavailable;
651
+ }
652
+
653
+ hyp_iu_run_args_t a;
654
+ a.L = &L;
655
+ a.err = 0;
656
+ a.graceful = 0;
657
+ rb_thread_call_without_gvl(hyp_iu_run_blocking, &a, RUBY_UBF_IO, NULL);
658
+
659
+ /* Best-effort drain: any conn structs we still hold need their
660
+ * fds closed and memory freed. The CLOSE submissions normally
661
+ * handle this, but a torn-down ring (graceful or not) means
662
+ * pending SQEs never completed. We don't have a per-conn list,
663
+ * so we lean on the kernel — io_uring_queue_exit will cancel
664
+ * pending SQEs. Memory leak surface here is bounded by
665
+ * `L.inflight`; in practice graceful shutdown drains it to 0
666
+ * before exit. The non-graceful path leaks at most
667
+ * `max_inflight * sizeof(hyp_iu_conn_t + rbuf)` once per worker
668
+ * lifetime — acceptable for an emergency-tear-down path. */
669
+ io_uring_queue_exit(&L.ring);
670
+
671
+ if (!a.graceful && a.err != 0) {
672
+ return hyp_iu_sym_crashed;
673
+ }
674
+ return LONG2NUM(L.served);
675
+ }
676
+
677
+ #else /* not __linux__ or no HAVE_LIBURING */
678
+
679
+ static VALUE rb_pc_run_static_io_uring_loop(VALUE self, VALUE rb_listen_fd) {
680
+ (void)self;
681
+ (void)rb_listen_fd;
682
+ return hyp_iu_sym_unavailable;
683
+ }
684
+
685
+ #endif /* __linux__ && HAVE_LIBURING */
686
+
687
+ /* Whether the C ext was built WITH liburing support. The Ruby side
688
+ * uses this in `ConnectionLoop#io_uring_eligible?` to short-circuit
689
+ * the env-var check — no point reading HYPERION_IO_URING_ACCEPT on a
690
+ * build that can't honour it. */
691
+ static VALUE rb_pc_io_uring_loop_compiled_p(VALUE self) {
692
+ (void)self;
693
+ #if defined(__linux__) && defined(HAVE_LIBURING)
694
+ return Qtrue;
695
+ #else
696
+ return Qfalse;
697
+ #endif
698
+ }
699
+
700
+ void Init_hyperion_io_uring_loop(VALUE mPageCache) {
701
+ hyp_iu_sym_unavailable = ID2SYM(rb_intern("unavailable"));
702
+ hyp_iu_sym_crashed = ID2SYM(rb_intern("crashed"));
703
+ rb_gc_register_mark_object(hyp_iu_sym_unavailable);
704
+ rb_gc_register_mark_object(hyp_iu_sym_crashed);
705
+
706
+ rb_define_singleton_method(mPageCache, "run_static_io_uring_loop",
707
+ rb_pc_run_static_io_uring_loop, 1);
708
+ rb_define_singleton_method(mPageCache, "io_uring_loop_compiled?",
709
+ rb_pc_io_uring_loop_compiled_p, 0);
710
+ }
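
Caller-side context, for readers of the diff: the Ruby dispatch that the header comment keeps referring to is not part of this file. Below is a minimal sketch of how a caller might consume the new surface, assuming a `Hyperion::PageCache` module path, a `ConnectionLoop` wrapper, and a `run_accept4_fallback` helper for the 2.12-C path; only `run_static_io_uring_loop`, `io_uring_loop_compiled?`, `io_uring_eligible?`, and `HYPERION_IO_URING_ACCEPT` appear in the source above.

    # Hypothetical caller sketch; names not shown in this diff are assumptions.
    class ConnectionLoop
      def io_uring_eligible?
        # Skip the env-var check entirely on builds compiled without liburing.
        Hyperion::PageCache.io_uring_loop_compiled? &&
          ENV["HYPERION_IO_URING_ACCEPT"] == "1"
      end

      def run(listen_fd)
        return run_accept4_fallback(listen_fd) unless io_uring_eligible?

        case (result = Hyperion::PageCache.run_static_io_uring_loop(listen_fd))
        when :unavailable
          # Runtime probe failed (seccomp, old kernel): warn, then fall through.
          warn "io_uring accept loop requested but unavailable; using accept4"
          run_accept4_fallback(listen_fd)
        when :crashed
          raise "io_uring accept loop exited with an unrecoverable error"
        else
          result # Integer: requests served by the C loop before a graceful stop
        end
      end
    end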
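
The `HAVE_LIBURING` guard implies a build-time probe in the gem's extconf.rb, which is also outside this diff. A sketch of a typical mkmf probe that would produce that macro follows; the extension target name is an assumption.

    # extconf.rb sketch: have_library("uring") defines HAVE_LIBURING when
    # liburing links, otherwise the io_uring path compiles down to the stub.
    require "mkmf"

    have_header("liburing.h")                    # defines HAVE_LIBURING_H
    have_library("uring", "io_uring_queue_init") # defines HAVE_LIBURING, adds -luring
    create_makefile("hyperion/page_cache")       # target name assumed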
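
The shutdown comments also imply the smoke-test shape (shared stop flag, 250 ms stop check, `t.join(5)`). A rough exercise of that shape, under the same module-path assumption and assuming a host where the io_uring probe succeeds and the requested path is in the static cache:

    require "socket"

    listener = TCPServer.new("127.0.0.1", 0)
    port     = listener.addr[1]

    worker = Thread.new do
      # Returns :unavailable instead of looping if the runtime probe fails.
      Hyperion::PageCache.run_static_io_uring_loop(listener.fileno)
    end

    TCPSocket.open("127.0.0.1", port) do |s|
      s.write("GET /cached.html HTTP/1.1\r\nHost: x\r\nConnection: close\r\n\r\n")
      s.read # served from the static cache, or handed off to Ruby on a miss
    end

    Hyperion::PageCache.stop_accept_loop # flips the shared stop flag
    worker.join(5)                       # bounded by the 250 ms stop check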