hyperion-rb 2.10.1 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +771 -0
- data/README.md +135 -5
- data/ext/hyperion_http/extconf.rb +41 -0
- data/ext/hyperion_http/io_uring_loop.c +710 -0
- data/ext/hyperion_http/page_cache.c +1032 -0
- data/ext/hyperion_http/page_cache_internal.h +132 -0
- data/lib/hyperion/connection.rb +14 -0
- data/lib/hyperion/dispatch_mode.rb +19 -1
- data/lib/hyperion/h2_codec.rb +52 -5
- data/lib/hyperion/http2_handler.rb +399 -41
- data/lib/hyperion/metrics.rb +38 -0
- data/lib/hyperion/prometheus_exporter.rb +76 -1
- data/lib/hyperion/server/connection_loop.rb +159 -0
- data/lib/hyperion/server.rb +183 -0
- data/lib/hyperion/thread_pool.rb +23 -7
- data/lib/hyperion/version.rb +1 -1
- metadata +4 -1
@@ -0,0 +1,710 @@
/* ----------------------------------------------------------------------
 * io_uring_loop.c — 2.12-D — io_uring-driven C accept loop.
 *
 * Sibling translation unit to page_cache.c's 2.12-C `accept4` loop.
 * Same wire contract (handle_static-only routes, plain TCP, lifecycle
 * + handoff Ruby callbacks); the only difference is *how* the kernel
 * I/O is driven.
 *
 * Design
 * ------
 * Single ring per `run_static_io_uring_loop` invocation. Operator
 * boots one worker per CPU; each worker calls into this loop and gets
 * its own ring. No ring is shared across threads or across fork — fits
 * the same model `Hyperion::IOUring` already established for the
 * Rust-cdylib path.
 *
 * On entry we:
 *   1. Probe `liburing` at runtime (`io_uring_queue_init`). On failure
 *      we return `:unavailable` so the Ruby caller falls through to
 *      the 2.12-C `accept4` path. Probe failure is the dominant path
 *      on locked-down containers (seccomp blocks `io_uring_setup`),
 *      old kernels, and the like.
 *   2. Submit a multishot ACCEPT SQE on the listener fd. Multishot
 *      delivers one CQE per accepted connection without re-arming.
 *   3. Drain CQEs in a loop. For each completion we advance the
 *      connection's state machine: ACCEPT -> RECV -> WRITE -> CLOSE.
 *   4. After draining, submit any newly-armed SQEs and park on
 *      `io_uring_submit_and_wait(1)`. The kernel batches I/O; the
 *      worker thread does **one** `io_uring_enter` per N CQEs in
 *      steady state instead of N×3 syscalls (accept + recv + write).
 *
 * Per-connection state lives in a `hyp_iu_conn_t` allocated on the
 * heap. `user_data` on each SQE is `(uintptr_t)conn | tag` where
 * `tag` is one of the HYP_IU_OP_* low-bit markers. The arena is
 * intentionally simple: we don't pool — `malloc`/`free` per connection
 * is far cheaper than the 3 syscalls we're saving (and the kernel's
 * own SQE pool is the real win).
 *
 * GVL
 * ---
 * The loop runs INSIDE `rb_thread_call_without_gvl` for `submit_and_wait`,
 * but we re-acquire the GVL whenever we need to call into Ruby
 * (lifecycle callback, handoff callback, and `pc_internal_*` helpers
 * that touch Ruby objects — the one exception is the snapshot helper,
 * which takes the pthread mutex but no Ruby state, so it's safe to
 * call without the GVL). The hot path (no hooks, no handoffs) stays
 * without the GVL for the entire `submit_and_wait` cycle.
 *
 * Build gating
 * ------------
 * The whole io_uring code path lives behind `#ifdef HAVE_LIBURING`.
 * On macOS / hosts without `liburing-dev` the file compiles down to
 * the stub init that registers `run_static_io_uring_loop` returning
 * `:unavailable`. The stub keeps the Ruby surface stable across
 * platforms — specs that check for the method's existence pass on
 * Darwin too; only the body is gated.
 *
 * 2.12-D — initial drop.
 * ---------------------------------------------------------------------- */
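/* [Editor's note, not part of the released file: a worked instance of the
 * batching claim above. If one wakeup finds 32 ready completions, the
 * accept4 path would have spent roughly 32 * 3 = 96 syscalls (accept4 +
 * recv + write per connection) to do the same work, whereas this loop pays
 * a single io_uring_submit_and_wait() call for the whole batch.] */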

#include <ruby.h>
#include <ruby/thread.h>

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>

#include "page_cache_internal.h"

/* Ruby identifiers we cache at init time. */
static VALUE hyp_iu_sym_unavailable = Qnil;
static VALUE hyp_iu_sym_crashed = Qnil;

#if defined(__linux__) && defined(HAVE_LIBURING)

#include <liburing.h>

/* Per-worker ring state. Single instance per `run_static_io_uring_loop`
 * invocation — the ring is freed on return. */
typedef struct hyp_iu_loop_s {
    struct io_uring ring;
    int listen_fd;
    long served;
    /* Counters operators don't see directly but specs use to confirm
     * we walked the io_uring code path (vs. a fall-through). */
    long accepts;
    long handoffs;
    long closes;
    /* Set when the loop should drain remaining CQEs and exit. The
     * shared `pc_internal_stop_requested()` flag flips this on
     * `PageCache.stop_accept_loop`. */
    int stopping;
    /* Bound on outstanding connections — avoids unbounded heap growth
     * under a SYN flood. The kernel will keep accepting onto its own
     * accept queue; we just stop pulling them off until headroom
     * frees up. */
    int inflight;
    int max_inflight;
} hyp_iu_loop_t;

/* Connection-level state. One alloc per accepted fd; freed on close
 * completion. */
typedef enum {
    HYP_IU_OP_ACCEPT = 0x1,
    HYP_IU_OP_RECV = 0x2,
    HYP_IU_OP_WRITE = 0x3,
    HYP_IU_OP_CLOSE = 0x4
} hyp_iu_op_t;

#define HYP_IU_OP_MASK 0x7u
#define HYP_IU_OP_SHIFT 0u

typedef struct hyp_iu_conn_s {
    int fd;
    /* Read buffer. Header section is bounded by PC_INTERNAL_MAX_HEADER_BYTES;
     * we allocate a single 8 KiB chunk eagerly (matches the 2.12-C
     * loop's HYP_CL_READ_CHUNK + the typical request shape) and grow
     * up to the cap on header straddle. */
    char *rbuf;
    size_t rcap;
    size_t roff;
    /* Response snapshot. Owned by the conn; freed in CLOSE handler
     * after the WRITE completes. */
    char *wbuf;
    size_t wlen;
    size_t wsent;
    /* Method/path offsets within rbuf — kept across stages so the
     * lifecycle callback fires with the right strings even after the
     * write completes. */
    size_t method_off, method_len;
    size_t path_off, path_len;
    /* Whether the request asked for `Connection: close`. We honour
     * keep-alive in steady state; close-request shortens the
     * connection lifetime. */
    int keep_alive;
    int handed_off;
} hyp_iu_conn_t;

#define HYP_IU_RBUF_INITIAL 8192
#define HYP_IU_DEFAULT_DEPTH 256
#define HYP_IU_DEFAULT_MAX_INFLIGHT 4096

/* Pack/unpack `(conn_ptr, op_tag)` into `user_data`. `conn` pointers are
 * malloc'd, so the low 3 bits are zero — we steal them for the op tag.
 * On exotic allocators where this isn't safe, swap to a dedicated tag
 * field on the conn struct + per-op tag table; for glibc / musl /
 * jemalloc this packing is sound (alignof(max_align_t) >= 8). */
static inline uint64_t hyp_iu_pack_ud(hyp_iu_conn_t *c, hyp_iu_op_t op) {
    return ((uint64_t)(uintptr_t)c) | ((uint64_t)op & HYP_IU_OP_MASK);
}
static inline hyp_iu_conn_t *hyp_iu_unpack_conn(uint64_t ud) {
    return (hyp_iu_conn_t *)(uintptr_t)(ud & ~(uint64_t)HYP_IU_OP_MASK);
}
static inline hyp_iu_op_t hyp_iu_unpack_op(uint64_t ud) {
    return (hyp_iu_op_t)(ud & HYP_IU_OP_MASK);
}
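/* [Editor's sketch, not part of the released file: the packing above relies
 * on malloc'd pointers having their low 3 bits clear. One hedged way to make
 * that assumption fail loudly at compile time on an unusual toolchain is a
 * C11 static assertion on max_align_t (declared in <stddef.h>, already
 * included above):
 *
 *     _Static_assert(_Alignof(max_align_t) >= 8,
 *                    "hyp_iu user_data tagging needs 8-byte-aligned allocations");
 * ] */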

/* Allocate a connection-state struct. NULL on OOM. */
static hyp_iu_conn_t *hyp_iu_conn_new(int fd) {
    hyp_iu_conn_t *c = (hyp_iu_conn_t *)calloc(1, sizeof(*c));
    if (c == NULL) {
        return NULL;
    }
    c->fd = fd;
    c->rbuf = (char *)malloc(HYP_IU_RBUF_INITIAL);
    if (c->rbuf == NULL) {
        free(c);
        return NULL;
    }
    c->rcap = HYP_IU_RBUF_INITIAL;
    c->keep_alive = 1;
    return c;
}

static void hyp_iu_conn_free(hyp_iu_conn_t *c) {
    if (c == NULL) return;
    free(c->rbuf);
    free(c->wbuf);
    free(c);
}

/* Submit a CLOSE op for a fd. The fd is closed via io_uring rather than
 * a direct close(2) so we collapse one more syscall into the ring's
 * `submit_and_wait` cycle. The CLOSE completion's only responsibility
 * is to free the conn struct. */
static void hyp_iu_submit_close(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
    struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
    if (sqe == NULL) {
        /* SQ full — fall back to direct close + free here. The conn
         * struct is freed inline; we DO NOT touch the ring so the
         * caller's submit cycle stays clean. */
        if (c->fd >= 0) close(c->fd);
        hyp_iu_conn_free(c);
        L->inflight--;
        L->closes++;
        return;
    }
    io_uring_prep_close(sqe, c->fd);
    io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(c, HYP_IU_OP_CLOSE));
}

/* Submit a RECV onto the conn's read buffer. Reads into the tail of
 * the buffer (rbuf + roff), up to rcap - roff bytes. */
static int hyp_iu_submit_recv(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
    struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
    if (sqe == NULL) {
        return -1;
    }
    io_uring_prep_recv(sqe, c->fd, c->rbuf + c->roff, c->rcap - c->roff, 0);
    io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(c, HYP_IU_OP_RECV));
    return 0;
}

/* Submit a WRITE for the prepared response snapshot. */
static int hyp_iu_submit_write(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
    struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
    if (sqe == NULL) {
        return -1;
    }
    io_uring_prep_send(sqe, c->fd, c->wbuf + c->wsent, c->wlen - c->wsent, 0);
    io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(c, HYP_IU_OP_WRITE));
    return 0;
}

/* Submit the multishot ACCEPT on the listener. Multishot continues
 * delivering CQEs until the kernel returns -ENOBUFS or the SQE is
 * cancelled; we re-arm only on -ENOBUFS. */
static int hyp_iu_submit_accept(hyp_iu_loop_t *L) {
    struct io_uring_sqe *sqe = io_uring_get_sqe(&L->ring);
    if (sqe == NULL) {
        return -1;
    }
    io_uring_prep_multishot_accept(sqe, L->listen_fd, NULL, NULL, 0);
    /* user_data: special tag (no conn pointer) — accept ops have no
     * conn-state until the completion delivers the new fd. We use a
     * dedicated tag-only encoding: zero conn pointer + ACCEPT tag. */
    io_uring_sqe_set_data64(sqe, hyp_iu_pack_ud(NULL, HYP_IU_OP_ACCEPT));
    return 0;
}

/* Process the request currently buffered in `c->rbuf[0..c->roff]`. On
 * a static-cache hit, snapshots the response into `c->wbuf` + arms a
 * WRITE. On any miss, hands the connection off to Ruby (CLOSE the fd
 * locally is NOT correct — Ruby owns it from that point on).
 *
 * Returns 1 if we armed a WRITE on the conn (the loop will see a
 * WRITE completion next), 0 if the conn was handed off / closed
 * (caller should NOT touch it further — `c` is invalid). */
static int hyp_iu_dispatch_request(hyp_iu_loop_t *L, hyp_iu_conn_t *c) {
    long eoh = pc_internal_find_eoh(c->rbuf, c->roff);
    if (eoh < 0) {
        /* Need more bytes. Re-arm RECV (the buffer might need to grow
         * — handled by the caller before re-submission via roff/rcap). */
        if (c->roff >= c->rcap) {
            if (c->rcap >= PC_INTERNAL_MAX_HEADER_BYTES) {
                /* Header section exceeds cap — hand off to Ruby. */
                pc_internal_handoff(c->fd, c->rbuf, c->roff);
                c->handed_off = 1;
                hyp_iu_conn_free(c);
                L->handoffs++;
                L->inflight--;
                return 0;
            }
            size_t new_cap = c->rcap * 2;
            if (new_cap > PC_INTERNAL_MAX_HEADER_BYTES) {
                new_cap = PC_INTERNAL_MAX_HEADER_BYTES;
            }
            char *grown = (char *)realloc(c->rbuf, new_cap);
            if (grown == NULL) {
                /* OOM — close the connection (best we can do). */
                hyp_iu_submit_close(L, c);
                return 0;
            }
            c->rbuf = grown;
            c->rcap = new_cap;
        }
        if (hyp_iu_submit_recv(L, c) < 0) {
            /* SQ full — close. The kernel will retry on next cycle
             * after we drain. */
            hyp_iu_submit_close(L, c);
            return 0;
        }
        return 1; /* technically RECV not WRITE, but caller treats
                   * the same: conn still owned by us. */
    }

    long req_line_end = pc_internal_parse_request_line(
        c->rbuf, (size_t)eoh,
        &c->method_off, &c->method_len,
        &c->path_off, &c->path_len);
    if (req_line_end < 0) {
        pc_internal_handoff(c->fd, c->rbuf, c->roff);
        c->handed_off = 1;
        hyp_iu_conn_free(c);
        L->handoffs++;
        L->inflight--;
        return 0;
    }

    int connection_close = 0;
    int has_body = 0;
    int upgrade_seen = 0;
    int hdr_ok = pc_internal_scan_headers(c->rbuf, (size_t)req_line_end,
                                          (size_t)eoh, &connection_close,
                                          &has_body, &upgrade_seen);
    if (hdr_ok != 0 || has_body || upgrade_seen) {
        pc_internal_handoff(c->fd, c->rbuf, c->roff);
        c->handed_off = 1;
        hyp_iu_conn_free(c);
        L->handoffs++;
        L->inflight--;
        return 0;
    }

    pc_internal_method_t kind = pc_internal_classify_method(
        c->rbuf + c->method_off, c->method_len);
    if (kind == PC_INTERNAL_METHOD_OTHER) {
        pc_internal_handoff(c->fd, c->rbuf, c->roff);
        c->handed_off = 1;
        hyp_iu_conn_free(c);
        L->handoffs++;
        L->inflight--;
        return 0;
    }

    size_t snap_len = 0;
    char *snap = pc_internal_snapshot_response(
        c->rbuf + c->path_off, c->path_len, kind, &snap_len);
    if (snap == NULL) {
        pc_internal_handoff(c->fd, c->rbuf, c->roff);
        c->handed_off = 1;
        hyp_iu_conn_free(c);
        L->handoffs++;
        L->inflight--;
        return 0;
    }

    c->wbuf = snap;
    c->wlen = snap_len;
    c->wsent = 0;
    c->keep_alive = connection_close ? 0 : 1;
    /* Stash the request boundary so RECV completions for the NEXT
     * pipelined request can shift the buffer. We don't carry pipelining
     * across the CQE boundary today — the typical wrk shape closes per
     * connection or pipelines one or two requests. Left as a 2.13
     * follow-up: shift `rbuf` by `eoh` post-write so a queued
     * pipelined request can be parsed without an extra RECV. */
    (void)eoh; /* silence unused-warning when assertion stripped */

    if (hyp_iu_submit_write(L, c) < 0) {
        free(c->wbuf); c->wbuf = NULL;
        hyp_iu_submit_close(L, c);
        return 0;
    }
    return 1;
}
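/* [Editor's sketch, not part of the released file: one possible shape for
 * the 2.13 pipelining follow-up mentioned in the comment above. After a
 * response is fully written, bytes past the request boundary could be
 * shifted to the front of rbuf instead of resetting roff to 0, so a queued
 * pipelined request is parsed without an extra RECV. Assumes the boundary
 * (eoh) has been stashed on the conn:
 *
 *     static void hyp_iu_carry_pipelined_bytes(hyp_iu_conn_t *c, size_t eoh) {
 *         if (c->roff > eoh) {
 *             memmove(c->rbuf, c->rbuf + eoh, c->roff - eoh);
 *             c->roff -= eoh;
 *         } else {
 *             c->roff = 0;
 *         }
 *     }
 * ] */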

/* Lifecycle hook firing: needs the GVL because the registered Ruby
 * callback runs on it. Wrap with `rb_thread_call_with_gvl` only when
 * the gate is on; the no-hook hot path skips the round-trip entirely. */
typedef struct {
    const char *method;
    size_t mlen;
    const char *path;
    size_t plen;
} hyp_iu_hook_args_t;

static void *hyp_iu_fire_lifecycle_with_gvl(void *raw) {
    hyp_iu_hook_args_t *a = (hyp_iu_hook_args_t *)raw;
    pc_internal_fire_lifecycle(a->method, a->mlen, a->path, a->plen);
    return NULL;
}

/* Drain ready CQEs. Called inside the without-GVL region — but the
 * inner Ruby-callback firing wraps GVL re-acquisition for the
 * milliseconds the hook runs. Returns 1 if the listener was closed
 * (graceful exit signal), 0 otherwise. */
static int hyp_iu_drain_cqes(hyp_iu_loop_t *L) {
    struct io_uring_cqe *cqe;
    unsigned head;
    int processed = 0;
    int listener_closed = 0;

    io_uring_for_each_cqe(&L->ring, head, cqe) {
        processed++;
        uint64_t ud = cqe->user_data;
        hyp_iu_op_t op = hyp_iu_unpack_op(ud);
        hyp_iu_conn_t *c = hyp_iu_unpack_conn(ud);
        int res = cqe->res;

        switch (op) {
        case HYP_IU_OP_ACCEPT: {
            if (res < 0) {
                if (res == -ENOBUFS || res == -EAGAIN) {
                    /* Multishot was disarmed by the kernel — re-arm. */
                    (void)hyp_iu_submit_accept(L);
                    break;
                }
                if (res == -ECANCELED || res == -EBADF || res == -EINVAL) {
                    /* Listener closed — graceful exit. */
                    listener_closed = 1;
                    break;
                }
                /* Other accept errors — re-arm and keep going. The
                 * 2.12-C path treated ECONNABORTED, EINTR, etc. as
                 * transient too. */
                (void)hyp_iu_submit_accept(L);
                break;
            }
            /* Successful accept: res is the new fd. */
            int cfd = res;
            if (L->inflight >= L->max_inflight) {
                /* Backpressure: shed the connection rather than
                 * unbounded heap growth. The kernel keeps queueing
                 * incoming SYNs in its accept queue; we'll resume
                 * draining once headroom frees up. */
                close(cfd);
                break;
            }
            pc_internal_apply_tcp_nodelay(cfd);
            hyp_iu_conn_t *nc = hyp_iu_conn_new(cfd);
            if (nc == NULL) {
                close(cfd);
                break;
            }
            L->inflight++;
            L->accepts++;
            if (hyp_iu_submit_recv(L, nc) < 0) {
                /* SQ full — close. */
                hyp_iu_submit_close(L, nc);
            }
            /* Multishot ACCEPT: kernel keeps it armed unless
             * IORING_CQE_F_MORE clears. If MORE is missing, re-arm. */
            if (!(cqe->flags & IORING_CQE_F_MORE)) {
                (void)hyp_iu_submit_accept(L);
            }
            break;
        }

        case HYP_IU_OP_RECV: {
            if (c == NULL) break;
            if (res <= 0) {
                /* res == 0 -> peer closed cleanly. res < 0 -> error
                 * (-ECONNRESET, -EPIPE, etc.). Either way we close. */
                hyp_iu_submit_close(L, c);
                break;
            }
            c->roff += (size_t)res;
            (void)hyp_iu_dispatch_request(L, c);
            break;
        }

        case HYP_IU_OP_WRITE: {
            if (c == NULL) break;
            if (res <= 0) {
                /* Write failed (peer gone, EPIPE). Close. */
                free(c->wbuf); c->wbuf = NULL;
                hyp_iu_submit_close(L, c);
                break;
            }
            c->wsent += (size_t)res;
            if (c->wsent < c->wlen) {
                /* Short write — re-arm. Rare on loopback, common on
                 * congested links. */
                if (hyp_iu_submit_write(L, c) < 0) {
                    free(c->wbuf); c->wbuf = NULL;
                    hyp_iu_submit_close(L, c);
                }
                break;
            }
            /* Full response written. Lifecycle hook fires here so
             * observers see a finished request. */
            L->served++;
            /* 2.12-E — feed the per-process atomic so the SO_REUSEPORT
             * audit harness can scrape `c_loop_requests_total` without
             * caring whether the worker landed on the accept4 or
             * io_uring path. Lock-free; no GVL re-acquisition. */
            pc_internal_tick_request();
            if (pc_internal_lifecycle_active()) {
                hyp_iu_hook_args_t args = {
                    .method = c->rbuf + c->method_off, .mlen = c->method_len,
                    .path = c->rbuf + c->path_off, .plen = c->path_len
                };
                rb_thread_call_with_gvl(hyp_iu_fire_lifecycle_with_gvl, &args);
            }
            free(c->wbuf); c->wbuf = NULL;
            c->wlen = 0; c->wsent = 0;

            if (!c->keep_alive || L->stopping) {
                hyp_iu_submit_close(L, c);
                break;
            }
            /* Keep-alive: reset for the next request on this fd.
             * The previous request occupied bytes [0..eoh) in rbuf;
             * any pipelined bytes after that boundary aren't carried
             * across today (we re-RECV from offset 0). Clearing is
             * cheap and correct — pipelining is a 2.13 follow-up. */
            c->roff = 0;
            c->method_off = c->method_len = 0;
            c->path_off = c->path_len = 0;
            if (hyp_iu_submit_recv(L, c) < 0) {
                hyp_iu_submit_close(L, c);
            }
            break;
        }

        case HYP_IU_OP_CLOSE: {
            (void)res; /* close errors are advisory; nothing to do */
            if (c != NULL) {
                hyp_iu_conn_free(c);
            }
            L->inflight--;
            L->closes++;
            break;
        }

        default:
            /* Should not happen — defensive no-op. */
            break;
        }
    }
    if (processed > 0) {
        io_uring_cq_advance(&L->ring, processed);
    }
    return listener_closed;
}

/* No-GVL inner loop. Runs until the listener closes, the stop flag
 * fires, or an unrecoverable error happens. */
typedef struct {
    hyp_iu_loop_t *L;
    int err;
    /* 1 if the listener closed gracefully; 0 if an error tore us out. */
    int graceful;
} hyp_iu_run_args_t;

static void *hyp_iu_run_blocking(void *raw) {
    hyp_iu_run_args_t *a = (hyp_iu_run_args_t *)raw;
    hyp_iu_loop_t *L = a->L;
    a->err = 0;
    a->graceful = 0;

    /* Initial ACCEPT submission. */
    if (hyp_iu_submit_accept(L) < 0) {
        a->err = ENOMEM;
        return NULL;
    }

    /* Bounded wait: the stop flag is flipped from another Ruby thread
     * on `PageCache.stop_accept_loop`; the listener-close from
     * `Server#stop` also delivers an -ECANCELED CQE on the multishot
     * ACCEPT, but the kernel may park us in `submit_and_wait` past
     * that point. The 250 ms cap means we wake every quarter-second
     * to re-check the stop flag — well below the human-perceptible
     * "did graceful shutdown finish?" threshold and small enough that
     * a smoke spec's `t.join(5)` always succeeds. The timeout is
     * idle-only: under load the ring stays full and we drain CQEs as
     * fast as the kernel produces them. */
    struct __kernel_timespec stop_check_ts;
    stop_check_ts.tv_sec = 0;
    stop_check_ts.tv_nsec = 250 * 1000 * 1000; /* 250 ms */

    for (;;) {
        if (pc_internal_stop_requested()) {
            L->stopping = 1;
        }
        if (L->stopping && L->inflight == 0) {
            a->graceful = 1;
            return NULL;
        }

        /* `cqe_ptr` MUST be a valid pointer in liburing 2.5 — passing
         * NULL segfaults inside the helper (it deref's it to write the
         * first ready CQE). We don't actually use the ptr (we drain
         * via `io_uring_for_each_cqe` below), but supplying a stack
         * slot keeps the helper happy. */
        struct io_uring_cqe *first_cqe = NULL;
        int ret = io_uring_submit_and_wait_timeout(&L->ring, &first_cqe, 1,
                                                   &stop_check_ts, NULL);
        (void)first_cqe;
        if (ret < 0) {
            if (ret == -EINTR || ret == -ETIME) {
                /* Timeout: drain whatever's ready and re-check stop. */
                (void)hyp_iu_drain_cqes(L);
                continue;
            }
            if (L->stopping && L->inflight == 0) {
                a->graceful = 1;
                return NULL;
            }
            a->err = -ret;
            return NULL;
        }

        int listener_closed = hyp_iu_drain_cqes(L);
        if (listener_closed) {
            L->stopping = 1;
        }
        if (L->stopping && L->inflight == 0) {
            a->graceful = 1;
            return NULL;
        }
    }
}

/* Public Ruby surface: PageCache.run_static_io_uring_loop(listen_fd) -> Integer | :crashed | :unavailable */
static VALUE rb_pc_run_static_io_uring_loop(VALUE self, VALUE rb_listen_fd) {
    (void)self;
    int listen_fd = NUM2INT(rb_listen_fd);
    if (listen_fd < 0) {
        rb_raise(rb_eArgError, "listen_fd must be >= 0");
    }

    /* Reset the stop flag so a previous invocation's
     * `stop_accept_loop` doesn't immediately tear us down. The 2.12-C
     * loop does the same dance at entry — see `rb_pc_run_static_accept_loop`. */
    pc_internal_reset_stop();
    /* 2.12-E — reset the per-process served-request counter on entry so
     * `c_loop_requests_total` reflects THIS loop's served count. Mirrors
     * the 2.12-C accept4 entry path. */
    pc_internal_reset_requests_served();

    /* Clear O_NONBLOCK on the listener — io_uring drives accept itself
     * and we want the kernel to park us in the ring rather than spin
     * on EAGAIN. The 2.12-C path does the same trick on its accept(2)
     * fd. */
    int flags = fcntl(listen_fd, F_GETFL, 0);
    if (flags >= 0 && (flags & O_NONBLOCK)) {
        (void)fcntl(listen_fd, F_SETFL, flags & ~O_NONBLOCK);
    }

    hyp_iu_loop_t L;
    memset(&L, 0, sizeof(L));
    L.listen_fd = listen_fd;
    L.max_inflight = HYP_IU_DEFAULT_MAX_INFLIGHT;

    /* Probe at boot: io_uring_queue_init returns 0 on success or a
     * negative errno on failure (seccomp / kernel-too-old / out-of-mem).
     * The Ruby caller treats `:unavailable` as "fall through to the
     * 2.12-C accept4 path"; the operator sees nothing scary in the
     * boot log unless they explicitly set HYPERION_IO_URING_ACCEPT=1
     * AND the probe failed (caller is responsible for warning then). */
    int rc = io_uring_queue_init(HYP_IU_DEFAULT_DEPTH, &L.ring, 0);
    if (rc < 0) {
        return hyp_iu_sym_unavailable;
    }

    hyp_iu_run_args_t a;
    a.L = &L;
    a.err = 0;
    a.graceful = 0;
    rb_thread_call_without_gvl(hyp_iu_run_blocking, &a, RUBY_UBF_IO, NULL);

    /* Best-effort drain: any conn structs we still hold need their
     * fds closed and memory freed. The CLOSE submissions normally
     * handle this, but a torn-down ring (graceful or not) means
     * pending SQEs never completed. We don't have a per-conn list,
     * so we lean on the kernel — io_uring_queue_exit will cancel
     * pending SQEs. Memory leak surface here is bounded by
     * `L.inflight`; in practice graceful shutdown drains it to 0
     * before exit. The non-graceful path leaks at most
     * `max_inflight * sizeof(hyp_iu_conn_t + rbuf)` once per worker
     * lifetime — acceptable for an emergency-tear-down path. */
    io_uring_queue_exit(&L.ring);

    if (!a.graceful && a.err != 0) {
        return hyp_iu_sym_crashed;
    }
    return LONG2NUM(L.served);
}

#else /* not __linux__ or no HAVE_LIBURING */

static VALUE rb_pc_run_static_io_uring_loop(VALUE self, VALUE rb_listen_fd) {
    (void)self;
    (void)rb_listen_fd;
    return hyp_iu_sym_unavailable;
}

#endif /* __linux__ && HAVE_LIBURING */

/* Whether the C ext was built WITH liburing support. The Ruby side
 * uses this in `ConnectionLoop#io_uring_eligible?` to short-circuit
 * the env-var check — no point reading HYPERION_IO_URING_ACCEPT on a
 * build that can't honour it. */
static VALUE rb_pc_io_uring_loop_compiled_p(VALUE self) {
    (void)self;
#if defined(__linux__) && defined(HAVE_LIBURING)
    return Qtrue;
#else
    return Qfalse;
#endif
}

void Init_hyperion_io_uring_loop(VALUE mPageCache) {
    hyp_iu_sym_unavailable = ID2SYM(rb_intern("unavailable"));
    hyp_iu_sym_crashed = ID2SYM(rb_intern("crashed"));
    rb_gc_register_mark_object(hyp_iu_sym_unavailable);
    rb_gc_register_mark_object(hyp_iu_sym_crashed);

    rb_define_singleton_method(mPageCache, "run_static_io_uring_loop",
                               rb_pc_run_static_io_uring_loop, 1);
    rb_define_singleton_method(mPageCache, "io_uring_loop_compiled?",
                               rb_pc_io_uring_loop_compiled_p, 0);
}