hyperion-rb 1.6.2 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4768 -0
  3. data/README.md +222 -13
  4. data/ext/hyperion_h2_codec/Cargo.lock +7 -0
  5. data/ext/hyperion_h2_codec/Cargo.toml +33 -0
  6. data/ext/hyperion_h2_codec/extconf.rb +73 -0
  7. data/ext/hyperion_h2_codec/src/frames.rs +140 -0
  8. data/ext/hyperion_h2_codec/src/hpack/huffman.rs +161 -0
  9. data/ext/hyperion_h2_codec/src/hpack.rs +457 -0
  10. data/ext/hyperion_h2_codec/src/lib.rs +296 -0
  11. data/ext/hyperion_http/extconf.rb +28 -0
  12. data/ext/hyperion_http/h2_codec_glue.c +408 -0
  13. data/ext/hyperion_http/page_cache.c +1125 -0
  14. data/ext/hyperion_http/parser.c +473 -38
  15. data/ext/hyperion_http/sendfile.c +982 -0
  16. data/ext/hyperion_http/websocket.c +493 -0
  17. data/ext/hyperion_io_uring/Cargo.lock +33 -0
  18. data/ext/hyperion_io_uring/Cargo.toml +34 -0
  19. data/ext/hyperion_io_uring/extconf.rb +74 -0
  20. data/ext/hyperion_io_uring/src/lib.rs +316 -0
  21. data/lib/hyperion/adapter/rack.rb +370 -42
  22. data/lib/hyperion/admin_listener.rb +207 -0
  23. data/lib/hyperion/admin_middleware.rb +36 -7
  24. data/lib/hyperion/cli.rb +310 -11
  25. data/lib/hyperion/config.rb +440 -14
  26. data/lib/hyperion/connection.rb +679 -22
  27. data/lib/hyperion/deprecations.rb +81 -0
  28. data/lib/hyperion/dispatch_mode.rb +165 -0
  29. data/lib/hyperion/fiber_local.rb +75 -13
  30. data/lib/hyperion/h2_admission.rb +77 -0
  31. data/lib/hyperion/h2_codec.rb +499 -0
  32. data/lib/hyperion/http/page_cache.rb +122 -0
  33. data/lib/hyperion/http/sendfile.rb +696 -0
  34. data/lib/hyperion/http2/native_hpack_adapter.rb +70 -0
  35. data/lib/hyperion/http2_handler.rb +618 -19
  36. data/lib/hyperion/io_uring.rb +317 -0
  37. data/lib/hyperion/lint_wrapper_pool.rb +126 -0
  38. data/lib/hyperion/master.rb +96 -9
  39. data/lib/hyperion/metrics/path_templater.rb +68 -0
  40. data/lib/hyperion/metrics.rb +256 -0
  41. data/lib/hyperion/prometheus_exporter.rb +150 -0
  42. data/lib/hyperion/request.rb +13 -0
  43. data/lib/hyperion/response_writer.rb +477 -16
  44. data/lib/hyperion/runtime.rb +195 -0
  45. data/lib/hyperion/server/route_table.rb +179 -0
  46. data/lib/hyperion/server.rb +519 -55
  47. data/lib/hyperion/static_preload.rb +133 -0
  48. data/lib/hyperion/thread_pool.rb +61 -7
  49. data/lib/hyperion/tls.rb +343 -1
  50. data/lib/hyperion/version.rb +1 -1
  51. data/lib/hyperion/websocket/close_codes.rb +71 -0
  52. data/lib/hyperion/websocket/connection.rb +876 -0
  53. data/lib/hyperion/websocket/frame.rb +356 -0
  54. data/lib/hyperion/websocket/handshake.rb +525 -0
  55. data/lib/hyperion/worker.rb +111 -9
  56. data/lib/hyperion.rb +137 -3
  57. metadata +50 -1
data/README.md CHANGED
@@ -11,9 +11,155 @@ gem install hyperion-rb
11
11
  bundle exec hyperion config.ru
12
12
  ```
13
13
 
14
+ ## What's new in 2.11.0
15
+
16
+ **h2 cold-stream latency cut + native HPACK CGlue flipped to default.**
17
+ Two perf wins on top of 2.10:
18
+
19
+ - **2.11-A — h2 first-stream TLS handshake parallelization.** The
20
+ 2.10-G `HYPERION_H2_TIMING=1` instrumentation, run against the
21
+ TCP_NODELAY-fixed handler, isolated the residual cold-stream cost
22
+ to **bucket 2**: lazy `task.async {}` fiber spawn for the first
23
+ stream of every connection. Fix: pre-spawn a stream-dispatch fiber
24
+ pool at connection accept (configurable via `HYPERION_H2_DISPATCH_POOL`,
25
+ default 4, ceiling 16). h2load `-c 1 -m 1 -n 50` cold first-run:
26
+ **time-to-1st-byte 20.28 → 9.28 ms (−54%); m=100 throughput +5.5%**.
27
+ Warm steady-state unchanged (no head-of-line blocking under the small
28
+ pool — backlog still spills to ad-hoc `task.async`).
29
+ - **2.11-B — HPACK FFI marshalling round-2 (CGlue flipped to default).**
30
+ Three-way bench (`bench/h2_rails_shape.sh` extended): `ruby` (1,585
31
+ r/s) vs `native v2` (1,602 r/s, +1% — noise) vs `native v3 / CGlue`
32
+ (**2,291 r/s, +43% over v2**). The +18-44% native-vs-Ruby headline
33
+ was almost entirely Fiddle marshalling overhead, not the underlying
34
+ Rust HPACK encoder — same encoder, no per-call FFI marshalling, +43%
35
+ rps. Default flipped: leaving `HYPERION_H2_NATIVE_HPACK` unset now
36
+ selects CGlue. Three escape valves stay (`=v2` to force the old path, `=ruby`
37
+ / `=off` for the pure-Ruby fallback) for any operator that needs
38
+ them. Boot log gains a `native_mode` field documenting which path is
39
+ actually live.
40
+
41
+ Plus operator infrastructure: a cross-platform host-OS portability
42
+ fix in `H2Codec.candidate_paths` for a stale `.dylib` on Linux (it was
43
+ silently falling through to pure-Ruby on the bench host); `bench/h2_rails_shape.sh`
44
+ race-fixed (boot-log probe + stderr routing). Full bench tables and
45
+ flip-decision rationale in [`CHANGELOG.md`](CHANGELOG.md).
46
+
47
+ ## What's new in 2.10.1
48
+
49
+ **Static-asset operator surface (2.10-E) + C-ext fast-path response
50
+ writer (2.10-F).** Two follow-on streams to 2.10's static / direct-route
51
+ work:
52
+
53
+ - **2.10-E — Static asset preload + immutable flag.** Boot-time hook
54
+ warms `Hyperion::Http::PageCache` over a tree of files and marks
55
+ every cached entry immutable. Surface: `--preload-static <dir>` (and
56
+ `--no-preload-static`) CLI flags, `preload_static "/path", immutable:
57
+ true` config DSL key, and zero-config Rails auto-detect that pulls
58
+ `Rails.configuration.assets.paths.first(8)` when present. Hyperion
59
+ never `require`s Rails — purely defensive `defined?(::Rails)`
60
+ probing keeps the generic Rack server path clean. **Operator value:
61
+ predictable first-request latency** (the asset is in cache before
62
+ the first request arrives) and the `recheck_seconds` mtime poll is
63
+ skipped on immutable entries. Sustained-load throughput on the
64
+ static-1-KB bench did *not* move (cold 1,929 r/s vs warm 1,886 r/s,
65
+ inside trial noise) because `ResponseWriter` already auto-caches
66
+ Rack::Files responses on the first hit; preload moves that one
67
+ `cache_file` call from request 1 to boot.
68
+ - **2.10-F — C-ext fast-path response writer for prebuilt responses.**
69
+ `Server.handle_static`-routed requests now serve from a single
70
+ C function (`rb_pc_serve_request` in `ext/hyperion_http/page_cache.c`)
71
+ that does route lookup → header build → `write()` syscall without
72
+ re-entering Ruby on the response side. GVL is released across the
73
+ `write()` so slow clients no longer block other Ruby work on the
74
+ same VM. Automatic HEAD support (HTTP-mandated) lights up on every
75
+ GET registered via `handle_static` — same buffer, body stripped.
76
+ Bench (3-trial median, `wrk -t4 -c100 -d20s`): **5,768 r/s vs
77
+ 2.10-D's 5,619 r/s (+2.6% — inside noise) and p99 1.93 → 1.67 ms
78
+ (−14% — outside noise, reproducible).** The throughput needle didn't
79
+ move because the per-connection lifecycle (accept4 + clone3 + futex
80
+ on GVL handoff) dominates at 100 concurrent connections; 2.10-F
81
+ shrinks the response phase, but the response phase isn't the
82
+ bottleneck on this profile. Durable infrastructure for 2.11+ when
83
+ the accept-loop work closes.
84
+
85
+ Full per-stream details and bench tables in
86
+ [`CHANGELOG.md`](CHANGELOG.md).
87
+
88
+ ## What's new in 2.10.0
89
+
90
+ **4-way bench harness, page cache, direct routes, and the h2 40 ms
91
+ ceiling killed.** This sprint widens the comparison matrix to all four
92
+ major Ruby web servers (Hyperion + Puma + Falcon + Agoo) and ships
93
+ four substantive perf streams against that backdrop:
94
+
95
+ - **2.10-A / 2.10-B — 4-way bench harness + honest baseline.**
96
+ `bench/4way_compare.sh` runs the same 6 workloads (hello, static
97
+ 1 KB / 1 MiB, CPU JSON, PG-bound, SSE) against all four servers from
98
+ one script. Baseline numbers committed *before* any code changes:
99
+ Agoo wins the static-asset and JSON columns by ~2-4×, Hyperion wins
100
+ the static 1 MiB column by 9× and the SSE column by 3.6-17×.
101
+ - **2.10-C — `Hyperion::Http::PageCache` (pre-built static response
102
+ cache).** Open-addressed bucket table behind a pthread mutex
103
+ (GVL-released for writes), engages automatically on `Rack::Files`
104
+ responses. **Static 1 KB: 1,380 → 1,880 r/s (+36%), p99 3.7 → 2.7
105
+ ms.** Closes the Agoo gap from −47% to −28% on that column.
106
+ - **2.10-D — `Hyperion::Server.handle` direct route registration.**
107
+ New API for hot Rack-bypass paths (`Server.handle '/health' do …
108
+ end`, `Server.handle_static '/robots.txt', body: '...'`). Skips Rack
109
+ adapter + env-build for matched routes. **`hello` via
110
+ `handle_static`: 4,408 → 5,619 r/s (+27%), p99 1.93 ms** — the
111
+ cleanest p99 in the 4-way matrix.
112
+ - **2.10-G — h2 max-latency ceiling at ~40 ms: fixed.** Filed by 2.9-B
113
+ as a "first-stream cost" hypothesis, the instrumentation revealed
114
+ it was paid by *every* h2 stream — the canonical Linux delayed-ACK
115
+ + Nagle interaction on small framer writes. One-line fix:
116
+ TCP_NODELAY at accept time. **h2load `-c 1 -m 1 -n 200`: min
117
+ 40.62 → 0.54 ms (−98.7%), throughput 24 → 1,142 r/s (+47.6×).** The
118
+ `HYPERION_H2_TIMING=1` instrumentation stays in place as durable
119
+ diagnostic infrastructure.
120
+
121
+ Full per-stream details, bench numbers, and follow-up items live in
122
+ [`CHANGELOG.md`](CHANGELOG.md).
123
+
124
+ ## What's new in 2.5.0
125
+
126
+ **Native HPACK ON by default + autobahn 100% conformance + request
127
+ hooks.** The Rust HPACK encoder (added in 2.0.0, opt-in until 2.4.x)
128
+ flips ON by default in 2.5.0 — verified **+18% rps on Rails-shape h2
129
+ workloads** (25-header responses, the bench harness lives at
130
+ `bench/h2_rails_shape.ru` + `bench/h2_rails_shape.sh`). RFC 6455
131
+ WebSocket conformance hit **463/463 autobahn-testsuite cases passing**
132
+ (2.5-A, host openclaw-vm). Request lifecycle hooks
133
+ (`Runtime#on_request_start` / `on_request_end`) shipped in 2.5-C —
134
+ recipes in [`docs/OBSERVABILITY.md`](docs/OBSERVABILITY.md).
135
+
136
+ ## What's new in 2.4.0
137
+
138
+ **Production observability.** The `/-/metrics` endpoint now exposes
139
+ per-route latency histograms, per-conn fairness rejections, WebSocket
140
+ permessage-deflate compression ratio, kTLS active connections,
141
+ io_uring-active workers, and ThreadPool queue depth — operators can
142
+ finally see whether the 2.x knobs are firing and how effective they
143
+ are. A pre-built Grafana dashboard ships at
144
+ [`docs/grafana/hyperion-2.4-dashboard.json`](docs/grafana/hyperion-2.4-dashboard.json).
145
+ Full metric reference + operator playbook in
146
+ [`docs/OBSERVABILITY.md`](docs/OBSERVABILITY.md).
147
+
148
+ ## What's new in 2.1.0
149
+
150
+ **WebSockets.** RFC 6455 over Rack 3 full hijack, native frame codec,
151
+ per-connection wrapper with auto-pong / close handshake / UTF-8 validation /
152
+ per-message size cap. **ActionCable on Hyperion is now a single-binary
153
+ deployment** — one `hyperion -w 4 -t 10 config.ru` process serves HTTP,
154
+ HTTP/2, TLS, **and** `/cable` from the same listener; no separate cable
155
+ container required. HTTP/1.1 only this release; WS-over-HTTP/2 (RFC 8441
156
+ Extended CONNECT) and permessage-deflate (RFC 7692) defer to 2.2.x.
157
+ See [`docs/WEBSOCKETS.md`](docs/WEBSOCKETS.md).
158
+
14
159
  ## Highlights
15
160
 
16
161
  - **HTTP/1.1 + HTTP/2 + TLS** out of the box (HTTP/2 with per-stream fiber multiplexing, WINDOW_UPDATE-aware flow control, ALPN auto-negotiation).
162
+ - **WebSockets (RFC 6455)** — full handshake, native frame codec, per-connection wrapper. ActionCable + faye-websocket work on a single-binary deploy. See [`docs/WEBSOCKETS.md`](docs/WEBSOCKETS.md). (2.1.0+, HTTP/1.1 only.)
17
163
  - **Pre-fork cluster** with per-OS worker model: `SO_REUSEPORT` on Linux, master-bind + worker-fd-share on macOS/BSD (Darwin's `SO_REUSEPORT` doesn't load-balance).
18
164
  - **Hybrid concurrency**: fiber-per-connection for I/O, OS-thread pool for `app.call(env)` — synchronous Rack handlers (Rails, ActiveRecord, anything holding a global mutex) get true OS-thread concurrency.
19
165
  - **Vendored llhttp 9.3.0** C parser; pure-Ruby fallback for non-MRI runtimes.
@@ -25,13 +171,43 @@ bundle exec hyperion config.ru
25
171
 
26
172
  ## Benchmarks
27
173
 
28
- All numbers are real wrk runs against published Hyperion configs. Hyperion ships **with default-ON structured access logs**; Puma comparisons use Puma defaults (no per-request log emission). Each section is stamped with the Hyperion version it was measured against — newer versions (1.3.0+ `--async-io`, 1.4.0+ TLS h1 inline, 1.4.1+ Metrics fiber-key fix, 1.6.0+ HTTP/2 writer fiber + 3 C-ext additions) preserve or improve these numbers; we re-run the headline configs each release and have not seen regressions on these workloads.
29
-
30
- > **Comprehensive matrix for 1.6.0 + hyperion-async-pg 0.5.0 (16-vCPU Linux, 9 workloads × 25+ configs)**: see [`docs/BENCH_2026_04_27.md`](docs/BENCH_2026_04_27.md). Headline: 98,818 r/s on hello `-w 16`, 21,215 r/s `-w 4` at p99 < 2 ms, 2,180 r/s on a 50 ms-waiting PG workload (4.1× the best Puma), 1,667 req/s HTTP/2 multiplexed at 0 errors, 155 MB RSS for 10k idle keep-alive connections.
174
+ All numbers are real wrk runs against published Hyperion configs. Hyperion ships **with default-ON structured access logs**; Puma comparisons use Puma defaults (no per-request log emission). Each section is stamped with the Hyperion version and bench host it was measured on — bench-host drift over time is real (see "Bench-host drift" note below).
175
+
176
+ **Headline doc**: the most recent comprehensive sweep is
177
+ [`docs/BENCH_HYPERION_2_0.md`](docs/BENCH_HYPERION_2_0.md) (Hyperion
178
+ 2.0.0 vs Puma 8.0.1, 16-vCPU Ubuntu 24.04, 12 workloads). The 1.6.0
179
+ matrix at [`docs/BENCH_2026_04_27.md`](docs/BENCH_2026_04_27.md) covers
180
+ 9 workloads × 25+ configs against hyperion-async-pg 0.5.0; both docs
181
+ include caveats and per-row reproduction commands.
182
+
183
+ > **Bench-host drift note (2026-05-01).** A spot-check rerun on
184
+ > `openclaw-vm` 5 days after the 2.0.0 sweep showed Puma 8.0.1 and
185
+ > Hyperion 2.0.0 baseline numbers had drifted 14-32% downward from the
186
+ > 2026-04-29 sweep with no code changes — the bench host runs other
187
+ > workloads in the background and is a single VM (KVM CPU). Numbers in
188
+ > this README and BENCH docs are snapshots; expect ±10-30% absolute
189
+ > drift between sweep dates. **The relative position (Hyperion vs Puma
190
+ > at matched config) is the durable signal**; e.g. Hyperion `-w 16 -t 5`
191
+ > hello-world today is 76,593 r/s vs Puma 8.0.1 `-w 16 -t 5:5` at 55,609
192
+ > r/s, **+37.7% over Puma** — wider than the 2.0.0 sweep's +27.8% even
193
+ > though absolute rps is lower. Reproduce: `bundle exec bin/hyperion
194
+ > -p 9501 -w 16 -t 5 bench/hello.ru` then `wrk -t4 -c200 -d20s
195
+ > http://127.0.0.1:9501/`.
196
+
197
+ > **Topology relevance.** Hyperion is built to run **fronted by nginx
198
+ > or an L7 load balancer** in most production deployments — plaintext
199
+ > HTTP/1.1 upstream, TLS terminated at the LB. The benches in this
200
+ > README that match that topology are: hello-world, CPU JSON, static,
201
+ > SSE, PG, WebSocket. Benches that are **bench-only for nginx-fronted
202
+ > ops** (the LB → upstream hop is plaintext h1 regardless): TLS h1,
203
+ > HTTP/2, kTLS_TX. Those rows still ship for operators who terminate
204
+ > TLS / h2 at Hyperion directly (small static fleets, edge boxes), but
205
+ > don't chase the +60% TLS-h1 win unless you actually terminate TLS at
206
+ > Hyperion.
31
207
 
32
208
  ### Hello-world Rack app
33
209
 
34
- `bench/hello.ru`, single worker, parity threads (`-t 5` vs Puma `-t 5:5`), 4 wrk threads / 100 connections / 15s, macOS arm64 / Ruby 3.3.3, Hyperion 1.2.0:
210
+ `bench/hello.ru`, single worker, parity threads (`-t 5` vs Puma `-t 5:5`), 4 wrk threads / 100 connections / 15s, macOS arm64 / Ruby 3.3.3, Hyperion 1.2.0. **macOS dev numbers; the headline Linux 2.0.0 bench is in [`docs/BENCH_HYPERION_2_0.md`](docs/BENCH_HYPERION_2_0.md)**:
35
211
 
36
212
  | | r/s | p99 | tail vs Hyperion |
37
213
  |---|---:|---:|---:|
@@ -69,7 +245,7 @@ Bench is **wait-bound** — ~3-4 ms median is the PG + Redis round-trip, dwarfin
69
245
 
70
246
  Ubuntu 24.04 / 16 vCPU / Ruby 3.3.3, Postgres 17 over WAN, `wrk -t4 -c200 -d20s`. Single worker (`-w 1`) unless noted. All configs returned 0 non-2xx and 0 timeouts. RSS sampled mid-run via `ps -o rss`.
71
247
 
72
- **Wait-bound workload** (`bench/pg_concurrent.ru`: `SELECT pg_sleep(0.05)` + tiny JSON):
248
+ **Wait-bound workload** (`pg_concurrent.ru`: `SELECT pg_sleep(0.05)` + tiny JSON; rackup lives in the [hyperion-async-pg companion repo](https://github.com/andrew-woblavobla/hyperion-async-pg) and on the bench host at `~/bench/pg_concurrent.ru`, not in this repo):
73
249
 
74
250
  | | r/s | p99 | RSS | vs Puma `-t 5` |
75
251
  |---|---:|---:|---:|---:|
@@ -83,7 +259,7 @@ Ubuntu 24.04 / 16 vCPU / Ruby 3.3.3, Postgres 17 over WAN, `wrk -t4 -c200 -d20s`
83
259
  | Hyperion `--async-io -w 4 -t 5` pool=64 | 1937.5 | 4.84 s | 416 MB | 34.3× (cold-start p99 — see note) |
84
260
  | Falcon 0.55.3 `--count 1` pool=128 | 1665.7 | 516 ms | 141 MB | 29.5× |
85
261
 
86
- **Mixed CPU+wait** (`bench/pg_mixed.ru`: same query + 50-key JSON serialization, ~5 ms CPU):
262
+ **Mixed CPU+wait** (`pg_mixed.ru`: same query + 50-key JSON serialization, ~5 ms CPU; rackup lives in hyperion-async-pg + on the bench host at `~/bench/pg_mixed.ru`, not in this repo):
87
263
 
88
264
  | | r/s | p99 | RSS | vs Puma `-t 30` |
89
265
  |---|---:|---:|---:|---:|
@@ -94,11 +270,12 @@ Ubuntu 24.04 / 16 vCPU / Ruby 3.3.3, Postgres 17 over WAN, `wrk -t4 -c200 -d20s`
94
270
  | Falcon `--count 1` pool=128 | 1642.1 | 531 ms | 213 MB | 4.7× |
95
271
 
96
272
  **Takeaways:**
97
- 1. **Linear scaling with pool size** under `--async-io` — `r/s ≈ pool × 12` on this WAN bench. Single-worker pool=200 hits 2381 r/s, **42× Puma `-t 5`** and **5.9× Puma's best** (`-t 30`).
273
+ 1. **Linear scaling with pool size** under `--async-io` — `r/s ≈ pool × 12` on this WAN bench. Single-worker pool=200 hits 2381 r/s. The "**42× Puma `-t 5`**" and "**5.9× Puma's best**" framings above use Puma's pool=5 (timeout-floor) and pool=30 (mid-tier) rows respectively — fair comparisons on the *same* bench fixture, but a Puma operator who sizes their pool to match (`-t 100 pool=100` row above) lands at 1,067 r/s, so the **honest "Puma at its own best vs Hyperion at its own best" ratio is 2,381 / 1,067 ≈ 2.2×**, not 42×. The architectural win — fiber-pool grows to pool=200 without OS-thread cost — is real; the 42× headline is a configuration-difference effect, not a steady-state gap on matched configs.
98
274
  2. **Mixed workload doesn't kill the win** — Hyperion `--async-io` pool=128 actually goes *up* on mixed (1740 vs 1344 r/s) because CPU work overlaps other fibers' PG-wait windows. This is the honest "what happens to a real Rails handler" answer.
99
275
  3. **Hyperion ≈ Falcon within 3-7%** across pool sizes; both fiber-native architectures extract similar value from `hyperion-async-pg`.
100
276
  4. **RSS at single-worker scale isn't the architectural moat** — Linux thread stacks are demand-paged; PG connection buffers dominate RSS at pool sizes ≤ 200. The architectural win is **handler concurrency under load**, not idle memory: Hyperion's fiber path runs thousands of in-flight handler invocations per OS thread, so wait-bound handlers don't queue at `max_threads`. See [Concurrency at scale](#concurrency-at-scale-architectural-advantages) for both the throughput-under-load row and a measured 10k-idle-keepalive RSS sweep against Puma and Falcon.
101
277
  5. **`-w 4` cold-start caveat** — multi-worker p99 inflates because the bench rackup uses lazy per-process pool init (each worker pays full pool fill on its first request). Production apps avoid this with `on_worker_boot { Hyperion::AsyncPg::FiberPool.new(...).fill }`.
278
+ 6. **Apples-to-apples PG note**: the row above uses `pg.wobla.space` WAN PG with `max_connections=500`. Earlier sweeps that compared Hyperion (WAN, max_conn=500) against Puma (local, max_conn=100) overstated the ratio because the Puma side timed out at the local pool ceiling. The 2.0.0 bench doc carries this caveat in the row 7 verification section; treat any "Hyperion 4× Puma on PG" headline as **indicative**, not precisely calibrated, until rerun against matched-pool PG.
102
279
 
103
280
  Three things must all be true to get this win:
104
281
  1. **`async_io: true`** in your Hyperion config (or `--async-io` CLI flag). Default is off to keep 1.2.0's raw-loop perf for fiber-unaware apps.
@@ -205,19 +382,51 @@ Hyperion fans 100 in-flight streams across separate fibers within a single TCP c
205
382
 
206
383
  > **1.6.0 outbound write path** — `Http2Handler` no longer serializes every framer write through one `Mutex#synchronize { socket.write(...) }`. HPACK encoding (microseconds, in-memory) still serializes on a fast encode mutex, but the actual `socket.write` is owned by a dedicated per-connection writer fiber draining a queue. On per-connection multi-stream workloads where the kernel send buffer or peer reads are slow, encode work for ready streams overlaps the writer's flush of earlier chunks, instead of stacking up behind it. See `bench/h2_streams.sh` (`h2load -c 1 -m 100 -n 5000`) for a recipe to compare 1.5.0 vs 1.6.0 on a workload of your choice.
207
384
 
208
- ### Reproduce
385
+ ### Reproducing the benchmarks
386
+
387
+ Every number in this README and `docs/BENCH_HYPERION_2_0.md` is reproducible. Operators who don't trust headline numbers (and you shouldn't trust *any* benchmark numbers without independent verification) can rerun the workloads on their own host and get their own honest measurements. Per-row reproduction commands:
209
388
 
210
389
  ```sh
211
- # hello-world
212
- bundle exec ruby bench/compare.rb
213
- HYPERION_WORKERS=4 PUMA_WORKERS=4 FALCON_COUNT=4 bundle exec ruby bench/compare.rb
390
+ # Setup (once)
391
+ bundle install
392
+ bundle exec rake compile
393
+
394
+ # Hello-world (rps + p99 ceiling, no I/O)
395
+ bundle exec bin/hyperion -p 9292 -w 16 -t 5 bench/hello.ru &
396
+ wrk -t4 -c200 -d20s --latency http://127.0.0.1:9292/
397
+
398
+ # CPU-bound JSON (per-request CPU savings visible)
399
+ bundle exec bin/hyperion -p 9292 -w 4 -t 5 bench/work.ru &
400
+ wrk -t4 -c200 -d15s --latency http://127.0.0.1:9292/
401
+
402
+ # Static 1 MiB sendfile path
403
+ ruby -e 'File.binwrite("/tmp/hyperion_bench_asset_1m.bin", "x" * (1024*1024))'
404
+ bundle exec bin/hyperion -p 9292 -w 1 -t 5 bench/static.ru &
405
+ wrk -t4 -c100 -d15s --latency http://127.0.0.1:9292/hyperion_bench_asset_1m.bin
406
+
407
+ # SSE streaming (Hyperion-shaped rackup with explicit flush sentinel — see caveat in BENCH doc)
408
+ bundle exec bin/hyperion -p 9292 -w 1 -t 5 bench/sse.ru &
409
+ wrk -t1 -c1 -d10s http://127.0.0.1:9292/
410
+
411
+ # WebSocket multi-process throughput
412
+ bundle exec bin/hyperion -p 9888 -w 4 -t 64 bench/ws_echo.ru &
413
+ ruby bench/ws_bench_client_multi.rb --port 9888 --procs 4 --conns 200 --msgs 1000 --bytes 1024 --json
414
+
415
+ # h2 native HPACK (Rails-shape, 25-header response)
416
+ ./bench/h2_rails_shape.sh
214
417
 
215
418
  # Idle keep-alive RSS sweep (1k / 5k / 10k conns, 30s hold per server)
216
419
  ./bench/keepalive_memory.sh
217
420
 
218
- # Real Rails / Grape: see bench/db.ru for the schema
421
+ # Hello-world quick comparator (Hyperion vs Puma vs Falcon)
422
+ bundle exec ruby bench/compare.rb
423
+ HYPERION_WORKERS=4 PUMA_WORKERS=4 FALCON_COUNT=4 bundle exec ruby bench/compare.rb
219
424
  ```
220
425
 
426
+ PG benches (`pg_concurrent.ru`, `pg_mixed.ru`, `pg_realistic.ru`) live in the [hyperion-async-pg companion repo](https://github.com/andrew-woblavobla/hyperion-async-pg) — they require a running Postgres + the companion gem and are not part of this repo. The 2.0.0 sweep used `~/bench/pg_concurrent.ru` on the bench host; reproduce by cloning hyperion-async-pg and following its README, or `scp` the rackup + DATABASE_URL.
427
+
428
+ When numbers from your host don't match the published numbers, the most likely explanations (in order): (1) bench-host noise — single-VM benches drift 10-30% over days; (2) Puma version mismatch — the 2.0.0 sweep used Puma 8.0.1 in the `~/bench/Gemfile`, the hyperion repo's own Gemfile pins Puma `~> 6.4`; (3) different kernel / Ruby; (4) different `-t` / `-c` (apples-to-apples requires identical worker count, thread count, wrk concurrency, payload size, kernel, Ruby, TLS cipher).
429
+
221
430
  ## Quick start
222
431
 
223
432
  ```sh
@@ -332,7 +541,7 @@ Concrete tradeoffs distilled from [`docs/BENCH_2026_04_27.md`](docs/BENCH_2026_0
332
541
  |---|---|---|
333
542
  | **Pure I/O-bound** (PG / Redis / external HTTP, no significant CPU) | `-w 1` + larger pool | Bench: `-w 1 pool=200` = 87 MB / 2,180 r/s vs `-w 4 pool=64` = 224 MB / 1,680 r/s. **2.6× more memory, 0.77× rps** if you pick multi-worker on a wait-bound workload. |
334
543
  | **Pure CPU-bound** (heavy JSON / template render / image processing) | `-w N` matching CPU count | Each worker's accept loop is single-threaded under `--async-io`; multi-worker gives CPU-parallelism. Bench: `-w 16 -t 5` hits 98,818 r/s on a 16-vCPU box, 4.7× a `-w 1` ceiling on the same hardware. |
335
- | **Mixed** (Rails-shaped: ~5 ms CPU + 50 ms PG wait per request) | `-w N/2` (half cores) + medium pool | Lets CPU work parallelise while keeping per-worker memory tractable. Bench `pg_mixed.ru` at `-w 4 -t 5 pool=128` = 1,740 r/s with no cold-start spike (ForkSafe `prefill_in_child: true`). |
544
+ | **Mixed** (Rails-shaped: ~5 ms CPU + 50 ms PG wait per request) | `-w N/2` (half cores) + medium pool | Lets CPU work parallelise while keeping per-worker memory tractable. Bench `pg_mixed.ru` (in hyperion-async-pg repo / `~/bench/`) at `-w 4 -t 5 pool=128` = 1,740 r/s with no cold-start spike (ForkSafe `prefill_in_child: true`). |
336
545
 
337
546
  Multi-worker on PG-wait workloads is the **wrong** default for most apps — the headline rps doesn't justify the memory and PG-connection cost. Verify your shape with the bench before scaling out.
338
547
 
@@ -0,0 +1,7 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 3
4
+
5
+ [[package]]
6
+ name = "hyperion_h2_codec"
7
+ version = "2.0.0"
@@ -0,0 +1,33 @@
1
+ [package]
2
+ name = "hyperion_h2_codec"
3
+ version = "2.0.0"
4
+ edition = "2021"
5
+ publish = false
6
+ description = "Native HPACK + HTTP/2 frame codec for Hyperion (Phase 6)"
7
+
8
+ [lib]
9
+ crate-type = ["cdylib"]
10
+ name = "hyperion_h2_codec"
11
+
12
+ # Phase 6 takes the simplest reliable path: a self-contained,
13
+ # zero-dependency RFC 7541 / RFC 7540 implementation in Rust, exposed
14
+ # to Ruby via a thin extern "C" surface and called from MRI through
15
+ # Fiddle. This avoids the magnus toolchain dependency tree (which
16
+ # pulls in 10+ transitive crates and sometimes locks to newer rustc
17
+ # than what `rustup show` reports), at the cost of slightly more
18
+ # verbose Ruby-side glue. The trade-off was right for 2.0: the
19
+ # alternative was deferring Phase 6 because a build-time `cargo fetch`
20
+ # could fail in air-gapped or CI-cache-cold environments.
21
+ #
22
+ # If/when we want a richer Ruby-Rust API surface, the next step is to
23
+ # add `magnus = "0.6"` here and rewrite `lib.rs::ffi` as a magnus
24
+ # `#[magnus::init]` block; the inner pure-Rust modules
25
+ # (`hpack`/`frames`) are deliberately decoupled from the FFI shape so
26
+ # that swap is local.
27
+ [dependencies]
28
+
29
+ [profile.release]
30
+ opt-level = 3
31
+ lto = true
32
+ codegen-units = 1
33
+ strip = true
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Build the hyperion_h2_codec Rust extension.
4
+ #
5
+ # Phase 6 (RFC 2.0 §3): native HPACK encode/decode + frame ser/de via
6
+ # a Rust crate. This extconf.rb is invoked by `gem install`; it shells
7
+ # out to `cargo build --release` and writes a Makefile that copies the
8
+ # resulting cdylib into `lib/hyperion_h2_codec/` so that
9
+ # `lib/hyperion/h2_codec.rb` can find it via Fiddle.
10
+ #
11
+ # Cargo is OPTIONAL. If it's missing, the extconf writes a stub
12
+ # Makefile that prints a friendly note and exits cleanly — Hyperion
13
+ # still ships and falls back to the existing Ruby HPACK path
14
+ # (`Hyperion::H2Codec.available?` returns false). Operators who want
15
+ # the perf bump install Rust via `rustup` and `gem pristine
16
+ # hyperion-rb` to rebuild.
17
+ #
18
+ # Cross-platform notes:
19
+ # * Linux + GNU libc: cargo emits `libhyperion_h2_codec.so`.
20
+ # * macOS: `libhyperion_h2_codec.dylib`.
21
+ # * The Ruby loader (Fiddle) probes both extensions in order, so we
22
+ # copy whichever cargo produced into the gem's lib_dir under both
23
+ # names where convenient.
24
+
25
+ require 'mkmf'
26
+ require 'fileutils'
27
+ require 'rbconfig'
28
+
29
+ ext_dir = __dir__
30
+ crate_dir = ext_dir
31
+ target_dir = File.join(crate_dir, 'target', 'release')
32
+ gem_lib_dir = File.expand_path('../../lib/hyperion_h2_codec', __dir__)
33
+
34
+ cargo_present = system('cargo --version > /dev/null 2>&1')
35
+
36
+ if cargo_present
37
+ warn '[hyperion_h2_codec] cargo detected — building native HPACK extension'
38
+ Dir.chdir(crate_dir) do
39
+ ok = system('cargo build --release')
40
+ unless ok
41
+ warn '[hyperion_h2_codec] cargo build failed; falling back to pure-Ruby HPACK path'
42
+ cargo_present = false
43
+ end
44
+ end
45
+ end
46
+
47
+ FileUtils.mkdir_p(gem_lib_dir)
48
+
49
+ if cargo_present
50
+ candidates = %w[libhyperion_h2_codec.dylib libhyperion_h2_codec.so]
51
+ found = candidates.find { |c| File.exist?(File.join(target_dir, c)) }
52
+ if found
53
+ src = File.join(target_dir, found)
54
+ dst = File.join(gem_lib_dir, found)
55
+ FileUtils.cp(src, dst)
56
+ warn "[hyperion_h2_codec] installed #{dst}"
57
+ else
58
+ warn '[hyperion_h2_codec] cargo finished but no cdylib artifact found; falling back'
59
+ cargo_present = false
60
+ end
61
+ end
62
+
63
+ # Always emit a Makefile — gem install protocol expects one. The body
64
+ # is a no-op when cargo isn't present so `make` exits 0 and gem
65
+ # install completes.
66
+ File.open(File.join(ext_dir, 'Makefile'), 'w') do |f|
67
+ f.puts 'all:'
68
+ f.puts "\t@echo \"[hyperion_h2_codec] no-op make (cargo handled the build)\""
69
+ f.puts 'clean:'
70
+ f.puts "\t@rm -rf target"
71
+ f.puts 'install:'
72
+ f.puts "\t@echo \"[hyperion_h2_codec] no-op install (artifact already in lib/)\""
73
+ end
@@ -0,0 +1,140 @@
1
+ //! HTTP/2 frame primitives (RFC 7540 §6).
2
+ //!
3
+ //! Phase 6a only ships the simplest frame types the writer fiber
4
+ //! needs in the response path: HEADERS, DATA, RST_STREAM, WINDOW_UPDATE.
5
+ //! The h2 connection state machine continues to be driven by
6
+ //! `protocol-http2` for now — we just expose the wire-formatting
7
+ //! primitives so a future Phase 6b can replace the Ruby-side framer.
8
+
9
+ use std::fmt;
10
+
11
+ /// HPACK error type, shared between encoder/decoder. Public so the
12
+ /// FFI layer can surface a numeric code to Ruby.
13
+ #[derive(Debug)]
14
+ pub enum HpackError {
15
+ Truncated,
16
+ Overflow,
17
+ BadIndex,
18
+ ZeroIndex,
19
+ HuffmanInvalid,
20
+ }
21
+
22
+ impl fmt::Display for HpackError {
23
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
24
+ match self {
25
+ HpackError::Truncated => write!(f, "truncated input"),
26
+ HpackError::Overflow => write!(f, "integer overflow"),
27
+ HpackError::BadIndex => write!(f, "invalid HPACK index"),
28
+ HpackError::ZeroIndex => write!(f, "HPACK index 0 is reserved"),
29
+ HpackError::HuffmanInvalid => write!(f, "invalid Huffman sequence"),
30
+ }
31
+ }
32
+ }
33
+
34
+ // Frame type constants. RFC 7540 §11.2 / RFC 9113 §6.
35
+ // Some are unused by the Phase 6a wire path but ship now so that the
36
+ // FFI surface, when expanded in Phase 6b, doesn't churn ABI numbers.
37
+ #[allow(dead_code)]
38
+ pub const FRAME_DATA: u8 = 0x0;
39
+ #[allow(dead_code)]
40
+ pub const FRAME_HEADERS: u8 = 0x1;
41
+ #[allow(dead_code)]
42
+ pub const FRAME_RST_STREAM: u8 = 0x3;
43
+ #[allow(dead_code)]
44
+ pub const FRAME_SETTINGS: u8 = 0x4;
45
+ #[allow(dead_code)]
46
+ pub const FRAME_PING: u8 = 0x6;
47
+ #[allow(dead_code)]
48
+ pub const FRAME_GOAWAY: u8 = 0x7;
49
+ #[allow(dead_code)]
50
+ pub const FRAME_WINDOW_UPDATE: u8 = 0x8;
51
+ #[allow(dead_code)]
52
+ pub const FRAME_CONTINUATION: u8 = 0x9;
53
+
54
+ #[allow(dead_code)]
55
+ pub const FLAG_END_STREAM: u8 = 0x1;
56
+ #[allow(dead_code)]
57
+ pub const FLAG_END_HEADERS: u8 = 0x4;
58
+
59
+ /// 9-byte frame header + payload writer (RFC 7540 §4.1).
60
+ fn write_frame_header(out: &mut Vec<u8>, len: u32, kind: u8, flags: u8, stream_id: u32) {
61
+ out.push(((len >> 16) & 0xff) as u8);
62
+ out.push(((len >> 8) & 0xff) as u8);
63
+ out.push((len & 0xff) as u8);
64
+ out.push(kind);
65
+ out.push(flags);
66
+ let sid = stream_id & 0x7fff_ffff; // R-bit cleared per spec
67
+ out.push(((sid >> 24) & 0xff) as u8);
68
+ out.push(((sid >> 16) & 0xff) as u8);
69
+ out.push(((sid >> 8) & 0xff) as u8);
70
+ out.push((sid & 0xff) as u8);
71
+ }
72
+
73
+ pub fn encode_data_frame(stream_id: u32, end_stream: bool, payload: &[u8]) -> Vec<u8> {
74
+ let mut out = Vec::with_capacity(9 + payload.len());
75
+ let flags = if end_stream { FLAG_END_STREAM } else { 0 };
76
+ write_frame_header(&mut out, payload.len() as u32, FRAME_DATA, flags, stream_id);
77
+ out.extend_from_slice(payload);
78
+ out
79
+ }
80
+
81
+ #[allow(dead_code)]
82
+ pub fn encode_headers_frame(
83
+ stream_id: u32,
84
+ end_stream: bool,
85
+ end_headers: bool,
86
+ block: &[u8],
87
+ ) -> Vec<u8> {
88
+ let mut out = Vec::with_capacity(9 + block.len());
89
+ let mut flags = 0u8;
90
+ if end_stream {
91
+ flags |= FLAG_END_STREAM;
92
+ }
93
+ if end_headers {
94
+ flags |= FLAG_END_HEADERS;
95
+ }
96
+ write_frame_header(&mut out, block.len() as u32, FRAME_HEADERS, flags, stream_id);
97
+ out.extend_from_slice(block);
98
+ out
99
+ }
100
+
101
+ #[allow(dead_code)]
102
+ pub fn encode_rst_stream(stream_id: u32, error_code: u32) -> Vec<u8> {
103
+ let mut out = Vec::with_capacity(9 + 4);
104
+ write_frame_header(&mut out, 4, FRAME_RST_STREAM, 0, stream_id);
105
+ out.extend_from_slice(&error_code.to_be_bytes());
106
+ out
107
+ }
108
+
109
+ #[allow(dead_code)]
110
+ pub fn encode_window_update(stream_id: u32, increment: u32) -> Vec<u8> {
111
+ let mut out = Vec::with_capacity(9 + 4);
112
+ let inc = increment & 0x7fff_ffff;
113
+ write_frame_header(&mut out, 4, FRAME_WINDOW_UPDATE, 0, stream_id);
114
+ out.extend_from_slice(&inc.to_be_bytes());
115
+ out
116
+ }
117
+
118
+ #[cfg(test)]
119
+ mod tests {
120
+ use super::*;
121
+
122
+ #[test]
123
+ fn data_frame_layout() {
124
+ let frame = encode_data_frame(1, true, b"hello");
125
+ // 9-byte header: 00 00 05 00 01 00 00 00 01
126
+ assert_eq!(frame[0..3], [0, 0, 5]);
127
+ assert_eq!(frame[3], FRAME_DATA);
128
+ assert_eq!(frame[4], FLAG_END_STREAM);
129
+ assert_eq!(u32::from_be_bytes([frame[5], frame[6], frame[7], frame[8]]), 1);
130
+ assert_eq!(&frame[9..], b"hello");
131
+ }
132
+
133
+ #[test]
134
+ fn rst_stream_layout() {
135
+ let frame = encode_rst_stream(3, 0xa);
136
+ assert_eq!(frame[3], FRAME_RST_STREAM);
137
+ assert_eq!(u32::from_be_bytes([frame[5], frame[6], frame[7], frame[8]]), 3);
138
+ assert_eq!(u32::from_be_bytes([frame[9], frame[10], frame[11], frame[12]]), 0xa);
139
+ }
140
+ }