hyperion-rb 1.6.2 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4768 -0
  3. data/README.md +222 -13
  4. data/ext/hyperion_h2_codec/Cargo.lock +7 -0
  5. data/ext/hyperion_h2_codec/Cargo.toml +33 -0
  6. data/ext/hyperion_h2_codec/extconf.rb +73 -0
  7. data/ext/hyperion_h2_codec/src/frames.rs +140 -0
  8. data/ext/hyperion_h2_codec/src/hpack/huffman.rs +161 -0
  9. data/ext/hyperion_h2_codec/src/hpack.rs +457 -0
  10. data/ext/hyperion_h2_codec/src/lib.rs +296 -0
  11. data/ext/hyperion_http/extconf.rb +28 -0
  12. data/ext/hyperion_http/h2_codec_glue.c +408 -0
  13. data/ext/hyperion_http/page_cache.c +1125 -0
  14. data/ext/hyperion_http/parser.c +473 -38
  15. data/ext/hyperion_http/sendfile.c +982 -0
  16. data/ext/hyperion_http/websocket.c +493 -0
  17. data/ext/hyperion_io_uring/Cargo.lock +33 -0
  18. data/ext/hyperion_io_uring/Cargo.toml +34 -0
  19. data/ext/hyperion_io_uring/extconf.rb +74 -0
  20. data/ext/hyperion_io_uring/src/lib.rs +316 -0
  21. data/lib/hyperion/adapter/rack.rb +370 -42
  22. data/lib/hyperion/admin_listener.rb +207 -0
  23. data/lib/hyperion/admin_middleware.rb +36 -7
  24. data/lib/hyperion/cli.rb +310 -11
  25. data/lib/hyperion/config.rb +440 -14
  26. data/lib/hyperion/connection.rb +679 -22
  27. data/lib/hyperion/deprecations.rb +81 -0
  28. data/lib/hyperion/dispatch_mode.rb +165 -0
  29. data/lib/hyperion/fiber_local.rb +75 -13
  30. data/lib/hyperion/h2_admission.rb +77 -0
  31. data/lib/hyperion/h2_codec.rb +499 -0
  32. data/lib/hyperion/http/page_cache.rb +122 -0
  33. data/lib/hyperion/http/sendfile.rb +696 -0
  34. data/lib/hyperion/http2/native_hpack_adapter.rb +70 -0
  35. data/lib/hyperion/http2_handler.rb +618 -19
  36. data/lib/hyperion/io_uring.rb +317 -0
  37. data/lib/hyperion/lint_wrapper_pool.rb +126 -0
  38. data/lib/hyperion/master.rb +96 -9
  39. data/lib/hyperion/metrics/path_templater.rb +68 -0
  40. data/lib/hyperion/metrics.rb +256 -0
  41. data/lib/hyperion/prometheus_exporter.rb +150 -0
  42. data/lib/hyperion/request.rb +13 -0
  43. data/lib/hyperion/response_writer.rb +477 -16
  44. data/lib/hyperion/runtime.rb +195 -0
  45. data/lib/hyperion/server/route_table.rb +179 -0
  46. data/lib/hyperion/server.rb +519 -55
  47. data/lib/hyperion/static_preload.rb +133 -0
  48. data/lib/hyperion/thread_pool.rb +61 -7
  49. data/lib/hyperion/tls.rb +343 -1
  50. data/lib/hyperion/version.rb +1 -1
  51. data/lib/hyperion/websocket/close_codes.rb +71 -0
  52. data/lib/hyperion/websocket/connection.rb +876 -0
  53. data/lib/hyperion/websocket/frame.rb +356 -0
  54. data/lib/hyperion/websocket/handshake.rb +525 -0
  55. data/lib/hyperion/worker.rb +111 -9
  56. data/lib/hyperion.rb +137 -3
  57. metadata +50 -1
data/README.md CHANGED
@@ -11,9 +11,155 @@ gem install hyperion-rb
11
11
  bundle exec hyperion config.ru
12
12
  ```
13
13
 
14
+ ## What's new in 2.11.0
15
+
16
+ **h2 cold-stream latency cut + native HPACK CGlue flipped to default.**
17
+ Two perf wins on top of 2.10:
18
+
19
+ - **2.11-A — h2 first-stream TLS handshake parallelization.** The
20
+ 2.10-G `HYPERION_H2_TIMING=1` instrumentation, run against the
21
+ TCP_NODELAY-fixed handler, isolated the residual cold-stream cost
22
+ to **bucket 2**: lazy `task.async {}` fiber spawn for the first
23
+ stream of every connection. Fix: pre-spawn a stream-dispatch fiber
24
+ pool at connection accept (configurable via `HYPERION_H2_DISPATCH_POOL`,
25
+ default 4, ceiling 16). h2load `-c 1 -m 1 -n 50` cold first-run:
26
+ **time-to-1st-byte 20.28 → 9.28 ms (−54%); m=100 throughput +5.5%**.
27
+ Warm steady-state unchanged (no head-of-line blocking under the small
28
+ pool — backlog still spills to ad-hoc `task.async`).
29
+ - **2.11-B — HPACK FFI marshalling round-2 (CGlue flipped to default).**
30
+ Three-way bench (`bench/h2_rails_shape.sh` extended): `ruby` (1,585
31
+ r/s) vs `native v2` (1,602 r/s, +1% — noise) vs `native v3 / CGlue`
32
+ (**2,291 r/s, +43% over v2**). The +18-44% native-vs-Ruby headline
33
+ was almost entirely Fiddle marshalling overhead, not the underlying
34
+ Rust HPACK encoder — same encoder, no per-call FFI marshalling, +43%
35
+ rps. Default flipped: leaving `HYPERION_H2_NATIVE_HPACK` unset now
36
+ selects CGlue. Three escape valves stay (`=v2` to force the old path, `=ruby`
37
+ / `=off` for the pure-Ruby fallback) for any operator that needs
38
+ them. Boot log gains a `native_mode` field documenting which path is
39
+ actually live.
40
+
41
+ Plus operator infrastructure: a cross-platform host-OS portability
42
+ fix in `H2Codec.candidate_paths` for a stale `.dylib` on Linux (it was
43
+ silently falling through to pure-Ruby on the bench host); `bench/h2_rails_shape.sh`
44
+ race-fixed (boot-log probe + stderr routing). Full bench tables and
45
+ flip-decision rationale in [`CHANGELOG.md`](CHANGELOG.md).
46
+
47
+ ## What's new in 2.10.1
48
+
49
+ **Static-asset operator surface (2.10-E) + C-ext fast-path response
50
+ writer (2.10-F).** Two follow-on streams to 2.10's static / direct-route
51
+ work:
52
+
53
+ - **2.10-E — Static asset preload + immutable flag.** Boot-time hook
54
+ warms `Hyperion::Http::PageCache` over a tree of files and marks
55
+ every cached entry immutable. Surface: `--preload-static <dir>` (and
56
+ `--no-preload-static`) CLI flags, `preload_static "/path", immutable:
57
+ true` config DSL key, and zero-config Rails auto-detect that pulls
58
+ `Rails.configuration.assets.paths.first(8)` when present. Hyperion
59
+ never `require`s Rails — purely defensive `defined?(::Rails)`
60
+ probing keeps the generic Rack server path clean. **Operator value:
61
+ predictable first-request latency** (the asset is in cache before
62
+ the first request arrives) and the `recheck_seconds` mtime poll is
63
+ skipped on immutable entries. Sustained-load throughput on the
64
+ static-1-KB bench did *not* move (cold 1,929 r/s vs warm 1,886 r/s,
65
+ inside trial noise) because `ResponseWriter` already auto-caches
66
+ Rack::Files responses on the first hit; preload moves that one
67
+ `cache_file` call from request 1 to boot.
68
+ - **2.10-F — C-ext fast-path response writer for prebuilt responses.**
69
+ `Server.handle_static`-routed requests now serve from a single
70
+ C function (`rb_pc_serve_request` in `ext/hyperion_http/page_cache.c`)
71
+ that does route lookup → header build → `write()` syscall without
72
+ re-entering Ruby on the response side. GVL is released across the
73
+ `write()` so slow clients no longer block other Ruby work on the
74
+ same VM. Automatic HEAD support (HTTP-mandated) lights up on every
75
+ GET registered via `handle_static` — same buffer, body stripped.
76
+ Bench (3-trial median, `wrk -t4 -c100 -d20s`): **5,768 r/s vs
77
+ 2.10-D's 5,619 r/s (+2.6% — inside noise) and p99 1.93 → 1.67 ms
78
+ (−14% — outside noise, reproducible).** The throughput needle didn't
79
+ move because the per-connection lifecycle (accept4 + clone3 + futex
80
+ on GVL handoff) dominates at 100 concurrent connections; 2.10-F
81
+ shrinks the response phase, but the response phase isn't the
82
+ bottleneck on this profile. Durable infrastructure for 2.11+ when
83
+ the accept-loop work closes.
84
+
85
+ Full per-stream details and bench tables in
86
+ [`CHANGELOG.md`](CHANGELOG.md).
87
+
88
+ ## What's new in 2.10.0
89
+
90
+ **4-way bench harness, page cache, direct routes, and the h2 40 ms
91
+ ceiling killed.** This sprint widens the comparison matrix to all four
92
+ major Ruby web servers (Hyperion + Puma + Falcon + Agoo) and ships
93
+ four substantive perf streams against that backdrop:
94
+
95
+ - **2.10-A / 2.10-B — 4-way bench harness + honest baseline.**
96
+ `bench/4way_compare.sh` runs the same 6 workloads (hello, static
97
+ 1 KB / 1 MiB, CPU JSON, PG-bound, SSE) against all four servers from
98
+ one script. Baseline numbers committed *before* any code changes:
99
+ Agoo wins the static-asset and JSON columns by ~2-4×, Hyperion wins
100
+ the static 1 MiB column by 9× and the SSE column by 3.6-17×.
101
+ - **2.10-C — `Hyperion::Http::PageCache` (pre-built static response
102
+ cache).** Open-addressed bucket table behind a pthread mutex
103
+ (GVL-released for writes), engages automatically on `Rack::Files`
104
+ responses. **Static 1 KB: 1,380 → 1,880 r/s (+36%), p99 3.7 → 2.7
105
+ ms.** Closes the Agoo gap from −47% to −28% on that column.
106
+ - **2.10-D — `Hyperion::Server.handle` direct route registration.**
107
+ New API for hot Rack-bypass paths (`Server.handle '/health' do …
108
+ end`, `Server.handle_static '/robots.txt', body: '...'`). Skips Rack
109
+ adapter + env-build for matched routes. **`hello` via
110
+ `handle_static`: 4,408 → 5,619 r/s (+27%), p99 1.93 ms** — the
111
+ cleanest p99 in the 4-way matrix.
112
+ - **2.10-G — h2 max-latency ceiling at ~40 ms: fixed.** Filed by 2.9-B
113
+ as a "first-stream cost" hypothesis, the instrumentation revealed
114
+ it was paid by *every* h2 stream — the canonical Linux delayed-ACK
115
+ + Nagle interaction on small framer writes. One-line fix:
116
+ TCP_NODELAY at accept time. **h2load `-c 1 -m 1 -n 200`: min
117
+ 40.62 → 0.54 ms (−98.7%), throughput 24 → 1,142 r/s (+47.6×).** The
118
+ `HYPERION_H2_TIMING=1` instrumentation stays in place as durable
119
+ diagnostic infrastructure.
120
+
121
+ Full per-stream details, bench numbers, and follow-up items live in
122
+ [`CHANGELOG.md`](CHANGELOG.md).
123
+
124
+ ## What's new in 2.5.0
125
+
126
+ **Native HPACK ON by default + autobahn 100% conformance + request
127
+ hooks.** The Rust HPACK encoder (added in 2.0.0, opt-in until 2.4.x)
128
+ flips ON by default in 2.5.0 — verified **+18% rps on Rails-shape h2
129
+ workloads** (25-header responses, the bench harness lives at
130
+ `bench/h2_rails_shape.ru` + `bench/h2_rails_shape.sh`). RFC 6455
131
+ WebSocket conformance hit **463/463 autobahn-testsuite cases passing**
132
+ (2.5-A, host openclaw-vm). Request lifecycle hooks
133
+ (`Runtime#on_request_start` / `on_request_end`) shipped in 2.5-C —
134
+ recipes in [`docs/OBSERVABILITY.md`](docs/OBSERVABILITY.md).
135
+
136
+ ## What's new in 2.4.0
137
+
138
+ **Production observability.** The `/-/metrics` endpoint now exposes
139
+ per-route latency histograms, per-conn fairness rejections, WebSocket
140
+ permessage-deflate compression ratio, kTLS active connections,
141
+ io_uring-active workers, and ThreadPool queue depth — operators can
142
+ finally see whether the 2.x knobs are firing and how effective they
143
+ are. A pre-built Grafana dashboard ships at
144
+ [`docs/grafana/hyperion-2.4-dashboard.json`](docs/grafana/hyperion-2.4-dashboard.json).
145
+ Full metric reference + operator playbook in
146
+ [`docs/OBSERVABILITY.md`](docs/OBSERVABILITY.md).
147
+
148
+ ## What's new in 2.1.0
149
+
150
+ **WebSockets.** RFC 6455 over Rack 3 full hijack, native frame codec,
151
+ per-connection wrapper with auto-pong / close handshake / UTF-8 validation /
152
+ per-message size cap. **ActionCable on Hyperion is now a single-binary
153
+ deployment** — one `hyperion -w 4 -t 10 config.ru` process serves HTTP,
154
+ HTTP/2, TLS, **and** `/cable` from the same listener; no separate cable
155
+ container required. HTTP/1.1 only this release; WS-over-HTTP/2 (RFC 8441
156
+ Extended CONNECT) and permessage-deflate (RFC 7692) defer to 2.2.x.
157
+ See [`docs/WEBSOCKETS.md`](docs/WEBSOCKETS.md).
158
+
14
159
  ## Highlights
15
160
 
16
161
  - **HTTP/1.1 + HTTP/2 + TLS** out of the box (HTTP/2 with per-stream fiber multiplexing, WINDOW_UPDATE-aware flow control, ALPN auto-negotiation).
162
+ - **WebSockets (RFC 6455)** — full handshake, native frame codec, per-connection wrapper. ActionCable + faye-websocket work on a single-binary deploy. See [`docs/WEBSOCKETS.md`](docs/WEBSOCKETS.md). (2.1.0+, HTTP/1.1 only.)
17
163
  - **Pre-fork cluster** with per-OS worker model: `SO_REUSEPORT` on Linux, master-bind + worker-fd-share on macOS/BSD (Darwin's `SO_REUSEPORT` doesn't load-balance).
18
164
  - **Hybrid concurrency**: fiber-per-connection for I/O, OS-thread pool for `app.call(env)` — synchronous Rack handlers (Rails, ActiveRecord, anything holding a global mutex) get true OS-thread concurrency.
19
165
  - **Vendored llhttp 9.3.0** C parser; pure-Ruby fallback for non-MRI runtimes.
@@ -25,13 +171,43 @@ bundle exec hyperion config.ru
25
171
 
26
172
  ## Benchmarks
27
173
 
28
- All numbers are real wrk runs against published Hyperion configs. Hyperion ships **with default-ON structured access logs**; Puma comparisons use Puma defaults (no per-request log emission). Each section is stamped with the Hyperion version it was measured against — newer versions (1.3.0+ `--async-io`, 1.4.0+ TLS h1 inline, 1.4.1+ Metrics fiber-key fix, 1.6.0+ HTTP/2 writer fiber + 3 C-ext additions) preserve or improve these numbers; we re-run the headline configs each release and have not seen regressions on these workloads.
29
-
30
- > **Comprehensive matrix for 1.6.0 + hyperion-async-pg 0.5.0 (16-vCPU Linux, 9 workloads × 25+ configs)**: see [`docs/BENCH_2026_04_27.md`](docs/BENCH_2026_04_27.md). Headline: 98,818 r/s on hello `-w 16`, 21,215 r/s `-w 4` at p99 < 2 ms, 2,180 r/s on a 50 ms-waiting PG workload (4.1× the best Puma), 1,667 req/s HTTP/2 multiplexed at 0 errors, 155 MB RSS for 10k idle keep-alive connections.
174
+ All numbers are real wrk runs against published Hyperion configs. Hyperion ships **with default-ON structured access logs**; Puma comparisons use Puma defaults (no per-request log emission). Each section is stamped with the Hyperion version and bench host it was measured on — bench-host drift over time is real (see "Bench-host drift" note below).
175
+
176
+ **Headline doc**: the most recent comprehensive sweep is
177
+ [`docs/BENCH_HYPERION_2_0.md`](docs/BENCH_HYPERION_2_0.md) (Hyperion
178
+ 2.0.0 vs Puma 8.0.1, 16-vCPU Ubuntu 24.04, 12 workloads). The 1.6.0
179
+ matrix at [`docs/BENCH_2026_04_27.md`](docs/BENCH_2026_04_27.md) covers
180
+ 9 workloads × 25+ configs against hyperion-async-pg 0.5.0; both docs
181
+ include caveats and per-row reproduction commands.
182
+
183
+ > **Bench-host drift note (2026-05-01).** A spot-check rerun on
184
+ > `openclaw-vm` 5 days after the 2.0.0 sweep showed Puma 8.0.1 and
185
+ > Hyperion 2.0.0 baseline numbers had drifted 14-32% downward from the
186
+ > 2026-04-29 sweep with no code changes — the bench host runs other
187
+ > workloads in the background and is a single VM (KVM CPU). Numbers in
188
+ > this README and BENCH docs are snapshots; expect ±10-30% absolute
189
+ > drift between sweep dates. **The relative position (Hyperion vs Puma
190
+ > at matched config) is the durable signal**; e.g. Hyperion `-w 16 -t 5`
191
+ > hello-world today is 76,593 r/s vs Puma 8.0.1 `-w 16 -t 5:5` at 55,609
192
+ > r/s, **+37.7% over Puma** — wider than the 2.0.0 sweep's +27.8% even
193
+ > though absolute rps is lower. Reproduce: `bundle exec bin/hyperion
194
+ > -p 9501 -w 16 -t 5 bench/hello.ru` then `wrk -t4 -c200 -d20s
195
+ > http://127.0.0.1:9501/`.
196
+
197
+ > **Topology relevance.** Hyperion is built to run **fronted by nginx
198
+ > or an L7 load balancer** in most production deployments — plaintext
199
+ > HTTP/1.1 upstream, TLS terminated at the LB. The benches in this
200
+ > README that match that topology are: hello-world, CPU JSON, static,
201
+ > SSE, PG, WebSocket. Benches that are **bench-only for nginx-fronted
202
+ > ops** (the LB → upstream hop is plaintext h1 regardless): TLS h1,
203
+ > HTTP/2, kTLS_TX. Those rows still ship for operators who terminate
204
+ > TLS / h2 at Hyperion directly (small static fleets, edge boxes), but
205
+ > don't chase the +60% TLS-h1 win unless you actually terminate TLS at
206
+ > Hyperion.
31
207
 
32
208
  ### Hello-world Rack app
33
209
 
34
- `bench/hello.ru`, single worker, parity threads (`-t 5` vs Puma `-t 5:5`), 4 wrk threads / 100 connections / 15s, macOS arm64 / Ruby 3.3.3, Hyperion 1.2.0:
210
+ `bench/hello.ru`, single worker, parity threads (`-t 5` vs Puma `-t 5:5`), 4 wrk threads / 100 connections / 15s, macOS arm64 / Ruby 3.3.3, Hyperion 1.2.0. **macOS dev numbers; the headline Linux 2.0.0 bench is in [`docs/BENCH_HYPERION_2_0.md`](docs/BENCH_HYPERION_2_0.md)**:
35
211
 
36
212
  | | r/s | p99 | tail vs Hyperion |
37
213
  |---|---:|---:|---:|
@@ -69,7 +245,7 @@ Bench is **wait-bound** — ~3-4 ms median is the PG + Redis round-trip, dwarfin
69
245
 
70
246
  Ubuntu 24.04 / 16 vCPU / Ruby 3.3.3, Postgres 17 over WAN, `wrk -t4 -c200 -d20s`. Single worker (`-w 1`) unless noted. All configs returned 0 non-2xx and 0 timeouts. RSS sampled mid-run via `ps -o rss`.
71
247
 
72
- **Wait-bound workload** (`bench/pg_concurrent.ru`: `SELECT pg_sleep(0.05)` + tiny JSON):
248
+ **Wait-bound workload** (`pg_concurrent.ru`: `SELECT pg_sleep(0.05)` + tiny JSON; rackup lives in the [hyperion-async-pg companion repo](https://github.com/andrew-woblavobla/hyperion-async-pg) and on the bench host at `~/bench/pg_concurrent.ru`, not in this repo):
73
249
 
74
250
  | | r/s | p99 | RSS | vs Puma `-t 5` |
75
251
  |---|---:|---:|---:|---:|
@@ -83,7 +259,7 @@ Ubuntu 24.04 / 16 vCPU / Ruby 3.3.3, Postgres 17 over WAN, `wrk -t4 -c200 -d20s`
83
259
  | Hyperion `--async-io -w 4 -t 5` pool=64 | 1937.5 | 4.84 s | 416 MB | 34.3× (cold-start p99 — see note) |
84
260
  | Falcon 0.55.3 `--count 1` pool=128 | 1665.7 | 516 ms | 141 MB | 29.5× |
85
261
 
86
- **Mixed CPU+wait** (`bench/pg_mixed.ru`: same query + 50-key JSON serialization, ~5 ms CPU):
262
+ **Mixed CPU+wait** (`pg_mixed.ru`: same query + 50-key JSON serialization, ~5 ms CPU; rackup lives in hyperion-async-pg + on the bench host at `~/bench/pg_mixed.ru`, not in this repo):
87
263
 
88
264
  | | r/s | p99 | RSS | vs Puma `-t 30` |
89
265
  |---|---:|---:|---:|---:|
@@ -94,11 +270,12 @@ Ubuntu 24.04 / 16 vCPU / Ruby 3.3.3, Postgres 17 over WAN, `wrk -t4 -c200 -d20s`
94
270
  | Falcon `--count 1` pool=128 | 1642.1 | 531 ms | 213 MB | 4.7× |
95
271
 
96
272
  **Takeaways:**
97
- 1. **Linear scaling with pool size** under `--async-io` — `r/s ≈ pool × 12` on this WAN bench. Single-worker pool=200 hits 2381 r/s, **42× Puma `-t 5`** and **5.9× Puma's best** (`-t 30`).
273
+ 1. **Linear scaling with pool size** under `--async-io` — `r/s ≈ pool × 12` on this WAN bench. Single-worker pool=200 hits 2381 r/s. The "**42× Puma `-t 5`**" and "**5.9× Puma's best**" framings above use Puma's pool=5 (timeout-floor) and pool=30 (mid-tier) rows respectively — fair comparisons on the *same* bench fixture, but a Puma operator who sizes their pool to match (`-t 100 pool=100` row above) lands at 1,067 r/s, so the **honest "Puma at its own best vs Hyperion at its own best" ratio is 2,381 / 1,067 ≈ 2.2×**, not 42×. The architectural win — fiber-pool grows to pool=200 without OS-thread cost — is real; the 42× headline is a configuration-difference effect, not a steady-state gap on matched configs.
98
274
  2. **Mixed workload doesn't kill the win** — Hyperion `--async-io` pool=128 actually goes *up* on mixed (1740 vs 1344 r/s) because CPU work overlaps other fibers' PG-wait windows. This is the honest "what happens to a real Rails handler" answer.
99
275
  3. **Hyperion ≈ Falcon within 3-7%** across pool sizes; both fiber-native architectures extract similar value from `hyperion-async-pg`.
100
276
  4. **RSS at single-worker scale isn't the architectural moat** — Linux thread stacks are demand-paged; PG connection buffers dominate RSS at pool sizes ≤ 200. The architectural win is **handler concurrency under load**, not idle memory: Hyperion's fiber path runs thousands of in-flight handler invocations per OS thread, so wait-bound handlers don't queue at `max_threads`. See [Concurrency at scale](#concurrency-at-scale-architectural-advantages) for both the throughput-under-load row and a measured 10k-idle-keepalive RSS sweep against Puma and Falcon.
101
277
  5. **`-w 4` cold-start caveat** — multi-worker p99 inflates because the bench rackup uses lazy per-process pool init (each worker pays full pool fill on its first request). Production apps avoid this with `on_worker_boot { Hyperion::AsyncPg::FiberPool.new(...).fill }`.
278
+ 6. **Apples-to-apples PG note**: the row above uses `pg.wobla.space` WAN PG with `max_connections=500`. Earlier sweeps that compared Hyperion (WAN, max_conn=500) against Puma (local, max_conn=100) overstated the ratio because the Puma side timed out at the local pool ceiling. The 2.0.0 bench doc carries this caveat in the row 7 verification section; treat any "Hyperion 4× Puma on PG" headline as **indicative**, not precisely calibrated, until rerun against matched-pool PG.
102
279
 
103
280
  Three things must all be true to get this win:
104
281
  1. **`async_io: true`** in your Hyperion config (or `--async-io` CLI flag). Default is off to keep 1.2.0's raw-loop perf for fiber-unaware apps.
@@ -205,19 +382,51 @@ Hyperion fans 100 in-flight streams across separate fibers within a single TCP c
205
382
 
206
383
  > **1.6.0 outbound write path** — `Http2Handler` no longer serializes every framer write through one `Mutex#synchronize { socket.write(...) }`. HPACK encoding (microseconds, in-memory) still serializes on a fast encode mutex, but the actual `socket.write` is owned by a dedicated per-connection writer fiber draining a queue. On per-connection multi-stream workloads where the kernel send buffer or peer reads are slow, encode work for ready streams overlaps the writer's flush of earlier chunks, instead of stacking up behind it. See `bench/h2_streams.sh` (`h2load -c 1 -m 100 -n 5000`) for a recipe to compare 1.5.0 vs 1.6.0 on a workload of your choice.
207
384
 
208
- ### Reproduce
385
+ ### Reproducing the benchmarks
386
+
387
+ Every number in this README and `docs/BENCH_HYPERION_2_0.md` is reproducible. Operators who don't trust headline numbers (and you shouldn't trust *any* benchmark numbers without independent verification) can rerun the workloads on their own host and get their own honest measurements. Per-row reproduction commands:
209
388
 
210
389
  ```sh
211
- # hello-world
212
- bundle exec ruby bench/compare.rb
213
- HYPERION_WORKERS=4 PUMA_WORKERS=4 FALCON_COUNT=4 bundle exec ruby bench/compare.rb
390
+ # Setup (once)
391
+ bundle install
392
+ bundle exec rake compile
393
+
394
+ # Hello-world (rps + p99 ceiling, no I/O)
395
+ bundle exec bin/hyperion -p 9292 -w 16 -t 5 bench/hello.ru &
396
+ wrk -t4 -c200 -d20s --latency http://127.0.0.1:9292/
397
+
398
+ # CPU-bound JSON (per-request CPU savings visible)
399
+ bundle exec bin/hyperion -p 9292 -w 4 -t 5 bench/work.ru &
400
+ wrk -t4 -c200 -d15s --latency http://127.0.0.1:9292/
401
+
402
+ # Static 1 MiB sendfile path
403
+ ruby -e 'File.binwrite("/tmp/hyperion_bench_asset_1m.bin", "x" * (1024*1024))'
404
+ bundle exec bin/hyperion -p 9292 -w 1 -t 5 bench/static.ru &
405
+ wrk -t4 -c100 -d15s --latency http://127.0.0.1:9292/hyperion_bench_asset_1m.bin
406
+
407
+ # SSE streaming (Hyperion-shaped rackup with explicit flush sentinel — see caveat in BENCH doc)
408
+ bundle exec bin/hyperion -p 9292 -w 1 -t 5 bench/sse.ru &
409
+ wrk -t1 -c1 -d10s http://127.0.0.1:9292/
410
+
411
+ # WebSocket multi-process throughput
412
+ bundle exec bin/hyperion -p 9888 -w 4 -t 64 bench/ws_echo.ru &
413
+ ruby bench/ws_bench_client_multi.rb --port 9888 --procs 4 --conns 200 --msgs 1000 --bytes 1024 --json
414
+
415
+ # h2 native HPACK (Rails-shape, 25-header response)
416
+ ./bench/h2_rails_shape.sh
214
417
 
215
418
  # Idle keep-alive RSS sweep (1k / 5k / 10k conns, 30s hold per server)
216
419
  ./bench/keepalive_memory.sh
217
420
 
218
- # Real Rails / Grape: see bench/db.ru for the schema
421
+ # Hello-world quick comparator (Hyperion vs Puma vs Falcon)
422
+ bundle exec ruby bench/compare.rb
423
+ HYPERION_WORKERS=4 PUMA_WORKERS=4 FALCON_COUNT=4 bundle exec ruby bench/compare.rb
219
424
  ```
220
425
 
426
+ PG benches (`pg_concurrent.ru`, `pg_mixed.ru`, `pg_realistic.ru`) live in the [hyperion-async-pg companion repo](https://github.com/andrew-woblavobla/hyperion-async-pg) — they require a running Postgres + the companion gem and are not part of this repo. The 2.0.0 sweep used `~/bench/pg_concurrent.ru` on the bench host; reproduce by cloning hyperion-async-pg and following its README, or `scp` the rackup + DATABASE_URL.
427
+
428
+ When numbers from your host don't match the published numbers, the most likely explanations (in order): (1) bench-host noise — single-VM benches drift 10-30% over days; (2) Puma version mismatch — the 2.0.0 sweep used Puma 8.0.1 in the `~/bench/Gemfile`, the hyperion repo's own Gemfile pins Puma `~> 6.4`; (3) different kernel / Ruby; (4) different `-t` / `-c` (apples-to-apples requires identical worker count, thread count, wrk concurrency, payload size, kernel, Ruby, TLS cipher).
429
+
221
430
  ## Quick start
222
431
 
223
432
  ```sh
@@ -332,7 +541,7 @@ Concrete tradeoffs distilled from [`docs/BENCH_2026_04_27.md`](docs/BENCH_2026_0
332
541
  |---|---|---|
333
542
  | **Pure I/O-bound** (PG / Redis / external HTTP, no significant CPU) | `-w 1` + larger pool | Bench: `-w 1 pool=200` = 87 MB / 2,180 r/s vs `-w 4 pool=64` = 224 MB / 1,680 r/s. **2.6× more memory, 0.77× rps** if you pick multi-worker on a wait-bound workload. |
334
543
  | **Pure CPU-bound** (heavy JSON / template render / image processing) | `-w N` matching CPU count | Each worker's accept loop is single-threaded under `--async-io`; multi-worker gives CPU-parallelism. Bench: `-w 16 -t 5` hits 98,818 r/s on a 16-vCPU box, 4.7× a `-w 1` ceiling on the same hardware. |
335
- | **Mixed** (Rails-shaped: ~5 ms CPU + 50 ms PG wait per request) | `-w N/2` (half cores) + medium pool | Lets CPU work parallelise while keeping per-worker memory tractable. Bench `pg_mixed.ru` at `-w 4 -t 5 pool=128` = 1,740 r/s with no cold-start spike (ForkSafe `prefill_in_child: true`). |
544
+ | **Mixed** (Rails-shaped: ~5 ms CPU + 50 ms PG wait per request) | `-w N/2` (half cores) + medium pool | Lets CPU work parallelise while keeping per-worker memory tractable. Bench `pg_mixed.ru` (in hyperion-async-pg repo / `~/bench/`) at `-w 4 -t 5 pool=128` = 1,740 r/s with no cold-start spike (ForkSafe `prefill_in_child: true`). |
336
545
 
337
546
  Multi-worker on PG-wait workloads is the **wrong** default for most apps — the headline rps doesn't justify the memory and PG-connection cost. Verify your shape with the bench before scaling out.
338
547
 
@@ -0,0 +1,7 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 3
4
+
5
+ [[package]]
6
+ name = "hyperion_h2_codec"
7
+ version = "2.0.0"
@@ -0,0 +1,33 @@
1
+ [package]
2
+ name = "hyperion_h2_codec"
3
+ version = "2.0.0"
4
+ edition = "2021"
5
+ publish = false
6
+ description = "Native HPACK + HTTP/2 frame codec for Hyperion (Phase 6)"
7
+
8
+ [lib]
9
+ crate-type = ["cdylib"]
10
+ name = "hyperion_h2_codec"
11
+
12
+ # Phase 6 takes the simplest reliable path: a self-contained,
13
+ # zero-dependency RFC 7541 / RFC 7540 implementation in Rust, exposed
14
+ # to Ruby via a thin extern "C" surface and called from MRI through
15
+ # Fiddle. This avoids the magnus toolchain dependency tree (which
16
+ # pulls in 10+ transitive crates and sometimes locks to newer rustc
17
+ # than what `rustup show` reports), at the cost of slightly more
18
+ # verbose Ruby-side glue. The trade-off was right for 2.0: the
19
+ # alternative was deferring Phase 6 because a build-time `cargo fetch`
20
+ # could fail in air-gapped or CI-cache-cold environments.
21
+ #
22
+ # If/when we want a richer Ruby-Rust API surface, the next step is to
23
+ # add `magnus = "0.6"` here and rewrite `lib.rs::ffi` as a magnus
24
+ # `#[magnus::init]` block; the inner pure-Rust modules
25
+ # (`hpack`/`frames`) are deliberately decoupled from the FFI shape so
26
+ # that swap is local.
27
+ [dependencies]
28
+
29
+ [profile.release]
30
+ opt-level = 3
31
+ lto = true
32
+ codegen-units = 1
33
+ strip = true
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Build the hyperion_h2_codec Rust extension.
4
+ #
5
+ # Phase 6 (RFC 2.0 §3): native HPACK encode/decode + frame ser/de via
6
+ # a Rust crate. This extconf.rb is invoked by `gem install`; it shells
7
+ # out to `cargo build --release` and writes a Makefile that copies the
8
+ # resulting cdylib into `lib/hyperion_h2_codec/` so that
9
+ # `lib/hyperion/h2_codec.rb` can find it via Fiddle.
10
+ #
11
+ # Cargo is OPTIONAL. If it's missing, the extconf writes a stub
12
+ # Makefile that prints a friendly note and exits cleanly — Hyperion
13
+ # still ships and falls back to the existing Ruby HPACK path
14
+ # (`Hyperion::H2Codec.available?` returns false). Operators who want
15
+ # the perf bump install Rust via `rustup` and `gem pristine
16
+ # hyperion-rb` to rebuild.
17
+ #
18
+ # Cross-platform notes:
19
+ # * Linux + GNU libc: cargo emits `libhyperion_h2_codec.so`.
20
+ # * macOS: `libhyperion_h2_codec.dylib`.
21
+ # * The Ruby loader (Fiddle) probes both extensions in order, so we
22
+ # copy whichever cargo produced into the gem's lib_dir under both
23
+ # names where convenient.
24
+
25
+ require 'mkmf'
26
+ require 'fileutils'
27
+ require 'rbconfig'
28
+
29
+ ext_dir = __dir__
30
+ crate_dir = ext_dir
31
+ target_dir = File.join(crate_dir, 'target', 'release')
32
+ gem_lib_dir = File.expand_path('../../lib/hyperion_h2_codec', __dir__)
33
+
34
+ cargo_present = system('cargo --version > /dev/null 2>&1')
35
+
36
+ if cargo_present
37
+ warn '[hyperion_h2_codec] cargo detected — building native HPACK extension'
38
+ Dir.chdir(crate_dir) do
39
+ ok = system('cargo build --release')
40
+ unless ok
41
+ warn '[hyperion_h2_codec] cargo build failed; falling back to pure-Ruby HPACK path'
42
+ cargo_present = false
43
+ end
44
+ end
45
+ end
46
+
47
+ FileUtils.mkdir_p(gem_lib_dir)
48
+
49
+ if cargo_present
50
+ candidates = %w[libhyperion_h2_codec.dylib libhyperion_h2_codec.so]
51
+ found = candidates.find { |c| File.exist?(File.join(target_dir, c)) }
52
+ if found
53
+ src = File.join(target_dir, found)
54
+ dst = File.join(gem_lib_dir, found)
55
+ FileUtils.cp(src, dst)
56
+ warn "[hyperion_h2_codec] installed #{dst}"
57
+ else
58
+ warn '[hyperion_h2_codec] cargo finished but no cdylib artifact found; falling back'
59
+ cargo_present = false
60
+ end
61
+ end
62
+
63
+ # Always emit a Makefile — gem install protocol expects one. The body
64
+ # is a no-op when cargo isn't present so `make` exits 0 and gem
65
+ # install completes.
66
+ File.open(File.join(ext_dir, 'Makefile'), 'w') do |f|
67
+ f.puts 'all:'
68
+ f.puts "\t@echo \"[hyperion_h2_codec] no-op make (cargo handled the build)\""
69
+ f.puts 'clean:'
70
+ f.puts "\t@rm -rf target"
71
+ f.puts 'install:'
72
+ f.puts "\t@echo \"[hyperion_h2_codec] no-op install (artifact already in lib/)\""
73
+ end
@@ -0,0 +1,140 @@
1
+ //! HTTP/2 frame primitives (RFC 7540 §6).
2
+ //!
3
+ //! Phase 6a only ships the simplest frame types the writer fiber
4
+ //! needs in the response path: HEADERS, DATA, RST_STREAM, WINDOW_UPDATE.
5
+ //! The h2 connection state machine continues to be driven by
6
+ //! `protocol-http2` for now — we just expose the wire-formatting
7
+ //! primitives so a future Phase 6b can replace the Ruby-side framer.
8
+
9
+ use std::fmt;
10
+
11
+ /// HPACK error type, shared between encoder/decoder. Public so the
12
+ /// FFI layer can surface a numeric code to Ruby.
13
+ #[derive(Debug)]
14
+ pub enum HpackError {
15
+ Truncated,
16
+ Overflow,
17
+ BadIndex,
18
+ ZeroIndex,
19
+ HuffmanInvalid,
20
+ }
21
+
22
+ impl fmt::Display for HpackError {
23
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
24
+ match self {
25
+ HpackError::Truncated => write!(f, "truncated input"),
26
+ HpackError::Overflow => write!(f, "integer overflow"),
27
+ HpackError::BadIndex => write!(f, "invalid HPACK index"),
28
+ HpackError::ZeroIndex => write!(f, "HPACK index 0 is reserved"),
29
+ HpackError::HuffmanInvalid => write!(f, "invalid Huffman sequence"),
30
+ }
31
+ }
32
+ }
33
+
34
+ // Frame type constants. RFC 7540 §11.2 / RFC 9113 §6.
35
+ // Some are unused by the Phase 6a wire path but ship now so that the
36
+ // FFI surface, when expanded in Phase 6b, doesn't churn ABI numbers.
37
+ #[allow(dead_code)]
38
+ pub const FRAME_DATA: u8 = 0x0;
39
+ #[allow(dead_code)]
40
+ pub const FRAME_HEADERS: u8 = 0x1;
41
+ #[allow(dead_code)]
42
+ pub const FRAME_RST_STREAM: u8 = 0x3;
43
+ #[allow(dead_code)]
44
+ pub const FRAME_SETTINGS: u8 = 0x4;
45
+ #[allow(dead_code)]
46
+ pub const FRAME_PING: u8 = 0x6;
47
+ #[allow(dead_code)]
48
+ pub const FRAME_GOAWAY: u8 = 0x7;
49
+ #[allow(dead_code)]
50
+ pub const FRAME_WINDOW_UPDATE: u8 = 0x8;
51
+ #[allow(dead_code)]
52
+ pub const FRAME_CONTINUATION: u8 = 0x9;
53
+
54
+ #[allow(dead_code)]
55
+ pub const FLAG_END_STREAM: u8 = 0x1;
56
+ #[allow(dead_code)]
57
+ pub const FLAG_END_HEADERS: u8 = 0x4;
58
+
59
+ /// 9-byte frame header + payload writer (RFC 7540 §4.1).
60
+ fn write_frame_header(out: &mut Vec<u8>, len: u32, kind: u8, flags: u8, stream_id: u32) {
61
+ out.push(((len >> 16) & 0xff) as u8);
62
+ out.push(((len >> 8) & 0xff) as u8);
63
+ out.push((len & 0xff) as u8);
64
+ out.push(kind);
65
+ out.push(flags);
66
+ let sid = stream_id & 0x7fff_ffff; // R-bit cleared per spec
67
+ out.push(((sid >> 24) & 0xff) as u8);
68
+ out.push(((sid >> 16) & 0xff) as u8);
69
+ out.push(((sid >> 8) & 0xff) as u8);
70
+ out.push((sid & 0xff) as u8);
71
+ }
72
+
73
+ pub fn encode_data_frame(stream_id: u32, end_stream: bool, payload: &[u8]) -> Vec<u8> {
74
+ let mut out = Vec::with_capacity(9 + payload.len());
75
+ let flags = if end_stream { FLAG_END_STREAM } else { 0 };
76
+ write_frame_header(&mut out, payload.len() as u32, FRAME_DATA, flags, stream_id);
77
+ out.extend_from_slice(payload);
78
+ out
79
+ }
80
+
81
+ #[allow(dead_code)]
82
+ pub fn encode_headers_frame(
83
+ stream_id: u32,
84
+ end_stream: bool,
85
+ end_headers: bool,
86
+ block: &[u8],
87
+ ) -> Vec<u8> {
88
+ let mut out = Vec::with_capacity(9 + block.len());
89
+ let mut flags = 0u8;
90
+ if end_stream {
91
+ flags |= FLAG_END_STREAM;
92
+ }
93
+ if end_headers {
94
+ flags |= FLAG_END_HEADERS;
95
+ }
96
+ write_frame_header(&mut out, block.len() as u32, FRAME_HEADERS, flags, stream_id);
97
+ out.extend_from_slice(block);
98
+ out
99
+ }
100
+
101
+ #[allow(dead_code)]
102
+ pub fn encode_rst_stream(stream_id: u32, error_code: u32) -> Vec<u8> {
103
+ let mut out = Vec::with_capacity(9 + 4);
104
+ write_frame_header(&mut out, 4, FRAME_RST_STREAM, 0, stream_id);
105
+ out.extend_from_slice(&error_code.to_be_bytes());
106
+ out
107
+ }
108
+
109
+ #[allow(dead_code)]
110
+ pub fn encode_window_update(stream_id: u32, increment: u32) -> Vec<u8> {
111
+ let mut out = Vec::with_capacity(9 + 4);
112
+ let inc = increment & 0x7fff_ffff;
113
+ write_frame_header(&mut out, 4, FRAME_WINDOW_UPDATE, 0, stream_id);
114
+ out.extend_from_slice(&inc.to_be_bytes());
115
+ out
116
+ }
117
+
118
+ #[cfg(test)]
119
+ mod tests {
120
+ use super::*;
121
+
122
+ #[test]
123
+ fn data_frame_layout() {
124
+ let frame = encode_data_frame(1, true, b"hello");
125
+ // 9-byte header: 00 00 05 00 01 00 00 00 01
126
+ assert_eq!(frame[0..3], [0, 0, 5]);
127
+ assert_eq!(frame[3], FRAME_DATA);
128
+ assert_eq!(frame[4], FLAG_END_STREAM);
129
+ assert_eq!(u32::from_be_bytes([frame[5], frame[6], frame[7], frame[8]]), 1);
130
+ assert_eq!(&frame[9..], b"hello");
131
+ }
132
+
133
+ #[test]
134
+ fn rst_stream_layout() {
135
+ let frame = encode_rst_stream(3, 0xa);
136
+ assert_eq!(frame[3], FRAME_RST_STREAM);
137
+ assert_eq!(u32::from_be_bytes([frame[5], frame[6], frame[7], frame[8]]), 3);
138
+ assert_eq!(u32::from_be_bytes([frame[9], frame[10], frame[11], frame[12]]), 0xa);
139
+ }
140
+ }