hyperion-rb 2.11.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,132 @@
1
+ /* ----------------------------------------------------------------------
2
+ * page_cache_internal.h — internal C-ext sharing surface.
3
+ *
4
+ * 2.12-D — exposes the request-parsing + lookup + write helpers built by
5
+ * `page_cache.c`'s C accept loop so the io_uring sibling
6
+ * (`io_uring_loop.c`) can reuse them rather than copy-pasting. The
7
+ * helpers stay `static` inside `page_cache.c` and the symbols below are
8
+ * thin extern wrappers — one indirection per call, but the io_uring
9
+ * loop calls them at most once per request, so the cost is negligible
10
+ * (single-direct-call jump) compared to the syscall savings the loop
11
+ * delivers.
12
+ *
13
+ * NOT public surface. NOT installed in any include path. The header
14
+ * lives next to the .c files and is included only by the in-tree C
15
+ * sources.
16
+ * ---------------------------------------------------------------------- */
17
+ #ifndef HYP_PAGE_CACHE_INTERNAL_H
18
+ #define HYP_PAGE_CACHE_INTERNAL_H
19
+
20
+ #include <stddef.h>
21
+ #include <sys/types.h>
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ /* Method classification (mirrors `hyp_pc_method_t` in page_cache.c). The
28
+ * io_uring loop uses this via `pc_internal_classify_method` to decide
29
+ * how much of the cached response to write (HEAD = headers only, GET =
30
+ * full response). */
31
+ typedef enum {
32
+ PC_INTERNAL_METHOD_GET = 0,
33
+ PC_INTERNAL_METHOD_HEAD = 1,
34
+ PC_INTERNAL_METHOD_OTHER = 2
35
+ } pc_internal_method_t;
36
+
37
+ /* End-of-headers scanner. Returns the byte offset PAST the trailing
38
+ * CRLFCRLF, or -1 if not found. */
39
+ long pc_internal_find_eoh(const char *buf, size_t len);
40
+
41
+ /* Request-line parser. On success fills *m_off, *m_len, *p_off, *p_len
42
+ * with offsets/lengths of METHOD and PATH inside `buf`, and returns the
43
+ * length of the request line including the trailing CRLF. Returns -1
44
+ * on malformed input or non-HTTP/1.1 versions (HTTP/1.0 differs in
45
+ * keep-alive defaults; the caller must hand it off to Ruby). */
46
+ long pc_internal_parse_request_line(const char *buf, size_t len,
47
+ size_t *m_off, size_t *m_len,
48
+ size_t *p_off, size_t *p_len);
49
+
50
+ /* Header-block scanner. `start` and `end` bracket the headers section
51
+ * (between request-line end and the closing CRLFCRLF). Reports:
52
+ * *connection_close — Connection: close seen
53
+ * *has_body — non-zero Content-Length OR Transfer-Encoding
54
+ * *upgrade_seen — Upgrade or HTTP2-Settings seen
55
+ * Returns 0 on success, -1 on malformed framing. */
56
+ int pc_internal_scan_headers(const char *buf, size_t start, size_t end,
57
+ int *connection_close, int *has_body,
58
+ int *upgrade_seen);
59
+
60
+ /* Method classifier. Returns GET / HEAD / OTHER. */
61
+ pc_internal_method_t pc_internal_classify_method(const char *m, size_t len);
62
+
63
+ /* Snapshot the response bytes for `(path, kind)` into a freshly malloc'd
64
+ * buffer. On hit: returns the malloc'd buffer (caller must `free()` it)
65
+ * and writes the byte length into *out_len. On miss: returns NULL and
66
+ * sets *out_len = 0. The buffer is whatever the page cache's lookup
67
+ * picks given the recheck/staleness rules; the io_uring loop writes it
68
+ * verbatim. Takes the C-side cache lock briefly; releases it before
69
+ * returning. Returns NULL on OOM as well — the caller treats both as
70
+ * "couldn't serve from C, hand off to Ruby". */
71
+ char *pc_internal_snapshot_response(const char *path, size_t path_len,
72
+ pc_internal_method_t kind,
73
+ size_t *out_len);
74
+
75
+ /* Apply TCP_NODELAY to an accepted fd (best-effort; failures swallowed). */
76
+ void pc_internal_apply_tcp_nodelay(int fd);
77
+
78
+ /* Lifecycle hook fire wrapper. The io_uring loop calls this AFTER the
79
+ * write completion arrives so observers see a finished request. The
80
+ * C-side gate (`lifecycle_active`) is checked inside; the wrapper is
81
+ * a no-op when no callback is registered or the gate is off. Must be
82
+ * called under the GVL. */
83
+ void pc_internal_fire_lifecycle(const char *method, size_t mlen,
84
+ const char *path, size_t plen);
85
+
86
+ /* Whether the lifecycle gate is currently on. The io_uring loop reads
87
+ * this BEFORE re-acquiring the GVL — when it's off, the loop skips
88
+ * the rb_thread_call_with_gvl round-trip entirely. */
89
+ int pc_internal_lifecycle_active(void);
90
+
91
+ /* Handoff wrapper — invokes the registered Ruby callback with
92
+ * (fd, partial_buffer_or_nil). Must be called under the GVL. Closes
93
+ * the fd locally if no callback is registered or if the callback
94
+ * raised. */
95
+ void pc_internal_handoff(int client_fd, const char *partial, size_t partial_len);
96
+
97
+ /* Read the stop flag flipped by `PageCache.stop_accept_loop`. Both the
98
+ * 2.12-C accept4 loop AND the 2.12-D io_uring loop honour it as a
99
+ * graceful-shutdown signal. */
100
+ int pc_internal_stop_requested(void);
101
+
102
+ /* Reset the stop flag to 0. Called by the loop entry points
103
+ * (`run_static_accept_loop`, `run_static_io_uring_loop`) so a previous
104
+ * invocation's `stop_accept_loop` doesn't immediately tear down a
105
+ * fresh loop. Specs hammer this path between examples — the 2.12-C
106
+ * loop resets inline; the io_uring sibling needs the same surface. */
107
+ void pc_internal_reset_stop(void);
108
+
109
+ /* 2.12-E — bump the per-process served-request counter (atomic; safe
110
+ * to call from any thread / fiber / accept-loop context). Both the
111
+ * 2.12-C accept4 loop and the 2.12-D io_uring loop call this after
112
+ * a successful response write so the SO_REUSEPORT distribution audit
113
+ * (`PageCache.c_loop_requests_total`) sees ticks regardless of which
114
+ * loop variant is active. */
115
+ void pc_internal_tick_request(void);
116
+
117
+ /* 2.12-E — reset the per-process served-request counter. Mirrors the
118
+ * stop-flag reset rationale: loop entry points call this so a prior
119
+ * invocation's count doesn't bleed into the new loop's snapshot. */
120
+ void pc_internal_reset_requests_served(void);
121
+
122
+ /* The 64 KiB header-cap shared with `page_cache.c`. Re-declared here
123
+ * so io_uring_loop.c doesn't need to mirror the magic number. */
124
+ #ifndef PC_INTERNAL_MAX_HEADER_BYTES
125
+ #define PC_INTERNAL_MAX_HEADER_BYTES 65536
126
+ #endif
127
+
128
+ #ifdef __cplusplus
129
+ }
130
+ #endif
131
+
132
+ #endif /* HYP_PAGE_CACHE_INTERNAL_H */
@@ -442,6 +442,327 @@ static VALUE cparser_parse(VALUE self, VALUE buffer) {
442
442
  return rb_ary_new_from_args(2, request, ULONG2NUM((unsigned long)consumed));
443
443
  }
444
444
 
445
+ /* 2.13-B — pre-baked status-line table for the most common HTTP status codes.
446
+ * The full "HTTP/1.1 NNN <reason>\r\n" line is a constant for any (status,
447
+ * reason) pair the server emits on the hot path, so we sidestep the
448
+ * per-request `snprintf("HTTP/1.1 %d ", status)` + reason-cat by switching
449
+ * on `status` and emitting a single literal-bytes cat. A non-cached status
450
+ * (or a non-default reason — operator override) still falls through to the
451
+ * generic snprintf path below. The table covers every code in
452
+ * `Hyperion::ResponseWriter::REASONS`. */
453
+ struct status_line {
454
+ int status;
455
+ const char *bytes;
456
+ long len; /* strlen of bytes (filled at extension load) */
457
+ };
458
+
459
+ #define STATUS_LINE(code, reason) { (code), "HTTP/1.1 " #code " " reason "\r\n", 0 }
460
+ static struct status_line k_status_lines[] = {
461
+ STATUS_LINE(200, "OK"),
462
+ STATUS_LINE(201, "Created"),
463
+ STATUS_LINE(204, "No Content"),
464
+ STATUS_LINE(301, "Moved Permanently"),
465
+ STATUS_LINE(302, "Found"),
466
+ STATUS_LINE(304, "Not Modified"),
467
+ STATUS_LINE(400, "Bad Request"),
468
+ STATUS_LINE(401, "Unauthorized"),
469
+ STATUS_LINE(403, "Forbidden"),
470
+ STATUS_LINE(404, "Not Found"),
471
+ STATUS_LINE(405, "Method Not Allowed"),
472
+ STATUS_LINE(408, "Request Timeout"),
473
+ STATUS_LINE(409, "Conflict"),
474
+ STATUS_LINE(410, "Gone"),
475
+ STATUS_LINE(413, "Payload Too Large"),
476
+ STATUS_LINE(414, "URI Too Long"),
477
+ STATUS_LINE(422, "Unprocessable Entity"),
478
+ STATUS_LINE(429, "Too Many Requests"),
479
+ STATUS_LINE(500, "Internal Server Error"),
480
+ STATUS_LINE(501, "Not Implemented"),
481
+ STATUS_LINE(502, "Bad Gateway"),
482
+ STATUS_LINE(503, "Service Unavailable"),
483
+ STATUS_LINE(504, "Gateway Timeout"),
484
+ { 0, NULL, 0 }
485
+ };
486
+ #undef STATUS_LINE
487
+
488
+ /* Lookup a pre-baked status line by (status, reason). Returns NULL if
489
+ * the status isn't in the table OR the operator passed a custom reason
490
+ * phrase that doesn't match the table's default — in either case the
491
+ * caller falls through to the generic snprintf path. The reason match
492
+ * uses memcmp (NOT case-insensitive) — apps overriding to a different
493
+ * casing get the safe fallback rather than a wire-string mismatch. */
494
+ static const struct status_line *lookup_status_line(int status,
495
+ const char *reason_ptr,
496
+ long reason_len) {
497
+ for (struct status_line *e = k_status_lines; e->bytes != NULL; e++) {
498
+ if (e->status != status) continue;
499
+ /* Format of e->bytes: "HTTP/1.1 NNN <reason>\r\n". The reason
500
+ * starts at offset 13 (9 bytes "HTTP/1.1 " + 3 bytes status + 1
501
+ * byte space) and has length e->len - 13 - 2 (strip trailing CRLF). */
502
+ long table_reason_len = e->len - 13 - 2;
503
+ if (table_reason_len != reason_len) return NULL;
504
+ if (memcmp(e->bytes + 13, reason_ptr, reason_len) != 0) return NULL;
505
+ return e;
506
+ }
507
+ return NULL;
508
+ }
509
+
510
+ /* 2.13-B — hand-rolled positive-integer-to-decimal-ASCII writer. snprintf is
511
+ * 1 % of CPU on the CPU-JSON workload (per perf -F 199 -g sampling);
512
+ * `body_size` is always non-negative (bytesize of a buffered body) so the
513
+ * sign branch + locale logic in vfprintf are pure overhead. Writes the
514
+ * digits backwards into a 24-byte scratch then returns the start offset
515
+ * (length = out_size - offset) so the caller can rb_str_cat without reordering. */
516
+ static int itoa_positive_decimal(long n, char *out, int out_size) {
517
+ /* out_size is the buffer; we fill from the right edge. */
518
+ int i = out_size;
519
+ if (n == 0) {
520
+ out[--i] = '0';
521
+ return i;
522
+ }
523
+ while (n > 0 && i > 0) {
524
+ out[--i] = (char)('0' + (n % 10));
525
+ n /= 10;
526
+ }
527
+ return i;
528
+ }
529
+
530
+ /* 2.13-B — per-key downcase result cache. Operators overwhelmingly call
531
+ * `build_response_head` with a fixed set of frozen-literal header keys
532
+ * (`'content-type'`, `'cache-control'`, etc.) — same String VALUE every
533
+ * request. Re-running `String#downcase` per call allocates a fresh
534
+ * lowercase String + crosses the FFI boundary; for `n_headers=4` that's
535
+ * 4 allocs + 4 method dispatches per response. The cache keys on the
536
+ * input String's object_id and stores the lowercase VALUE, the
537
+ * pre-built `lc + ": "` prefix line, and the cached length. Cap at 64
538
+ * entries so a misbehaving app emitting a unique `x-trace-<uuid>` key
539
+ * per request can't grow the cache without bound — it just falls
540
+ * through to the slow path on overflow.
541
+ *
542
+ * Pinning: each cached VALUE is anchored in a Ruby Array (`rb_aHeaderKeyAnchor`)
543
+ * registered as a global. The cache itself is an `st_table` keyed by
544
+ * VALUE bits (the input String's id; NOTE(review): frozenness is not verified
545
+ * here — a mutable key mutated after caching serves a stale prefix). */
546
+ #define HEADER_KEY_CACHE_MAX 64
547
+ typedef struct {
548
+ VALUE key; /* original frozen input String */
549
+ VALUE lc; /* lowercase form (may be == key when already lowercase) */
550
+ VALUE prefix; /* "<lc>: " — pre-built byte buffer ready to cat */
551
+ long lc_len;
552
+ } header_key_cache_entry_t;
553
+
554
+ static st_table *g_header_key_cache = NULL;
555
+ static VALUE rb_aHeaderKeyAnchor; /* keeps cached VALUEs alive */
556
+
557
+ /* 2.13-B — full header-line cache. When BOTH the key AND the value of a
558
+ * header are frozen-literal Strings (the overwhelmingly common case for
559
+ * fixed Rack apps: `'cache-control' => 'no-store'`,
560
+ * `'content-type' => 'application/json'`), the entire wire line
561
+ * `"<lc-key>: <value>\r\n"` is identical every request. Cache it keyed
562
+ * on `(key.object_id, value.object_id)`; on hit the entire emit is one
563
+ * `rb_str_cat`. A larger 256-entry cap, with the same anchor-Array pinning as the
564
+ * key cache. The `value` slot pins the original value VALUE so the
565
+ * frozen literal isn't reclaimed. */
566
+ #define HEADER_LINE_CACHE_MAX 256
567
+ typedef struct {
568
+ /* Two-word key: input key VALUE bits + value VALUE bits. */
569
+ VALUE key_v;
570
+ VALUE val_v;
571
+ VALUE line; /* "<lc-key>: <value>\r\n" buffer */
572
+ long line_len;
573
+ int is_date; /* 1 if lc-key == "date" — caller skips the date tail */
574
+ } header_line_cache_entry_t;
575
+
576
+ static st_table *g_header_line_cache = NULL;
577
+ static VALUE rb_aHeaderLineAnchor;
578
+
579
+ static st_index_t header_line_cache_hash(st_data_t a) {
580
+ /* Combine the two VALUEs via a simple xor+mul mix. The VALUEs are
581
+ * pointers to frozen Strings — the low 3 bits are alignment so we
582
+ * shift before mixing to avoid trivial collisions. */
583
+ const header_line_cache_entry_t *e = (const header_line_cache_entry_t *)a;
584
+ st_data_t x = ((st_data_t)e->key_v >> 3) * 0x9E3779B97F4A7C15ULL;
585
+ st_data_t y = ((st_data_t)e->val_v >> 3) * 0xBF58476D1CE4E5B9ULL;
586
+ return (st_index_t)(x ^ y);
587
+ }
588
+ static int header_line_cache_cmp(st_data_t a, st_data_t b) {
589
+ const header_line_cache_entry_t *ea = (const header_line_cache_entry_t *)a;
590
+ const header_line_cache_entry_t *eb = (const header_line_cache_entry_t *)b;
591
+ /* st returns 0 on match (same as memcmp). */
592
+ return !(ea->key_v == eb->key_v && ea->val_v == eb->val_v);
593
+ }
594
+ static const struct st_hash_type header_line_cache_type = {
595
+ header_line_cache_cmp,
596
+ header_line_cache_hash
597
+ };
598
+
599
+ /* Reuse the same cap-and-anchor strategy from the key cache. Look up by
600
+ * a stack-allocated probe entry; on miss + room, allocate a new entry
601
+ * and st_insert. */
602
+ static const header_line_cache_entry_t *header_line_cache_lookup(VALUE key, VALUE val) {
603
+ if (g_header_line_cache == NULL) return NULL;
604
+ header_line_cache_entry_t probe = { key, val, Qnil, 0, 0 };
605
+ st_data_t found_data;
606
+ if (st_lookup(g_header_line_cache, (st_data_t)&probe, &found_data)) {
607
+ return (const header_line_cache_entry_t *)found_data;
608
+ }
609
+ return NULL;
610
+ }
611
+
612
+ /* Lookup-or-build for the per-key downcase cache. Fast path: st hit, return
613
+ * the cached entry. Slow path: cap-bound check, freeze + lowercase the key,
614
+ * build the "<lc>: " prefix String, anchor both in rb_aHeaderKeyAnchor,
615
+ * st_insert. The anchor Array keeps the VALUEs alive across GC.
616
+ *
617
+ * Returns NULL whenever the cache is already at its 64-entry cap (no
618
+ * other condition is checked) — caller falls through to the per-call
619
+ * downcase path. */
620
+ static const header_key_cache_entry_t *header_key_cache_lookup(VALUE key_v) {
621
+ if (g_header_key_cache != NULL) {
622
+ st_data_t found_data;
623
+ if (st_lookup(g_header_key_cache, (st_data_t)key_v, &found_data)) {
624
+ return (const header_key_cache_entry_t *)found_data;
625
+ }
626
+ if (g_header_key_cache->num_entries >= HEADER_KEY_CACHE_MAX) {
627
+ return NULL; /* don't grow past cap */
628
+ }
629
+ } else {
630
+ g_header_key_cache = st_init_numtable();
631
+ }
632
+
633
+ /* Build the entry. Coerce to String, downcase, freeze, build prefix. */
634
+ VALUE k_s = rb_obj_as_string(key_v);
635
+ VALUE k_lower = rb_funcall(k_s, id_downcase, 0);
636
+ if (!OBJ_FROZEN(k_lower)) k_lower = rb_obj_freeze(k_lower);
637
+
638
+ long lc_len = RSTRING_LEN(k_lower);
639
+ VALUE prefix = rb_str_buf_new(lc_len + 2);
640
+ rb_str_cat(prefix, RSTRING_PTR(k_lower), lc_len);
641
+ rb_str_cat(prefix, ": ", 2);
642
+ rb_obj_freeze(prefix);
643
+
644
+ header_key_cache_entry_t *e = ALLOC(header_key_cache_entry_t);
645
+ e->key = key_v;
646
+ e->lc = k_lower;
647
+ e->prefix = prefix;
648
+ e->lc_len = lc_len;
649
+
650
+ /* Pin the VALUEs (key isn't ours to extend lifetime of, but lc/prefix
651
+ * are; rooting all three in the anchor Array is simplest + safest). */
652
+ rb_ary_push(rb_aHeaderKeyAnchor, key_v);
653
+ rb_ary_push(rb_aHeaderKeyAnchor, k_lower);
654
+ rb_ary_push(rb_aHeaderKeyAnchor, prefix);
655
+
656
+ st_insert(g_header_key_cache, (st_data_t)key_v, (st_data_t)e);
657
+ return e;
658
+ }
659
+
660
+ /* foreach state for the response-head builder. Threads the response buffer
661
+ * + framing flags through `rb_hash_foreach`. Errors propagate via
662
+ * `rb_raise` (longjmp-safe; the foreach unwinds and the buffer's RBasic
663
+ * pinning lets GC reclaim it). */
664
+ typedef struct {
665
+ VALUE buf;
666
+ int has_date;
667
+ } build_head_state_t;
668
+
669
+ static int build_head_each(VALUE k, VALUE v, VALUE arg) {
670
+ build_head_state_t *st = (build_head_state_t *)arg;
671
+
672
+ /* Full-line cache fast path: BOTH key AND value are frozen-literal
673
+ * Strings AND the (key, value) pair is already cached. ONE rb_str_cat
674
+ * consumes the entire prebuilt "<lc-key>: <value>\r\n" line. */
675
+ if (TYPE(k) == T_STRING && TYPE(v) == T_STRING &&
676
+ OBJ_FROZEN_RAW(k) && OBJ_FROZEN_RAW(v)) {
677
+ const header_line_cache_entry_t *line_e = header_line_cache_lookup(k, v);
678
+ if (line_e != NULL) {
679
+ rb_str_cat(st->buf, RSTRING_PTR(line_e->line), line_e->line_len);
680
+ if (line_e->is_date) st->has_date = 1;
681
+ return ST_CONTINUE;
682
+ }
683
+ }
684
+
685
+ /* Cached prefix path: lowercase form + "<lc>: " bytes already built. */
686
+ const header_key_cache_entry_t *e = header_key_cache_lookup(k);
687
+ VALUE lc;
688
+ const char *lc_ptr;
689
+ long lc_len;
690
+ VALUE prefix; /* always the cached "<lc>: " when e != NULL */
691
+ if (e != NULL) {
692
+ lc = e->lc;
693
+ lc_ptr = RSTRING_PTR(lc);
694
+ lc_len = e->lc_len;
695
+ prefix = e->prefix;
696
+ } else {
697
+ /* Cap exceeded: fall through to the per-call downcase. Still
698
+ * cheaper than the legacy path because we skip the keys-Array
699
+ * iteration overhead. */
700
+ VALUE k_s = rb_obj_as_string(k);
701
+ lc = rb_funcall(k_s, id_downcase, 0);
702
+ lc_ptr = RSTRING_PTR(lc);
703
+ lc_len = RSTRING_LEN(lc);
704
+ prefix = Qnil;
705
+ }
706
+
707
+ VALUE v_s = rb_obj_as_string(v);
708
+ const char *v_ptr = RSTRING_PTR(v_s);
709
+ long v_len = RSTRING_LEN(v_s);
710
+
711
+ /* CRLF injection guard on value. */
712
+ for (long j = 0; j < v_len; j++) {
713
+ if (v_ptr[j] == '\r' || v_ptr[j] == '\n') {
714
+ rb_raise(rb_eArgError, "header %s contains CR/LF",
715
+ RSTRING_PTR(rb_inspect(lc)));
716
+ }
717
+ }
718
+
719
+ /* Drop user-supplied content-length / connection — we always set
720
+ * these unconditionally below. */
721
+ if (lc_len == 14 && memcmp(lc_ptr, "content-length", 14) == 0) return ST_CONTINUE;
722
+ if (lc_len == 10 && memcmp(lc_ptr, "connection", 10) == 0) return ST_CONTINUE;
723
+
724
+ if (lc_len == 4 && memcmp(lc_ptr, "date", 4) == 0) st->has_date = 1;
725
+
726
+ if (prefix != Qnil) {
727
+ rb_str_cat(st->buf, RSTRING_PTR(prefix), lc_len + 2);
728
+ } else {
729
+ rb_str_cat(st->buf, lc_ptr, lc_len);
730
+ rb_str_cat(st->buf, ": ", 2);
731
+ }
732
+ rb_str_cat(st->buf, v_ptr, v_len);
733
+ rb_str_cat(st->buf, "\r\n", 2);
734
+
735
+ /* Populate the line cache for next time when both sides are frozen
736
+ * literals and we have room. */
737
+ if (g_header_line_cache != NULL &&
738
+ TYPE(k) == T_STRING && TYPE(v) == T_STRING &&
739
+ OBJ_FROZEN_RAW(k) && OBJ_FROZEN_RAW(v) &&
740
+ g_header_line_cache->num_entries < HEADER_LINE_CACHE_MAX) {
741
+ long line_len = lc_len + 2 + v_len + 2;
742
+ VALUE line = rb_str_buf_new(line_len);
743
+ rb_str_cat(line, lc_ptr, lc_len);
744
+ rb_str_cat(line, ": ", 2);
745
+ rb_str_cat(line, v_ptr, v_len);
746
+ rb_str_cat(line, "\r\n", 2);
747
+ rb_obj_freeze(line);
748
+
749
+ header_line_cache_entry_t *ne = ALLOC(header_line_cache_entry_t);
750
+ ne->key_v = k;
751
+ ne->val_v = v;
752
+ ne->line = line;
753
+ ne->line_len = line_len;
754
+ ne->is_date = (lc_len == 4 && memcmp(lc_ptr, "date", 4) == 0) ? 1 : 0;
755
+
756
+ rb_ary_push(rb_aHeaderLineAnchor, k);
757
+ rb_ary_push(rb_aHeaderLineAnchor, v);
758
+ rb_ary_push(rb_aHeaderLineAnchor, line);
759
+
760
+ st_insert(g_header_line_cache, (st_data_t)ne, (st_data_t)ne);
761
+ }
762
+
763
+ return ST_CONTINUE;
764
+ }
765
+
445
766
  /* Hyperion::CParser.build_response_head(status, reason, headers, body_size,
446
767
  * keep_alive, date_str) -> String
447
768
  *
@@ -459,6 +780,24 @@ static VALUE cparser_parse(VALUE self, VALUE buffer) {
459
780
  * Header values containing CR/LF raise ArgumentError (response-splitting
460
781
  * guard). Bypasses Ruby Hash#each + per-line String#<< allocation; the
461
782
  * status line, framing headers, and join slices live in C buffers.
783
+ *
784
+ * 2.13-B — four CPU savings over the rc17 baseline:
785
+ * 1. Common (status, reason) pairs hit a static table of pre-baked
786
+ * "HTTP/1.1 NNN <reason>\r\n" lines — one rb_str_cat replaces the
787
+ * per-request snprintf + reason-cat + CRLF-cat triple.
788
+ * 2. Header iteration uses rb_hash_foreach instead of
789
+ * `rb_funcall(:keys)` + per-key `rb_hash_aref` — eliminates the
790
+ * keys-Array allocation and the N hash lookups per call.
791
+ * 3. Per-key downcase result + "<lc>: " prefix is cached on the
792
+ * input frozen String's identity (capped at 64 entries; a
793
+ * misbehaving app emitting unique keys per request just falls
794
+ * back to the slow path on overflow). For the canonical Rack-3
795
+ * app emitting `'content-type' / 'cache-control' / ...` from
796
+ * frozen literals, every header lookup is a single st hit.
797
+ * 4. (key, value) full-line cache: both sides are frozen-literal
798
+ * Strings (e.g. `'cache-control' => 'no-store'`) — entire
799
+ * "<lc-key>: <value>\r\n" line is one rb_str_cat after the first
800
+ * request populates the cache. Capped at 256 entries.
462
801
  */
463
802
  static VALUE cbuild_response_head(VALUE self, VALUE rb_status, VALUE rb_reason,
464
803
  VALUE rb_headers, VALUE rb_body_size,
@@ -475,59 +814,35 @@ static VALUE cbuild_response_head(VALUE self, VALUE rb_status, VALUE rb_reason,
475
814
  /* Most heads fit in 1 KiB; rb_str_cat grows on demand. */
476
815
  VALUE buf = rb_str_buf_new(1024);
477
816
 
478
- /* Status line: "HTTP/1.1 <status> <reason>\r\n" */
479
- char status_line[48];
480
- int n = snprintf(status_line, sizeof(status_line), "HTTP/1.1 %d ", status);
481
- rb_str_cat(buf, status_line, n);
482
- rb_str_cat(buf, RSTRING_PTR(rb_reason), RSTRING_LEN(rb_reason));
483
- rb_str_cat(buf, "\r\n", 2);
484
-
485
- /* Iterate user headers — lowercase key, validate value, skip framing. */
486
- int has_date = 0;
487
-
488
- VALUE keys = rb_funcall(rb_headers, rb_intern("keys"), 0);
489
- long n_keys = RARRAY_LEN(keys);
490
- for (long i = 0; i < n_keys; i++) {
491
- VALUE k = rb_ary_entry(keys, i);
492
- VALUE v = rb_hash_aref(rb_headers, k);
493
-
494
- VALUE k_s = rb_obj_as_string(k);
495
- VALUE v_s = rb_obj_as_string(v);
496
- VALUE k_lower = rb_funcall(k_s, id_downcase, 0);
497
-
498
- const char *k_ptr = RSTRING_PTR(k_lower);
499
- long k_len = RSTRING_LEN(k_lower);
500
- const char *v_ptr = RSTRING_PTR(v_s);
501
- long v_len = RSTRING_LEN(v_s);
502
-
503
- /* CRLF injection guard on value. */
504
- for (long j = 0; j < v_len; j++) {
505
- if (v_ptr[j] == '\r' || v_ptr[j] == '\n') {
506
- rb_raise(rb_eArgError, "header %s contains CR/LF",
507
- RSTRING_PTR(rb_inspect(k_lower)));
508
- }
509
- }
510
-
511
- /* Drop user-supplied content-length / connection — we always set
512
- * these unconditionally below (matches rc16 Ruby behaviour where
513
- * the normalized hash overwrites in place). */
514
- if (k_len == 14 && memcmp(k_ptr, "content-length", 14) == 0) continue;
515
- if (k_len == 10 && memcmp(k_ptr, "connection", 10) == 0) continue;
516
-
517
- if (k_len == 4 && memcmp(k_ptr, "date", 4) == 0) {
518
- has_date = 1;
519
- }
520
-
521
- rb_str_cat(buf, k_ptr, k_len);
522
- rb_str_cat(buf, ": ", 2);
523
- rb_str_cat(buf, v_ptr, v_len);
817
+ /* Status line: pre-baked when (status, reason) is one of the well-known
818
+ * pairs in `Hyperion::ResponseWriter::REASONS`; falls back to
819
+ * `snprintf("HTTP/1.1 %d ", status)` + reason-cat for unknowns. */
820
+ const struct status_line *sline =
821
+ lookup_status_line(status, RSTRING_PTR(rb_reason), RSTRING_LEN(rb_reason));
822
+ if (sline != NULL) {
823
+ rb_str_cat(buf, sline->bytes, sline->len);
824
+ } else {
825
+ char status_line_buf[48];
826
+ int n = snprintf(status_line_buf, sizeof(status_line_buf), "HTTP/1.1 %d ", status);
827
+ rb_str_cat(buf, status_line_buf, n);
828
+ rb_str_cat(buf, RSTRING_PTR(rb_reason), RSTRING_LEN(rb_reason));
524
829
  rb_str_cat(buf, "\r\n", 2);
525
830
  }
526
831
 
527
- /* Framing headers — always emitted. */
528
- char cl_buf[48];
529
- n = snprintf(cl_buf, sizeof(cl_buf), "content-length: %ld\r\n", body_size);
530
- rb_str_cat(buf, cl_buf, n);
832
+ /* Iterate user headers — lowercase key, validate value, skip framing.
833
+ * Threaded through rb_hash_foreach so we can reuse the per-key
834
+ * downcase cache and skip the per-call `keys` Array allocation. */
835
+ build_head_state_t state = { buf, 0 };
836
+ rb_hash_foreach(rb_headers, build_head_each, (VALUE)&state);
837
+
838
+ /* Framing headers — always emitted. content-length uses a hand-rolled
839
+ * itoa rather than snprintf (vfprintf was 1 % of CPU on the
840
+ * CPU-JSON profile). */
841
+ char itoa_scratch[24];
842
+ int cl_off = itoa_positive_decimal(body_size, itoa_scratch, (int)sizeof(itoa_scratch));
843
+ rb_str_cat(buf, "content-length: ", 16);
844
+ rb_str_cat(buf, itoa_scratch + cl_off, sizeof(itoa_scratch) - cl_off);
845
+ rb_str_cat(buf, "\r\n", 2);
531
846
 
532
847
  if (keep_alive) {
533
848
  rb_str_cat(buf, "connection: keep-alive\r\n", 24);
@@ -535,7 +850,7 @@ static VALUE cbuild_response_head(VALUE self, VALUE rb_status, VALUE rb_reason,
535
850
  rb_str_cat(buf, "connection: close\r\n", 19);
536
851
  }
537
852
 
538
- if (!has_date) {
853
+ if (!state.has_date) {
539
854
  rb_str_cat(buf, "date: ", 6);
540
855
  rb_str_cat(buf, RSTRING_PTR(rb_date), RSTRING_LEN(rb_date));
541
856
  rb_str_cat(buf, "\r\n", 2);
@@ -1287,6 +1602,22 @@ void Init_hyperion_http(void) {
1287
1602
  rb_obj_freeze(rb_aHeaderTable);
1288
1603
  rb_define_const(rb_cCParser, "PREINTERNED_HEADERS", rb_aHeaderTable);
1289
1604
 
1605
+ /* 2.13-B — status-line, header-key, header-line caches used by
1606
+ * cbuild_response_head. The status-line table is fixed-size (no GC
1607
+ * concerns; bytes are .rodata). The two header caches are
1608
+ * GC-aware: their contents pin VALUEs through globally-rooted
1609
+ * Anchor Arrays, while the actual st_table maps live for the
1610
+ * extension lifetime (one per process; never freed). */
1611
+ for (struct status_line *e = k_status_lines; e->bytes != NULL; e++) {
1612
+ e->len = (long)strlen(e->bytes);
1613
+ }
1614
+ rb_aHeaderKeyAnchor = rb_ary_new();
1615
+ rb_aHeaderLineAnchor = rb_ary_new();
1616
+ rb_global_variable(&rb_aHeaderKeyAnchor);
1617
+ rb_global_variable(&rb_aHeaderLineAnchor);
1618
+ g_header_key_cache = st_init_numtable();
1619
+ g_header_line_cache = st_init_table(&header_line_cache_type);
1620
+
1290
1621
  /* Phase 1 (1.7.0) — sibling C unit owns Hyperion::Http::Sendfile.
1291
1622
  * Defined in sendfile.c; both objects link into the same .bundle/.so
1292
1623
  * so a single `require 'hyperion_http/hyperion_http'` brings up the
@@ -362,9 +362,24 @@ module Hyperion
362
362
  server_name, server_port = split_host(host_header)
363
363
 
364
364
  env = ENV_POOL.acquire
365
- input = INPUT_POOL.acquire
366
- input.string = request.body
367
- input.rewind
365
+ # 2.13-D — gRPC streaming requests pass a non-String IO-shaped
366
+ # body (Hyperion::Http2Handler::StreamingInput) and must NOT go
367
+ # through the StringIO pool: the StringIO would `string=` consume
368
+ # it as a String and lose the streaming-read semantic. Fall back
369
+ # to the legacy buffered path only when `request.body` is a
370
+ # String — covers HTTP/1.1 (always String) and HTTP/2 unary
371
+ # (String per RequestStream#@request_body). The streaming path
372
+ # tags `input` as nil so the ensure-block release skips the
373
+ # pool return for this request.
374
+ if request.body.is_a?(String)
375
+ input = INPUT_POOL.acquire
376
+ input.string = request.body
377
+ input.rewind
378
+ env['rack.input'] = input
379
+ else
380
+ input = nil
381
+ env['rack.input'] = request.body
382
+ end
368
383
 
369
384
  # Adapter-owned (non-header, non-request-line) env. SERVER_NAME/PORT
370
385
  # need split_host, REMOTE_ADDR needs peer info, the rack.* keys are
@@ -379,7 +394,6 @@ module Hyperion
379
394
  # without a backing socket.
380
395
  env['REMOTE_ADDR'] = request.peer_address || '127.0.0.1'
381
396
  env['rack.url_scheme'] = 'http'
382
- env['rack.input'] = input
383
397
  env['rack.errors'] = $stderr
384
398
  if connection
385
399
  # 2.1.0 (WS-1) — Rack 3 full-hijack. The proc captures the