hyperion-rb 2.12.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -442,6 +442,327 @@ static VALUE cparser_parse(VALUE self, VALUE buffer) {
442
442
  return rb_ary_new_from_args(2, request, ULONG2NUM((unsigned long)consumed));
443
443
  }
444
444
 
445
+ /* 2.13-B — pre-baked status-line table for the most common HTTP status codes.
446
+ * The full "HTTP/1.1 NNN <reason>\r\n" line is a constant for any (status,
447
+ * reason) pair the server emits on the hot path, so we sidestep the
448
+ * per-request `snprintf("HTTP/1.1 %d ", status)` + reason-cat by switching
449
+ * on `status` and emitting a single literal-bytes cat. A non-cached status
450
+ * (or a non-default reason — operator override) still falls through to the
451
+ * generic snprintf path below. The table covers every code in
452
+ * `Hyperion::ResponseWriter::REASONS`. */
453
/* One pre-baked "HTTP/1.1 NNN <reason>\r\n" status line. */
struct status_line {
    int status;        /* numeric HTTP status code; 0 in the terminating sentinel */
    const char *bytes; /* complete status line including CRLF; NULL in the sentinel */
    long len;          /* strlen(bytes); known at compile time, see STATUS_LINE */
};

/* Byte layout of status_line.bytes: "HTTP/1.1 " (9) + NNN (3) + " " (1),
 * then the reason phrase, then CRLF (2). */
enum {
    STATUS_LINE_REASON_OFFSET = 13,
    STATUS_LINE_CRLF_LEN = 2
};

/* The length is derived from the literal with sizeof, so the table is usable
 * before (and independently of) the strlen pass in Init_hyperion_http; that
 * pass now merely re-stores the same values, which is why the table is kept
 * non-const. */
#define STATUS_LINE_TEXT(code, reason) "HTTP/1.1 " #code " " reason "\r\n"
#define STATUS_LINE(code, reason) \
    { (code), STATUS_LINE_TEXT(code, reason), \
      (long)(sizeof(STATUS_LINE_TEXT(code, reason)) - 1) }
static struct status_line k_status_lines[] = {
    STATUS_LINE(200, "OK"),
    STATUS_LINE(201, "Created"),
    STATUS_LINE(204, "No Content"),
    STATUS_LINE(301, "Moved Permanently"),
    STATUS_LINE(302, "Found"),
    STATUS_LINE(304, "Not Modified"),
    STATUS_LINE(400, "Bad Request"),
    STATUS_LINE(401, "Unauthorized"),
    STATUS_LINE(403, "Forbidden"),
    STATUS_LINE(404, "Not Found"),
    STATUS_LINE(405, "Method Not Allowed"),
    STATUS_LINE(408, "Request Timeout"),
    STATUS_LINE(409, "Conflict"),
    STATUS_LINE(410, "Gone"),
    STATUS_LINE(413, "Payload Too Large"),
    STATUS_LINE(414, "URI Too Long"),
    STATUS_LINE(422, "Unprocessable Entity"),
    STATUS_LINE(429, "Too Many Requests"),
    STATUS_LINE(500, "Internal Server Error"),
    STATUS_LINE(501, "Not Implemented"),
    STATUS_LINE(502, "Bad Gateway"),
    STATUS_LINE(503, "Service Unavailable"),
    STATUS_LINE(504, "Gateway Timeout"),
    { 0, NULL, 0 } /* sentinel: bytes == NULL ends iteration */
};
#undef STATUS_LINE
#undef STATUS_LINE_TEXT

/* Look up a pre-baked status line by (status, reason).
 *
 * status      numeric status code to find
 * reason_ptr  reason phrase bytes (need not be NUL-terminated)
 * reason_len  number of bytes at reason_ptr
 *
 * Returns the table entry when the status is present AND the reason phrase
 * is byte-for-byte equal to the table's default (case-sensitive memcmp).
 * Returns NULL when the status is not in the table, when the reason differs
 * in length or content, or when reason_ptr is NULL / reason_len is negative;
 * the caller then formats the status line generically. */
static const struct status_line *lookup_status_line(int status,
                                                    const char *reason_ptr,
                                                    long reason_len) {
    if (reason_ptr == NULL || reason_len < 0) {
        return NULL;
    }
    for (const struct status_line *e = k_status_lines; e->bytes != NULL; e++) {
        if (e->status != status) {
            continue;
        }
        /* Status codes are unique in the table, so a reason mismatch on the
         * matching row is final — no need to keep scanning. */
        long table_reason_len =
            e->len - STATUS_LINE_REASON_OFFSET - STATUS_LINE_CRLF_LEN;
        if (table_reason_len != reason_len) {
            return NULL;
        }
        if (memcmp(e->bytes + STATUS_LINE_REASON_OFFSET, reason_ptr,
                   (size_t)reason_len) != 0) {
            return NULL;
        }
        return e;
    }
    return NULL;
}
509
+
510
/* 2.13-B — hand-rolled non-negative-integer-to-decimal-ASCII writer, used in
 * place of snprintf for the content-length value.
 *
 * n         value to format; expected to be >= 0
 * out       scratch buffer
 * out_size  number of bytes available at out
 *
 * Digits are written right-aligned: the last digit lands in
 * out[out_size - 1]. The return value is the offset of the first digit, so
 * the text is out[ret .. out_size - 1] and its length is out_size - ret.
 * No NUL terminator is written.
 *
 * Edge cases:
 *   - n < 0, out == NULL or out_size <= 0: nothing is written and out_size
 *     is returned (i.e. a zero-length result).
 *   - buffer too small for all digits: only the low-order out_size digits
 *     are written and 0 is returned. */
static int itoa_positive_decimal(long n, char *out, int out_size) {
    if (out == NULL || out_size <= 0 || n < 0) {
        return out_size;
    }

    int i = out_size;
    /* do/while so that n == 0 still emits a single '0'. */
    do {
        out[--i] = (char)('0' + (n % 10));
        n /= 10;
    } while (n > 0 && i > 0);
    return i;
}
529
+
530
+ /* 2.13-B — per-key downcase result cache. Operators overwhelmingly call
531
+ * `build_response_head` with a fixed set of frozen-literal header keys
532
+ * (`'content-type'`, `'cache-control'`, etc.) — same String VALUE every
533
+ * request. Re-running `String#downcase` per call allocates a fresh
534
+ * lowercase String + crosses the FFI boundary; for `n_headers=4` that's
535
+ * 4 allocs + 4 method dispatches per response. The cache keys on the
536
+ * input String's object_id and stores the lowercase VALUE, the
537
+ * pre-built `lc + ": "` prefix line, and the cached length. Cap at 64
538
+ * entries so a misbehaving app emitting a unique `x-trace-<uuid>` key
539
+ * per request can't grow the cache without bound — it just falls
540
+ * through to the slow path on overflow.
541
+ *
542
+ * Pinning: each cached VALUE is anchored in a Ruby Array (`rb_aHeaderKeyAnchor`)
543
+ * registered as a global. The cache itself is an `st_table` keyed by
544
+ * VALUE bits (the input frozen String's id, since it's frozen and safe
545
+ * to reference forever). */
546
+ #define HEADER_KEY_CACHE_MAX 64
547
+ typedef struct { /* one record of the per-key downcase cache (the st_table value) */
548
+ VALUE key; /* header key VALUE exactly as the caller passed it; also the st_table key */
549
+ VALUE lc; /* frozen result of key.to_s.downcase (always a separate String object) */
550
+ VALUE prefix; /* frozen "<lc>: " String, lc_len + 2 bytes, ready to cat */
551
+ long lc_len; /* RSTRING_LEN(lc) captured when the entry was built */
552
+ } header_key_cache_entry_t;
553
+
554
+ static st_table *g_header_key_cache = NULL;
555
+ static VALUE rb_aHeaderKeyAnchor; /* keeps cached VALUEs alive */
556
+
557
+ /* 2.13-B — full header-line cache. When BOTH the key AND the value of a
558
+ * header are frozen-literal Strings (the overwhelmingly common case for
559
+ * fixed Rack apps: `'cache-control' => 'no-store'`,
560
+ * `'content-type' => 'application/json'`), the entire wire line
561
+ * `"<lc-key>: <value>\r\n"` is identical every request. Cache it keyed
562
+ * on `(key.object_id, value.object_id)`; on hit the entire emit is one
563
+ * `rb_str_cat`. Same cap strategy (256 entries here) + same anchor-Array pinning as the
564
+ * key cache. The `value` slot pins the original value VALUE so the
565
+ * frozen literal isn't reclaimed. */
566
+ #define HEADER_LINE_CACHE_MAX 256
567
+ typedef struct { /* one record of the full-line cache; serves as both st key and st value */
568
+ /* Two-word identity key: header key VALUE bits + header value VALUE bits. */
569
+ VALUE key_v; /* header key String as passed in (compared by identity) */
570
+ VALUE val_v; /* header value String as passed in (compared by identity) */
571
+ VALUE line; /* frozen "<lc-key>: <value>\r\n" String */
572
+ long line_len; /* byte length of line: lc_len + 2 + v_len + 2 */
573
+ int is_date; /* 1 if lc-key == "date" — a hit then sets has_date in the foreach state */
574
+ } header_line_cache_entry_t;
575
+
576
+ static st_table *g_header_line_cache = NULL;
577
+ static VALUE rb_aHeaderLineAnchor;
578
+
579
+ static st_index_t header_line_cache_hash(st_data_t a) {
580
+ /* Combine the two VALUEs via a simple xor+mul mix. The VALUEs are
581
+ * pointers to frozen Strings — the low 3 bits are alignment so we
582
+ * shift before mixing to avoid trivial collisions. */
583
+ const header_line_cache_entry_t *e = (const header_line_cache_entry_t *)a;
584
+ st_data_t x = ((st_data_t)e->key_v >> 3) * 0x9E3779B97F4A7C15ULL;
585
+ st_data_t y = ((st_data_t)e->val_v >> 3) * 0xBF58476D1CE4E5B9ULL;
586
+ return (st_index_t)(x ^ y);
587
+ }
588
+ static int header_line_cache_cmp(st_data_t a, st_data_t b) {
589
+ const header_line_cache_entry_t *ea = (const header_line_cache_entry_t *)a;
590
+ const header_line_cache_entry_t *eb = (const header_line_cache_entry_t *)b;
591
+ /* st returns 0 on match (same as memcmp). */
592
+ return !(ea->key_v == eb->key_v && ea->val_v == eb->val_v);
593
+ }
594
+ static const struct st_hash_type header_line_cache_type = {
595
+ header_line_cache_cmp,
596
+ header_line_cache_hash
597
+ };
598
+
599
+ /* Reuse the same cap-and-anchor strategy from the key cache. Look up by
600
+ * a stack-allocated probe entry; on miss + room, allocate a new entry
601
+ * and st_insert. */
602
+ static const header_line_cache_entry_t *header_line_cache_lookup(VALUE key, VALUE val) {
603
+ if (g_header_line_cache == NULL) return NULL;
604
+ header_line_cache_entry_t probe = { key, val, Qnil, 0, 0 };
605
+ st_data_t found_data;
606
+ if (st_lookup(g_header_line_cache, (st_data_t)&probe, &found_data)) {
607
+ return (const header_line_cache_entry_t *)found_data;
608
+ }
609
+ return NULL;
610
+ }
611
+
612
+ /* Lookup-or-build for the per-key downcase cache. Fast path: st hit, return
613
+ * the cached entry. Slow path: cap-bound check, freeze + lowercase the key,
614
+ * build the "<lc>: " prefix String, anchor both in rb_aHeaderKeyAnchor,
615
+ * st_insert. The anchor Array keeps the VALUEs alive across GC.
616
+ *
617
+ * Returns NULL when the cache is full AND the input key isn't already
618
+ * lowercase + already short — caller falls through to the per-call
619
+ * downcase path. */
620
+ static const header_key_cache_entry_t *header_key_cache_lookup(VALUE key_v) {
621
+ if (g_header_key_cache != NULL) {
622
+ st_data_t found_data;
623
+ if (st_lookup(g_header_key_cache, (st_data_t)key_v, &found_data)) {
624
+ return (const header_key_cache_entry_t *)found_data;
625
+ }
626
+ if (g_header_key_cache->num_entries >= HEADER_KEY_CACHE_MAX) {
627
+ return NULL; /* don't grow past cap */
628
+ }
629
+ } else {
630
+ g_header_key_cache = st_init_numtable();
631
+ }
632
+
633
+ /* Build the entry. Coerce to String, downcase, freeze, build prefix. */
634
+ VALUE k_s = rb_obj_as_string(key_v);
635
+ VALUE k_lower = rb_funcall(k_s, id_downcase, 0);
636
+ if (!OBJ_FROZEN(k_lower)) k_lower = rb_obj_freeze(k_lower);
637
+
638
+ long lc_len = RSTRING_LEN(k_lower);
639
+ VALUE prefix = rb_str_buf_new(lc_len + 2);
640
+ rb_str_cat(prefix, RSTRING_PTR(k_lower), lc_len);
641
+ rb_str_cat(prefix, ": ", 2);
642
+ rb_obj_freeze(prefix);
643
+
644
+ header_key_cache_entry_t *e = ALLOC(header_key_cache_entry_t);
645
+ e->key = key_v;
646
+ e->lc = k_lower;
647
+ e->prefix = prefix;
648
+ e->lc_len = lc_len;
649
+
650
+ /* Pin the VALUEs (key isn't ours to extend lifetime of, but lc/prefix
651
+ * are; rooting all three in the anchor Array is simplest + safest). */
652
+ rb_ary_push(rb_aHeaderKeyAnchor, key_v);
653
+ rb_ary_push(rb_aHeaderKeyAnchor, k_lower);
654
+ rb_ary_push(rb_aHeaderKeyAnchor, prefix);
655
+
656
+ st_insert(g_header_key_cache, (st_data_t)key_v, (st_data_t)e);
657
+ return e;
658
+ }
659
+
660
+ /* foreach state for the response-head builder. Threads the response buffer
661
+ * + framing flags through `rb_hash_foreach`. Errors propagate via
662
+ * `rb_raise` (longjmp-safe; the foreach unwinds and the buffer's RBasic
663
+ * pinning lets GC reclaim it). */
664
+ typedef struct { /* state passed to build_head_each through rb_hash_foreach's arg */
665
+ VALUE buf; /* response-head String that each header line is appended to */
666
+ int has_date; /* set to 1 once a header whose lowercase key is "date" is emitted */
667
+ } build_head_state_t;
668
+
669
+ static int build_head_each(VALUE k, VALUE v, VALUE arg) {
670
+ build_head_state_t *st = (build_head_state_t *)arg;
671
+
672
+ /* Full-line cache fast path: BOTH key AND value are frozen-literal
673
+ * Strings AND the (key, value) pair is already cached. ONE rb_str_cat
674
+ * consumes the entire prebuilt "<lc-key>: <value>\r\n" line. */
675
+ if (TYPE(k) == T_STRING && TYPE(v) == T_STRING &&
676
+ OBJ_FROZEN_RAW(k) && OBJ_FROZEN_RAW(v)) {
677
+ const header_line_cache_entry_t *line_e = header_line_cache_lookup(k, v);
678
+ if (line_e != NULL) {
679
+ rb_str_cat(st->buf, RSTRING_PTR(line_e->line), line_e->line_len);
680
+ if (line_e->is_date) st->has_date = 1;
681
+ return ST_CONTINUE;
682
+ }
683
+ }
684
+
685
+ /* Cached prefix path: lowercase form + "<lc>: " bytes already built. */
686
+ const header_key_cache_entry_t *e = header_key_cache_lookup(k);
687
+ VALUE lc;
688
+ const char *lc_ptr;
689
+ long lc_len;
690
+ VALUE prefix; /* always the cached "<lc>: " when e != NULL */
691
+ if (e != NULL) {
692
+ lc = e->lc;
693
+ lc_ptr = RSTRING_PTR(lc);
694
+ lc_len = e->lc_len;
695
+ prefix = e->prefix;
696
+ } else {
697
+ /* Cap exceeded: fall through to the per-call downcase. Still
698
+ * cheaper than the legacy path because we skip the keys-Array
699
+ * iteration overhead. */
700
+ VALUE k_s = rb_obj_as_string(k);
701
+ lc = rb_funcall(k_s, id_downcase, 0);
702
+ lc_ptr = RSTRING_PTR(lc);
703
+ lc_len = RSTRING_LEN(lc);
704
+ prefix = Qnil;
705
+ }
706
+
707
+ VALUE v_s = rb_obj_as_string(v);
708
+ const char *v_ptr = RSTRING_PTR(v_s);
709
+ long v_len = RSTRING_LEN(v_s);
710
+
711
+ /* CRLF injection guard on value. */
712
+ for (long j = 0; j < v_len; j++) {
713
+ if (v_ptr[j] == '\r' || v_ptr[j] == '\n') {
714
+ rb_raise(rb_eArgError, "header %s contains CR/LF",
715
+ RSTRING_PTR(rb_inspect(lc)));
716
+ }
717
+ }
718
+
719
+ /* Drop user-supplied content-length / connection — we always set
720
+ * these unconditionally below. */
721
+ if (lc_len == 14 && memcmp(lc_ptr, "content-length", 14) == 0) return ST_CONTINUE;
722
+ if (lc_len == 10 && memcmp(lc_ptr, "connection", 10) == 0) return ST_CONTINUE;
723
+
724
+ if (lc_len == 4 && memcmp(lc_ptr, "date", 4) == 0) st->has_date = 1;
725
+
726
+ if (prefix != Qnil) {
727
+ rb_str_cat(st->buf, RSTRING_PTR(prefix), lc_len + 2);
728
+ } else {
729
+ rb_str_cat(st->buf, lc_ptr, lc_len);
730
+ rb_str_cat(st->buf, ": ", 2);
731
+ }
732
+ rb_str_cat(st->buf, v_ptr, v_len);
733
+ rb_str_cat(st->buf, "\r\n", 2);
734
+
735
+ /* Populate the line cache for next time when both sides are frozen
736
+ * literals and we have room. */
737
+ if (g_header_line_cache != NULL &&
738
+ TYPE(k) == T_STRING && TYPE(v) == T_STRING &&
739
+ OBJ_FROZEN_RAW(k) && OBJ_FROZEN_RAW(v) &&
740
+ g_header_line_cache->num_entries < HEADER_LINE_CACHE_MAX) {
741
+ long line_len = lc_len + 2 + v_len + 2;
742
+ VALUE line = rb_str_buf_new(line_len);
743
+ rb_str_cat(line, lc_ptr, lc_len);
744
+ rb_str_cat(line, ": ", 2);
745
+ rb_str_cat(line, v_ptr, v_len);
746
+ rb_str_cat(line, "\r\n", 2);
747
+ rb_obj_freeze(line);
748
+
749
+ header_line_cache_entry_t *ne = ALLOC(header_line_cache_entry_t);
750
+ ne->key_v = k;
751
+ ne->val_v = v;
752
+ ne->line = line;
753
+ ne->line_len = line_len;
754
+ ne->is_date = (lc_len == 4 && memcmp(lc_ptr, "date", 4) == 0) ? 1 : 0;
755
+
756
+ rb_ary_push(rb_aHeaderLineAnchor, k);
757
+ rb_ary_push(rb_aHeaderLineAnchor, v);
758
+ rb_ary_push(rb_aHeaderLineAnchor, line);
759
+
760
+ st_insert(g_header_line_cache, (st_data_t)ne, (st_data_t)ne);
761
+ }
762
+
763
+ return ST_CONTINUE;
764
+ }
765
+
445
766
  /* Hyperion::CParser.build_response_head(status, reason, headers, body_size,
446
767
  * keep_alive, date_str) -> String
447
768
  *
@@ -459,6 +780,24 @@ static VALUE cparser_parse(VALUE self, VALUE buffer) {
459
780
  * Header values containing CR/LF raise ArgumentError (response-splitting
460
781
  * guard). Bypasses Ruby Hash#each + per-line String#<< allocation; the
461
782
  * status line, framing headers, and join slices live in C buffers.
783
+ *
784
+ * 2.13-B — four CPU savings over the rc17 baseline:
785
+ * 1. Common (status, reason) pairs hit a static table of pre-baked
786
+ * "HTTP/1.1 NNN <reason>\r\n" lines — one rb_str_cat replaces the
787
+ * per-request snprintf + reason-cat + CRLF-cat triple.
788
+ * 2. Header iteration uses rb_hash_foreach instead of
789
+ * `rb_funcall(:keys)` + per-key `rb_hash_aref` — eliminates the
790
+ * keys-Array allocation and the N hash lookups per call.
791
+ * 3. Per-key downcase result + "<lc>: " prefix is cached on the
792
+ * input frozen String's identity (capped at 64 entries; a
793
+ * misbehaving app emitting unique keys per request just falls
794
+ * back to the slow path on overflow). For the canonical Rack-3
795
+ * app emitting `'content-type' / 'cache-control' / ...` from
796
+ * frozen literals, every header lookup is a single st hit.
797
+ * 4. (key, value) full-line cache: both sides are frozen-literal
798
+ * Strings (e.g. `'cache-control' => 'no-store'`) — entire
799
+ * "<lc-key>: <value>\r\n" line is one rb_str_cat after the first
800
+ * request populates the cache. Capped at 256 entries.
462
801
  */
463
802
  static VALUE cbuild_response_head(VALUE self, VALUE rb_status, VALUE rb_reason,
464
803
  VALUE rb_headers, VALUE rb_body_size,
@@ -475,59 +814,35 @@ static VALUE cbuild_response_head(VALUE self, VALUE rb_status, VALUE rb_reason,
475
814
  /* Most heads fit in 1 KiB; rb_str_cat grows on demand. */
476
815
  VALUE buf = rb_str_buf_new(1024);
477
816
 
478
- /* Status line: "HTTP/1.1 <status> <reason>\r\n" */
479
- char status_line[48];
480
- int n = snprintf(status_line, sizeof(status_line), "HTTP/1.1 %d ", status);
481
- rb_str_cat(buf, status_line, n);
482
- rb_str_cat(buf, RSTRING_PTR(rb_reason), RSTRING_LEN(rb_reason));
483
- rb_str_cat(buf, "\r\n", 2);
484
-
485
- /* Iterate user headers — lowercase key, validate value, skip framing. */
486
- int has_date = 0;
487
-
488
- VALUE keys = rb_funcall(rb_headers, rb_intern("keys"), 0);
489
- long n_keys = RARRAY_LEN(keys);
490
- for (long i = 0; i < n_keys; i++) {
491
- VALUE k = rb_ary_entry(keys, i);
492
- VALUE v = rb_hash_aref(rb_headers, k);
493
-
494
- VALUE k_s = rb_obj_as_string(k);
495
- VALUE v_s = rb_obj_as_string(v);
496
- VALUE k_lower = rb_funcall(k_s, id_downcase, 0);
497
-
498
- const char *k_ptr = RSTRING_PTR(k_lower);
499
- long k_len = RSTRING_LEN(k_lower);
500
- const char *v_ptr = RSTRING_PTR(v_s);
501
- long v_len = RSTRING_LEN(v_s);
502
-
503
- /* CRLF injection guard on value. */
504
- for (long j = 0; j < v_len; j++) {
505
- if (v_ptr[j] == '\r' || v_ptr[j] == '\n') {
506
- rb_raise(rb_eArgError, "header %s contains CR/LF",
507
- RSTRING_PTR(rb_inspect(k_lower)));
508
- }
509
- }
510
-
511
- /* Drop user-supplied content-length / connection — we always set
512
- * these unconditionally below (matches rc16 Ruby behaviour where
513
- * the normalized hash overwrites in place). */
514
- if (k_len == 14 && memcmp(k_ptr, "content-length", 14) == 0) continue;
515
- if (k_len == 10 && memcmp(k_ptr, "connection", 10) == 0) continue;
516
-
517
- if (k_len == 4 && memcmp(k_ptr, "date", 4) == 0) {
518
- has_date = 1;
519
- }
520
-
521
- rb_str_cat(buf, k_ptr, k_len);
522
- rb_str_cat(buf, ": ", 2);
523
- rb_str_cat(buf, v_ptr, v_len);
817
+ /* Status line: pre-baked when (status, reason) is one of the well-known
818
+ * pairs in `Hyperion::ResponseWriter::REASONS`; falls back to
819
+ * `snprintf("HTTP/1.1 %d ", status)` + reason-cat for unknowns. */
820
+ const struct status_line *sline =
821
+ lookup_status_line(status, RSTRING_PTR(rb_reason), RSTRING_LEN(rb_reason));
822
+ if (sline != NULL) {
823
+ rb_str_cat(buf, sline->bytes, sline->len);
824
+ } else {
825
+ char status_line_buf[48];
826
+ int n = snprintf(status_line_buf, sizeof(status_line_buf), "HTTP/1.1 %d ", status);
827
+ rb_str_cat(buf, status_line_buf, n);
828
+ rb_str_cat(buf, RSTRING_PTR(rb_reason), RSTRING_LEN(rb_reason));
524
829
  rb_str_cat(buf, "\r\n", 2);
525
830
  }
526
831
 
527
- /* Framing headers — always emitted. */
528
- char cl_buf[48];
529
- n = snprintf(cl_buf, sizeof(cl_buf), "content-length: %ld\r\n", body_size);
530
- rb_str_cat(buf, cl_buf, n);
832
+ /* Iterate user headers — lowercase key, validate value, skip framing.
833
+ * Threaded through rb_hash_foreach so we can reuse the per-key
834
+ * downcase cache and skip the per-call `keys` Array allocation. */
835
+ build_head_state_t state = { buf, 0 };
836
+ rb_hash_foreach(rb_headers, build_head_each, (VALUE)&state);
837
+
838
+ /* Framing headers — always emitted. content-length uses a hand-rolled
839
+ * itoa rather than snprintf (vfprintf was 1 % of CPU on the
840
+ * CPU-JSON profile). */
841
+ char itoa_scratch[24];
842
+ int cl_off = itoa_positive_decimal(body_size, itoa_scratch, (int)sizeof(itoa_scratch));
843
+ rb_str_cat(buf, "content-length: ", 16);
844
+ rb_str_cat(buf, itoa_scratch + cl_off, sizeof(itoa_scratch) - cl_off);
845
+ rb_str_cat(buf, "\r\n", 2);
531
846
 
532
847
  if (keep_alive) {
533
848
  rb_str_cat(buf, "connection: keep-alive\r\n", 24);
@@ -535,7 +850,7 @@ static VALUE cbuild_response_head(VALUE self, VALUE rb_status, VALUE rb_reason,
535
850
  rb_str_cat(buf, "connection: close\r\n", 19);
536
851
  }
537
852
 
538
- if (!has_date) {
853
+ if (!state.has_date) {
539
854
  rb_str_cat(buf, "date: ", 6);
540
855
  rb_str_cat(buf, RSTRING_PTR(rb_date), RSTRING_LEN(rb_date));
541
856
  rb_str_cat(buf, "\r\n", 2);
@@ -1287,6 +1602,22 @@ void Init_hyperion_http(void) {
1287
1602
  rb_obj_freeze(rb_aHeaderTable);
1288
1603
  rb_define_const(rb_cCParser, "PREINTERNED_HEADERS", rb_aHeaderTable);
1289
1604
 
1605
+ /* 2.13-B — status-line, header-key, header-line caches used by
1606
+ * cbuild_response_head. The status-line table is fixed-size (no GC
1607
+ * concerns; bytes are .rodata). The two header caches are
1608
+ * GC-aware: their contents pin VALUEs through globally-rooted
1609
+ * Anchor Arrays, while the actual st_table maps live for the
1610
+ * extension lifetime (one per process; never freed). */
1611
+ for (struct status_line *e = k_status_lines; e->bytes != NULL; e++) {
1612
+ e->len = (long)strlen(e->bytes);
1613
+ }
1614
+ rb_aHeaderKeyAnchor = rb_ary_new();
1615
+ rb_aHeaderLineAnchor = rb_ary_new();
1616
+ rb_global_variable(&rb_aHeaderKeyAnchor);
1617
+ rb_global_variable(&rb_aHeaderLineAnchor);
1618
+ g_header_key_cache = st_init_numtable();
1619
+ g_header_line_cache = st_init_table(&header_line_cache_type);
1620
+
1290
1621
  /* Phase 1 (1.7.0) — sibling C unit owns Hyperion::Http::Sendfile.
1291
1622
  * Defined in sendfile.c; both objects link into the same .bundle/.so
1292
1623
  * so a single `require 'hyperion_http/hyperion_http'` brings up the
@@ -362,9 +362,24 @@ module Hyperion
362
362
  server_name, server_port = split_host(host_header)
363
363
 
364
364
  env = ENV_POOL.acquire
365
- input = INPUT_POOL.acquire
366
- input.string = request.body
367
- input.rewind
365
+ # 2.13-D — gRPC streaming requests pass a non-String IO-shaped
366
+ # body (Hyperion::Http2Handler::StreamingInput) and must NOT go
367
+ # through the StringIO pool: the StringIO would `string=` consume
368
+ # it as a String and lose the streaming-read semantic. Fall back
369
+ # to the legacy buffered path only when `request.body` is a
370
+ # String — covers HTTP/1.1 (always String) and HTTP/2 unary
371
+ # (String per RequestStream#@request_body). The streaming path
372
+ # tags `input` as nil so the ensure-block release skips the
373
+ # pool return for this request.
374
+ if request.body.is_a?(String)
375
+ input = INPUT_POOL.acquire
376
+ input.string = request.body
377
+ input.rewind
378
+ env['rack.input'] = input
379
+ else
380
+ input = nil
381
+ env['rack.input'] = request.body
382
+ end
368
383
 
369
384
  # Adapter-owned (non-header, non-request-line) env. SERVER_NAME/PORT
370
385
  # need split_host, REMOTE_ADDR needs peer info, the rack.* keys are
@@ -379,7 +394,6 @@ module Hyperion
379
394
  # without a backing socket.
380
395
  env['REMOTE_ADDR'] = request.peer_address || '127.0.0.1'
381
396
  env['rack.url_scheme'] = 'http'
382
- env['rack.input'] = input
383
397
  env['rack.errors'] = $stderr
384
398
  if connection
385
399
  # 2.1.0 (WS-1) — Rack 3 full-hijack. The proc captures the
@@ -141,6 +141,18 @@ module Hyperion
141
141
  # asked Ruby for the symbol/label). Each Connection lives in
142
142
  # exactly one process, so the cache is tight and never stale.
143
143
  @worker_id = Process.pid.to_s
144
+ # 2.13-A — pre-build the frozen single-element label tuple that
145
+ # `tick_worker_request` would otherwise allocate every request
146
+ # (`[@worker_id]` per call). Per-Connection caching is safe
147
+ # because @worker_id is process-constant and the tuple is
148
+ # frozen so consumers can't mutate the shared instance.
149
+ @worker_id_label_tuple = [@worker_id].freeze
150
+ # 2.13-A — register the labeled-counter family ONCE here (used
151
+ # to fire on every `tick_worker_request` via an `unless`-flag
152
+ # check; the early-return cost is small but real on the
153
+ # 8000 r/s -c1 single-thread profile). After this, the
154
+ # request loop calls `increment_labeled_counter` directly.
155
+ @metrics.ensure_worker_request_family_registered!
144
156
  # 2.10-D — direct-dispatch route table. The hot-path lookup
145
157
  # is `@route_table&.lookup(method, path)` so the nil-default
146
158
  # case (no operator-registered direct routes — the
@@ -320,7 +332,16 @@ module Hyperion
320
332
  # via `dispatch_request`, direct dispatch via `dispatch_direct!`,
321
333
  # and the StaticEntry fast path via `dispatch_direct_static!`
322
334
  # all flow through this point in `serve`.
323
- @metrics.tick_worker_request(@worker_id)
335
+ #
336
+ # 2.13-A — call `increment_labeled_counter` directly with the
337
+ # pre-built frozen `[@worker_id]` tuple instead of going
338
+ # through `tick_worker_request`. The wrapper allocates a
339
+ # fresh `[label]` array AND calls `worker_id.to_s` per
340
+ # request; cached tuple skips both. Family registration was
341
+ # done once in the constructor (idempotent on the Metrics
342
+ # instance) so the request loop is registration-free.
343
+ @metrics.increment_labeled_counter(Hyperion::Metrics::REQUESTS_DISPATCH_TOTAL,
344
+ @worker_id_label_tuple)
324
345
  # 2.4-C: capture start time for the per-route duration histogram.
325
346
  # Same Process.clock_gettime that the access-log path was already
326
347
  # paying — at default-ON log_requests the second call here is
@@ -788,10 +809,35 @@ module Hyperion
788
809
  )
789
810
  end
790
811
 
812
+ # 2.13-A — Rack 3 (the version Hyperion advertises in
813
+ # `env['rack.version']`) requires response header keys to be
814
+ # lowercase Strings (Rack 3 spec §6.4 "Headers must be a Hash;
815
+ # the header keys must be lowercase Strings"). Pre-2.13-A this
816
+ # method scanned the whole Hash via `headers.find` + per-key
817
+ # `k.to_s.downcase` to find the Connection header — that's an
818
+ # O(N) walk + N transient string allocations on EVERY response
819
+ # (and most responses don't carry a Connection header at all,
820
+ # so the loop ran to completion every time).
821
+ #
822
+ # The new path is a single Hash lookup. Apps that violate the
823
+ # Rack 3 spec by returning mixed-case keys (some legacy gems
824
+ # still do; less common in 2026) lose the Connection-close
825
+ # signal and stay on keep-alive — that's a benign degradation
826
+ # (the connection is reused; the next request still goes through
827
+ # request-side `Connection: close` parsing) and the fix is to
828
+ # update the app to spec.
829
+ CONNECTION_HEADER_KEY_DOWNCASE = 'connection'
830
+
791
831
  def should_keep_alive?(request, _status, headers)
792
- # App-emitted Connection: close wins.
793
- conn_response = headers.find { |k, _| k.to_s.downcase == 'connection' }
794
- return false if conn_response && conn_response.last.to_s.downcase == 'close'
832
+ # App-emitted Connection: close wins. Rack-3 fast path: O(1)
833
+ # Hash lookup; non-Hash headers (Array-of-pairs, etc.) fall
834
+ # back to a single allocation-free scan.
835
+ conn_response_value = if headers.is_a?(Hash)
836
+ headers[CONNECTION_HEADER_KEY_DOWNCASE]
837
+ else
838
+ find_connection_header_array(headers)
839
+ end
840
+ return false if conn_response_value && conn_response_value.to_s.downcase == 'close'
795
841
 
796
842
  # Request-side Connection header.
797
843
  conn_request = request.header('connection')&.downcase
@@ -806,6 +852,21 @@ module Hyperion
806
852
  end
807
853
  end
808
854
 
855
+ # 2.13-A — non-Hash headers fallback (Array of [key, value] pairs).
856
+ # Rack 3 mandates Hash, but legacy code occasionally returns an
857
+ # Array; we walk it case-sensitively because Rack-3 lowercase is
858
+ # part of the contract for non-Hash returns too. Apps emitting
859
+ # `'Connection'`-cased keys via Array form fall through to no-
860
+ # match and stay on keep-alive — same benign degradation as the
861
+ # Hash branch.
862
# Scans a non-Hash headers collection for the lowercase 'connection' key.
# Elements that are not Arrays of at least two items are ignored. The key
# comparison is exact (case-sensitive). Returns the value of the first
# matching pair, or nil when nothing matches.
def find_connection_header_array(headers)
  headers.each do |entry|
    if entry.is_a?(Array) && entry.size > 1 && entry.first == CONNECTION_HEADER_KEY_DOWNCASE
      return entry[1]
    end
  end
  nil
end
869
+
809
870
  def set_idle_timeout(socket)
810
871
  socket.timeout = IDLE_KEEPALIVE_TIMEOUT_SECONDS if socket.respond_to?(:timeout=)
811
872
  rescue StandardError