PyPI - coderouter-cli - Versions diffs - 2.5.2__tar.gz → 2.5.3__tar.gz - Mend

coderouter-cli 2.5.2tar.gz → 2.5.3tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (203) hide show

{coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/.gitignore RENAMED Viewed

@@ -89,3 +89,6 @@ docs/articles/
 # of the repo because the Vault path includes personal Vault names.
 .env.publish.tpl
 .env.tpl
+# FUSE / virtiofs artifacts
+.fuse_hidden*

{coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: coderouter-cli
-Version: 2.5.2
+Version: 2.5.3
 Summary: Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible.
 Project-URL: Homepage, https://github.com/zephel01/CodeRouter
 Project-URL: Repository, https://github.com/zephel01/CodeRouter
@@ -27,6 +27,8 @@ Requires-Dist: httpx>=0.27.0
 Requires-Dist: pydantic>=2.9.0
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: uvicorn[standard]>=0.32.0
+Provides-Extra: accuracy
+Requires-Dist: tokenizers>=0.20; extra == 'accuracy'
 Provides-Extra: dev
 Requires-Dist: mypy>=1.13.0; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
@@ -37,6 +39,8 @@ Requires-Dist: ruff>=0.7.0; extra == 'dev'
 Requires-Dist: types-pyyaml>=6.0.12; extra == 'dev'
 Provides-Extra: doctor
 Requires-Dist: ruamel-yaml>=0.18.6; extra == 'doctor'
+Provides-Extra: repair
+Requires-Dist: json-repair>=0.30; extra == 'repair'
 Description-Content-Type: text/markdown
 <h1 align="center">CodeRouter</h1>

coderouter_cli-2.5.3/coderouter/gguf_introspect.py ADDED Viewed

@@ -0,0 +1,304 @@
+"""Minimal, dependency-free GGUF header introspection (low-memory track).
+Why self-written
+================
+To right-size ``num_ctx`` *before* dispatch we need a model's layer
+count and embedding width so the KV-cache footprint can be estimated.
+That data lives in the GGUF metadata header. Rather than add the
+official ``gguf`` package (and its ``numpy`` transitive dep) we read
+only the handful of header fields we need with the standard library —
+preserving the 5-deps invariant.
+The GGUF binary layout we parse (little-endian):
+  magic      : 4 bytes  == b"GGUF"
+  version    : uint32   (2 or 3 supported)
+  tensor_cnt : uint64   (ignored — we never read tensor data)
+  kv_count   : uint64   (number of metadata key/value pairs)
+  kv_pairs   : kv_count repetitions of:
+      key        : gguf-string (uint64 length + UTF-8 bytes)
+      value_type : uint32  (see _GGUF_TYPE_*)
+      value      : type-dependent
+We walk the KV pairs, capturing only the keys we care about, and skip
+the rest (including arbitrarily nested arrays) without materialising
+them.
+Security
+========
+The parser treats the file as **untrusted input**:
+  * Every string length and array element count is clamped against
+    :data:`_MAX_STR_BYTES` / :data:`_MAX_ARRAY_LEN` so a corrupt or
+    hostile header cannot trigger a multi-GB allocation (DoS).
+  * Reads past EOF raise :class:`GGUFParseError`, never an unbounded
+    loop.
+  * No ``mmap``, no tensor payload read, no code execution path — we
+    only seek/read a small prefix.
+"""
+from __future__ import annotations
+import struct
+from dataclasses import dataclass
+from pathlib import Path
+from typing import BinaryIO
+# ---------------------------------------------------------------------------
+# Constants / format
+# ---------------------------------------------------------------------------
+# Four-byte signature at offset 0; read_gguf_metadata rejects any other value.
+_GGUF_MAGIC = b"GGUF"
+# GGUF metadata value type tags.
+_GGUF_TYPE_UINT8 = 0
+_GGUF_TYPE_INT8 = 1
+_GGUF_TYPE_UINT16 = 2
+_GGUF_TYPE_INT16 = 3
+_GGUF_TYPE_UINT32 = 4
+_GGUF_TYPE_INT32 = 5
+_GGUF_TYPE_FLOAT32 = 6
+_GGUF_TYPE_BOOL = 7
+_GGUF_TYPE_STRING = 8
+_GGUF_TYPE_ARRAY = 9
+_GGUF_TYPE_UINT64 = 10
+_GGUF_TYPE_INT64 = 11
+_GGUF_TYPE_FLOAT64 = 12
+# Fixed-width scalar (struct format, size) by type tag.
+# Every format string is little-endian ("<"). STRING and ARRAY have no entry
+# here: they are variable-length and are handled by _read_gguf_string and
+# _skip_value respectively.
+_SCALAR: dict[int, tuple[str, int]] = {
+    _GGUF_TYPE_UINT8: ("<B", 1),
+    _GGUF_TYPE_INT8: ("<b", 1),
+    _GGUF_TYPE_UINT16: ("<H", 2),
+    _GGUF_TYPE_INT16: ("<h", 2),
+    _GGUF_TYPE_UINT32: ("<I", 4),
+    _GGUF_TYPE_INT32: ("<i", 4),
+    _GGUF_TYPE_FLOAT32: ("<f", 4),
+    _GGUF_TYPE_BOOL: ("<?", 1),
+    _GGUF_TYPE_UINT64: ("<Q", 8),
+    _GGUF_TYPE_INT64: ("<q", 8),
+    _GGUF_TYPE_FLOAT64: ("<d", 8),
+}
+# Defensive clamps against hostile / corrupt headers.
+# Each limit is compared against a length field read from the file before
+# that length is used to read bytes or drive a loop.
+_MAX_STR_BYTES: int = 1 << 20  # 1 MiB key/value string ceiling
+_MAX_ARRAY_LEN: int = 1 << 24  # element-count ceiling for arrays
+_MAX_KV_PAIRS: int = 1 << 20  # metadata pair ceiling
+# Human-readable names for the GGUF ``general.file_type`` enum (subset).
+# Values without an entry (4-6 here, and anything above 31) are not an error:
+# GGUFInfo.quant_name renders them as ``type<N>``.
+_FILE_TYPE_NAMES: dict[int, str] = {
+    0: "F32",
+    1: "F16",
+    2: "Q4_0",
+    3: "Q4_1",
+    7: "Q8_0",
+    8: "Q5_0",
+    9: "Q5_1",
+    10: "Q2_K",
+    11: "Q3_K_S",
+    12: "Q3_K_M",
+    13: "Q3_K_L",
+    14: "Q4_K_S",
+    15: "Q4_K_M",
+    16: "Q5_K_S",
+    17: "Q5_K_M",
+    18: "Q6_K",
+    19: "IQ2_XXS",
+    20: "IQ2_XS",
+    21: "Q2_K_S",
+    22: "IQ3_XS",
+    23: "IQ3_XXS",
+    24: "IQ1_S",
+    25: "IQ4_NL",
+    26: "IQ3_S",
+    27: "IQ3_M",
+    28: "IQ2_S",
+    29: "IQ2_M",
+    30: "IQ4_XS",
+    31: "IQ1_M",
+}
+class GGUFParseError(Exception):
+    """Raised when a file is not a parseable GGUF header.
+    Covers every rejection this module makes: a file that cannot be
+    stat'ed, a bad magic, an unsupported version, an unexpected EOF, an
+    unknown value type tag, and any length that exceeds a defensive cap.
+    """
+@dataclass(frozen=True, slots=True)
+class GGUFInfo:
+    """The subset of GGUF metadata needed for memory accounting."""
+    architecture: str | None
+    n_layers: int | None
+    n_embd: int | None
+    n_heads: int | None
+    n_kv_heads: int | None
+    file_type: int | None
+    file_size_bytes: int
+    @property
+    def quant_name(self) -> str | None:
+        """Human-readable quantization label, or None if unknown."""
+        if self.file_type is None:
+            return None
+        return _FILE_TYPE_NAMES.get(self.file_type, f"type{self.file_type}")
+    @property
+    def weights_bytes(self) -> int:
+        """Approximate on-disk weight size — the file size is the best
+        proxy (GGUF is almost entirely tensor data)."""
+        return self.file_size_bytes
+# ---------------------------------------------------------------------------
+# Low-level readers
+# ---------------------------------------------------------------------------
+def _read_exact(fh: BinaryIO, n: int) -> bytes:
+    data = fh.read(n)
+    if len(data) != n:
+        raise GGUFParseError(f"unexpected EOF (wanted {n} bytes, got {len(data)})")
+    return data
+def _read_scalar(fh: BinaryIO, type_tag: int) -> object:
+    fmt_size = _SCALAR.get(type_tag)
+    if fmt_size is None:
+        raise GGUFParseError(f"unknown scalar type tag {type_tag}")
+    fmt, size = fmt_size
+    return struct.unpack(fmt, _read_exact(fh, size))[0]
+def _read_u32(fh: BinaryIO) -> int:
+    return struct.unpack("<I", _read_exact(fh, 4))[0]
+def _read_u64(fh: BinaryIO) -> int:
+    return struct.unpack("<Q", _read_exact(fh, 8))[0]
+def _read_gguf_string(fh: BinaryIO) -> str:
+    length = _read_u64(fh)
+    if length > _MAX_STR_BYTES:
+        raise GGUFParseError(f"string length {length} exceeds cap")
+    return _read_exact(fh, length).decode("utf-8", errors="replace")
+def _skip_value(fh: BinaryIO, type_tag: int) -> None:
+    """Consume a metadata value of ``type_tag`` without retaining it."""
+    if type_tag == _GGUF_TYPE_STRING:
+        _read_gguf_string(fh)
+        return
+    if type_tag == _GGUF_TYPE_ARRAY:
+        elem_type = _read_u32(fh)
+        count = _read_u64(fh)
+        if count > _MAX_ARRAY_LEN:
+            raise GGUFParseError(f"array length {count} exceeds cap")
+        for _ in range(count):
+            _skip_value(fh, elem_type)
+        return
+    fmt_size = _SCALAR.get(type_tag)
+    if fmt_size is None:
+        raise GGUFParseError(f"unknown value type tag {type_tag}")
+    fh.seek(fmt_size[1], 1)  # skip scalar bytes
+def _read_scalar_value(fh: BinaryIO, type_tag: int) -> object:
+    """Read (and return) a value, skipping arrays/strings we don't need."""
+    if type_tag == _GGUF_TYPE_STRING:
+        return _read_gguf_string(fh)
+    if type_tag == _GGUF_TYPE_ARRAY:
+        _skip_value(fh, type_tag)
+        return None
+    return _read_scalar(fh, type_tag)
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+# Suffixes of the arch-prefixed keys we capture (e.g. "llama.block_count").
+# Keys are matched with ``str.endswith``, so any prefix is accepted and, if
+# several keys share a suffix, the last one in the file wins.
+_KEY_BLOCK_COUNT = ".block_count"
+_KEY_EMBED_LEN = ".embedding_length"
+_KEY_HEAD_COUNT = ".attention.head_count"
+_KEY_HEAD_COUNT_KV = ".attention.head_count_kv"
+def read_gguf_metadata(path: str | Path) -> GGUFInfo:
+    """Parse the GGUF header at ``path`` and return a :class:`GGUFInfo`.
+    Raises :class:`GGUFParseError` if the file is missing, too short,
+    or not a GGUF container. Captures only the keys needed for memory
+    accounting; everything else is skipped.
+    """
+    p = Path(path)
+    try:
+        file_size = p.stat().st_size
+    except OSError as exc:  # missing / unreadable
+        raise GGUFParseError(f"cannot stat {path}: {exc}") from exc
+    arch: str | None = None
+    n_layers: int | None = None
+    n_embd: int | None = None
+    n_heads: int | None = None
+    n_kv_heads: int | None = None
+    file_type: int | None = None
+    with p.open("rb") as fh:
+        magic = fh.read(4)
+        if magic != _GGUF_MAGIC:
+            raise GGUFParseError(f"bad magic {magic!r} (not a GGUF file)")
+        version = _read_u32(fh)
+        if version not in (2, 3):
+            raise GGUFParseError(f"unsupported GGUF version {version}")
+        _read_u64(fh)  # tensor_count: advance cursor, not needed
+        kv_count = _read_u64(fh)
+        if kv_count > _MAX_KV_PAIRS:
+            raise GGUFParseError(f"kv_count {kv_count} exceeds cap")
+        for _ in range(kv_count):
+            key = _read_gguf_string(fh)
+            value_type = _read_u32(fh)
+            value = _read_scalar_value(fh, value_type)
+            if key == "general.architecture" and isinstance(value, str):
+                arch = value
+            elif key == "general.file_type" and isinstance(value, int):
+                file_type = value
+            elif key.endswith(_KEY_BLOCK_COUNT) and isinstance(value, int):
+                n_layers = value
+            elif key.endswith(_KEY_EMBED_LEN) and isinstance(value, int):
+                n_embd = value
+            elif key.endswith(_KEY_HEAD_COUNT_KV) and isinstance(value, int):
+                n_kv_heads = value
+            elif key.endswith(_KEY_HEAD_COUNT) and isinstance(value, int):
+                n_heads = value
+    return GGUFInfo(
+        architecture=arch,
+        n_layers=n_layers,
+        n_embd=n_embd,
+        n_heads=n_heads,
+        n_kv_heads=n_kv_heads,
+        file_type=file_type,
+        file_size_bytes=file_size,
+    )
+def try_read_gguf_metadata(path: str | Path) -> GGUFInfo | None:
+    """Like :func:`read_gguf_metadata` but returns None on any parse
+    failure — convenient for best-effort advisory paths."""
+    try:
+        return read_gguf_metadata(path)
+    except GGUFParseError:
+        return None
+# Public API of this module; the underscore-prefixed readers stay private.
+__all__ = [
+    "GGUFInfo",
+    "GGUFParseError",
+    "read_gguf_metadata",
+    "try_read_gguf_metadata",
+]

coderouter_cli-2.5.3/coderouter/guards/memory_budget.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""Proactive memory-budget guard (low-memory track, L1).
+Where :mod:`coderouter.guards.memory_pressure` reacts *after* an OOM,
+this guard prevents it: given the host's available memory (from
+:mod:`coderouter.hardware`) and the model's shape (from
+:mod:`coderouter.gguf_introspect`), it computes the largest context
+window (``num_ctx``) that will actually fit, *before* the request is
+dispatched.
+The engine then (a) caps the backend's ``num_ctx`` to that value and
+(b) trims conversation history to the same budget via
+:func:`coderouter.guards.context_budget.trim_to_budget`.
+Everything here is **pure** (no I/O, no globals) so it is trivially
+testable and free of the 5-deps constraint.
+KV-cache model
+==============
+The dominant runtime cost beyond the weights is the attention KV
+cache, which grows linearly with context length:
+    kv_bytes ≈ 2 (K and V)
+             x n_layers
+             x n_ctx
+             x kv_dim
+             x bytes_per_element
+``kv_dim`` is the per-token key/value width. With grouped-query
+attention (GQA) it is ``n_embd x n_kv_heads / n_heads``; without GQA
+metadata it falls back to ``n_embd`` (conservative — over-counts, so
+we under-promise context, which is the safe direction for OOM).
+``bytes_per_element`` defaults to 2 (fp16 KV cache). The estimate is
+deliberately conservative; the headroom in :mod:`coderouter.hardware`
+absorbs activation/compute buffers not modelled here.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Binary gigabyte (2**30 bytes); converts ``available_budget_gb`` to bytes.
+_BYTES_PER_GB: int = 1024**3
+#: Default bytes per KV-cache element (fp16).
+DEFAULT_KV_BYTES_PER_ELEM: int = 2
+#: Fraction of the post-weights budget held back for activations and
+#: the compute buffer (not modelled explicitly). The remainder is what
+#: the KV cache may consume.
+DEFAULT_COMPUTE_OVERHEAD_RATIO: float = 0.10
+#: Fallback layer/embedding shape when GGUF metadata is incomplete.
+#: Chosen to over-estimate KV (safe: under-promises context).
+_FALLBACK_N_LAYERS: int = 32
+_FALLBACK_N_EMBD: int = 4096
+# Closed set of outcomes carried by ``FitDecision.action``.
+FitAction = Literal["ok", "shrink", "insufficient", "unknown"]
+# ---------------------------------------------------------------------------
+# Result type
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True, slots=True)
+class FitDecision:
+    """Outcome of a pre-dispatch memory-fit computation.
+    ``action``:
+      * ``"ok"``           — requested context fits as-is.
+      * ``"shrink"``       — fits only at ``effective_num_ctx`` < requested.
+      * ``"insufficient"`` — won't fit even at ``min_num_ctx``; the model
+                             is too big for this host (caller should warn
+                             / fall through to another provider).
+      * ``"unknown"``      — hardware undetected; guard is a no-op.
+    """
+    # One of the four outcomes described above.
+    action: FitAction
+    # False only for "insufficient"; "unknown" reports True so it never blocks.
+    fits: bool
+    # Context length the caller asked for, echoed back unchanged.
+    requested_num_ctx: int
+    # Context length to use: the request for "ok"/"unknown", the reduced
+    # value for "shrink", and ``min_num_ctx`` for "insufficient".
+    effective_num_ctx: int
+    # Weight size passed in by the caller, echoed back unchanged.
+    weights_bytes: int
+    # Estimated KV-cache size at ``effective_num_ctx``; 0 for "unknown".
+    kv_cache_bytes: int
+    # Memory budget in bytes; 0 for "unknown".
+    available_bytes: int
+    # Short human-readable explanation of the decision.
+    reason: str
+# ---------------------------------------------------------------------------
+# KV-cache math (pure)
+# ---------------------------------------------------------------------------
+def kv_dim(
+    n_embd: int | None,
+    n_heads: int | None,
+    n_kv_heads: int | None,
+) -> int:
+    """Per-token KV width in elements.
+    Applies the GQA reduction when both head counts are known and
+    valid; otherwise returns ``n_embd`` (over-counts → safe).
+    """
+    embd = n_embd if (n_embd and n_embd > 0) else _FALLBACK_N_EMBD
+    if (
+        n_heads
+        and n_kv_heads
+        and n_heads > 0
+        and 0 < n_kv_heads <= n_heads
+    ):
+        return max(1, int(embd * n_kv_heads / n_heads))
+    return embd
+def kv_cache_bytes(
+    n_ctx: int,
+    n_layers: int,
+    kv_width: int,
+    *,
+    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
+) -> int:
+    """KV-cache size in bytes for a given context length (K and V)."""
+    return 2 * max(0, n_layers) * max(0, n_ctx) * max(0, kv_width) * bytes_per_elem
+def max_num_ctx_for_budget(
+    kv_budget_bytes: int,
+    n_layers: int,
+    kv_width: int,
+    *,
+    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
+) -> int:
+    """Largest ``n_ctx`` whose KV cache fits in ``kv_budget_bytes``."""
+    per_token = 2 * max(1, n_layers) * max(1, kv_width) * bytes_per_elem
+    if per_token <= 0 or kv_budget_bytes <= 0:
+        return 0
+    return int(kv_budget_bytes // per_token)
+# ---------------------------------------------------------------------------
+# Fit decision (pure)
+# ---------------------------------------------------------------------------
+def plan_fit(
+    *,
+    available_budget_gb: float,
+    weights_bytes: int,
+    requested_num_ctx: int,
+    n_layers: int | None,
+    n_embd: int | None = None,
+    n_heads: int | None = None,
+    n_kv_heads: int | None = None,
+    min_num_ctx: int = 2048,
+    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
+    compute_overhead_ratio: float = DEFAULT_COMPUTE_OVERHEAD_RATIO,
+) -> FitDecision:
+    """Decide whether ``requested_num_ctx`` fits, and by how much to shrink.
+    ``available_budget_gb`` is the net memory (after OS headroom) from
+    :func:`coderouter.hardware.available_budget_gb`. ``0.0`` means
+    hardware was undetected → returns an ``"unknown"`` no-op decision.
+    """
+    if available_budget_gb <= 0.0:
+        return FitDecision(
+            action="unknown",
+            fits=True,  # don't block when we can't measure
+            requested_num_ctx=requested_num_ctx,
+            effective_num_ctx=requested_num_ctx,
+            weights_bytes=weights_bytes,
+            kv_cache_bytes=0,
+            available_bytes=0,
+            reason="hardware undetected; guard no-op",
+        )
+    available_bytes = int(available_budget_gb * _BYTES_PER_GB)
+    layers = n_layers if (n_layers and n_layers > 0) else _FALLBACK_N_LAYERS
+    width = kv_dim(n_embd, n_heads, n_kv_heads)
+    # Memory left for the KV cache after weights, minus a compute buffer.
+    post_weights = available_bytes - max(0, weights_bytes)
+    kv_budget = int(post_weights * (1.0 - compute_overhead_ratio))
+    # Can we even run the minimum context?
+    min_kv = kv_cache_bytes(min_num_ctx, layers, width, bytes_per_elem=bytes_per_elem)
+    if kv_budget < min_kv:
+        return FitDecision(
+            action="insufficient",
+            fits=False,
+            requested_num_ctx=requested_num_ctx,
+            effective_num_ctx=min_num_ctx,
+            weights_bytes=weights_bytes,
+            kv_cache_bytes=min_kv,
+            available_bytes=available_bytes,
+            reason=(
+                "weights + minimum KV cache exceed available memory; "
+                "model too large for this host"
+            ),
+        )
+    ctx_cap = max_num_ctx_for_budget(
+        kv_budget, layers, width, bytes_per_elem=bytes_per_elem
+    )
+    if ctx_cap >= requested_num_ctx:
+        kv = kv_cache_bytes(
+            requested_num_ctx, layers, width, bytes_per_elem=bytes_per_elem
+        )
+        return FitDecision(
+            action="ok",
+            fits=True,
+            requested_num_ctx=requested_num_ctx,
+            effective_num_ctx=requested_num_ctx,
+            weights_bytes=weights_bytes,
+            kv_cache_bytes=kv,
+            available_bytes=available_bytes,
+            reason="requested context fits",
+        )
+    # Shrink to the cap, but never below the floor.
+    effective = max(min_num_ctx, ctx_cap)
+    kv = kv_cache_bytes(effective, layers, width, bytes_per_elem=bytes_per_elem)
+    return FitDecision(
+        action="shrink",
+        fits=True,
+        requested_num_ctx=requested_num_ctx,
+        effective_num_ctx=effective,
+        weights_bytes=weights_bytes,
+        kv_cache_bytes=kv,
+        available_bytes=available_bytes,
+        reason=f"context shrunk from {requested_num_ctx} to {effective} to fit memory",
+    )
+# Public API of this module; the fallback shape constants stay private.
+__all__ = [
+    "DEFAULT_COMPUTE_OVERHEAD_RATIO",
+    "DEFAULT_KV_BYTES_PER_ELEM",
+    "FitAction",
+    "FitDecision",
+    "kv_cache_bytes",
+    "kv_dim",
+    "max_num_ctx_for_budget",
+    "plan_fit",
+]

coderouter-cli 2.5.2__tar.gz → 2.5.3__tar.gz

coderouter-cli 2.5.2tar.gz → 2.5.3tar.gz