PyPI - coderouter-cli - Versions diffs - 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl - Mend

coderouter-cli 2.5.1py3-none-any.whl → 2.5.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

coderouter/gguf_introspect.py ADDED Viewed

@@ -0,0 +1,304 @@
+"""Minimal, dependency-free GGUF header introspection (low-memory track).
+Why self-written
+================
+To right-size ``num_ctx`` *before* dispatch we need a model's layer
+count and embedding width so the KV-cache footprint can be estimated.
+That data lives in the GGUF metadata header. Rather than add the
+official ``gguf`` package (and its ``numpy`` transitive dep) we read
+only the handful of header fields we need with the standard library —
+preserving the 5-deps invariant.
+The GGUF binary layout we parse (little-endian):
+  magic      : 4 bytes  == b"GGUF"
+  version    : uint32   (2 or 3 supported)
+  tensor_cnt : uint64   (ignored — we never read tensor data)
+  kv_count   : uint64   (number of metadata key/value pairs)
+  kv_pairs   : kv_count repetitions of:
+      key        : gguf-string (uint64 length + UTF-8 bytes)
+      value_type : uint32  (see _GGUF_TYPE_*)
+      value      : type-dependent
+We walk the KV pairs, capturing only the keys we care about, and skip
+the rest (including arbitrarily nested arrays) without materialising
+them.
+Security
+========
+The parser treats the file as **untrusted input**:
+  * Every string length and array element count is clamped against
+    :data:`_MAX_STR_BYTES` / :data:`_MAX_ARRAY_LEN` so a corrupt or
+    hostile header cannot trigger a multi-GB allocation (DoS).
+  * Reads past EOF raise :class:`GGUFParseError`, never an unbounded
+    loop.
+  * No ``mmap``, no tensor payload read, no code execution path — we
+    only seek/read a small prefix.
+"""
+from __future__ import annotations
+import struct
+from dataclasses import dataclass
+from pathlib import Path
+from typing import BinaryIO
+# ---------------------------------------------------------------------------
+# Constants / format
+# ---------------------------------------------------------------------------
+_GGUF_MAGIC = b"GGUF"
+# GGUF metadata value type tags.
+_GGUF_TYPE_UINT8 = 0
+_GGUF_TYPE_INT8 = 1
+_GGUF_TYPE_UINT16 = 2
+_GGUF_TYPE_INT16 = 3
+_GGUF_TYPE_UINT32 = 4
+_GGUF_TYPE_INT32 = 5
+_GGUF_TYPE_FLOAT32 = 6
+_GGUF_TYPE_BOOL = 7
+_GGUF_TYPE_STRING = 8
+_GGUF_TYPE_ARRAY = 9
+_GGUF_TYPE_UINT64 = 10
+_GGUF_TYPE_INT64 = 11
+_GGUF_TYPE_FLOAT64 = 12
+# Fixed-width scalar (struct format, size) by type tag.
+# Every format string carries the "<" prefix (little-endian, standard
+# sizes); the paired integer is the byte count read or skipped for that
+# tag. STRING and ARRAY tags are intentionally absent: they are
+# variable-length and handled by dedicated code paths.
+_SCALAR: dict[int, tuple[str, int]] = {
+    _GGUF_TYPE_UINT8: ("<B", 1),
+    _GGUF_TYPE_INT8: ("<b", 1),
+    _GGUF_TYPE_UINT16: ("<H", 2),
+    _GGUF_TYPE_INT16: ("<h", 2),
+    _GGUF_TYPE_UINT32: ("<I", 4),
+    _GGUF_TYPE_INT32: ("<i", 4),
+    _GGUF_TYPE_FLOAT32: ("<f", 4),
+    _GGUF_TYPE_BOOL: ("<?", 1),
+    _GGUF_TYPE_UINT64: ("<Q", 8),
+    _GGUF_TYPE_INT64: ("<q", 8),
+    _GGUF_TYPE_FLOAT64: ("<d", 8),
+}
+# Defensive clamps against hostile / corrupt headers.
+# A length or count above its ceiling makes the parser raise
+# GGUFParseError rather than allocate or iterate that much.
+_MAX_STR_BYTES: int = 1 << 20  # 1 MiB key/value string ceiling
+_MAX_ARRAY_LEN: int = 1 << 24  # element-count ceiling for arrays
+_MAX_KV_PAIRS: int = 1 << 20  # metadata pair ceiling
+# Human-readable names for the GGUF ``general.file_type`` enum (subset).
+# Values 4-6 have no entry here. Any value missing from this table is
+# rendered as ``"type<N>"`` by ``GGUFInfo.quant_name`` rather than failing.
+_FILE_TYPE_NAMES: dict[int, str] = {
+    0: "F32",
+    1: "F16",
+    2: "Q4_0",
+    3: "Q4_1",
+    7: "Q8_0",
+    8: "Q5_0",
+    9: "Q5_1",
+    10: "Q2_K",
+    11: "Q3_K_S",
+    12: "Q3_K_M",
+    13: "Q3_K_L",
+    14: "Q4_K_S",
+    15: "Q4_K_M",
+    16: "Q5_K_S",
+    17: "Q5_K_M",
+    18: "Q6_K",
+    19: "IQ2_XXS",
+    20: "IQ2_XS",
+    21: "Q2_K_S",
+    22: "IQ3_XS",
+    23: "IQ3_XXS",
+    24: "IQ1_S",
+    25: "IQ4_NL",
+    26: "IQ3_S",
+    27: "IQ3_M",
+    28: "IQ2_S",
+    29: "IQ2_M",
+    30: "IQ4_XS",
+    31: "IQ1_M",
+}
+class GGUFParseError(Exception):
+    """Raised when a file is not a parseable GGUF header.
+    Within this module it signals a failed ``stat``, a wrong magic, an
+    unsupported version, a truncated read, an unknown value type tag, or
+    a string/array/KV count above the defensive caps.
+    """
+@dataclass(frozen=True, slots=True)
+class GGUFInfo:
+    """Header fields used for memory accounting, plus the file size.
+    Every metadata field is ``None`` when the corresponding key was not
+    captured while walking the header.
+    """
+    # Value of the ``general.architecture`` key.
+    architecture: str | None
+    # Value of the key ending in ``.block_count``.
+    n_layers: int | None
+    # Value of the key ending in ``.embedding_length``.
+    n_embd: int | None
+    # Value of the key ending in ``.attention.head_count``.
+    n_heads: int | None
+    # Value of the key ending in ``.attention.head_count_kv``.
+    n_kv_heads: int | None
+    # Raw ``general.file_type`` enum value.
+    file_type: int | None
+    # Size of the file on disk, from ``stat``.
+    file_size_bytes: int
+    @property
+    def quant_name(self) -> str | None:
+        """Return the quantization label for ``file_type``.
+        ``None`` when ``file_type`` is unset; ``"type<N>"`` when the value
+        has no entry in the name table.
+        """
+        code = self.file_type
+        if code is None:
+            return None
+        if code in _FILE_TYPE_NAMES:
+            return _FILE_TYPE_NAMES[code]
+        return f"type{self.file_type}"
+    @property
+    def weights_bytes(self) -> int:
+        """Approximate weight size in bytes.
+        The on-disk file size is used as the proxy, since a GGUF file is
+        almost entirely tensor data.
+        """
+        size_on_disk = self.file_size_bytes
+        return size_on_disk
+# ---------------------------------------------------------------------------
+# Low-level readers
+# ---------------------------------------------------------------------------
+def _read_exact(fh: BinaryIO, n: int) -> bytes:
+    """Read exactly ``n`` bytes from ``fh``.
+    Raises :class:`GGUFParseError` when the stream yields a different
+    number of bytes (i.e. the file ended early).
+    """
+    data = fh.read(n)
+    if len(data) == n:
+        return data
+    raise GGUFParseError(f"unexpected EOF (wanted {n} bytes, got {len(data)})")
+def _read_scalar(fh: BinaryIO, type_tag: int) -> object:
+    """Decode one fixed-width scalar of ``type_tag`` from ``fh``.
+    Raises :class:`GGUFParseError` for a tag with no entry in the scalar
+    table, or when the stream ends before the value is complete.
+    """
+    if type_tag not in _SCALAR:
+        raise GGUFParseError(f"unknown scalar type tag {type_tag}")
+    fmt, size = _SCALAR[type_tag]
+    raw = _read_exact(fh, size)
+    (value,) = struct.unpack(fmt, raw)
+    return value
+def _read_u32(fh: BinaryIO) -> int:
+    """Read a little-endian unsigned 32-bit integer from ``fh``."""
+    raw = _read_exact(fh, 4)
+    return int.from_bytes(raw, "little")
+def _read_u64(fh: BinaryIO) -> int:
+    """Read a little-endian unsigned 64-bit integer from ``fh``."""
+    raw = _read_exact(fh, 8)
+    return int.from_bytes(raw, "little")
+def _read_gguf_string(fh: BinaryIO) -> str:
+    """Read a GGUF string: a uint64 byte length followed by that many bytes.
+    The payload is decoded as UTF-8 with undecodable bytes replaced.
+    Raises :class:`GGUFParseError` when the declared length exceeds
+    ``_MAX_STR_BYTES`` or the stream ends early.
+    """
+    length = _read_u64(fh)
+    if length <= _MAX_STR_BYTES:
+        payload = _read_exact(fh, length)
+        return payload.decode("utf-8", errors="replace")
+    raise GGUFParseError(f"string length {length} exceeds cap")
+# Deepest array-in-array nesting tolerated while skipping a value. Each
+# level costs one Python stack frame, so without a cap a crafted header
+# could surface RecursionError instead of GGUFParseError.
+_MAX_ARRAY_DEPTH: int = 8
+def _skip_bytes(fh: BinaryIO, n: int) -> None:
+    """Advance ``fh`` by ``n`` bytes, raising GGUFParseError if that passes EOF."""
+    if n <= 0:
+        return
+    # Seeking beyond EOF succeeds silently, so land one byte short of the
+    # target and read that final byte to prove the whole span exists.
+    fh.seek(n - 1, 1)
+    _read_exact(fh, 1)
+def _skip_value(fh: BinaryIO, type_tag: int, _depth: int = 0) -> None:
+    """Consume a metadata value of ``type_tag`` without retaining it.
+    Parameters
+    ----------
+    fh
+        Binary stream positioned at the start of the value.
+    type_tag
+        GGUF value type tag of the value to skip.
+    _depth
+        Internal array-nesting counter; callers leave it at the default.
+    Raises
+    ------
+    GGUFParseError
+        On an unknown type tag, an array longer than ``_MAX_ARRAY_LEN``,
+        nesting deeper than ``_MAX_ARRAY_DEPTH``, an over-long string, or
+        a value that extends past the end of the file.
+    """
+    if type_tag == _GGUF_TYPE_STRING:
+        _read_gguf_string(fh)
+        return
+    if type_tag == _GGUF_TYPE_ARRAY:
+        if _depth >= _MAX_ARRAY_DEPTH:
+            raise GGUFParseError(
+                f"array nesting exceeds depth {_MAX_ARRAY_DEPTH}"
+            )
+        elem_type = _read_u32(fh)
+        count = _read_u64(fh)
+        if count > _MAX_ARRAY_LEN:
+            raise GGUFParseError(f"array length {count} exceeds cap")
+        if count == 0:
+            # Nothing to consume; the element type is never inspected.
+            return
+        elem = _SCALAR.get(elem_type)
+        if elem is not None:
+            # Fixed-width elements: one bulk skip instead of up to 2**24
+            # per-element seeks.
+            _skip_bytes(fh, count * elem[1])
+            return
+        # Strings, nested arrays, or an unknown tag (which raises below).
+        for _ in range(count):
+            _skip_value(fh, elem_type, _depth + 1)
+        return
+    fmt_size = _SCALAR.get(type_tag)
+    if fmt_size is None:
+        raise GGUFParseError(f"unknown value type tag {type_tag}")
+    _skip_bytes(fh, fmt_size[1])  # skip scalar bytes, verifying they exist
+def _read_scalar_value(fh: BinaryIO, type_tag: int) -> object:
+    """Read one metadata value and return it when it is a scalar or string.
+    Arrays are consumed without being materialised and yield ``None``.
+    """
+    if type_tag == _GGUF_TYPE_ARRAY:
+        _skip_value(fh, type_tag)
+        return None
+    if type_tag == _GGUF_TYPE_STRING:
+        return _read_gguf_string(fh)
+    return _read_scalar(fh, type_tag)
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+# Suffixes of the arch-prefixed keys we capture (e.g. "llama.block_count").
+_KEY_BLOCK_COUNT = ".block_count"
+_KEY_EMBED_LEN = ".embedding_length"
+_KEY_HEAD_COUNT = ".attention.head_count"
+_KEY_HEAD_COUNT_KV = ".attention.head_count_kv"
+def read_gguf_metadata(path: str | Path) -> GGUFInfo:
+    """Parse the GGUF header at ``path`` and return a :class:`GGUFInfo`.
+    Captures only the keys needed for memory accounting; every other
+    key/value pair is skipped. When a captured key appears more than
+    once, the last occurrence wins.
+    Raises
+    ------
+    GGUFParseError
+        If the file is missing, cannot be opened or read, is too short,
+        is not a GGUF container, uses an unsupported version, declares
+        more than ``_MAX_KV_PAIRS`` pairs, or nests metadata too deeply.
+    """
+    p = Path(path)
+    try:
+        file_size = p.stat().st_size
+    except OSError as exc:  # missing / unreadable
+        raise GGUFParseError(f"cannot stat {path}: {exc}") from exc
+    arch: str | None = None
+    n_layers: int | None = None
+    n_embd: int | None = None
+    n_heads: int | None = None
+    n_kv_heads: int | None = None
+    file_type: int | None = None
+    try:
+        with p.open("rb") as fh:
+            magic = fh.read(4)
+            if magic != _GGUF_MAGIC:
+                raise GGUFParseError(f"bad magic {magic!r} (not a GGUF file)")
+            version = _read_u32(fh)
+            if version not in (2, 3):
+                raise GGUFParseError(f"unsupported GGUF version {version}")
+            _read_u64(fh)  # tensor_count: advance cursor, not needed
+            kv_count = _read_u64(fh)
+            if kv_count > _MAX_KV_PAIRS:
+                raise GGUFParseError(f"kv_count {kv_count} exceeds cap")
+            for _ in range(kv_count):
+                key = _read_gguf_string(fh)
+                value_type = _read_u32(fh)
+                value = _read_scalar_value(fh, value_type)
+                # bool is a subclass of int; a BOOL-typed value must not
+                # be accepted as a layer/head count or file type.
+                is_int = isinstance(value, int) and not isinstance(value, bool)
+                if key == "general.architecture" and isinstance(value, str):
+                    arch = value
+                elif key == "general.file_type" and is_int:
+                    file_type = value
+                elif key.endswith(_KEY_BLOCK_COUNT) and is_int:
+                    n_layers = value
+                elif key.endswith(_KEY_EMBED_LEN) and is_int:
+                    n_embd = value
+                elif key.endswith(_KEY_HEAD_COUNT_KV) and is_int:
+                    n_kv_heads = value
+                elif key.endswith(_KEY_HEAD_COUNT) and is_int:
+                    n_heads = value
+    except OSError as exc:
+        # stat() can succeed where open()/read() fails (a directory, or
+        # permissions changed in between); keep the documented contract.
+        raise GGUFParseError(f"cannot read {path}: {exc}") from exc
+    except RecursionError as exc:
+        # Pathologically nested arrays must not escape as a crash.
+        raise GGUFParseError(f"metadata nesting too deep in {path}") from exc
+    return GGUFInfo(
+        architecture=arch,
+        n_layers=n_layers,
+        n_embd=n_embd,
+        n_heads=n_heads,
+        n_kv_heads=n_kv_heads,
+        file_type=file_type,
+        file_size_bytes=file_size,
+    )
+def try_read_gguf_metadata(path: str | Path) -> GGUFInfo | None:
+    """Like :func:`read_gguf_metadata` but returns None on a parse failure.
+    Only :class:`GGUFParseError` is swallowed; any other exception type
+    raised by the underlying call still propagates to the caller.
+    Convenient for best-effort advisory paths.
+    """
+    try:
+        return read_gguf_metadata(path)
+    except GGUFParseError:
+        return None
+__all__ = [
+    "GGUFInfo",
+    "GGUFParseError",
+    "read_gguf_metadata",
+    "try_read_gguf_metadata",
+]

coderouter/guards/memory_budget.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""Proactive memory-budget guard (low-memory track, L1).
+Where :mod:`coderouter.guards.memory_pressure` reacts *after* an OOM,
+this guard prevents it: given the host's available memory (from
+:mod:`coderouter.hardware`) and the model's shape (from
+:mod:`coderouter.gguf_introspect`), it computes the largest context
+window (``num_ctx``) that will actually fit, *before* the request is
+dispatched.
+The engine then (a) caps the backend's ``num_ctx`` to that value and
+(b) trims conversation history to the same budget via
+:func:`coderouter.guards.context_budget.trim_to_budget`.
+Everything here is **pure** (no I/O, no globals) so it is trivially
+testable and free of the 5-deps constraint.
+KV-cache model
+==============
+The dominant runtime cost beyond the weights is the attention KV
+cache, which grows linearly with context length:
+    kv_bytes ≈ 2 (K and V)
+             x n_layers
+             x n_ctx
+             x kv_dim
+             x bytes_per_element
+``kv_dim`` is the per-token key/value width. With grouped-query
+attention (GQA) it is ``n_embd x n_kv_heads / n_heads``; without GQA
+metadata it falls back to ``n_embd`` (conservative — over-counts, so
+we under-promise context, which is the safe direction for OOM).
+``bytes_per_element`` defaults to 2 (fp16 KV cache). The estimate is
+deliberately conservative; the headroom in :mod:`coderouter.hardware`
+absorbs activation/compute buffers not modelled here.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+_BYTES_PER_GB: int = 1024**3
+#: Default bytes per KV-cache element (fp16).
+DEFAULT_KV_BYTES_PER_ELEM: int = 2
+#: Fraction of the post-weights budget held back for activations and
+#: the compute buffer (not modelled explicitly). The remainder is what
+#: the KV cache may consume.
+DEFAULT_COMPUTE_OVERHEAD_RATIO: float = 0.10
+#: Fallback layer/embedding shape when GGUF metadata is incomplete.
+#: Chosen to over-estimate KV (safe: under-promises context).
+_FALLBACK_N_LAYERS: int = 32
+_FALLBACK_N_EMBD: int = 4096
+FitAction = Literal["ok", "shrink", "insufficient", "unknown"]
+# ---------------------------------------------------------------------------
+# Result type
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True, slots=True)
+class FitDecision:
+    """Outcome of a pre-dispatch memory-fit computation.
+    ``action``:
+      * ``"ok"``           — requested context fits as-is.
+      * ``"shrink"``       — fits only at ``effective_num_ctx`` < requested.
+      * ``"insufficient"`` — won't fit even at ``min_num_ctx``; the model
+                             is too big for this host (caller should warn
+                             / fall through to another provider).
+      * ``"unknown"``      — hardware undetected; guard is a no-op.
+    """
+    # One of the four outcomes described above.
+    action: FitAction
+    # False only for "insufficient"; True for "ok", "shrink" and "unknown".
+    fits: bool
+    # Context length the caller asked for, echoed back unchanged.
+    requested_num_ctx: int
+    # Context length to use: the request for "ok"/"unknown", the shrunk
+    # value for "shrink", and ``min_num_ctx`` for "insufficient".
+    effective_num_ctx: int
+    # Weight size passed in by the caller, echoed back unchanged.
+    weights_bytes: int
+    # Estimated KV-cache size at ``effective_num_ctx``; 0 for "unknown".
+    kv_cache_bytes: int
+    # Memory budget in bytes; 0 for "unknown".
+    available_bytes: int
+    # Human-readable explanation of the decision.
+    reason: str
+# ---------------------------------------------------------------------------
+# KV-cache math (pure)
+# ---------------------------------------------------------------------------
+def kv_dim(
+    n_embd: int | None,
+    n_heads: int | None,
+    n_kv_heads: int | None,
+) -> int:
+    """Per-token KV width in elements.
+    Applies the GQA reduction ``n_embd * n_kv_heads / n_heads`` when both
+    head counts are known and valid; otherwise returns ``n_embd``
+    (over-counts → safe). A missing or non-positive ``n_embd`` falls back
+    to ``_FALLBACK_N_EMBD``.
+    The reduced width is rounded *up* with exact integer arithmetic, so a
+    non-divisible head ratio never under-counts the KV cache and large
+    values are not subject to float rounding.
+    """
+    embd = n_embd if (n_embd and n_embd > 0) else _FALLBACK_N_EMBD
+    if (
+        n_heads
+        and n_kv_heads
+        and n_heads > 0
+        and 0 < n_kv_heads <= n_heads
+    ):
+        # Ceiling division: -(-a // b) == ceil(a / b) for positive b.
+        return max(1, -((-embd * n_kv_heads) // n_heads))
+    return embd
+def kv_cache_bytes(
+    n_ctx: int,
+    n_layers: int,
+    kv_width: int,
+    *,
+    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
+) -> int:
+    """Return the KV-cache size in bytes for ``n_ctx`` tokens (K and V).
+    Negative ``n_layers``, ``n_ctx`` or ``kv_width`` are treated as zero.
+    """
+    layers = n_layers if n_layers > 0 else 0
+    tokens = n_ctx if n_ctx > 0 else 0
+    width = kv_width if kv_width > 0 else 0
+    # Factor 2: one cache for keys and one for values.
+    return 2 * layers * tokens * width * bytes_per_elem
+def max_num_ctx_for_budget(
+    kv_budget_bytes: int,
+    n_layers: int,
+    kv_width: int,
+    *,
+    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
+) -> int:
+    """Return the largest ``n_ctx`` whose KV cache fits in ``kv_budget_bytes``.
+    ``n_layers`` and ``kv_width`` are floored at 1. Returns 0 when the
+    budget or the per-token cost is not positive.
+    """
+    layers = n_layers if n_layers > 1 else 1
+    width = kv_width if kv_width > 1 else 1
+    per_token = 2 * layers * width * bytes_per_elem
+    if kv_budget_bytes <= 0 or per_token <= 0:
+        return 0
+    tokens = kv_budget_bytes // per_token
+    return int(tokens)
+# ---------------------------------------------------------------------------
+# Fit decision (pure)
+# ---------------------------------------------------------------------------
+def plan_fit(
+    *,
+    available_budget_gb: float,
+    weights_bytes: int,
+    requested_num_ctx: int,
+    n_layers: int | None,
+    n_embd: int | None = None,
+    n_heads: int | None = None,
+    n_kv_heads: int | None = None,
+    min_num_ctx: int = 2048,
+    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
+    compute_overhead_ratio: float = DEFAULT_COMPUTE_OVERHEAD_RATIO,
+) -> FitDecision:
+    """Decide whether ``requested_num_ctx`` fits and, if not, how far to shrink.
+    ``available_budget_gb`` is the net memory after OS headroom; a value
+    of ``0.0`` or less is treated as "hardware undetected" and yields an
+    ``"unknown"`` decision that leaves the request untouched.
+    Otherwise the KV budget is what remains after the weights, reduced by
+    ``compute_overhead_ratio``. If even ``min_num_ctx`` does not fit the
+    result is ``"insufficient"``; if the request fits it is ``"ok"``;
+    else the context is shrunk to the largest fitting value, never below
+    ``min_num_ctx``. A missing or non-positive ``n_layers`` falls back to
+    ``_FALLBACK_N_LAYERS``.
+    """
+    def _decision(
+        action: FitAction,
+        fits: bool,
+        ctx: int,
+        kv: int,
+        avail: int,
+        reason: str,
+    ) -> FitDecision:
+        # The request and weight size are echoed back in every outcome.
+        return FitDecision(
+            action=action,
+            fits=fits,
+            requested_num_ctx=requested_num_ctx,
+            effective_num_ctx=ctx,
+            weights_bytes=weights_bytes,
+            kv_cache_bytes=kv,
+            available_bytes=avail,
+            reason=reason,
+        )
+    if available_budget_gb <= 0.0:
+        # Don't block when we can't measure.
+        return _decision(
+            "unknown",
+            True,
+            requested_num_ctx,
+            0,
+            0,
+            "hardware undetected; guard no-op",
+        )
+    available_bytes = int(available_budget_gb * _BYTES_PER_GB)
+    layers = n_layers if (n_layers and n_layers > 0) else _FALLBACK_N_LAYERS
+    width = kv_dim(n_embd, n_heads, n_kv_heads)
+    def _kv_at(ctx: int) -> int:
+        return kv_cache_bytes(ctx, layers, width, bytes_per_elem=bytes_per_elem)
+    # Memory left for the KV cache after weights, minus a compute buffer.
+    post_weights = available_bytes - max(0, weights_bytes)
+    kv_budget = int(post_weights * (1.0 - compute_overhead_ratio))
+    # Can we even run the minimum context?
+    min_kv = _kv_at(min_num_ctx)
+    if kv_budget < min_kv:
+        return _decision(
+            "insufficient",
+            False,
+            min_num_ctx,
+            min_kv,
+            available_bytes,
+            (
+                "weights + minimum KV cache exceed available memory; "
+                "model too large for this host"
+            ),
+        )
+    ctx_cap = max_num_ctx_for_budget(
+        kv_budget, layers, width, bytes_per_elem=bytes_per_elem
+    )
+    if requested_num_ctx <= ctx_cap:
+        return _decision(
+            "ok",
+            True,
+            requested_num_ctx,
+            _kv_at(requested_num_ctx),
+            available_bytes,
+            "requested context fits",
+        )
+    # Shrink to the cap, but never below the floor.
+    effective = ctx_cap if ctx_cap > min_num_ctx else min_num_ctx
+    return _decision(
+        "shrink",
+        True,
+        effective,
+        _kv_at(effective),
+        available_bytes,
+        f"context shrunk from {requested_num_ctx} to {effective} to fit memory",
+    )
+__all__ = [
+    "DEFAULT_COMPUTE_OVERHEAD_RATIO",
+    "DEFAULT_KV_BYTES_PER_ELEM",
+    "FitAction",
+    "FitDecision",
+    "kv_cache_bytes",
+    "kv_dim",
+    "max_num_ctx_for_budget",
+    "plan_fit",
+]

coderouter/hardware.py ADDED Viewed

@@ -0,0 +1,264 @@
+"""Shared hardware detection + memory accounting (low-memory track, L0).
+Background
+==========
+Low-memory machines (8-16 GB unified / discrete VRAM) can only run
+small GGUF models, and CodeRouter's existing memory handling is purely
+*reactive*: :mod:`coderouter.guards.memory_pressure` only fires *after*
+a backend has already tripped an OOM. To prevent OOM *before* dispatch
+we need to know how much memory the host actually has.
+The detection primitive already existed inside
+``coderouter.ingress.launcher_routes._detect_hardware`` but was only
+wired to the launcher UI. This module promotes it to a shared,
+cached, dependency-free utility so the guard path can consume it too.
+5-deps invariant
+================
+Detection is **best-effort and uses only the standard library**
+(``os.sysconf`` / ``subprocess`` calling ``sysctl`` / ``nvidia-smi``).
+No ``psutil`` / ``pynvml``. Every probe is wrapped so a missing tool or
+permission error degrades gracefully to ``0.0`` rather than raising.
+Caching
+=======
+Detection performs blocking I/O (subprocess). Results are cached in
+process with a short TTL (:data:`_CACHE_TTL_S`) so the hot dispatch
+path pays the cost at most once per minute. ``detect_hardware`` is
+safe to call from async code via ``asyncio.to_thread``.
+"""
+from __future__ import annotations
+import contextlib
+import os
+import platform
+import shutil
+import subprocess  # controlled: fixed argv, no shell
+import threading
+import time
+from dataclasses import dataclass
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+_BYTES_PER_GB: int = 1024**3
+#: Detection cache TTL. Hardware doesn't change mid-session, but we keep
+#: a TTL so a hot-plugged eGPU or driver restart is eventually noticed.
+_CACHE_TTL_S: float = 60.0
+#: Default headroom reserved for the OS and other processes, in GB.
+#: On unified-memory (Metal) systems the OS + UI already consume a few
+#: GB, so a conservative floor avoids starving the desktop.
+DEFAULT_HEADROOM_GB: float = 1.5
+#: Default headroom as a fraction of usable memory. The effective
+#: headroom is ``max(DEFAULT_HEADROOM_GB, usable * DEFAULT_HEADROOM_RATIO)``.
+DEFAULT_HEADROOM_RATIO: float = 0.15
+# ---------------------------------------------------------------------------
+# Result types
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True, slots=True)
+class HardwareInfo:
+    """Best-effort snapshot of the host's compute resources.
+    All memory values are in GiB. ``0.0`` means "could not detect"
+    (caller should treat detection as unavailable, not as "zero RAM").
+    Instances are immutable (frozen dataclass).
+    """
+    #: System RAM in GiB (0.0 if undetectable).
+    ram_gb: float
+    #: GPU VRAM in GiB. For Metal/unified memory this mirrors ``ram_gb``;
+    #: for CPU-only it is 0.0.
+    vram_gb: float
+    #: One of ``"cuda"`` / ``"metal"`` / ``"cpu"``.
+    gpu: str
+    #: Logical CPU count (best-effort, defaults to 4).
+    cpu_count: int
+    @property
+    def detected(self) -> bool:
+        """True iff at least RAM was detected (a usable budget exists).
+        Only ``ram_gb`` is consulted; ``vram_gb`` does not affect the result.
+        """
+        return self.ram_gb > 0.0
+    @property
+    def unified_memory(self) -> bool:
+        """True for Apple-silicon Metal, where VRAM and RAM are shared.
+        Derived purely from ``gpu == "metal"``.
+        """
+        return self.gpu == "metal"
+# ---------------------------------------------------------------------------
+# Detection (cached)
+# ---------------------------------------------------------------------------
+_cache_lock = threading.RLock()
+_cache_value: HardwareInfo | None = None
+_cache_ts: float = 0.0
+def _detect_ram_gb() -> float:
+    """Return system RAM in GiB, trying ``os.sysconf`` then ``sysctl``.
+    Every probe failure is suppressed, so an undetectable host yields
+    ``0.0`` rather than an exception.
+    """
+    ram_gb = 0.0
+    with contextlib.suppress(ValueError, OSError, AttributeError):
+        total_bytes = os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE")
+        ram_gb = total_bytes / _BYTES_PER_GB
+    if ram_gb > 0:
+        return ram_gb
+    # sysconf gave nothing usable: ask sysctl for hw.memsize instead.
+    with contextlib.suppress(ValueError, OSError, subprocess.SubprocessError):
+        proc = subprocess.run(  # fixed argv, no shell
+            ["sysctl", "-n", "hw.memsize"],
+            capture_output=True,
+            text=True,
+            timeout=3,
+            check=False,
+        )
+        reported = proc.stdout.strip()
+        ram_gb = int(reported) / _BYTES_PER_GB
+    return ram_gb
+def _detect_gpu(ram_gb: float) -> tuple[str, float]:
+    """Return ``(gpu_kind, vram_gb)`` for this host.
+    * macOS on arm64 → ``("metal", ram_gb)`` (unified memory).
+    * ``nvidia-smi`` on PATH and reporting a positive total →
+      ``("cuda", largest_total_MiB / 1024)``.
+    * anything else, including a failed probe → ``("cpu", 0.0)``.
+    """
+    cpu_only = ("cpu", 0.0)
+    if platform.system() == "Darwin" and platform.machine() == "arm64":
+        return "metal", ram_gb  # unified memory
+    if not shutil.which("nvidia-smi"):
+        return cpu_only
+    with contextlib.suppress(ValueError, OSError, subprocess.SubprocessError):
+        proc = subprocess.run(  # fixed argv, no shell
+            [
+                "nvidia-smi",
+                "--query-gpu=memory.total",
+                "--format=csv,noheader,nounits",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        )
+        # One total per output token; keep the largest.
+        totals = [
+            int(tok) for tok in proc.stdout.split() if tok.strip().isdigit()
+        ]
+        mb = max(totals, default=0)
+        if mb > 0:
+            return "cuda", mb / 1024
+    return cpu_only
+def _detect_uncached() -> HardwareInfo:
+    """Probe RAM, GPU and CPU count now, bypassing the cache.
+    Memory figures are rounded to one decimal place; the CPU count
+    defaults to 4 when ``os.cpu_count()`` reports nothing.
+    """
+    ram_gb = _detect_ram_gb()
+    gpu, vram_gb = _detect_gpu(ram_gb)
+    cpu = os.cpu_count()
+    if not cpu:
+        cpu = 4
+    return HardwareInfo(
+        ram_gb=round(ram_gb, 1),
+        vram_gb=round(vram_gb, 1),
+        gpu=gpu,
+        cpu_count=cpu,
+    )
+def detect_hardware(*, force_refresh: bool = False) -> HardwareInfo:
+    """Return a :class:`HardwareInfo` snapshot, served from cache when fresh.
+    Blocking (may spawn subprocesses); call via ``asyncio.to_thread``
+    from async code. A cached snapshot is reused while it is younger
+    than :data:`_CACHE_TTL_S` seconds. The probe runs while the cache
+    lock is held.
+    Parameters
+    ----------
+    force_refresh
+        Bypass the cache and re-probe immediately (e.g. after a
+        backend restart).
+    """
+    global _cache_value, _cache_ts
+    now = time.monotonic()
+    with _cache_lock:
+        cached = _cache_value
+        if not force_refresh and cached is not None:
+            age = now - _cache_ts
+            if age < _CACHE_TTL_S:
+                return cached
+        fresh = _detect_uncached()
+        _cache_value = fresh
+        _cache_ts = now
+        return fresh
+def reset_cache() -> None:
+    """Drop the detection cache. Mainly for tests.
+    Clears the cached snapshot and its timestamp under the cache lock, so
+    the next :func:`detect_hardware` call re-probes the host.
+    """
+    global _cache_value, _cache_ts
+    with _cache_lock:
+        _cache_value = None
+        _cache_ts = 0.0
+# ---------------------------------------------------------------------------
+# Memory accounting
+# ---------------------------------------------------------------------------
+def usable_memory_gb(hw: HardwareInfo) -> float:
+    """Return the memory pool, in GiB, that weights + KV cache draw from.
+    CUDA hosts use dedicated VRAM; every other GPU kind uses system RAM.
+    Returns 0.0 when ``hw.detected`` is false so callers can no-op
+    instead of acting on a bogus figure.
+    """
+    if not hw.detected:
+        return 0.0
+    return hw.vram_gb if hw.gpu == "cuda" else hw.ram_gb
+def headroom_gb(
+    usable_gb: float,
+    *,
+    floor_gb: float = DEFAULT_HEADROOM_GB,
+    ratio: float = DEFAULT_HEADROOM_RATIO,
+) -> float:
+    """Return the memory, in GiB, to reserve for the OS / other processes.
+    The reserve is the larger of the fixed ``floor_gb`` and the
+    proportional share ``usable_gb * ratio``: the floor protects tiny
+    machines, the ratio scales the reserve on larger ones.
+    """
+    proportional = usable_gb * ratio
+    return proportional if proportional > floor_gb else floor_gb
+def available_budget_gb(
+    hw: HardwareInfo,
+    *,
+    floor_gb: float = DEFAULT_HEADROOM_GB,
+    ratio: float = DEFAULT_HEADROOM_RATIO,
+) -> float:
+    """Return the net GiB left for weights + KV once headroom is reserved.
+    The result is clamped at zero, and is 0.0 when no usable memory was
+    detected.
+    """
+    usable = usable_memory_gb(hw)
+    if usable <= 0.0:
+        return 0.0
+    reserve = headroom_gb(usable, floor_gb=floor_gb, ratio=ratio)
+    net = usable - reserve
+    return net if net > 0.0 else 0.0
+__all__ = [
+    "DEFAULT_HEADROOM_GB",
+    "DEFAULT_HEADROOM_RATIO",
+    "HardwareInfo",
+    "available_budget_gb",
+    "detect_hardware",
+    "headroom_gb",
+    "reset_cache",
+    "usable_memory_gb",
+]

coderouter/ingress/launcher_routes.py CHANGED Viewed

@@ -253,14 +253,32 @@ def _model_recommendation(size_gb: float, hw: dict[str, Any]) -> dict[str, str]:
     return {"level": "warn", "label": "メモリ厳しい"}
-def _suggest_launch_flags(size_gb: float, hw: dict[str, Any]) -> str:
-    """選択モデル + ハードから -ngl / --ctx-size / --threads を提案する。
+def _suggest_launch_flags(backend: str, size_gb: float,
+                          hw: dict[str, Any]) -> str:
+    """選択モデル + ハード + バックエンドから推奨起動フラグを提案する。
+    バックエンドごとにフラグ体系が違うため分岐する:
+      - llama.cpp : -ngl / --ctx-size / --threads を算出
+      - vllm      : モデル config からの自動導出に任せる (空文字)
+      - mlx       : 統合メモリ前提で起動時フラグ不要 (空文字)
     あくまで目安。他プロセスのメモリ使用や量子化方式までは考慮しない。
     """
-    threads = max(1, int(hw.get("cpu_count", 4)) - 2)
+    if backend == "mlx":
+        # MLX は統合メモリ + Metal 前提。llama.cpp の -ngl に相当する
+        # レイヤーオフロードの概念がなく、mlx_lm.server は起動時の
+        # 性能チューニングフラグを取らない。
+        return ""
+    if backend == "vllm":
+        # vllm の --max-model-len はモデルの実コンテキスト長に依存する。
+        # メモリ量だけのヒューリスティックで値を出すと、モデルの上限を
+        # 超えたときに vllm が起動を拒否する。空にしてエンジンの
+        # 自動導出 (モデル config) に任せるのが安全。
+        return ""
+    # llama.cpp (デフォルト)
     usable = _usable_memory_gb(hw)
     weights = size_gb * 1.15                       # 重み + オーバーヘッド概算
+    threads = max(1, int(hw.get("cpu_count", 4)) - 2)
     if hw.get("gpu") == "cpu":
         ngl = 0
     elif usable >= weights + 1.0:
@@ -620,17 +638,20 @@ async def api_logs(proc_id: str, request: Request, n: int = 100) -> dict[str, An
 @router.get("/api/launcher/suggest")
-async def api_suggest(model_path: str = "") -> dict[str, Any]:
+async def api_suggest(model_path: str = "",
+                      backend: str = "llama.cpp") -> dict[str, Any]:
     """Suggest launch flags for the given model based on detected hardware.
     クライアントの「推奨値」ボタンから呼ばれる。値はあくまで目安。
+    バックエンドごとにフラグ体系が違うため backend も受け取る。
     """
     hw = await asyncio.to_thread(_detect_hardware)
     size_gb = 0.0
     if model_path:
         size_gb = await asyncio.to_thread(_model_size_gb, model_path)
     return {
-        "extra_args": _suggest_launch_flags(size_gb, hw),
+        "extra_args": _suggest_launch_flags(backend, size_gb, hw),
+        "backend": backend,
         "hardware": hw,
         "size_gb": round(size_gb, 2),
     }
@@ -905,14 +926,24 @@ _LAUNCHER_HTML = r"""<!doctype html>
   window.suggestOptions = async () => {
     const model = document.getElementById("f-model").value.trim();
     if (!model) { showLaunchErr("先にモデルを選択してください"); return; }
+    const backend = document.getElementById("f-backend").value;
     try {
       const r = await fetch("/api/launcher/suggest?model_path="
-                            + encodeURIComponent(model));
+                            + encodeURIComponent(model)
+                            + "&backend=" + encodeURIComponent(backend));
       const d = await r.json();
       if (!r.ok) { showLaunchErr(d.detail || "推奨値の取得に失敗"); return; }
       document.getElementById("f-extra").value = d.extra_args;
       showLaunchErr("");
-      statusMsg("推奨値を設定(目安): " + d.extra_args);
+      if (d.extra_args) {
+        statusMsg("推奨値を設定(目安): " + d.extra_args);
+      } else if (backend === "mlx") {
+        statusMsg("MLX は起動時の調整フラグ不要です(統合メモリで自動)");
+      } else if (backend === "vllm") {
+        statusMsg("vllm は起動時フラグ不要です(モデル設定から自動導出)");
+      } else {
+        statusMsg("このバックエンドは推奨フラグの自動設定対象外です");
+      }
     } catch (e) {
       showLaunchErr(e.message);
     }

coderouter/routing/budget.py CHANGED Viewed

@@ -218,7 +218,7 @@ class BudgetTracker:
             totals = state.get("totals", {})
             if isinstance(totals, dict):
                 self._totals = {
-                    k: float(v) for k, v in totals.items() if isinstance(v, (int, float))
+                    k: float(v) for k, v in totals.items() if isinstance(v, int | float)
                 }
                 self._month = current

coderouter/token_estimation_accurate.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""Optional precision token counting (low-memory accuracy track).
+The core estimator in :mod:`coderouter.token_estimation` uses a
+``char/4`` heuristic that under-counts CJK text badly — which is
+exactly the failure mode that makes the memory-budget guard either
+OOM (under-count) or over-trim (over-count). This module offers an
+opt-in precise backend without breaking the 5-deps invariant.
+Design
+======
+* **Optional dependency.** ``tokenizers`` (HuggingFace, Rust core) is
+  declared under the ``accuracy`` extra, *not* a core dependency. It is
+  imported lazily; if absent, every function falls back to the char/4
+  heuristic. Callers always get an ``int``.
+* **Local files only — no network.** We load tokenizers exclusively via
+  ``Tokenizer.from_file(<local tokenizer.json>)``. We never call
+  ``from_pretrained`` or anything that contacts the HuggingFace Hub, so
+  this module performs **zero network I/O** and cannot be steered into
+  downloading arbitrary content.
+* **No pickle / no torch.** ``tokenizers`` reads JSON only; we never
+  import ``torch`` or ``transformers`` (avoids the pickle-deserialization
+  RCE surface).
+A loaded tokenizer is cached per resolved path so repeated requests
+don't re-parse ``tokenizer.json``.
+"""
+from __future__ import annotations
+import threading
+from pathlib import Path
+from typing import Any
+from coderouter.token_estimation import CHARS_PER_TOKEN_HEURISTIC
+# ---------------------------------------------------------------------------
+# Lazy backend detection
+# ---------------------------------------------------------------------------
+# Guards every write to ``_tokenizer_cache`` and ``_accuracy_available``.
+_backend_lock = threading.RLock()
+# Cache key -> loaded tokenizer. The key is the resolved path string when the
+# path exists, otherwise the path exactly as given. A value of ``None`` records
+# a path that was not a file or whose load failed.
+_tokenizer_cache: dict[str, Any] = {}
+# Tri-state probe result: ``None`` = not probed yet, ``True``/``False`` =
+# outcome of the one-off ``import tokenizers`` attempt.
+_accuracy_available: bool | None = None
+def is_accuracy_available() -> bool:
+    """Report whether the optional ``tokenizers`` package can be imported.
+    The import probe runs at most once; its outcome is remembered in
+    ``_accuracy_available`` until :func:`reset_cache` clears it. Any
+    exception raised by the import counts as "not available", so the
+    probe itself never propagates an error to the caller.
+    """
+    global _accuracy_available
+    remembered = _accuracy_available
+    if remembered is not None:
+        # Fast path: already probed, no lock needed.
+        return remembered
+    with _backend_lock:
+        # Re-check under the lock: another thread may have completed the
+        # probe while this one was waiting.
+        if _accuracy_available is not None:
+            return _accuracy_available
+        try:
+            import tokenizers  # noqa: F401  (probe only)
+        except Exception:  # pragma: no cover - import failure path
+            _accuracy_available = False
+        else:
+            _accuracy_available = True
+        return _accuracy_available
+def _load_tokenizer(tokenizer_path: str | Path) -> Any | None:
+    """Return the tokenizer stored at a **local** ``tokenizer.json``, cached.
+    The result is ``None`` when the optional backend is not importable,
+    when ``tokenizer_path`` is not a regular file, or when loading the
+    file raises. Outcomes (including ``None``) are cached per path, so a
+    failed path is not retried until :func:`reset_cache` is called.
+    Only ``Tokenizer.from_file`` is used — nothing is fetched remotely.
+    """
+    if not is_accuracy_available():
+        return None
+    candidate = Path(tokenizer_path)
+    # Existing paths are keyed by their resolved form; a path that does
+    # not exist is keyed by its literal spelling.
+    if candidate.exists():
+        cache_key = str(candidate.resolve())
+    else:
+        cache_key = str(candidate)
+    with _backend_lock:
+        try:
+            return _tokenizer_cache[cache_key]
+        except KeyError:
+            pass  # first request for this key; load below
+        loaded = None
+        if candidate.is_file():
+            try:
+                from tokenizers import Tokenizer  # local import
+                # local file only, no network
+                loaded = Tokenizer.from_file(str(candidate))
+            except Exception:
+                loaded = None
+        # Store failures too, so a bad path is not re-parsed on every call.
+        _tokenizer_cache[cache_key] = loaded
+        return loaded
+def reset_cache() -> None:
+    """Forget the backend probe result and drop every cached tokenizer.
+    After this call the next :func:`is_accuracy_available` re-runs the
+    import probe and the next load re-reads its file. Mainly for tests.
+    """
+    global _accuracy_available
+    with _backend_lock:
+        _accuracy_available = None
+        _tokenizer_cache.clear()
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def _heuristic(text: str) -> int:
+    """Estimate the token count of ``text`` from its length alone.
+    The character count is floor-divided by ``CHARS_PER_TOKEN_HEURISTIC``,
+    so (assuming a positive divisor) text shorter than the divisor
+    yields 0.
+    """
+    char_count = len(text)
+    return char_count // CHARS_PER_TOKEN_HEURISTIC
+def count_tokens(text: str, *, tokenizer_path: str | Path | None = None) -> int:
+    """Return the number of tokens in ``text``.
+    Empty text counts as 0. When ``tokenizer_path`` is given and a
+    tokenizer can be loaded from it, the count is the length of the
+    encoded id sequence. In every other case — no path, backend or file
+    unavailable, or the encode call raising — the length-based
+    heuristic is used instead.
+    """
+    if not text:
+        return 0
+    tokenizer = None if tokenizer_path is None else _load_tokenizer(tokenizer_path)
+    if tokenizer is None:
+        return _heuristic(text)
+    try:
+        encoding = tokenizer.encode(text)
+        return len(encoding.ids)
+    except Exception:  # pragma: no cover - encode failure path
+        # A failing precise backend degrades to the estimate, not an error.
+        return _heuristic(text)
+# Names exported by ``from ... import *``; the underscore-prefixed helpers
+# (``_load_tokenizer``, ``_heuristic``) are intentionally left out.
+__all__ = [
+    "count_tokens",
+    "is_accuracy_available",
+    "reset_cache",
+]

{coderouter_cli-2.5.1.dist-info → coderouter_cli-2.5.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: coderouter-cli
-Version: 2.5.1
+Version: 2.5.3
 Summary: Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible.
 Project-URL: Homepage, https://github.com/zephel01/CodeRouter
 Project-URL: Repository, https://github.com/zephel01/CodeRouter
@@ -27,6 +27,8 @@ Requires-Dist: httpx>=0.27.0
 Requires-Dist: pydantic>=2.9.0
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: uvicorn[standard]>=0.32.0
+Provides-Extra: accuracy
+Requires-Dist: tokenizers>=0.20; extra == 'accuracy'
 Provides-Extra: dev
 Requires-Dist: mypy>=1.13.0; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
@@ -37,6 +39,8 @@ Requires-Dist: ruff>=0.7.0; extra == 'dev'
 Requires-Dist: types-pyyaml>=6.0.12; extra == 'dev'
 Provides-Extra: doctor
 Requires-Dist: ruamel-yaml>=0.18.6; extra == 'doctor'
+Provides-Extra: repair
+Requires-Dist: json-repair>=0.30; extra == 'repair'
 Description-Content-Type: text/markdown
 <h1 align="center">CodeRouter</h1>

{coderouter_cli-2.5.1.dist-info → coderouter_cli-2.5.3.dist-info}/RECORD RENAMED Viewed

@@ -7,9 +7,12 @@ coderouter/doctor.py,sha256=2luNk6BHSRvpQStJnHcqzNvNi-SKdOuKV0WZdorZhVk,82854
 coderouter/doctor_apply.py,sha256=r_J6xbu5-HivofPNriw4_vjNYs_VRs7GsGTS0oMEX10,24209
 coderouter/env_security.py,sha256=FEBZnXfJ0xE39kmMMn39zk0W_DRRnmcB_REmP9f4xWo,14796
 coderouter/errors.py,sha256=Xmq67lheyw8iv3Ox39jh2c4tvNI5RcUR4QkoxVDN6l4,1130
+coderouter/gguf_introspect.py,sha256=FZO14STLSp94Rfo5AInGwYUOpfjiXOW6CH5RiczTWDE,9514
+coderouter/hardware.py,sha256=gn3_9qbVcGRR81yKMn1lJE_8-YDRau0LxIH_M-f7pxE,8356
 coderouter/logging.py,sha256=U7QiGRaoQXTSGijc-jV9TebnbbzrD-snfnoZy73Nvwo,52737
 coderouter/output_filters.py,sha256=LOOh68Kcn2LFDy1wPFynA6O_HGazV756q_79Z0_4Jww,19350
 coderouter/token_estimation.py,sha256=1Ai1uT68hahpyr4LBhNyVRGq7y4yXItd6J4k5ApGX7M,5995
+coderouter/token_estimation_accurate.py,sha256=GTfzrBVnvAGjeVzmzAeUdOYZvWZKLAxcxPpFiJGlzjk,4609
 coderouter/adapters/__init__.py,sha256=7dIDSZ-FE_0iSqLSDc_lK1idRdLTKcM2hP9tCJipgPI,463
 coderouter/adapters/anthropic_native.py,sha256=qfdjxy4YyLt-0Fj7hUYn1oi1SFjEEbSvpaRBUC2hMf4,21903
 coderouter/adapters/base.py,sha256=H4uM6r_-95Xs1hCM_X4Zv3tq-xN3cXWLj83F-QjPNLw,8265
@@ -29,6 +32,7 @@ coderouter/guards/context_budget.py,sha256=moWulVr5NtVci13vXxS0ucV4EvX2b7tbA1W1d
 coderouter/guards/continuous_probe.py,sha256=AKNMbJ7hUJG-FDoU160BCbSEQQUyw0hBxFYMTaBZg84,11681
 coderouter/guards/drift_actions.py,sha256=A6pY5CR480Ct5rCVyjlBvjPFVc93eu_r5qcUpK9mWKc,3602
 coderouter/guards/drift_detection.py,sha256=vlepNw_GjvmpHZHTkMS5JM7XKxHaRxzjj_GfopRa1M0,13489
+coderouter/guards/memory_budget.py,sha256=_bRtusk4AwrU781wVXW32OFU7zD2FXxOwTb7yGqPJqA,8278
 coderouter/guards/memory_pressure.py,sha256=mul1KXO9oE1i424cs92Sk6uzoRrV6Seck2Lk3bu-w68,7903
 coderouter/guards/self_healing.py,sha256=_fT_EJvTTp5VSi-qAP93J_1LkgPK5jkzsyrUHdKC45A,13853
 coderouter/guards/tool_loop.py,sha256=EzeMcmU7BLeTW2jsRVevU81l5rhWcn1oUr7EpzgXjVM,15209
@@ -36,7 +40,7 @@ coderouter/ingress/__init__.py,sha256=WQsCH2CGJCAhy0mS6GSEdeYZRkkQu2OHDsP4CJWTLu
 coderouter/ingress/anthropic_routes.py,sha256=It2f7XGe3fgKQX01J2F5JOCoZr96t_Tx_kY2om99MVo,16894
 coderouter/ingress/app.py,sha256=PcuTvUFNjr04EbsUOu8qdyKTdBzxkIJYB4xpz8dFfMo,12635
 coderouter/ingress/dashboard_routes.py,sha256=rscoj89weHTfc8QmYk-fof-7062rhKFHVHRA8cDImDI,21931
-coderouter/ingress/launcher_routes.py,sha256=T3uMmpUaFsc0k5jKyUIUPbSGCUBbzgIUMIPqEpNV3j4,45762
+coderouter/ingress/launcher_routes.py,sha256=Jh-E6qFmHnr7ON4W6QanafxQIoojT4F034mybLvhTyQ,47548
 coderouter/ingress/metrics_routes.py,sha256=M22dwOGn24P05Ge4W3c7d7mYytSGWjIR-pPSPOAiHJY,3965
 coderouter/ingress/openai_routes.py,sha256=Zw1efPw9DI6GgV8ZcLrzS6Cda0KLrFkKn2GBZWSe6Vo,6322
 coderouter/metrics/__init__.py,sha256=7Es351DPS7yLM0yVF_F0eesmiD83n7Zzhie44chht38,1465
@@ -49,7 +53,7 @@ coderouter/plugins/registry.py,sha256=Tx0QHJHozZ5LTUliGylBdNVcdzHTBV0nedCUwGlbLM
 coderouter/routing/__init__.py,sha256=g2vhutbozRx5QBThReqwPN3imk5qXdpDiaogILd3IRc,257
 coderouter/routing/adaptive.py,sha256=G2o377twGSjbUh65wiIFx6klnpFGjsD_nI3oDvcBwhY,21257
 coderouter/routing/auto_router.py,sha256=4_sQR0ztSED9FgQSvQqgqSiydyQVY_qOSRvwyZ5BfRc,12909
-coderouter/routing/budget.py,sha256=A3_i44tmS3SrqVNnoGkLKMsiYwI_Ug6m5-3gitVoQSM,8452
+coderouter/routing/budget.py,sha256=PblmVKJGs_BwNa9uDHAA8hmZ4XIVKv38mHAeU0V3OMs,8451
 coderouter/routing/capability.py,sha256=DCDmiQ-78dkYonCM1WQBCMf6e6XI6VIv_cnuz9hdWT0,18443
 coderouter/routing/fallback.py,sha256=P3f6Yna1EGnLAT-ZS5ADrrZ-qRWc-M5xvwEuan4rmcs,104568
 coderouter/state/__init__.py,sha256=XoGcPmmBQSiZWML2S0juSveQ78xfhtdeCliNnVyzu7E,1088
@@ -62,8 +66,8 @@ coderouter/translation/__init__.py,sha256=PYXN7XVEwpG1uC8RLy6fvnGbzEZhhrEuUapH8I
 coderouter/translation/anthropic.py,sha256=JpvIWNXHUPVqOGvps7o_6ZADhXuJuvpU7RdMqQFtwwM,6421
 coderouter/translation/convert.py,sha256=-qyzFzmmr9hhQV6_Sg75kJnvCZvHe3n7vRdaZtk_JqQ,47269
 coderouter/translation/tool_repair.py,sha256=Ok2PF947Liegc5oaytfptv5MWMkpfJYQie-zdP1y3cY,9946
-coderouter_cli-2.5.1.dist-info/METADATA,sha256=3ltKBldo-TSDI97pvjmhs6esu7OwjZvsBtKD5Ll3F04,11521
-coderouter_cli-2.5.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
-coderouter_cli-2.5.1.dist-info/entry_points.txt,sha256=-dnLfD1YZ2WjH2zSdNCvlO65wYltM9bsHt9Fhg3yGss,51
-coderouter_cli-2.5.1.dist-info/licenses/LICENSE,sha256=wkEzoR86jFw33jvfOHjULqmkGEfxTFMgMaJnpR8mPRw,1065
-coderouter_cli-2.5.1.dist-info/RECORD,,
+coderouter_cli-2.5.3.dist-info/METADATA,sha256=3q3FPL44mGgfySDAi_5gEW1Y_CaZk6i_8wH2RkQKwf0,11674
+coderouter_cli-2.5.3.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+coderouter_cli-2.5.3.dist-info/entry_points.txt,sha256=-dnLfD1YZ2WjH2zSdNCvlO65wYltM9bsHt9Fhg3yGss,51
+coderouter_cli-2.5.3.dist-info/licenses/LICENSE,sha256=wkEzoR86jFw33jvfOHjULqmkGEfxTFMgMaJnpR8mPRw,1065
+coderouter_cli-2.5.3.dist-info/RECORD,,

{coderouter_cli-2.5.1.dist-info → coderouter_cli-2.5.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{coderouter_cli-2.5.1.dist-info → coderouter_cli-2.5.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{coderouter_cli-2.5.1.dist-info → coderouter_cli-2.5.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

coderouter-cli 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl

coderouter-cli 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl