coderouter-cli 2.5.1__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
1
+ """Minimal, dependency-free GGUF header introspection (low-memory track).
2
+
3
+ Why self-written
4
+ ================
5
+
6
+ To right-size ``num_ctx`` *before* dispatch we need a model's layer
7
+ count and embedding width so the KV-cache footprint can be estimated.
8
+ That data lives in the GGUF metadata header. Rather than add the
9
+ official ``gguf`` package (and its ``numpy`` transitive dep) we read
10
+ only the handful of header fields we need with the standard library —
11
+ preserving the 5-deps invariant.
12
+
13
+ The GGUF binary layout we parse (little-endian):
14
+
15
+ magic : 4 bytes == b"GGUF"
16
+ version : uint32 (2 or 3 supported)
17
+ tensor_cnt : uint64 (ignored — we never read tensor data)
18
+ kv_count : uint64 (number of metadata key/value pairs)
19
+ kv_pairs : kv_count repetitions of:
20
+ key : gguf-string (uint64 length + UTF-8 bytes)
21
+ value_type : uint32 (see _GGUF_TYPE_*)
22
+ value : type-dependent
23
+
24
+ We walk the KV pairs, capturing only the keys we care about, and skip
25
+ the rest (including arbitrarily nested arrays) without materialising
26
+ them.
27
+
28
+ Security
29
+ ========
30
+
31
+ The parser treats the file as **untrusted input**:
32
+
33
+ * Every string length and array element count is clamped against
34
+ :data:`_MAX_STR_BYTES` / :data:`_MAX_ARRAY_LEN` so a corrupt or
35
+ hostile header cannot trigger a multi-GB allocation (DoS).
36
+ * Reads past EOF raise :class:`GGUFParseError`, never an unbounded
37
+ loop.
38
+ * No ``mmap``, no tensor payload read, no code execution path — we
39
+ only seek/read a small prefix.
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import struct
45
+ from dataclasses import dataclass
46
+ from pathlib import Path
47
+ from typing import BinaryIO
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Constants / format
51
+ # ---------------------------------------------------------------------------
52
+
53
+ _GGUF_MAGIC = b"GGUF"
54
+
55
+ # GGUF metadata value type tags.
56
+ _GGUF_TYPE_UINT8 = 0
57
+ _GGUF_TYPE_INT8 = 1
58
+ _GGUF_TYPE_UINT16 = 2
59
+ _GGUF_TYPE_INT16 = 3
60
+ _GGUF_TYPE_UINT32 = 4
61
+ _GGUF_TYPE_INT32 = 5
62
+ _GGUF_TYPE_FLOAT32 = 6
63
+ _GGUF_TYPE_BOOL = 7
64
+ _GGUF_TYPE_STRING = 8
65
+ _GGUF_TYPE_ARRAY = 9
66
+ _GGUF_TYPE_UINT64 = 10
67
+ _GGUF_TYPE_INT64 = 11
68
+ _GGUF_TYPE_FLOAT64 = 12
69
+
70
+ # Fixed-width scalar (struct format, size) by type tag.
71
+ _SCALAR: dict[int, tuple[str, int]] = {
72
+ _GGUF_TYPE_UINT8: ("<B", 1),
73
+ _GGUF_TYPE_INT8: ("<b", 1),
74
+ _GGUF_TYPE_UINT16: ("<H", 2),
75
+ _GGUF_TYPE_INT16: ("<h", 2),
76
+ _GGUF_TYPE_UINT32: ("<I", 4),
77
+ _GGUF_TYPE_INT32: ("<i", 4),
78
+ _GGUF_TYPE_FLOAT32: ("<f", 4),
79
+ _GGUF_TYPE_BOOL: ("<?", 1),
80
+ _GGUF_TYPE_UINT64: ("<Q", 8),
81
+ _GGUF_TYPE_INT64: ("<q", 8),
82
+ _GGUF_TYPE_FLOAT64: ("<d", 8),
83
+ }
84
+
85
+ # Defensive clamps against hostile / corrupt headers.
86
+ _MAX_STR_BYTES: int = 1 << 20 # 1 MiB key/value string ceiling
87
+ _MAX_ARRAY_LEN: int = 1 << 24 # element-count ceiling for arrays
88
+ _MAX_KV_PAIRS: int = 1 << 20 # metadata pair ceiling
89
+
90
+ # Human-readable names for the GGUF ``general.file_type`` enum (subset).
91
+ _FILE_TYPE_NAMES: dict[int, str] = {
92
+ 0: "F32",
93
+ 1: "F16",
94
+ 2: "Q4_0",
95
+ 3: "Q4_1",
96
+ 7: "Q8_0",
97
+ 8: "Q5_0",
98
+ 9: "Q5_1",
99
+ 10: "Q2_K",
100
+ 11: "Q3_K_S",
101
+ 12: "Q3_K_M",
102
+ 13: "Q3_K_L",
103
+ 14: "Q4_K_S",
104
+ 15: "Q4_K_M",
105
+ 16: "Q5_K_S",
106
+ 17: "Q5_K_M",
107
+ 18: "Q6_K",
108
+ 19: "IQ2_XXS",
109
+ 20: "IQ2_XS",
110
+ 21: "Q2_K_S",
111
+ 22: "IQ3_XS",
112
+ 23: "IQ3_XXS",
113
+ 24: "IQ1_S",
114
+ 25: "IQ4_NL",
115
+ 26: "IQ3_S",
116
+ 27: "IQ3_M",
117
+ 28: "IQ2_S",
118
+ 29: "IQ2_M",
119
+ 30: "IQ4_XS",
120
+ 31: "IQ1_M",
121
+ }
122
+
123
+
124
class GGUFParseError(Exception):
    """Signal that a file could not be parsed as a GGUF header.

    This is the single exception type the readers in this module raise
    for malformed, truncated, or unsupported input.
    """
126
+
127
+
128
+ @dataclass(frozen=True, slots=True)
129
+ class GGUFInfo:
130
+ """The subset of GGUF metadata needed for memory accounting."""
131
+
132
+ architecture: str | None
133
+ n_layers: int | None
134
+ n_embd: int | None
135
+ n_heads: int | None
136
+ n_kv_heads: int | None
137
+ file_type: int | None
138
+ file_size_bytes: int
139
+
140
+ @property
141
+ def quant_name(self) -> str | None:
142
+ """Human-readable quantization label, or None if unknown."""
143
+ if self.file_type is None:
144
+ return None
145
+ return _FILE_TYPE_NAMES.get(self.file_type, f"type{self.file_type}")
146
+
147
+ @property
148
+ def weights_bytes(self) -> int:
149
+ """Approximate on-disk weight size — the file size is the best
150
+ proxy (GGUF is almost entirely tensor data)."""
151
+ return self.file_size_bytes
152
+
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # Low-level readers
156
+ # ---------------------------------------------------------------------------
157
+
158
+
159
+ def _read_exact(fh: BinaryIO, n: int) -> bytes:
160
+ data = fh.read(n)
161
+ if len(data) != n:
162
+ raise GGUFParseError(f"unexpected EOF (wanted {n} bytes, got {len(data)})")
163
+ return data
164
+
165
+
166
def _read_scalar(fh: BinaryIO, type_tag: int) -> object:
    """Read and decode one fixed-width scalar of the given GGUF type tag.

    Raises :class:`GGUFParseError` if ``type_tag`` is not a scalar tag
    listed in ``_SCALAR``.
    """
    if type_tag not in _SCALAR:
        raise GGUFParseError(f"unknown scalar type tag {type_tag}")
    layout, width = _SCALAR[type_tag]
    raw = _read_exact(fh, width)
    (decoded,) = struct.unpack(layout, raw)
    return decoded
172
+
173
+
174
def _read_u32(fh: BinaryIO) -> int:
    """Read a little-endian unsigned 32-bit integer from ``fh``."""
    raw = _read_exact(fh, 4)
    return int.from_bytes(raw, "little")
176
+
177
+
178
def _read_u64(fh: BinaryIO) -> int:
    """Read a little-endian unsigned 64-bit integer from ``fh``."""
    raw = _read_exact(fh, 8)
    return int.from_bytes(raw, "little")
180
+
181
+
182
def _read_gguf_string(fh: BinaryIO) -> str:
    """Read a GGUF string: a uint64 byte length followed by UTF-8 bytes.

    The declared length is checked against ``_MAX_STR_BYTES`` *before*
    any payload is read, so an oversized length never triggers a large
    allocation. Invalid UTF-8 is replaced rather than rejected.
    """
    n_bytes = _read_u64(fh)
    if n_bytes > _MAX_STR_BYTES:
        raise GGUFParseError(f"string length {n_bytes} exceeds cap")
    payload = _read_exact(fh, n_bytes)
    return payload.decode("utf-8", errors="replace")
187
+
188
+
189
#: Maximum array-in-array nesting accepted while skipping a value. Bounds
#: recursion so a hostile header cannot exhaust the interpreter stack.
_MAX_ARRAY_NESTING: int = 8


def _skip_value(fh: BinaryIO, type_tag: int, _depth: int = 0) -> None:
    """Consume a metadata value of ``type_tag`` without retaining it.

    Parameters
    ----------
    fh
        Binary stream positioned at the start of the value.
    type_tag
        GGUF value type tag of the value to skip.
    _depth
        Internal recursion depth for nested arrays; callers leave it at
        the default.

    Raises
    ------
    GGUFParseError
        On an unknown type tag, an array longer than ``_MAX_ARRAY_LEN``,
        arrays nested deeper than ``_MAX_ARRAY_NESTING``, or a string that
        fails the checks in ``_read_gguf_string``.
    """
    if type_tag == _GGUF_TYPE_STRING:
        _read_gguf_string(fh)
        return
    if type_tag == _GGUF_TYPE_ARRAY:
        # Each nesting level costs only 12 header bytes, so without a cap
        # a small file could drive recursion into RecursionError.
        if _depth >= _MAX_ARRAY_NESTING:
            raise GGUFParseError(
                f"array nesting exceeds {_MAX_ARRAY_NESTING} levels"
            )
        elem_type = _read_u32(fh)
        count = _read_u64(fh)
        if count > _MAX_ARRAY_LEN:
            raise GGUFParseError(f"array length {count} exceeds cap")
        elem_layout = _SCALAR.get(elem_type)
        if elem_layout is not None:
            # Fixed-width elements: one relative seek replaces up to
            # ``count`` single-element seeks and lands at the same offset.
            fh.seek(count * elem_layout[1], 1)
            return
        for _ in range(count):
            _skip_value(fh, elem_type, _depth + 1)
        return
    fmt_size = _SCALAR.get(type_tag)
    if fmt_size is None:
        raise GGUFParseError(f"unknown value type tag {type_tag}")
    # Seeking does not itself detect truncation; the next read does.
    fh.seek(fmt_size[1], 1)  # skip scalar bytes
206
+
207
+
208
def _read_scalar_value(fh: BinaryIO, type_tag: int) -> object:
    """Read one metadata value and return it when it is a scalar or string.

    Arrays are consumed via ``_skip_value`` and reported as ``None``;
    strings and fixed-width scalars are decoded and returned.
    """
    if type_tag == _GGUF_TYPE_ARRAY:
        _skip_value(fh, type_tag)
        return None
    if type_tag == _GGUF_TYPE_STRING:
        return _read_gguf_string(fh)
    return _read_scalar(fh, type_tag)
216
+
217
+
218
+ # ---------------------------------------------------------------------------
219
+ # Public API
220
+ # ---------------------------------------------------------------------------
221
+
222
+ # Suffixes of the arch-prefixed keys we capture (e.g. "llama.block_count").
223
+ _KEY_BLOCK_COUNT = ".block_count"
224
+ _KEY_EMBED_LEN = ".embedding_length"
225
+ _KEY_HEAD_COUNT = ".attention.head_count"
226
+ _KEY_HEAD_COUNT_KV = ".attention.head_count_kv"
227
+
228
+
229
def read_gguf_metadata(path: str | Path) -> GGUFInfo:
    """Parse the GGUF header at ``path`` and return a :class:`GGUFInfo`.

    Only the keys needed for memory accounting are captured; every other
    key/value pair is skipped.

    Parameters
    ----------
    path
        Location of the GGUF file.

    Raises
    ------
    GGUFParseError
        If the file cannot be stat'ed, opened or read, is truncated, is
        not a GGUF container, has an unsupported version, or exceeds one
        of the defensive size caps. I/O errors and runaway recursion are
        converted so callers only ever need to catch this one type.
    """
    p = Path(path)
    try:
        file_size = p.stat().st_size
    except OSError as exc:  # missing / unreadable
        raise GGUFParseError(f"cannot stat {path}: {exc}") from exc

    arch: str | None = None
    n_layers: int | None = None
    n_embd: int | None = None
    n_heads: int | None = None
    n_kv_heads: int | None = None
    file_type: int | None = None

    try:
        with p.open("rb") as fh:
            magic = fh.read(4)
            if magic != _GGUF_MAGIC:
                raise GGUFParseError(f"bad magic {magic!r} (not a GGUF file)")
            version = _read_u32(fh)
            if version not in (2, 3):
                raise GGUFParseError(f"unsupported GGUF version {version}")
            _read_u64(fh)  # tensor_count: advance cursor, not needed
            kv_count = _read_u64(fh)
            if kv_count > _MAX_KV_PAIRS:
                raise GGUFParseError(f"kv_count {kv_count} exceeds cap")

            for _ in range(kv_count):
                key = _read_gguf_string(fh)
                value_type = _read_u32(fh)
                value = _read_scalar_value(fh, value_type)

                # ``bool`` is a subclass of ``int``; a BOOL-typed value must
                # not be mistaken for a count or an enum code.
                is_int = isinstance(value, int) and not isinstance(value, bool)

                if key == "general.architecture" and isinstance(value, str):
                    arch = value
                elif key == "general.file_type" and is_int:
                    file_type = value
                elif key.endswith(_KEY_BLOCK_COUNT) and is_int:
                    n_layers = value
                elif key.endswith(_KEY_EMBED_LEN) and is_int:
                    n_embd = value
                elif key.endswith(_KEY_HEAD_COUNT_KV) and is_int:
                    n_kv_heads = value
                elif key.endswith(_KEY_HEAD_COUNT) and is_int:
                    n_heads = value
    except OSError as exc:  # e.g. path is a directory, permission denied
        raise GGUFParseError(f"cannot read {path}: {exc}") from exc
    except RecursionError as exc:  # pathologically nested arrays
        raise GGUFParseError("metadata arrays nested too deeply") from exc

    return GGUFInfo(
        architecture=arch,
        n_layers=n_layers,
        n_embd=n_embd,
        n_heads=n_heads,
        n_kv_heads=n_kv_heads,
        file_type=file_type,
        file_size_bytes=file_size,
    )
288
+
289
+
290
def try_read_gguf_metadata(path: str | Path) -> GGUFInfo | None:
    """Best-effort variant of :func:`read_gguf_metadata`.

    Returns the parsed :class:`GGUFInfo`, or ``None`` when parsing raises
    :class:`GGUFParseError`. Intended for advisory code paths.
    """
    try:
        info = read_gguf_metadata(path)
    except GGUFParseError:
        return None
    return info
297
+
298
+
299
+ __all__ = [
300
+ "GGUFInfo",
301
+ "GGUFParseError",
302
+ "read_gguf_metadata",
303
+ "try_read_gguf_metadata",
304
+ ]
@@ -0,0 +1,249 @@
1
+ """Proactive memory-budget guard (low-memory track, L1).
2
+
3
+ Where :mod:`coderouter.guards.memory_pressure` reacts *after* an OOM,
4
+ this guard prevents it: given the host's available memory (from
5
+ :mod:`coderouter.hardware`) and the model's shape (from
6
+ :mod:`coderouter.gguf_introspect`), it computes the largest context
7
+ window (``num_ctx``) that will actually fit, *before* the request is
8
+ dispatched.
9
+
10
+ The engine then (a) caps the backend's ``num_ctx`` to that value and
11
+ (b) trims conversation history to the same budget via
12
+ :func:`coderouter.guards.context_budget.trim_to_budget`.
13
+
14
+ Everything here is **pure** (no I/O, no globals) so it is trivially
15
+ testable and free of the 5-deps constraint.
16
+
17
+ KV-cache model
18
+ ==============
19
+
20
+ The dominant runtime cost beyond the weights is the attention KV
21
+ cache, which grows linearly with context length:
22
+
23
+ kv_bytes ≈ 2 (K and V)
24
+ x n_layers
25
+ x n_ctx
26
+ x kv_dim
27
+ x bytes_per_element
28
+
29
+ ``kv_dim`` is the per-token key/value width. With grouped-query
30
+ attention (GQA) it is ``n_embd x n_kv_heads / n_heads``; without GQA
31
+ metadata it falls back to ``n_embd`` (conservative — over-counts, so
32
+ we under-promise context, which is the safe direction for OOM).
33
+
34
+ ``bytes_per_element`` defaults to 2 (fp16 KV cache). The estimate is
35
+ deliberately conservative; the headroom in :mod:`coderouter.hardware`
36
+ absorbs activation/compute buffers not modelled here.
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ from dataclasses import dataclass
42
+ from typing import Literal
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Constants
46
+ # ---------------------------------------------------------------------------
47
+
48
+ _BYTES_PER_GB: int = 1024**3
49
+
50
+ #: Default bytes per KV-cache element (fp16).
51
+ DEFAULT_KV_BYTES_PER_ELEM: int = 2
52
+
53
+ #: Fraction of the post-weights budget held back for activations and
54
+ #: the compute buffer (not modelled explicitly). The remainder is what
55
+ #: the KV cache may consume.
56
+ DEFAULT_COMPUTE_OVERHEAD_RATIO: float = 0.10
57
+
58
+ #: Fallback layer/embedding shape when GGUF metadata is incomplete.
59
+ #: Chosen to over-estimate KV (safe: under-promises context).
60
+ _FALLBACK_N_LAYERS: int = 32
61
+ _FALLBACK_N_EMBD: int = 4096
62
+
63
+ FitAction = Literal["ok", "shrink", "insufficient", "unknown"]
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Result type
68
+ # ---------------------------------------------------------------------------
69
+
70
+
71
@dataclass(frozen=True, slots=True)
class FitDecision:
    """Outcome of a pre-dispatch memory-fit computation.

    ``action``:
      * ``"ok"``           — requested context fits as-is.
      * ``"shrink"``       — fits only at ``effective_num_ctx`` < requested.
      * ``"insufficient"`` — won't fit even at ``min_num_ctx``; the model
                             is too big for this host (caller should warn
                             / fall through to another provider).
      * ``"unknown"``      — hardware undetected; guard is a no-op.
    """

    # Which of the four outcomes above applies.
    action: FitAction
    # Whether the request may proceed (also True for "unknown").
    fits: bool
    # Context length the caller asked for.
    requested_num_ctx: int
    # Context length the caller should actually use.
    effective_num_ctx: int
    # Weight size that was passed into the computation, in bytes.
    weights_bytes: int
    # Estimated KV-cache size at ``effective_num_ctx``, in bytes
    # (0 for the "unknown" outcome).
    kv_cache_bytes: int
    # Memory budget the decision was made against, in bytes
    # (0 for the "unknown" outcome).
    available_bytes: int
    # Human-readable explanation of the decision.
    reason: str
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # KV-cache math (pure)
96
+ # ---------------------------------------------------------------------------
97
+
98
+
99
def kv_dim(
    n_embd: int | None,
    n_heads: int | None,
    n_kv_heads: int | None,
) -> int:
    """Return the per-token KV width, in elements.

    A missing or non-positive ``n_embd`` is replaced by the module
    fallback. The grouped-query reduction ``n_embd * n_kv_heads / n_heads``
    is applied only when both head counts are present and
    ``0 < n_kv_heads <= n_heads``; otherwise the full embedding width is
    returned, which over-counts and is therefore the safe direction.
    """
    width = n_embd if (n_embd and n_embd > 0) else _FALLBACK_N_EMBD

    # Without both head counts there is nothing to reduce by.
    if not n_heads or not n_kv_heads:
        return width

    if n_heads > 0 and 0 < n_kv_heads <= n_heads:
        reduced = int(width * n_kv_heads / n_heads)
        return max(1, reduced)  # never collapse to a zero-width cache
    return width
118
+
119
+
120
def kv_cache_bytes(
    n_ctx: int,
    n_layers: int,
    kv_width: int,
    *,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
) -> int:
    """Return the KV-cache size in bytes for ``n_ctx`` tokens.

    Computes ``2 * n_layers * n_ctx * kv_width * bytes_per_elem`` (the
    factor 2 covers K and V). Negative layer, context, or width values
    are treated as zero.
    """
    layers = max(0, n_layers)
    tokens = max(0, n_ctx)
    width = max(0, kv_width)
    return 2 * layers * tokens * width * bytes_per_elem
129
+
130
+
131
def max_num_ctx_for_budget(
    kv_budget_bytes: int,
    n_layers: int,
    kv_width: int,
    *,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
) -> int:
    """Return the largest ``n_ctx`` whose KV cache fits ``kv_budget_bytes``.

    Layer count and width are floored at 1 so the per-token cost is never
    zero because of them. Returns 0 when the budget or the per-token cost
    is not positive.
    """
    layers = max(1, n_layers)
    width = max(1, kv_width)
    bytes_per_token = 2 * layers * width * bytes_per_elem
    if kv_budget_bytes <= 0 or bytes_per_token <= 0:
        return 0
    return int(kv_budget_bytes // bytes_per_token)
143
+
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Fit decision (pure)
147
+ # ---------------------------------------------------------------------------
148
+
149
+
150
def plan_fit(
    *,
    available_budget_gb: float,
    weights_bytes: int,
    requested_num_ctx: int,
    n_layers: int | None,
    n_embd: int | None = None,
    n_heads: int | None = None,
    n_kv_heads: int | None = None,
    min_num_ctx: int = 2048,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
    compute_overhead_ratio: float = DEFAULT_COMPUTE_OVERHEAD_RATIO,
) -> FitDecision:
    """Decide whether ``requested_num_ctx`` fits, and by how much to shrink.

    Parameters
    ----------
    available_budget_gb
        Net memory budget in GiB. A non-positive (or NaN) value means the
        hardware was not detected and yields an ``"unknown"`` no-op
        decision.
    weights_bytes
        Size of the model weights in bytes; negative values count as 0.
    requested_num_ctx
        Context length the caller would like to use.
    n_layers, n_embd, n_heads, n_kv_heads
        Model shape; missing or non-positive values fall back to the
        module's conservative defaults.
    min_num_ctx
        Smallest context worth running. When the request itself is smaller
        than this, the request is used as the floor, so a small request
        that fits is never reported as ``"insufficient"``.
    bytes_per_elem
        Bytes per KV-cache element.
    compute_overhead_ratio
        Fraction of the post-weights memory withheld from the KV cache.

    Returns
    -------
    FitDecision
        ``"unknown"``, ``"insufficient"``, ``"ok"`` or ``"shrink"``.
    """
    # ``not (x > 0)`` also routes NaN to the no-op branch instead of
    # letting ``int(nan)`` raise further down.
    if not (available_budget_gb > 0.0):
        return FitDecision(
            action="unknown",
            fits=True,  # don't block when we can't measure
            requested_num_ctx=requested_num_ctx,
            effective_num_ctx=requested_num_ctx,
            weights_bytes=weights_bytes,
            kv_cache_bytes=0,
            available_bytes=0,
            reason="hardware undetected; guard no-op",
        )

    available_bytes = int(available_budget_gb * _BYTES_PER_GB)
    layers = n_layers if (n_layers and n_layers > 0) else _FALLBACK_N_LAYERS
    width = kv_dim(n_embd, n_heads, n_kv_heads)

    # Memory left for the KV cache after weights, minus a compute buffer.
    post_weights = available_bytes - max(0, weights_bytes)
    kv_budget = int(post_weights * (1.0 - compute_overhead_ratio))

    # The floor is the smaller of the configured minimum and the request:
    # judging a 1k-token request against a 2k-token floor would reject a
    # request that actually fits.
    floor_ctx = min_num_ctx
    if 0 < requested_num_ctx < min_num_ctx:
        floor_ctx = requested_num_ctx

    # Can we even run the floor context?
    min_kv = kv_cache_bytes(floor_ctx, layers, width, bytes_per_elem=bytes_per_elem)
    if kv_budget < min_kv:
        return FitDecision(
            action="insufficient",
            fits=False,
            requested_num_ctx=requested_num_ctx,
            effective_num_ctx=floor_ctx,
            weights_bytes=weights_bytes,
            kv_cache_bytes=min_kv,
            available_bytes=available_bytes,
            reason=(
                "weights + minimum KV cache exceed available memory; "
                "model too large for this host"
            ),
        )

    ctx_cap = max_num_ctx_for_budget(
        kv_budget, layers, width, bytes_per_elem=bytes_per_elem
    )

    if ctx_cap >= requested_num_ctx:
        kv = kv_cache_bytes(
            requested_num_ctx, layers, width, bytes_per_elem=bytes_per_elem
        )
        return FitDecision(
            action="ok",
            fits=True,
            requested_num_ctx=requested_num_ctx,
            effective_num_ctx=requested_num_ctx,
            weights_bytes=weights_bytes,
            kv_cache_bytes=kv,
            available_bytes=available_bytes,
            reason="requested context fits",
        )

    # Shrink to the cap, but never below the floor.
    effective = max(floor_ctx, ctx_cap)
    kv = kv_cache_bytes(effective, layers, width, bytes_per_elem=bytes_per_elem)
    return FitDecision(
        action="shrink",
        fits=True,
        requested_num_ctx=requested_num_ctx,
        effective_num_ctx=effective,
        weights_bytes=weights_bytes,
        kv_cache_bytes=kv,
        available_bytes=available_bytes,
        reason=f"context shrunk from {requested_num_ctx} to {effective} to fit memory",
    )
238
+
239
+
240
+ __all__ = [
241
+ "DEFAULT_COMPUTE_OVERHEAD_RATIO",
242
+ "DEFAULT_KV_BYTES_PER_ELEM",
243
+ "FitAction",
244
+ "FitDecision",
245
+ "kv_cache_bytes",
246
+ "kv_dim",
247
+ "max_num_ctx_for_budget",
248
+ "plan_fit",
249
+ ]
coderouter/hardware.py ADDED
@@ -0,0 +1,264 @@
1
+ """Shared hardware detection + memory accounting (low-memory track, L0).
2
+
3
+ Background
4
+ ==========
5
+
6
+ Low-memory machines (8-16 GB unified / discrete VRAM) can only run
7
+ small GGUF models, and CodeRouter's existing memory handling is purely
8
+ *reactive*: :mod:`coderouter.guards.memory_pressure` only fires *after*
9
+ a backend has already tripped an OOM. To prevent OOM *before* dispatch
10
+ we need to know how much memory the host actually has.
11
+
12
+ The detection primitive already existed inside
13
+ ``coderouter.ingress.launcher_routes._detect_hardware`` but was only
14
+ wired to the launcher UI. This module promotes it to a shared,
15
+ cached, dependency-free utility so the guard path can consume it too.
16
+
17
+ 5-deps invariant
18
+ ================
19
+
20
+ Detection is **best-effort and uses only the standard library**
21
+ (``os.sysconf`` / ``subprocess`` calling ``sysctl`` / ``nvidia-smi``).
22
+ No ``psutil`` / ``pynvml``. Every probe is wrapped so a missing tool or
23
+ permission error degrades gracefully to ``0.0`` rather than raising.
24
+
25
+ Caching
26
+ =======
27
+
28
+ Detection performs blocking I/O (subprocess). Results are cached in
29
+ process with a short TTL (:data:`_CACHE_TTL_S`) so the hot dispatch
30
+ path pays the cost at most once per minute. ``detect_hardware`` is
31
+ safe to call from async code via ``asyncio.to_thread``.
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import contextlib
37
+ import os
38
+ import platform
39
+ import shutil
40
+ import subprocess # controlled: fixed argv, no shell
41
+ import threading
42
+ import time
43
+ from dataclasses import dataclass
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Constants
47
+ # ---------------------------------------------------------------------------
48
+
49
+ _BYTES_PER_GB: int = 1024**3
50
+
51
+ #: Detection cache TTL. Hardware doesn't change mid-session, but we keep
52
+ #: a TTL so a hot-plugged eGPU or driver restart is eventually noticed.
53
+ _CACHE_TTL_S: float = 60.0
54
+
55
+ #: Default headroom reserved for the OS and other processes, in GB.
56
+ #: On unified-memory (Metal) systems the OS + UI already consume a few
57
+ #: GB, so a conservative floor avoids starving the desktop.
58
+ DEFAULT_HEADROOM_GB: float = 1.5
59
+
60
+ #: Default headroom as a fraction of usable memory. The effective
61
+ #: headroom is ``max(DEFAULT_HEADROOM_GB, usable * DEFAULT_HEADROOM_RATIO)``.
62
+ DEFAULT_HEADROOM_RATIO: float = 0.15
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Result types
67
+ # ---------------------------------------------------------------------------
68
+
69
+
70
+ @dataclass(frozen=True, slots=True)
71
+ class HardwareInfo:
72
+ """Best-effort snapshot of the host's compute resources.
73
+
74
+ All memory values are in GiB. ``0.0`` means "could not detect"
75
+ (caller should treat detection as unavailable, not as "zero RAM").
76
+ """
77
+
78
+ #: System RAM in GiB (0.0 if undetectable).
79
+ ram_gb: float
80
+ #: GPU VRAM in GiB. For Metal/unified memory this mirrors ``ram_gb``;
81
+ #: for CPU-only it is 0.0.
82
+ vram_gb: float
83
+ #: One of ``"cuda"`` / ``"metal"`` / ``"cpu"``.
84
+ gpu: str
85
+ #: Logical CPU count (best-effort, defaults to 4).
86
+ cpu_count: int
87
+
88
+ @property
89
+ def detected(self) -> bool:
90
+ """True iff at least RAM was detected (a usable budget exists)."""
91
+ return self.ram_gb > 0.0
92
+
93
+ @property
94
+ def unified_memory(self) -> bool:
95
+ """True for Apple-silicon Metal, where VRAM and RAM are shared."""
96
+ return self.gpu == "metal"
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Detection (cached)
101
+ # ---------------------------------------------------------------------------
102
+
103
+ _cache_lock = threading.RLock()
104
+ _cache_value: HardwareInfo | None = None
105
+ _cache_ts: float = 0.0
106
+
107
+
108
def _detect_ram_gb() -> float:
    """Return total system RAM in GiB, or a non-positive value on failure.

    First multiplies the ``SC_PHYS_PAGES`` and ``SC_PAGE_SIZE`` values
    from ``os.sysconf``. If that yields nothing positive, falls back to
    running ``sysctl -n hw.memsize``. Errors from either probe are
    swallowed so detection degrades instead of raising.
    """
    total_gb = 0.0
    try:
        page_count = os.sysconf("SC_PHYS_PAGES")
        page_size = os.sysconf("SC_PAGE_SIZE")
        total_gb = (page_count * page_size) / _BYTES_PER_GB
    except (ValueError, OSError, AttributeError):
        pass  # sysconf missing or name unsupported on this platform

    if total_gb > 0:
        return total_gb

    try:
        proc = subprocess.run(  # fixed argv, no shell
            ["sysctl", "-n", "hw.memsize"],
            capture_output=True,
            text=True,
            timeout=3,
            check=False,
        )
        total_gb = int(proc.stdout.strip()) / _BYTES_PER_GB
    except (ValueError, OSError, subprocess.SubprocessError):
        pass  # tool absent, timed out, or printed something non-numeric
    return total_gb
126
+
127
+
128
def _detect_gpu(ram_gb: float) -> tuple[str, float]:
    """Return ``(gpu_kind, vram_gb)`` for this host.

    * macOS on arm64 → ``("metal", ram_gb)`` because memory is unified.
    * ``nvidia-smi`` on PATH and reporting a positive total →
      ``("cuda", <largest reported total in GiB>)``.
    * anything else, including a failed probe → ``("cpu", 0.0)``.
    """
    on_apple_silicon = (
        platform.system() == "Darwin" and platform.machine() == "arm64"
    )
    if on_apple_silicon:
        return "metal", ram_gb  # unified memory

    if not shutil.which("nvidia-smi"):
        return "cpu", 0.0

    try:
        proc = subprocess.run(  # fixed argv, no shell
            [
                "nvidia-smi",
                "--query-gpu=memory.total",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        # Keep the largest numeric token from the tool's output (MiB).
        totals_mb = [
            int(token) for token in proc.stdout.split() if token.strip().isdigit()
        ]
        largest_mb = max(totals_mb, default=0)
        if largest_mb > 0:
            return "cuda", largest_mb / 1024
    except (ValueError, OSError, subprocess.SubprocessError):
        pass  # probe failed; fall through to the CPU answer
    return "cpu", 0.0
156
+
157
+
158
def _detect_uncached() -> HardwareInfo:
    """Probe RAM, GPU and CPU count now and build a fresh snapshot.

    Memory figures are rounded to one decimal; the CPU count defaults to
    4 when ``os.cpu_count()`` returns nothing.
    """
    detected_ram = _detect_ram_gb()
    gpu_kind, detected_vram = _detect_gpu(detected_ram)
    logical_cpus = os.cpu_count() or 4
    return HardwareInfo(
        ram_gb=round(detected_ram, 1),
        vram_gb=round(detected_vram, 1),
        gpu=gpu_kind,
        cpu_count=logical_cpus,
    )
169
+
170
+
171
def detect_hardware(*, force_refresh: bool = False) -> HardwareInfo:
    """Return a :class:`HardwareInfo` snapshot, served from cache when fresh.

    The probe is blocking (it may spawn subprocesses), so async callers
    should go through ``asyncio.to_thread``. A cached snapshot is reused
    while it is younger than :data:`_CACHE_TTL_S` seconds.

    Parameters
    ----------
    force_refresh
        When true, ignore any cached snapshot and probe again (e.g. after
        a backend restart).
    """
    global _cache_value, _cache_ts
    started = time.monotonic()
    with _cache_lock:
        cached = _cache_value
        is_fresh = cached is not None and (started - _cache_ts) < _CACHE_TTL_S
        if is_fresh and not force_refresh:
            return cached
        # Probe while holding the lock; the snapshot is stamped with the
        # time taken before the lock was acquired.
        snapshot = _detect_uncached()
        _cache_value = snapshot
        _cache_ts = started
        return snapshot
196
+
197
+
198
def reset_cache() -> None:
    """Forget the cached detection result (primarily a test helper)."""
    global _cache_value, _cache_ts
    with _cache_lock:
        _cache_value, _cache_ts = None, 0.0
204
+
205
+
206
+ # ---------------------------------------------------------------------------
207
+ # Memory accounting
208
+ # ---------------------------------------------------------------------------
209
+
210
+
211
def usable_memory_gb(hw: HardwareInfo) -> float:
    """Memory available for model weights + KV cache, in GiB.

    CUDA → dedicated VRAM. Metal/CPU → system RAM (unified or host).
    Returns 0.0 when nothing usable was detected (caller should no-op
    rather than make a wrong decision).

    For CUDA the answer depends only on the VRAM figure. ``hw.detected``
    reflects system RAM alone, so gating CUDA on it would discard a known
    VRAM budget on hosts where the RAM probe fails but ``nvidia-smi``
    works.
    """
    if hw.gpu == "cuda":
        return hw.vram_gb if hw.vram_gb > 0.0 else 0.0
    if not hw.detected:
        return 0.0
    return hw.ram_gb
223
+
224
+
225
def headroom_gb(
    usable_gb: float,
    *,
    floor_gb: float = DEFAULT_HEADROOM_GB,
    ratio: float = DEFAULT_HEADROOM_RATIO,
) -> float:
    """Return the memory to hold back for the OS and other processes, in GiB.

    The reserve is the larger of the fixed ``floor_gb`` and the
    proportional ``usable_gb * ratio``: the floor protects small machines,
    the ratio scales the reserve on larger ones.
    """
    proportional = usable_gb * ratio
    return proportional if proportional > floor_gb else floor_gb
237
+
238
+
239
def available_budget_gb(
    hw: HardwareInfo,
    *,
    floor_gb: float = DEFAULT_HEADROOM_GB,
    ratio: float = DEFAULT_HEADROOM_RATIO,
) -> float:
    """Return the net GiB left for weights + KV cache after headroom.

    The result is clamped at 0.0, and is 0.0 whenever
    ``usable_memory_gb(hw)`` reports nothing usable.
    """
    usable = usable_memory_gb(hw)
    if usable <= 0.0:
        return 0.0
    reserve = headroom_gb(usable, floor_gb=floor_gb, ratio=ratio)
    net = usable - reserve
    return net if net > 0.0 else 0.0
253
+
254
+
255
+ __all__ = [
256
+ "DEFAULT_HEADROOM_GB",
257
+ "DEFAULT_HEADROOM_RATIO",
258
+ "HardwareInfo",
259
+ "available_budget_gb",
260
+ "detect_hardware",
261
+ "headroom_gb",
262
+ "reset_cache",
263
+ "usable_memory_gb",
264
+ ]
@@ -253,14 +253,32 @@ def _model_recommendation(size_gb: float, hw: dict[str, Any]) -> dict[str, str]:
253
253
  return {"level": "warn", "label": "メモリ厳しい"}
254
254
 
255
255
 
256
- def _suggest_launch_flags(size_gb: float, hw: dict[str, Any]) -> str:
257
- """選択モデル + ハードから -ngl / --ctx-size / --threads を提案する。
258
-
256
+ def _suggest_launch_flags(backend: str, size_gb: float,
257
+ hw: dict[str, Any]) -> str:
258
+ """選択モデル + ハード + バックエンドから推奨起動フラグを提案する。
259
+
260
+ バックエンドごとにフラグ体系が違うため分岐する:
261
+ - llama.cpp : -ngl / --ctx-size / --threads を算出
262
+ - vllm : モデル config からの自動導出に任せる (空文字)
263
+ - mlx : 統合メモリ前提で起動時フラグ不要 (空文字)
259
264
  あくまで目安。他プロセスのメモリ使用や量子化方式までは考慮しない。
260
265
  """
261
- threads = max(1, int(hw.get("cpu_count", 4)) - 2)
266
+ if backend == "mlx":
267
+ # MLX は統合メモリ + Metal 前提。llama.cpp の -ngl に相当する
268
+ # レイヤーオフロードの概念がなく、mlx_lm.server は起動時の
269
+ # 性能チューニングフラグを取らない。
270
+ return ""
271
+ if backend == "vllm":
272
+ # vllm の --max-model-len はモデルの実コンテキスト長に依存する。
273
+ # メモリ量だけのヒューリスティックで値を出すと、モデルの上限を
274
+ # 超えたときに vllm が起動を拒否する。空にしてエンジンの
275
+ # 自動導出 (モデル config) に任せるのが安全。
276
+ return ""
277
+
278
+ # llama.cpp (デフォルト)
262
279
  usable = _usable_memory_gb(hw)
263
280
  weights = size_gb * 1.15 # 重み + オーバーヘッド概算
281
+ threads = max(1, int(hw.get("cpu_count", 4)) - 2)
264
282
  if hw.get("gpu") == "cpu":
265
283
  ngl = 0
266
284
  elif usable >= weights + 1.0:
@@ -620,17 +638,20 @@ async def api_logs(proc_id: str, request: Request, n: int = 100) -> dict[str, An
620
638
 
621
639
 
622
640
  @router.get("/api/launcher/suggest")
623
- async def api_suggest(model_path: str = "") -> dict[str, Any]:
641
+ async def api_suggest(model_path: str = "",
642
+ backend: str = "llama.cpp") -> dict[str, Any]:
624
643
  """Suggest launch flags for the given model based on detected hardware.
625
644
 
626
645
  クライアントの「推奨値」ボタンから呼ばれる。値はあくまで目安。
646
+ バックエンドごとにフラグ体系が違うため backend も受け取る。
627
647
  """
628
648
  hw = await asyncio.to_thread(_detect_hardware)
629
649
  size_gb = 0.0
630
650
  if model_path:
631
651
  size_gb = await asyncio.to_thread(_model_size_gb, model_path)
632
652
  return {
633
- "extra_args": _suggest_launch_flags(size_gb, hw),
653
+ "extra_args": _suggest_launch_flags(backend, size_gb, hw),
654
+ "backend": backend,
634
655
  "hardware": hw,
635
656
  "size_gb": round(size_gb, 2),
636
657
  }
@@ -905,14 +926,24 @@ _LAUNCHER_HTML = r"""<!doctype html>
905
926
  window.suggestOptions = async () => {
906
927
  const model = document.getElementById("f-model").value.trim();
907
928
  if (!model) { showLaunchErr("先にモデルを選択してください"); return; }
929
+ const backend = document.getElementById("f-backend").value;
908
930
  try {
909
931
  const r = await fetch("/api/launcher/suggest?model_path="
910
- + encodeURIComponent(model));
932
+ + encodeURIComponent(model)
933
+ + "&backend=" + encodeURIComponent(backend));
911
934
  const d = await r.json();
912
935
  if (!r.ok) { showLaunchErr(d.detail || "推奨値の取得に失敗"); return; }
913
936
  document.getElementById("f-extra").value = d.extra_args;
914
937
  showLaunchErr("");
915
- statusMsg("推奨値を設定(目安): " + d.extra_args);
938
+ if (d.extra_args) {
939
+ statusMsg("推奨値を設定(目安): " + d.extra_args);
940
+ } else if (backend === "mlx") {
941
+ statusMsg("MLX は起動時の調整フラグ不要です(統合メモリで自動)");
942
+ } else if (backend === "vllm") {
943
+ statusMsg("vllm は起動時フラグ不要です(モデル設定から自動導出)");
944
+ } else {
945
+ statusMsg("このバックエンドは推奨フラグの自動設定対象外です");
946
+ }
916
947
  } catch (e) {
917
948
  showLaunchErr(e.message);
918
949
  }
@@ -218,7 +218,7 @@ class BudgetTracker:
218
218
  totals = state.get("totals", {})
219
219
  if isinstance(totals, dict):
220
220
  self._totals = {
221
- k: float(v) for k, v in totals.items() if isinstance(v, (int, float))
221
+ k: float(v) for k, v in totals.items() if isinstance(v, int | float)
222
222
  }
223
223
  self._month = current
224
224
 
@@ -0,0 +1,136 @@
1
+ """Optional precision token counting (low-memory accuracy track).
2
+
3
+ The core estimator in :mod:`coderouter.token_estimation` uses a
4
+ ``char/4`` heuristic that under-counts CJK text badly — which is
5
+ exactly the failure mode that makes the memory-budget guard either
6
+ OOM (under-count) or over-trim (over-count). This module offers an
7
+ opt-in precise backend without breaking the 5-deps invariant.
8
+
9
+ Design
10
+ ======
11
+
12
+ * **Optional dependency.** ``tokenizers`` (HuggingFace, Rust core) is
13
+ declared under the ``accuracy`` extra, *not* a core dependency. It is
14
+ imported lazily; if absent, every function falls back to the char/4
15
+ heuristic. Callers always get an ``int``.
16
+
17
+ * **Local files only — no network.** We load tokenizers exclusively via
18
+ ``Tokenizer.from_file(<local tokenizer.json>)``. We never call
19
+ ``from_pretrained`` or anything that contacts the HuggingFace Hub, so
20
+ this module performs **zero network I/O** and cannot be steered into
21
+ downloading arbitrary content.
22
+
23
+ * **No pickle / no torch.** ``tokenizers`` reads JSON only; we never
24
+ import ``torch`` or ``transformers`` (avoids the pickle-deserialization
25
+ RCE surface).
26
+
27
+ A loaded tokenizer is cached per resolved path so repeated requests
28
+ don't re-parse ``tokenizer.json``.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import threading
34
+ from pathlib import Path
35
+ from typing import Any
36
+
37
+ from coderouter.token_estimation import CHARS_PER_TOKEN_HEURISTIC
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Lazy backend detection
41
+ # ---------------------------------------------------------------------------
42
+
43
+ _backend_lock = threading.RLock()  # re-entrant lock taken by the probe, the tokenizer loader and reset_cache()
44
+ _tokenizer_cache: dict[str, Any] = {}  # path string -> loaded tokenizer, or None when the load was attempted and failed
45
+ _accuracy_available: bool | None = None  # memoised import probe result; None means "not probed yet"
46
+
47
+
48
def is_accuracy_available() -> bool:
    """Report whether the optional ``tokenizers`` package imports cleanly.

    The first call performs the import probe while holding
    ``_backend_lock`` and stores the outcome in the module-level
    ``_accuracy_available`` flag; later calls return the stored flag
    without probing again. Any exception raised by the import is
    swallowed and recorded as ``False``, so this function itself does
    not raise on a missing or broken backend.
    """
    global _accuracy_available
    if _accuracy_available is None:
        with _backend_lock:
            # Re-check under the lock: another thread may have finished
            # the probe while we were waiting to acquire it.
            if _accuracy_available is None:
                try:
                    import tokenizers  # noqa: F401 (probe only)
                except Exception:  # pragma: no cover - import failure path
                    _accuracy_available = False
                else:
                    _accuracy_available = True
    return _accuracy_available
66
+
67
+
68
def _load_tokenizer(tokenizer_path: str | Path) -> Any | None:
    """Return the tokenizer stored at a **local** ``tokenizer.json``.

    The result is looked up in ``_tokenizer_cache`` first. The cache key
    is the resolved path when the path exists, otherwise the path string
    as given. On a cache miss the file is loaded with
    ``Tokenizer.from_file`` (a local-file read) and the outcome is
    cached, including a ``None`` outcome.

    Returns ``None`` when:

    * :func:`is_accuracy_available` is false (nothing is cached then),
    * the path is not a regular file, or
    * ``Tokenizer.from_file`` raises for any reason.
    """
    if not is_accuracy_available():
        return None

    path = Path(tokenizer_path)
    cache_key = str(path.resolve()) if path.exists() else str(path)

    with _backend_lock:
        if cache_key not in _tokenizer_cache:
            loaded = None
            if path.is_file():
                try:
                    from tokenizers import Tokenizer  # local import

                    # local file only, no network
                    loaded = Tokenizer.from_file(str(path))
                except Exception:
                    loaded = None
            # Failures are remembered too, so a bad path is not retried
            # until reset_cache() clears the entry.
            _tokenizer_cache[cache_key] = loaded
        return _tokenizer_cache[cache_key]
92
+
93
+
94
def reset_cache() -> None:
    """Forget every cached tokenizer and the backend probe result.

    After this call the next :func:`is_accuracy_available` re-runs the
    import probe and the next tokenizer lookup reloads from disk.
    Intended mainly for tests.
    """
    global _accuracy_available
    with _backend_lock:
        _accuracy_available = None
        _tokenizer_cache.clear()
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Public API
104
+ # ---------------------------------------------------------------------------
105
+
106
+
107
def _heuristic(text: str) -> int:
    """Estimate the token count as ``len(text)`` floor-divided by ``CHARS_PER_TOKEN_HEURISTIC``."""
    char_count = len(text)
    return char_count // CHARS_PER_TOKEN_HEURISTIC
109
+
110
+
111
def count_tokens(text: str, *, tokenizer_path: str | Path | None = None) -> int:
    """Return the number of tokens in ``text``.

    Empty text counts as ``0``. When ``tokenizer_path`` is given and a
    tokenizer can be loaded from it, the count is the length of the
    encoded id sequence. In every other case the character-based
    heuristic is used instead: no path given, tokenizer not loadable,
    or the encode call raising. Backend problems therefore never
    propagate to the caller.
    """
    if not text:
        return 0

    tok = _load_tokenizer(tokenizer_path) if tokenizer_path is not None else None
    if tok is None:
        return _heuristic(text)

    try:
        return len(tok.encode(text).ids)
    except Exception:  # pragma: no cover - encode failure path
        return _heuristic(text)
130
+
131
+
132
+ __all__ = [  # public names of this module; the underscore-prefixed helpers are intentionally not listed
133
+ "count_tokens",
134
+ "is_accuracy_available",
135
+ "reset_cache",
136
+ ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderouter-cli
3
- Version: 2.5.1
3
+ Version: 2.5.3
4
4
  Summary: Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible.
5
5
  Project-URL: Homepage, https://github.com/zephel01/CodeRouter
6
6
  Project-URL: Repository, https://github.com/zephel01/CodeRouter
@@ -27,6 +27,8 @@ Requires-Dist: httpx>=0.27.0
27
27
  Requires-Dist: pydantic>=2.9.0
28
28
  Requires-Dist: pyyaml>=6.0.2
29
29
  Requires-Dist: uvicorn[standard]>=0.32.0
30
+ Provides-Extra: accuracy
31
+ Requires-Dist: tokenizers>=0.20; extra == 'accuracy'
30
32
  Provides-Extra: dev
31
33
  Requires-Dist: mypy>=1.13.0; extra == 'dev'
32
34
  Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
@@ -37,6 +39,8 @@ Requires-Dist: ruff>=0.7.0; extra == 'dev'
37
39
  Requires-Dist: types-pyyaml>=6.0.12; extra == 'dev'
38
40
  Provides-Extra: doctor
39
41
  Requires-Dist: ruamel-yaml>=0.18.6; extra == 'doctor'
42
+ Provides-Extra: repair
43
+ Requires-Dist: json-repair>=0.30; extra == 'repair'
40
44
  Description-Content-Type: text/markdown
41
45
 
42
46
  <h1 align="center">CodeRouter</h1>
@@ -7,9 +7,12 @@ coderouter/doctor.py,sha256=2luNk6BHSRvpQStJnHcqzNvNi-SKdOuKV0WZdorZhVk,82854
7
7
  coderouter/doctor_apply.py,sha256=r_J6xbu5-HivofPNriw4_vjNYs_VRs7GsGTS0oMEX10,24209
8
8
  coderouter/env_security.py,sha256=FEBZnXfJ0xE39kmMMn39zk0W_DRRnmcB_REmP9f4xWo,14796
9
9
  coderouter/errors.py,sha256=Xmq67lheyw8iv3Ox39jh2c4tvNI5RcUR4QkoxVDN6l4,1130
10
+ coderouter/gguf_introspect.py,sha256=FZO14STLSp94Rfo5AInGwYUOpfjiXOW6CH5RiczTWDE,9514
11
+ coderouter/hardware.py,sha256=gn3_9qbVcGRR81yKMn1lJE_8-YDRau0LxIH_M-f7pxE,8356
10
12
  coderouter/logging.py,sha256=U7QiGRaoQXTSGijc-jV9TebnbbzrD-snfnoZy73Nvwo,52737
11
13
  coderouter/output_filters.py,sha256=LOOh68Kcn2LFDy1wPFynA6O_HGazV756q_79Z0_4Jww,19350
12
14
  coderouter/token_estimation.py,sha256=1Ai1uT68hahpyr4LBhNyVRGq7y4yXItd6J4k5ApGX7M,5995
15
+ coderouter/token_estimation_accurate.py,sha256=GTfzrBVnvAGjeVzmzAeUdOYZvWZKLAxcxPpFiJGlzjk,4609
13
16
  coderouter/adapters/__init__.py,sha256=7dIDSZ-FE_0iSqLSDc_lK1idRdLTKcM2hP9tCJipgPI,463
14
17
  coderouter/adapters/anthropic_native.py,sha256=qfdjxy4YyLt-0Fj7hUYn1oi1SFjEEbSvpaRBUC2hMf4,21903
15
18
  coderouter/adapters/base.py,sha256=H4uM6r_-95Xs1hCM_X4Zv3tq-xN3cXWLj83F-QjPNLw,8265
@@ -29,6 +32,7 @@ coderouter/guards/context_budget.py,sha256=moWulVr5NtVci13vXxS0ucV4EvX2b7tbA1W1d
29
32
  coderouter/guards/continuous_probe.py,sha256=AKNMbJ7hUJG-FDoU160BCbSEQQUyw0hBxFYMTaBZg84,11681
30
33
  coderouter/guards/drift_actions.py,sha256=A6pY5CR480Ct5rCVyjlBvjPFVc93eu_r5qcUpK9mWKc,3602
31
34
  coderouter/guards/drift_detection.py,sha256=vlepNw_GjvmpHZHTkMS5JM7XKxHaRxzjj_GfopRa1M0,13489
35
+ coderouter/guards/memory_budget.py,sha256=_bRtusk4AwrU781wVXW32OFU7zD2FXxOwTb7yGqPJqA,8278
32
36
  coderouter/guards/memory_pressure.py,sha256=mul1KXO9oE1i424cs92Sk6uzoRrV6Seck2Lk3bu-w68,7903
33
37
  coderouter/guards/self_healing.py,sha256=_fT_EJvTTp5VSi-qAP93J_1LkgPK5jkzsyrUHdKC45A,13853
34
38
  coderouter/guards/tool_loop.py,sha256=EzeMcmU7BLeTW2jsRVevU81l5rhWcn1oUr7EpzgXjVM,15209
@@ -36,7 +40,7 @@ coderouter/ingress/__init__.py,sha256=WQsCH2CGJCAhy0mS6GSEdeYZRkkQu2OHDsP4CJWTLu
36
40
  coderouter/ingress/anthropic_routes.py,sha256=It2f7XGe3fgKQX01J2F5JOCoZr96t_Tx_kY2om99MVo,16894
37
41
  coderouter/ingress/app.py,sha256=PcuTvUFNjr04EbsUOu8qdyKTdBzxkIJYB4xpz8dFfMo,12635
38
42
  coderouter/ingress/dashboard_routes.py,sha256=rscoj89weHTfc8QmYk-fof-7062rhKFHVHRA8cDImDI,21931
39
- coderouter/ingress/launcher_routes.py,sha256=T3uMmpUaFsc0k5jKyUIUPbSGCUBbzgIUMIPqEpNV3j4,45762
43
+ coderouter/ingress/launcher_routes.py,sha256=Jh-E6qFmHnr7ON4W6QanafxQIoojT4F034mybLvhTyQ,47548
40
44
  coderouter/ingress/metrics_routes.py,sha256=M22dwOGn24P05Ge4W3c7d7mYytSGWjIR-pPSPOAiHJY,3965
41
45
  coderouter/ingress/openai_routes.py,sha256=Zw1efPw9DI6GgV8ZcLrzS6Cda0KLrFkKn2GBZWSe6Vo,6322
42
46
  coderouter/metrics/__init__.py,sha256=7Es351DPS7yLM0yVF_F0eesmiD83n7Zzhie44chht38,1465
@@ -49,7 +53,7 @@ coderouter/plugins/registry.py,sha256=Tx0QHJHozZ5LTUliGylBdNVcdzHTBV0nedCUwGlbLM
49
53
  coderouter/routing/__init__.py,sha256=g2vhutbozRx5QBThReqwPN3imk5qXdpDiaogILd3IRc,257
50
54
  coderouter/routing/adaptive.py,sha256=G2o377twGSjbUh65wiIFx6klnpFGjsD_nI3oDvcBwhY,21257
51
55
  coderouter/routing/auto_router.py,sha256=4_sQR0ztSED9FgQSvQqgqSiydyQVY_qOSRvwyZ5BfRc,12909
52
- coderouter/routing/budget.py,sha256=A3_i44tmS3SrqVNnoGkLKMsiYwI_Ug6m5-3gitVoQSM,8452
56
+ coderouter/routing/budget.py,sha256=PblmVKJGs_BwNa9uDHAA8hmZ4XIVKv38mHAeU0V3OMs,8451
53
57
  coderouter/routing/capability.py,sha256=DCDmiQ-78dkYonCM1WQBCMf6e6XI6VIv_cnuz9hdWT0,18443
54
58
  coderouter/routing/fallback.py,sha256=P3f6Yna1EGnLAT-ZS5ADrrZ-qRWc-M5xvwEuan4rmcs,104568
55
59
  coderouter/state/__init__.py,sha256=XoGcPmmBQSiZWML2S0juSveQ78xfhtdeCliNnVyzu7E,1088
@@ -62,8 +66,8 @@ coderouter/translation/__init__.py,sha256=PYXN7XVEwpG1uC8RLy6fvnGbzEZhhrEuUapH8I
62
66
  coderouter/translation/anthropic.py,sha256=JpvIWNXHUPVqOGvps7o_6ZADhXuJuvpU7RdMqQFtwwM,6421
63
67
  coderouter/translation/convert.py,sha256=-qyzFzmmr9hhQV6_Sg75kJnvCZvHe3n7vRdaZtk_JqQ,47269
64
68
  coderouter/translation/tool_repair.py,sha256=Ok2PF947Liegc5oaytfptv5MWMkpfJYQie-zdP1y3cY,9946
65
- coderouter_cli-2.5.1.dist-info/METADATA,sha256=3ltKBldo-TSDI97pvjmhs6esu7OwjZvsBtKD5Ll3F04,11521
66
- coderouter_cli-2.5.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
67
- coderouter_cli-2.5.1.dist-info/entry_points.txt,sha256=-dnLfD1YZ2WjH2zSdNCvlO65wYltM9bsHt9Fhg3yGss,51
68
- coderouter_cli-2.5.1.dist-info/licenses/LICENSE,sha256=wkEzoR86jFw33jvfOHjULqmkGEfxTFMgMaJnpR8mPRw,1065
69
- coderouter_cli-2.5.1.dist-info/RECORD,,
69
+ coderouter_cli-2.5.3.dist-info/METADATA,sha256=3q3FPL44mGgfySDAi_5gEW1Y_CaZk6i_8wH2RkQKwf0,11674
70
+ coderouter_cli-2.5.3.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
71
+ coderouter_cli-2.5.3.dist-info/entry_points.txt,sha256=-dnLfD1YZ2WjH2zSdNCvlO65wYltM9bsHt9Fhg3yGss,51
72
+ coderouter_cli-2.5.3.dist-info/licenses/LICENSE,sha256=wkEzoR86jFw33jvfOHjULqmkGEfxTFMgMaJnpR8mPRw,1065
73
+ coderouter_cli-2.5.3.dist-info/RECORD,,