coderouter-cli 2.5.2__py3-none-any.whl → 2.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/gguf_introspect.py +304 -0
- coderouter/guards/memory_budget.py +249 -0
- coderouter/hardware.py +264 -0
- coderouter/output_filters.py +148 -0
- coderouter/routing/budget.py +1 -1
- coderouter/token_estimation_accurate.py +136 -0
- {coderouter_cli-2.5.2.dist-info → coderouter_cli-2.5.4.dist-info}/METADATA +5 -1
- {coderouter_cli-2.5.2.dist-info → coderouter_cli-2.5.4.dist-info}/RECORD +11 -7
- {coderouter_cli-2.5.2.dist-info → coderouter_cli-2.5.4.dist-info}/WHEEL +1 -1
- {coderouter_cli-2.5.2.dist-info → coderouter_cli-2.5.4.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-2.5.2.dist-info → coderouter_cli-2.5.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""Minimal, dependency-free GGUF header introspection (low-memory track).
|
|
2
|
+
|
|
3
|
+
Why self-written
|
|
4
|
+
================
|
|
5
|
+
|
|
6
|
+
To right-size ``num_ctx`` *before* dispatch we need a model's layer
|
|
7
|
+
count and embedding width so the KV-cache footprint can be estimated.
|
|
8
|
+
That data lives in the GGUF metadata header. Rather than add the
|
|
9
|
+
official ``gguf`` package (and its ``numpy`` transitive dep) we read
|
|
10
|
+
only the handful of header fields we need with the standard library —
|
|
11
|
+
preserving the 5-deps invariant.
|
|
12
|
+
|
|
13
|
+
The GGUF binary layout we parse (little-endian):
|
|
14
|
+
|
|
15
|
+
magic : 4 bytes == b"GGUF"
|
|
16
|
+
version : uint32 (2 or 3 supported)
|
|
17
|
+
tensor_cnt : uint64 (ignored — we never read tensor data)
|
|
18
|
+
kv_count : uint64 (number of metadata key/value pairs)
|
|
19
|
+
kv_pairs : kv_count repetitions of:
|
|
20
|
+
key : gguf-string (uint64 length + UTF-8 bytes)
|
|
21
|
+
value_type : uint32 (see _GGUF_TYPE_*)
|
|
22
|
+
value : type-dependent
|
|
23
|
+
|
|
24
|
+
We walk the KV pairs, capturing only the keys we care about, and skip
|
|
25
|
+
the rest (including arbitrarily nested arrays) without materialising
|
|
26
|
+
them.
|
|
27
|
+
|
|
28
|
+
Security
|
|
29
|
+
========
|
|
30
|
+
|
|
31
|
+
The parser treats the file as **untrusted input**:
|
|
32
|
+
|
|
33
|
+
* Every string length and array element count is clamped against
|
|
34
|
+
:data:`_MAX_STR_BYTES` / :data:`_MAX_ARRAY_LEN` so a corrupt or
|
|
35
|
+
hostile header cannot trigger a multi-GB allocation (DoS).
|
|
36
|
+
* Reads past EOF raise :class:`GGUFParseError`, never an unbounded
|
|
37
|
+
loop.
|
|
38
|
+
* No ``mmap``, no tensor payload read, no code execution path — we
|
|
39
|
+
only seek/read a small prefix.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import struct
|
|
45
|
+
from dataclasses import dataclass
|
|
46
|
+
from pathlib import Path
|
|
47
|
+
from typing import BinaryIO
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# Constants / format
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
_GGUF_MAGIC = b"GGUF"
|
|
54
|
+
|
|
55
|
+
# GGUF metadata value type tags.
|
|
56
|
+
_GGUF_TYPE_UINT8 = 0
|
|
57
|
+
_GGUF_TYPE_INT8 = 1
|
|
58
|
+
_GGUF_TYPE_UINT16 = 2
|
|
59
|
+
_GGUF_TYPE_INT16 = 3
|
|
60
|
+
_GGUF_TYPE_UINT32 = 4
|
|
61
|
+
_GGUF_TYPE_INT32 = 5
|
|
62
|
+
_GGUF_TYPE_FLOAT32 = 6
|
|
63
|
+
_GGUF_TYPE_BOOL = 7
|
|
64
|
+
_GGUF_TYPE_STRING = 8
|
|
65
|
+
_GGUF_TYPE_ARRAY = 9
|
|
66
|
+
_GGUF_TYPE_UINT64 = 10
|
|
67
|
+
_GGUF_TYPE_INT64 = 11
|
|
68
|
+
_GGUF_TYPE_FLOAT64 = 12
|
|
69
|
+
|
|
70
|
+
# Fixed-width scalar (struct format, size) by type tag.
|
|
71
|
+
_SCALAR: dict[int, tuple[str, int]] = {
|
|
72
|
+
_GGUF_TYPE_UINT8: ("<B", 1),
|
|
73
|
+
_GGUF_TYPE_INT8: ("<b", 1),
|
|
74
|
+
_GGUF_TYPE_UINT16: ("<H", 2),
|
|
75
|
+
_GGUF_TYPE_INT16: ("<h", 2),
|
|
76
|
+
_GGUF_TYPE_UINT32: ("<I", 4),
|
|
77
|
+
_GGUF_TYPE_INT32: ("<i", 4),
|
|
78
|
+
_GGUF_TYPE_FLOAT32: ("<f", 4),
|
|
79
|
+
_GGUF_TYPE_BOOL: ("<?", 1),
|
|
80
|
+
_GGUF_TYPE_UINT64: ("<Q", 8),
|
|
81
|
+
_GGUF_TYPE_INT64: ("<q", 8),
|
|
82
|
+
_GGUF_TYPE_FLOAT64: ("<d", 8),
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
# Defensive clamps against hostile / corrupt headers.
|
|
86
|
+
_MAX_STR_BYTES: int = 1 << 20 # 1 MiB key/value string ceiling
|
|
87
|
+
_MAX_ARRAY_LEN: int = 1 << 24 # element-count ceiling for arrays
|
|
88
|
+
_MAX_KV_PAIRS: int = 1 << 20 # metadata pair ceiling
|
|
89
|
+
|
|
90
|
+
# Human-readable names for the GGUF ``general.file_type`` enum (subset).
|
|
91
|
+
_FILE_TYPE_NAMES: dict[int, str] = {
|
|
92
|
+
0: "F32",
|
|
93
|
+
1: "F16",
|
|
94
|
+
2: "Q4_0",
|
|
95
|
+
3: "Q4_1",
|
|
96
|
+
7: "Q8_0",
|
|
97
|
+
8: "Q5_0",
|
|
98
|
+
9: "Q5_1",
|
|
99
|
+
10: "Q2_K",
|
|
100
|
+
11: "Q3_K_S",
|
|
101
|
+
12: "Q3_K_M",
|
|
102
|
+
13: "Q3_K_L",
|
|
103
|
+
14: "Q4_K_S",
|
|
104
|
+
15: "Q4_K_M",
|
|
105
|
+
16: "Q5_K_S",
|
|
106
|
+
17: "Q5_K_M",
|
|
107
|
+
18: "Q6_K",
|
|
108
|
+
19: "IQ2_XXS",
|
|
109
|
+
20: "IQ2_XS",
|
|
110
|
+
21: "Q2_K_S",
|
|
111
|
+
22: "IQ3_XS",
|
|
112
|
+
23: "IQ3_XXS",
|
|
113
|
+
24: "IQ1_S",
|
|
114
|
+
25: "IQ4_NL",
|
|
115
|
+
26: "IQ3_S",
|
|
116
|
+
27: "IQ3_M",
|
|
117
|
+
28: "IQ2_S",
|
|
118
|
+
29: "IQ2_M",
|
|
119
|
+
30: "IQ4_XS",
|
|
120
|
+
31: "IQ1_M",
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class GGUFParseError(Exception):
    """Signal that a file could not be parsed as a GGUF header.

    Used by this module for a bad magic, an unsupported version, a
    truncated stream, or a declared length above a safety cap.
    """
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass(frozen=True, slots=True)
|
|
129
|
+
class GGUFInfo:
|
|
130
|
+
"""The subset of GGUF metadata needed for memory accounting."""
|
|
131
|
+
|
|
132
|
+
architecture: str | None
|
|
133
|
+
n_layers: int | None
|
|
134
|
+
n_embd: int | None
|
|
135
|
+
n_heads: int | None
|
|
136
|
+
n_kv_heads: int | None
|
|
137
|
+
file_type: int | None
|
|
138
|
+
file_size_bytes: int
|
|
139
|
+
|
|
140
|
+
@property
|
|
141
|
+
def quant_name(self) -> str | None:
|
|
142
|
+
"""Human-readable quantization label, or None if unknown."""
|
|
143
|
+
if self.file_type is None:
|
|
144
|
+
return None
|
|
145
|
+
return _FILE_TYPE_NAMES.get(self.file_type, f"type{self.file_type}")
|
|
146
|
+
|
|
147
|
+
@property
|
|
148
|
+
def weights_bytes(self) -> int:
|
|
149
|
+
"""Approximate on-disk weight size — the file size is the best
|
|
150
|
+
proxy (GGUF is almost entirely tensor data)."""
|
|
151
|
+
return self.file_size_bytes
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
# Low-level readers
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _read_exact(fh: BinaryIO, n: int) -> bytes:
    """Read exactly ``n`` bytes from ``fh``.

    Raises :class:`GGUFParseError` when the stream returns a different
    number of bytes than requested.
    """
    chunk = fh.read(n)
    if len(chunk) == n:
        return chunk
    raise GGUFParseError(f"unexpected EOF (wanted {n} bytes, got {len(chunk)})")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _read_scalar(fh: BinaryIO, type_tag: int) -> object:
    """Decode one fixed-width scalar of GGUF type ``type_tag`` from ``fh``.

    The struct format and byte width come from the ``_SCALAR`` table.
    Raises :class:`GGUFParseError` for a tag that table does not list.
    """
    entry = _SCALAR.get(type_tag)
    if entry is None:
        raise GGUFParseError(f"unknown scalar type tag {type_tag}")
    layout, width = entry
    (decoded,) = struct.unpack(layout, _read_exact(fh, width))
    return decoded
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _read_u32(fh: BinaryIO) -> int:
    """Read a little-endian unsigned 32-bit integer from ``fh``."""
    (number,) = struct.unpack("<I", _read_exact(fh, 4))
    return number
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _read_u64(fh: BinaryIO) -> int:
    """Read a little-endian unsigned 64-bit integer from ``fh``."""
    (number,) = struct.unpack("<Q", _read_exact(fh, 8))
    return number
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _read_gguf_string(fh: BinaryIO) -> str:
    """Read a GGUF string: a uint64 byte length followed by that many bytes.

    The bytes are decoded as UTF-8 with invalid sequences replaced.
    Raises :class:`GGUFParseError` when the declared length exceeds
    ``_MAX_STR_BYTES``, before any payload is read.
    """
    n_bytes = _read_u64(fh)
    if n_bytes > _MAX_STR_BYTES:
        raise GGUFParseError(f"string length {n_bytes} exceeds cap")
    payload = _read_exact(fh, n_bytes)
    return payload.decode("utf-8", errors="replace")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# Maximum array-in-array nesting accepted while skipping values. Each level
# costs a hostile file only 12 bytes, so without a cap a tiny header could
# exhaust the interpreter's recursion limit.
_MAX_ARRAY_DEPTH: int = 64


def _skip_bytes(fh: BinaryIO, n: int) -> None:
    """Advance ``fh`` by ``n`` bytes, raising GGUFParseError if it is too short."""
    if n <= 0:
        return
    # ``seek`` happily moves past EOF, so seek to the last byte to skip and
    # read it: a short read proves the file was truncated.
    fh.seek(n - 1, 1)
    if len(fh.read(1)) != 1:
        raise GGUFParseError(f"unexpected EOF while skipping {n} bytes")


def _skip_value(fh: BinaryIO, type_tag: int, *, _depth: int = 0) -> None:
    """Consume a metadata value of ``type_tag`` without retaining it.

    Strings are read and dropped; fixed-width scalars and arrays of
    fixed-width scalars are skipped with a single seek; arrays of strings
    or nested arrays are skipped element by element.

    Args:
        fh: Binary stream positioned at the start of the value.
        type_tag: GGUF value type tag of the value to skip.
        _depth: Internal recursion depth for nested arrays; callers should
            leave it at the default.

    Raises:
        GGUFParseError: on an unknown type tag, an array length above
            ``_MAX_ARRAY_LEN``, nesting deeper than ``_MAX_ARRAY_DEPTH``,
            or a stream that ends inside the value.
    """
    if type_tag == _GGUF_TYPE_STRING:
        _read_gguf_string(fh)
        return
    if type_tag == _GGUF_TYPE_ARRAY:
        if _depth >= _MAX_ARRAY_DEPTH:
            raise GGUFParseError(f"array nesting exceeds depth {_MAX_ARRAY_DEPTH}")
        elem_type = _read_u32(fh)
        count = _read_u64(fh)
        if count > _MAX_ARRAY_LEN:
            raise GGUFParseError(f"array length {count} exceeds cap")
        elem_scalar = _SCALAR.get(elem_type)
        if elem_scalar is not None:
            # Homogeneous fixed-width elements: one bounded seek instead of
            # up to ``_MAX_ARRAY_LEN`` individual ones.
            _skip_bytes(fh, count * elem_scalar[1])
            return
        for _ in range(count):
            _skip_value(fh, elem_type, _depth=_depth + 1)
        return
    fmt_size = _SCALAR.get(type_tag)
    if fmt_size is None:
        raise GGUFParseError(f"unknown value type tag {type_tag}")
    _skip_bytes(fh, fmt_size[1])
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _read_scalar_value(fh: BinaryIO, type_tag: int) -> object:
    """Read one metadata value, returning it only if it is a string or scalar.

    Arrays are consumed from the stream via ``_skip_value`` and reported
    as ``None``.
    """
    if type_tag == _GGUF_TYPE_ARRAY:
        _skip_value(fh, type_tag)
        return None
    if type_tag != _GGUF_TYPE_STRING:
        return _read_scalar(fh, type_tag)
    return _read_gguf_string(fh)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ---------------------------------------------------------------------------
|
|
219
|
+
# Public API
|
|
220
|
+
# ---------------------------------------------------------------------------
|
|
221
|
+
|
|
222
|
+
# Suffixes of the arch-prefixed keys we capture (e.g. "llama.block_count").
|
|
223
|
+
_KEY_BLOCK_COUNT = ".block_count"
|
|
224
|
+
_KEY_EMBED_LEN = ".embedding_length"
|
|
225
|
+
_KEY_HEAD_COUNT = ".attention.head_count"
|
|
226
|
+
_KEY_HEAD_COUNT_KV = ".attention.head_count_kv"
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def read_gguf_metadata(path: str | Path) -> GGUFInfo:
    """Parse the GGUF header at ``path`` and return a :class:`GGUFInfo`.

    Only the keys needed for memory accounting are captured; every other
    key/value pair is consumed and discarded.

    Args:
        path: Location of the GGUF file.

    Returns:
        A :class:`GGUFInfo` whose metadata fields are ``None`` for keys
        that were absent or did not carry a value of the expected type.

    Raises:
        GGUFParseError: if the file cannot be stat'ed, opened or read, is
            too short, is not a GGUF container, has an unsupported
            version, or declares a count/length above a safety cap.
    """
    p = Path(path)
    try:
        file_size = p.stat().st_size
    except OSError as exc:  # missing / unreadable
        raise GGUFParseError(f"cannot stat {path}: {exc}") from exc

    arch: str | None = None
    n_layers: int | None = None
    n_embd: int | None = None
    n_heads: int | None = None
    n_kv_heads: int | None = None
    file_type: int | None = None

    try:
        with p.open("rb") as fh:
            magic = fh.read(4)
            if magic != _GGUF_MAGIC:
                raise GGUFParseError(f"bad magic {magic!r} (not a GGUF file)")
            version = _read_u32(fh)
            if version not in (2, 3):
                raise GGUFParseError(f"unsupported GGUF version {version}")
            _read_u64(fh)  # tensor_count: advance cursor, not needed
            kv_count = _read_u64(fh)
            if kv_count > _MAX_KV_PAIRS:
                raise GGUFParseError(f"kv_count {kv_count} exceeds cap")

            for _ in range(kv_count):
                key = _read_gguf_string(fh)
                value_type = _read_u32(fh)
                value = _read_scalar_value(fh, value_type)

                # ``bool`` is an ``int`` subclass; a BOOL-typed value must
                # not be mistaken for a layer/head count.
                is_int = isinstance(value, int) and not isinstance(value, bool)

                if key == "general.architecture" and isinstance(value, str):
                    arch = value
                elif not is_int:
                    continue
                elif key == "general.file_type":
                    file_type = value
                elif key.endswith(_KEY_BLOCK_COUNT):
                    n_layers = value
                elif key.endswith(_KEY_EMBED_LEN):
                    n_embd = value
                elif key.endswith(_KEY_HEAD_COUNT_KV):
                    n_kv_heads = value
                elif key.endswith(_KEY_HEAD_COUNT):
                    n_heads = value
    except OSError as exc:
        # e.g. a directory, a permission error, or an I/O failure mid-read:
        # report it through the documented exception type.
        raise GGUFParseError(f"cannot read {path}: {exc}") from exc
    except RecursionError as exc:
        # Pathologically nested arrays in a hostile header.
        raise GGUFParseError(f"metadata nesting too deep in {path}") from exc

    return GGUFInfo(
        architecture=arch,
        n_layers=n_layers,
        n_embd=n_embd,
        n_heads=n_heads,
        n_kv_heads=n_kv_heads,
        file_type=file_type,
        file_size_bytes=file_size,
    )
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def try_read_gguf_metadata(path: str | Path) -> GGUFInfo | None:
    """Best-effort variant of :func:`read_gguf_metadata`.

    Returns the parsed :class:`GGUFInfo`, or ``None`` when the file cannot
    be read or parsed — convenient for advisory paths that must not fail.
    """
    try:
        return read_gguf_metadata(path)
    except (GGUFParseError, OSError, RecursionError):
        # OSError (e.g. the path is a directory) and RecursionError (deeply
        # nested arrays in a hostile header) are parse failures too; the
        # best-effort contract is to report them as "no metadata".
        return None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
__all__ = [
|
|
300
|
+
"GGUFInfo",
|
|
301
|
+
"GGUFParseError",
|
|
302
|
+
"read_gguf_metadata",
|
|
303
|
+
"try_read_gguf_metadata",
|
|
304
|
+
]
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Proactive memory-budget guard (low-memory track, L1).
|
|
2
|
+
|
|
3
|
+
Where :mod:`coderouter.guards.memory_pressure` reacts *after* an OOM,
|
|
4
|
+
this guard prevents it: given the host's available memory (from
|
|
5
|
+
:mod:`coderouter.hardware`) and the model's shape (from
|
|
6
|
+
:mod:`coderouter.gguf_introspect`), it computes the largest context
|
|
7
|
+
window (``num_ctx``) that will actually fit, *before* the request is
|
|
8
|
+
dispatched.
|
|
9
|
+
|
|
10
|
+
The engine then (a) caps the backend's ``num_ctx`` to that value and
|
|
11
|
+
(b) trims conversation history to the same budget via
|
|
12
|
+
:func:`coderouter.guards.context_budget.trim_to_budget`.
|
|
13
|
+
|
|
14
|
+
Everything here is **pure** (no I/O, no globals) so it is trivially
|
|
15
|
+
testable and free of the 5-deps constraint.
|
|
16
|
+
|
|
17
|
+
KV-cache model
|
|
18
|
+
==============
|
|
19
|
+
|
|
20
|
+
The dominant runtime cost beyond the weights is the attention KV
|
|
21
|
+
cache, which grows linearly with context length:
|
|
22
|
+
|
|
23
|
+
kv_bytes ≈ 2 (K and V)
|
|
24
|
+
x n_layers
|
|
25
|
+
x n_ctx
|
|
26
|
+
x kv_dim
|
|
27
|
+
x bytes_per_element
|
|
28
|
+
|
|
29
|
+
``kv_dim`` is the per-token key/value width. With grouped-query
|
|
30
|
+
attention (GQA) it is ``n_embd x n_kv_heads / n_heads``; without GQA
|
|
31
|
+
metadata it falls back to ``n_embd`` (conservative — over-counts, so
|
|
32
|
+
we under-promise context, which is the safe direction for OOM).
|
|
33
|
+
|
|
34
|
+
``bytes_per_element`` defaults to 2 (fp16 KV cache). The estimate is
|
|
35
|
+
deliberately conservative; the headroom in :mod:`coderouter.hardware`
|
|
36
|
+
absorbs activation/compute buffers not modelled here.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from __future__ import annotations
|
|
40
|
+
|
|
41
|
+
from dataclasses import dataclass
|
|
42
|
+
from typing import Literal
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Constants
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
_BYTES_PER_GB: int = 1024**3
|
|
49
|
+
|
|
50
|
+
#: Default bytes per KV-cache element (fp16).
|
|
51
|
+
DEFAULT_KV_BYTES_PER_ELEM: int = 2
|
|
52
|
+
|
|
53
|
+
#: Fraction of the post-weights budget held back for activations and
|
|
54
|
+
#: the compute buffer (not modelled explicitly). The remainder is what
|
|
55
|
+
#: the KV cache may consume.
|
|
56
|
+
DEFAULT_COMPUTE_OVERHEAD_RATIO: float = 0.10
|
|
57
|
+
|
|
58
|
+
#: Fallback layer/embedding shape when GGUF metadata is incomplete.
|
|
59
|
+
#: Chosen to over-estimate KV (safe: under-promises context).
|
|
60
|
+
_FALLBACK_N_LAYERS: int = 32
|
|
61
|
+
_FALLBACK_N_EMBD: int = 4096
|
|
62
|
+
|
|
63
|
+
FitAction = Literal["ok", "shrink", "insufficient", "unknown"]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
# Result type
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass(frozen=True, slots=True)
|
|
72
|
+
class FitDecision:
|
|
73
|
+
"""Outcome of a pre-dispatch memory-fit computation.
|
|
74
|
+
|
|
75
|
+
``action``:
|
|
76
|
+
* ``"ok"`` — requested context fits as-is.
|
|
77
|
+
* ``"shrink"`` — fits only at ``effective_num_ctx`` < requested.
|
|
78
|
+
* ``"insufficient"`` — won't fit even at ``min_num_ctx``; the model
|
|
79
|
+
is too big for this host (caller should warn
|
|
80
|
+
/ fall through to another provider).
|
|
81
|
+
* ``"unknown"`` — hardware undetected; guard is a no-op.
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
action: FitAction
|
|
85
|
+
fits: bool
|
|
86
|
+
requested_num_ctx: int
|
|
87
|
+
effective_num_ctx: int
|
|
88
|
+
weights_bytes: int
|
|
89
|
+
kv_cache_bytes: int
|
|
90
|
+
available_bytes: int
|
|
91
|
+
reason: str
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
# KV-cache math (pure)
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def kv_dim(
    n_embd: int | None,
    n_heads: int | None,
    n_kv_heads: int | None,
) -> int:
    """Return the per-token KV width, in elements.

    A missing or non-positive ``n_embd`` is replaced by the module's
    fallback embedding width. When both head counts are present and
    ``0 < n_kv_heads <= n_heads``, the width is scaled by
    ``n_kv_heads / n_heads`` (truncated, with a floor of 1); otherwise the
    full embedding width is returned.
    """
    if n_embd and n_embd > 0:
        width = n_embd
    else:
        width = _FALLBACK_N_EMBD

    heads_known = bool(n_heads) and bool(n_kv_heads)
    if not heads_known:
        return width
    if n_heads > 0 and 0 < n_kv_heads <= n_heads:
        scaled = int(width * n_kv_heads / n_heads)
        return max(1, scaled)
    return width
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def kv_cache_bytes(
    n_ctx: int,
    n_layers: int,
    kv_width: int,
    *,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
) -> int:
    """Return the KV-cache size in bytes for ``n_ctx`` tokens.

    Computes ``2 * n_layers * n_ctx * kv_width * bytes_per_elem`` (the
    factor 2 covers K and V), treating negative layer, context and width
    arguments as zero.
    """
    layers = max(0, n_layers)
    tokens = max(0, n_ctx)
    width = max(0, kv_width)
    return 2 * layers * tokens * width * bytes_per_elem
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def max_num_ctx_for_budget(
    kv_budget_bytes: int,
    n_layers: int,
    kv_width: int,
    *,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
) -> int:
    """Return the largest ``n_ctx`` whose KV cache fits in ``kv_budget_bytes``.

    The per-token cost is ``2 * n_layers * kv_width * bytes_per_elem``,
    with layer count and width each raised to at least 1. A non-positive
    budget or per-token cost yields 0.
    """
    layers = max(1, n_layers)
    width = max(1, kv_width)
    cost_per_token = 2 * layers * width * bytes_per_elem
    if cost_per_token <= 0:
        return 0
    if kv_budget_bytes <= 0:
        return 0
    return int(kv_budget_bytes // cost_per_token)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# Fit decision (pure)
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def plan_fit(
    *,
    available_budget_gb: float,
    weights_bytes: int,
    requested_num_ctx: int,
    n_layers: int | None,
    n_embd: int | None = None,
    n_heads: int | None = None,
    n_kv_heads: int | None = None,
    min_num_ctx: int = 2048,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
    compute_overhead_ratio: float = DEFAULT_COMPUTE_OVERHEAD_RATIO,
) -> FitDecision:
    """Decide whether ``requested_num_ctx`` fits, and by how much to shrink.

    Args:
        available_budget_gb: Net memory budget in GiB. A value ``<= 0``
            is treated as "hardware undetected" and yields an
            ``"unknown"`` no-op decision.
        weights_bytes: Model weight size in bytes (negative counts as 0).
        requested_num_ctx: Context length the caller wants.
        n_layers: Layer count; a missing or non-positive value falls back
            to the module default.
        n_embd, n_heads, n_kv_heads: Shape inputs forwarded to
            :func:`kv_dim`.
        min_num_ctx: Floor the context is never shrunk below. A request
            that is already smaller than this floor is judged at its own
            size rather than at the floor.
        bytes_per_elem: Bytes per KV-cache element.
        compute_overhead_ratio: Fraction of the post-weights memory held
            back from the KV cache; clamped to ``[0.0, 1.0]``.

    Returns:
        A :class:`FitDecision` with action ``"ok"``, ``"shrink"``,
        ``"insufficient"`` or ``"unknown"``.
    """
    if available_budget_gb <= 0.0:
        return FitDecision(
            action="unknown",
            fits=True,  # don't block when we can't measure
            requested_num_ctx=requested_num_ctx,
            effective_num_ctx=requested_num_ctx,
            weights_bytes=weights_bytes,
            kv_cache_bytes=0,
            available_bytes=0,
            reason="hardware undetected; guard no-op",
        )

    available_bytes = int(available_budget_gb * _BYTES_PER_GB)
    layers = n_layers if (n_layers and n_layers > 0) else _FALLBACK_N_LAYERS
    width = kv_dim(n_embd, n_heads, n_kv_heads)

    # Clamp the ratio: a value above 1 would flip the sign of a negative
    # post-weights budget and make an oversized model look like it fits.
    overhead = min(1.0, max(0.0, compute_overhead_ratio))

    # Memory left for the KV cache after weights, minus a compute buffer.
    post_weights = available_bytes - max(0, weights_bytes)
    kv_budget = int(post_weights * (1.0 - overhead))

    # A request below the floor needs only its own context to fit; judging
    # it at ``min_num_ctx`` would reject a request that actually fits.
    floor_ctx = min_num_ctx
    if 0 < requested_num_ctx < min_num_ctx:
        floor_ctx = requested_num_ctx

    def _decide(action: FitAction, fits: bool, effective: int, reason: str) -> FitDecision:
        """Build a decision, deriving the KV size from ``effective``."""
        return FitDecision(
            action=action,
            fits=fits,
            requested_num_ctx=requested_num_ctx,
            effective_num_ctx=effective,
            weights_bytes=weights_bytes,
            kv_cache_bytes=kv_cache_bytes(
                effective, layers, width, bytes_per_elem=bytes_per_elem
            ),
            available_bytes=available_bytes,
            reason=reason,
        )

    # Can we even run the minimum context?
    floor_kv = kv_cache_bytes(floor_ctx, layers, width, bytes_per_elem=bytes_per_elem)
    if kv_budget < floor_kv:
        return _decide(
            "insufficient",
            False,
            floor_ctx,
            "weights + minimum KV cache exceed available memory; "
            "model too large for this host",
        )

    ctx_cap = max_num_ctx_for_budget(
        kv_budget, layers, width, bytes_per_elem=bytes_per_elem
    )
    if ctx_cap >= requested_num_ctx:
        return _decide("ok", True, requested_num_ctx, "requested context fits")

    # Shrink to the cap, but never below the floor.
    effective = max(floor_ctx, ctx_cap)
    return _decide(
        "shrink",
        True,
        effective,
        f"context shrunk from {requested_num_ctx} to {effective} to fit memory",
    )
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
__all__ = [
|
|
241
|
+
"DEFAULT_COMPUTE_OVERHEAD_RATIO",
|
|
242
|
+
"DEFAULT_KV_BYTES_PER_ELEM",
|
|
243
|
+
"FitAction",
|
|
244
|
+
"FitDecision",
|
|
245
|
+
"kv_cache_bytes",
|
|
246
|
+
"kv_dim",
|
|
247
|
+
"max_num_ctx_for_budget",
|
|
248
|
+
"plan_fit",
|
|
249
|
+
]
|
coderouter/hardware.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Shared hardware detection + memory accounting (low-memory track, L0).
|
|
2
|
+
|
|
3
|
+
Background
|
|
4
|
+
==========
|
|
5
|
+
|
|
6
|
+
Low-memory machines (8-16 GB unified / discrete VRAM) can only run
|
|
7
|
+
small GGUF models, and CodeRouter's existing memory handling is purely
|
|
8
|
+
*reactive*: :mod:`coderouter.guards.memory_pressure` only fires *after*
|
|
9
|
+
a backend has already tripped an OOM. To prevent OOM *before* dispatch
|
|
10
|
+
we need to know how much memory the host actually has.
|
|
11
|
+
|
|
12
|
+
The detection primitive already existed inside
|
|
13
|
+
``coderouter.ingress.launcher_routes._detect_hardware`` but was only
|
|
14
|
+
wired to the launcher UI. This module promotes it to a shared,
|
|
15
|
+
cached, dependency-free utility so the guard path can consume it too.
|
|
16
|
+
|
|
17
|
+
5-deps invariant
|
|
18
|
+
================
|
|
19
|
+
|
|
20
|
+
Detection is **best-effort and uses only the standard library**
|
|
21
|
+
(``os.sysconf`` / ``subprocess`` calling ``sysctl`` / ``nvidia-smi``).
|
|
22
|
+
No ``psutil`` / ``pynvml``. Every probe is wrapped so a missing tool or
|
|
23
|
+
permission error degrades gracefully to ``0.0`` rather than raising.
|
|
24
|
+
|
|
25
|
+
Caching
|
|
26
|
+
=======
|
|
27
|
+
|
|
28
|
+
Detection performs blocking I/O (subprocess). Results are cached in
|
|
29
|
+
process with a short TTL (:data:`_CACHE_TTL_S`) so the hot dispatch
|
|
30
|
+
path pays the cost at most once per minute. ``detect_hardware`` is
|
|
31
|
+
safe to call from async code via ``asyncio.to_thread``.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import contextlib
|
|
37
|
+
import os
|
|
38
|
+
import platform
|
|
39
|
+
import shutil
|
|
40
|
+
import subprocess # controlled: fixed argv, no shell
|
|
41
|
+
import threading
|
|
42
|
+
import time
|
|
43
|
+
from dataclasses import dataclass
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Constants
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
_BYTES_PER_GB: int = 1024**3
|
|
50
|
+
|
|
51
|
+
#: Detection cache TTL. Hardware doesn't change mid-session, but we keep
|
|
52
|
+
#: a TTL so a hot-plugged eGPU or driver restart is eventually noticed.
|
|
53
|
+
_CACHE_TTL_S: float = 60.0
|
|
54
|
+
|
|
55
|
+
#: Default headroom reserved for the OS and other processes, in GB.
|
|
56
|
+
#: On unified-memory (Metal) systems the OS + UI already consume a few
|
|
57
|
+
#: GB, so a conservative floor avoids starving the desktop.
|
|
58
|
+
DEFAULT_HEADROOM_GB: float = 1.5
|
|
59
|
+
|
|
60
|
+
#: Default headroom as a fraction of usable memory. The effective
|
|
61
|
+
#: headroom is ``max(DEFAULT_HEADROOM_GB, usable * DEFAULT_HEADROOM_RATIO)``.
|
|
62
|
+
DEFAULT_HEADROOM_RATIO: float = 0.15
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Result types
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True, slots=True)
|
|
71
|
+
class HardwareInfo:
|
|
72
|
+
"""Best-effort snapshot of the host's compute resources.
|
|
73
|
+
|
|
74
|
+
All memory values are in GiB. ``0.0`` means "could not detect"
|
|
75
|
+
(caller should treat detection as unavailable, not as "zero RAM").
|
|
76
|
+
"""
|
|
77
|
+
|
|
78
|
+
#: System RAM in GiB (0.0 if undetectable).
|
|
79
|
+
ram_gb: float
|
|
80
|
+
#: GPU VRAM in GiB. For Metal/unified memory this mirrors ``ram_gb``;
|
|
81
|
+
#: for CPU-only it is 0.0.
|
|
82
|
+
vram_gb: float
|
|
83
|
+
#: One of ``"cuda"`` / ``"metal"`` / ``"cpu"``.
|
|
84
|
+
gpu: str
|
|
85
|
+
#: Logical CPU count (best-effort, defaults to 4).
|
|
86
|
+
cpu_count: int
|
|
87
|
+
|
|
88
|
+
@property
|
|
89
|
+
def detected(self) -> bool:
|
|
90
|
+
"""True iff at least RAM was detected (a usable budget exists)."""
|
|
91
|
+
return self.ram_gb > 0.0
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def unified_memory(self) -> bool:
|
|
95
|
+
"""True for Apple-silicon Metal, where VRAM and RAM are shared."""
|
|
96
|
+
return self.gpu == "metal"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# Detection (cached)
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
_cache_lock = threading.RLock()
|
|
104
|
+
_cache_value: HardwareInfo | None = None
|
|
105
|
+
_cache_ts: float = 0.0
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _detect_ram_gb() -> float:
    """Detect system RAM in GiB.

    Tries ``os.sysconf`` first; when that yields no positive value, falls
    back to running ``sysctl -n hw.memsize``. Failures of either probe are
    suppressed, so the result may be ``0.0``.
    """
    detected = 0.0
    with contextlib.suppress(ValueError, OSError, AttributeError):
        page_count = os.sysconf("SC_PHYS_PAGES")
        page_bytes = os.sysconf("SC_PAGE_SIZE")
        detected = (page_count * page_bytes) / _BYTES_PER_GB

    if not detected <= 0:
        return detected

    with contextlib.suppress(ValueError, OSError, subprocess.SubprocessError):
        # Fixed argv, no shell.
        proc = subprocess.run(
            ["sysctl", "-n", "hw.memsize"],
            capture_output=True,
            text=True,
            timeout=3,
            check=False,
        )
        detected = int(proc.stdout.strip()) / _BYTES_PER_GB
    return detected
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _detect_gpu(ram_gb: float) -> tuple[str, float]:
    """Return ``(gpu_kind, vram_gb)`` for this host.

    * Darwin on arm64 -> ``("metal", ram_gb)`` (unified memory).
    * ``nvidia-smi`` on PATH and reporting a positive total ->
      ``("cuda", largest reported total in MiB / 1024)``.
    * Anything else, including a failed probe -> ``("cpu", 0.0)``.
    """
    on_apple_silicon = platform.system() == "Darwin" and platform.machine() == "arm64"
    if on_apple_silicon:
        return "metal", ram_gb  # unified memory

    if not shutil.which("nvidia-smi"):
        return "cpu", 0.0

    with contextlib.suppress(ValueError, OSError, subprocess.SubprocessError):
        # Fixed argv, no shell.
        argv = [
            "nvidia-smi",
            "--query-gpu=memory.total",
            "--format=csv,noheader,nounits",
        ]
        proc = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        totals = [int(tok) for tok in proc.stdout.split() if tok.strip().isdigit()]
        largest_mb = max(totals, default=0)
        if largest_mb > 0:
            return "cuda", largest_mb / 1024
    return "cpu", 0.0
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _detect_uncached() -> HardwareInfo:
    """Probe CPU count, RAM and GPU once, bypassing the cache.

    Memory figures are rounded to one decimal place; the CPU count falls
    back to 4 when ``os.cpu_count()`` reports nothing.
    """
    logical_cpus = os.cpu_count() or 4
    ram = _detect_ram_gb()
    gpu_kind, vram = _detect_gpu(ram)
    return HardwareInfo(
        ram_gb=round(ram, 1),
        vram_gb=round(vram, 1),
        gpu=gpu_kind,
        cpu_count=logical_cpus,
    )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def detect_hardware(*, force_refresh: bool = False) -> HardwareInfo:
    """Return a cached :class:`HardwareInfo` snapshot.

    Blocking (the probe may spawn subprocesses); call it through
    ``asyncio.to_thread`` from async code. A snapshot is reused for
    :data:`_CACHE_TTL_S` seconds.

    Parameters
    ----------
    force_refresh
        Ignore any cached snapshot and re-probe immediately (e.g. after
        a backend restart).
    """
    global _cache_value, _cache_ts
    stamp = time.monotonic()
    with _cache_lock:
        if not force_refresh and _cache_value is not None:
            age = stamp - _cache_ts
            if age < _CACHE_TTL_S:
                return _cache_value
        # The probe runs while the lock is held; the timestamp stored is
        # the one taken before the lock was acquired.
        snapshot = _detect_uncached()
        _cache_value, _cache_ts = snapshot, stamp
        return snapshot
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def reset_cache() -> None:
    """Clear the cached detection result and its timestamp (mainly for tests)."""
    global _cache_value, _cache_ts
    with _cache_lock:
        _cache_value, _cache_ts = None, 0.0
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ---------------------------------------------------------------------------
|
|
207
|
+
# Memory accounting
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def usable_memory_gb(hw: HardwareInfo) -> float:
    """Return the memory pool for model weights + KV cache, in GiB.

    ``"cuda"`` hosts use ``vram_gb``; every other GPU kind uses
    ``ram_gb``. An undetected host yields ``0.0`` so the caller can
    no-op instead of making a wrong decision.
    """
    if hw.detected:
        return hw.vram_gb if hw.gpu == "cuda" else hw.ram_gb
    return 0.0
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def headroom_gb(
    usable_gb: float,
    *,
    floor_gb: float = DEFAULT_HEADROOM_GB,
    ratio: float = DEFAULT_HEADROOM_RATIO,
) -> float:
    """Return the memory to keep free for the OS / other processes, in GiB.

    The reserve is the larger of a fixed floor and a share of the usable
    memory: ``max(floor_gb, usable_gb * ratio)``. The floor protects small
    machines; the ratio grows the reserve on larger ones.

    Parameters
    ----------
    usable_gb
        Total usable memory in GiB.
    floor_gb
        Minimum reserve in GiB.
    ratio
        Fraction of ``usable_gb`` to reserve.
    """
    proportional_gb = usable_gb * ratio
    return max(floor_gb, proportional_gb)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def available_budget_gb(
    hw: HardwareInfo,
    *,
    floor_gb: float = DEFAULT_HEADROOM_GB,
    ratio: float = DEFAULT_HEADROOM_RATIO,
) -> float:
    """Return the memory left for weights + KV cache once headroom is removed.

    Computed as :func:`usable_memory_gb` minus :func:`headroom_gb`, clamped
    at zero so the result is never negative. Returns ``0.0`` when no usable
    memory is reported (for example, undetected hardware).

    Parameters
    ----------
    hw
        Hardware snapshot to budget against.
    floor_gb, ratio
        Forwarded unchanged to :func:`headroom_gb`.
    """
    total_gb = usable_memory_gb(hw)
    if total_gb > 0.0:
        reserve_gb = headroom_gb(total_gb, floor_gb=floor_gb, ratio=ratio)
        return max(0.0, total_gb - reserve_gb)
    return 0.0
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
__all__ = [
|
|
256
|
+
"DEFAULT_HEADROOM_GB",
|
|
257
|
+
"DEFAULT_HEADROOM_RATIO",
|
|
258
|
+
"HardwareInfo",
|
|
259
|
+
"available_budget_gb",
|
|
260
|
+
"detect_hardware",
|
|
261
|
+
"headroom_gb",
|
|
262
|
+
"reset_cache",
|
|
263
|
+
"usable_memory_gb",
|
|
264
|
+
]
|
coderouter/output_filters.py
CHANGED
|
@@ -43,6 +43,7 @@ Reference: plan.md §10.2 "出力クリーニング" / docs/retrospectives/v0.7.
|
|
|
43
43
|
|
|
44
44
|
from __future__ import annotations
|
|
45
45
|
|
|
46
|
+
import re
|
|
46
47
|
from typing import Protocol
|
|
47
48
|
|
|
48
49
|
__all__ = [
|
|
@@ -50,6 +51,7 @@ __all__ = [
|
|
|
50
51
|
"KNOWN_FILTERS",
|
|
51
52
|
"OutputFilter",
|
|
52
53
|
"OutputFilterChain",
|
|
54
|
+
"RepairByteFallbackFilter",
|
|
53
55
|
"StripStopMarkersFilter",
|
|
54
56
|
"StripThinkingFilter",
|
|
55
57
|
"StripToolCallXmlFilter",
|
|
@@ -382,6 +384,151 @@ class StripToolCallXmlFilter:
|
|
|
382
384
|
return "".join(out_parts)
|
|
383
385
|
|
|
384
386
|
|
|
387
|
+
# ---------------------------------------------------------------------------
|
|
388
|
+
# repair_byte_fallback (v2.x)
|
|
389
|
+
# ---------------------------------------------------------------------------
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# One complete byte-fallback token: the literal ``<0x``, exactly two hex
# digits (either case), then ``>``. Group 1 captures the two digits.
_BYTE_RE = re.compile(r"<0x([0-9A-Fa-f]{2})>")

# Any unfinished beginning of such a token: ``<``, ``<0``, ``<0x``, ``<0xH``
# or ``<0xHH`` with the closing ``>`` still missing. Used with ``fullmatch``
# to ask "could the whole remaining buffer still become a token (and so
# continue a byte run) once more input arrives?".
_PREFIX_RE = re.compile(r"<(0(x[0-9A-Fa-f]{0,2})?)?")

# The literal that opens every byte-fallback token.
_BYTE_TOKEN_START = "<0x"
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def _decode_byte_run(buf: bytes) -> str:
    """Turn a run of fallback bytes into text without losing any byte.

    Valid UTF-8 is decoded normally. A byte that cannot start or continue a
    valid sequence is written back out as a ``<0xHH>`` token (two upper-case
    hex digits) and decoding resumes with the byte after it. For example
    ``b"\\xe3\\x80\\x80"`` becomes U+3000 (ideographic space), while a stray
    ``b"\\xff"`` becomes ``"<0xFF>"`` — the stream is never left worse than
    it arrived.
    """
    pieces: list[str] = []
    remaining = buf
    while remaining:
        try:
            pieces.append(remaining.decode("utf-8"))
        except UnicodeDecodeError as err:
            # ``err.start`` indexes the first offending byte of ``remaining``;
            # everything before it is known-good UTF-8.
            bad_at = err.start
            valid_prefix = remaining[:bad_at]
            if valid_prefix:
                pieces.append(valid_prefix.decode("utf-8"))
            pieces.append(f"<0x{remaining[bad_at]:02X}>")
            # Skip exactly one byte, then try the tail again.
            remaining = remaining[bad_at + 1 :]
        else:
            break
    return "".join(pieces)
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
class RepairByteFallbackFilter:
    """Reassemble llama.cpp ``<0xNN>`` byte-fallback leaks into UTF-8 text.

    Background: Ollama 0.30 unified its GGUF runtime onto llama.cpp
    (``ollama/ollama#16031``). For gemma4 the detokenizer changed, and
    multi-byte characters it cannot assemble leak out in llama.cpp's
    byte-fallback notation::

        ideographic space (U+3000) -> ``<0xE3><0x80><0x80>``
        rare kanji ``躙``           -> ``<0xE8><0xBA><0x99>``

    Such leaks corrupt Japanese prose and tool-call JSON arguments alike (a
    stray ``<0xNN>`` inside an argument string breaks JSON parsing). This
    filter collects runs of consecutive ``<0xNN>`` tokens and decodes each
    run back to UTF-8.

    The filter keeps state between ``feed`` calls, so both a token split
    across SSE deltas (``<0x`` | ``E3>``) and a multi-byte run split across
    deltas (``<0xE3>`` | ``<0x80><0x80>``) are reassembled. A pending run is
    emitted only once it is known to be over — ordinary text follows it, or
    ``eof`` is signalled — never merely because a chunk ended. Bytes that do
    not form valid UTF-8 are written back out as ``<0xHH>`` tokens, so no
    byte is lost.

    ``modified`` becomes True the first time a ``<0xNN>`` token is consumed;
    the adapter uses it to gate the log-once "output-filter-applied" line.

    Ordering note: place this BEFORE ``tool_repair`` / the tool-call XML
    strip so byte-fallback inside tool-call argument strings is restored
    before JSON extraction.
    """

    name = "repair_byte_fallback"

    def __init__(self) -> None:
        """Start with no text buffered, no bytes pending, nothing modified."""
        self.modified: bool = False
        # Text received but not yet classified as token / ordinary text.
        self._buffer: str = ""
        # Byte values of the current, still-open run of ``<0xNN>`` tokens.
        self._pending = bytearray()

    def _flush_pending(self, out: list[str]) -> None:
        """Decode the open byte run into ``out`` and reset it (no-op if empty)."""
        if not self._pending:
            return
        out.append(_decode_byte_run(bytes(self._pending)))
        self._pending.clear()

    def feed(self, text: str, *, eof: bool = False) -> str:
        """Append ``text`` to the buffer and return what is safe to emit now.

        Parameters
        ----------
        text
            The next chunk of model output (may be empty).
        eof
            True on the final call; everything still held back is released.

        Returns
        -------
        str
            Output text for this call; may be empty while input is held.
        """
        self._buffer += text
        emitted: list[str] = []

        while self._buffer:
            token = _BYTE_RE.match(self._buffer)
            if token is not None:
                # A complete token sits at the front: grow the open run.
                self._pending.append(int(token.group(1), 16))
                self._buffer = self._buffer[token.end() :]
                self.modified = True
                continue

            start = self._buffer.find(_BYTE_TOKEN_START)

            if start > 0:
                # Ordinary text comes before the next ``<0x`` — the run
                # (if any) is over; emit it, then the text.
                self._flush_pending(emitted)
                emitted.append(self._buffer[:start])
                self._buffer = self._buffer[start:]
                continue

            if start == 0:
                # Buffer begins with ``<0x`` but is not a complete token.
                if not eof and _PREFIX_RE.fullmatch(self._buffer):
                    # It may still complete on the next feed: keep both the
                    # partial token and the open run.
                    break
                # Malformed ``<0x..`` (non-hex, or unfinished at eof): the
                # ``<`` is ordinary text and the run has ended.
                self._flush_pending(emitted)
                emitted.append("<")
                self._buffer = self._buffer[1:]
                continue

            # start == -1: no ``<0x`` anywhere. Hold back only a trailing
            # fragment that could be the beginning of ``<0x`` (presumably
            # what ``_max_suffix_overlap`` measures — it is defined elsewhere
            # in this module); the rest is confirmed ordinary text.
            held = 0 if eof else _max_suffix_overlap(self._buffer, _BYTE_TOKEN_START)
            confirmed = self._buffer[:-held] if held else self._buffer
            if confirmed:
                self._flush_pending(emitted)
                emitted.append(confirmed)
                self._buffer = self._buffer[len(confirmed) :]
            # If nothing was confirmed the whole buffer is a possible token
            # start: leave the run open and wait for more input.
            break

        if eof:
            # End of stream: release the open run and any leftover text.
            self._flush_pending(emitted)
            if self._buffer:
                emitted.append(self._buffer)
                self._buffer = ""

        return "".join(emitted)
|
|
530
|
+
|
|
531
|
+
|
|
385
532
|
# ---------------------------------------------------------------------------
|
|
386
533
|
# Registry + chain
|
|
387
534
|
# ---------------------------------------------------------------------------
|
|
@@ -391,6 +538,7 @@ KNOWN_FILTERS: dict[str, type[OutputFilter]] = {
|
|
|
391
538
|
StripThinkingFilter.name: StripThinkingFilter,
|
|
392
539
|
StripStopMarkersFilter.name: StripStopMarkersFilter,
|
|
393
540
|
StripToolCallXmlFilter.name: StripToolCallXmlFilter,
|
|
541
|
+
RepairByteFallbackFilter.name: RepairByteFallbackFilter,
|
|
394
542
|
}
|
|
395
543
|
"""Registry of string-name → filter class.
|
|
396
544
|
|
coderouter/routing/budget.py
CHANGED
|
@@ -218,7 +218,7 @@ class BudgetTracker:
|
|
|
218
218
|
totals = state.get("totals", {})
|
|
219
219
|
if isinstance(totals, dict):
|
|
220
220
|
self._totals = {
|
|
221
|
-
k: float(v) for k, v in totals.items() if isinstance(v,
|
|
221
|
+
k: float(v) for k, v in totals.items() if isinstance(v, int | float)
|
|
222
222
|
}
|
|
223
223
|
self._month = current
|
|
224
224
|
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Optional precision token counting (low-memory accuracy track).
|
|
2
|
+
|
|
3
|
+
The core estimator in :mod:`coderouter.token_estimation` uses a
|
|
4
|
+
``char/4`` heuristic that under-counts CJK text badly — which is
|
|
5
|
+
exactly the failure mode that makes the memory-budget guard either
|
|
6
|
+
OOM (under-count) or over-trim (over-count). This module offers an
|
|
7
|
+
opt-in precise backend without breaking the 5-deps invariant.
|
|
8
|
+
|
|
9
|
+
Design
|
|
10
|
+
======
|
|
11
|
+
|
|
12
|
+
* **Optional dependency.** ``tokenizers`` (HuggingFace, Rust core) is
|
|
13
|
+
declared under the ``accuracy`` extra, *not* a core dependency. It is
|
|
14
|
+
imported lazily; if absent, every function falls back to the char/4
|
|
15
|
+
heuristic. Callers always get an ``int``.
|
|
16
|
+
|
|
17
|
+
* **Local files only — no network.** We load tokenizers exclusively via
|
|
18
|
+
``Tokenizer.from_file(<local tokenizer.json>)``. We never call
|
|
19
|
+
``from_pretrained`` or anything that contacts the HuggingFace Hub, so
|
|
20
|
+
this module performs **zero network I/O** and cannot be steered into
|
|
21
|
+
downloading arbitrary content.
|
|
22
|
+
|
|
23
|
+
* **No pickle / no torch.** ``tokenizers`` reads JSON only; we never
|
|
24
|
+
import ``torch`` or ``transformers`` (avoids the pickle-deserialization
|
|
25
|
+
RCE surface).
|
|
26
|
+
|
|
27
|
+
A loaded tokenizer is cached per resolved path so repeated requests
|
|
28
|
+
don't re-parse ``tokenizer.json``.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import threading
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any
|
|
36
|
+
|
|
37
|
+
from coderouter.token_estimation import CHARS_PER_TOKEN_HEURISTIC
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Lazy backend detection
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
_backend_lock = threading.RLock()
|
|
44
|
+
_tokenizer_cache: dict[str, Any] = {}
|
|
45
|
+
_accuracy_available: bool | None = None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def is_accuracy_available() -> bool:
    """Report whether the optional ``tokenizers`` package can be imported.

    The answer is computed once and remembered in ``_accuracy_available``
    until :func:`reset_cache` clears it. Never raises: any failure while
    importing is recorded as ``False`` and callers fall back to the
    heuristic.
    """
    global _accuracy_available
    # Fast path: already probed, no need to take the lock.
    if _accuracy_available is not None:
        return _accuracy_available
    with _backend_lock:
        # Re-check under the lock in case another caller probed meanwhile.
        if _accuracy_available is None:
            try:
                import tokenizers  # noqa: F401 (probe only)
            except Exception:  # pragma: no cover - import failure path
                _accuracy_available = False
            else:
                _accuracy_available = True
        return _accuracy_available
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _load_tokenizer(tokenizer_path: str | Path) -> Any | None:
    """Load and cache a tokenizer from a **local** ``tokenizer.json``.

    Strictly local: only ``Tokenizer.from_file`` is used, never anything
    that contacts the Hub.

    Caching rules:

    * A successfully loaded tokenizer is cached under the resolved path.
    * A file that exists but fails to load is cached as ``None`` so it is
      not re-parsed on every request.
    * A path that is missing (or cannot be inspected) is **not** cached, so
      a ``tokenizer.json`` installed later is picked up on the next call
      without needing :func:`reset_cache`.

    Parameters
    ----------
    tokenizer_path
        Filesystem path of the ``tokenizer.json`` to load.

    Returns
    -------
    Any | None
        The tokenizer object, or ``None`` when the backend is unavailable,
        the path is missing / unreadable, or the file fails to parse.
        Never raises.
    """
    if not is_accuracy_available():
        return None
    p = Path(tokenizer_path)
    try:
        key = str(p.resolve()) if p.exists() else str(p)
    except (OSError, RuntimeError, ValueError):
        # The path could not even be inspected (e.g. permission denied on a
        # parent directory). Treat it as missing rather than letting the
        # error escape to callers that are promised a fallback.
        return None
    with _backend_lock:
        if key in _tokenizer_cache:
            return _tokenizer_cache[key]
        try:
            present = p.is_file()
        except OSError:
            present = False
        if not present:
            # Deliberately not cached: the file may appear later.
            return None
        try:
            from tokenizers import Tokenizer  # local import

            tok = Tokenizer.from_file(str(p))  # local file only, no network
        except Exception:
            # Broad on purpose: any backend failure degrades to the heuristic.
            tok = None
        _tokenizer_cache[key] = tok
        return tok
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def reset_cache() -> None:
    """Discard all cached tokenizers and the memoised backend probe.

    The next :func:`is_accuracy_available` call re-imports ``tokenizers``
    and the next load re-reads its file. Mainly for tests.
    """
    global _accuracy_available
    with _backend_lock:
        _accuracy_available = None
        _tokenizer_cache.clear()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# Public API
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _heuristic(text: str) -> int:
    """Estimate tokens as ``len(text)`` floor-divided by ``CHARS_PER_TOKEN_HEURISTIC``."""
    char_count = len(text)
    return char_count // CHARS_PER_TOKEN_HEURISTIC
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def count_tokens(text: str, *, tokenizer_path: str | Path | None = None) -> int:
    """Count the tokens in ``text``.

    When ``tokenizer_path`` names a loadable local ``tokenizer.json`` and
    the optional ``tokenizers`` dependency is installed, the text is encoded
    with that tokenizer and the number of ids is returned. In every other
    case — no path given, tokenizer unavailable, or encoding fails — the
    character-count heuristic is used instead.

    Parameters
    ----------
    text
        Text to measure; empty text counts as ``0``.
    tokenizer_path
        Optional path to a local ``tokenizer.json``.

    Returns
    -------
    int
        A non-negative token count. Backend problems never raise.
    """
    if not text:
        return 0
    tok = None if tokenizer_path is None else _load_tokenizer(tokenizer_path)
    if tok is None:
        return _heuristic(text)
    try:
        return len(tok.encode(text).ids)
    except Exception:  # pragma: no cover - encode failure path
        return _heuristic(text)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
__all__ = [
|
|
133
|
+
"count_tokens",
|
|
134
|
+
"is_accuracy_available",
|
|
135
|
+
"reset_cache",
|
|
136
|
+
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: coderouter-cli
|
|
3
|
-
Version: 2.5.
|
|
3
|
+
Version: 2.5.4
|
|
4
4
|
Summary: Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible.
|
|
5
5
|
Project-URL: Homepage, https://github.com/zephel01/CodeRouter
|
|
6
6
|
Project-URL: Repository, https://github.com/zephel01/CodeRouter
|
|
@@ -27,6 +27,8 @@ Requires-Dist: httpx>=0.27.0
|
|
|
27
27
|
Requires-Dist: pydantic>=2.9.0
|
|
28
28
|
Requires-Dist: pyyaml>=6.0.2
|
|
29
29
|
Requires-Dist: uvicorn[standard]>=0.32.0
|
|
30
|
+
Provides-Extra: accuracy
|
|
31
|
+
Requires-Dist: tokenizers>=0.20; extra == 'accuracy'
|
|
30
32
|
Provides-Extra: dev
|
|
31
33
|
Requires-Dist: mypy>=1.13.0; extra == 'dev'
|
|
32
34
|
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
@@ -37,6 +39,8 @@ Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
|
37
39
|
Requires-Dist: types-pyyaml>=6.0.12; extra == 'dev'
|
|
38
40
|
Provides-Extra: doctor
|
|
39
41
|
Requires-Dist: ruamel-yaml>=0.18.6; extra == 'doctor'
|
|
42
|
+
Provides-Extra: repair
|
|
43
|
+
Requires-Dist: json-repair>=0.30; extra == 'repair'
|
|
40
44
|
Description-Content-Type: text/markdown
|
|
41
45
|
|
|
42
46
|
<h1 align="center">CodeRouter</h1>
|
|
@@ -7,9 +7,12 @@ coderouter/doctor.py,sha256=2luNk6BHSRvpQStJnHcqzNvNi-SKdOuKV0WZdorZhVk,82854
|
|
|
7
7
|
coderouter/doctor_apply.py,sha256=r_J6xbu5-HivofPNriw4_vjNYs_VRs7GsGTS0oMEX10,24209
|
|
8
8
|
coderouter/env_security.py,sha256=FEBZnXfJ0xE39kmMMn39zk0W_DRRnmcB_REmP9f4xWo,14796
|
|
9
9
|
coderouter/errors.py,sha256=Xmq67lheyw8iv3Ox39jh2c4tvNI5RcUR4QkoxVDN6l4,1130
|
|
10
|
+
coderouter/gguf_introspect.py,sha256=FZO14STLSp94Rfo5AInGwYUOpfjiXOW6CH5RiczTWDE,9514
|
|
11
|
+
coderouter/hardware.py,sha256=gn3_9qbVcGRR81yKMn1lJE_8-YDRau0LxIH_M-f7pxE,8356
|
|
10
12
|
coderouter/logging.py,sha256=U7QiGRaoQXTSGijc-jV9TebnbbzrD-snfnoZy73Nvwo,52737
|
|
11
|
-
coderouter/output_filters.py,sha256=
|
|
13
|
+
coderouter/output_filters.py,sha256=0ry_rPiS_kC-FnHgaNVP6v7e6Al2djxzu9vBzZ8kEkE,25314
|
|
12
14
|
coderouter/token_estimation.py,sha256=1Ai1uT68hahpyr4LBhNyVRGq7y4yXItd6J4k5ApGX7M,5995
|
|
15
|
+
coderouter/token_estimation_accurate.py,sha256=GTfzrBVnvAGjeVzmzAeUdOYZvWZKLAxcxPpFiJGlzjk,4609
|
|
13
16
|
coderouter/adapters/__init__.py,sha256=7dIDSZ-FE_0iSqLSDc_lK1idRdLTKcM2hP9tCJipgPI,463
|
|
14
17
|
coderouter/adapters/anthropic_native.py,sha256=qfdjxy4YyLt-0Fj7hUYn1oi1SFjEEbSvpaRBUC2hMf4,21903
|
|
15
18
|
coderouter/adapters/base.py,sha256=H4uM6r_-95Xs1hCM_X4Zv3tq-xN3cXWLj83F-QjPNLw,8265
|
|
@@ -29,6 +32,7 @@ coderouter/guards/context_budget.py,sha256=moWulVr5NtVci13vXxS0ucV4EvX2b7tbA1W1d
|
|
|
29
32
|
coderouter/guards/continuous_probe.py,sha256=AKNMbJ7hUJG-FDoU160BCbSEQQUyw0hBxFYMTaBZg84,11681
|
|
30
33
|
coderouter/guards/drift_actions.py,sha256=A6pY5CR480Ct5rCVyjlBvjPFVc93eu_r5qcUpK9mWKc,3602
|
|
31
34
|
coderouter/guards/drift_detection.py,sha256=vlepNw_GjvmpHZHTkMS5JM7XKxHaRxzjj_GfopRa1M0,13489
|
|
35
|
+
coderouter/guards/memory_budget.py,sha256=_bRtusk4AwrU781wVXW32OFU7zD2FXxOwTb7yGqPJqA,8278
|
|
32
36
|
coderouter/guards/memory_pressure.py,sha256=mul1KXO9oE1i424cs92Sk6uzoRrV6Seck2Lk3bu-w68,7903
|
|
33
37
|
coderouter/guards/self_healing.py,sha256=_fT_EJvTTp5VSi-qAP93J_1LkgPK5jkzsyrUHdKC45A,13853
|
|
34
38
|
coderouter/guards/tool_loop.py,sha256=EzeMcmU7BLeTW2jsRVevU81l5rhWcn1oUr7EpzgXjVM,15209
|
|
@@ -49,7 +53,7 @@ coderouter/plugins/registry.py,sha256=Tx0QHJHozZ5LTUliGylBdNVcdzHTBV0nedCUwGlbLM
|
|
|
49
53
|
coderouter/routing/__init__.py,sha256=g2vhutbozRx5QBThReqwPN3imk5qXdpDiaogILd3IRc,257
|
|
50
54
|
coderouter/routing/adaptive.py,sha256=G2o377twGSjbUh65wiIFx6klnpFGjsD_nI3oDvcBwhY,21257
|
|
51
55
|
coderouter/routing/auto_router.py,sha256=4_sQR0ztSED9FgQSvQqgqSiydyQVY_qOSRvwyZ5BfRc,12909
|
|
52
|
-
coderouter/routing/budget.py,sha256=
|
|
56
|
+
coderouter/routing/budget.py,sha256=PblmVKJGs_BwNa9uDHAA8hmZ4XIVKv38mHAeU0V3OMs,8451
|
|
53
57
|
coderouter/routing/capability.py,sha256=DCDmiQ-78dkYonCM1WQBCMf6e6XI6VIv_cnuz9hdWT0,18443
|
|
54
58
|
coderouter/routing/fallback.py,sha256=P3f6Yna1EGnLAT-ZS5ADrrZ-qRWc-M5xvwEuan4rmcs,104568
|
|
55
59
|
coderouter/state/__init__.py,sha256=XoGcPmmBQSiZWML2S0juSveQ78xfhtdeCliNnVyzu7E,1088
|
|
@@ -62,8 +66,8 @@ coderouter/translation/__init__.py,sha256=PYXN7XVEwpG1uC8RLy6fvnGbzEZhhrEuUapH8I
|
|
|
62
66
|
coderouter/translation/anthropic.py,sha256=JpvIWNXHUPVqOGvps7o_6ZADhXuJuvpU7RdMqQFtwwM,6421
|
|
63
67
|
coderouter/translation/convert.py,sha256=-qyzFzmmr9hhQV6_Sg75kJnvCZvHe3n7vRdaZtk_JqQ,47269
|
|
64
68
|
coderouter/translation/tool_repair.py,sha256=Ok2PF947Liegc5oaytfptv5MWMkpfJYQie-zdP1y3cY,9946
|
|
65
|
-
coderouter_cli-2.5.
|
|
66
|
-
coderouter_cli-2.5.
|
|
67
|
-
coderouter_cli-2.5.
|
|
68
|
-
coderouter_cli-2.5.
|
|
69
|
-
coderouter_cli-2.5.
|
|
69
|
+
coderouter_cli-2.5.4.dist-info/METADATA,sha256=cpwFo9rILUr99bq2K1bRH62s-hhVQqmed4psTvG-XFM,11674
|
|
70
|
+
coderouter_cli-2.5.4.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
71
|
+
coderouter_cli-2.5.4.dist-info/entry_points.txt,sha256=-dnLfD1YZ2WjH2zSdNCvlO65wYltM9bsHt9Fhg3yGss,51
|
|
72
|
+
coderouter_cli-2.5.4.dist-info/licenses/LICENSE,sha256=wkEzoR86jFw33jvfOHjULqmkGEfxTFMgMaJnpR8mPRw,1065
|
|
73
|
+
coderouter_cli-2.5.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|