coderouter-cli 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/__init__.py +17 -0
- coderouter/__main__.py +6 -0
- coderouter/adapters/__init__.py +23 -0
- coderouter/adapters/anthropic_native.py +502 -0
- coderouter/adapters/base.py +220 -0
- coderouter/adapters/openai_compat.py +395 -0
- coderouter/adapters/registry.py +17 -0
- coderouter/cli.py +345 -0
- coderouter/cli_stats.py +751 -0
- coderouter/config/__init__.py +10 -0
- coderouter/config/capability_registry.py +339 -0
- coderouter/config/env_file.py +295 -0
- coderouter/config/loader.py +73 -0
- coderouter/config/schemas.py +515 -0
- coderouter/data/__init__.py +7 -0
- coderouter/data/model-capabilities.yaml +86 -0
- coderouter/doctor.py +1596 -0
- coderouter/env_security.py +434 -0
- coderouter/errors.py +29 -0
- coderouter/ingress/__init__.py +5 -0
- coderouter/ingress/anthropic_routes.py +205 -0
- coderouter/ingress/app.py +144 -0
- coderouter/ingress/dashboard_routes.py +493 -0
- coderouter/ingress/metrics_routes.py +92 -0
- coderouter/ingress/openai_routes.py +153 -0
- coderouter/logging.py +315 -0
- coderouter/metrics/__init__.py +39 -0
- coderouter/metrics/collector.py +471 -0
- coderouter/metrics/prometheus.py +221 -0
- coderouter/output_filters.py +407 -0
- coderouter/routing/__init__.py +13 -0
- coderouter/routing/auto_router.py +244 -0
- coderouter/routing/capability.py +285 -0
- coderouter/routing/fallback.py +611 -0
- coderouter/translation/__init__.py +57 -0
- coderouter/translation/anthropic.py +204 -0
- coderouter/translation/convert.py +1291 -0
- coderouter/translation/tool_repair.py +236 -0
- coderouter_cli-1.7.0.dist-info/METADATA +509 -0
- coderouter_cli-1.7.0.dist-info/RECORD +43 -0
- coderouter_cli-1.7.0.dist-info/WHEEL +4 -0
- coderouter_cli-1.7.0.dist-info/entry_points.txt +2 -0
- coderouter_cli-1.7.0.dist-info/licenses/LICENSE +21 -0
coderouter/doctor.py
ADDED
|
@@ -0,0 +1,1596 @@
|
|
|
1
|
+
"""`coderouter doctor --check-model <provider>` — per-provider capability probe.
|
|
2
|
+
|
|
3
|
+
Purpose (v0.7-B)
|
|
4
|
+
----------------
|
|
5
|
+
Run a small set of live probes against a single provider from
|
|
6
|
+
``providers.yaml`` and compare the observed behavior against the
|
|
7
|
+
declarations in ``providers.yaml`` + ``model-capabilities.yaml`` (v0.7-A
|
|
8
|
+
registry). Emit a per-probe verdict and, on mismatch, a copy-paste-able
|
|
9
|
+
YAML patch that the user can drop into either file.
|
|
10
|
+
|
|
11
|
+
Motivated by the 5 silent-fail symptoms enumerated in plan.md §9.4:
|
|
12
|
+
|
|
13
|
+
1. Empty / nonsensical responses → num_ctx probe (v1.0-B direct detection
|
|
14
|
+
via canary echo-back) + streaming probe
|
|
15
|
+
(v1.0-C — output-side num_predict cap)
|
|
16
|
+
+ basic-chat probe
|
|
17
|
+
2. Claude Code "cannot read files" → tool_calls probe (symptom 2)
|
|
18
|
+
3. Raw <think> tags exposed in the UI → thinking probe + reasoning-leak
|
|
19
|
+
content-marker detection (v1.0-A)
|
|
20
|
+
4. First request after startup always fails → auth + model-not-found probe (symptom 4)
|
|
21
|
+
5. Every fallback fails → auth probe (symptom 5)
|
|
22
|
+
|
|
23
|
+
Exit-code contract (CI-friendly)
|
|
24
|
+
--------------------------------
|
|
25
|
+
0 = all probes match the registry / providers.yaml declarations.
|
|
26
|
+
2 = at least one probe returned NEEDS_TUNING (structural mismatch;
|
|
27
|
+
the user should apply the emitted YAML patch).
|
|
28
|
+
1 = at least one probe could not run (AUTH_FAIL / UNSUPPORTED /
|
|
29
|
+
TRANSPORT_ERROR). When the auth probe fails, subsequent probes
|
|
30
|
+
are marked SKIP and do not influence the exit code — the auth
|
|
31
|
+
failure dominates.
|
|
32
|
+
|
|
33
|
+
Non-destructive contract
|
|
34
|
+
------------------------
|
|
35
|
+
Probes must not induce tool-side-effects. The tool-calls probe declares
|
|
36
|
+
a fake ``echo`` tool with no real-world meaning; even if the caller
|
|
37
|
+
later re-used the response (they won't), ``echo`` cannot trigger
|
|
38
|
+
anything on the caller's side. Each probe is minimized to ≤ ~100
|
|
39
|
+
tokens in / ≤ ~20 tokens out.
|
|
40
|
+
|
|
41
|
+
Layering
|
|
42
|
+
--------
|
|
43
|
+
Probes issue raw httpx calls rather than going through
|
|
44
|
+
``OpenAICompatAdapter`` / ``AnthropicAdapter`` because:
|
|
45
|
+
|
|
46
|
+
* The reasoning-leak probe needs to see the raw upstream body BEFORE
|
|
47
|
+
the adapter's v0.5-C passive strip runs.
|
|
48
|
+
* The thinking probe for ``kind: anthropic`` needs to send an
|
|
49
|
+
Anthropic wire-format body directly rather than the reverse-
|
|
50
|
+
translated ChatRequest shape.
|
|
51
|
+
* The tool-calls probe wants to observe the raw ``tool_calls`` field
|
|
52
|
+
vs the raw text content before any repair pass.
|
|
53
|
+
|
|
54
|
+
Keeping the HTTP plumbing inline in this module (~one helper, no
|
|
55
|
+
adapter dependency) makes the probe behavior stable against adapter-
|
|
56
|
+
layer changes and keeps the test surface narrow (``httpx_mock`` +
|
|
57
|
+
assertions on the probe output).
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
from __future__ import annotations
|
|
61
|
+
|
|
62
|
+
import asyncio
|
|
63
|
+
import json
|
|
64
|
+
from collections.abc import Sequence
|
|
65
|
+
from dataclasses import dataclass, field
|
|
66
|
+
from enum import StrEnum
|
|
67
|
+
from typing import Any
|
|
68
|
+
|
|
69
|
+
import httpx
|
|
70
|
+
|
|
71
|
+
from coderouter.config.capability_registry import (
|
|
72
|
+
CapabilityRegistry,
|
|
73
|
+
ResolvedCapabilities,
|
|
74
|
+
)
|
|
75
|
+
from coderouter.config.loader import resolve_api_key
|
|
76
|
+
from coderouter.config.schemas import CodeRouterConfig, ProviderConfig
|
|
77
|
+
from coderouter.output_filters import DEFAULT_STOP_MARKERS
|
|
78
|
+
from coderouter.routing.capability import get_default_registry
|
|
79
|
+
from coderouter.translation.tool_repair import repair_tool_calls_in_text
|
|
80
|
+
|
|
81
|
+
__all__ = [
|
|
82
|
+
"DoctorReport",
|
|
83
|
+
"ProbeResult",
|
|
84
|
+
"ProbeVerdict",
|
|
85
|
+
"check_model",
|
|
86
|
+
"exit_code_for",
|
|
87
|
+
"format_report",
|
|
88
|
+
"run_check_model_sync",
|
|
89
|
+
]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# Result types
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class ProbeVerdict(StrEnum):
    """Per-probe verdict.

    Mapping to exit code (see :func:`exit_code_for`):
        OK              → contributes 0
        SKIP            → contributes 0 (not applicable or blocked by auth)
        NEEDS_TUNING    → contributes 2 (structural mismatch)
        UNSUPPORTED     → contributes 1 (model not found / feature absent)
        AUTH_FAIL       → contributes 1 (401/403 from upstream)
        TRANSPORT_ERROR → contributes 1 (timeout / 5xx / network)

    When a report mixes verdicts, :func:`exit_code_for` lets any
    "contributes 1" verdict take precedence over NEEDS_TUNING.
    """

    # Non-failing verdicts.
    OK = "ok"
    SKIP = "skip"
    # Structural mismatch between declaration and observed behavior.
    NEEDS_TUNING = "needs_tuning"
    # Verdicts that exit_code_for() treats as blockers.
    UNSUPPORTED = "unsupported"
    AUTH_FAIL = "auth_fail"
    TRANSPORT_ERROR = "transport_error"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
class ProbeResult:
    """Outcome of a single probe.

    ``suggested_patch`` is a YAML snippet the user can copy-paste into
    the file named by ``target_file``, which is either
    ``"providers.yaml"`` or ``"model-capabilities.yaml"``. The probe picks
    whichever is the more specific fix: a per-provider opt-in wins over a
    per-glob registry rule when only one provider is affected, while a
    glob-level patch is preferable when the mismatch looks like a
    whole-family pattern. Because doctor probes only one provider at a
    time, ``providers.yaml`` is always the safe suggestion for a
    single-provider fix.
    """

    # Short probe identifier (e.g. "auth+basic-chat").
    name: str
    # Verdict for this probe; drives the CLI exit code.
    verdict: ProbeVerdict
    # Human-readable explanation of what was observed.
    detail: str
    # Optional copy-paste YAML remediation; None when no patch applies.
    suggested_patch: str | None = None
    target_file: str | None = None  # "providers.yaml" or "model-capabilities.yaml"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class DoctorReport:
    """Aggregate report for a single ``--check-model`` invocation."""

    # Name of the provider that was probed.
    provider_name: str
    # Configuration object of the probed provider.
    provider: ProviderConfig
    # Capabilities resolved for this provider (presumably from the
    # providers.yaml declaration plus the capability registry — confirm
    # against the code that builds the report).
    resolved_caps: ResolvedCapabilities
    # Per-probe outcomes; each dataclass instance gets its own fresh list.
    results: list[ProbeResult] = field(default_factory=list)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def exit_code_for(report: DoctorReport) -> int:
    """Translate a report's verdicts into the CLI exit code.

    Returns:
        ``1`` if any probe ended in AUTH_FAIL, UNSUPPORTED or
        TRANSPORT_ERROR; otherwise ``2`` if any probe ended in
        NEEDS_TUNING; otherwise ``0`` (see :class:`ProbeVerdict`).
    """
    blocking = (
        ProbeVerdict.AUTH_FAIL,
        ProbeVerdict.UNSUPPORTED,
        ProbeVerdict.TRANSPORT_ERROR,
    )
    verdicts = [result.verdict for result in report.results]
    # Blockers outrank tuning advice: a probe that could not run makes the
    # remaining verdicts unreliable.
    if any(verdict in blocking for verdict in verdicts):
        return 1
    if any(verdict == ProbeVerdict.NEEDS_TUNING for verdict in verdicts):
        return 2
    return 0
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
# HTTP helpers
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _openai_chat_url(provider: ProviderConfig) -> str:
|
|
174
|
+
base = str(provider.base_url).rstrip("/")
|
|
175
|
+
return f"{base}/chat/completions"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _anthropic_messages_url(provider: ProviderConfig) -> str:
|
|
179
|
+
base = str(provider.base_url).rstrip("/")
|
|
180
|
+
return f"{base}/v1/messages"
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _openai_headers(provider: ProviderConfig) -> dict[str, str]:
    """Build request headers for an OpenAI-compatible upstream.

    The ``Authorization: Bearer …`` header is added only when
    ``resolve_api_key`` yields a truthy key for ``provider.api_key_env``.
    """
    token = resolve_api_key(provider.api_key_env)
    base_headers = {
        "Content-Type": "application/json",
        "User-Agent": "CodeRouter-doctor/0.7",
    }
    if not token:
        return base_headers
    return {**base_headers, "Authorization": f"Bearer {token}"}
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _anthropic_headers(provider: ProviderConfig) -> dict[str, str]:
    """Build request headers for an Anthropic-native upstream.

    Always sends the pinned ``anthropic-version`` header; ``x-api-key`` is
    added only when ``resolve_api_key`` yields a truthy key for
    ``provider.api_key_env``.
    """
    token = resolve_api_key(provider.api_key_env)
    base_headers = {
        "Content-Type": "application/json",
        "User-Agent": "CodeRouter-doctor/0.7",
        "anthropic-version": "2023-06-01",
    }
    if not token:
        return base_headers
    return {**base_headers, "x-api-key": token}
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
async def _http_post_json(
    url: str,
    *,
    headers: dict[str, str],
    body: dict[str, Any],
    timeout: float,
) -> tuple[int | None, dict[str, Any] | None, str]:
    """POST JSON. Returns (status_or_None, parsed_or_None, raw_text_or_error).

    Args:
        url: Absolute endpoint URL.
        headers: Request headers to send.
        body: JSON-serializable request payload.
        timeout: Timeout handed to ``httpx.AsyncClient``.

    Returns:
        ``status=None`` signals a failure raised as ``httpx.HTTPError``
        (the third element then carries ``"transport error: …"``).
        ``parsed=None`` with a non-None status means the body was not a
        JSON *object* — either it was not parseable JSON at all, or it was
        valid JSON of another type (list, string, number). Callers index
        into ``parsed`` with ``.get``, so only a dict is ever handed back;
        the raw text is still returned for diagnostics.
    """
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=body, headers=headers)
    except httpx.HTTPError as exc:
        return None, None, f"transport error: {exc}"
    try:
        parsed = resp.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, so this covers it.
        return resp.status_code, None, resp.text
    if not isinstance(parsed, dict):
        # Valid JSON but not an object: honor the declared return type
        # instead of leaking a list/str/number to callers.
        return resp.status_code, None, resp.text
    return resp.status_code, parsed, resp.text
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
async def _http_stream_sse(
    url: str,
    *,
    headers: dict[str, str],
    body: dict[str, Any],
    timeout: float,
) -> tuple[int | None, list[dict[str, Any]], bool, str]:
    """POST a streaming request and consume the SSE stream.

    Returns ``(status, chunks, saw_done, error_text)``.

    * ``status=None`` signals a failure raised as ``httpx.HTTPError``;
      ``error_text`` carries the reason.
    * ``chunks`` are the parsed JSON *objects* from ``data: <json>`` lines,
      in observed order. ``[DONE]``, malformed payloads and JSON values
      that are not objects are not included.
    * ``saw_done`` is True iff the terminator line ``data: [DONE]`` was
      observed. Strict SSE clients require it; many upstreams omit it
      and rely on connection close instead.
    * On HTTP error (status >= 400) the body is read once and its first
      400 characters are returned in ``error_text``; ``chunks`` is empty.

    Mirrors :func:`_http_post_json`'s error handling shape so the caller
    can branch on ``status`` the same way.
    """
    try:
        async with (
            httpx.AsyncClient(timeout=timeout) as client,
            client.stream("POST", url, json=body, headers=headers) as resp,
        ):
            status = resp.status_code
            if status >= 400:
                raw = await resp.aread()
                return (
                    status,
                    [],
                    False,
                    raw.decode("utf-8", errors="replace")[:400],
                )
            chunks: list[dict[str, Any]] = []
            saw_done = False
            async for line in resp.aiter_lines():
                # Only ``data:`` fields matter; blank separators, ``:``
                # comments and any other SSE field fail this one check.
                if not line.startswith("data:"):
                    continue
                payload = line[len("data:") :].strip()
                if payload == "[DONE]":
                    saw_done = True
                    continue
                try:
                    event = json.loads(payload)
                except json.JSONDecodeError:
                    continue  # skip malformed chunks, keep consuming
                if isinstance(event, dict):
                    # Callers treat every chunk as a mapping; drop bare
                    # scalars/lists rather than crash them later.
                    chunks.append(event)
            return status, chunks, saw_done, ""
    except httpx.HTTPError as exc:
        return None, [], False, f"transport error: {exc}"
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# ---------------------------------------------------------------------------
|
|
288
|
+
# Patch emitters
|
|
289
|
+
#
|
|
290
|
+
# Kept as tiny helpers rather than a Jinja dance — the surface area is too
|
|
291
|
+
# small to justify templating, and exact indentation in the emitted YAML
|
|
292
|
+
# matters for copy-paste fidelity.
|
|
293
|
+
# ---------------------------------------------------------------------------
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _patch_providers_yaml_capability(provider_name: str, key: str, value: bool) -> str:
|
|
297
|
+
"""Emit a providers.yaml patch that flips ``capabilities.<key>``."""
|
|
298
|
+
val = "true" if value else "false"
|
|
299
|
+
return (
|
|
300
|
+
"# providers.yaml — update the entry for "
|
|
301
|
+
f"{provider_name!r}:\n"
|
|
302
|
+
"providers:\n"
|
|
303
|
+
f" - name: {provider_name}\n"
|
|
304
|
+
" # ... existing fields ...\n"
|
|
305
|
+
" capabilities:\n"
|
|
306
|
+
f" {key}: {val}\n"
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _patch_model_capabilities_yaml(*, match: str, kind: str, key: str, value: bool) -> str:
|
|
311
|
+
"""Emit a model-capabilities.yaml rule that declares ``<key>=<value>``."""
|
|
312
|
+
val = "true" if value else "false"
|
|
313
|
+
return (
|
|
314
|
+
"# ~/.coderouter/model-capabilities.yaml — append under `rules:`:\n"
|
|
315
|
+
"rules:\n"
|
|
316
|
+
f" - match: {match!r}\n"
|
|
317
|
+
f" kind: {kind}\n"
|
|
318
|
+
" capabilities:\n"
|
|
319
|
+
f" {key}: {val}\n"
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _patch_providers_yaml_output_filters(provider_name: str, filters: list[str]) -> str:
|
|
324
|
+
"""v1.0-A: Emit a providers.yaml patch adding/extending ``output_filters``.
|
|
325
|
+
|
|
326
|
+
Lists the filters verbatim so copy-paste yields a valid YAML list.
|
|
327
|
+
The comment block above the stanza hints that this is additive with
|
|
328
|
+
any existing filter chain — users with a bespoke chain should merge
|
|
329
|
+
rather than replace.
|
|
330
|
+
"""
|
|
331
|
+
items = "\n".join(f" - {f}" for f in filters)
|
|
332
|
+
return (
|
|
333
|
+
"# providers.yaml — update the entry for "
|
|
334
|
+
f"{provider_name!r} (merge if a chain already exists):\n"
|
|
335
|
+
"providers:\n"
|
|
336
|
+
f" - name: {provider_name}\n"
|
|
337
|
+
" # ... existing fields ...\n"
|
|
338
|
+
" output_filters:\n"
|
|
339
|
+
f"{items}\n"
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _patch_providers_yaml_num_ctx(provider_name: str, desired_ctx: int = 32768) -> str:
|
|
344
|
+
"""v1.0-B: Emit a providers.yaml patch setting ``extra_body.options.num_ctx``.
|
|
345
|
+
|
|
346
|
+
The path is Ollama-specific: ``extra_body`` is shallow-merged into the
|
|
347
|
+
outbound body by the openai_compat adapter, and Ollama exposes context
|
|
348
|
+
length via a nested ``options`` object. 32768 is a practical default
|
|
349
|
+
for Claude Code's tool-heavy system prompts (see plan.md §9.4 symptom
|
|
350
|
+
#1) — operators can dial it down for memory-bound hosts.
|
|
351
|
+
"""
|
|
352
|
+
return (
|
|
353
|
+
"# providers.yaml — update the entry for "
|
|
354
|
+
f"{provider_name!r} (merge into any existing extra_body):\n"
|
|
355
|
+
"providers:\n"
|
|
356
|
+
f" - name: {provider_name}\n"
|
|
357
|
+
" # ... existing fields ...\n"
|
|
358
|
+
" extra_body:\n"
|
|
359
|
+
" options:\n"
|
|
360
|
+
f" num_ctx: {desired_ctx}\n"
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _patch_providers_yaml_num_predict(provider_name: str, desired_predict: int = 4096) -> str:
|
|
365
|
+
"""v1.0-C: Emit a providers.yaml patch setting ``extra_body.options.num_predict``.
|
|
366
|
+
|
|
367
|
+
Sibling of :func:`_patch_providers_yaml_num_ctx` — same ``extra_body.options``
|
|
368
|
+
path, but controls the **output-side** token cap rather than the input-side
|
|
369
|
+
window. Ollama's default for ``num_predict`` is -1 (unlimited) in recent
|
|
370
|
+
builds, but older builds and some Ollama-compat servers cap at 128 or 256
|
|
371
|
+
which silently truncates Claude Code's longer completions mid-response.
|
|
372
|
+
4096 is a practical cap that covers ~95 % of Claude Code completions
|
|
373
|
+
without risking runaway generations; operators can set to -1 for uncapped.
|
|
374
|
+
"""
|
|
375
|
+
return (
|
|
376
|
+
"# providers.yaml — update the entry for "
|
|
377
|
+
f"{provider_name!r} (merge into any existing extra_body):\n"
|
|
378
|
+
"providers:\n"
|
|
379
|
+
f" - name: {provider_name}\n"
|
|
380
|
+
" # ... existing fields ...\n"
|
|
381
|
+
" extra_body:\n"
|
|
382
|
+
" options:\n"
|
|
383
|
+
f" num_predict: {desired_predict}\n"
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# ---------------------------------------------------------------------------
|
|
388
|
+
# Probes
|
|
389
|
+
# ---------------------------------------------------------------------------
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# v1.0-B: num_ctx probe constants.
#
# We embed a short, unusual canary token at the very beginning of the user
# prompt, follow it with enough filler sentences to exceed Ollama's default
# 2048-token context window, and ask the model to echo the canary back.
# Because Ollama silently drops the BEGINNING of the prompt when it
# overflows `num_ctx` (not the end), a model running at the default cannot
# know what the canary was and fails to echo it. When the operator has
# correctly bumped `num_ctx` via ``extra_body.options.num_ctx``, the canary
# survives and the model replies with it.
#
# The padding sentence is ~16 tokens; 300 repeats ≈ 4800 tokens — well
# beyond 2048 yet still cheap enough to issue once per doctor invocation.
# ZEBRA-MOON-847 is chosen to be hyphenated and all-caps so it does not
# appear in natural text; a model is very unlikely to produce it without
# having seen it in the prompt.
_NUM_CTX_PROBE_CANARY = "ZEBRA-MOON-847"
_NUM_CTX_PROBE_PADDING_SENTENCE = (
    "The quick brown fox jumps over the lazy dog near the river bank today. "
)
_NUM_CTX_PROBE_PADDING_REPEATS = 300
# Threshold below which a declared ``num_ctx`` is still considered "too
# tight for Claude Code's tool-heavy prompts" — the Claude Code system
# prompt + tool roster alone is routinely north of 15k tokens. 8192 leaves
# headroom for small user messages without enabling a corner case where
# the probe happens to fit (our padding is only ~5k tokens) but a real
# Claude Code session still truncates.
_NUM_CTX_ADEQUATE_THRESHOLD = 8192

# v1.0-C: streaming probe constants.
#
# A short, deterministic task that forces the model to emit ~60-80 output
# chars in a predictable shape. Counting 1..30 one-per-line yields
# "1\n2\n...30\n" = ~80 chars; any cap below the prompt's intent shows up
# as a ``finish_reason: length`` with heavily-truncated content. The prompt
# is kept well under ``num_ctx`` so a stray ``num_ctx`` issue does not
# masquerade as a ``num_predict`` issue (num_ctx probe runs first anyway).
_STREAMING_PROBE_USER_PROMPT = (
    "Count from 1 to 30, one number per line. Output only the numbers, nothing else."
)
# Minimum content length we require to call the stream "not prematurely
# truncated". "1\n2\n...\n30" is ~80 chars, so 40 chars is the halfway
# mark by length (reached at about 1..16); anything shorter is already
# obviously-truncated territory.
_STREAMING_PROBE_MIN_EXPECTED_CHARS = 40
# Default ``num_predict`` suggested in the emitted patch. -1 would be
# optimal (uncapped) but "4096" communicates intent more clearly to
# operators unfamiliar with Ollama's sentinel value, and covers Claude
# Code completions comfortably while still protecting against runaway
# generations on broken models.
_STREAMING_PROBE_NUM_PREDICT_DEFAULT = 4096
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _is_ollama_like(provider: ProviderConfig) -> bool:
|
|
445
|
+
"""Return True iff num_ctx truncation is plausible for this provider.
|
|
446
|
+
|
|
447
|
+
Two signals fire:
|
|
448
|
+
* base_url uses the canonical Ollama port ``11434``. This is the
|
|
449
|
+
off-the-shelf install; operators who moved it still trigger the
|
|
450
|
+
second signal.
|
|
451
|
+
* ``extra_body.options.num_ctx`` is declared. Only Ollama honors
|
|
452
|
+
this path, so an operator who wrote the field is declaring — by
|
|
453
|
+
construction — that the upstream is Ollama-shape.
|
|
454
|
+
|
|
455
|
+
Deliberately does NOT fire on llama.cpp (port 8080), OpenRouter,
|
|
456
|
+
Together, Groq, or Anthropic native — those upstreams either don't
|
|
457
|
+
truncate silently (they hard-error on over-long prompts) or use a
|
|
458
|
+
different context-length knob (``max_tokens``, ``n_ctx`` at server
|
|
459
|
+
start, etc.) that isn't reachable from providers.yaml.
|
|
460
|
+
"""
|
|
461
|
+
if provider.kind != "openai_compat":
|
|
462
|
+
return False
|
|
463
|
+
if ":11434" in str(provider.base_url):
|
|
464
|
+
return True
|
|
465
|
+
options = provider.extra_body.get("options")
|
|
466
|
+
return isinstance(options, dict) and "num_ctx" in options
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def _declared_num_ctx(provider: ProviderConfig) -> int | None:
|
|
470
|
+
"""Return the provider's declared ``extra_body.options.num_ctx`` if any."""
|
|
471
|
+
options = provider.extra_body.get("options")
|
|
472
|
+
if not isinstance(options, dict):
|
|
473
|
+
return None
|
|
474
|
+
val = options.get("num_ctx")
|
|
475
|
+
return val if isinstance(val, int) else None
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
_PROBE_BASIC_USER_PROMPT = "Reply with exactly the single word: PONG"
|
|
479
|
+
_PROBE_TOOLS_USER_PROMPT = (
|
|
480
|
+
"You have one tool named `echo`. Call it with the argument "
|
|
481
|
+
'`{"message": "probe"}`. Do not reply with any text — only the tool call.'
|
|
482
|
+
)
|
|
483
|
+
_PROBE_TOOL_SPEC_OPENAI = {
|
|
484
|
+
"type": "function",
|
|
485
|
+
"function": {
|
|
486
|
+
"name": "echo",
|
|
487
|
+
"description": (
|
|
488
|
+
"Test tool used by CodeRouter's doctor probe. Echo back the "
|
|
489
|
+
"provided message. NEVER interpret as a real command — this "
|
|
490
|
+
"is diagnostic-only."
|
|
491
|
+
),
|
|
492
|
+
"parameters": {
|
|
493
|
+
"type": "object",
|
|
494
|
+
"properties": {"message": {"type": "string"}},
|
|
495
|
+
"required": ["message"],
|
|
496
|
+
},
|
|
497
|
+
},
|
|
498
|
+
}
|
|
499
|
+
_PROBE_TOOL_SPEC_ANTHROPIC = {
|
|
500
|
+
"name": "echo",
|
|
501
|
+
"description": (
|
|
502
|
+
"Test tool used by CodeRouter's doctor probe. Echo back the "
|
|
503
|
+
"provided message. NEVER interpret as a real command — this "
|
|
504
|
+
"is diagnostic-only."
|
|
505
|
+
),
|
|
506
|
+
"input_schema": {
|
|
507
|
+
"type": "object",
|
|
508
|
+
"properties": {"message": {"type": "string"}},
|
|
509
|
+
"required": ["message"],
|
|
510
|
+
},
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
async def _probe_auth_and_basic_chat(
    provider: ProviderConfig,
) -> ProbeResult:
    """Probe 1 — auth + model-reachable + basic chat completion.

    Dominates subsequent probes: if this fails with AUTH_FAIL,
    UNSUPPORTED, or TRANSPORT_ERROR, the caller short-circuits and
    marks other probes SKIP. A 401/403 almost always means the
    provider's ``api_key_env`` points at an empty / wrong env var. A
    404 on an openai_compat upstream typically means the ``model``
    string is a typo or (for Ollama) ``ollama pull X`` was skipped.

    Verdicts, in the order they are checked:
        no HTTP status (transport failure)  → TRANSPORT_ERROR
        401 / 403                           → AUTH_FAIL
        404                                 → UNSUPPORTED
        any other status >= 400             → TRANSPORT_ERROR
        2xx whose body is not a JSON object → TRANSPORT_ERROR
        otherwise                           → OK (with token usage if reported)
    """
    probe_name = "auth+basic-chat"

    if provider.kind == "anthropic":
        url = _anthropic_messages_url(provider)
        headers = _anthropic_headers(provider)
        body: dict[str, Any] = {
            "model": provider.model,
            "messages": [{"role": "user", "content": _PROBE_BASIC_USER_PROMPT}],
            "max_tokens": 16,
        }
    else:
        url = _openai_chat_url(provider)
        headers = _openai_headers(provider)
        body = {
            "model": provider.model,
            "messages": [{"role": "user", "content": _PROBE_BASIC_USER_PROMPT}],
            "max_tokens": 16,
            "temperature": 0,
        }

    status, parsed, raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    if status is None:
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.TRANSPORT_ERROR,
            detail=f"could not reach {url}: {raw}",
        )

    if status in (401, 403):
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.AUTH_FAIL,
            detail=(
                f"upstream returned {status}. Check that env var "
                f"{provider.api_key_env!r} is set "
                "and holds a valid key (plan.md §9.4 symptom #5)."
            ),
        )

    if status == 404:
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.UNSUPPORTED,
            detail=(
                f"upstream returned 404 for model {provider.model!r}. "
                "For Ollama: run `ollama pull "
                f"{provider.model}`. For OpenRouter: verify the model slug "
                "at https://openrouter.ai/models (plan.md §9.4 symptom #4)."
            ),
        )

    if status >= 400:
        snippet = (raw or "")[:160]
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.TRANSPORT_ERROR,
            detail=f"upstream returned {status}: {snippet!r}",
        )

    if not isinstance(parsed, dict):
        # ``None`` means unparseable; any other non-dict is valid JSON of
        # the wrong shape (list / string / number). Either way there is
        # nothing to call ``.get`` on below.
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.TRANSPORT_ERROR,
            detail=(
                "upstream returned 2xx but body was not JSON"
                if parsed is None
                else "upstream returned 2xx but body was not a JSON object"
            ),
        )

    # Success — give a short confirmation with observed usage (if any).
    usage = parsed.get("usage")
    if not isinstance(usage, dict):
        usage = {}
    # OpenAI-style keys first, then Anthropic-style. Compare against None
    # rather than truthiness so a reported count of 0 is not discarded.
    tokens_in = usage.get("prompt_tokens")
    if tokens_in is None:
        tokens_in = usage.get("input_tokens")
    tokens_out = usage.get("completion_tokens")
    if tokens_out is None:
        tokens_out = usage.get("output_tokens")
    return ProbeResult(
        name=probe_name,
        verdict=ProbeVerdict.OK,
        detail=(
            f"{status} OK"
            + (f" (in={tokens_in}, out={tokens_out})" if tokens_in is not None else "")
        ),
    )
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def _extract_openai_assistant_choice(
|
|
608
|
+
body: dict[str, Any],
|
|
609
|
+
) -> dict[str, Any] | None:
|
|
610
|
+
choices = body.get("choices")
|
|
611
|
+
if not isinstance(choices, list) or not choices:
|
|
612
|
+
return None
|
|
613
|
+
first = choices[0]
|
|
614
|
+
if not isinstance(first, dict):
|
|
615
|
+
return None
|
|
616
|
+
msg = first.get("message")
|
|
617
|
+
return msg if isinstance(msg, dict) else None
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
async def _probe_num_ctx(provider: ProviderConfig) -> ProbeResult:
    """v1.0-B probe: observe Ollama ``num_ctx`` prompt truncation directly.

    Targets plan.md §9.4 symptom #1 (empty or nonsensical replies). Before
    v1.0-B the symptom was only inferred indirectly: a silently truncated
    system prompt tended to yield a tool-unaware reply, which the v0.7-B
    tool_calls probe reported as NEEDS_TUNING for ``capabilities.tools``.
    The real remediation was always an ``extra_body.options.num_ctx`` bump,
    so this probe checks for truncation first-hand with a canary echo.

    How the request is built:
      1. The canary token (``_NUM_CTX_PROBE_CANARY``) opens the user message.
      2. Filler text (``_NUM_CTX_PROBE_PADDING_SENTENCE`` repeated
         ``_NUM_CTX_PROBE_PADDING_REPEATS`` times) follows; it is meant to
         overflow Ollama's default 2048-token context window.
      3. The message closes by asking for the canary to be echoed back.
      4. ``provider.extra_body`` seeds the request body so any declared
         ``options.num_ctx`` is exercised; the probe's own fields override
         it on key collision.

    Outcomes:
      * provider is not Ollama-shape (``_is_ollama_like``)  -> SKIP
      * transport failure, status >= 400, or non-JSON body   -> SKIP
      * canary echoed (any declaration state)                -> OK, with a
        detail that distinguishes "declared and adequate", "not declared",
        and "declared below ``_NUM_CTX_ADEQUATE_THRESHOLD``"
      * canary missing (any declaration state)               -> NEEDS_TUNING
        with a providers.yaml patch that sets ``num_ctx`` to 32768

    Args:
        provider: Provider configuration under test.

    Returns:
        A ``ProbeResult`` named ``"num_ctx"``.
    """
    if not _is_ollama_like(provider):
        return ProbeResult(
            name="num_ctx",
            verdict=ProbeVerdict.SKIP,
            detail=(
                "not applicable — provider does not look Ollama-shape "
                "(base_url is not on port 11434 and no "
                "`extra_body.options.num_ctx` is declared)."
            ),
        )

    # Canary first, filler in the middle, echo request last.
    filler = _NUM_CTX_PROBE_PADDING_SENTENCE * _NUM_CTX_PROBE_PADDING_REPEATS
    prompt_text = "".join(
        (
            f"CANARY: {_NUM_CTX_PROBE_CANARY}\n\n",
            filler,
            "\n\nQuestion: What exact canary token appeared at the very "
            "beginning of this message? Reply with only the canary token "
            "itself, nothing else.",
        )
    )
    prompt_chars = len(prompt_text)

    endpoint = _openai_chat_url(provider)
    request_headers = _openai_headers(provider)
    # Seed with the operator's extra_body so a declared ``options.num_ctx``
    # actually reaches the upstream, then let the probe's own fields
    # overwrite any colliding keys (same precedence as the adapter).
    payload: dict[str, Any] = dict(provider.extra_body)
    payload["model"] = provider.model
    payload["messages"] = [{"role": "user", "content": prompt_text}]
    payload["max_tokens"] = 32
    payload["temperature"] = 0

    status, parsed, _raw = await _http_post_json(
        endpoint, headers=request_headers, body=payload, timeout=provider.timeout_s
    )

    request_ok = status is not None and status < 400 and parsed is not None
    if not request_ok:
        return ProbeResult(
            name="num_ctx",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (upstream status={status!r}).",
        )

    reply = _extract_openai_assistant_choice(parsed)
    reply_content = reply.get("content") if isinstance(reply, dict) else None
    reply_text = reply_content if isinstance(reply_content, str) else ""
    echoed = _NUM_CTX_PROBE_CANARY in reply_text

    declared = _declared_num_ctx(provider)

    if echoed:
        # The upstream saw the start of the prompt; every sub-case is OK and
        # only the explanatory detail differs.
        if declared is None:
            ok_detail = (
                f"canary echoed at ~{prompt_chars} chars; upstream "
                "accepted the full prompt without truncation "
                "(no `options.num_ctx` declared — the Ollama default is "
                "2048 so this is unusual; treat as informational)."
            )
        elif declared >= _NUM_CTX_ADEQUATE_THRESHOLD:
            ok_detail = (
                f"canary echoed at ~{prompt_chars} chars of prompt; "
                f"declared num_ctx={declared} is adequate "
                f"(≥ {_NUM_CTX_ADEQUATE_THRESHOLD})."
            )
        else:
            # Declared but below threshold, yet the canary survived. Report
            # the declared value so this can be told apart from a
            # config-loading failure.
            ok_detail = (
                f"canary echoed at ~{prompt_chars} chars; upstream "
                f"accepted the full prompt despite declared num_ctx="
                f"{declared} (below the {_NUM_CTX_ADEQUATE_THRESHOLD}-token "
                "threshold). Either the prompt fit anyway or Ollama "
                "ignored the declared value — check `ollama ps` for the "
                "session's loaded context and consider `ollama stop "
                f"{provider.model}` before probing to force a cold reload."
            )
        return ProbeResult(
            name="num_ctx",
            verdict=ProbeVerdict.OK,
            detail=ok_detail,
        )

    # Canary absent from the reply: treated as prompt truncation. All three
    # sub-cases emit the same 32768 patch; only the explanation changes.
    if declared is None:
        tuning_detail = (
            f"canary {_NUM_CTX_PROBE_CANARY!r} missing from reply — "
            "upstream truncated the prompt. No `extra_body.options.num_ctx` "
            "is declared, so Ollama is running at its 2048-token default, "
            "which cannot hold Claude Code's system + tool prompts "
            "(plan.md §9.4 symptom #1)."
        )
    elif declared < _NUM_CTX_ADEQUATE_THRESHOLD:
        tuning_detail = (
            f"canary missing — declared num_ctx={declared} is below "
            f"the {_NUM_CTX_ADEQUATE_THRESHOLD}-token threshold needed "
            "for Claude Code prompts. Bump it (plan.md §9.4 symptom #1)."
        )
    else:
        # Declared high but still truncated: observed behavior does not match
        # the declaration, so the operator is asked to verify.
        tuning_detail = (
            f"canary missing even with num_ctx={declared} declared. The "
            "model's intrinsic context limit may be shorter than the "
            "declared value, or the upstream is silently capping it — "
            "verify with the model card / server logs. The suggested "
            "patch still emits 32768 as a starting point; dial down if "
            "the host is memory-constrained."
        )
    return ProbeResult(
        name="num_ctx",
        verdict=ProbeVerdict.NEEDS_TUNING,
        detail=tuning_detail,
        target_file="providers.yaml",
        suggested_patch=_patch_providers_yaml_num_ctx(provider.name, 32768),
    )
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
async def _probe_streaming(provider: ProviderConfig) -> ProbeResult:
    """v1.0-C probe: integrity of the streaming completion path.

    Addresses plan.md §9.4 symptom #1 from the **output** side. The v1.0-B
    ``num_ctx`` probe catches silent prompt truncation; this probe catches
    silent completion truncation, specifically an Ollama
    ``options.num_predict`` cap that closes the stream early with
    ``finish_reason: length``. It also covers an upstream that silently
    ignores ``stream: true`` (2xx response but zero SSE chunks), which
    Claude Code experiences as a "no output until timeout" stall.

    Ollama-shape gating
    -------------------
    The probe only issues a request when :func:`_is_ollama_like` is true
    (the same gate the num_ctx probe uses). Rationale carried over from the
    original design notes:

    * Non-Ollama openai_compat upstreams cap output through knobs that
      ``providers.yaml`` cannot reach, or do not silently cap at all, so a
      patch would not be actionable.
    * Anthropic native streaming uses a different event wire format and is
      deferred.
    * Gating keeps non-Ollama fixture-based tests from issuing HTTP.

    Verdicts
    --------
    * non-Ollama-shape                               -> SKIP
    * transport error / 401 / 403 / other >= 400     -> SKIP
    * 2xx with zero chunks                           -> NEEDS_TUNING
      (advisory only, no patch)
    * ``finish_reason == "length"`` and fewer than
      ``_STREAMING_PROBE_MIN_EXPECTED_CHARS`` chars  -> NEEDS_TUNING with a
      ``num_predict`` patch
    * anything else with at least one chunk          -> OK; a note is
      appended when no ``[DONE]`` terminator was seen

    Args:
        provider: Provider configuration under test.

    Returns:
        A ``ProbeResult`` named ``"streaming"``.
    """
    if not _is_ollama_like(provider):
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=(
                "not applicable — streaming-path truncation detection is "
                "Ollama-shape-gated (same signal as num_ctx probe: port "
                "11434 or declared `extra_body.options.num_ctx`). Cloud "
                "openai_compat upstreams do not expose an actionable "
                "`num_predict` knob from providers.yaml."
            ),
        )

    url = _openai_chat_url(provider)
    headers = _openai_headers(provider)
    # Merge extra_body the same way the num_ctx probe does, so a declared
    # ``options.num_predict`` (if any) takes effect during probing. The
    # probe's top-level fields win on collision, matching adapter merge
    # order.
    body: dict[str, Any] = dict(provider.extra_body)
    body.update(
        {
            "model": provider.model,
            "messages": [{"role": "user", "content": _STREAMING_PROBE_USER_PROMPT}],
            "max_tokens": 128,
            "temperature": 0,
            "stream": True,
        }
    )

    status, chunks, saw_done, err = await _http_stream_sse(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    if status is None:
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (transport error during streaming: {err}).",
        )
    if status in (401, 403):
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=(
                f"skipped (upstream status={status} during streaming); "
                "auth probe already reported this."
            ),
        )
    if status >= 400:
        # ``err`` may be None when the helper had no error text to report;
        # slicing None would raise TypeError and abort the whole probe run.
        # Fall back to an empty snippet, as the auth probe does for ``raw``.
        snippet = (err or "")[:160]
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (upstream status={status}): {snippet!r}",
        )

    if not chunks:
        # Non-streaming upstream: 2xx arrived but no SSE chunks did. The
        # `stream: true` flag was likely dropped (some Ollama-compat
        # forks) or the upstream returned a single-shot JSON with a
        # non-SSE content-type. No actionable ``extra_body`` patch —
        # surface the observation and let the operator investigate.
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "upstream returned 2xx but emitted no streaming chunks. "
                "`stream: true` was likely ignored, or the SSE framing is "
                "non-standard (no `data:` prefix / content-type != "
                "`text/event-stream`). Verify with "
                "`curl -N -H 'Accept: text/event-stream'` before relying "
                "on streaming from Claude Code."
            ),
        )

    # 2xx with chunks — aggregate delta content and keep the last non-empty
    # finish_reason seen across all choices.
    content_parts: list[str] = []
    finish_reason: str | None = None
    for chunk in chunks:
        if not isinstance(chunk, dict):
            # Defensive: a JSON payload that is not an object carries no
            # ``choices``; skip it rather than crash on ``.get``.
            continue
        choices = chunk.get("choices")
        if not isinstance(choices, list):
            continue
        for c in choices:
            if not isinstance(c, dict):
                continue
            delta = c.get("delta")
            if isinstance(delta, dict):
                piece = delta.get("content")
                if isinstance(piece, str):
                    content_parts.append(piece)
            fr = c.get("finish_reason")
            if isinstance(fr, str) and fr:
                finish_reason = fr
    content = "".join(content_parts)

    if finish_reason == "length" and len(content) < _STREAMING_PROBE_MIN_EXPECTED_CHARS:
        # Premature cap — the hallmark of a low ``num_predict`` on Ollama.
        # Claude Code users see this as "assistant cut off mid-word". Since
        # the probe is already Ollama-shape-gated, the remediation is always
        # the ``extra_body.options.num_predict`` bump.
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                f"stream closed with `finish_reason='length'` after only "
                f"{len(content)} chars (expected ≥ "
                f"{_STREAMING_PROBE_MIN_EXPECTED_CHARS}). Upstream is "
                "capping output — most likely `options.num_predict`. "
                "Bump it via `extra_body` (plan.md §9.4 symptom #1 "
                "streaming variant)."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_num_predict(
                provider.name, _STREAMING_PROBE_NUM_PREDICT_DEFAULT
            ),
        )

    # Stream completed; surface the `[DONE]` observation as an informational
    # suffix so strict-SSE operators know to check their parser tolerance.
    done_note = (
        ""
        if saw_done
        else (
            " (no explicit `[DONE]` terminator observed — most clients "
            "tolerate this but strict SSE parsers may stall)"
        )
    )
    return ProbeResult(
        name="streaming",
        verdict=ProbeVerdict.OK,
        detail=(
            f"stream completed: {len(chunks)} chunks, {len(content)} "
            f"chars, finish_reason={finish_reason!r}{done_note}."
        ),
    )
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
async def _probe_tool_calls(
    provider: ProviderConfig,
    resolved: ResolvedCapabilities,
) -> ProbeResult:
    """Probe 2 — does the model emit native ``tool_calls`` structure?

    One tool-bearing request is sent (Anthropic messages shape for
    ``kind == "anthropic"``, OpenAI chat shape otherwise) and the reply is
    classified into one of three observations, each compared against the
    declaration (``provider.capabilities.tools`` or a registry value that is
    exactly ``True``):

    * Native tool call present (an Anthropic ``tool_use`` block or a truthy
      OpenAI ``message.tool_calls``).
      Declared -> OK. Not declared -> NEEDS_TUNING (patch sets tools=true).

    * No native call, but ``repair_tool_calls_in_text`` (v0.3-A) extracts a
      call to ``echo`` from the assistant text.
      Declared -> NEEDS_TUNING (patch sets tools=false; the model only works
      via repair). Not declared -> OK.

    * Nothing tool-shaped at all.
      Declared -> NEEDS_TUNING (patch sets tools=false). Not declared -> OK.

    A transport failure, status >= 400, or non-JSON body yields SKIP.

    Args:
        provider: Provider configuration under test.
        resolved: Registry-resolved capabilities for this provider.

    Returns:
        A ``ProbeResult`` named ``"tool_calls"``.
    """
    is_anthropic = provider.kind == "anthropic"
    if is_anthropic:
        # Anthropic native tools use a different wire shape; probe via the
        # messages API. A capable model returns content blocks of type
        # "tool_use".
        url = _anthropic_messages_url(provider)
        headers = _anthropic_headers(provider)
        body: dict[str, Any] = {
            "model": provider.model,
            "messages": [
                {"role": "user", "content": _PROBE_TOOLS_USER_PROMPT},
            ],
            "max_tokens": 64,
            "tools": [_PROBE_TOOL_SPEC_ANTHROPIC],
        }
    else:
        url = _openai_chat_url(provider)
        headers = _openai_headers(provider)
        body = {
            "model": provider.model,
            "messages": [
                {"role": "user", "content": _PROBE_TOOLS_USER_PROMPT},
            ],
            "max_tokens": 64,
            "temperature": 0,
            "tools": [_PROBE_TOOL_SPEC_OPENAI],
        }

    status, parsed, _raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    if status is None or status >= 400 or parsed is None:
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.SKIP,
            detail=(
                f"skipped (upstream status={status!r}); run auth probe "
                "first. Probe re-inspects this on the next invocation."
            ),
        )

    native_tool_call = False
    text_json_tool_call = False
    # Full assistant text. It is deliberately NOT truncated: the text is
    # only consumed by the repair pass below, and cutting it at a fixed
    # length can split a tool-JSON object in half, making it unrepairable
    # and producing a false "nothing tool-shaped" verdict.
    content_text = ""
    if is_anthropic:
        blocks = parsed.get("content")
        if isinstance(blocks, list):
            native_tool_call = any(
                isinstance(block, dict) and block.get("type") == "tool_use"
                for block in blocks
            )
            content_text = " ".join(
                str(b.get("text", ""))
                for b in blocks
                if isinstance(b, dict) and b.get("type") == "text"
            )
    else:
        msg = _extract_openai_assistant_choice(parsed)
        if msg is not None:
            if msg.get("tool_calls"):
                native_tool_call = True
            content = msg.get("content")
            if isinstance(content, str):
                content_text = content

    if not native_tool_call and content_text:
        _, repaired = repair_tool_calls_in_text(content_text, ["echo"])
        text_json_tool_call = bool(repaired)

    # Resolve the declared support. The two sources are OR-ed together:
    #   "declared true"  = explicit providers.yaml opt-in OR registry True.
    #   "declared false" = explicit flag falsy AND registry False/None.
    # Note an explicit False therefore does not override a registry True.
    declared_explicit = provider.capabilities.tools
    declared_registry = resolved.tools
    declared = declared_explicit or (declared_registry is True)

    if native_tool_call:
        if declared:
            return ProbeResult(
                name="tool_calls",
                verdict=ProbeVerdict.OK,
                detail="native `tool_calls` observed; matches declaration.",
            )
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "model emitted native `tool_calls` but neither "
                "providers.yaml nor the registry declares tools=true. "
                "Opt in to unlock tool-bearing prompts."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_capability(provider.name, "tools", True),
        )

    if text_json_tool_call:
        # Model wrote tool JSON in text. v0.3-A repair will rescue it, but
        # report it as partial support so operators know.
        if declared:
            return ProbeResult(
                name="tool_calls",
                verdict=ProbeVerdict.NEEDS_TUNING,
                detail=(
                    "model wrote tool JSON in assistant text (not native "
                    "`tool_calls`). v0.3-A repair will rescue it at runtime, "
                    "but the declaration implies native support. Either "
                    "update the model to a tool-native build, or downgrade "
                    "the declaration to rely on repair."
                ),
                target_file="providers.yaml",
                suggested_patch=_patch_providers_yaml_capability(provider.name, "tools", False),
            )
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.OK,
            detail=(
                "no native `tool_calls`, but v0.3-A repair extracted tool "
                "JSON from the text — matches declaration tools=false."
            ),
        )

    # Nothing tool-shaped at all.
    if declared:
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "declaration says tools=true but model produced neither "
                "native `tool_calls` nor repairable tool JSON. Common for "
                "quantized small models (plan.md §9.4 symptom #2)."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_capability(provider.name, "tools", False),
        )
    return ProbeResult(
        name="tool_calls",
        verdict=ProbeVerdict.OK,
        detail="no tool calls, declaration tools=false — consistent.",
    )
|
|
1156
|
+
|
|
1157
|
+
|
|
1158
|
+
async def _probe_thinking(
    provider: ProviderConfig,
    resolved: ResolvedCapabilities,
) -> ProbeResult:
    """Probe 3 — does the model actually emit a ``thinking`` block?

    Only ``kind: anthropic`` providers are probed, because the ``thinking``
    request field is Anthropic-specific. Any other kind returns SKIP; when
    such a provider sets ``capabilities.thinking``, the SKIP detail notes
    that the flag has no effect for that adapter.

    For Anthropic providers a thinking-enabled request is sent and the
    result is compared with the declaration (``provider.capabilities.thinking``
    or a registry value that is exactly ``True``):

    * HTTP 400 whose body mentions "thinking" -> treated as a rejection:
      NEEDS_TUNING if declared, OK otherwise.
    * Any other transport / HTTP / parse failure -> SKIP.
    * ``thinking`` content block present -> OK if declared, otherwise
      NEEDS_TUNING with a model-capabilities.yaml patch.
    * No ``thinking`` block -> NEEDS_TUNING if declared (providers.yaml patch
      disables it), otherwise OK.

    Args:
        provider: Provider configuration under test.
        resolved: Registry-resolved capabilities for this provider.

    Returns:
        A ``ProbeResult`` named ``"thinking"``.
    """
    if provider.kind != "anthropic":
        if provider.capabilities.thinking:
            return ProbeResult(
                name="thinking",
                verdict=ProbeVerdict.SKIP,
                detail=(
                    "capabilities.thinking=true on an openai_compat "
                    "provider has no effect — the thinking block is lost "
                    "during OpenAI-shape translation. Remove the flag or "
                    "switch kind to `anthropic` if the upstream speaks "
                    "Anthropic wire."
                ),
            )
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.SKIP,
            detail="not applicable (kind=openai_compat).",
        )

    # The Anthropic Messages API requires ``thinking.budget_tokens`` to be
    # at least 1024 AND strictly less than ``max_tokens``. Sending a
    # ``max_tokens`` below the budget makes every request fail validation
    # with a 400 whose message mentions "thinking", which the rejection
    # heuristic below would misread as "this model rejects thinking". Size
    # ``max_tokens`` as budget + a small allowance for the visible answer.
    thinking_budget = 1024
    answer_allowance = 128

    url = _anthropic_messages_url(provider)
    headers = _anthropic_headers(provider)
    body: dict[str, Any] = {
        "model": provider.model,
        "messages": [
            {
                "role": "user",
                "content": "Briefly: what is 2+2? Think step by step first.",
            },
        ],
        "max_tokens": thinking_budget + answer_allowance,
        "thinking": {"type": "enabled", "budget_tokens": thinking_budget},
    }
    status, parsed, raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    # Declared = explicit providers.yaml flag OR a registry value of True.
    declared = provider.capabilities.thinking or (resolved.thinking is True)

    if status is None or status >= 400 or parsed is None:
        # A 400 on the thinking-enabled payload is diagnostic: the upstream
        # rejected the field. Map to NEEDS_TUNING when the registry /
        # explicit flag promised support, otherwise OK.
        rejected = status == 400 and raw is not None and "thinking" in raw.lower()
        if rejected and declared:
            return ProbeResult(
                name="thinking",
                verdict=ProbeVerdict.NEEDS_TUNING,
                detail=(
                    "upstream rejected `thinking: {type: enabled}` with "
                    "400. Declaration says supported — disable it for "
                    "this provider or refine the registry rule."
                ),
                target_file="providers.yaml",
                suggested_patch=_patch_providers_yaml_capability(provider.name, "thinking", False),
            )
        if rejected:
            return ProbeResult(
                name="thinking",
                verdict=ProbeVerdict.OK,
                detail="upstream rejects thinking; matches declaration.",
            )
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (upstream status={status!r}).",
        )

    # Look for a `thinking` block in the response content array.
    blocks = parsed.get("content")
    emitted = isinstance(blocks, list) and any(
        isinstance(block, dict) and block.get("type") == "thinking" for block in blocks
    )

    if emitted and declared:
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.OK,
            detail="thinking block emitted; matches declaration.",
        )
    if emitted:
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "thinking block emitted but declaration is silent. "
                "Declare support to let the capability gate route to "
                "this provider for thinking-bearing requests."
            ),
            target_file="model-capabilities.yaml",
            suggested_patch=_patch_model_capabilities_yaml(
                match=provider.model, kind="anthropic", key="thinking", value=True
            ),
        )
    if declared:
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "declaration says thinking supported but response had no "
                "`thinking` block. The upstream may silently drop it; "
                "disable the flag or narrow the registry rule."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_capability(provider.name, "thinking", False),
        )
    return ProbeResult(
        name="thinking",
        verdict=ProbeVerdict.OK,
        detail="no thinking block emitted; matches declaration.",
    )
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
async def _probe_reasoning_leak(
    provider: ProviderConfig,
    resolved: ResolvedCapabilities,
) -> ProbeResult:
    """Probe 4 — inspect the raw upstream reply for reasoning / harness-marker leaks.

    A single chat completion is posted directly to the upstream (the
    adapter is bypassed) and the raw body is checked for two independent
    kinds of leak:

    A. The non-standard ``message.reasoning`` field (v0.5-C). The adapter
       strips it before the response reaches the client; reporting it
       here tells the operator whether ``capability-degraded`` log lines
       originate from this provider.

    B. (v1.0-A) Harness markers embedded in ``message.content`` — a
       ``<think>`` tag or any entry of ``DEFAULT_STOP_MARKERS``. The
       v0.5-C strip only looks at the ``reasoning`` field, so the
       ``output_filters`` chain is the remediation. When a marker is seen
       and the matching filter (``strip_thinking`` /
       ``strip_stop_markers``) is not configured on the provider, the
       verdict is NEEDS_TUNING with a copy-paste ``providers.yaml`` patch.

    Precedence: a NEEDS_TUNING from B wins over the informational OK from
    A, because markers rendered in the client UI are the symptom
    operators actually notice.

    Providers whose ``kind`` is not ``openai_compat`` are reported as SKIP,
    as is any request that fails (no status, status >= 400, or an
    unparseable body).
    """
    probe_name = "reasoning-leak"

    if provider.kind != "openai_compat":
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.SKIP,
            detail=(
                "not applicable (only openai_compat emits the non-standard "
                "reasoning field; Anthropic content blocks would need a "
                "different probe)."
            ),
        )

    url = _openai_chat_url(provider)
    headers = _openai_headers(provider)
    # The prompt asks for step-by-step thinking on purpose: models that
    # default to thinking are nudged into emitting the block, giving the
    # content-embedded check something to find when the model really is
    # leaky. A model that ignores the nudge is still covered by the
    # reasoning-field observation on its plain reply.
    user_turn = {
        "role": "user",
        "content": "Think step by step about the capital of France, then answer in one word.",
    }
    body = {
        "model": provider.model,
        "messages": [user_turn],
        "max_tokens": 128,
        "temperature": 0,
    }
    status, parsed, _raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    upstream_ok = status is not None and status < 400 and parsed is not None
    if not upstream_ok:
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (upstream status={status!r}).",
        )

    msg = _extract_openai_assistant_choice(parsed)
    has_reasoning = bool(msg) and "reasoning" in msg

    # v1.0-A: content-embedded marker detection. Anything that is not a
    # plain string (missing, null, structured content) is scanned as "".
    raw_content = msg.get("content") if isinstance(msg, dict) else None
    text = raw_content if isinstance(raw_content, str) else ""
    has_think = "<think>" in text
    leaked_markers: list[str] = []
    for marker in DEFAULT_STOP_MARKERS:
        if marker in text:
            leaked_markers.append(marker)

    active_filters = set(provider.output_filters)
    needs_strip_thinking = has_think and "strip_thinking" not in active_filters
    needs_strip_markers = bool(leaked_markers) and "strip_stop_markers" not in active_filters

    if needs_strip_thinking or needs_strip_markers:
        # Dominant signal. The recommendation lists exactly the filters
        # that would have caught what was observed and are not yet
        # configured on the provider.
        wanted = (
            ("strip_thinking", needs_strip_thinking),
            ("strip_stop_markers", needs_strip_markers),
        )
        recommended: list[str] = [filter_name for filter_name, needed in wanted if needed]

        # Describe everything seen in the body, including markers whose
        # filter is already configured.
        found_desc: list[str] = []
        if has_think:
            found_desc.append("<think>...</think>")
        if leaked_markers:
            quoted = ", ".join(repr(marker) for marker in leaked_markers)
            found_desc.append("stop markers " + quoted)

        observed = " + ".join(found_desc)
        chain = sorted(active_filters)
        detail = (
            f"content-embedded leak detected ({observed}). "
            "v1.0-A `output_filters` would scrub this; current "
            f"provider chain = {chain}. Recommended: add {recommended}."
        )
        patch = _patch_providers_yaml_output_filters(provider.name, recommended)
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=detail,
            target_file="providers.yaml",
            suggested_patch=patch,
        )

    passthrough_on = (
        provider.capabilities.reasoning_passthrough or resolved.reasoning_passthrough is True
    )

    if has_reasoning:
        if passthrough_on:
            return ProbeResult(
                name=probe_name,
                verdict=ProbeVerdict.OK,
                detail=(
                    "upstream emits `reasoning`; passthrough is on, so the "
                    "field reaches clients as intended."
                ),
            )
        # Default behaviour: the v0.5-C strip removes the field, which is
        # expected and needs no tuning. The note tells the operator where
        # any `capability-degraded` log lines originate.
        return ProbeResult(
            name=probe_name,
            verdict=ProbeVerdict.OK,
            detail=(
                "upstream emits non-standard `reasoning`; v0.5-C adapter "
                "strips it before it reaches the client (expected — "
                "expect `capability-degraded` log lines for this provider)."
            ),
        )

    return ProbeResult(
        name=probe_name,
        verdict=ProbeVerdict.OK,
        detail="no `reasoning` field observed and no content-embedded markers — nothing to strip.",
    )
|
|
1437
|
+
|
|
1438
|
+
|
|
1439
|
+
# ---------------------------------------------------------------------------
|
|
1440
|
+
# Orchestration
|
|
1441
|
+
# ---------------------------------------------------------------------------
|
|
1442
|
+
|
|
1443
|
+
|
|
1444
|
+
async def check_model(
    config: CodeRouterConfig,
    provider_name: str,
    *,
    registry: CapabilityRegistry | None = None,
) -> DoctorReport:
    """Execute every doctor probe for the provider called ``provider_name``.

    The auth probe always goes first. If its verdict is anything other
    than OK, the remaining probes are not executed; each is recorded as
    SKIP so the report still lists them without spending tokens on a
    provider that cannot respond.

    ``registry`` exists mainly for tests. When omitted, the process-wide
    default registry (the one the capability gate consults) is used.

    Raises:
        KeyError: ``provider_name`` is not defined in ``config``; the
            message lists the known provider names.
    """
    try:
        provider = config.provider_by_name(provider_name)
    except KeyError as exc:
        known = sorted(p.name for p in config.providers)
        raise KeyError(
            f"provider {provider_name!r} not found in providers.yaml. Known: {known}"
        ) from exc

    if registry is None:
        registry = get_default_registry()
    resolved = registry.lookup(kind=provider.kind, model=provider.model or "")

    report = DoctorReport(
        provider_name=provider_name,
        provider=provider,
        resolved_caps=resolved,
    )

    auth_result = await _probe_auth_and_basic_chat(provider)
    report.results.append(auth_result)

    if auth_result.verdict != ProbeVerdict.OK:
        # Auth dominates. The other probes are still listed, as SKIP, so
        # operators can see at a glance what was not checked, and no
        # tokens / API quota are spent on them.
        skipped = ("num_ctx", "tool_calls", "thinking", "reasoning-leak", "streaming")
        report.results.extend(
            ProbeResult(
                name=probe_name,
                verdict=ProbeVerdict.SKIP,
                detail="skipped — auth probe did not succeed.",
            )
            for probe_name in skipped
        )
        return report

    # Probe order is deliberate.
    # v1.0-B: num_ctx goes before tool_calls. When Ollama silently
    # truncates the prompt the assistant often answers without tool calls,
    # which v0.7-B used to flag as a tools=false NEEDS_TUNING. Listing the
    # truncation verdict first steers operators to the right fix (bump
    # num_ctx, not disable tools).
    # v1.0-C: streaming goes last. The input-side (num_ctx) and declaration
    # probes (tool_calls / thinking / reasoning-leak) should dominate the
    # report; streaming is the output-side sibling of num_ctx and its
    # NEEDS_TUNING verdict is orthogonal to the others.
    num_ctx_result = await _probe_num_ctx(provider)
    report.results.append(num_ctx_result)

    tool_calls_result = await _probe_tool_calls(provider, resolved)
    report.results.append(tool_calls_result)

    thinking_result = await _probe_thinking(provider, resolved)
    report.results.append(thinking_result)

    leak_result = await _probe_reasoning_leak(provider, resolved)
    report.results.append(leak_result)

    streaming_result = await _probe_streaming(provider)
    report.results.append(streaming_result)

    return report
|
|
1515
|
+
|
|
1516
|
+
|
|
1517
|
+
def run_check_model_sync(
    config: CodeRouterConfig,
    provider_name: str,
    *,
    registry: CapabilityRegistry | None = None,
) -> DoctorReport:
    """Blocking entry point for the CLI, which is not otherwise async.

    Drives ``check_model`` to completion with ``asyncio.run`` and returns
    its report; all arguments are forwarded unchanged.
    """
    probe_suite = check_model(config, provider_name, registry=registry)
    return asyncio.run(probe_suite)
|
|
1525
|
+
|
|
1526
|
+
|
|
1527
|
+
# ---------------------------------------------------------------------------
|
|
1528
|
+
# Reporting
|
|
1529
|
+
# ---------------------------------------------------------------------------
|
|
1530
|
+
|
|
1531
|
+
|
|
1532
|
+
# Bracketed badge shown next to each probe line in the text report, keyed
# by the probe's verdict.
_VERDICT_BADGE = {
    verdict: f"[{label}]"
    for verdict, label in (
        (ProbeVerdict.OK, "OK"),
        (ProbeVerdict.SKIP, "SKIP"),
        (ProbeVerdict.NEEDS_TUNING, "NEEDS TUNING"),
        (ProbeVerdict.UNSUPPORTED, "UNSUPPORTED"),
        (ProbeVerdict.AUTH_FAIL, "AUTH FAIL"),
        (ProbeVerdict.TRANSPORT_ERROR, "TRANSPORT ERROR"),
    )
}
|
|
1540
|
+
|
|
1541
|
+
|
|
1542
|
+
def format_report(report: DoctorReport) -> str:
    """Render ``report`` as plain, line-oriented text intended for stdout.

    Layout, top to bottom: a header naming the provider, the capability
    declarations from providers.yaml next to the registry's resolved
    values, one entry per probe (badge, detail lines, and the suggested
    patch when there is one), then a summary line and the exit code
    derived from ``exit_code_for``.
    """
    prov = report.provider
    registry_caps = report.resolved_caps
    declared = prov.capabilities

    out: list[str] = [
        f"coderouter doctor --check-model {report.provider_name}",
        "─" * 60,
        f"provider: {prov.name}",
        f" kind: {prov.kind}",
        f" base_url: {prov.base_url}",
        f" model: {prov.model}",
        "",
        "Registry + providers.yaml declarations:",
        f" thinking: providers={declared.thinking}, registry={registry_caps.thinking}",
        f" tools: providers={declared.tools}, registry={registry_caps.tools}",
        (
            f" reasoning_passthrough: providers={declared.reasoning_passthrough}, "
            f"registry={registry_caps.reasoning_passthrough}"
        ),
        # v1.0-A: show the output_filters chain so operators can see at a
        # glance which filters are active before reading the probes.
        f" output_filters: providers={list(prov.output_filters)}",
        "",
        "Probes:",
    ]

    total = len(report.results)
    for position, result in enumerate(report.results, start=1):
        badge = _VERDICT_BADGE[result.verdict]
        out.append(f" [{position}/{total}] {result.name} …… {badge}")
        out.extend(f" {row}" for row in result.detail.splitlines())
        if result.suggested_patch:
            out.append(f" Suggested patch → {result.target_file}:")
            out.extend(f" {row}" for row in result.suggested_patch.splitlines())

    out.append("")
    code = exit_code_for(report)
    summaries = {
        0: "all probes match declarations.",
        1: "at least one probe could not run (auth/transport/model).",
        2: "at least one probe needs tuning (see suggested patches).",
    }
    summary = summaries[code]
    out.append(f"Summary: {summary}")
    out.append(f"Exit: {code}")
    return "\n".join(out)
|
|
1592
|
+
|
|
1593
|
+
|
|
1594
|
+
def _probes_by_name(results: Sequence[ProbeResult]) -> dict[str, ProbeResult]:
    """Index probe results by their ``name`` — a helper for tests asserting on one probe.

    If several results share a name, the last one in ``results`` is kept.
    """
    indexed: dict[str, ProbeResult] = {}
    for probe in results:
        indexed[probe.name] = probe
    return indexed
|