coderouter-cli 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. coderouter/__init__.py +17 -0
  2. coderouter/__main__.py +6 -0
  3. coderouter/adapters/__init__.py +23 -0
  4. coderouter/adapters/anthropic_native.py +502 -0
  5. coderouter/adapters/base.py +220 -0
  6. coderouter/adapters/openai_compat.py +395 -0
  7. coderouter/adapters/registry.py +17 -0
  8. coderouter/cli.py +345 -0
  9. coderouter/cli_stats.py +751 -0
  10. coderouter/config/__init__.py +10 -0
  11. coderouter/config/capability_registry.py +339 -0
  12. coderouter/config/env_file.py +295 -0
  13. coderouter/config/loader.py +73 -0
  14. coderouter/config/schemas.py +515 -0
  15. coderouter/data/__init__.py +7 -0
  16. coderouter/data/model-capabilities.yaml +86 -0
  17. coderouter/doctor.py +1596 -0
  18. coderouter/env_security.py +434 -0
  19. coderouter/errors.py +29 -0
  20. coderouter/ingress/__init__.py +5 -0
  21. coderouter/ingress/anthropic_routes.py +205 -0
  22. coderouter/ingress/app.py +144 -0
  23. coderouter/ingress/dashboard_routes.py +493 -0
  24. coderouter/ingress/metrics_routes.py +92 -0
  25. coderouter/ingress/openai_routes.py +153 -0
  26. coderouter/logging.py +315 -0
  27. coderouter/metrics/__init__.py +39 -0
  28. coderouter/metrics/collector.py +471 -0
  29. coderouter/metrics/prometheus.py +221 -0
  30. coderouter/output_filters.py +407 -0
  31. coderouter/routing/__init__.py +13 -0
  32. coderouter/routing/auto_router.py +244 -0
  33. coderouter/routing/capability.py +285 -0
  34. coderouter/routing/fallback.py +611 -0
  35. coderouter/translation/__init__.py +57 -0
  36. coderouter/translation/anthropic.py +204 -0
  37. coderouter/translation/convert.py +1291 -0
  38. coderouter/translation/tool_repair.py +236 -0
  39. coderouter_cli-1.7.0.dist-info/METADATA +509 -0
  40. coderouter_cli-1.7.0.dist-info/RECORD +43 -0
  41. coderouter_cli-1.7.0.dist-info/WHEEL +4 -0
  42. coderouter_cli-1.7.0.dist-info/entry_points.txt +2 -0
  43. coderouter_cli-1.7.0.dist-info/licenses/LICENSE +21 -0
coderouter/doctor.py ADDED
@@ -0,0 +1,1596 @@
1
+ """`coderouter doctor --check-model <provider>` — per-provider capability probe.
2
+
3
+ Purpose (v0.7-B)
4
+ ----------------
5
+ Run a small set of live probes against a single provider from
6
+ ``providers.yaml`` and compare the observed behavior against the
7
+ declarations in ``providers.yaml`` + ``model-capabilities.yaml`` (v0.7-A
8
+ registry). Emit a per-probe verdict and, on mismatch, a copy-paste-able
9
+ YAML patch that the user can drop into either file.
10
+
11
+ Motivated by the 5 silent-fail symptoms enumerated in plan.md §9.4:
12
+
13
+ 1. 空応答 / 意味不明応答 → num_ctx probe (v1.0-B direct detection
14
+ via canary echo-back) + streaming probe
15
+ (v1.0-C — output-side num_predict cap)
16
+ + basic-chat probe
17
+ 2. Claude Code「ファイル読めない」 → tool_calls probe (symptom 2)
18
+ 3. UI に <think> タグ生露出 → thinking probe + reasoning-leak
19
+ content-marker detection (v1.0-A)
20
+ 4. 起動後 1 発目で必ず失敗 → auth + model-not-found probe (symptom 4)
21
+ 5. 全部 fallback 失敗 → auth probe (symptom 5)
22
+
23
+ Exit-code contract (CI-friendly)
24
+ --------------------------------
25
+ 0 = all probes match the registry / providers.yaml declarations.
26
+ 2 = at least one probe returned NEEDS_TUNING (structural mismatch;
27
+ the user should apply the emitted YAML patch).
28
+ 1 = at least one probe could not run (AUTH_FAIL / UNSUPPORTED /
29
+ TRANSPORT_ERROR). When the auth probe fails, subsequent probes
30
+ are marked SKIP and do not influence the exit code — the auth
31
+ failure dominates.
32
+
33
+ Non-destructive contract
34
+ ------------------------
35
+ Probes must not induce tool-side-effects. The tool-calls probe declares
36
+ a fake ``echo`` tool with no real-world meaning; even if the caller
37
+ later re-used the response (they won't), ``echo`` cannot trigger
38
+ anything on the caller's side. Each probe is minimized to ≤ ~100
39
+ tokens in / ≤ ~20 tokens out.
40
+
41
+ Layering
42
+ --------
43
+ Probes issue raw httpx calls rather than going through
44
+ ``OpenAICompatAdapter`` / ``AnthropicAdapter`` because:
45
+
46
+ * The reasoning-leak probe needs to see the raw upstream body BEFORE
47
+ the adapter's v0.5-C passive strip runs.
48
+ * The thinking probe for ``kind: anthropic`` needs to send an
49
+ Anthropic wire-format body directly rather than the reverse-
50
+ translated ChatRequest shape.
51
+ * The tool-calls probe wants to observe the raw ``tool_calls`` field
52
+ vs the raw text content before any repair pass.
53
+
54
+ Keeping the HTTP plumbing inline in this module (~one helper, no
55
+ adapter dependency) makes the probe behavior stable against adapter-
56
+ layer changes and keeps the test surface narrow (``httpx_mock`` +
57
+ assertions on the probe output).
58
+ """
59
+
60
+ from __future__ import annotations
61
+
62
+ import asyncio
63
+ import json
64
+ from collections.abc import Sequence
65
+ from dataclasses import dataclass, field
66
+ from enum import StrEnum
67
+ from typing import Any
68
+
69
+ import httpx
70
+
71
+ from coderouter.config.capability_registry import (
72
+ CapabilityRegistry,
73
+ ResolvedCapabilities,
74
+ )
75
+ from coderouter.config.loader import resolve_api_key
76
+ from coderouter.config.schemas import CodeRouterConfig, ProviderConfig
77
+ from coderouter.output_filters import DEFAULT_STOP_MARKERS
78
+ from coderouter.routing.capability import get_default_registry
79
+ from coderouter.translation.tool_repair import repair_tool_calls_in_text
80
+
81
+ __all__ = [
82
+ "DoctorReport",
83
+ "ProbeResult",
84
+ "ProbeVerdict",
85
+ "check_model",
86
+ "exit_code_for",
87
+ "format_report",
88
+ "run_check_model_sync",
89
+ ]
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Result types
94
+ # ---------------------------------------------------------------------------
95
+
96
+
97
+ class ProbeVerdict(StrEnum):
98
+ """Per-probe verdict.
99
+
100
+ Mapping to exit code (see :func:`exit_code_for`):
101
+ OK → contributes 0
102
+ SKIP → contributes 0 (not applicable or blocked by auth)
103
+ NEEDS_TUNING → contributes 2 (structural mismatch)
104
+ UNSUPPORTED → contributes 1 (model not found / feature absent)
105
+ AUTH_FAIL → contributes 1 (401/403 from upstream)
106
+ TRANSPORT_ERROR → contributes 1 (timeout / 5xx / network)
107
+ """
108
+
109
+ OK = "ok"
110
+ SKIP = "skip"
111
+ NEEDS_TUNING = "needs_tuning"
112
+ UNSUPPORTED = "unsupported"
113
+ AUTH_FAIL = "auth_fail"
114
+ TRANSPORT_ERROR = "transport_error"
115
+
116
+
117
@dataclass
class ProbeResult:
    """Record describing what one probe observed.

    A probe that detects a mismatch may attach ``suggested_patch``, a
    YAML snippet intended for copy-paste, together with ``target_file``
    naming the file that snippet belongs in (``"providers.yaml"`` or
    ``"model-capabilities.yaml"``). Glob-level registry patches are the
    better fit when a whole model family shares the mismatch, but since
    doctor probes a single provider per run, a per-provider
    ``providers.yaml`` patch is always the safe suggestion.
    """

    # Short probe identifier (for example "num_ctx").
    name: str
    # Category of the outcome; drives the CLI exit code.
    verdict: ProbeVerdict
    # Human-readable explanation of the verdict.
    detail: str
    # Optional copy-paste YAML fix; None when nothing needs changing.
    suggested_patch: str | None = None
    # File the patch applies to: "providers.yaml" or "model-capabilities.yaml".
    target_file: str | None = None
136
+
137
+
138
@dataclass
class DoctorReport:
    """Everything collected during one ``--check-model`` run."""

    # Name of the provider entry that was probed.
    provider_name: str
    # The provider's configuration object.
    provider: ProviderConfig
    # Capabilities resolved for this provider.
    resolved_caps: ResolvedCapabilities
    # Per-probe outcomes; each report gets its own fresh list.
    results: list[ProbeResult] = field(default_factory=list)
146
+
147
+
148
def exit_code_for(report: DoctorReport) -> int:
    """Map a report onto the CLI exit code.

    Returns 1 when any probe ended in ``AUTH_FAIL``, ``UNSUPPORTED`` or
    ``TRANSPORT_ERROR``; otherwise 2 when any probe ended in
    ``NEEDS_TUNING``; otherwise 0. Blockers take precedence over tuning
    hints regardless of the order in which the results were recorded.
    """
    outcomes = report.results
    # Blockers dominate: one is enough to return 1.
    if any(
        r.verdict
        in (
            ProbeVerdict.AUTH_FAIL,
            ProbeVerdict.UNSUPPORTED,
            ProbeVerdict.TRANSPORT_ERROR,
        )
        for r in outcomes
    ):
        return 1
    if any(r.verdict == ProbeVerdict.NEEDS_TUNING for r in outcomes):
        return 2
    return 0
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # HTTP helpers
170
+ # ---------------------------------------------------------------------------
171
+
172
+
173
+ def _openai_chat_url(provider: ProviderConfig) -> str:
174
+ base = str(provider.base_url).rstrip("/")
175
+ return f"{base}/chat/completions"
176
+
177
+
178
+ def _anthropic_messages_url(provider: ProviderConfig) -> str:
179
+ base = str(provider.base_url).rstrip("/")
180
+ return f"{base}/v1/messages"
181
+
182
+
183
def _openai_headers(provider: ProviderConfig) -> dict[str, str]:
    """Build request headers for an OpenAI-compatible upstream.

    The ``Authorization: Bearer`` header is included only when
    ``resolve_api_key`` yields a truthy key for ``provider.api_key_env``.
    """
    token = resolve_api_key(provider.api_key_env)
    auth = {"Authorization": f"Bearer {token}"} if token else {}
    return {
        "Content-Type": "application/json",
        "User-Agent": "CodeRouter-doctor/0.7",
        **auth,
    }
189
+
190
+
191
def _anthropic_headers(provider: ProviderConfig) -> dict[str, str]:
    """Build request headers for an Anthropic-native upstream.

    Always sends the ``anthropic-version`` header. The ``x-api-key``
    header is included only when ``resolve_api_key`` yields a truthy key
    for ``provider.api_key_env``.
    """
    token = resolve_api_key(provider.api_key_env)
    auth = {"x-api-key": token} if token else {}
    return {
        "Content-Type": "application/json",
        "User-Agent": "CodeRouter-doctor/0.7",
        "anthropic-version": "2023-06-01",
        **auth,
    }
201
+
202
+
203
async def _http_post_json(
    url: str,
    *,
    headers: dict[str, str],
    body: dict[str, Any],
    timeout: float,
) -> tuple[int | None, dict[str, Any] | None, str]:
    """POST ``body`` as JSON and return ``(status, parsed, raw_text)``.

    * ``status is None`` signals a transport-level failure (any
      ``httpx.HTTPError`` raised while sending); the third element then
      carries ``"transport error: ..."`` instead of a response body.
    * ``parsed is None`` with a non-None status means the response body
      was not a JSON *object*: either it failed to decode, or it decoded
      to a list / string / number / null. Callers treat ``parsed`` as a
      mapping, so anything else is reported as unparseable rather than
      handed through.
    * Otherwise ``parsed`` is the decoded object and ``raw_text`` the
      response text.
    """
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=body, headers=headers)
    except httpx.HTTPError as exc:
        return None, None, f"transport error: {exc}"

    try:
        parsed = resp.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, so this covers
        # both malformed JSON and undecodable bytes.
        return resp.status_code, None, resp.text

    if not isinstance(parsed, dict):
        # Valid JSON but not an object: honor the declared return type.
        return resp.status_code, None, resp.text
    return resp.status_code, parsed, resp.text
227
+
228
+
229
async def _http_stream_sse(
    url: str,
    *,
    headers: dict[str, str],
    body: dict[str, Any],
    timeout: float,
) -> tuple[int | None, list[dict[str, Any]], bool, str]:
    """POST a streaming request and consume the SSE response.

    Returns ``(status, chunks, saw_done, error_text)``:

    * ``status is None`` signals a transport-level failure (any
      ``httpx.HTTPError``); ``error_text`` carries the reason.
    * ``chunks`` holds the JSON objects decoded from ``data: <json>``
      lines, in arrival order. The ``[DONE]`` terminator, malformed
      payloads, and payloads that decode to something other than a JSON
      object are all left out, so every element is a ``dict``.
    * ``saw_done`` is True iff a ``data: [DONE]`` line was observed.
    * On an HTTP error (status >= 400) the body is read once and its
      first 400 characters are returned in ``error_text``; ``chunks`` is
      empty.
    """
    try:
        async with (
            httpx.AsyncClient(timeout=timeout) as client,
            client.stream("POST", url, json=body, headers=headers) as resp,
        ):
            status = resp.status_code
            if status >= 400:
                raw = await resp.aread()
                return (
                    status,
                    [],
                    False,
                    raw.decode("utf-8", errors="replace")[:400],
                )

            chunks: list[dict[str, Any]] = []
            saw_done = False
            async for line in resp.aiter_lines():
                # Blank separators, ":" comments and non-data fields all
                # fail this prefix test and are ignored.
                if not line.startswith("data:"):
                    continue
                payload = line[len("data:") :].strip()
                if payload == "[DONE]":
                    saw_done = True
                    continue
                try:
                    event = json.loads(payload)
                except json.JSONDecodeError:
                    continue  # skip malformed chunks, keep consuming
                if isinstance(event, dict):
                    # Only JSON objects satisfy the declared chunk type.
                    chunks.append(event)
            return status, chunks, saw_done, ""
    except httpx.HTTPError as exc:
        return None, [], False, f"transport error: {exc}"
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Patch emitters
289
+ #
290
+ # Kept as tiny helpers rather than a Jinja dance — the surface area is too
291
+ # small to justify templating, and exact indentation in the emitted YAML
292
+ # matters for copy-paste fidelity.
293
+ # ---------------------------------------------------------------------------
294
+
295
+
296
+ def _patch_providers_yaml_capability(provider_name: str, key: str, value: bool) -> str:
297
+ """Emit a providers.yaml patch that flips ``capabilities.<key>``."""
298
+ val = "true" if value else "false"
299
+ return (
300
+ "# providers.yaml — update the entry for "
301
+ f"{provider_name!r}:\n"
302
+ "providers:\n"
303
+ f" - name: {provider_name}\n"
304
+ " # ... existing fields ...\n"
305
+ " capabilities:\n"
306
+ f" {key}: {val}\n"
307
+ )
308
+
309
+
310
+ def _patch_model_capabilities_yaml(*, match: str, kind: str, key: str, value: bool) -> str:
311
+ """Emit a model-capabilities.yaml rule that declares ``<key>=<value>``."""
312
+ val = "true" if value else "false"
313
+ return (
314
+ "# ~/.coderouter/model-capabilities.yaml — append under `rules:`:\n"
315
+ "rules:\n"
316
+ f" - match: {match!r}\n"
317
+ f" kind: {kind}\n"
318
+ " capabilities:\n"
319
+ f" {key}: {val}\n"
320
+ )
321
+
322
+
323
+ def _patch_providers_yaml_output_filters(provider_name: str, filters: list[str]) -> str:
324
+ """v1.0-A: Emit a providers.yaml patch adding/extending ``output_filters``.
325
+
326
+ Lists the filters verbatim so copy-paste yields a valid YAML list.
327
+ The comment block above the stanza hints that this is additive with
328
+ any existing filter chain — users with a bespoke chain should merge
329
+ rather than replace.
330
+ """
331
+ items = "\n".join(f" - {f}" for f in filters)
332
+ return (
333
+ "# providers.yaml — update the entry for "
334
+ f"{provider_name!r} (merge if a chain already exists):\n"
335
+ "providers:\n"
336
+ f" - name: {provider_name}\n"
337
+ " # ... existing fields ...\n"
338
+ " output_filters:\n"
339
+ f"{items}\n"
340
+ )
341
+
342
+
343
+ def _patch_providers_yaml_num_ctx(provider_name: str, desired_ctx: int = 32768) -> str:
344
+ """v1.0-B: Emit a providers.yaml patch setting ``extra_body.options.num_ctx``.
345
+
346
+ The path is Ollama-specific: ``extra_body`` is shallow-merged into the
347
+ outbound body by the openai_compat adapter, and Ollama exposes context
348
+ length via a nested ``options`` object. 32768 is a practical default
349
+ for Claude Code's tool-heavy system prompts (see plan.md §9.4 symptom
350
+ #1) — operators can dial it down for memory-bound hosts.
351
+ """
352
+ return (
353
+ "# providers.yaml — update the entry for "
354
+ f"{provider_name!r} (merge into any existing extra_body):\n"
355
+ "providers:\n"
356
+ f" - name: {provider_name}\n"
357
+ " # ... existing fields ...\n"
358
+ " extra_body:\n"
359
+ " options:\n"
360
+ f" num_ctx: {desired_ctx}\n"
361
+ )
362
+
363
+
364
+ def _patch_providers_yaml_num_predict(provider_name: str, desired_predict: int = 4096) -> str:
365
+ """v1.0-C: Emit a providers.yaml patch setting ``extra_body.options.num_predict``.
366
+
367
+ Sibling of :func:`_patch_providers_yaml_num_ctx` — same ``extra_body.options``
368
+ path, but controls the **output-side** token cap rather than the input-side
369
+ window. Ollama's default for ``num_predict`` is -1 (unlimited) in recent
370
+ builds, but older builds and some Ollama-compat servers cap at 128 or 256
371
+ which silently truncates Claude Code's longer completions mid-response.
372
+ 4096 is a practical cap that covers ~95 % of Claude Code completions
373
+ without risking runaway generations; operators can set to -1 for uncapped.
374
+ """
375
+ return (
376
+ "# providers.yaml — update the entry for "
377
+ f"{provider_name!r} (merge into any existing extra_body):\n"
378
+ "providers:\n"
379
+ f" - name: {provider_name}\n"
380
+ " # ... existing fields ...\n"
381
+ " extra_body:\n"
382
+ " options:\n"
383
+ f" num_predict: {desired_predict}\n"
384
+ )
385
+
386
+
387
+ # ---------------------------------------------------------------------------
388
+ # Probes
389
+ # ---------------------------------------------------------------------------
390
+
391
+
392
+ # v1.0-B: num_ctx probe constants.
393
+ #
394
+ # We embed a short, unusual canary token at the very beginning of the user
395
+ # prompt, follow it with enough filler sentences to exceed Ollama's default
396
+ # 2048-token context window, and ask the model to echo the canary back.
397
+ # Because Ollama silently drops the BEGINNING of the prompt when it
398
+ # overflows `num_ctx` (not the end), a model running at the default cannot
399
+ # know what the canary was and fails to echo it. When the operator has
400
+ # correctly bumped `num_ctx` via ``extra_body.options.num_ctx``, the canary
401
+ # survives and the model replies with it.
402
+ #
403
+ # The padding sentence is ~16 tokens; 300 repeats ≈ 4800 tokens — well
404
+ # beyond 2048 yet still cheap enough to issue once per doctor invocation.
405
+ # ZEBRA-MOON-847 is chosen to be hyphenated and all-caps so it does not
406
+ # appear in natural text; the model cannot produce it without having seen
407
+ # it in the prompt.
408
+ _NUM_CTX_PROBE_CANARY = "ZEBRA-MOON-847"
409
+ _NUM_CTX_PROBE_PADDING_SENTENCE = (
410
+ "The quick brown fox jumps over the lazy dog near the river bank today. "
411
+ )
412
+ _NUM_CTX_PROBE_PADDING_REPEATS = 300
413
+ # Threshold below which a declared ``num_ctx`` is still considered "too
414
+ # tight for Claude Code's tool-heavy prompts" — the Claude Code system
415
+ # prompt + tool roster alone is routinely north of 15k tokens. 8192 leaves
416
+ # headroom for small user messages without enabling a corner case where
417
+ # the probe happens to fit (our padding is only ~5k tokens) but a real
418
+ # Claude Code session still truncates.
419
+ _NUM_CTX_ADEQUATE_THRESHOLD = 8192
420
+
421
+ # v1.0-C: streaming probe constants.
422
+ #
423
+ # A short, deterministic task that forces the model to emit ~60-80 output
424
+ # chars in a predictable shape. Counting 1..30 one-per-line yields "1\n2\n
425
+ # ...30\n" = ~80 chars; any cap below the prompt's intent shows up as a
426
+ # ``finish_reason: length`` with heavily-truncated content. The prompt is
427
+ # kept well under ``num_ctx`` so a stray ``num_ctx`` issue does not
428
+ # masquerade as a ``num_predict`` issue (num_ctx probe runs first anyway).
429
+ _STREAMING_PROBE_USER_PROMPT = (
430
+ "Count from 1 to 30, one number per line. Output only the numbers, nothing else."
431
+ )
432
+ # Minimum content length we require to call the stream "not prematurely
433
+ # truncated". "1\n2\n...\n30" is ~80 chars; 40 chars covers the halfway
434
+ # mark (1..20) which is already obviously-truncated territory.
435
+ _STREAMING_PROBE_MIN_EXPECTED_CHARS = 40
436
+ # Default ``num_predict`` suggested in the emitted patch. -1 would be
437
+ # optimal (uncapped) but "4096" communicates intent more clearly to
438
+ # operators unfamiliar with Ollama's sentinel value, and covers Claude
439
+ # Code completions comfortably while still protecting against runaway
440
+ # generations on broken models.
441
+ _STREAMING_PROBE_NUM_PREDICT_DEFAULT = 4096
442
+
443
+
444
+ def _is_ollama_like(provider: ProviderConfig) -> bool:
445
+ """Return True iff num_ctx truncation is plausible for this provider.
446
+
447
+ Two signals fire:
448
+ * base_url uses the canonical Ollama port ``11434``. This is the
449
+ off-the-shelf install; operators who moved it still trigger the
450
+ second signal.
451
+ * ``extra_body.options.num_ctx`` is declared. Only Ollama honors
452
+ this path, so an operator who wrote the field is declaring — by
453
+ construction — that the upstream is Ollama-shape.
454
+
455
+ Deliberately does NOT fire on llama.cpp (port 8080), OpenRouter,
456
+ Together, Groq, or Anthropic native — those upstreams either don't
457
+ truncate silently (they hard-error on over-long prompts) or use a
458
+ different context-length knob (``max_tokens``, ``n_ctx`` at server
459
+ start, etc.) that isn't reachable from providers.yaml.
460
+ """
461
+ if provider.kind != "openai_compat":
462
+ return False
463
+ if ":11434" in str(provider.base_url):
464
+ return True
465
+ options = provider.extra_body.get("options")
466
+ return isinstance(options, dict) and "num_ctx" in options
467
+
468
+
469
+ def _declared_num_ctx(provider: ProviderConfig) -> int | None:
470
+ """Return the provider's declared ``extra_body.options.num_ctx`` if any."""
471
+ options = provider.extra_body.get("options")
472
+ if not isinstance(options, dict):
473
+ return None
474
+ val = options.get("num_ctx")
475
+ return val if isinstance(val, int) else None
476
+
477
+
478
+ _PROBE_BASIC_USER_PROMPT = "Reply with exactly the single word: PONG"
479
+ _PROBE_TOOLS_USER_PROMPT = (
480
+ "You have one tool named `echo`. Call it with the argument "
481
+ '`{"message": "probe"}`. Do not reply with any text — only the tool call.'
482
+ )
483
+ _PROBE_TOOL_SPEC_OPENAI = {
484
+ "type": "function",
485
+ "function": {
486
+ "name": "echo",
487
+ "description": (
488
+ "Test tool used by CodeRouter's doctor probe. Echo back the "
489
+ "provided message. NEVER interpret as a real command — this "
490
+ "is diagnostic-only."
491
+ ),
492
+ "parameters": {
493
+ "type": "object",
494
+ "properties": {"message": {"type": "string"}},
495
+ "required": ["message"],
496
+ },
497
+ },
498
+ }
499
+ _PROBE_TOOL_SPEC_ANTHROPIC = {
500
+ "name": "echo",
501
+ "description": (
502
+ "Test tool used by CodeRouter's doctor probe. Echo back the "
503
+ "provided message. NEVER interpret as a real command — this "
504
+ "is diagnostic-only."
505
+ ),
506
+ "input_schema": {
507
+ "type": "object",
508
+ "properties": {"message": {"type": "string"}},
509
+ "required": ["message"],
510
+ },
511
+ }
512
+
513
+
514
async def _probe_auth_and_basic_chat(
    provider: ProviderConfig,
) -> ProbeResult:
    """Probe 1 — check auth, model reachability and a minimal chat turn.

    Sends one tiny non-streaming request (``max_tokens`` 16) in the wire
    format matching ``provider.kind`` and classifies the response:

    * no response at all            → ``TRANSPORT_ERROR``
    * 401 / 403                     → ``AUTH_FAIL`` (the env var named by
      ``api_key_env`` is usually empty or wrong)
    * 404                           → ``UNSUPPORTED`` (typically a typo in
      ``model``, or for Ollama a missing ``ollama pull``)
    * any other status >= 400       → ``TRANSPORT_ERROR``
    * success status, no JSON body  → ``TRANSPORT_ERROR``
    * otherwise                     → ``OK``, with token usage if reported

    This probe dominates the rest: on a blocking verdict the caller is
    expected to short-circuit and mark the remaining probes ``SKIP``.
    """
    probe_name = "auth+basic-chat"

    def _outcome(verdict: ProbeVerdict, detail: str) -> ProbeResult:
        return ProbeResult(name=probe_name, verdict=verdict, detail=detail)

    # Both wire formats share these fields; only the OpenAI shape adds
    # an explicit temperature.
    body: dict[str, Any] = {
        "model": provider.model,
        "messages": [{"role": "user", "content": _PROBE_BASIC_USER_PROMPT}],
        "max_tokens": 16,
    }
    if provider.kind == "anthropic":
        url = _anthropic_messages_url(provider)
        headers = _anthropic_headers(provider)
    else:
        url = _openai_chat_url(provider)
        headers = _openai_headers(provider)
        body["temperature"] = 0

    status, parsed, raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    if status is None:
        return _outcome(
            ProbeVerdict.TRANSPORT_ERROR,
            f"could not reach {url}: {raw}",
        )

    if status in (401, 403):
        return _outcome(
            ProbeVerdict.AUTH_FAIL,
            f"upstream returned {status}. Check that env var "
            f"{provider.api_key_env!r} is set "
            "and holds a valid key (plan.md §9.4 symptom #5).",
        )

    if status == 404:
        return _outcome(
            ProbeVerdict.UNSUPPORTED,
            f"upstream returned 404 for model {provider.model!r}. "
            "For Ollama: run `ollama pull "
            f"{provider.model}`. For OpenRouter: verify the model slug "
            "at https://openrouter.ai/models (plan.md §9.4 symptom #4).",
        )

    if status >= 400:
        snippet = (raw or "")[:160]
        return _outcome(
            ProbeVerdict.TRANSPORT_ERROR,
            f"upstream returned {status}: {snippet!r}",
        )

    if parsed is None:
        return _outcome(
            ProbeVerdict.TRANSPORT_ERROR,
            "upstream returned 2xx but body was not JSON",
        )

    # Success — append observed usage when the upstream reported it.
    # OpenAI-style and Anthropic-style usage keys are both recognized.
    usage = parsed.get("usage") or {}
    tokens_in = usage.get("prompt_tokens") or usage.get("input_tokens")
    tokens_out = usage.get("completion_tokens") or usage.get("output_tokens")
    detail = f"{status} OK"
    if tokens_in is not None:
        detail += f" (in={tokens_in}, out={tokens_out})"
    return _outcome(ProbeVerdict.OK, detail)
605
+
606
+
607
+ def _extract_openai_assistant_choice(
608
+ body: dict[str, Any],
609
+ ) -> dict[str, Any] | None:
610
+ choices = body.get("choices")
611
+ if not isinstance(choices, list) or not choices:
612
+ return None
613
+ first = choices[0]
614
+ if not isinstance(first, dict):
615
+ return None
616
+ msg = first.get("message")
617
+ return msg if isinstance(msg, dict) else None
618
+
619
+
620
async def _probe_num_ctx(provider: ProviderConfig) -> ProbeResult:
    """v1.0-B Probe — direct detection of Ollama ``num_ctx`` truncation.

    Addresses plan.md §9.4 symptom #1 (empty or nonsensical replies).
    Before v1.0-B the symptom was only inferred indirectly: a silently
    truncated system prompt tended to yield a tool-unaware reply, which
    the tool_calls probe flagged with a ``capabilities.tools=false``
    patch that did not address the root cause. This probe observes the
    truncation first-hand with a canary echo-back and emits the
    ``extra_body.options.num_ctx`` patch instead.

    Mechanism:
      * Put the canary (``ZEBRA-MOON-847``) at the very start of the prompt.
      * Follow it with ~5k tokens of filler to overflow Ollama's default
        2048-token window.
      * Close by asking the model to echo the canary.
      * Merge ``provider.extra_body`` into the request so any declared
        ``options.num_ctx`` is actually exercised.

    Verdicts:
      * canary echoed (any declaration state)  → OK; the detail text
        distinguishes "declared and adequate", "not declared" and
        "declared below the threshold".
      * canary missing, num_ctx not declared   → NEEDS_TUNING, patch adds 32768.
      * canary missing, declared < threshold   → NEEDS_TUNING, patch bumps to 32768.
      * canary missing, declared ≥ threshold   → NEEDS_TUNING with a note
        about intrinsic model limits.

    Providers that are not Ollama-shaped (see ``_is_ollama_like``) and
    requests that fail upstream both yield SKIP.
    """
    probe_name = "num_ctx"

    def _outcome(
        verdict: ProbeVerdict, detail: str, *, with_patch: bool = False
    ) -> ProbeResult:
        # Every tuning verdict of this probe carries the same 32768 patch.
        if not with_patch:
            return ProbeResult(name=probe_name, verdict=verdict, detail=detail)
        return ProbeResult(
            name=probe_name,
            verdict=verdict,
            detail=detail,
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_num_ctx(provider.name, 32768),
        )

    if not _is_ollama_like(provider):
        return _outcome(
            ProbeVerdict.SKIP,
            "not applicable — provider does not look Ollama-shape "
            "(base_url is not on port 11434 and no "
            "`extra_body.options.num_ctx` is declared).",
        )

    filler = _NUM_CTX_PROBE_PADDING_SENTENCE * _NUM_CTX_PROBE_PADDING_REPEATS
    user_prompt = "".join(
        (
            f"CANARY: {_NUM_CTX_PROBE_CANARY}\n\n",
            filler,
            "\n\nQuestion: What exact canary token appeared at the very "
            "beginning of this message? Reply with only the canary token "
            "itself, nothing else.",
        )
    )

    # This is the only probe that folds in the provider's extra_body: the
    # point is to exercise whatever ``options.num_ctx`` the operator has
    # declared. Request fields are listed last so they override any
    # same-named key from extra_body.
    body: dict[str, Any] = {
        **provider.extra_body,
        "model": provider.model,
        "messages": [{"role": "user", "content": user_prompt}],
        "max_tokens": 32,
        "temperature": 0,
    }

    status, parsed, _raw = await _http_post_json(
        _openai_chat_url(provider),
        headers=_openai_headers(provider),
        body=body,
        timeout=provider.timeout_s,
    )
    if status is None or status >= 400 or parsed is None:
        return _outcome(
            ProbeVerdict.SKIP,
            f"skipped (upstream status={status!r}).",
        )

    message = _extract_openai_assistant_choice(parsed)
    reply = message.get("content") if isinstance(message, dict) else None
    reply_text = reply if isinstance(reply, str) else ""
    canary_echoed = _NUM_CTX_PROBE_CANARY in reply_text
    declared = _declared_num_ctx(provider)
    prompt_chars = len(user_prompt)

    if canary_echoed:
        if declared is None:
            detail = (
                f"canary echoed at ~{prompt_chars} chars; upstream "
                "accepted the full prompt without truncation "
                "(no `options.num_ctx` declared — the Ollama default is "
                "2048 so this is unusual; treat as informational)."
            )
        elif declared >= _NUM_CTX_ADEQUATE_THRESHOLD:
            detail = (
                f"canary echoed at ~{prompt_chars} chars of prompt; "
                f"declared num_ctx={declared} is adequate "
                f"(≥ {_NUM_CTX_ADEQUATE_THRESHOLD})."
            )
        else:
            # Declared below the threshold, yet the canary survived.
            # Either Ollama overrode the low declaration or the prompt
            # simply fit. The declared value is surfaced so operators can
            # tell this apart from a config-loading failure.
            detail = (
                f"canary echoed at ~{prompt_chars} chars; upstream "
                "accepted the full prompt despite declared num_ctx="
                f"{declared} (below the {_NUM_CTX_ADEQUATE_THRESHOLD}-token "
                "threshold). Either the prompt fit anyway or Ollama "
                "ignored the declared value — check `ollama ps` for the "
                "session's loaded context and consider `ollama stop "
                f"{provider.model}` before probing to force a cold reload."
            )
        return _outcome(ProbeVerdict.OK, detail)

    # Canary missing → the prompt was truncated somewhere upstream.
    if declared is None:
        detail = (
            f"canary {_NUM_CTX_PROBE_CANARY!r} missing from reply — "
            "upstream truncated the prompt. No `extra_body.options.num_ctx` "
            "is declared, so Ollama is running at its 2048-token default, "
            "which cannot hold Claude Code's system + tool prompts "
            "(plan.md §9.4 symptom #1)."
        )
    elif declared < _NUM_CTX_ADEQUATE_THRESHOLD:
        detail = (
            f"canary missing — declared num_ctx={declared} is below "
            f"the {_NUM_CTX_ADEQUATE_THRESHOLD}-token threshold needed "
            "for Claude Code prompts. Bump it (plan.md §9.4 symptom #1)."
        )
    else:
        # Declared high but still truncated: observed behavior does not
        # match the declaration, so the operator should verify.
        detail = (
            f"canary missing even with num_ctx={declared} declared. The "
            "model's intrinsic context limit may be shorter than the "
            "declared value, or the upstream is silently capping it — "
            "verify with the model card / server logs. The suggested "
            "patch still emits 32768 as a starting point; dial down if "
            "the host is memory-constrained."
        )
    return _outcome(ProbeVerdict.NEEDS_TUNING, detail, with_patch=True)
800
+
801
+
802
def _collect_stream_output(chunks: list[dict[str, Any]]) -> tuple[str, str | None]:
    """Fold parsed SSE chunks into ``(content, finish_reason)``.

    Every string ``choices[].delta.content`` piece is concatenated in
    arrival order, and the last non-empty string ``finish_reason`` seen
    wins. Malformed entries (non-list ``choices``, non-dict choice or
    delta) are skipped instead of raising.
    """
    pieces: list[str] = []
    finish_reason: str | None = None
    for chunk in chunks:
        choices = chunk.get("choices")
        if not isinstance(choices, list):
            continue
        for choice in choices:
            if not isinstance(choice, dict):
                continue
            delta = choice.get("delta")
            if isinstance(delta, dict):
                piece = delta.get("content")
                if isinstance(piece, str):
                    pieces.append(piece)
            reason = choice.get("finish_reason")
            if isinstance(reason, str) and reason:
                finish_reason = reason
    return "".join(pieces), finish_reason


async def _probe_streaming(provider: ProviderConfig) -> ProbeResult:
    """v1.0-C probe — integrity of the streaming completion path.

    Output-side sibling of the ``num_ctx`` probe (plan.md §9.4 symptom
    #1): where that probe looks for silent **prompt** truncation, this one
    looks for silent **completion** truncation — a stream that closes
    early with ``finish_reason: length``, most likely because of a low
    ``options.num_predict``. It also reports an upstream that answers 2xx
    but delivers no SSE chunks at all (``stream: true`` ignored).

    Gating
    ------
    Runs only when :func:`_is_ollama_like` accepts the provider (port
    11434 or a declared ``extra_body.options.num_ctx``). Other
    openai_compat upstreams expose no ``num_predict`` knob reachable from
    ``providers.yaml``, so a patch would not be actionable; Anthropic
    native streaming uses a different event format and is not covered.

    Verdicts
    --------
    * not Ollama-shaped                      → SKIP
    * transport error / 401 / 403 / ≥ 400    → SKIP (auth probe dominates)
    * 2xx with zero chunks                   → NEEDS_TUNING, advisory only
    * ``finish_reason == "length"`` and fewer than
      ``_STREAMING_PROBE_MIN_EXPECTED_CHARS`` characters
                                             → NEEDS_TUNING + num_predict patch
    * anything else                          → OK; a note is appended when
      no ``[DONE]`` terminator was seen

    Args:
        provider: Provider entry to probe.

    Returns:
        A ``ProbeResult`` named ``"streaming"``.
    """
    if not _is_ollama_like(provider):
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=(
                "not applicable — streaming-path truncation detection is "
                "Ollama-shape-gated (same signal as num_ctx probe: port "
                "11434 or declared `extra_body.options.num_ctx`). Cloud "
                "openai_compat upstreams do not expose an actionable "
                "`num_predict` knob from providers.yaml."
            ),
        )

    url = _openai_chat_url(provider)
    headers = _openai_headers(provider)
    # Start from the provider's extra_body so a declared
    # ``options.num_predict`` actually takes effect while probing; the
    # probe's own top-level fields win on key collisions.
    body: dict[str, Any] = dict(provider.extra_body)
    body.update(
        {
            "model": provider.model,
            "messages": [{"role": "user", "content": _STREAMING_PROBE_USER_PROMPT}],
            "max_tokens": 128,
            "temperature": 0,
            "stream": True,
        }
    )

    status, chunks, saw_done, err = await _http_stream_sse(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    if status is None:
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (transport error during streaming: {err}).",
        )
    if status in (401, 403):
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=(
                f"skipped (upstream status={status} during streaming); "
                "auth probe already reported this."
            ),
        )
    if status >= 400:
        # The helper may hand back no error text for an HTTP-level failure;
        # slicing ``None`` would turn a SKIP into a crash.
        err_excerpt = (err or "")[:160]
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (upstream status={status}): {err_excerpt!r}",
        )

    if not chunks:
        # 2xx arrived but no SSE chunk did: `stream: true` was probably
        # dropped, or the reply was single-shot JSON / non-SSE framing.
        # There is no ``extra_body`` patch for this, so only report it.
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "upstream returned 2xx but emitted no streaming chunks. "
                "`stream: true` was likely ignored, or the SSE framing is "
                "non-standard (no `data:` prefix / content-type != "
                "`text/event-stream`). Verify with "
                "`curl -N -H 'Accept: text/event-stream'` before relying "
                "on streaming from Claude Code."
            ),
        )

    content, finish_reason = _collect_stream_output(chunks)

    if finish_reason == "length" and len(content) < _STREAMING_PROBE_MIN_EXPECTED_CHARS:
        # Premature cap. The provider is already known to be Ollama-shaped,
        # so the remediation is always the ``extra_body.options.num_predict``
        # bump.
        return ProbeResult(
            name="streaming",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                f"stream closed with `finish_reason='length'` after only "
                f"{len(content)} chars (expected ≥ "
                f"{_STREAMING_PROBE_MIN_EXPECTED_CHARS}). Upstream is "
                "capping output — most likely `options.num_predict`. "
                "Bump it via `extra_body` (plan.md §9.4 symptom #1 "
                "streaming variant)."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_num_predict(
                provider.name, _STREAMING_PROBE_NUM_PREDICT_DEFAULT
            ),
        )

    # The stream completed. A missing `[DONE]` is only informational, so it
    # is appended to the OK detail rather than changing the verdict.
    done_note = (
        ""
        if saw_done
        else (
            " (no explicit `[DONE]` terminator observed — most clients "
            "tolerate this but strict SSE parsers may stall)"
        )
    )
    return ProbeResult(
        name="streaming",
        verdict=ProbeVerdict.OK,
        detail=(
            f"stream completed: {len(chunks)} chunks, {len(content)} "
            f"chars, finish_reason={finish_reason!r}{done_note}."
        ),
    )
988
+
989
+
990
async def _probe_tool_calls(
    provider: ProviderConfig,
    resolved: ResolvedCapabilities,
) -> ProbeResult:
    """Probe 2 — does the model emit a native tool-call structure?

    Sends one tool-bearing request (Anthropic messages shape for
    ``kind: anthropic``, OpenAI chat shape otherwise) and compares what
    comes back with the declaration chain
    (``provider.capabilities.tools`` → registry → nothing):

    * Native call present (``tool_use`` block / ``tool_calls`` field):
      OK when tools are declared, otherwise NEEDS_TUNING with a patch
      that sets ``tools: true``.
    * No native call, but ``repair_tool_calls_in_text`` extracts a call
      from the assistant text: NEEDS_TUNING (patch sets ``tools: false``)
      when tools are declared, otherwise OK.
    * Nothing tool-shaped: NEEDS_TUNING (patch sets ``tools: false``)
      when tools are declared, otherwise OK.

    A missing status, an HTTP status ≥ 400 or an unparseable body yields
    SKIP.

    Args:
        provider: Provider entry to probe.
        resolved: Registry-resolved capabilities for the provider's model.

    Returns:
        A ``ProbeResult`` named ``"tool_calls"``.
    """
    is_anthropic = provider.kind == "anthropic"
    if is_anthropic:
        # Anthropic native tools use a different wire shape: a capable
        # model answers with a content block of type "tool_use".
        url = _anthropic_messages_url(provider)
        headers = _anthropic_headers(provider)
        body: dict[str, Any] = {
            "model": provider.model,
            "messages": [
                {"role": "user", "content": _PROBE_TOOLS_USER_PROMPT},
            ],
            "max_tokens": 64,
            "tools": [_PROBE_TOOL_SPEC_ANTHROPIC],
        }
    else:
        url = _openai_chat_url(provider)
        headers = _openai_headers(provider)
        body = {
            "model": provider.model,
            "messages": [
                {"role": "user", "content": _PROBE_TOOLS_USER_PROMPT},
            ],
            "max_tokens": 64,
            "temperature": 0,
            "tools": [_PROBE_TOOL_SPEC_OPENAI],
        }

    status, parsed, _raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    if status is None or status >= 400 or parsed is None:
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.SKIP,
            detail=(
                f"skipped (upstream status={status!r}); run auth probe "
                "first. Probe re-inspects this on the next invocation."
            ),
        )

    native_tool_call = False
    reply_text = ""
    if is_anthropic:
        blocks = parsed.get("content")
        if isinstance(blocks, list):
            native_tool_call = any(
                isinstance(block, dict) and block.get("type") == "tool_use"
                for block in blocks
            )
            reply_text = " ".join(
                str(block.get("text", ""))
                for block in blocks
                if isinstance(block, dict) and block.get("type") == "text"
            )
    else:
        msg = _extract_openai_assistant_choice(parsed)
        if msg is not None:
            native_tool_call = bool(msg.get("tool_calls"))
            content = msg.get("content")
            if isinstance(content, str):
                reply_text = content

    # Repair runs on the complete assistant text. Cutting it to a short
    # sample first can slice a tool-call JSON object in half, which makes
    # it unparseable and misreports the model as "nothing tool-shaped".
    text_json_tool_call = False
    if not native_tool_call and reply_text:
        _, repaired = repair_tool_calls_in_text(reply_text, ["echo"])
        text_json_tool_call = bool(repaired)

    # Tools count as declared when providers.yaml opts in explicitly or
    # the registry resolves to exactly True; anything else is undeclared.
    declared = provider.capabilities.tools or (resolved.tools is True)

    if native_tool_call:
        if declared:
            return ProbeResult(
                name="tool_calls",
                verdict=ProbeVerdict.OK,
                detail="native `tool_calls` observed; matches declaration.",
            )
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "model emitted native `tool_calls` but neither "
                "providers.yaml nor the registry declares tools=true. "
                "Opt in to unlock tool-bearing prompts."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_capability(provider.name, "tools", True),
        )

    if text_json_tool_call:
        # The model wrote tool JSON into plain text. Repair rescues it at
        # runtime, but it is reported as partial support.
        if declared:
            return ProbeResult(
                name="tool_calls",
                verdict=ProbeVerdict.NEEDS_TUNING,
                detail=(
                    "model wrote tool JSON in assistant text (not native "
                    "`tool_calls`). v0.3-A repair will rescue it at runtime, "
                    "but the declaration implies native support. Either "
                    "update the model to a tool-native build, or downgrade "
                    "the declaration to rely on repair."
                ),
                target_file="providers.yaml",
                suggested_patch=_patch_providers_yaml_capability(provider.name, "tools", False),
            )
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.OK,
            detail=(
                "no native `tool_calls`, but v0.3-A repair extracted tool "
                "JSON from the text — matches declaration tools=false."
            ),
        )

    # Nothing tool-shaped at all.
    if declared:
        return ProbeResult(
            name="tool_calls",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "declaration says tools=true but model produced neither "
                "native `tool_calls` nor repairable tool JSON. Common for "
                "quantized small models (plan.md §9.4 symptom #2)."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_capability(provider.name, "tools", False),
        )
    return ProbeResult(
        name="tool_calls",
        verdict=ProbeVerdict.OK,
        detail="no tool calls, declaration tools=false — consistent.",
    )
1156
+
1157
+
1158
async def _probe_thinking(
    provider: ProviderConfig,
    resolved: ResolvedCapabilities,
) -> ProbeResult:
    """Probe 3 — does the model actually emit a ``thinking`` block?

    Only ``kind: anthropic`` providers are probed, because the request
    field is Anthropic-specific. Any other kind returns SKIP; when such a
    provider sets ``capabilities.thinking: true`` the SKIP detail notes
    that the flag has no effect for that adapter.

    For Anthropic providers a thinking-enabled request is sent and the
    result is compared with the declaration (explicit
    ``capabilities.thinking`` or a registry value of exactly ``True``):

    * HTTP 400 whose body mentions "thinking" → the upstream rejected the
      field: NEEDS_TUNING if declared, OK otherwise.
    * any other failure (no status, status ≥ 400, unparseable body) → SKIP.
    * ``thinking`` block in the response → OK if declared, otherwise
      NEEDS_TUNING with a ``model-capabilities.yaml`` patch.
    * no ``thinking`` block → NEEDS_TUNING if declared, otherwise OK.

    Args:
        provider: Provider entry to probe.
        resolved: Registry-resolved capabilities for the provider's model.

    Returns:
        A ``ProbeResult`` named ``"thinking"``.
    """
    if provider.kind != "anthropic":
        if provider.capabilities.thinking:
            return ProbeResult(
                name="thinking",
                verdict=ProbeVerdict.SKIP,
                detail=(
                    "capabilities.thinking=true on an openai_compat "
                    "provider has no effect — the thinking block is lost "
                    "during OpenAI-shape translation. Remove the flag or "
                    "switch kind to `anthropic` if the upstream speaks "
                    "Anthropic wire."
                ),
            )
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.SKIP,
            detail="not applicable (kind=openai_compat).",
        )

    url = _anthropic_messages_url(provider)
    headers = _anthropic_headers(provider)
    # The Messages API requires `max_tokens` to be greater than
    # `thinking.budget_tokens`. A request that violates this is refused
    # with a 400 whose message mentions "thinking", which the rejection
    # check below would misread as "the model does not support thinking".
    # Keep the budget at 1024 and leave 128 tokens for the visible answer.
    thinking_budget = 1024
    body: dict[str, Any] = {
        "model": provider.model,
        "messages": [
            {
                "role": "user",
                "content": "Briefly: what is 2+2? Think step by step first.",
            },
        ],
        "max_tokens": thinking_budget + 128,
        "thinking": {"type": "enabled", "budget_tokens": thinking_budget},
    }
    status, parsed, raw = await _http_post_json(
        url, headers=headers, body=body, timeout=provider.timeout_s
    )

    declared = provider.capabilities.thinking or (resolved.thinking is True)

    if status is None or status >= 400 or parsed is None:
        # A 400 that talks about "thinking" is diagnostic: the upstream
        # refused the field. Every other failure cannot be interpreted.
        rejected = status == 400 and raw is not None and "thinking" in raw.lower()
        if not rejected:
            return ProbeResult(
                name="thinking",
                verdict=ProbeVerdict.SKIP,
                detail=f"skipped (upstream status={status!r}).",
            )
        if declared:
            return ProbeResult(
                name="thinking",
                verdict=ProbeVerdict.NEEDS_TUNING,
                detail=(
                    "upstream rejected `thinking: {type: enabled}` with "
                    "400. Declaration says supported — disable it for "
                    "this provider or refine the registry rule."
                ),
                target_file="providers.yaml",
                suggested_patch=_patch_providers_yaml_capability(provider.name, "thinking", False),
            )
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.OK,
            detail="upstream rejects thinking; matches declaration.",
        )

    # Look for a `thinking` block in the response content array.
    blocks = parsed.get("content")
    emitted = isinstance(blocks, list) and any(
        isinstance(block, dict) and block.get("type") == "thinking" for block in blocks
    )

    if emitted and declared:
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.OK,
            detail="thinking block emitted; matches declaration.",
        )
    if emitted:
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "thinking block emitted but declaration is silent. "
                "Declare support to let the capability gate route to "
                "this provider for thinking-bearing requests."
            ),
            target_file="model-capabilities.yaml",
            suggested_patch=_patch_model_capabilities_yaml(
                match=provider.model, kind="anthropic", key="thinking", value=True
            ),
        )
    if declared:
        return ProbeResult(
            name="thinking",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                "declaration says thinking supported but response had no "
                "`thinking` block. The upstream may silently drop it; "
                "disable the flag or narrow the registry rule."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_capability(provider.name, "thinking", False),
        )
    return ProbeResult(
        name="thinking",
        verdict=ProbeVerdict.OK,
        detail="no thinking block emitted; matches declaration.",
    )
1288
+
1289
+
1290
async def _probe_reasoning_leak(
    provider: ProviderConfig,
    resolved: ResolvedCapabilities,
) -> ProbeResult:
    """Probe 4 — does the upstream leak reasoning or harness markers?

    Runs only for ``kind: openai_compat``; every other kind returns SKIP.
    The probe reads the raw upstream reply and checks two independent
    leaks:

    A. A non-standard ``reasoning`` key on the assistant message
       (v0.5-C). This is informational: the verdict is OK whether or not
       passthrough is enabled, only the detail text differs.

    B. (v1.0-A) Markers embedded in ``message.content`` — a ``<think>``
       tag or any entry of ``DEFAULT_STOP_MARKERS``. If such a marker is
       observed and the matching filter (``strip_thinking`` /
       ``strip_stop_markers``) is absent from ``provider.output_filters``,
       the verdict is NEEDS_TUNING with a patch listing the missing
       filters.

    B is evaluated first, so a content-embedded leak overrides the
    informational outcome of A.

    Args:
        provider: Provider entry to probe.
        resolved: Registry-resolved capabilities for the provider's model.

    Returns:
        A ``ProbeResult`` named ``"reasoning-leak"``.
    """
    if provider.kind != "openai_compat":
        return ProbeResult(
            name="reasoning-leak",
            verdict=ProbeVerdict.SKIP,
            detail=(
                "not applicable (only openai_compat emits the non-standard "
                "reasoning field; Anthropic content blocks would need a "
                "different probe)."
            ),
        )

    endpoint = _openai_chat_url(provider)
    request_headers = _openai_headers(provider)
    # The prompt nudges thinking-by-default models into producing their
    # reasoning so the content check has something to inspect. A model
    # that ignores the nudge is still checked for the `reasoning` field.
    nudge = "Think step by step about the capital of France, then answer in one word."
    request = {
        "model": provider.model,
        "messages": [{"role": "user", "content": nudge}],
        "max_tokens": 128,
        "temperature": 0,
    }
    status, parsed, _raw = await _http_post_json(
        endpoint, headers=request_headers, body=request, timeout=provider.timeout_s
    )

    if status is None or status >= 400 or parsed is None:
        return ProbeResult(
            name="reasoning-leak",
            verdict=ProbeVerdict.SKIP,
            detail=f"skipped (upstream status={status!r}).",
        )

    message = _extract_openai_assistant_choice(parsed)
    reasoning_field_present = bool(message and "reasoning" in message)

    # v1.0-A: scan the assistant text; non-string content counts as empty.
    raw_content = message.get("content") if isinstance(message, dict) else None
    text = raw_content if isinstance(raw_content, str) else ""
    think_tag_seen = "<think>" in text
    stop_markers_seen: list[str] = [
        marker for marker in DEFAULT_STOP_MARKERS if marker in text
    ]
    active_filters = set(provider.output_filters)

    # Recommend only the filters that would have caught this observation
    # and are not already configured.
    missing_filters: list[str] = []
    if think_tag_seen and "strip_thinking" not in active_filters:
        missing_filters.append("strip_thinking")
    if stop_markers_seen and "strip_stop_markers" not in active_filters:
        missing_filters.append("strip_stop_markers")

    if missing_filters:
        # Dominant signal: describe everything observed, not only what
        # is missing, then attach the patch.
        observed: list[str] = []
        if think_tag_seen:
            observed.append("<think>...</think>")
        if stop_markers_seen:
            quoted = ", ".join(repr(marker) for marker in stop_markers_seen)
            observed.append("stop markers " + quoted)
        observed_text = " + ".join(observed)

        return ProbeResult(
            name="reasoning-leak",
            verdict=ProbeVerdict.NEEDS_TUNING,
            detail=(
                f"content-embedded leak detected ({observed_text}"
                "). v1.0-A `output_filters` would scrub this; current "
                f"provider chain = {sorted(active_filters)}. Recommended: "
                f"add {missing_filters}."
            ),
            target_file="providers.yaml",
            suggested_patch=_patch_providers_yaml_output_filters(
                provider.name, missing_filters
            ),
        )

    passthrough_on = (
        provider.capabilities.reasoning_passthrough or resolved.reasoning_passthrough is True
    )

    if not reasoning_field_present:
        return ProbeResult(
            name="reasoning-leak",
            verdict=ProbeVerdict.OK,
            detail=(
                "no `reasoning` field observed and no content-embedded markers — nothing to strip."
            ),
        )
    if passthrough_on:
        return ProbeResult(
            name="reasoning-leak",
            verdict=ProbeVerdict.OK,
            detail=(
                "upstream emits `reasoning`; passthrough is on, so the "
                "field reaches clients as intended."
            ),
        )
    # Default behaviour: the adapter strips the field, which is expected.
    # Still OK; the detail tells the operator where `capability-degraded`
    # log lines for this provider come from.
    return ProbeResult(
        name="reasoning-leak",
        verdict=ProbeVerdict.OK,
        detail=(
            "upstream emits non-standard `reasoning`; v0.5-C adapter "
            "strips it before it reaches the client (expected — "
            "expect `capability-degraded` log lines for this provider)."
        ),
    )
1437
+
1438
+
1439
+ # ---------------------------------------------------------------------------
1440
+ # Orchestration
1441
+ # ---------------------------------------------------------------------------
1442
+
1443
+
1444
async def check_model(
    config: CodeRouterConfig,
    provider_name: str,
    *,
    registry: CapabilityRegistry | None = None,
) -> DoctorReport:
    """Run every probe against ``provider_name`` and collect a report.

    The auth probe always runs first. Unless its verdict is OK, the five
    remaining probes are recorded as SKIP without contacting the
    upstream, so the report still lists them.

    Args:
        config: Loaded configuration that contains the provider.
        provider_name: Name of the provider entry to check.
        registry: Capability registry to resolve against. When ``None``
            the result of ``get_default_registry()`` is used.

    Returns:
        A ``DoctorReport`` whose ``results`` are in execution order:
        auth, num_ctx, tool_calls, thinking, reasoning-leak, streaming.

    Raises:
        KeyError: ``provider_name`` is not defined in ``config``; the
            message lists the known provider names.
    """
    try:
        provider = config.provider_by_name(provider_name)
    except KeyError as exc:
        known = sorted(p.name for p in config.providers)
        raise KeyError(
            f"provider {provider_name!r} not found in providers.yaml. "
            f"Known: {known}"
        ) from exc

    if registry is None:
        registry = get_default_registry()
    resolved = registry.lookup(kind=provider.kind, model=provider.model or "")

    report = DoctorReport(
        provider_name=provider_name,
        provider=provider,
        resolved_caps=resolved,
    )

    auth_result = await _probe_auth_and_basic_chat(provider)
    report.results.append(auth_result)

    if auth_result.verdict != ProbeVerdict.OK:
        # Auth dominates. Keep the other probes visible in the report as
        # SKIP instead of spending tokens / API quota on them.
        skipped_names = ("num_ctx", "tool_calls", "thinking", "reasoning-leak", "streaming")
        report.results.extend(
            ProbeResult(
                name=skipped,
                verdict=ProbeVerdict.SKIP,
                detail="skipped — auth probe did not succeed.",
            )
            for skipped in skipped_names
        )
        return report

    # Order matters for how the report reads:
    #   * num_ctx goes first (v1.0-B). A silently truncated prompt often
    #     produces a reply without tool calls, so the truncation verdict
    #     must come before the tools verdict.
    #   * the declaration probes follow.
    #   * streaming goes last (v1.0-C); it is the output-side sibling of
    #     num_ctx and independent of the others.
    report.results.append(await _probe_num_ctx(provider))
    for declaration_probe in (_probe_tool_calls, _probe_thinking, _probe_reasoning_leak):
        report.results.append(await declaration_probe(provider, resolved))
    report.results.append(await _probe_streaming(provider))
    return report
1515
+
1516
+
1517
def run_check_model_sync(
    config: CodeRouterConfig,
    provider_name: str,
    *,
    registry: CapabilityRegistry | None = None,
) -> DoctorReport:
    """Blocking entry point for synchronous callers such as the CLI.

    Drives :func:`check_model` to completion with ``asyncio.run`` and
    returns its report. Arguments are forwarded unchanged.
    """
    probe_run = check_model(config, provider_name, registry=registry)
    return asyncio.run(probe_run)
1525
+
1526
+
1527
+ # ---------------------------------------------------------------------------
1528
+ # Reporting
1529
+ # ---------------------------------------------------------------------------
1530
+
1531
+
1532
# Bracketed label printed next to each probe name in the text report,
# keyed by the probe's verdict.
_VERDICT_BADGE: dict[ProbeVerdict, str] = {
    ProbeVerdict.OK: "[OK]",
    ProbeVerdict.SKIP: "[SKIP]",
    ProbeVerdict.NEEDS_TUNING: "[NEEDS TUNING]",
    ProbeVerdict.UNSUPPORTED: "[UNSUPPORTED]",
    ProbeVerdict.AUTH_FAIL: "[AUTH FAIL]",
    ProbeVerdict.TRANSPORT_ERROR: "[TRANSPORT ERROR]",
}
1540
+
1541
+
1542
def format_report(report: DoctorReport) -> str:
    """Render ``report`` as line-oriented plain text for stdout.

    The output has four parts, in order: a provider header, the declared
    versus registry-resolved capabilities (plus the configured
    ``output_filters``), one entry per probe result (badge, detail lines
    and, when present, the suggested patch), and a summary with the exit
    code from ``exit_code_for``.

    Args:
        report: Completed doctor report to render.

    Returns:
        The report as one newline-joined string with no trailing newline.

    Raises:
        KeyError: A result carries a verdict without a badge, or
            ``exit_code_for`` returns a code other than 0, 1 or 2.
    """
    provider = report.provider
    resolved = report.resolved_caps
    declared = provider.capabilities

    out: list[str] = [
        f"coderouter doctor --check-model {report.provider_name}",
        "─" * 60,
        f"provider: {provider.name}",
        f" kind: {provider.kind}",
        f" base_url: {provider.base_url}",
        f" model: {provider.model}",
        "",
        "Registry + providers.yaml declarations:",
        f" thinking: providers={declared.thinking}, registry={resolved.thinking}",
        f" tools: providers={declared.tools}, registry={resolved.tools}",
        (
            f" reasoning_passthrough: providers={declared.reasoning_passthrough}, "
            f"registry={resolved.reasoning_passthrough}"
        ),
        # v1.0-A: show the active filter chain next to the capabilities.
        f" output_filters: providers={list(provider.output_filters)}",
        "",
        "Probes:",
    ]

    total = len(report.results)
    for position, result in enumerate(report.results, start=1):
        badge = _VERDICT_BADGE[result.verdict]
        out.append(f" [{position}/{total}] {result.name} …… {badge}")
        out.extend(f" {detail_line}" for detail_line in result.detail.splitlines())
        if result.suggested_patch:
            out.append(f" Suggested patch → {result.target_file}:")
            out.extend(f" {patch_line}" for patch_line in result.suggested_patch.splitlines())

    code = exit_code_for(report)
    summaries = {
        0: "all probes match declarations.",
        1: "at least one probe could not run (auth/transport/model).",
        2: "at least one probe needs tuning (see suggested patches).",
    }
    out.append("")
    out.append(f"Summary: {summaries[code]}")
    out.append(f"Exit: {code}")
    return "\n".join(out)
1592
+
1593
+
1594
+ def _probes_by_name(results: Sequence[ProbeResult]) -> dict[str, ProbeResult]:
1595
+ """Small convenience for tests that want to assert on one probe."""
1596
+ return {r.name: r for r in results}