coderouter-cli 1.8.1__tar.gz → 1.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/CHANGELOG.md +60 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/PKG-INFO +1 -1
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/data/model-capabilities.yaml +21 -9
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/doctor.py +88 -6
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/troubleshooting.md +30 -10
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/pyproject.toml +1 -1
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_doctor.py +168 -1
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/.gitignore +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/LICENSE +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/README.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/README.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/__main__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/adapters/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/adapters/anthropic_native.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/adapters/base.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/adapters/openai_compat.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/adapters/registry.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/cli.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/cli_stats.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/config/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/config/capability_registry.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/config/env_file.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/config/loader.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/config/schemas.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/data/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/doctor_apply.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/env_security.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/errors.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/ingress/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/ingress/anthropic_routes.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/ingress/app.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/ingress/dashboard_routes.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/ingress/metrics_routes.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/ingress/openai_routes.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/logging.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/metrics/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/metrics/collector.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/metrics/prometheus.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/output_filters.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/routing/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/routing/auto_router.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/routing/capability.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/routing/fallback.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/translation/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/translation/anthropic.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/translation/convert.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/coderouter/translation/tool_repair.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/assets/dashboard-demo.png +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/designs/v1.5-dashboard-mockup.html +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/designs/v1.6-auto-router-verification.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/designs/v1.6-auto-router.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/free-tier-guide.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/free-tier-guide.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/hf-ollama-models.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/openrouter-roster/README.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/openrouter-roster/latest.json +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/quickstart.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/quickstart.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v0.4.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v0.5-verify.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v0.5.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v0.6.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v0.7.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v1.0-verify.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/retrospectives/v1.0.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/security.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/security.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/troubleshooting.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/usage-guide.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/usage-guide.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/when-do-i-need-coderouter.en.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/docs/when-do-i-need-coderouter.md +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/examples/.env.example +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/examples/providers.auto-custom.yaml +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/examples/providers.auto.yaml +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/examples/providers.note-2026.yaml +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/examples/providers.nvidia-nim.yaml +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/examples/providers.yaml +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/scripts/demo_traffic.sh +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/scripts/openrouter_roster_diff.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/scripts/verify_v0_5.sh +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/scripts/verify_v1_0.sh +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/__init__.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/conftest.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_adapter_anthropic.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_auto_router.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_capability.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_capability_degraded_payload.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_capability_registry.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_claude_code_suitability.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_cli.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_cli_stats.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_config.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_dashboard_endpoint.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_doctor_apply.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_env_file.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_env_security.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_errors.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_examples_yaml.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_fallback.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_fallback_anthropic.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_fallback_cache_control.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_fallback_misconfig_warn.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_fallback_paid_gate.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_fallback_thinking.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_ingress_anthropic.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_ingress_profile.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_metrics_collector.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_metrics_endpoint.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_metrics_jsonl.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_metrics_prometheus.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_openai_compat.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_openrouter_roster_diff.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_output_filters.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_output_filters_adapters.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_reasoning_strip.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_setup_sh.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_tool_repair.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_translation_anthropic.py +0 -0
- {coderouter_cli-1.8.1 → coderouter_cli-1.8.2}/tests/test_translation_reverse.py +0 -0
|
@@ -6,6 +6,66 @@ versioning follows [SemVer](https://semver.org/).
|
|
|
6
6
|
|
|
7
7
|
---
|
|
8
8
|
|
|
9
|
+
## [v1.8.2] — 2026-04-26 (doctor probe を thinking モデル対応に — Gemma 4 偽陽性の解消)
|
|
10
|
+
|
|
11
|
+
**Theme: v1.8.1 リリース直後の深掘りで `doctor` の `num_ctx` / `streaming` probe が thinking モデルに対して偽陽性 NEEDS_TUNING を出していた事実を発見、probe の `max_tokens` バジェットを reasoning トークン消費分込みで設計し直した patch。**
|
|
12
|
+
|
|
13
|
+
v1.8.1 で `coding` profile primary に置いた Gemma 4 26B の doctor 結果が `tool_calls [OK]` + `num_ctx [NEEDS TUNING]` + `streaming [NEEDS TUNING]` で「中途半端に動く」と判定されていたが、実機で curl 直叩きすると **Ollama OpenAI-compat 経由でも 5K トークンの canary echo-back に成功** することが判明。原因切り分けの結果、Gemma 4 が emit する非標準 `reasoning` フィールドが doctor probe の `max_tokens=32` (num_ctx) / `max_tokens=128` (streaming) を**思考トークンで食い切って `content=""` で `finish_reason='length'`** を返していた偽陽性と確定。実機検証 (M3 Max 64GB / Ollama 0.21.2) で Anthropic 互換 `/v1/messages` 経由 Gemma 4 26B が "Hello." を 2 秒で返すことも確認、**Gemma 4 26B は実用 OK** と最終判定。
|
|
14
|
+
|
|
15
|
+
- Tests: 730 → **733** (+3: thinking provider declaration / registry-based / streaming の 3 件)
|
|
16
|
+
- Runtime deps: 5 → 5 (20 sub-release 連続据え置き)
|
|
17
|
+
- Backward compat: 完全互換、`providers.yaml` / `~/.coderouter/model-capabilities.yaml` 編集不要
|
|
18
|
+
|
|
19
|
+
### Changes
|
|
20
|
+
|
|
21
|
+
#### Doctor probe: thinking モデル対応バジェット選択
|
|
22
|
+
|
|
23
|
+
- **`coderouter/doctor.py`**: `_probe_num_ctx` / `_probe_streaming` の `max_tokens` を thinking 検出付きの動的選択に変更。新 helper `_is_reasoning_model(provider, resolved)` が provider declaration / registry resolved の両方から `thinking` / `reasoning_passthrough` の真偽を見て、reasoning モデルのときだけ大きいバジェットを選ぶ。
|
|
24
|
+
- `_NUM_CTX_PROBE_MAX_TOKENS_DEFAULT = 256` (旧 32)、`_NUM_CTX_PROBE_MAX_TOKENS_THINKING = 1024`
|
|
25
|
+
- `_STREAMING_PROBE_MAX_TOKENS_DEFAULT = 512` (旧 128)、`_STREAMING_PROBE_MAX_TOKENS_THINKING = 1024`
|
|
26
|
+
- 非 thinking モデルは natural stop で早期終了するので無駄消費なし、thinking モデルは reasoning trace + 答えが収まる headroom
|
|
27
|
+
|
|
28
|
+
#### Registry: 既知 thinking モデルに `thinking: true` 宣言
|
|
29
|
+
|
|
30
|
+
- **`coderouter/data/model-capabilities.yaml`**: `gemma4:*` / `google/gemma-4*` / `qwen3.6:*` / `qwen/qwen3.6-*` に `thinking: true` を追加。これらは Ollama 経由で `reasoning` フィールドにかなりのトークンを吐く設計と確認済み。registry 経由で渡るので user は `providers.yaml` を触らなくても doctor の thinking バジェットが効く
|
|
31
|
+
- **Qwen3.6 セクションのコメント更新**: v1.8.1 時点で「Ollama silent cap」と書いていた part を「**v1.8.2 で num_ctx / streaming は doctor 偽陽性と判明、tool_calls [NEEDS TUNING] が真の課題として残る**」に整理。`claude_code_suitability` 撤回判断は維持 (Qwen3.6 の tool_calls 不全は thinking 起因ではない別の真の課題)
|
|
32
|
+
|
|
33
|
+
#### Tests
|
|
34
|
+
|
|
35
|
+
- **`tests/test_doctor.py`**: 3 件追加
|
|
36
|
+
- `test_num_ctx_max_tokens_bumped_for_thinking_provider_declaration`: `provider.capabilities.thinking=True` → 1024
|
|
37
|
+
- `test_num_ctx_max_tokens_bumped_when_registry_says_thinking`: provider 宣言なし + registry 宣言あり → 1024
|
|
38
|
+
- `test_streaming_max_tokens_bumped_for_thinking_provider`: streaming probe も同経路で 1024 になる
|
|
39
|
+
- 既存 `test_num_ctx_request_body_merges_extra_body_options` の `max_tokens == 32` assertion を `== 256` に更新 (新 baseline)
|
|
40
|
+
- 既存 `test_streaming_request_body_carries_stream_true_and_merges_extra_body` に `max_tokens == 512` assertion を追加 (streaming baseline)
|
|
41
|
+
|
|
42
|
+
### Why
|
|
43
|
+
|
|
44
|
+
v1.8.1 article 執筆中に「note の流行モデル → ollama pull → 動かない」のうち Gemma 4 だけ `tool_calls [OK]` の **逆転勝利** だったはずが、`num_ctx [NEEDS TUNING]` も出ていて記事として煮え切らない状態だった。深掘りの結果、`/v1/chat/completions` 経由でも options は効く / `ollama ps` で context length 262144 が出る / **でも doctor は失敗** という矛盾を観測。`.choices[0].message.reasoning` フィールドに思考トークンが流れて `max_tokens=32` を消費していた事実を確認、**doctor 側の probe 設計が thinking モデル時代に追いついていない**ことが判明。
|
|
45
|
+
|
|
46
|
+
これは「実機 evidence first」原則 (plan.md §5.4) の更に一段下のメタ教訓:**diagnostic ツール自身も diagnostic され続ける必要がある**。
|
|
47
|
+
|
|
48
|
+
### Migration
|
|
49
|
+
|
|
50
|
+
`pyproject.toml version 1.8.1 → 1.8.2`、`coderouter --version` は 1.8.2 を返す。**手元の `~/.coderouter/providers.yaml` は触らない限り完全に変化なし**。
|
|
51
|
+
|
|
52
|
+
v1.8.1 で Gemma 4 26B を `claude_code_suitability` 抑え目に運用していたユーザーは v1.8.2 で doctor 再実行すると `num_ctx [OK]` + `streaming [OK]` まで通るはず。Qwen3.6 系の `tool_calls [NEEDS TUNING]` は本物 (thinking 起因ではない) なので引き続き coding chain primary には推奨しない。
|
|
53
|
+
|
|
54
|
+
### Files touched
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
M CHANGELOG.md
|
|
58
|
+
M coderouter/data/model-capabilities.yaml
|
|
59
|
+
M coderouter/doctor.py
|
|
60
|
+
M pyproject.toml
|
|
61
|
+
M plan.md
|
|
62
|
+
M docs/troubleshooting.md
|
|
63
|
+
M docs/articles/note-v1-8-1-reality-check.md (or new file v1-8-2)
|
|
64
|
+
M tests/test_doctor.py
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
9
69
|
## [v1.8.1] — 2026-04-26 (実機検証反映 patch — mode_aliases 解決 + Gemma 4 第一候補化 + Ollama 既知問題ドキュメント化)
|
|
10
70
|
|
|
11
71
|
**Theme: v1.8.0 出荷直後の実機検証 (M3 Max 32GB / Ollama 0.21.2) で踏んだ問題 3 件を patch で解消。**
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: coderouter-cli
|
|
3
|
-
Version: 1.8.1
|
|
3
|
+
Version: 1.8.2
|
|
4
4
|
Summary: Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible.
|
|
5
5
|
Project-URL: Homepage, https://github.com/zephel01/CodeRouter
|
|
6
6
|
Project-URL: Repository, https://github.com/zephel01/CodeRouter
|
|
@@ -176,47 +176,59 @@ rules:
|
|
|
176
176
|
# 「Claude Code 代替として最高」「local champ」と評価されている。
|
|
177
177
|
#
|
|
178
178
|
# ただし v1.8.0 までで `claude_code_suitability: ok` を declare していた
|
|
179
|
-
# のは note 記事の伝聞ベースの先回り宣言で、v1.8.1
|
|
180
|
-
#
|
|
181
|
-
# - num_ctx
|
|
182
|
-
#
|
|
179
|
+
# のは note 記事の伝聞ベースの先回り宣言で、v1.8.1 〜 v1.8.2
|
|
180
|
+
# (2026-04-26) の実機検証 (M3 Max 64GB / Ollama 0.21.2) で:
|
|
181
|
+
# - num_ctx と streaming の NEEDS_TUNING は v1.8.2 で thinking モデル
|
|
182
|
+
# 用 probe バジェット拡大により偽陽性と判明 (doctor 側の課題)
|
|
183
183
|
# - tool_calls probe が native tool_calls / 修復可能 JSON のいずれも
|
|
184
|
-
#
|
|
185
|
-
#
|
|
186
|
-
# これらは Ollama 経由特有の問題で、HF / vLLM 直接ロードなら違う可能性。
|
|
187
|
-
# 確証ない以上、`claude_code_suitability` は撤回し `tools` 宣言だけ残す。
|
|
184
|
+
# 返さない真の課題が残る (Qwen3.6 系の Ollama 経由 tool 仕様未成熟)
|
|
185
|
+
# tool_calls 不全が解消されるまで `claude_code_suitability` は撤回。
|
|
188
186
|
# 実機で動いたユーザーは `~/.coderouter/model-capabilities.yaml` で
|
|
189
187
|
# `claude_code_suitability: ok` を上書きできる。
|
|
190
188
|
# ------------------------------------------------------------------
|
|
191
189
|
|
|
190
|
+
# v1.8.2: thinking: true は doctor probe (num_ctx / streaming) が reasoning
|
|
191
|
+
# トークン消費分の max_tokens 余裕を確保するためのヒント。Qwen3 系は
|
|
192
|
+
# /think モードで thinking トークンを吐く設計なので true 宣言。
|
|
192
193
|
- match: "qwen3.6:*"
|
|
193
194
|
kind: openai_compat
|
|
194
195
|
capabilities:
|
|
195
196
|
tools: true
|
|
197
|
+
thinking: true
|
|
196
198
|
|
|
197
199
|
- match: "qwen/qwen3.6-*"
|
|
198
200
|
kind: openai_compat
|
|
199
201
|
capabilities:
|
|
200
202
|
tools: true
|
|
203
|
+
thinking: true
|
|
201
204
|
|
|
202
205
|
# ------------------------------------------------------------------
|
|
203
|
-
# Gemma 4 family (v1.7-B
|
|
206
|
+
# Gemma 4 family (v1.7-B 追加、v1.8.2 で thinking: true 宣言)
|
|
204
207
|
#
|
|
205
208
|
# Google 公式 Gemma 4。Ollama 公式 tag は gemma4:e2b / e4b / 26b / 31b、
|
|
206
209
|
# 全 variant が tools+vision+thinking 対応、E2B/E4B は audio もサポート。
|
|
207
210
|
# MoE (26b は active 3.8B / total 25.2B)。note 記事で「日常・バランスの
|
|
208
211
|
# 王者」と評価。Claude Haiku 互換性に近い簡潔な応答スタイル。
|
|
212
|
+
#
|
|
213
|
+
# v1.8.2 (2026-04-26): 実機検証 (M3 Max 64GB / Ollama 0.21.2 / gemma4:26b)
|
|
214
|
+
# で `reasoning` フィールドにかなりの量のトークンを吐く thinking モデル
|
|
215
|
+
# と確認。doctor probe の max_tokens=32 / 128 が thinking トークンに
|
|
216
|
+
# 食い切られて偽陽性 NEEDS_TUNING を出していた。registry で
|
|
217
|
+
# `thinking: true` を宣言すると doctor が probe バジェットを 1024 まで
|
|
218
|
+
# 引き上げて偽陽性を回避する。
|
|
209
219
|
# ------------------------------------------------------------------
|
|
210
220
|
|
|
211
221
|
- match: "gemma4:*"
|
|
212
222
|
kind: openai_compat
|
|
213
223
|
capabilities:
|
|
214
224
|
tools: true
|
|
225
|
+
thinking: true
|
|
215
226
|
|
|
216
227
|
- match: "google/gemma-4*"
|
|
217
228
|
kind: openai_compat
|
|
218
229
|
capabilities:
|
|
219
230
|
tools: true
|
|
231
|
+
thinking: true
|
|
220
232
|
|
|
221
233
|
# ------------------------------------------------------------------
|
|
222
234
|
# GLM family (Z.AI / Zhipu AI、v1.7-B 追加)
|
|
@@ -433,6 +433,31 @@ _STREAMING_PROBE_USER_PROMPT = (
|
|
|
433
433
|
# truncated". "1\n2\n...\n30" is ~80 chars; 40 chars covers the halfway
|
|
434
434
|
# mark (1..20) which is already obviously-truncated territory.
|
|
435
435
|
_STREAMING_PROBE_MIN_EXPECTED_CHARS = 40
|
|
436
|
+
|
|
437
|
+
# v1.8.2: probe response budgets.
|
|
438
|
+
#
|
|
439
|
+
# Both num_ctx and streaming probes ask the model for a *short* answer
|
|
440
|
+
# (the canary token / "1..30"). The original budgets (32 / 128 tokens)
|
|
441
|
+
# assumed a non-thinking model that emits the answer immediately. On a
|
|
442
|
+
# thinking model — Gemma 4 26B, Qwen3.6, gpt-oss, deepseek-r1 — the
|
|
443
|
+
# upstream burns the entire budget on a hidden ``reasoning`` field
|
|
444
|
+
# *before* emitting any visible ``content``, producing a false-positive
|
|
445
|
+
# NEEDS_TUNING (canary missing / 0 chars streamed). Bumping the budget
|
|
446
|
+
# is the cleanest fix: non-thinking models stop early at their natural
|
|
447
|
+
# stop token (no waste), thinking models get headroom for the reasoning
|
|
448
|
+
# trace plus the actual answer.
|
|
449
|
+
#
|
|
450
|
+
# Numbers picked from the v1.8.1 reality-check session
|
|
451
|
+
# (docs/articles/note-v1-8-1-reality-check.md):
|
|
452
|
+
# * Gemma 4 26B reasoning prefix observed at ~150-300 tokens before
|
|
453
|
+
# content starts → 1024 covers reasoning + 30-line count comfortably.
|
|
454
|
+
# * Non-thinking baseline kept conservative-but-non-tight (256/512) to
|
|
455
|
+
# absorb stylistic preambles ("Sure, the answer is...") without
|
|
456
|
+
# burning extra cloud quota when the operator probes a paid endpoint.
|
|
457
|
+
_NUM_CTX_PROBE_MAX_TOKENS_DEFAULT = 256
|
|
458
|
+
_NUM_CTX_PROBE_MAX_TOKENS_THINKING = 1024
|
|
459
|
+
_STREAMING_PROBE_MAX_TOKENS_DEFAULT = 512
|
|
460
|
+
_STREAMING_PROBE_MAX_TOKENS_THINKING = 1024
|
|
436
461
|
# Default ``num_predict`` suggested in the emitted patch. -1 would be
|
|
437
462
|
# optimal (uncapped) but "4096" communicates intent more clearly to
|
|
438
463
|
# operators unfamiliar with Ollama's sentinel value, and covers Claude
|
|
@@ -475,6 +500,42 @@ def _declared_num_ctx(provider: ProviderConfig) -> int | None:
|
|
|
475
500
|
return val if isinstance(val, int) else None
|
|
476
501
|
|
|
477
502
|
|
|
503
|
+
def _is_reasoning_model(
|
|
504
|
+
provider: ProviderConfig, resolved: ResolvedCapabilities
|
|
505
|
+
) -> bool:
|
|
506
|
+
"""v1.8.2: True iff the model is known to emit a hidden reasoning trace.
|
|
507
|
+
|
|
508
|
+
Thinking models (Gemma 4, Qwen3-with-/think, gpt-oss, deepseek-r1,
|
|
509
|
+
Claude Sonnet 4.5+ in extended-thinking mode) burn output tokens on a
|
|
510
|
+
``reasoning`` field before any visible ``content`` is produced. The
|
|
511
|
+
num_ctx / streaming probes use small response budgets that get fully
|
|
512
|
+
consumed by the reasoning prefix, producing a false-positive
|
|
513
|
+
NEEDS_TUNING. Callers use this to choose a generous probe budget.
|
|
514
|
+
|
|
515
|
+
Three signals fire:
|
|
516
|
+
* provider declared ``capabilities.thinking: true`` in providers.yaml
|
|
517
|
+
* provider declared ``capabilities.reasoning_passthrough: true``
|
|
518
|
+
(the operator opted in to passing the raw reasoning to the client,
|
|
519
|
+
which is only meaningful for models that emit it)
|
|
520
|
+
* registry resolved ``thinking: true`` or ``reasoning_passthrough: true`` for this (kind, model) pair
|
|
521
|
+
|
|
522
|
+
Conservative bias — when both provider declaration and registry are
|
|
523
|
+
silent, treat as non-reasoning. The probe still completes for thinking
|
|
524
|
+
models in that case (they just hit ``finish_reason='length'`` like
|
|
525
|
+
they did pre-v1.8.2), but at least the new generous default budget
|
|
526
|
+
(256 / 512) gives more headroom than the old 32 / 128.
|
|
527
|
+
"""
|
|
528
|
+
if provider.capabilities.thinking is True:
|
|
529
|
+
return True
|
|
530
|
+
if provider.capabilities.reasoning_passthrough is True:
|
|
531
|
+
return True
|
|
532
|
+
if resolved.thinking is True:
|
|
533
|
+
return True
|
|
534
|
+
if resolved.reasoning_passthrough is True:
|
|
535
|
+
return True
|
|
536
|
+
return False
|
|
537
|
+
|
|
538
|
+
|
|
478
539
|
_PROBE_BASIC_USER_PROMPT = "Reply with exactly the single word: PONG"
|
|
479
540
|
_PROBE_TOOLS_USER_PROMPT = (
|
|
480
541
|
"You have one tool named `echo`. Call it with the argument "
|
|
@@ -617,7 +678,9 @@ def _extract_openai_assistant_choice(
|
|
|
617
678
|
return msg if isinstance(msg, dict) else None
|
|
618
679
|
|
|
619
680
|
|
|
620
|
-
async def _probe_num_ctx(provider: ProviderConfig) -> ProbeResult:
|
|
681
|
+
async def _probe_num_ctx(
|
|
682
|
+
provider: ProviderConfig, resolved: ResolvedCapabilities
|
|
683
|
+
) -> ProbeResult:
|
|
621
684
|
"""v1.0-B Probe — direct detection of Ollama ``num_ctx`` truncation.
|
|
622
685
|
|
|
623
686
|
Addresses plan.md §9.4 symptom #1 (空応答 / 意味不明応答). Prior to
|
|
@@ -683,11 +746,21 @@ async def _probe_num_ctx(provider: ProviderConfig) -> ProbeResult:
|
|
|
683
746
|
# whatever ``options.num_ctx`` the operator has declared. Request
|
|
684
747
|
# fields win over extra_body, matching the adapter's merge order.
|
|
685
748
|
body: dict[str, Any] = dict(provider.extra_body)
|
|
749
|
+
# v1.8.2: thinking models burn output tokens on a hidden ``reasoning``
|
|
750
|
+
# trace before emitting any ``content``. The pre-v1.8.2 default of 32
|
|
751
|
+
# was tight for any preamble at all; on Gemma 4 26B it caused
|
|
752
|
+
# ``finish_reason='length'`` with content="" before the canary could
|
|
753
|
+
# surface, producing a false-positive NEEDS_TUNING.
|
|
754
|
+
max_tokens = (
|
|
755
|
+
_NUM_CTX_PROBE_MAX_TOKENS_THINKING
|
|
756
|
+
if _is_reasoning_model(provider, resolved)
|
|
757
|
+
else _NUM_CTX_PROBE_MAX_TOKENS_DEFAULT
|
|
758
|
+
)
|
|
686
759
|
body.update(
|
|
687
760
|
{
|
|
688
761
|
"model": provider.model,
|
|
689
762
|
"messages": [{"role": "user", "content": user_prompt}],
|
|
690
|
-
"max_tokens": 32,
|
|
763
|
+
"max_tokens": max_tokens,
|
|
691
764
|
"temperature": 0,
|
|
692
765
|
}
|
|
693
766
|
)
|
|
@@ -799,7 +872,9 @@ async def _probe_num_ctx(provider: ProviderConfig) -> ProbeResult:
|
|
|
799
872
|
)
|
|
800
873
|
|
|
801
874
|
|
|
802
|
-
async def _probe_streaming(provider: ProviderConfig) -> ProbeResult:
|
|
875
|
+
async def _probe_streaming(
|
|
876
|
+
provider: ProviderConfig, resolved: ResolvedCapabilities
|
|
877
|
+
) -> ProbeResult:
|
|
803
878
|
"""v1.0-C Probe — streaming completion path integrity.
|
|
804
879
|
|
|
805
880
|
Addresses plan.md §9.4 symptom #1 from the **output** side. The v1.0-B
|
|
@@ -868,11 +943,18 @@ async def _probe_streaming(provider: ProviderConfig) -> ProbeResult:
|
|
|
868
943
|
# probing. Top-level probe fields win on collision, matching adapter
|
|
869
944
|
# merge order.
|
|
870
945
|
body: dict[str, Any] = dict(provider.extra_body)
|
|
946
|
+
# v1.8.2: same thinking-model rationale as num_ctx probe — give
|
|
947
|
+
# reasoning a budget so the visible content has a chance to surface.
|
|
948
|
+
max_tokens = (
|
|
949
|
+
_STREAMING_PROBE_MAX_TOKENS_THINKING
|
|
950
|
+
if _is_reasoning_model(provider, resolved)
|
|
951
|
+
else _STREAMING_PROBE_MAX_TOKENS_DEFAULT
|
|
952
|
+
)
|
|
871
953
|
body.update(
|
|
872
954
|
{
|
|
873
955
|
"model": provider.model,
|
|
874
956
|
"messages": [{"role": "user", "content": _STREAMING_PROBE_USER_PROMPT}],
|
|
875
|
-
"max_tokens": 128,
|
|
957
|
+
"max_tokens": max_tokens,
|
|
876
958
|
"temperature": 0,
|
|
877
959
|
"stream": True,
|
|
878
960
|
}
|
|
@@ -1506,11 +1588,11 @@ async def check_model(
|
|
|
1506
1588
|
# declaration probes (tool_calls / thinking / reasoning-leak) should
|
|
1507
1589
|
# dominate the report — streaming is the output-side sibling of
|
|
1508
1590
|
# num_ctx and its NEEDS_TUNING verdict is orthogonal to the others.
|
|
1509
|
-
report.results.append(await _probe_num_ctx(provider))
|
|
1591
|
+
report.results.append(await _probe_num_ctx(provider, resolved))
|
|
1510
1592
|
report.results.append(await _probe_tool_calls(provider, resolved))
|
|
1511
1593
|
report.results.append(await _probe_thinking(provider, resolved))
|
|
1512
1594
|
report.results.append(await _probe_reasoning_leak(provider, resolved))
|
|
1513
|
-
report.results.append(await _probe_streaming(provider))
|
|
1595
|
+
report.results.append(await _probe_streaming(provider, resolved))
|
|
1514
1596
|
return report
|
|
1515
1597
|
|
|
1516
1598
|
|
|
@@ -282,24 +282,26 @@ profiles:
|
|
|
282
282
|
|
|
283
283
|
> **モデル別 tool-call 挙動の深掘り**: Llama-3.3-70B 系の「自然文を tool 呼び出しに変換しがち」性質は、agentic tuning の RLHF signal とシステムプロンプトの相性に起因します。各モデルの傾向と回避策は Unsloth の [Tool calling guide for local LLMs (日本語)](https://unsloth.ai/docs/jp/ji-ben/tool-calling-guide-for-local-llms) が読みやすく、CodeRouter v1.8.0 で導入した `claude_code_suitability` 判定の背景理解にも役立ちます。
|
|
284
284
|
|
|
285
|
-
### 4-2. ローカル Ollama 経由で踏みやすい既知問題 (v1.8.1
|
|
285
|
+
### 4-2. ローカル Ollama 経由で踏みやすい既知問題 (v1.8.1 追記、v1.8.2 改訂)
|
|
286
286
|
|
|
287
|
-
2026-04-26 の実機検証 (M3 Max
|
|
287
|
+
2026-04-26 の実機検証 (M3 Max 64GB / Ollama 0.21.2 / CodeRouter v1.8.0 → v1.8.2) で、**note 記事や HF で評価が高いモデルでも Ollama 経由では動かないケース**が判明したのでまとめます。
|
|
288
|
+
|
|
289
|
+
> **v1.8.2 重要更新**: 当初 v1.8.1 で「Qwen3.6 / Gemma 4 ともに num_ctx silent cap」「streaming 0 chars 打ち切り」と判定していた問題は、深掘りの結果 **doctor の `num_ctx` / `streaming` probe が thinking モデルの reasoning トークン消費分を見ていない `max_tokens=32` / `128` バジェットで偽陽性 NEEDS_TUNING を出していた** ことが判明。v1.8.2 で probe バジェットを reasoning モデル時に 1024 まで拡大、registry で `gemma4:*` / `qwen3.6:*` に `thinking: true` 宣言を追加。**Gemma 4 26B は実機で完全動作確定** (`/v1/messages` Anthropic 互換で "Hello." を 2 秒応答)、**Qwen3.6 系の `tool_calls [NEEDS TUNING]` だけが真の課題として残る** (thinking 起因とは別の Ollama tool 仕様未成熟)。
|
|
288
290
|
|
|
289
291
|
#### 4-2-A. **Qwen3.6:27b / 35b** が Claude Code で実用厳しい
|
|
290
292
|
|
|
291
|
-
|
|
293
|
+
v1.8.2 の偽陽性除去後、`coderouter doctor --check-model ollama-qwen3-6-27b` の結果:
|
|
292
294
|
|
|
293
295
|
| Probe | 結果 | 症状 |
|
|
294
296
|
|---|---|---|
|
|
295
297
|
| auth+basic-chat | OK | 短い chat なら動く |
|
|
296
|
-
|
|
|
297
|
-
| **tool_calls** | **NEEDS_TUNING** | native tool_calls / 修復可能 JSON のいずれも返さず |
|
|
298
|
-
|
|
|
298
|
+
| num_ctx | OK or NEEDS_TUNING (model依存) | thinking バジェット 1024 で偽陽性は解消 |
|
|
299
|
+
| **tool_calls** | **NEEDS_TUNING** ← 残る真の課題 | native tool_calls / 修復可能 JSON のいずれも返さず |
|
|
300
|
+
| streaming | OK or NEEDS_TUNING | thinking バジェット 1024 で偽陽性は解消 |
|
|
299
301
|
|
|
300
|
-
`/no_think` を `append_system_prompt`
|
|
302
|
+
`/no_think` を `append_system_prompt` に入れても tool_calls は改善せず。Ollama 0.21.2 / llama.cpp 側の Qwen3.6 family の **tool 仕様** がまだ完全でない (chat template / tool スキーマ整合) 可能性が高い。
|
|
301
303
|
|
|
302
|
-
**回避**: `claude-code-nim` profile の primary に Qwen3.6 を置かず、**Gemma 4 26B または Qwen2.5-Coder 14b を上位に**。bundled `model-capabilities.yaml` も v1.8.1 で `qwen3.6:*` の `claude_code_suitability: ok` を撤回 (declaration 過信の例)。
|
|
304
|
+
**回避**: `claude-code-nim` profile の primary に Qwen3.6 を置かず、**Gemma 4 26B または Qwen2.5-Coder 14b を上位に**。bundled `model-capabilities.yaml` も v1.8.1 で `qwen3.6:*` の `claude_code_suitability: ok` を撤回 (declaration 過信の例)、v1.8.2 で `thinking: true` 追加 (doctor probe 偽陽性除去のため)。
|
|
303
305
|
|
|
304
306
|
#### 4-2-B. **Qwen3.5 系の HF 蒸留モデル** (Qwopus3.5 等) は llama.cpp 未対応
|
|
305
307
|
|
|
@@ -323,9 +325,13 @@ llama_model_load: error loading model: error loading model architecture:
|
|
|
323
325
|
|
|
324
326
|
> **教訓**: HF で「Qwen3.5 + Opus 蒸留」のような新しい組み合わせは note / r/LocalLLaMA で評判が立っていても、**Ollama 経由ですぐ使えるとは限らない**。`ollama pull` → `ollama run` で 500 が出たら、まず Ollama server log で `unknown model architecture` を確認。出たら今は諦めて他のモデルに行くのが時間効率的に正解。
|
|
325
327
|
|
|
326
|
-
#### 4-2-C. **Gemma 4 26B**
|
|
328
|
+
#### 4-2-C. **Gemma 4 26B** は実機で完全動作 (v1.8.2 で確定)
|
|
329
|
+
|
|
330
|
+
`coderouter doctor --check-model ollama-gemma4-26b` の `tool_calls` probe が無加工で `[OK]`。v1.8.2 で thinking モデル対応 probe バジェット (1024) を入れた後は **`num_ctx` / `streaming` も `[OK]`** で 6 probe 全クリア。`/v1/messages` Anthropic 互換経由で "Hello." を 2 秒応答 (M3 Max 64GB)、`tool_calls native OK`、`reasoning strip` 動作。
|
|
331
|
+
|
|
332
|
+
ただし **interactive UX は若干重い** — Gemma 4 は thinking モデルなので `reasoning` フィールドにも応答時間を使う + 26B サイズ + Claude Code の agent loop (1 プロンプトで 3〜6 round-trip) で総応答時間が 30〜90 秒 / プロンプトになる。daily driver には `qwen2.5-coder:14b` のほうが速い。**Gemma 4 は tool_calls native + 高品質が要るときの選択肢**。
|
|
327
333
|
|
|
328
|
-
|
|
334
|
+
note 記事の「Gemma 4 が日常の王者」評価は **Claude Code agentic 用途でも裏付けられた**形。v1.8.1 で `coding` profile primary を Gemma 4 / Qwen-Coder 14b へ調整、v1.8.2 で registry に `thinking: true` を宣言。
|
|
329
335
|
|
|
330
336
|
#### 4-2-D. ベスト実践 — 「枯れたモデル + 観測ツール」
|
|
331
337
|
|
|
@@ -337,6 +343,20 @@ llama_model_load: error loading model: error loading model architecture:
|
|
|
337
343
|
4. **新興モデルは慎重に**: HF で見つけた新モデルは Ollama 0.20+ でも未対応のことあり、`ollama run` → server log で確認
|
|
338
344
|
5. **fallback chain で守る**: ローカル primary が落ちても NIM / OpenRouter free に流れるように chain を厚く
|
|
339
345
|
|
|
346
|
+
#### 4-2-E. doctor probe 自体の限界 — thinking モデル対応 (v1.8.2)
|
|
347
|
+
|
|
348
|
+
v1.8.1 までの doctor `num_ctx` / `streaming` probe は、それぞれ **`max_tokens=32` / `128`** で出力を要求していた。これは canary token (~5 tokens) や "1..30" (~40 tokens) を返すには十分だったが、**thinking モデル** (Gemma 4、Qwen3 系、gpt-oss、deepseek-r1) は `reasoning` フィールドに思考トークンを吐く設計のため、可視 `content` が出る前に max_tokens に到達して `finish_reason='length'` で打ち切られ、**偽陽性 NEEDS_TUNING** を出していた。
|
|
349
|
+
|
|
350
|
+
v1.8.2 で:
|
|
351
|
+
|
|
352
|
+
- `_NUM_CTX_PROBE_MAX_TOKENS_DEFAULT = 256` (旧 32)、`_NUM_CTX_PROBE_MAX_TOKENS_THINKING = 1024`
|
|
353
|
+
- `_STREAMING_PROBE_MAX_TOKENS_DEFAULT = 512` (旧 128)、`_STREAMING_PROBE_MAX_TOKENS_THINKING = 1024`
|
|
354
|
+
- `provider.capabilities.thinking` / `provider.capabilities.reasoning_passthrough` / registry の `thinking` / `reasoning_passthrough` のいずれかが true なら thinking バジェットを採用
|
|
355
|
+
|
|
356
|
+
つまり **provider 宣言不要**: bundled `model-capabilities.yaml` で `gemma4:*` / `qwen3.6:*` などに `thinking: true` を宣言してあれば、user の providers.yaml には何も書かなくても doctor が自動的に正しいバジェットを使う。
|
|
357
|
+
|
|
358
|
+
**メタ教訓**: diagnostic ツール自身も diagnostic され続ける必要がある (plan.md §5.4 「実機 evidence first」原則の補強)。
|
|
359
|
+
|
|
340
360
|
### 4-3. `UserPromptSubmit hook error` が出る (第三者 Claude Code プラグイン)
|
|
341
361
|
|
|
342
362
|
```
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
# in plan.md §11.B; once granted, this name will become an alias and
|
|
12
12
|
# `coderouter` will become the canonical distribution name.
|
|
13
13
|
name = "coderouter-cli"
|
|
14
|
-
version = "1.8.1"
|
|
14
|
+
version = "1.8.2"
|
|
15
15
|
description = "Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible."
|
|
16
16
|
readme = "README.md"
|
|
17
17
|
requires-python = ">=3.12"
|
|
@@ -770,7 +770,169 @@ async def test_num_ctx_request_body_merges_extra_body_options(
|
|
|
770
770
|
# And the probe's own fields must be present and dominate over any
|
|
771
771
|
# extra_body collisions on top-level keys.
|
|
772
772
|
assert num_ctx_body["model"] == provider.model
|
|
773
|
-
|
|
773
|
+
# v1.8.2: default probe budget bumped 32 → 256. Thinking-flagged
|
|
774
|
+
# models bump further to 1024 (covered by a dedicated test below).
|
|
775
|
+
# The provider here has no `capabilities.thinking`, so the default
|
|
776
|
+
# baseline applies.
|
|
777
|
+
assert num_ctx_body["max_tokens"] == 256
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
@pytest.mark.asyncio
|
|
781
|
+
async def test_num_ctx_max_tokens_bumped_for_thinking_provider_declaration(
|
|
782
|
+
httpx_mock: HTTPXMock,
|
|
783
|
+
) -> None:
|
|
784
|
+
"""v1.8.2: provider declared ``capabilities.thinking: true`` →
|
|
785
|
+
num_ctx probe budget is the thinking variant (1024) instead of the
|
|
786
|
+
256 baseline.
|
|
787
|
+
|
|
788
|
+
Thinking models (Gemma 4 26B, Qwen3-with-/think, gpt-oss, deepseek-r1)
|
|
789
|
+
burn output tokens on a hidden ``reasoning`` trace before any visible
|
|
790
|
+
``content`` is emitted. The pre-v1.8.2 default of 32 caused
|
|
791
|
+
``finish_reason='length'`` with empty content, producing a
|
|
792
|
+
false-positive NEEDS_TUNING. Bumping the budget to 1024 gives the
|
|
793
|
+
reasoning trace + canary echo room to surface.
|
|
794
|
+
"""
|
|
795
|
+
thinking_caps = Capabilities(thinking=True, tools=True)
|
|
796
|
+
provider = _oa_provider(
|
|
797
|
+
name="ollama-gemma4-26b",
|
|
798
|
+
base_url="http://localhost:11434/v1",
|
|
799
|
+
model="gemma4:26b",
|
|
800
|
+
caps=thinking_caps,
|
|
801
|
+
extra_body={"options": {"num_ctx": 32768}},
|
|
802
|
+
)
|
|
803
|
+
captured: list[httpx.Request] = []
|
|
804
|
+
|
|
805
|
+
def _capture(request: httpx.Request) -> httpx.Response:
|
|
806
|
+
captured.append(request)
|
|
807
|
+
body = json.loads(request.content.decode("utf-8"))
|
|
808
|
+
if body.get("stream") is True:
|
|
809
|
+
return httpx.Response(
|
|
810
|
+
200,
|
|
811
|
+
content=_sse_stream_count_body(),
|
|
812
|
+
headers={"content-type": "text/event-stream"},
|
|
813
|
+
)
|
|
814
|
+
return httpx.Response(
|
|
815
|
+
200, json=_openai_ok_response(content=_NUM_CTX_PROBE_CANARY)
|
|
816
|
+
)
|
|
817
|
+
|
|
818
|
+
httpx_mock.add_callback(
|
|
819
|
+
_capture,
|
|
820
|
+
url="http://localhost:11434/v1/chat/completions",
|
|
821
|
+
method="POST",
|
|
822
|
+
is_reusable=True,
|
|
823
|
+
)
|
|
824
|
+
await check_model(
|
|
825
|
+
_config_for([provider]), provider.name, registry=_empty_registry()
|
|
826
|
+
)
|
|
827
|
+
# auth → captured[0], num_ctx → captured[1]
|
|
828
|
+
num_ctx_body = json.loads(captured[1].content.decode("utf-8"))
|
|
829
|
+
assert num_ctx_body["max_tokens"] == 1024
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
@pytest.mark.asyncio
|
|
833
|
+
async def test_num_ctx_max_tokens_bumped_when_registry_says_thinking(
|
|
834
|
+
httpx_mock: HTTPXMock,
|
|
835
|
+
) -> None:
|
|
836
|
+
"""v1.8.2: provider declares no thinking but registry says thinking=true
|
|
837
|
+
for the (kind, model) → still bump to 1024.
|
|
838
|
+
|
|
839
|
+
This mirrors the production path: bundled model-capabilities.yaml
|
|
840
|
+
declares ``thinking: true`` for ``gemma4:*`` / ``qwen3.6:*`` so
|
|
841
|
+
operators don't have to repeat the flag in every providers.yaml.
|
|
842
|
+
"""
|
|
843
|
+
provider = _oa_provider(
|
|
844
|
+
name="ollama-gemma4-26b",
|
|
845
|
+
base_url="http://localhost:11434/v1",
|
|
846
|
+
model="gemma4:26b",
|
|
847
|
+
# capabilities.thinking left at the default (False)
|
|
848
|
+
extra_body={"options": {"num_ctx": 32768}},
|
|
849
|
+
)
|
|
850
|
+
registry = CapabilityRegistry(
|
|
851
|
+
[
|
|
852
|
+
CapabilityRule(
|
|
853
|
+
match="gemma4:*",
|
|
854
|
+
kind="openai_compat",
|
|
855
|
+
capabilities=RegistryCapabilities(thinking=True),
|
|
856
|
+
)
|
|
857
|
+
]
|
|
858
|
+
)
|
|
859
|
+
captured: list[httpx.Request] = []
|
|
860
|
+
|
|
861
|
+
def _capture(request: httpx.Request) -> httpx.Response:
|
|
862
|
+
captured.append(request)
|
|
863
|
+
body = json.loads(request.content.decode("utf-8"))
|
|
864
|
+
if body.get("stream") is True:
|
|
865
|
+
return httpx.Response(
|
|
866
|
+
200,
|
|
867
|
+
content=_sse_stream_count_body(),
|
|
868
|
+
headers={"content-type": "text/event-stream"},
|
|
869
|
+
)
|
|
870
|
+
return httpx.Response(
|
|
871
|
+
200, json=_openai_ok_response(content=_NUM_CTX_PROBE_CANARY)
|
|
872
|
+
)
|
|
873
|
+
|
|
874
|
+
httpx_mock.add_callback(
|
|
875
|
+
_capture,
|
|
876
|
+
url="http://localhost:11434/v1/chat/completions",
|
|
877
|
+
method="POST",
|
|
878
|
+
is_reusable=True,
|
|
879
|
+
)
|
|
880
|
+
await check_model(_config_for([provider]), provider.name, registry=registry)
|
|
881
|
+
num_ctx_body = json.loads(captured[1].content.decode("utf-8"))
|
|
882
|
+
assert num_ctx_body["max_tokens"] == 1024
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
@pytest.mark.asyncio
|
|
886
|
+
async def test_streaming_max_tokens_bumped_for_thinking_provider(
|
|
887
|
+
httpx_mock: HTTPXMock,
|
|
888
|
+
) -> None:
|
|
889
|
+
"""v1.8.2: streaming probe also uses the thinking-aware budget — same
|
|
890
|
+
rationale as num_ctx (reasoning trace burns through the small default).
|
|
891
|
+
|
|
892
|
+
Pre-v1.8.2 streaming used ``max_tokens=128`` and Gemma 4 reported
|
|
893
|
+
``finish_reason='length'`` after 0 chars of content. Bumping to 1024
|
|
894
|
+
lets the reasoning prefix + the "1..30" answer fit.
|
|
895
|
+
"""
|
|
896
|
+
thinking_caps = Capabilities(thinking=True, tools=True)
|
|
897
|
+
provider = _oa_provider(
|
|
898
|
+
name="ollama-gemma4-26b",
|
|
899
|
+
base_url="http://localhost:11434/v1",
|
|
900
|
+
model="gemma4:26b",
|
|
901
|
+
caps=thinking_caps,
|
|
902
|
+
extra_body={"options": {"num_ctx": 32768}},
|
|
903
|
+
)
|
|
904
|
+
captured: list[httpx.Request] = []
|
|
905
|
+
|
|
906
|
+
def _route(request: httpx.Request) -> httpx.Response:
|
|
907
|
+
captured.append(request)
|
|
908
|
+
body = json.loads(request.content.decode("utf-8"))
|
|
909
|
+
if body.get("stream") is True:
|
|
910
|
+
return httpx.Response(
|
|
911
|
+
200,
|
|
912
|
+
content=_sse_stream_count_body(),
|
|
913
|
+
headers={"content-type": "text/event-stream"},
|
|
914
|
+
)
|
|
915
|
+
return httpx.Response(
|
|
916
|
+
200, json=_openai_ok_response(content=_NUM_CTX_PROBE_CANARY)
|
|
917
|
+
)
|
|
918
|
+
|
|
919
|
+
httpx_mock.add_callback(
|
|
920
|
+
_route,
|
|
921
|
+
url="http://localhost:11434/v1/chat/completions",
|
|
922
|
+
method="POST",
|
|
923
|
+
is_reusable=True,
|
|
924
|
+
)
|
|
925
|
+
await check_model(
|
|
926
|
+
_config_for([provider]), provider.name, registry=_empty_registry()
|
|
927
|
+
)
|
|
928
|
+
# The streaming probe runs last; identify it by ``stream: true``.
|
|
929
|
+
stream_bodies = [
|
|
930
|
+
json.loads(req.content.decode("utf-8"))
|
|
931
|
+
for req in captured
|
|
932
|
+
if json.loads(req.content.decode("utf-8")).get("stream") is True
|
|
933
|
+
]
|
|
934
|
+
assert len(stream_bodies) == 1
|
|
935
|
+
assert stream_bodies[0]["max_tokens"] == 1024
|
|
774
936
|
|
|
775
937
|
|
|
776
938
|
@pytest.mark.asyncio
|
|
@@ -1767,6 +1929,11 @@ async def test_streaming_request_body_carries_stream_true_and_merges_extra_body(
|
|
|
1767
1929
|
}
|
|
1768
1930
|
# Top-level probe fields must win over any extra_body collision.
|
|
1769
1931
|
assert streaming_body["model"] == provider.model
|
|
1932
|
+
# v1.8.2: streaming probe baseline budget bumped 128 → 512 to absorb
|
|
1933
|
+
# short stylistic preambles. Thinking models bump further to 1024
|
|
1934
|
+
# (covered by ``test_streaming_max_tokens_bumped_for_thinking_provider``).
|
|
1935
|
+
# Provider here has no thinking declaration, so baseline applies.
|
|
1936
|
+
assert streaming_body["max_tokens"] == 512
|
|
1770
1937
|
|
|
1771
1938
|
|
|
1772
1939
|
@pytest.mark.asyncio
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|