coderouter-cli 2.5.2__tar.gz → 2.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/.gitignore +3 -0
  2. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/PKG-INFO +5 -1
  3. coderouter_cli-2.5.3/coderouter/gguf_introspect.py +304 -0
  4. coderouter_cli-2.5.3/coderouter/guards/memory_budget.py +249 -0
  5. coderouter_cli-2.5.3/coderouter/hardware.py +264 -0
  6. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/routing/budget.py +1 -1
  7. coderouter_cli-2.5.3/coderouter/token_estimation_accurate.py +136 -0
  8. coderouter_cli-2.5.3/docs/low-memory-integration.md +337 -0
  9. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/pyproject.toml +14 -1
  10. coderouter_cli-2.5.3/tests/test_gguf_introspect.py +139 -0
  11. coderouter_cli-2.5.3/tests/test_hardware.py +78 -0
  12. coderouter_cli-2.5.3/tests/test_memory_budget.py +130 -0
  13. coderouter_cli-2.5.3/tests/test_token_estimation_accurate.py +69 -0
  14. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/CHANGELOG.md +0 -0
  15. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/LICENSE +0 -0
  16. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/README.en.md +0 -0
  17. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/README.md +0 -0
  18. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/__init__.py +0 -0
  19. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/__main__.py +0 -0
  20. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/adapters/__init__.py +0 -0
  21. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/adapters/anthropic_native.py +0 -0
  22. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/adapters/base.py +0 -0
  23. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/adapters/openai_compat.py +0 -0
  24. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/adapters/registry.py +0 -0
  25. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/cli.py +0 -0
  26. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/cli_stats.py +0 -0
  27. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/config/__init__.py +0 -0
  28. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/config/capability_registry.py +0 -0
  29. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/config/env_file.py +0 -0
  30. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/config/loader.py +0 -0
  31. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/config/schemas.py +0 -0
  32. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/cost.py +0 -0
  33. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/data/__init__.py +0 -0
  34. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/data/model-capabilities.yaml +0 -0
  35. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/doctor.py +0 -0
  36. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/doctor_apply.py +0 -0
  37. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/env_security.py +0 -0
  38. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/errors.py +0 -0
  39. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/__init__.py +0 -0
  40. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/_fingerprint.py +0 -0
  41. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/backend_health.py +0 -0
  42. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/context_budget.py +0 -0
  43. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/continuous_probe.py +0 -0
  44. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/drift_actions.py +0 -0
  45. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/drift_detection.py +0 -0
  46. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/memory_pressure.py +0 -0
  47. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/self_healing.py +0 -0
  48. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/guards/tool_loop.py +0 -0
  49. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/__init__.py +0 -0
  50. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/anthropic_routes.py +0 -0
  51. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/app.py +0 -0
  52. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/dashboard_routes.py +0 -0
  53. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/launcher_routes.py +0 -0
  54. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/metrics_routes.py +0 -0
  55. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/ingress/openai_routes.py +0 -0
  56. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/logging.py +0 -0
  57. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/metrics/__init__.py +0 -0
  58. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/metrics/collector.py +0 -0
  59. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/metrics/prometheus.py +0 -0
  60. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/output_filters.py +0 -0
  61. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/plugins/__init__.py +0 -0
  62. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/plugins/base.py +0 -0
  63. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/plugins/loader.py +0 -0
  64. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/plugins/registry.py +0 -0
  65. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/routing/__init__.py +0 -0
  66. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/routing/adaptive.py +0 -0
  67. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/routing/auto_router.py +0 -0
  68. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/routing/capability.py +0 -0
  69. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/routing/fallback.py +0 -0
  70. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/state/__init__.py +0 -0
  71. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/state/audit_log.py +0 -0
  72. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/state/replay.py +0 -0
  73. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/state/request_log.py +0 -0
  74. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/state/store.py +0 -0
  75. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/state/suggest_rules.py +0 -0
  76. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/token_estimation.py +0 -0
  77. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/translation/__init__.py +0 -0
  78. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/translation/anthropic.py +0 -0
  79. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/translation/convert.py +0 -0
  80. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/coderouter/translation/tool_repair.py +0 -0
  81. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/README.md +0 -0
  82. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/assets/dashboard-demo.png +0 -0
  83. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/gguf_dl.md +0 -0
  84. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/hf-ollama-models.md +0 -0
  85. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/install-backends.en.md +0 -0
  86. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/install-backends.md +0 -0
  87. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/launcher-quickstart.md +0 -0
  88. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/launcher.md +0 -0
  89. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/llamacpp-direct.en.md +0 -0
  90. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/llamacpp-direct.md +0 -0
  91. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/lmstudio-direct.en.md +0 -0
  92. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/lmstudio-direct.md +0 -0
  93. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/backends/verify-ollama-0.23.1.md +0 -0
  94. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/concepts/architecture.md +0 -0
  95. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/concepts/context-budget.md +0 -0
  96. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/concepts/continuous-probing.md +0 -0
  97. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/concepts/drift-detection.md +0 -0
  98. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/concepts/partial-stitch.md +0 -0
  99. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/designs/v1.5-dashboard-mockup.html +0 -0
  100. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/designs/v1.6-auto-router-verification.md +0 -0
  101. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/designs/v1.6-auto-router.md +0 -0
  102. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/free-tier-guide.en.md +0 -0
  103. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/free-tier-guide.md +0 -0
  104. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/security.en.md +0 -0
  105. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/security.md +0 -0
  106. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/troubleshooting.en.md +0 -0
  107. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/troubleshooting.md +0 -0
  108. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/usage-guide.en.md +0 -0
  109. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/guides/usage-guide.md +0 -0
  110. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/openrouter-roster/CHANGES.md +0 -0
  111. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/openrouter-roster/README.md +0 -0
  112. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/openrouter-roster/latest.json +0 -0
  113. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v0.4.md +0 -0
  114. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v0.5-verify.md +0 -0
  115. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v0.5.md +0 -0
  116. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v0.6.md +0 -0
  117. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v0.7.md +0 -0
  118. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v1.0-verify.md +0 -0
  119. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/retrospectives/v1.0.md +0 -0
  120. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/start/quickstart.en.md +0 -0
  121. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/start/quickstart.md +0 -0
  122. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/start/when-do-i-need-coderouter.en.md +0 -0
  123. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/docs/start/when-do-i-need-coderouter.md +0 -0
  124. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/.env.example +0 -0
  125. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.auto-custom.yaml +0 -0
  126. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.auto.yaml +0 -0
  127. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.llama-cpp-vllm.yaml +0 -0
  128. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.note-2026.yaml +0 -0
  129. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.nvidia-nim.yaml +0 -0
  130. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.raspberrypi.yaml +0 -0
  131. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.v2-context-budget.yaml +0 -0
  132. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/examples/providers.yaml +0 -0
  133. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/demo_traffic.sh +0 -0
  134. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/openrouter_roster_diff.py +0 -0
  135. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/smoke_v2_2.sh +0 -0
  136. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/verify-providers.yaml +0 -0
  137. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/verify_ollama_0_23.py +0 -0
  138. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/verify_v0_5.sh +0 -0
  139. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/scripts/verify_v1_0.sh +0 -0
  140. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/__init__.py +0 -0
  141. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/conftest.py +0 -0
  142. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_adapter_anthropic.py +0 -0
  143. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_audit_log.py +0 -0
  144. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_auto_router.py +0 -0
  145. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_backend_health.py +0 -0
  146. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_budget.py +0 -0
  147. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_capability.py +0 -0
  148. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_capability_degraded_payload.py +0 -0
  149. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_capability_registry.py +0 -0
  150. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_capability_registry_cache_control.py +0 -0
  151. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_claude_code_suitability.py +0 -0
  152. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_cli.py +0 -0
  153. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_cli_stats.py +0 -0
  154. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_config.py +0 -0
  155. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_context_budget.py +0 -0
  156. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_continuous_probe.py +0 -0
  157. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_dashboard_endpoint.py +0 -0
  158. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_doctor.py +0 -0
  159. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_doctor_apply.py +0 -0
  160. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_doctor_cache_probe.py +0 -0
  161. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_drift_actions.py +0 -0
  162. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_drift_detection.py +0 -0
  163. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_drift_detection_integration.py +0 -0
  164. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_env_file.py +0 -0
  165. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_env_security.py +0 -0
  166. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_errors.py +0 -0
  167. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_examples_yaml.py +0 -0
  168. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback.py +0 -0
  169. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback_anthropic.py +0 -0
  170. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback_cache_control.py +0 -0
  171. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback_cache_observed.py +0 -0
  172. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback_misconfig_warn.py +0 -0
  173. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback_paid_gate.py +0 -0
  174. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_fallback_thinking.py +0 -0
  175. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_guards_tool_loop.py +0 -0
  176. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_ingress_anthropic.py +0 -0
  177. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_ingress_profile.py +0 -0
  178. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_memory_pressure.py +0 -0
  179. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_cache.py +0 -0
  180. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_collector.py +0 -0
  181. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_cost.py +0 -0
  182. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_endpoint.py +0 -0
  183. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_jsonl.py +0 -0
  184. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_prometheus.py +0 -0
  185. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_metrics_prometheus_cache.py +0 -0
  186. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_openai_compat.py +0 -0
  187. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_openrouter_roster_diff.py +0 -0
  188. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_output_filters.py +0 -0
  189. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_output_filters_adapters.py +0 -0
  190. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_partial_stitch.py +0 -0
  191. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_plugins_integration.py +0 -0
  192. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_plugins_loader.py +0 -0
  193. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_plugins_registry.py +0 -0
  194. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_reasoning_strip.py +0 -0
  195. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_request_log.py +0 -0
  196. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_routing_adaptive.py +0 -0
  197. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_self_healing.py +0 -0
  198. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_setup_sh.py +0 -0
  199. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_state_store.py +0 -0
  200. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_token_estimation.py +0 -0
  201. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_tool_repair.py +0 -0
  202. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_translation_anthropic.py +0 -0
  203. {coderouter_cli-2.5.2 → coderouter_cli-2.5.3}/tests/test_translation_reverse.py +0 -0
@@ -89,3 +89,6 @@ docs/articles/
89
89
  # of the repo because the Vault path includes personal Vault names.
90
90
  .env.publish.tpl
91
91
  .env.tpl
92
+
93
+ # FUSE / virtiofs artifacts
94
+ .fuse_hidden*
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderouter-cli
3
- Version: 2.5.2
3
+ Version: 2.5.3
4
4
  Summary: Local-first, free-first, fallback-built-in LLM router. Claude Code / OpenAI compatible.
5
5
  Project-URL: Homepage, https://github.com/zephel01/CodeRouter
6
6
  Project-URL: Repository, https://github.com/zephel01/CodeRouter
@@ -27,6 +27,8 @@ Requires-Dist: httpx>=0.27.0
27
27
  Requires-Dist: pydantic>=2.9.0
28
28
  Requires-Dist: pyyaml>=6.0.2
29
29
  Requires-Dist: uvicorn[standard]>=0.32.0
30
+ Provides-Extra: accuracy
31
+ Requires-Dist: tokenizers>=0.20; extra == 'accuracy'
30
32
  Provides-Extra: dev
31
33
  Requires-Dist: mypy>=1.13.0; extra == 'dev'
32
34
  Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
@@ -37,6 +39,8 @@ Requires-Dist: ruff>=0.7.0; extra == 'dev'
37
39
  Requires-Dist: types-pyyaml>=6.0.12; extra == 'dev'
38
40
  Provides-Extra: doctor
39
41
  Requires-Dist: ruamel-yaml>=0.18.6; extra == 'doctor'
42
+ Provides-Extra: repair
43
+ Requires-Dist: json-repair>=0.30; extra == 'repair'
40
44
  Description-Content-Type: text/markdown
41
45
 
42
46
  <h1 align="center">CodeRouter</h1>
@@ -0,0 +1,304 @@
1
+ """Minimal, dependency-free GGUF header introspection (low-memory track).
2
+
3
+ Why self-written
4
+ ================
5
+
6
+ To right-size ``num_ctx`` *before* dispatch we need a model's layer
7
+ count and embedding width so the KV-cache footprint can be estimated.
8
+ That data lives in the GGUF metadata header. Rather than add the
9
+ official ``gguf`` package (and its ``numpy`` transitive dep) we read
10
+ only the handful of header fields we need with the standard library —
11
+ preserving the 5-deps invariant.
12
+
13
+ The GGUF binary layout we parse (little-endian):
14
+
15
+ magic : 4 bytes == b"GGUF"
16
+ version : uint32 (2 or 3 supported)
17
+ tensor_cnt : uint64 (ignored — we never read tensor data)
18
+ kv_count : uint64 (number of metadata key/value pairs)
19
+ kv_pairs : kv_count repetitions of:
20
+ key : gguf-string (uint64 length + UTF-8 bytes)
21
+ value_type : uint32 (see _GGUF_TYPE_*)
22
+ value : type-dependent
23
+
24
+ We walk the KV pairs, capturing only the keys we care about, and skip
25
+ the rest (including arbitrarily nested arrays) without materialising
26
+ them.
27
+
28
+ Security
29
+ ========
30
+
31
+ The parser treats the file as **untrusted input**:
32
+
33
+ * Every string length and array element count is checked against
34
+ :data:`_MAX_STR_BYTES` / :data:`_MAX_ARRAY_LEN` so a corrupt or
35
+ hostile header cannot trigger a multi-GB allocation (DoS).
36
+ * Reads past EOF raise :class:`GGUFParseError`, never an unbounded
37
+ loop.
38
+ * No ``mmap``, no tensor payload read, no code execution path — we
39
+ only seek/read a small prefix.
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import struct
45
+ from dataclasses import dataclass
46
+ from pathlib import Path
47
+ from typing import BinaryIO
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Constants / format
51
+ # ---------------------------------------------------------------------------
52
+
53
_GGUF_MAGIC = b"GGUF"

# GGUF metadata value type tags. The tags are consecutive integers
# starting at zero, so they are assigned in one unpacking statement.
(
    _GGUF_TYPE_UINT8,
    _GGUF_TYPE_INT8,
    _GGUF_TYPE_UINT16,
    _GGUF_TYPE_INT16,
    _GGUF_TYPE_UINT32,
    _GGUF_TYPE_INT32,
    _GGUF_TYPE_FLOAT32,
    _GGUF_TYPE_BOOL,
    _GGUF_TYPE_STRING,
    _GGUF_TYPE_ARRAY,
    _GGUF_TYPE_UINT64,
    _GGUF_TYPE_INT64,
    _GGUF_TYPE_FLOAT64,
) = range(13)

# Fixed-width scalar (struct format, size) by type tag. The size is
# derived from the little-endian struct format so the two cannot drift.
_SCALAR: dict[int, tuple[str, int]] = {
    tag: (fmt, struct.calcsize(fmt))
    for tag, fmt in (
        (_GGUF_TYPE_UINT8, "<B"),
        (_GGUF_TYPE_INT8, "<b"),
        (_GGUF_TYPE_UINT16, "<H"),
        (_GGUF_TYPE_INT16, "<h"),
        (_GGUF_TYPE_UINT32, "<I"),
        (_GGUF_TYPE_INT32, "<i"),
        (_GGUF_TYPE_FLOAT32, "<f"),
        (_GGUF_TYPE_BOOL, "<?"),
        (_GGUF_TYPE_UINT64, "<Q"),
        (_GGUF_TYPE_INT64, "<q"),
        (_GGUF_TYPE_FLOAT64, "<d"),
    )
}

# Defensive ceilings against hostile / corrupt headers.
_MAX_STR_BYTES: int = 2**20  # 1 MiB key/value string ceiling
_MAX_ARRAY_LEN: int = 2**24  # element-count ceiling for arrays
_MAX_KV_PAIRS: int = 2**20  # metadata pair ceiling
89
+
90
+ # Human-readable names for the GGUF ``general.file_type`` enum (subset).
91
_FILE_TYPE_NAMES: dict[int, str] = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
    # 4-6 were retired upstream and are intentionally absent.
    7: "Q8_0",
    8: "Q5_0",
    9: "Q5_1",
    10: "Q2_K",
    11: "Q3_K_S",
    12: "Q3_K_M",
    13: "Q3_K_L",
    14: "Q4_K_S",
    15: "Q4_K_M",
    16: "Q5_K_S",
    17: "Q5_K_M",
    18: "Q6_K",
    19: "IQ2_XXS",
    20: "IQ2_XS",
    21: "Q2_K_S",
    22: "IQ3_XS",
    23: "IQ3_XXS",
    24: "IQ1_S",
    25: "IQ4_NL",
    26: "IQ3_S",
    27: "IQ3_M",
    28: "IQ2_S",
    29: "IQ2_M",
    30: "IQ4_XS",
    31: "IQ1_M",
    # Newer llama.cpp file types; without these, BF16 and ternary
    # models were reported with the opaque "typeNN" fallback label.
    32: "BF16",
    # 33-35 were retired upstream and are intentionally absent.
    36: "TQ1_0",
    37: "TQ2_0",
}
122
+
123
+
124
class GGUFParseError(Exception):
    """Signal that the input could not be parsed as a GGUF header."""
126
+
127
+
128
+ @dataclass(frozen=True, slots=True)
129
+ class GGUFInfo:
130
+ """The subset of GGUF metadata needed for memory accounting."""
131
+
132
+ architecture: str | None
133
+ n_layers: int | None
134
+ n_embd: int | None
135
+ n_heads: int | None
136
+ n_kv_heads: int | None
137
+ file_type: int | None
138
+ file_size_bytes: int
139
+
140
+ @property
141
+ def quant_name(self) -> str | None:
142
+ """Human-readable quantization label, or None if unknown."""
143
+ if self.file_type is None:
144
+ return None
145
+ return _FILE_TYPE_NAMES.get(self.file_type, f"type{self.file_type}")
146
+
147
+ @property
148
+ def weights_bytes(self) -> int:
149
+ """Approximate on-disk weight size — the file size is the best
150
+ proxy (GGUF is almost entirely tensor data)."""
151
+ return self.file_size_bytes
152
+
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # Low-level readers
156
+ # ---------------------------------------------------------------------------
157
+
158
+
159
def _read_exact(fh: BinaryIO, n: int) -> bytes:
    """Read exactly ``n`` bytes from ``fh``.

    Raises :class:`GGUFParseError` when the stream yields a different
    number of bytes than requested (i.e. it ended early).
    """
    chunk = fh.read(n)
    got = len(chunk)
    if got == n:
        return chunk
    raise GGUFParseError(f"unexpected EOF (wanted {n} bytes, got {got})")
164
+
165
+
166
def _read_scalar(fh: BinaryIO, type_tag: int) -> object:
    """Read and decode one fixed-width scalar of the given type tag.

    Raises :class:`GGUFParseError` when ``type_tag`` has no entry in the
    scalar format table.
    """
    if type_tag not in _SCALAR:
        raise GGUFParseError(f"unknown scalar type tag {type_tag}")
    fmt, size = _SCALAR[type_tag]
    (decoded,) = struct.unpack(fmt, _read_exact(fh, size))
    return decoded
172
+
173
+
174
def _read_u32(fh: BinaryIO) -> int:
    """Read a little-endian unsigned 32-bit integer from ``fh``."""
    return int.from_bytes(_read_exact(fh, 4), "little")
176
+
177
+
178
def _read_u64(fh: BinaryIO) -> int:
    """Read a little-endian unsigned 64-bit integer from ``fh``."""
    return int.from_bytes(_read_exact(fh, 8), "little")
180
+
181
+
182
def _read_gguf_string(fh: BinaryIO) -> str:
    """Read a GGUF string: a uint64 byte length followed by UTF-8 bytes.

    Lengths above ``_MAX_STR_BYTES`` raise :class:`GGUFParseError`
    before any payload is read. Invalid UTF-8 is decoded with
    replacement characters rather than raising.
    """
    n_bytes = _read_u64(fh)
    if n_bytes <= _MAX_STR_BYTES:
        payload = _read_exact(fh, n_bytes)
        return payload.decode("utf-8", errors="replace")
    raise GGUFParseError(f"string length {n_bytes} exceeds cap")
187
+
188
+
189
# Deepest array-of-array nesting accepted while skipping. Each level is
# one Python stack frame, so an unbounded depth would let a crafted
# header raise RecursionError instead of GGUFParseError.
_MAX_ARRAY_NESTING: int = 16


def _skip_value(fh: BinaryIO, type_tag: int, _depth: int = 0) -> None:
    """Consume a metadata value of ``type_tag`` without retaining it.

    Args:
        fh: Binary stream positioned at the start of the value.
        type_tag: GGUF value type tag of the value to skip.
        _depth: Internal array-nesting counter; callers leave it at 0.

    Raises:
        GGUFParseError: On an unknown type tag, an array longer than
            ``_MAX_ARRAY_LEN``, arrays nested deeper than
            ``_MAX_ARRAY_NESTING``, or a value that extends past EOF.
    """
    if type_tag == _GGUF_TYPE_STRING:
        _read_gguf_string(fh)
        return
    if type_tag == _GGUF_TYPE_ARRAY:
        if _depth >= _MAX_ARRAY_NESTING:
            raise GGUFParseError(
                f"array nesting exceeds {_MAX_ARRAY_NESTING} levels"
            )
        elem_type = _read_u32(fh)
        count = _read_u64(fh)
        if count > _MAX_ARRAY_LEN:
            raise GGUFParseError(f"array length {count} exceeds cap")
        elem = _SCALAR.get(elem_type)
        if elem is not None:
            # Fixed-width elements: skip the whole payload in one step
            # instead of one Python call per element.
            span = count * elem[1]
            if span:
                # seek() silently moves past EOF, so land on the final
                # payload byte and read it to prove the payload exists.
                fh.seek(span - 1, 1)
                _read_exact(fh, 1)
            return
        for _ in range(count):
            _skip_value(fh, elem_type, _depth + 1)
        return
    fmt_size = _SCALAR.get(type_tag)
    if fmt_size is None:
        raise GGUFParseError(f"unknown value type tag {type_tag}")
    # At most 8 bytes: read-and-discard so a truncated file is reported.
    _read_exact(fh, fmt_size[1])
206
+
207
+
208
def _read_scalar_value(fh: BinaryIO, type_tag: int) -> object:
    """Read one metadata value and return it when it is a string or scalar.

    Arrays are consumed from the stream but not materialised; ``None``
    is returned in their place.
    """
    if type_tag == _GGUF_TYPE_ARRAY:
        _skip_value(fh, type_tag)
        return None
    reader = _read_gguf_string if type_tag == _GGUF_TYPE_STRING else None
    if reader is not None:
        return reader(fh)
    return _read_scalar(fh, type_tag)
216
+
217
+
218
+ # ---------------------------------------------------------------------------
219
+ # Public API
220
+ # ---------------------------------------------------------------------------
221
+
222
# Suffixes of the arch-prefixed keys we capture (e.g. "llama.block_count").
_KEY_BLOCK_COUNT = ".block_count"
_KEY_EMBED_LEN = ".embedding_length"
_KEY_HEAD_COUNT = ".attention.head_count"
_KEY_HEAD_COUNT_KV = ".attention.head_count_kv"


def read_gguf_metadata(path: str | Path) -> GGUFInfo:
    """Parse the GGUF header at ``path`` and return a :class:`GGUFInfo`.

    Only the keys needed for memory accounting are captured; every other
    key/value pair is consumed and discarded.

    Raises:
        GGUFParseError: If the file cannot be stat'ed, opened or read,
            is too short, is not a GGUF container, uses an unsupported
            version, or carries a header exceeding the defensive caps.
    """
    p = Path(path)
    try:
        file_size = p.stat().st_size
    except OSError as exc:  # missing / unreadable
        raise GGUFParseError(f"cannot stat {path}: {exc}") from exc

    arch: str | None = None
    n_layers: int | None = None
    n_embd: int | None = None
    n_heads: int | None = None
    n_kv_heads: int | None = None
    file_type: int | None = None

    try:
        with p.open("rb") as fh:
            magic = fh.read(4)
            if magic != _GGUF_MAGIC:
                raise GGUFParseError(f"bad magic {magic!r} (not a GGUF file)")
            version = _read_u32(fh)
            if version not in (2, 3):
                raise GGUFParseError(f"unsupported GGUF version {version}")
            _read_u64(fh)  # tensor_count: advance cursor, not needed
            kv_count = _read_u64(fh)
            if kv_count > _MAX_KV_PAIRS:
                raise GGUFParseError(f"kv_count {kv_count} exceeds cap")

            for _ in range(kv_count):
                key = _read_gguf_string(fh)
                value_type = _read_u32(fh)
                value = _read_scalar_value(fh, value_type)

                if key == "general.architecture":
                    if isinstance(value, str):
                        arch = value
                    continue
                # bool is a subclass of int; a BOOL-typed value must not
                # be accepted as a count or as the file-type enum.
                if isinstance(value, bool) or not isinstance(value, int):
                    continue
                if key == "general.file_type":
                    file_type = value
                elif key.endswith(_KEY_BLOCK_COUNT):
                    n_layers = value
                elif key.endswith(_KEY_EMBED_LEN):
                    n_embd = value
                elif key.endswith(_KEY_HEAD_COUNT_KV):
                    n_kv_heads = value
                elif key.endswith(_KEY_HEAD_COUNT):
                    n_heads = value
    except OSError as exc:
        # The path may be a directory, unreadable, or have vanished since
        # stat(); report it through the documented exception type so
        # best-effort callers are not surprised by a raw OSError.
        raise GGUFParseError(f"cannot read {path}: {exc}") from exc
    except RecursionError as exc:
        # Pathologically nested arrays in a crafted header.
        raise GGUFParseError("metadata arrays are nested too deeply") from exc

    return GGUFInfo(
        architecture=arch,
        n_layers=n_layers,
        n_embd=n_embd,
        n_heads=n_heads,
        n_kv_heads=n_kv_heads,
        file_type=file_type,
        file_size_bytes=file_size,
    )
288
+
289
+
290
def try_read_gguf_metadata(path: str | Path) -> GGUFInfo | None:
    """Best-effort variant of :func:`read_gguf_metadata`.

    Returns the parsed :class:`GGUFInfo`, or ``None`` when parsing raises
    :class:`GGUFParseError` — convenient for advisory code paths.
    """
    info: GGUFInfo | None
    try:
        info = read_gguf_metadata(path)
    except GGUFParseError:
        info = None
    return info
297
+
298
+
299
# Public names of this module; the underscore-prefixed readers and
# constants above are implementation details and are not exported.
__all__ = [
    "GGUFInfo",
    "GGUFParseError",
    "read_gguf_metadata",
    "try_read_gguf_metadata",
]
@@ -0,0 +1,249 @@
1
+ """Proactive memory-budget guard (low-memory track, L1).
2
+
3
+ Where :mod:`coderouter.guards.memory_pressure` reacts *after* an OOM,
4
+ this guard prevents it: given the host's available memory (from
5
+ :mod:`coderouter.hardware`) and the model's shape (from
6
+ :mod:`coderouter.gguf_introspect`), it computes the largest context
7
+ window (``num_ctx``) that will actually fit, *before* the request is
8
+ dispatched.
9
+
10
+ The engine then (a) caps the backend's ``num_ctx`` to that value and
11
+ (b) trims conversation history to the same budget via
12
+ :func:`coderouter.guards.context_budget.trim_to_budget`.
13
+
14
+ Everything here is **pure** (no I/O, no globals) so it is trivially
15
+ testable and free of the 5-deps constraint.
16
+
17
+ KV-cache model
18
+ ==============
19
+
20
+ The dominant runtime cost beyond the weights is the attention KV
21
+ cache, which grows linearly with context length:
22
+
23
+ kv_bytes ≈ 2 (K and V)
24
+ x n_layers
25
+ x n_ctx
26
+ x kv_dim
27
+ x bytes_per_element
28
+
29
+ ``kv_dim`` is the per-token key/value width. With grouped-query
30
+ attention (GQA) it is ``n_embd x n_kv_heads / n_heads``; without GQA
31
+ metadata it falls back to ``n_embd`` (conservative — over-counts, so
32
+ we under-promise context, which is the safe direction for OOM).
33
+
34
+ ``bytes_per_element`` defaults to 2 (fp16 KV cache). The estimate is
35
+ deliberately conservative; the headroom in :mod:`coderouter.hardware`
36
+ absorbs activation/compute buffers not modelled here.
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ from dataclasses import dataclass
42
+ from typing import Literal
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Constants
46
+ # ---------------------------------------------------------------------------
47
+
48
#: Bytes in one binary gigabyte; used to convert the GB budget into bytes.
_BYTES_PER_GB: int = 1024 * 1024 * 1024

#: KV-cache element size in bytes used when the caller supplies none (fp16).
DEFAULT_KV_BYTES_PER_ELEM: int = 2

#: Share of the memory left after the weights that is reserved for
#: activations and the compute buffer (neither is modelled explicitly).
#: Only the remaining share may be spent on the KV cache.
DEFAULT_COMPUTE_OVERHEAD_RATIO: float = 0.1

#: Layer count / embedding width assumed when GGUF metadata is incomplete.
#: Picked to over-estimate the KV cache (safe: under-promises context).
_FALLBACK_N_LAYERS: int = 32
_FALLBACK_N_EMBD: int = 4096

#: Closed set of outcomes a fit computation can report.
FitAction = Literal["ok", "shrink", "insufficient", "unknown"]
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Result type
68
+ # ---------------------------------------------------------------------------
69
+
70
+
71
+ @dataclass(frozen=True, slots=True)
72
+ class FitDecision:
73
+ """Outcome of a pre-dispatch memory-fit computation.
74
+
75
+ ``action``:
76
+ * ``"ok"`` — requested context fits as-is.
77
+ * ``"shrink"`` — fits only at ``effective_num_ctx`` < requested.
78
+ * ``"insufficient"`` — won't fit even at ``min_num_ctx``; the model
79
+ is too big for this host (caller should warn
80
+ / fall through to another provider).
81
+ * ``"unknown"`` — hardware undetected; guard is a no-op.
82
+ """
83
+
84
+ action: FitAction
85
+ fits: bool
86
+ requested_num_ctx: int
87
+ effective_num_ctx: int
88
+ weights_bytes: int
89
+ kv_cache_bytes: int
90
+ available_bytes: int
91
+ reason: str
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # KV-cache math (pure)
96
+ # ---------------------------------------------------------------------------
97
+
98
+
99
def kv_dim(
    n_embd: int | None,
    n_heads: int | None,
    n_kv_heads: int | None,
) -> int:
    """Return the per-token key/value width, in elements.

    A missing or non-positive ``n_embd`` is replaced by the module's
    fallback width. When both head counts are present and satisfy
    ``0 < n_kv_heads <= n_heads``, the width is scaled by
    ``n_kv_heads / n_heads`` (grouped-query attention) and floored at 1.
    In every other case the full width is returned, which over-counts
    and is therefore the safe direction.
    """
    full_width = n_embd if n_embd is not None and n_embd > 0 else _FALLBACK_N_EMBD

    # Without both head counts the GQA ratio cannot be computed.
    if n_heads is None or n_kv_heads is None:
        return full_width

    # Zero, negative, or inverted head counts are treated as "no GQA".
    if not (0 < n_kv_heads <= n_heads):
        return full_width

    reduced = int(full_width * n_kv_heads / n_heads)
    return max(1, reduced)
118
+
119
+
120
def kv_cache_bytes(
    n_ctx: int,
    n_layers: int,
    kv_width: int,
    *,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
) -> int:
    """Return the KV-cache size in bytes for context length ``n_ctx``.

    The factor of two accounts for storing both K and V. Negative
    ``n_layers``, ``n_ctx`` or ``kv_width`` are treated as zero, so the
    result is zero whenever any of them is non-positive.
    """
    layers = max(0, n_layers)
    tokens = max(0, n_ctx)
    width = max(0, kv_width)
    return 2 * layers * tokens * width * bytes_per_elem
129
+
130
+
131
def max_num_ctx_for_budget(
    kv_budget_bytes: int,
    n_layers: int,
    kv_width: int,
    *,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
) -> int:
    """Return the largest ``n_ctx`` whose KV cache fits in ``kv_budget_bytes``.

    ``n_layers`` and ``kv_width`` are floored at 1 before the per-token
    cost (K and V, hence the factor of two) is computed. The result is 0
    when the budget or the per-token cost is not positive.
    """
    layers = max(1, n_layers)
    width = max(1, kv_width)
    bytes_per_token = 2 * layers * width * bytes_per_elem

    nothing_fits = kv_budget_bytes <= 0 or bytes_per_token <= 0
    return 0 if nothing_fits else int(kv_budget_bytes // bytes_per_token)
143
+
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Fit decision (pure)
147
+ # ---------------------------------------------------------------------------
148
+
149
+
150
def plan_fit(
    *,
    available_budget_gb: float,
    weights_bytes: int,
    requested_num_ctx: int,
    n_layers: int | None,
    n_embd: int | None = None,
    n_heads: int | None = None,
    n_kv_heads: int | None = None,
    min_num_ctx: int = 2048,
    bytes_per_elem: int = DEFAULT_KV_BYTES_PER_ELEM,
    compute_overhead_ratio: float = DEFAULT_COMPUTE_OVERHEAD_RATIO,
) -> FitDecision:
    """Decide whether ``requested_num_ctx`` fits, and by how much to shrink.

    ``available_budget_gb`` is the net memory (after OS headroom) from
    :func:`coderouter.hardware.available_budget_gb`. A value of ``0.0`` or
    less means hardware was undetected → returns an ``"unknown"`` no-op
    decision with ``fits=True`` so the request is not blocked.

    A missing or non-positive ``n_layers`` falls back to the module's
    conservative default; the KV width comes from :func:`kv_dim`, which
    handles missing ``n_embd`` / head counts the same way.

    ``compute_overhead_ratio`` is the share of the post-weights memory
    held back for activations and the compute buffer. It is clamped to
    ``[0.0, 1.0]``: a value above 1 would otherwise turn a *negative*
    post-weights budget into a positive KV budget (reporting an
    oversized model as fitting), and a negative value would grant more
    KV budget than memory actually exists.

    Outcomes, checked in this order:

    * ``"insufficient"`` — the weights leave no memory at all, or the KV
      cache for ``min_num_ctx`` exceeds the KV budget.
    * ``"ok"`` — the KV cache for ``requested_num_ctx`` fits the budget.
    * ``"shrink"`` — the context is reduced to the largest value that
      fits, never below ``min_num_ctx``.
    """

    def _decision(
        action: FitAction,
        *,
        fits: bool,
        effective_num_ctx: int,
        kv_bytes: int,
        available: int,
        reason: str,
    ) -> FitDecision:
        # Single construction site: the request-level fields are the same
        # for every outcome, only the fields passed here vary.
        return FitDecision(
            action=action,
            fits=fits,
            requested_num_ctx=requested_num_ctx,
            effective_num_ctx=effective_num_ctx,
            weights_bytes=weights_bytes,
            kv_cache_bytes=kv_bytes,
            available_bytes=available,
            reason=reason,
        )

    if available_budget_gb <= 0.0:
        return _decision(
            "unknown",
            fits=True,  # don't block when we can't measure
            effective_num_ctx=requested_num_ctx,
            kv_bytes=0,
            available=0,
            reason="hardware undetected; guard no-op",
        )

    available_bytes = int(available_budget_gb * _BYTES_PER_GB)
    layers = n_layers if (n_layers and n_layers > 0) else _FALLBACK_N_LAYERS
    width = kv_dim(n_embd, n_heads, n_kv_heads)

    # Keep the reserved share inside [0, 1] so the multiplication below can
    # neither flip the sign of the budget nor enlarge it.
    overhead = min(1.0, max(0.0, compute_overhead_ratio))

    # Memory left for the KV cache after weights, minus a compute buffer.
    post_weights = available_bytes - max(0, weights_bytes)
    kv_budget = int(post_weights * (1.0 - overhead))

    # Can we even run the minimum context? The explicit ``post_weights``
    # check covers a non-positive ``min_num_ctx``, whose KV cost is zero and
    # would otherwise let a model with no memory left pass as "shrink".
    min_kv = kv_cache_bytes(min_num_ctx, layers, width, bytes_per_elem=bytes_per_elem)
    if post_weights <= 0 or kv_budget < min_kv:
        return _decision(
            "insufficient",
            fits=False,
            effective_num_ctx=min_num_ctx,
            kv_bytes=min_kv,
            available=available_bytes,
            reason=(
                "weights + minimum KV cache exceed available memory; "
                "model too large for this host"
            ),
        )

    ctx_cap = max_num_ctx_for_budget(
        kv_budget, layers, width, bytes_per_elem=bytes_per_elem
    )

    if ctx_cap >= requested_num_ctx:
        return _decision(
            "ok",
            fits=True,
            effective_num_ctx=requested_num_ctx,
            kv_bytes=kv_cache_bytes(
                requested_num_ctx, layers, width, bytes_per_elem=bytes_per_elem
            ),
            available=available_bytes,
            reason="requested context fits",
        )

    # Shrink to the cap, but never below the floor.
    effective = max(min_num_ctx, ctx_cap)
    return _decision(
        "shrink",
        fits=True,
        effective_num_ctx=effective,
        kv_bytes=kv_cache_bytes(
            effective, layers, width, bytes_per_elem=bytes_per_elem
        ),
        available=available_bytes,
        reason=f"context shrunk from {requested_num_ctx} to {effective} to fit memory",
    )
238
+
239
+
240
# Names exported by ``from <module> import *``; everything else is private.
__all__ = [
    # Tunable defaults
    "DEFAULT_COMPUTE_OVERHEAD_RATIO",
    "DEFAULT_KV_BYTES_PER_ELEM",
    # Result types
    "FitAction",
    "FitDecision",
    # KV-cache helpers and the fit-planning entry point
    "kv_cache_bytes",
    "kv_dim",
    "max_num_ctx_for_budget",
    "plan_fit",
]