coderouter-cli 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,349 @@
+ """Continuous health probing (v2.0-I).
+
+ Background task that periodically sends minimal 1-token requests to each
+ configured provider, feeding the results into the L5 backend health
+ state machine. Detects provider crashes during idle periods (no user
+ traffic) so the chain resolver knows to skip/demote a dead backend
+ before the next real request hits it.
+
+ Architecture
+ ============
+
+ ::
+
+     lifespan startup
+       └─ asyncio.create_task(probe_loop(...))
+
+     probe_loop:
+         while not shutdown:
+             sleep(interval_s)
+             for provider in providers:
+                 result = await probe_one(provider)
+                 backend_health.record_attempt(...)
+                 emit log + metrics
+
+ Design choices
+ ==============
+
+ - **1-token completion** rather than ``/api/version`` or ``/api/tags``
+   because version endpoints are Ollama-only; a 1-token generate confirms
+   the entire model-serving pipeline is operational (model loaded, KV
+   allocated, inference works).
+ - **Sequential** probing (not parallel) to avoid hammering backends and
+   to keep the implementation trivially correct without gather/semaphore.
+ - **No new dependency** — uses httpx (already a runtime dep) + asyncio
+   (stdlib).
+ - **Graceful shutdown** via an ``asyncio.Event`` set by the lifespan
+   exit path. The loop checks the event each iteration and breaks cleanly.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ import httpx
+
+ from coderouter.config.schemas import ProviderConfig
+ from coderouter.logging import (
+     get_logger,
+     log_probe_capabilities_drift,
+     log_probe_completed,
+     log_probe_round_completed,
+ )
+
+ logger = get_logger(__name__)
+
+
+ # ---------------------------------------------------------------------------
+ # ProbeResult
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass(slots=True)
+ class ProbeResult:
+     """Outcome of a single provider probe."""
+
+     provider: str
+     success: bool
+     latency_ms: float
+     error: str | None = None
+     model_name: str | None = None
+     timestamp: float = field(default_factory=time.time)
+
+
+ # ---------------------------------------------------------------------------
+ # probe_one: single-provider 1-token probe
+ # ---------------------------------------------------------------------------
+
+
+ async def probe_one(
+     provider: ProviderConfig,
+     *,
+     timeout_s: float = 10.0,
+ ) -> ProbeResult:
+     """Send a minimal 1-token completion request and measure the response.
+
+     For ``kind: openai_compat``: POST /v1/chat/completions
+     For ``kind: anthropic``: POST /v1/messages
+
+     The request asks for ``max_tokens: 1`` so the probe is as cheap as
+     possible (a single output token is generated, exercising the full
+     model pipeline without producing meaningful output).
+
+     Never raises — all failures are captured in ProbeResult(success=False).
+     """
+     import os
+
+     start = time.monotonic()
+     provider_name = provider.name
+     base_url = str(provider.base_url).rstrip("/")
+
+     # Resolve API key from env (same logic as the adapters)
+     headers: dict[str, str] = {}
+     if provider.api_key_env:
+         api_key = os.environ.get(provider.api_key_env, "")
+         if api_key:
+             if provider.kind == "anthropic":
+                 headers["x-api-key"] = api_key
+                 headers["anthropic-version"] = "2023-06-01"
+             else:
+                 headers["Authorization"] = f"Bearer {api_key}"
+
+     try:
+         async with httpx.AsyncClient(timeout=timeout_s) as client:
+             if provider.kind == "anthropic":
+                 url = f"{base_url}/v1/messages"
+                 body: dict[str, Any] = {
+                     "model": provider.model,
+                     "max_tokens": 1,
+                     "messages": [{"role": "user", "content": "hi"}],
+                 }
+                 resp = await client.post(url, json=body, headers=headers)
+             else:
+                 # openai_compat: Ollama, LM Studio, OpenRouter, etc.
+                 url = f"{base_url}/chat/completions"
+                 body = {
+                     "model": provider.model,
+                     "max_tokens": 1,
+                     "messages": [{"role": "user", "content": "hi"}],
+                 }
+                 resp = await client.post(url, json=body, headers=headers)
+
+         latency_ms = (time.monotonic() - start) * 1000
+
+         if resp.status_code >= 400:
+             return ProbeResult(
+                 provider=provider_name,
+                 success=False,
+                 latency_ms=latency_ms,
+                 error=f"HTTP {resp.status_code}: {resp.text[:200]}",
+             )
+
+         # Extract model name from response (for capabilities drift check)
+         model_name: str | None = None
+         try:
+             data = resp.json()
+             model_name = data.get("model")
+         except Exception:
+             pass
+
+         return ProbeResult(
+             provider=provider_name,
+             success=True,
+             latency_ms=latency_ms,
+             model_name=model_name,
+         )
+
+     except httpx.TimeoutException:
+         latency_ms = (time.monotonic() - start) * 1000
+         return ProbeResult(
+             provider=provider_name,
+             success=False,
+             latency_ms=latency_ms,
+             error=f"timeout after {timeout_s}s",
+         )
+     except Exception as exc:
+         latency_ms = (time.monotonic() - start) * 1000
+         return ProbeResult(
+             provider=provider_name,
+             success=False,
+             latency_ms=latency_ms,
+             error=str(exc)[:200],
+         )
+
+
+ # ---------------------------------------------------------------------------
+ # capabilities drift detection (Phase 3)
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass(slots=True)
+ class DriftReport:
+     """Report of a model-name mismatch between config and probe response."""
+
+     provider: str
+     configured_model: str
+     observed_model: str
+     in_registry: bool
+
+
+ def check_probe_drift(
+     provider: ProviderConfig,
+     observed_model: str | None,
+     *,
+     registry: Any = None,
+ ) -> DriftReport | None:
+     """Compare the probe response model name against the configured model.
+
+     Returns a :class:`DriftReport` when the observed model differs from
+     ``provider.model``, or ``None`` when they match (or when no model
+     name was returned by the probe). The ``registry`` argument is an
+     optional :class:`CapabilityRegistry` instance used to check whether
+     the observed model has a known entry — when it doesn't, the report
+     sets ``in_registry=False`` as an extra signal for the operator.
+
+     Never raises — a missing registry or lookup error just defaults to
+     ``in_registry=True`` (conservative, avoids false positives).
+     """
+     if not observed_model:
+         return None
+
+     configured = provider.model or ""
+
+     # Normalize: some backends return the model with a prefix or
+     # formatting variation. We compare case-sensitively but strip
+     # whitespace.
+     if observed_model.strip() == configured.strip():
+         return None
+
+     # Check registry for the observed model
+     in_registry = True
+     if registry is not None:
+         try:
+             resolved = registry.lookup(kind=provider.kind, model=observed_model)
+             # If every resolved field is None, the model is unknown
+             if (
+                 resolved.thinking is None
+                 and resolved.tools is None
+                 and resolved.max_context_tokens is None
+                 and resolved.claude_code_suitability is None
+                 and resolved.cache_control is None
+             ):
+                 in_registry = False
+         except Exception:
+             pass  # defensive — never crash the probe loop
+
+     return DriftReport(
+         provider=provider.name,
+         configured_model=configured,
+         observed_model=observed_model,
+         in_registry=in_registry,
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # probe_loop: background task
+ # ---------------------------------------------------------------------------
+
+
+ async def probe_loop(
+     providers: list[ProviderConfig],
+     *,
+     record_fn: Any = None,
+     interval_s: float = 60.0,
+     timeout_s: float = 10.0,
+     probe_paid: bool = False,
+     shutdown_event: asyncio.Event | None = None,
+     health_threshold: int = 3,
+     registry: Any = None,
+ ) -> None:
+     """Run continuous health probes in an infinite loop until shutdown.
+
+     Args:
+         providers: list of provider configs to probe.
+         record_fn: callable(provider_name, *, success, threshold) that
+             feeds the backend health state machine. When None, results
+             are only logged (useful for testing).
+         interval_s: seconds to sleep between probe rounds.
+         timeout_s: per-provider probe timeout.
+         probe_paid: if False, providers with ``paid=True`` are skipped.
+         shutdown_event: set this event to stop the loop gracefully.
+         health_threshold: consecutive-failure threshold passed to record_fn.
+         registry: optional CapabilityRegistry for model drift detection.
+     """
+     _shutdown = shutdown_event or asyncio.Event()
+
+     # Initial delay: let the server finish startup before the first probe round.
+     try:
+         await asyncio.wait_for(_shutdown.wait(), timeout=interval_s)
+         return  # shutdown during initial delay
+     except TimeoutError:
+         pass  # normal: timeout means the delay elapsed without shutdown
+
+     while not _shutdown.is_set():
+         probed = 0
+         failures = 0
+
+         for provider in providers:
+             if _shutdown.is_set():
+                 break
+             if provider.paid and not probe_paid:
+                 continue
+
+             result = await probe_one(provider, timeout_s=timeout_s)
+             probed += 1
+
+             if not result.success:
+                 failures += 1
+
+             # Feed into backend health state machine
+             if record_fn is not None:
+                 with contextlib.suppress(Exception):
+                     record_fn(
+                         result.provider,
+                         success=result.success,
+                         threshold=health_threshold,
+                     )
+
+             # Log individual result
+             log_probe_completed(
+                 logger,
+                 provider=result.provider,
+                 success=result.success,
+                 latency_ms=result.latency_ms,
+                 error=result.error,
+                 model_name=result.model_name,
+             )
+
+             # Check for model-capabilities drift on success
+             if result.success and result.model_name:
+                 drift = check_probe_drift(
+                     provider, result.model_name, registry=registry
+                 )
+                 if drift is not None:
+                     log_probe_capabilities_drift(
+                         logger,
+                         provider=drift.provider,
+                         configured_model=drift.configured_model,
+                         observed_model=drift.observed_model,
+                         in_registry=drift.in_registry,
+                     )
+
+         # Log round summary
+         if probed > 0:
+             log_probe_round_completed(
+                 logger,
+                 providers_probed=probed,
+                 failures=failures,
+             )
+
+         # Wait for next interval or shutdown
+         try:
+             await asyncio.wait_for(_shutdown.wait(), timeout=interval_s)
+             break  # shutdown signaled
+         except TimeoutError:
+             pass  # normal: sleep elapsed, start next round
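
To make the wiring concrete, here is a minimal sketch of the lifespan startup/shutdown path the module docstring describes. It is an illustration under stated assumptions, not code from this package: ``load_providers()`` and ``backend_health.record_attempt`` are hypothetical stand-ins for the application's config loader and health state machine; only ``probe_loop``'s signature is taken from the diff above. ::

    import asyncio
    from contextlib import asynccontextmanager

    @asynccontextmanager
    async def lifespan(app):
        shutdown = asyncio.Event()
        task = asyncio.create_task(
            probe_loop(
                load_providers(),  # hypothetical: providers.yaml -> list[ProviderConfig]
                record_fn=backend_health.record_attempt,  # hypothetical health sink
                interval_s=60.0,
                shutdown_event=shutdown,
            )
        )
        try:
            yield
        finally:
            shutdown.set()  # probe_loop sees the event and breaks cleanly
            await task

Because ``record_fn`` defaults to None, the same loop can be driven in tests with logging only.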
@@ -0,0 +1,111 @@
+ """Drift detection corrective actions (v2.0-G, L4).
+
+ Currently the only non-trivial action is ``reload`` — flush the KV cache
+ on Ollama-shape providers by sending a ``keep_alive=0`` request to unload
+ the model, forcing a fresh context window on the next request.
+
+ The ``promote`` action is handled directly in fallback.py via
+ ``AdaptiveAdjuster.demote()``.
+
+ Architecture
+ ============
+
+ All functions are **best-effort**: failures are logged but never raised.
+ The engine continues regardless — the worst case is that the model stays
+ loaded with its existing (potentially degraded) KV cache and the adaptive
+ demotion still routes traffic elsewhere until cooldown expires.
+ """
+
+ from __future__ import annotations
+
+ import httpx
+
+ from coderouter.config.schemas import ProviderConfig
+ from coderouter.logging import get_logger, log_drift_reload_attempted
+
+ logger = get_logger(__name__)
+
+
+ def _is_ollama_shape(provider_config: ProviderConfig) -> bool:
+     """Return True if the provider looks like Ollama (port 11434 or num_ctx declared)."""
+     if provider_config.kind != "openai_compat":
+         return False
+     base_url = str(provider_config.base_url)
+     if ":11434" in base_url:
+         return True
+     extra = provider_config.extra_body or {}
+     options = extra.get("options")
+     return isinstance(options, dict) and "num_ctx" in options
+
+
+ def _ollama_base_url(provider_config: ProviderConfig) -> str:
+     """Derive the Ollama native API base URL from the OpenAI-compat base_url.
+
+     Typical patterns:
+     - ``http://localhost:11434/v1`` → ``http://localhost:11434``
+     - ``http://host:11434/v1/`` → ``http://host:11434``
+     """
+     url = str(provider_config.base_url).rstrip("/")
+     # Strip the /v1 suffix to get the Ollama native API root
+     if url.endswith("/v1"):
+         url = url[:-3]
+     return url
+
+
+ async def attempt_reload(provider_config: ProviderConfig) -> bool:
+     """Attempt to flush the Ollama KV cache by unloading the model.
+
+     Sends ``POST /api/generate`` with ``keep_alive: 0`` to the Ollama
+     native API. This causes Ollama to unload the model from memory; the
+     next inference request will reload it with a fresh KV cache.
+
+     Parameters
+     ----------
+     provider_config:
+         The provider's configuration from providers.yaml. Must be
+         Ollama-shape (``kind: openai_compat`` + port 11434 or num_ctx).
+
+     Returns
+     -------
+     True if the unload request succeeded (HTTP 200), False otherwise.
+     Non-Ollama providers return False immediately (no-op).
+     """
+     if not _is_ollama_shape(provider_config):
+         logger.debug(
+             "drift-reload-skip",
+             extra={
+                 "provider": provider_config.name,
+                 "reason": "not-ollama-shape",
+             },
+         )
+         return False
+
+     base_url = _ollama_base_url(provider_config)
+     model = provider_config.model
+
+     try:
+         async with httpx.AsyncClient(timeout=10.0) as client:
+             resp = await client.post(
+                 f"{base_url}/api/generate",
+                 json={
+                     "model": model,
+                     "keep_alive": 0,
+                 },
+             )
+         success = resp.status_code == 200
+     except (httpx.HTTPError, OSError) as exc:
+         logger.debug(
+             "drift-reload-http-error",
+             extra={
+                 "provider": provider_config.name,
+                 "error": str(exc)[:200],
+             },
+         )
+         success = False
+
+     log_drift_reload_attempted(
+         logger,
+         provider=provider_config.name,
+         success=success,
+     )
+     return success
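
A short usage sketch, again illustrative rather than package code: ``provider_cfg`` stands for one Ollama-shape entry parsed from providers.yaml. On the wire the call issues ``POST {base}/api/generate`` with ``{"model": ..., "keep_alive": 0}``, which unloads the model so the next request starts with a fresh KV cache. ::

    reloaded = await attempt_reload(provider_cfg)  # provider_cfg is assumed
    if not reloaded:
        # Best-effort by design: the provider was not Ollama-shape or the
        # unload failed; the model keeps its current KV cache and adaptive
        # demotion keeps routing traffic elsewhere until cooldown expires.
        ...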