coderouter-cli 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/config/schemas.py +154 -1
- coderouter/doctor.py +1 -1
- coderouter/guards/backend_health.py +208 -0
- coderouter/guards/memory_pressure.py +210 -0
- coderouter/logging.py +352 -0
- coderouter/metrics/collector.py +86 -0
- coderouter/metrics/prometheus.py +84 -0
- coderouter/routing/auto_router.py +118 -13
- coderouter/routing/budget.py +191 -0
- coderouter/routing/fallback.py +594 -39
- {coderouter_cli-1.9.0.dist-info → coderouter_cli-1.10.0.dist-info}/METADATA +25 -8
- {coderouter_cli-1.9.0.dist-info → coderouter_cli-1.10.0.dist-info}/RECORD +15 -12
- {coderouter_cli-1.9.0.dist-info → coderouter_cli-1.10.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-1.9.0.dist-info → coderouter_cli-1.10.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-1.9.0.dist-info → coderouter_cli-1.10.0.dist-info}/licenses/LICENSE +0 -0
coderouter/config/schemas.py
CHANGED
@@ -128,6 +128,27 @@ class CostConfig(BaseModel):
             "(unusual but theoretically supported by the schema)."
         ),
     )
+    monthly_budget_usd: float | None = Field(
+        default=None,
+        ge=0.0,
+        description=(
+            "v1.10 (derived from LiteLLM; cumulative with v1.9-D): "
+            "per-provider monthly USD spend cap. When set, the engine's "
+            "chain resolver skips this provider and emits "
+            "``skip-budget-exceeded`` once the running per-provider "
+            "total for the current calendar month (UTC) reaches or "
+            "exceeds this value. Unset (None) = no cap (default). "
+            "\n\n"
+            "Reset semantics: in-memory only — running totals zero "
+            "out on process restart and on UTC calendar-month "
+            "rollover. Operators who need durable budget state "
+            "across restarts should pair this with external "
+            "monitoring on the cost dashboard's ``cost_total_usd`` "
+            "panel; persistent budget state is out of scope for "
+            "v1.10 (no on-disk store, no Redis, etc., per the "
+            "5-deps invariant in plan.md §5.4)."
+        ),
+    )
 
 
 class ProviderConfig(BaseModel):
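
The cap semantics above reduce to a single comparison at chain-resolve time. A minimal sketch under stated assumptions: ``budget_allows`` and ``provider_month_total_usd`` are illustrative names, not helpers this diff defines.

    def budget_allows(cap_usd: float | None, provider_month_total_usd: float) -> bool:
        # None (the default) means no cap: the provider is always eligible.
        if cap_usd is None:
            return True
        # Skip (``skip-budget-exceeded``) once the UTC calendar-month
        # running total reaches or exceeds the cap.
        return provider_month_total_usd < cap_usd
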
@@ -317,6 +338,95 @@ class FallbackChain(BaseModel):
             "error response. See FallbackChain comment for trade-offs."
         ),
     )
+    # v1.9-E phase 2 (L2): memory-pressure detection + cooldown.
+    #
+    # Local backends (Ollama / LM Studio / llama.cpp) report VRAM
+    # exhaustion via 5xx responses with bodies like "out of memory" /
+    # "CUDA out of memory" / "insufficient memory". When the chain
+    # encounters one of these, marking the provider as "pressured"
+    # for a cooldown window prevents the engine from re-hammering the
+    # same exhausted backend on the very next request — the chain
+    # falls through to the next provider, which is typically a
+    # lighter-weight model or a remote fallback that has the headroom.
+    #
+    # Three actions trade off intervention against operator preference:
+    # * ``off`` — no detection / no logging / no skip; backward-compatible
+    #   v1.9.x behavior.
+    # * ``warn`` — emit a ``memory-pressure-detected`` log when an OOM
+    #   error is observed; do not skip on subsequent calls.
+    # * ``skip`` — ``warn`` + put the provider in a cooldown window;
+    #   subsequent chain resolves filter it out and emit
+    #   ``skip-memory-pressure`` until the cooldown expires.
+    memory_pressure_action: Literal["off", "warn", "skip"] = Field(
+        default="warn",
+        description=(
+            "v1.9-E (L2 phase 2): action on observed backend OOM "
+            "(provider failure with an out-of-memory error body). "
+            "``warn`` (default) logs only — diagnostic, no chain "
+            "behavior change. ``skip`` enters a cooldown window so "
+            "the next request's chain resolver filters the pressured "
+            "provider out and falls through to the next entry. "
+            "``off`` disables the detector entirely (zero "
+            "observation overhead, identical to v1.9.x behavior)."
+        ),
+    )
+    memory_pressure_cooldown_s: int = Field(
+        default=120,
+        ge=10,
+        le=3600,
+        description=(
+            "v1.9-E (L2 phase 2): cooldown window in seconds applied "
+            "after an OOM detection when ``memory_pressure_action`` "
+            "is ``skip``. Default 120 s gives the local backend "
+            "enough time to release model state from VRAM before the "
+            "engine re-attempts. Capped at 3600 s (1 hour) — anything "
+            "longer is better expressed as marking the provider "
+            "``paid: true`` and bouncing the process."
+        ),
+    )
+    # v1.9-E phase 2 (L5): backend health monitoring (passive).
+    #
+    # A consecutive-failure state machine per provider:
+    # * HEALTHY   — no recent failures (initial state).
+    # * DEGRADED  — ``backend_health_threshold`` consecutive failures
+    #               observed; the provider has lost its "fresh" status
+    #               but is still attempted in chain order.
+    # * UNHEALTHY — ``2 x backend_health_threshold`` consecutive
+    #               failures; depending on the action, the provider
+    #               is either demoted to chain end or skipped entirely.
+    # A single success on ``provider-ok`` resets the counter and the
+    # state to HEALTHY immediately — no rolling window, no debounce.
+    # Distinct from the v1.9-C ``adaptive`` gradient (continuous
+    # latency / error-rate buffer with debounce), which handles the
+    # "slow but alive" case; L5 handles the "hard crash" case.
+    backend_health_action: Literal["off", "warn", "demote"] = Field(
+        default="warn",
+        description=(
+            "v1.9-E (L5 phase 2): action when a provider transitions "
+            "to UNHEALTHY (consecutive failures crossed the threshold). "
+            "``warn`` (default) emits a state-change log line only — "
+            "diagnostic, no chain reorder. ``demote`` additionally "
+            "moves the UNHEALTHY provider to the back of the chain "
+            "for the next ``_resolve_chain`` (similar to v1.9-C "
+            "adaptive demotion but state-machine-based, not "
+            "rolling-window-based). ``off`` disables the monitor "
+            "entirely (zero observation overhead, identical to "
+            "v1.9.x behavior)."
+        ),
+    )
+    backend_health_threshold: int = Field(
+        default=3,
+        ge=2,
+        le=20,
+        description=(
+            "v1.9-E (L5 phase 2): consecutive-failure count that "
+            "triggers the HEALTHY → DEGRADED transition. The "
+            "DEGRADED → UNHEALTHY transition fires at ``2x`` this "
+            "value. Default 3 catches "
+            "Ollama / LM Studio crashes (which produce a deterministic "
+            "5xx pattern on every retry) without flapping on transient "
+            "blips that the v1.9-C adaptive adjuster already handles."
+        ),
+    )
     adaptive: bool = Field(
         default=False,
         description=(
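
Taken together, the four new fields opt a chain into both guards. The values below are one illustrative opt-in configuration, not defaults; all other FallbackChain fields are elided:

    # Guard knobs added in this release (ranges and defaults per the
    # Field declarations above).
    chain_guard_overrides = {
        "memory_pressure_action": "skip",    # "off" | "warn" (default) | "skip"
        "memory_pressure_cooldown_s": 300,   # int, 10..3600; default 120
        "backend_health_action": "demote",   # "off" | "warn" (default) | "demote"
        "backend_health_threshold": 3,       # int, 2..20; UNHEALTHY fires at 2x
    }
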
@@ -358,6 +468,36 @@ class RuleMatcher(BaseModel):
     - ``content_contains: "foo"`` — substring match (case-sensitive).
     - ``content_regex: r"..."`` — Python ``re.search``; compiled at
       model-construction time so typos fail startup.
+
+    Variants ([Unreleased] / per-model auto-routing, derived from
+    free-claude-code):
+
+    - ``model_pattern: r"claude-3-5-haiku.*"`` — Python ``re.fullmatch``
+      against the request body's ``model`` field. Lets clients route on
+      the model identifier the agent (Claude Code / Cursor) sent
+      (Opus / Sonnet / Haiku → different profiles) without needing an
+      explicit ``profile`` field on the wire. Compiled at load like
+      ``content_regex``. ``fullmatch`` semantics (vs ``search`` for
+      ``content_regex``) because model identifiers are structured tokens
+      — users typically describe the whole identifier with a wildcard
+      tail, not an arbitrary substring.
+
+    Variants ([Unreleased] / longContext auto-switch, derived from
+    claude-code-router):
+
+    - ``content_token_count_min: 32000`` — char-count ÷ 4 heuristic
+      across **all** messages in the request body (not just the
+      latest user message — this matcher describes the request's
+      overall size). When the estimated token count is ``>=`` the
+      threshold, route to a long-context profile (typically pointing
+      at Gemini Flash 1M ctx, Haiku 200K, etc.). Distinct from the
+      other content matchers, which operate on the latest user
+      message only — context-window pressure is a request-shape
+      property, not a per-turn property. The estimator deliberately
+      avoids tiktoken / SentencePiece (forbidden by the 5-deps
+      invariant in plan.md §5.4); operators with non-English-heavy
+      workloads can compensate by tuning the threshold, since the
+      char/4 heuristic is conservative for CJK and looser for
+      English code.
     """
 
     model_config = ConfigDict(extra="forbid")
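
The char ÷ 4 heuristic is small enough to state inline. A minimal sketch of the estimate as the docstring describes it; the function name and message shape are illustrative, not this package's internals:

    def estimate_request_tokens(messages: list[dict]) -> int:
        # Sum characters across ALL messages (request-shape property,
        # not per-turn), then apply the chars/4 approximation. Per the
        # docstring, this is conservative for CJK and looser for
        # English code, hence the operator-tunable threshold.
        total_chars = sum(len(str(m.get("content", ""))) for m in messages)
        return total_chars // 4

A rule with ``content_token_count_min: 32000`` would then fire whenever this estimate is >= 32000.
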
@@ -366,12 +506,16 @@ class RuleMatcher(BaseModel):
     code_fence_ratio_min: float | None = Field(default=None, ge=0.0, le=1.0)
     content_contains: str | None = None
     content_regex: str | None = None
+    model_pattern: str | None = None
+    content_token_count_min: int | None = Field(default=None, ge=1)
 
     _MATCHER_FIELDS: tuple[str, ...] = (
         "has_image",
         "code_fence_ratio_min",
         "content_contains",
         "content_regex",
+        "model_pattern",
+        "content_token_count_min",
     )
 
     @model_validator(mode="after")
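
The fullmatch-vs-search split that the docstring draws for ``model_pattern`` vs ``content_regex`` can be checked with the standard library alone (identifiers below are illustrative):

    import re

    p = re.compile(r"claude-3-5-haiku.*")

    # ``model_pattern`` uses fullmatch: the whole identifier must match.
    assert p.fullmatch("claude-3-5-haiku-20241022")
    assert p.fullmatch("anthropic/claude-3-5-haiku") is None  # prefixed form rejected

    # ``content_regex`` uses search, which would accept the prefixed form too.
    assert re.search(r"claude-3-5-haiku.*", "anthropic/claude-3-5-haiku")
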
@@ -388,7 +532,9 @@ class RuleMatcher(BaseModel):
 
     @model_validator(mode="after")
     def _compile_regex_eagerly(self) -> Self:
-        """Compile ``content_regex`` at load so bad
+        """Compile ``content_regex`` / ``model_pattern`` at load so bad
+        patterns fail startup rather than at first request.
+        """
         if self.content_regex is not None:
             try:
                 re.compile(self.content_regex)

@@ -396,6 +542,13 @@ class RuleMatcher(BaseModel):
                 raise ValueError(
                     f"Invalid regex for content_regex {self.content_regex!r}: {exc}"
                 ) from exc
+        if self.model_pattern is not None:
+            try:
+                re.compile(self.model_pattern)
+            except re.error as exc:
+                raise ValueError(
+                    f"Invalid regex for model_pattern {self.model_pattern!r}: {exc}"
+                ) from exc
         return self
 
 
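
Because the validator runs at model construction, a typo'd pattern fails config load instead of the first request. A sketch of the observable behavior, assuming pydantic v2's usual wrapping of validator ValueErrors:

    import pydantic
    from coderouter.config.schemas import RuleMatcher

    try:
        RuleMatcher(model_pattern="claude-(")  # unbalanced parenthesis
    except pydantic.ValidationError as exc:
        print(exc)  # carries "Invalid regex for model_pattern 'claude-(': ..."
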
coderouter/doctor.py
CHANGED
@@ -448,7 +448,7 @@ _STREAMING_PROBE_MIN_EXPECTED_CHARS = 40
 # trace plus the actual answer.
 #
 # Numbers picked from the v1.8.1 reality-check session
-# (docs/articles/note-v1-8-1-reality-check.md):
+# (docs/articles/v1-saga/note-1-v1-8-1-reality-check.md):
 # * Gemma 4 26B reasoning prefix observed at ~150-300 tokens before
 #   content starts → 1024 covers reasoning + 30-line count comfortably.
 # * Non-thinking baseline kept conservative-but-non-tight (256/512) to
coderouter/guards/backend_health.py
ADDED

@@ -0,0 +1,208 @@
+"""Backend health monitor (v1.9-E phase 2, L5).
+
+Passive health state machine for chain providers. Counts consecutive
+failures observed via :meth:`record_attempt(provider, success=False)`
+and transitions the provider's state through:
+
+    HEALTHY → DEGRADED → UNHEALTHY
+       ▲          │           │
+       └──────────┴───────────┘
+             success=True
+
+The engine consults :meth:`is_unhealthy(provider)` at chain-resolve
+time and (when the active profile's ``backend_health_action`` is
+``demote``) moves UNHEALTHY providers to the back of the chain, so
+the chain prefers a known-up backend without ever skipping a
+provider entirely. A single subsequent success snaps the provider
+back to HEALTHY immediately — no rolling-window inertia, no debounce.
+
+Why state-machine, not rolling window
+=====================================
+
+The v1.9-C :class:`coderouter.routing.adaptive.AdaptiveAdjuster`
+already handles the **gradient** case (continuous latency / error-
+rate observations, rolling window, debounced demotions). L5 handles
+the **binary** case: did this backend just crash and start refusing
+every request? A hard crash produces a deterministic stream of
+identical errors on every retry — a state machine catches it in
+``threshold`` attempts without waiting for a rolling window to
+saturate.
+
+The two adjusters are orthogonal and compose: the engine consults
+the AdaptiveAdjuster's ``compute_effective_order`` first, then the
+L5 monitor's UNHEALTHY demotion runs on the result. Either signal
+alone is enough to demote a bad provider; both together produce the
+expected chain reorder.
+
+Concurrency
+===========
+
+Thread-safe via an internal ``RLock``. Reads (``state_for`` /
+``is_unhealthy``) and writes (``record_attempt``) hold the lock for
+the body of the operation so a chain resolve and a per-attempt
+record can't observe a torn state.
+"""
+
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from typing import Literal
+
+HealthState = Literal["HEALTHY", "DEGRADED", "UNHEALTHY"]
+"""Three-class health classification.
+
+- ``HEALTHY`` — initial state; consecutive failure count is 0.
+- ``DEGRADED`` — failure count has reached ``threshold``; the
+  provider is still attempted in chain order, but the state-changed
+  log fires for operator visibility.
+- ``UNHEALTHY`` — failure count has reached ``2 * threshold``; when
+  the profile's action is ``demote``, the chain resolver moves this
+  provider to the back. A single success resets back to ``HEALTHY``
+  directly.
+"""
+
+
+@dataclass(frozen=True)
+class HealthTransition:
+    """The outcome of a state-changing :meth:`record_attempt` call.
+
+    Returned by :meth:`record_attempt` so the engine can decide
+    whether to emit a ``backend-health-changed`` log line. The
+    engine treats ``None`` returns as "no state change" and stays
+    quiet; non-None returns surface a single info line carrying the
+    transition.
+    """
+
+    provider: str
+    old_state: HealthState
+    new_state: HealthState
+    consecutive_failures: int
+
+
+@dataclass
+class _ProviderHealth:
+    state: HealthState = "HEALTHY"
+    consecutive_failures: int = 0
+
+
+class BackendHealthMonitor:
+    """Per-provider health state machine.
+
+    Public API:
+
+    - :meth:`record_attempt(provider, success)` — fold one observed
+      attempt outcome. Returns a :class:`HealthTransition` iff the
+      provider's state changed; ``None`` otherwise.
+    - :meth:`state_for(provider)` — current state (``HEALTHY``
+      default for never-observed providers).
+    - :meth:`is_unhealthy(provider)` — convenience predicate the
+      engine consults at chain-resolve time.
+    - :meth:`reset()` — drop all state. Mainly for tests.
+
+    The threshold parameter is supplied per-call (rather than stored
+    on the monitor) so different profiles in the same engine can use
+    different thresholds without forcing the monitor to be aware of
+    profile resolution. The transition rules are:
+
+    * failure → ``consecutive_failures += 1``
+      - if it reaches ``2 * threshold``: state = UNHEALTHY
+      - elif it reaches ``threshold``: state = DEGRADED
+      - else state unchanged
+    * success → ``consecutive_failures = 0``, state = HEALTHY
+    """
+
+    def __init__(self) -> None:
+        self._lock: threading.RLock = threading.RLock()
+        self._state: dict[str, _ProviderHealth] = {}
+
+    # ------------------------------------------------------------------
+    # Recording
+    # ------------------------------------------------------------------
+
+    def record_attempt(
+        self,
+        provider: str,
+        *,
+        success: bool,
+        threshold: int,
+    ) -> HealthTransition | None:
+        """Fold one observed attempt outcome and return a transition (if any).
+
+        Returns ``None`` when the operation didn't change the
+        provider's state — the engine uses this to gate
+        ``backend-health-changed`` log emissions (no log spam when
+        a HEALTHY provider succeeds repeatedly).
+        """
+        with self._lock:
+            entry = self._state.setdefault(provider, _ProviderHealth())
+            old_state = entry.state
+
+            if success:
+                entry.consecutive_failures = 0
+                if old_state != "HEALTHY":
+                    entry.state = "HEALTHY"
+                    return HealthTransition(
+                        provider=provider,
+                        old_state=old_state,
+                        new_state="HEALTHY",
+                        consecutive_failures=0,
+                    )
+                return None
+
+            entry.consecutive_failures += 1
+            new_state: HealthState
+            if entry.consecutive_failures >= 2 * threshold:
+                new_state = "UNHEALTHY"
+            elif entry.consecutive_failures >= threshold:
+                new_state = "DEGRADED"
+            else:
+                # Below threshold — state unchanged but failure
+                # counter is still ticking (caller may transition
+                # us on a future call).
+                return None
+
+            if new_state == old_state:
+                # We've already been at this level (e.g. already
+                # UNHEALTHY and failing again) — no transition fired.
+                return None
+
+            entry.state = new_state
+            return HealthTransition(
+                provider=provider,
+                old_state=old_state,
+                new_state=new_state,
+                consecutive_failures=entry.consecutive_failures,
+            )
+
+    def reset(self) -> None:
+        """Drop all per-provider state."""
+        with self._lock:
+            self._state.clear()
+
+    # ------------------------------------------------------------------
+    # Read-only API
+    # ------------------------------------------------------------------
+
+    def state_for(self, provider: str) -> HealthState:
+        """Return ``provider``'s current health state.
+
+        Never-observed providers default to ``HEALTHY`` — the chain
+        resolver doesn't have to special-case "first attempt".
+        """
+        with self._lock:
+            entry = self._state.get(provider)
+            if entry is None:
+                return "HEALTHY"
+            return entry.state
+
+    def is_unhealthy(self, provider: str) -> bool:
+        """True iff ``provider``'s current state is ``UNHEALTHY``."""
+        return self.state_for(provider) == "UNHEALTHY"
+
+
+__all__ = [
+    "BackendHealthMonitor",
+    "HealthState",
+    "HealthTransition",
+]
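
The monitor is small enough to exercise end to end. A usage sketch against the class as added; the provider name is illustrative:

    from coderouter.guards.backend_health import BackendHealthMonitor

    monitor = BackendHealthMonitor()

    # threshold=3: DEGRADED after 3 consecutive failures, UNHEALTHY after 6.
    for i in range(6):
        t = monitor.record_attempt("ollama-local", success=False, threshold=3)
        if t is not None:
            print(f"failure {i + 1}: {t.old_state} -> {t.new_state}")
    # failure 3: HEALTHY -> DEGRADED
    # failure 6: DEGRADED -> UNHEALTHY

    assert monitor.is_unhealthy("ollama-local")

    # One success snaps straight back to HEALTHY, no debounce.
    t = monitor.record_attempt("ollama-local", success=True, threshold=3)
    assert t is not None and t.new_state == "HEALTHY"
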
coderouter/guards/memory_pressure.py
ADDED

@@ -0,0 +1,210 @@
+"""Memory-pressure detection guard (v1.9-E phase 2, L2).
+
+Local backends (Ollama / LM Studio / llama.cpp) report VRAM
+exhaustion via HTTP 5xx with error bodies that include phrases like
+``"out of memory"``, ``"CUDA out of memory"``, ``"insufficient
+memory"``, etc. Without intervention, the engine retries against the
+same backend on the very next request and trips the same OOM — wasted
+latency, wasted tokens (when the failure happens after partial
+generation), and an operator-visible cascade of 5xx in the dashboard.
+
+This module gives the engine two pieces:
+
+1. A **stateless detector** :func:`is_memory_pressure_error` that
+   decides "is this AdapterError an OOM-coded failure" from the
+   error message text. Pure, no observability dependencies.
+2. A **stateful tracker** :class:`MemoryPressureGuard` that records
+   "provider X is pressured until ts" entries (TTL-based cooldown)
+   and answers ``is_pressured(provider)`` at chain-resolve time.
+
+The combination lets the engine react to an observed OOM by skipping
+the same provider for ``memory_pressure_cooldown_s`` seconds — the
+chain falls through to the next provider, which is typically a
+lighter-weight model or a remote fallback with the headroom.
+
+Detection patterns
+==================
+
+Case-insensitive substring match against a curated phrase list; no
+regex backtracking. Patterns chosen to match the actual error bodies
+observed across:
+
+* **Ollama** ``/api/generate`` and ``/v1/chat/completions``:
+  ``"model requires more system memory"``,
+  ``"out of memory"``.
+* **LM Studio** ``/v1/messages`` and ``/v1/chat/completions``:
+  ``"insufficient memory"``, ``"failed to load model"``.
+* **llama.cpp** ``llama-server``:
+  ``"failed to allocate"``, ``"CUDA out of memory"``.
+* Generic CUDA / Metal patterns:
+  ``"out of memory"``, ``"OOM"``.
+
+False-positive risk is low because all patterns require the
+substring "memory" or the literal "OOM" in the failure body — generic
+HTTP errors don't include those words.
+
+Concurrency
+===========
+
+The tracker is safe across asyncio tasks within one event loop and
+across worker threads — every public method holds an ``RLock`` for
+the body of its read/write.
+"""
+
+from __future__ import annotations
+
+import threading
+import time
+
+from coderouter.adapters.base import AdapterError
+
+# ---------------------------------------------------------------------------
+# Detection patterns
+# ---------------------------------------------------------------------------
+
+
+# Lowercased substrings — tested against ``str(adapter_error).lower()``.
+# Order doesn't matter (any-match short-circuits). Each entry is a phrase
+# observed in a real backend OOM response. New patterns can be added
+# defensively (false-positive risk is low; see module docstring).
+_MEMORY_PRESSURE_PHRASES: tuple[str, ...] = (
+    "out of memory",
+    "cuda out of memory",
+    "metal out of memory",
+    "insufficient memory",
+    "model requires more system memory",
+    "failed to allocate",
+    "failed to load model",
+    "ggml_cuda_host_malloc",  # llama.cpp specific
+    "oom",
+)
+
+
+def is_memory_pressure_error(exc: AdapterError) -> bool:
+    """Return True iff ``exc`` looks like a backend OOM signal.
+
+    Pure function. Operates on ``str(exc).lower()`` and checks for
+    any of the curated phrases in :data:`_MEMORY_PRESSURE_PHRASES`.
+    Callers in the engine wrap each ``provider-failed`` site to
+    decide whether to mark the provider pressured.
+
+    The check is intentionally **only** about the message text, not
+    the HTTP status. Backends sometimes return 500 vs 503 for OOM
+    inconsistently, and a 500 carrying ``"missing model"`` should
+    NOT be treated as memory pressure (no cooldown helps recover
+    from a config error). The phrase-match keeps the detector
+    focused on the actual signal.
+    """
+    text = str(exc).lower()
+    return any(phrase in text for phrase in _MEMORY_PRESSURE_PHRASES)
+
+
+# ---------------------------------------------------------------------------
+# Tracker
+# ---------------------------------------------------------------------------
+
+
+class MemoryPressureGuard:
+    """In-memory per-provider OOM cooldown tracker.
+
+    Public API:
+
+    - :meth:`mark_pressured(provider, cooldown_s)` — start (or extend)
+      a cooldown window for ``provider``. Idempotent: re-marking a
+      pressured provider extends the deadline to ``now + cooldown_s``.
+    - :meth:`is_pressured(provider)` — True iff the provider's
+      cooldown deadline is in the future. Lazy expiry: when an
+      expired entry is observed, it's swept out so subsequent reads
+      see the entry-less default.
+    - :meth:`pressured_until(provider)` — monotonic timestamp of the
+      cooldown deadline (or 0.0 if not pressured). Useful for log
+      payloads that want to surface the human-readable expiry.
+    - :meth:`reset()` — drop all entries. Mainly for tests.
+
+    Internal lock: an ``RLock`` covers every read/write pair so a
+    concurrent ``mark_pressured`` from a failed call and an
+    ``is_pressured`` from a chain resolve can't observe a torn
+    state.
+
+    Time source: ``time.monotonic`` so cooldowns are immune to
+    wall-clock skew. Tests inject ``now=`` for determinism.
+    """
+
+    def __init__(self) -> None:
+        self._lock: threading.RLock = threading.RLock()
+        self._until: dict[str, float] = {}
+
+    # ------------------------------------------------------------------
+    # Mutating API
+    # ------------------------------------------------------------------
+
+    def mark_pressured(
+        self,
+        provider: str,
+        cooldown_s: float,
+        *,
+        now: float | None = None,
+    ) -> float:
+        """Mark ``provider`` as pressured for ``cooldown_s`` seconds.
+
+        Returns the resulting cooldown deadline (monotonic ts), so
+        callers that want to log the expiry can pull it out without
+        a second locked read. Idempotent — re-marking extends the
+        deadline.
+        """
+        ts = now if now is not None else time.monotonic()
+        deadline = ts + cooldown_s
+        with self._lock:
+            self._until[provider] = deadline
+        return deadline
+
+    def reset(self) -> None:
+        """Drop all cooldown entries immediately."""
+        with self._lock:
+            self._until.clear()
+
+    # ------------------------------------------------------------------
+    # Read-only API
+    # ------------------------------------------------------------------
+
+    def is_pressured(self, provider: str, *, now: float | None = None) -> bool:
+        """True iff ``provider`` is currently in cooldown.
+
+        Lazy expiry: if the recorded deadline has passed, the entry
+        is dropped before this call returns False. Callers don't see
+        stale "pressured" entries.
+        """
+        ts = now if now is not None else time.monotonic()
+        with self._lock:
+            deadline = self._until.get(provider)
+            if deadline is None:
+                return False
+            if deadline <= ts:
+                # Cooldown elapsed — sweep so subsequent calls don't
+                # re-take the lock for an empty entry.
+                del self._until[provider]
+                return False
+            return True
+
+    def pressured_until(
+        self, provider: str, *, now: float | None = None
+    ) -> float:
+        """Return ``provider``'s cooldown deadline, or 0.0 if not pressured.
+
+        Same lazy-expiry behavior as :meth:`is_pressured`.
+        """
+        ts = now if now is not None else time.monotonic()
+        with self._lock:
+            deadline = self._until.get(provider)
+            if deadline is None:
+                return 0.0
+            if deadline <= ts:
+                del self._until[provider]
+                return 0.0
+            return deadline
+
+
+__all__ = [
+    "MemoryPressureGuard",
+    "is_memory_pressure_error",
+]
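
The two pieces compose as the module docstring describes: detect, mark, then filter at resolve time. A usage sketch; it assumes ``AdapterError`` accepts a plain message string, which this diff does not show:

    from coderouter.adapters.base import AdapterError
    from coderouter.guards.memory_pressure import (
        MemoryPressureGuard,
        is_memory_pressure_error,
    )

    guard = MemoryPressureGuard()

    err = AdapterError("500: CUDA out of memory while loading model")  # assumed ctor
    assert is_memory_pressure_error(err)

    deadline = guard.mark_pressured("lmstudio-local", cooldown_s=120.0)
    assert guard.is_pressured("lmstudio-local")

    # Deterministic expiry via the injectable clock, as tests would use it:
    assert not guard.is_pressured("lmstudio-local", now=deadline + 1.0)
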