coderouter-cli 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -128,6 +128,27 @@ class CostConfig(BaseModel):
             "(unusual but theoretically supported by the schema)."
         ),
     )
+    monthly_budget_usd: float | None = Field(
+        default=None,
+        ge=0.0,
+        description=(
+            "v1.10 (LiteLLM-derived / cumulative follow-up to v1.9-D): per-provider "
+            "monthly USD spend cap. When set, the engine's chain "
+            "resolver skips this provider and emits "
+            "``skip-budget-exceeded`` once the running per-provider "
+            "total for the current calendar month (UTC) reaches or "
+            "exceeds this value. Unset (None) = no cap (default). "
+            "\n\n"
+            "Reset semantics: in-memory only — running totals zero "
+            "out on process restart and on UTC calendar-month "
+            "rollover. Operators who need durable budget state "
+            "across restarts should pair this with external "
+            "monitoring on the cost dashboard's ``cost_total_usd`` "
+            "panel; persistent budget state is out of scope for "
+            "v1.10 (no on-disk store, no Redis, etc., per the "
+            "5-deps invariant in plan.md §5.4)."
+        ),
+    )
 
 
 class ProviderConfig(BaseModel):
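
For reference, a minimal sketch of setting the new field (the ``coderouter.config`` import path is an assumption, and CostConfig's other fields are left at whatever defaults they have; only the field name and semantics come from the diff above):

    # Sketch only — import path assumed, other CostConfig fields omitted.
    from coderouter.config import CostConfig

    capped = CostConfig(monthly_budget_usd=25.0)  # skip provider once $25 is spent this UTC month
    uncapped = CostConfig()                       # monthly_budget_usd=None → no cap

    # Per the description above: when the running per-provider total reaches
    # the cap, the chain resolver emits ``skip-budget-exceeded`` and falls
    # through; totals reset on process restart and at UTC month rollover.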
@@ -317,6 +338,95 @@ class FallbackChain(BaseModel):
             "error response. See FallbackChain comment for trade-offs."
         ),
     )
+    # v1.9-E phase 2 (L2): memory-pressure detection + cooldown.
+    #
+    # Local backends (Ollama / LM Studio / llama.cpp) report VRAM
+    # exhaustion via 5xx responses with bodies like "out of memory" /
+    # "CUDA out of memory" / "insufficient memory". When the chain
+    # encounters one of these, marking the provider as "pressured"
+    # for a cooldown window prevents the engine from re-hammering the
+    # same exhausted backend on the very next request — the chain
+    # falls through to the next provider, which is typically a
+    # lighter-weight model or a remote fallback that has the headroom.
+    #
+    # Three actions trade off intervention against operator preference:
+    #   * ``off``  — no detection / no logging / no skip; identical to
+    #                v1.9.x behavior.
+    #   * ``warn`` — (default) emit a ``memory-pressure-detected`` log
+    #                when an OOM error is observed; do not skip on
+    #                subsequent calls.
+    #   * ``skip`` — ``warn`` + put the provider in a cooldown window;
+    #                subsequent chain resolves filter it out and emit
+    #                ``skip-memory-pressure`` until the cooldown expires.
+    memory_pressure_action: Literal["off", "warn", "skip"] = Field(
+        default="warn",
+        description=(
+            "v1.9-E (L2 phase 2): action on observed backend OOM "
+            "(provider failure with an out-of-memory error body). "
+            "``warn`` (default) logs only — diagnostic, no chain "
+            "behavior change. ``skip`` enters a cooldown window so "
+            "the next request's chain resolver filters the pressured "
+            "provider out and falls through to the next entry. "
+            "``off`` disables the detector entirely (zero "
+            "observation overhead, identical to v1.9.x behavior)."
+        ),
+    )
+    memory_pressure_cooldown_s: int = Field(
+        default=120,
+        ge=10,
+        le=3600,
+        description=(
+            "v1.9-E (L2 phase 2): cooldown window in seconds applied "
+            "after an OOM detection when ``memory_pressure_action`` "
+            "is ``skip``. Default 120 s gives the local backend "
+            "enough time to release model state from VRAM before the "
+            "engine re-attempts. Capped at 3600 s (1 hour) — anything "
+            "longer is better expressed as marking the provider "
+            "``paid: true`` and bouncing the process."
+        ),
+    )
+    # v1.9-E phase 2 (L5): backend health monitoring (passive).
+    #
+    # A consecutive-failure state machine per provider:
+    #   * HEALTHY   — no recent failures (initial state).
+    #   * DEGRADED  — ``backend_health_threshold`` consecutive failures
+    #                 observed; the provider has lost its "fresh" status
+    #                 but is still attempted in chain order.
+    #   * UNHEALTHY — ``2 * backend_health_threshold`` consecutive
+    #                 failures; when the action is ``demote``, the
+    #                 provider is moved to the end of the chain (it is
+    #                 never skipped entirely).
+    # A single success on ``provider-ok`` resets the counter and the
+    # state to HEALTHY immediately — no rolling window, no debounce.
+    # Distinct from the v1.9-C ``adaptive`` gradient (continuous
+    # latency / error-rate buffer with debounce) which handles the
+    # "slow but alive" case; L5 handles the "hard crash" case.
+    backend_health_action: Literal["off", "warn", "demote"] = Field(
+        default="warn",
+        description=(
+            "v1.9-E (L5 phase 2): action when a provider transitions "
+            "to UNHEALTHY (consecutive failures crossed the threshold). "
+            "``warn`` (default) emits a state-change log line only — "
+            "diagnostic, no chain reorder. ``demote`` additionally "
+            "moves the UNHEALTHY provider to the back of the chain "
+            "for the next ``_resolve_chain`` (similar to v1.9-C "
+            "adaptive demotion but state-machine-based, not "
+            "rolling-window-based). ``off`` disables the monitor "
+            "entirely (zero observation overhead, identical to "
+            "v1.9.x behavior)."
+        ),
+    )
+    backend_health_threshold: int = Field(
+        default=3,
+        ge=2,
+        le=20,
+        description=(
+            "v1.9-E (L5 phase 2): consecutive-failure count that "
+            "triggers the HEALTHY → DEGRADED transition. The "
+            "DEGRADED → UNHEALTHY transition fires at ``2x`` this "
+            "value. Default 3 catches Ollama / LM Studio crashes "
+            "(which produce a deterministic 5xx pattern on every "
+            "retry) without flapping on transient blips that the "
+            "v1.9-C adaptive adjuster already handles."
+        ),
+    )
     adaptive: bool = Field(
         default=False,
         description=(
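
Taken together, a profile opting into the aggressive variant of both knobs would look roughly like this (a sketch: the import path is assumed, and ``FallbackChain``'s other fields — the chain entries themselves, retry policy, etc. — are omitted):

    # Sketch only — import path and omitted required fields are assumptions.
    from coderouter.config import FallbackChain

    chain = FallbackChain(
        memory_pressure_action="skip",    # cooldown-skip OOMing providers
        memory_pressure_cooldown_s=300,   # any value in 10..3600 s
        backend_health_action="demote",   # UNHEALTHY providers go to chain end
        backend_health_threshold=3,       # DEGRADED at 3, UNHEALTHY at 2*3=6
    )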
@@ -358,6 +468,36 @@ class RuleMatcher(BaseModel):
     - ``content_contains: "foo"`` — substring match (case-sensitive).
     - ``content_regex: r"..."`` — Python ``re.search``; compiled at
       model-construction time so typos fail startup.
+
+    Variants ([Unreleased] / per-model auto-routing, derived from
+    free-claude-code):
+
+    - ``model_pattern: r"claude-3-5-haiku.*"`` — Python ``re.fullmatch``
+      against the request body's ``model`` field. Lets clients route on
+      the model identifier the agent (Claude Code / Cursor) sent
+      (Opus / Sonnet / Haiku → different profiles) without needing an
+      explicit ``profile`` field on the wire. Compiled at load like
+      ``content_regex``. ``fullmatch`` semantics (vs ``search`` for
+      ``content_regex``) because model identifiers are structured tokens
+      — users typically describe the whole identifier with a wildcard
+      tail, not an arbitrary substring. See the ``re`` demo after this
+      class for the practical difference.
+
+    Variants ([Unreleased] / longContext auto-switch, derived from
+    claude-code-router):
+
+    - ``content_token_count_min: 32000`` — char-count ÷ 4 heuristic
+      across **all** messages in the request body (not just the
+      latest user message — this matcher describes the request's
+      overall size). When the estimated token count is ``>=`` the
+      threshold, route to a long-context profile (typically pointing
+      at Gemini Flash 1M ctx, Haiku 200K, etc.). Distinct from the
+      other content matchers, which operate on the latest user
+      message only — context-window pressure is a request-shape
+      property, not a per-turn property. The estimator deliberately
+      avoids tiktoken / SentencePiece (forbidden by the 5-deps
+      invariant in plan.md §5.4); operators with non-English-heavy
+      workloads can compensate by lowering the threshold, since the
+      char/4 heuristic undercounts tokens for CJK (closer to one
+      token per character) while tracking English prose and code
+      reasonably well.
     """
 
     model_config = ConfigDict(extra="forbid")
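
In spirit, the char ÷ 4 estimator the docstring describes is no more than the following (helper name and message shape are illustrative assumptions, not the package's API):

    def estimate_tokens(messages: list[dict]) -> int:
        """Illustrative char/4 heuristic across ALL messages (assumed shape)."""
        chars = sum(len(str(m.get("content", ""))) for m in messages)
        return chars // 4

    # ``content_token_count_min: 32000`` would then match any request whose
    # estimate is >= 32000, i.e. roughly >= 128,000 characters of content.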
@@ -366,12 +506,16 @@ class RuleMatcher(BaseModel):
     code_fence_ratio_min: float | None = Field(default=None, ge=0.0, le=1.0)
     content_contains: str | None = None
     content_regex: str | None = None
+    model_pattern: str | None = None
+    content_token_count_min: int | None = Field(default=None, ge=1)
 
     _MATCHER_FIELDS: tuple[str, ...] = (
         "has_image",
         "code_fence_ratio_min",
         "content_contains",
         "content_regex",
+        "model_pattern",
+        "content_token_count_min",
     )
 
     @model_validator(mode="after")
@@ -388,7 +532,9 @@ class RuleMatcher(BaseModel):
 
     @model_validator(mode="after")
     def _compile_regex_eagerly(self) -> Self:
-        """Compile ``content_regex`` at load so bad patterns fail startup."""
+        """Compile ``content_regex`` / ``model_pattern`` at load so bad
+        patterns fail startup rather than at first request.
+        """
         if self.content_regex is not None:
             try:
                 re.compile(self.content_regex)
@@ -396,6 +542,13 @@ class RuleMatcher(BaseModel):
                 raise ValueError(
                     f"Invalid regex for content_regex {self.content_regex!r}: {exc}"
                 ) from exc
+        if self.model_pattern is not None:
+            try:
+                re.compile(self.model_pattern)
+            except re.error as exc:
+                raise ValueError(
+                    f"Invalid regex for model_pattern {self.model_pattern!r}: {exc}"
+                ) from exc
         return self
 
 
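
The ``fullmatch`` vs ``search`` distinction referenced in the docstring is easy to see with the standard library directly:

    import re

    re.fullmatch(r"claude-3-5-haiku.*", "claude-3-5-haiku-20241022")  # matches
    re.fullmatch(r"haiku", "claude-3-5-haiku-20241022")               # None — pattern must cover the whole identifier
    re.search(r"haiku", "a prose line mentioning haiku")              # matches — substring semantics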
coderouter/doctor.py CHANGED
@@ -448,7 +448,7 @@ _STREAMING_PROBE_MIN_EXPECTED_CHARS = 40
 # trace plus the actual answer.
 #
 # Numbers picked from the v1.8.1 reality-check session
-# (docs/articles/note-v1-8-1-reality-check.md):
+# (docs/articles/v1-saga/note-1-v1-8-1-reality-check.md):
 # * Gemma 4 26B reasoning prefix observed at ~150-300 tokens before
 #   content starts → 1024 covers reasoning + 30-line count comfortably.
 # * Non-thinking baseline kept conservative-but-non-tight (256/512) to
@@ -0,0 +1,208 @@
+"""Backend health monitor (v1.9-E phase 2, L5).
+
+Passive health state machine for chain providers. Counts consecutive
+failures observed via :meth:`record_attempt(provider, success=False)`
+and transitions the provider's state through:
+
+    HEALTHY → DEGRADED → UNHEALTHY
+       ▲          │           │
+       └──────────┴───────────┘
+              success=True
+
+The engine consults :meth:`is_unhealthy(provider)` at chain-resolve
+time and (when the active profile's ``backend_health_action`` is
+``demote``) moves UNHEALTHY providers to the back of the chain, so
+the chain prefers a known-up backend without ever skipping a
+provider entirely. A single subsequent success snaps the provider
+back to HEALTHY immediately — no rolling-window inertia, no debounce.
+
+Why a state machine, not a rolling window
+=========================================
+
+The v1.9-C :class:`coderouter.routing.adaptive.AdaptiveAdjuster`
+already handles the **gradient** case (continuous latency / error-
+rate observations, rolling window, debounced demotions). L5 handles
+the **binary** case: did this backend just crash and start refusing
+every request? A hard crash produces a deterministic stream of
+identical errors on every retry — a state machine catches it in
+``threshold`` attempts without waiting for a rolling window to
+saturate.
+
+The two adjusters are orthogonal and compose: the engine consults
+the AdaptiveAdjuster's ``compute_effective_order`` first, then the
+L5 monitor's UNHEALTHY demotion runs on the result. Either signal
+alone is enough to demote a bad provider; both together produce the
+expected chain reorder.
+
+Concurrency
+===========
+
+Thread-safe via an internal ``RLock``. Reads (``state_for`` /
+``is_unhealthy``) and writes (``record_attempt``) hold the lock for
+the body of the operation so a chain resolve and a per-attempt
+record can't observe a torn state.
+"""
+
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from typing import Literal
+
+HealthState = Literal["HEALTHY", "DEGRADED", "UNHEALTHY"]
+"""Three-class health classification.
+
+- ``HEALTHY``   — initial state; consecutive failure count is 0.
+- ``DEGRADED``  — failure count has reached ``threshold``; the
+                  provider is still attempted in chain order, but the
+                  state-changed log fires for operator visibility.
+- ``UNHEALTHY`` — failure count has reached ``2 * threshold``; when
+                  the profile's action is ``demote``, the chain
+                  resolver moves this provider to the back. A single
+                  success resets back to ``HEALTHY`` directly.
+"""
+
+
+@dataclass(frozen=True)
+class HealthTransition:
+    """The outcome of a state-changing :meth:`record_attempt` call.
+
+    Returned by :meth:`record_attempt` so the engine can decide
+    whether to emit a ``backend-health-changed`` log line. The
+    engine treats ``None`` returns as "no state change" and stays
+    quiet; non-None returns surface a single info line carrying the
+    transition.
+    """
+
+    provider: str
+    old_state: HealthState
+    new_state: HealthState
+    consecutive_failures: int
+
+
+@dataclass
+class _ProviderHealth:
+    state: HealthState = "HEALTHY"
+    consecutive_failures: int = 0
+
+
+class BackendHealthMonitor:
+    """Per-provider health state machine.
+
+    Public API:
+
+    - :meth:`record_attempt(provider, success)` — fold one observed
+      attempt outcome. Returns a :class:`HealthTransition` iff the
+      provider's state changed; ``None`` otherwise.
+    - :meth:`state_for(provider)` — current state (``HEALTHY``
+      default for never-observed providers).
+    - :meth:`is_unhealthy(provider)` — convenience predicate the
+      engine consults at chain-resolve time.
+    - :meth:`reset()` — drop all state. Mainly for tests.
+
+    The threshold parameter is supplied per call (rather than stored
+    on the monitor) so different profiles in the same engine can use
+    different thresholds without forcing the monitor to be aware of
+    profile resolution. The transition rules are:
+
+    * failure → ``consecutive_failures += 1``
+      - if it reaches ``2 * threshold``: state = UNHEALTHY
+      - elif it reaches ``threshold``: state = DEGRADED
+      - else state unchanged
+    * success → ``consecutive_failures = 0``, state = HEALTHY
+    """
+
+    def __init__(self) -> None:
+        self._lock: threading.RLock = threading.RLock()
+        self._state: dict[str, _ProviderHealth] = {}
+
+    # ------------------------------------------------------------------
+    # Recording
+    # ------------------------------------------------------------------
+
+    def record_attempt(
+        self,
+        provider: str,
+        *,
+        success: bool,
+        threshold: int,
+    ) -> HealthTransition | None:
+        """Fold one observed attempt outcome and return a transition (if any).
+
+        Returns ``None`` when the operation didn't change the
+        provider's state — the engine uses this to gate
+        ``backend-health-changed`` log emissions (no log spam when
+        a HEALTHY provider succeeds repeatedly).
+        """
+        with self._lock:
+            entry = self._state.setdefault(provider, _ProviderHealth())
+            old_state = entry.state
+
+            if success:
+                entry.consecutive_failures = 0
+                if old_state != "HEALTHY":
+                    entry.state = "HEALTHY"
+                    return HealthTransition(
+                        provider=provider,
+                        old_state=old_state,
+                        new_state="HEALTHY",
+                        consecutive_failures=0,
+                    )
+                return None
+
+            entry.consecutive_failures += 1
+            new_state: HealthState
+            if entry.consecutive_failures >= 2 * threshold:
+                new_state = "UNHEALTHY"
+            elif entry.consecutive_failures >= threshold:
+                new_state = "DEGRADED"
+            else:
+                # Below threshold — state unchanged, but the failure
+                # counter keeps ticking (a future call may trigger
+                # the transition).
+                return None
+
+            if new_state == old_state:
+                # Already at this level (e.g. already UNHEALTHY and
+                # failing again) — no transition fired.
+                return None
+
+            entry.state = new_state
+            return HealthTransition(
+                provider=provider,
+                old_state=old_state,
+                new_state=new_state,
+                consecutive_failures=entry.consecutive_failures,
+            )
+
+    def reset(self) -> None:
+        """Drop all per-provider state."""
+        with self._lock:
+            self._state.clear()
+
+    # ------------------------------------------------------------------
+    # Read-only API
+    # ------------------------------------------------------------------
+
+    def state_for(self, provider: str) -> HealthState:
+        """Return ``provider``'s current health state.
+
+        Never-observed providers default to ``HEALTHY`` — the chain
+        resolver doesn't have to special-case "first attempt".
+        """
+        with self._lock:
+            entry = self._state.get(provider)
+            if entry is None:
+                return "HEALTHY"
+            return entry.state
+
+    def is_unhealthy(self, provider: str) -> bool:
+        """True iff ``provider``'s current state is ``UNHEALTHY``."""
+        return self.state_for(provider) == "UNHEALTHY"
+
+
+__all__ = [
+    "BackendHealthMonitor",
+    "HealthState",
+    "HealthTransition",
+]
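
A quick walkthrough of the transition rules with ``threshold=3``, using only the API defined above (the provider name is just an illustrative key):

    monitor = BackendHealthMonitor()

    # Failures 1-2: below threshold, no transition.
    assert monitor.record_attempt("ollama", success=False, threshold=3) is None
    assert monitor.record_attempt("ollama", success=False, threshold=3) is None

    # Failure 3 (== threshold): HEALTHY → DEGRADED.
    t = monitor.record_attempt("ollama", success=False, threshold=3)
    assert t is not None and t.new_state == "DEGRADED"

    # Failures 4-5: still DEGRADED, no new transition fires.
    assert monitor.record_attempt("ollama", success=False, threshold=3) is None
    assert monitor.record_attempt("ollama", success=False, threshold=3) is None

    # Failure 6 (== 2 * threshold): DEGRADED → UNHEALTHY.
    t = monitor.record_attempt("ollama", success=False, threshold=3)
    assert t is not None and t.new_state == "UNHEALTHY"
    assert monitor.is_unhealthy("ollama")

    # One success snaps straight back to HEALTHY.
    t = monitor.record_attempt("ollama", success=True, threshold=3)
    assert t is not None and t.new_state == "HEALTHY"

The ``demote`` composition the module docstring describes then amounts to a stable partition of the adaptive order (a sketch; the shape of ``compute_effective_order``'s output is assumed):

    order = ["ollama", "lmstudio", "openrouter"]   # e.g. adaptive output
    demoted = [p for p in order if monitor.is_unhealthy(p)]
    order = [p for p in order if not monitor.is_unhealthy(p)] + demoted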
@@ -0,0 +1,210 @@
+"""Memory-pressure detection guard (v1.9-E phase 2, L2).
+
+Local backends (Ollama / LM Studio / llama.cpp) report VRAM
+exhaustion via HTTP 5xx with error bodies that include phrases like
+``"out of memory"``, ``"CUDA out of memory"``, ``"insufficient
+memory"``, etc. Without intervention, the engine retries against the
+same backend on the very next request and trips the same OOM — wasted
+latency, wasted tokens (when the failure happens after partial
+generation), and an operator-visible cascade of 5xx in the dashboard.
+
+This module gives the engine two pieces:
+
+1. A **stateless detector** :func:`is_memory_pressure_error` that
+   decides "is this AdapterError an OOM-coded failure?" from the
+   error message text. Pure, no observability dependencies.
+2. A **stateful tracker** :class:`MemoryPressureGuard` that records
+   "provider X is pressured until ts" entries (TTL-based cooldown)
+   and answers ``is_pressured(provider)`` at chain-resolve time.
+
+The combination lets the engine react to an observed OOM by skipping
+the same provider for ``memory_pressure_cooldown_s`` seconds — the
+chain falls through to the next provider, which is typically a
+lighter-weight model or a remote fallback with the headroom.
+
+Detection patterns
+==================
+
+Case-insensitive substring match against a curated phrase list; no
+regex backtracking. Patterns chosen to match the actual error bodies
+observed across:
+
+* **Ollama** ``/api/generate`` and ``/v1/chat/completions``:
+  ``"model requires more system memory"``,
+  ``"out of memory"``.
+* **LM Studio** ``/v1/messages`` and ``/v1/chat/completions``:
+  ``"insufficient memory"``, ``"failed to load model"``.
+* **llama.cpp** ``llama-server``:
+  ``"failed to allocate"``, ``"CUDA out of memory"``.
+* Generic CUDA / Metal patterns:
+  ``"out of memory"``, ``"OOM"``.
+
+False-positive risk is low because the phrases are specific to
+allocation failures — generic HTTP error bodies don't contain them.
+The loosest entry is the bare ``"oom"`` substring, which could in
+principle hit unrelated words ("room", "zoom"), but those rarely
+appear in backend failure bodies.
+
+Concurrency
+===========
+
+The tracker is safe across asyncio tasks within one event loop and
+across worker threads — every public method holds an ``RLock`` for
+the body of its read/write.
+"""
+
+from __future__ import annotations
+
+import threading
+import time
+
+from coderouter.adapters.base import AdapterError
+
+# ---------------------------------------------------------------------------
+# Detection patterns
+# ---------------------------------------------------------------------------
+
+
+# Lowercased substrings — tested against ``str(adapter_error).lower()``.
+# Order doesn't matter (any-match short-circuits). Each entry is a phrase
+# observed in a real backend OOM response. New patterns can be added
+# defensively (false-positive risk is low; see module docstring).
+_MEMORY_PRESSURE_PHRASES: tuple[str, ...] = (
+    "out of memory",
+    "cuda out of memory",
+    "metal out of memory",
+    "insufficient memory",
+    "model requires more system memory",
+    "failed to allocate",
+    "failed to load model",
+    "ggml_cuda_host_malloc",  # llama.cpp specific
+    "oom",
+)
+
+
+def is_memory_pressure_error(exc: AdapterError) -> bool:
+    """Return True iff ``exc`` looks like a backend OOM signal.
+
+    Pure function. Operates on ``str(exc).lower()`` and checks for
+    any of the curated phrases in :data:`_MEMORY_PRESSURE_PHRASES`.
+    Callers in the engine wrap each ``provider-failed`` site to
+    decide whether to mark the provider pressured.
+
+    The check is intentionally **only** about the message text, not
+    the HTTP status. Backends sometimes return 500 vs 503 for OOM
+    inconsistently, and a 500 carrying ``"missing model"`` should
+    NOT be treated as memory pressure (no cooldown helps recover
+    from a config error). The phrase match keeps the detector
+    focused on the actual signal.
+    """
+    text = str(exc).lower()
+    return any(phrase in text for phrase in _MEMORY_PRESSURE_PHRASES)
+
+
+# ---------------------------------------------------------------------------
+# Tracker
+# ---------------------------------------------------------------------------
+
+
+class MemoryPressureGuard:
+    """In-memory per-provider OOM cooldown tracker.
+
+    Public API:
+
+    - :meth:`mark_pressured(provider, cooldown_s)` — start (or extend)
+      a cooldown window for ``provider``. Idempotent: re-marking a
+      pressured provider extends the deadline to ``now + cooldown_s``.
+    - :meth:`is_pressured(provider)` — True iff the provider's
+      cooldown deadline is in the future. Lazy expiry: when an
+      expired entry is observed, it's swept out so subsequent reads
+      see the entry-less default.
+    - :meth:`pressured_until(provider)` — monotonic timestamp of the
+      cooldown deadline (or 0.0 if not pressured). Useful for log
+      payloads that want to surface the human-readable expiry.
+    - :meth:`reset()` — drop all entries. Mainly for tests.
+
+    Internal lock: an ``RLock`` covers every read/write pair so a
+    concurrent ``mark_pressured`` from a failed call and an
+    ``is_pressured`` from a chain resolve can't observe a torn
+    state.
+
+    Time source: ``time.monotonic`` so cooldowns are immune to
+    wall-clock skew. Tests inject ``now=`` for determinism.
+    """
+
+    def __init__(self) -> None:
+        self._lock: threading.RLock = threading.RLock()
+        self._until: dict[str, float] = {}
+
+    # ------------------------------------------------------------------
+    # Mutating API
+    # ------------------------------------------------------------------
+
+    def mark_pressured(
+        self,
+        provider: str,
+        cooldown_s: float,
+        *,
+        now: float | None = None,
+    ) -> float:
+        """Mark ``provider`` as pressured for ``cooldown_s`` seconds.
+
+        Returns the resulting cooldown deadline (monotonic ts), so
+        callers that want to log the expiry can pull it out without
+        a second locked read. Idempotent — re-marking extends the
+        deadline.
+        """
+        ts = now if now is not None else time.monotonic()
+        deadline = ts + cooldown_s
+        with self._lock:
+            self._until[provider] = deadline
+        return deadline
+
+    def reset(self) -> None:
+        """Drop all cooldown entries immediately."""
+        with self._lock:
+            self._until.clear()
+
+    # ------------------------------------------------------------------
+    # Read-only API
+    # ------------------------------------------------------------------
+
+    def is_pressured(self, provider: str, *, now: float | None = None) -> bool:
+        """True iff ``provider`` is currently in cooldown.
+
+        Lazy expiry: if the recorded deadline has passed, the entry
+        is dropped before this call returns False. Callers don't see
+        stale "pressured" entries.
+        """
+        ts = now if now is not None else time.monotonic()
+        with self._lock:
+            deadline = self._until.get(provider)
+            if deadline is None:
+                return False
+            if deadline <= ts:
+                # Cooldown elapsed — sweep the stale entry so the
+                # map doesn't accumulate dead providers.
+                del self._until[provider]
+                return False
+            return True
+
+    def pressured_until(
+        self, provider: str, *, now: float | None = None
+    ) -> float:
+        """Return ``provider``'s cooldown deadline, or 0.0 if not pressured.
+
+        Same lazy-expiry behavior as :meth:`is_pressured`.
+        """
+        ts = now if now is not None else time.monotonic()
+        with self._lock:
+            deadline = self._until.get(provider)
+            if deadline is None:
+                return 0.0
+            if deadline <= ts:
+                del self._until[provider]
+                return 0.0
+            return deadline
+
+
+__all__ = [
+    "MemoryPressureGuard",
+    "is_memory_pressure_error",
+]
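
A deterministic walkthrough of the detector and the guard's lazy expiry, using the ``now=`` injection hook described above (constructing ``AdapterError`` from a plain message string, and ``str(exc)`` returning that message, are assumptions):

    guard = MemoryPressureGuard()

    # Detector: pure phrase match on the stringified error.
    assert is_memory_pressure_error(AdapterError("CUDA out of memory"))
    assert not is_memory_pressure_error(AdapterError("missing model"))

    # Tracker: monotonic-clock cooldown with lazy expiry.
    deadline = guard.mark_pressured("lmstudio", cooldown_s=120.0, now=1000.0)
    assert deadline == 1120.0
    assert guard.is_pressured("lmstudio", now=1060.0)       # inside the window
    assert not guard.is_pressured("lmstudio", now=1120.0)   # deadline hit → entry swept
    assert guard.pressured_until("lmstudio", now=1130.0) == 0.0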