coderouter-cli 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/cli.py +219 -0
- coderouter/config/schemas.py +235 -2
- coderouter/guards/__init__.py +6 -4
- coderouter/guards/backend_health.py +34 -0
- coderouter/guards/continuous_probe.py +349 -0
- coderouter/guards/drift_actions.py +111 -0
- coderouter/guards/drift_detection.py +308 -0
- coderouter/guards/self_healing.py +413 -0
- coderouter/guards/tool_loop.py +71 -0
- coderouter/ingress/anthropic_routes.py +106 -12
- coderouter/ingress/app.py +129 -0
- coderouter/logging.py +370 -0
- coderouter/metrics/collector.py +168 -0
- coderouter/metrics/prometheus.py +141 -0
- coderouter/output_filters.py +95 -4
- coderouter/routing/adaptive.py +23 -0
- coderouter/routing/budget.py +35 -0
- coderouter/routing/fallback.py +496 -5
- coderouter/state/__init__.py +15 -0
- coderouter/state/audit_log.py +269 -0
- coderouter/state/replay.py +316 -0
- coderouter/state/request_log.py +178 -0
- coderouter/state/store.py +212 -0
- coderouter/translation/tool_repair.py +42 -1
- coderouter_cli-2.2.0.dist-info/METADATA +243 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/RECORD +29 -20
- coderouter_cli-2.0.0.dist-info/METADATA +0 -559
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""Drift detection guard (v2.0-G, L4).
|
|
2
|
+
|
|
3
|
+
Detects gradual quality degradation in model responses during long-running
|
|
4
|
+
agent sessions. Unlike L5 (binary crash) and adaptive routing (latency),
|
|
5
|
+
drift detection targets the "succeeds but quality is decaying" pattern:
|
|
6
|
+
empty responses, shrinking output length, tool-call silence, anomalous
|
|
7
|
+
stop reasons.
|
|
8
|
+
|
|
9
|
+
Architecture
|
|
10
|
+
============
|
|
11
|
+
|
|
12
|
+
Three layers:
|
|
13
|
+
|
|
14
|
+
1. **Observation model** — :class:`ResponseObservation` captures the
|
|
15
|
+
quality-relevant fields from each successful provider response.
|
|
16
|
+
2. **Detector** — :func:`detect_drift` is a pure function that takes
|
|
17
|
+
a window of observations and thresholds, returns a
|
|
18
|
+
:class:`DriftVerdict`.
|
|
19
|
+
3. **Window manager** — :class:`DriftWindow` maintains per-provider
|
|
20
|
+
rolling deques of observations; it takes no locks and is safe only while
all access happens on a single asyncio event loop.
|
|
21
|
+
|
|
22
|
+
The engine calls :meth:`DriftWindow.record` after each provider-ok/failed
|
|
23
|
+
event, then calls :func:`detect_drift` to check whether corrective action
|
|
24
|
+
is needed.
|
|
25
|
+
|
|
26
|
+
Signals
|
|
27
|
+
=======
|
|
28
|
+
|
|
29
|
+
* ``empty_response_rate`` — fraction of non-error responses with output_tokens == 0
|
|
30
|
+
* ``length_collapse`` — median output_tokens in the recent half vs. the
|
|
31
|
+
earlier half of the window; ratio below threshold = collapse
|
|
32
|
+
* ``tool_silence_rate`` — fraction of responses missing tool_use blocks
|
|
33
|
+
(only meaningful when the request contained tools)
|
|
34
|
+
* ``stop_anomaly_rate`` — fraction of non-error responses with unexpected stop_reason
|
|
35
|
+
(not "end_turn" / "tool_use" / "max_tokens")
|
|
36
|
+
* ``error_rate`` — fraction of attempts that ended in failure
|
|
37
|
+
|
|
38
|
+
Thresholds are bundled as :class:`DriftThresholds` with three presets
|
|
39
|
+
(``low`` / ``normal`` / ``high`` sensitivity).
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import statistics
|
|
45
|
+
from collections import deque
|
|
46
|
+
from dataclasses import dataclass, field
|
|
47
|
+
from typing import Literal
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# Observation model
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True, slots=True)
|
|
55
|
+
class ResponseObservation:
|
|
56
|
+
"""Quality-relevant snapshot of a single provider response.
|
|
57
|
+
|
|
58
|
+
Captured post-response (after adapter translation), before returning
|
|
59
|
+
to the client. Fields are intentionally minimal — only what the
|
|
60
|
+
detector needs.
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
provider: str
|
|
64
|
+
output_tokens: int
|
|
65
|
+
has_tool_use: bool
|
|
66
|
+
"""Whether the response contained at least one tool_use block."""
|
|
67
|
+
request_had_tools: bool
|
|
68
|
+
"""Whether the request included a tools[] array (context for tool_silence)."""
|
|
69
|
+
stop_reason: str | None
|
|
70
|
+
"""Anthropic stop_reason: 'end_turn' / 'tool_use' / 'max_tokens' / None."""
|
|
71
|
+
is_error: bool = False
|
|
72
|
+
"""True if the attempt ended in provider-failed / provider-failed-midstream."""
|
|
73
|
+
stream: bool = False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# Thresholds
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True, slots=True)
|
|
82
|
+
class DriftThresholds:
|
|
83
|
+
"""Threshold set for drift detection.
|
|
84
|
+
|
|
85
|
+
Each field is the value above which (or below which for length_collapse)
|
|
86
|
+
the corresponding signal is considered anomalous.
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
# Rate thresholds (signal > threshold → anomaly)
|
|
90
|
+
empty_response_rate: float = 0.3
|
|
91
|
+
"""Fraction of responses with output_tokens == 0 to trigger."""
|
|
92
|
+
stop_anomaly_rate: float = 0.4
|
|
93
|
+
"""Fraction of responses with unexpected stop_reason."""
|
|
94
|
+
error_rate: float = 0.25
|
|
95
|
+
"""Fraction of failed attempts."""
|
|
96
|
+
tool_silence_rate: float = 0.7
|
|
97
|
+
"""Fraction of tool-eligible responses missing tool_use."""
|
|
98
|
+
|
|
99
|
+
# Ratio threshold (recent_median / earlier_median < threshold → collapse)
|
|
100
|
+
length_collapse_ratio: float = 0.5
|
|
101
|
+
"""If recent half median is < 50% of earlier half median → collapse."""
|
|
102
|
+
|
|
103
|
+
# Minimum observations before detection fires
|
|
104
|
+
min_window_fill: int = 6
|
|
105
|
+
"""Don't trigger until at least this many observations in the window."""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Sensitivity presets.
#
# "Low" sensitivity tolerates more before flagging: higher rate bounds, a
# lower collapse ratio and a longer warm-up. "High" sensitivity is the
# opposite. "Normal" is simply the dataclass defaults.
THRESHOLDS_LOW = DriftThresholds(
    empty_response_rate=0.5,
    stop_anomaly_rate=0.6,
    error_rate=0.4,
    tool_silence_rate=0.8,
    length_collapse_ratio=0.3,
    min_window_fill=10,
)

THRESHOLDS_NORMAL = DriftThresholds()

THRESHOLDS_HIGH = DriftThresholds(
    empty_response_rate=0.2,
    stop_anomaly_rate=0.3,
    error_rate=0.15,
    tool_silence_rate=0.5,
    length_collapse_ratio=0.7,
    min_window_fill=4,
)

# Lookup from sensitivity name to its preset.
SENSITIVITY_PRESETS: dict[str, DriftThresholds] = dict(
    low=THRESHOLDS_LOW,
    normal=THRESHOLDS_NORMAL,
    high=THRESHOLDS_HIGH,
)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
# Verdict
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass(frozen=True, slots=True)
|
|
142
|
+
class DriftVerdict:
|
|
143
|
+
"""Result of drift detection for a single provider window."""
|
|
144
|
+
|
|
145
|
+
drifted: bool
|
|
146
|
+
severity: Literal["none", "mild", "severe"]
|
|
147
|
+
signals: dict[str, float]
|
|
148
|
+
"""Signal name → computed value for observability."""
|
|
149
|
+
reason: str
|
|
150
|
+
"""Human-readable explanation of why drift was detected (empty if none)."""
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Shared "nothing to report" verdict, handed out when detection bails out
# before computing any signal. The instance is reused, so callers should not
# mutate its ``signals`` dict.
_NO_DRIFT = DriftVerdict(
    drifted=False,
    severity="none",
    signals={},
    reason="",
)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
# Detector (pure function)
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# stop_reason values that are considered normal; anything else (including
# ``None``) counts towards ``stop_anomaly_rate``.
_EXPECTED_STOP_REASONS = frozenset({"end_turn", "tool_use", "max_tokens"})


def detect_drift(
    window: list[ResponseObservation],
    thresholds: DriftThresholds | None = None,
) -> DriftVerdict:
    """Analyze a window of observations and return a drift verdict.

    Pure function — no I/O, no side effects. Safe to call from any context.

    Five signals are computed. All except ``error_rate`` look only at
    non-error observations:

    * ``empty_response_rate`` (severe) — share of non-error responses with
      ``output_tokens == 0``.
    * ``length_collapse_ratio`` (severe) — median ``output_tokens`` of the
      recent half divided by that of the earlier half; needs at least four
      non-error observations and a non-zero earlier median.
    * ``tool_silence_rate`` (mild) — share of non-error responses to
      tool-carrying requests that contain no ``tool_use`` block; needs at
      least three such responses.
    * ``stop_anomaly_rate`` (mild) — share of non-error responses whose
      ``stop_reason`` is not ``end_turn`` / ``tool_use`` / ``max_tokens``.
    * ``error_rate`` (mild) — share of all observations flagged ``is_error``.

    Severity is ``severe`` when any severe signal fires or when two or more
    mild signals fire together, ``mild`` when exactly one mild signal fires,
    and ``none`` otherwise.

    Parameters
    ----------
    window:
        List of recent :class:`ResponseObservation` for a single provider,
        ordered oldest-first.
    thresholds:
        Detection thresholds. Defaults to ``THRESHOLDS_NORMAL``.

    Returns
    -------
    DriftVerdict with severity ``none`` / ``mild`` / ``severe``. When the
    window is empty or holds fewer than ``thresholds.min_window_fill``
    observations, the shared no-drift verdict (with empty ``signals``) is
    returned.
    """
    if thresholds is None:
        thresholds = THRESHOLDS_NORMAL

    total = len(window)
    # The explicit empty check matters when ``min_window_fill`` is 0 or
    # negative: without it an empty window would reach the error-rate
    # division below and raise ZeroDivisionError.
    if total == 0 or total < thresholds.min_window_fill:
        return _NO_DRIFT

    signals: dict[str, float] = {}
    mild_flags: list[str] = []
    severe_flags: list[str] = []

    # Filter once; every quality signal below is measured over successes only.
    succeeded = [obs for obs in window if not obs.is_error]

    # --- Signal 1: Empty response rate ---
    if succeeded:
        empty_count = sum(1 for obs in succeeded if obs.output_tokens == 0)
        empty_rate = empty_count / len(succeeded)
        signals["empty_response_rate"] = round(empty_rate, 3)
        if empty_rate > thresholds.empty_response_rate:
            severe_flags.append(f"empty_response_rate={empty_rate:.2f}")

    # --- Signal 2: Length collapse (median comparison) ---
    if len(succeeded) >= 4:
        lengths = [obs.output_tokens for obs in succeeded]
        mid = len(lengths) // 2
        earlier_median = statistics.median(lengths[:mid])
        recent_median = statistics.median(lengths[mid:])
        # A zero baseline gives no meaningful ratio; skip the signal.
        if earlier_median > 0:
            collapse_ratio = recent_median / earlier_median
            signals["length_collapse_ratio"] = round(collapse_ratio, 3)
            if collapse_ratio < thresholds.length_collapse_ratio:
                severe_flags.append(
                    f"length_collapse={collapse_ratio:.2f}"
                    f" (recent_median={recent_median:.0f}, earlier={earlier_median:.0f})"
                )

    # --- Signal 3: Tool silence rate ---
    tool_eligible = [obs for obs in succeeded if obs.request_had_tools]
    if len(tool_eligible) >= 3:
        tool_silent_count = sum(1 for obs in tool_eligible if not obs.has_tool_use)
        tool_silence_rate = tool_silent_count / len(tool_eligible)
        signals["tool_silence_rate"] = round(tool_silence_rate, 3)
        if tool_silence_rate > thresholds.tool_silence_rate:
            mild_flags.append(f"tool_silence_rate={tool_silence_rate:.2f}")

    # --- Signal 4: Stop reason anomaly rate ---
    if succeeded:
        anomaly_count = sum(
            1 for obs in succeeded if obs.stop_reason not in _EXPECTED_STOP_REASONS
        )
        stop_anomaly_rate = anomaly_count / len(succeeded)
        signals["stop_anomaly_rate"] = round(stop_anomaly_rate, 3)
        if stop_anomaly_rate > thresholds.stop_anomaly_rate:
            mild_flags.append(f"stop_anomaly_rate={stop_anomaly_rate:.2f}")

    # --- Signal 5: Error rate ---
    error_rate = (total - len(succeeded)) / total
    signals["error_rate"] = round(error_rate, 3)
    if error_rate > thresholds.error_rate:
        mild_flags.append(f"error_rate={error_rate:.2f}")

    # --- Severity synthesis ---
    if not severe_flags and not mild_flags:
        return DriftVerdict(drifted=False, severity="none", signals=signals, reason="")

    severity: Literal["none", "mild", "severe"]
    if severe_flags or len(mild_flags) >= 2:
        # Two mild signals firing together are escalated to severe.
        severity = "severe"
    else:
        severity = "mild"

    return DriftVerdict(
        drifted=True,
        severity=severity,
        signals=signals,
        reason=", ".join(severe_flags + mild_flags),
    )
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# ---------------------------------------------------------------------------
|
|
267
|
+
# Window manager
|
|
268
|
+
# ---------------------------------------------------------------------------
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@dataclass
class DriftWindow:
    """Bounded, per-provider history of response observations.

    Every provider name maps to its own ``deque`` capped at ``max_size``
    entries, so once the cap is reached the oldest observation is dropped
    as a new one arrives.

    No locking is done here. That is sufficient while every access happens
    on the same asyncio event loop; if CodeRouter ever goes multi-threaded,
    add a Lock.
    """

    max_size: int = 20
    _windows: dict[str, deque[ResponseObservation]] = field(default_factory=dict)

    def record(self, obs: ResponseObservation) -> None:
        """Store ``obs`` in its provider's window, creating it on first use."""
        key = obs.provider
        try:
            history = self._windows[key]
        except KeyError:
            # First observation for this provider: start a capped deque.
            history = self._windows[key] = deque(maxlen=self.max_size)
        history.append(obs)

    def get_window(self, provider: str) -> list[ResponseObservation]:
        """Return a copy of the provider's observations, oldest-first.

        An unknown provider yields an empty list. The returned list is
        detached from the internal deque.
        """
        return list(self._windows.get(provider, ()))

    def clear(self, provider: str) -> None:
        """Forget one provider's window (e.g. after recovery); unknown names are ignored."""
        self._windows.pop(provider, None)

    def clear_all(self) -> None:
        """Forget every provider's window."""
        self._windows.clear()

    def __len__(self) -> int:
        """Return the number of stored observations summed over all providers."""
        return sum(map(len, self._windows.values()))
|