coderouter-cli 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,308 @@
1
+ """Drift detection guard (v2.0-G, L4).
2
+
3
+ Detects gradual quality degradation in model responses during long-running
4
+ agent sessions. Unlike L5 (binary crash) and adaptive routing (latency),
5
+ drift detection targets the "succeeds but quality is decaying" pattern:
6
+ empty responses, shrinking output length, tool-call silence, anomalous
7
+ stop reasons.
8
+
9
+ Architecture
10
+ ============
11
+
12
+ Three layers:
13
+
14
+ 1. **Observation model** — :class:`ResponseObservation` captures the
15
+ quality-relevant fields from each successful provider response.
16
+ 2. **Detector** — :func:`detect_drift` is a pure function that takes
17
+ a window of observations and thresholds, returns a
18
+ :class:`DriftVerdict`.
19
+ 3. **Window manager** — :class:`DriftWindow` maintains per-provider
20
+ rolling deques of observations, thread-safe for the async engine.
21
+
22
+ The engine calls :meth:`DriftWindow.record` after each provider-ok/failed
23
+ event, then calls :func:`detect_drift` to check whether corrective action
24
+ is needed.
25
+
26
+ Signals
27
+ =======
28
+
29
+ * ``empty_response_rate`` — fraction of responses with output_tokens == 0
30
+ * ``length_collapse`` — median output_tokens in the recent half vs. the
31
+ earlier half of the window; ratio below threshold = collapse
32
+ * ``tool_silence_rate`` — fraction of responses missing tool_use blocks
33
+ (only meaningful when the request contained tools)
34
+ * ``stop_anomaly_rate`` — fraction of responses with unexpected stop_reason
35
+ (not "end_turn" / "tool_use" / "max_tokens")
36
+ * ``error_rate`` — fraction of attempts that ended in failure
37
+
38
+ Thresholds are bundled as :class:`DriftThresholds` with three presets
39
+ (``low`` / ``normal`` / ``high`` sensitivity).
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import statistics
45
+ from collections import deque
46
+ from dataclasses import dataclass, field
47
+ from typing import Literal
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Observation model
51
+ # ---------------------------------------------------------------------------
52
+
53
+
54
+ @dataclass(frozen=True, slots=True)
55
+ class ResponseObservation:
56
+ """Quality-relevant snapshot of a single provider response.
57
+
58
+ Captured post-response (after adapter translation), before returning
59
+ to the client. Fields are intentionally minimal — only what the
60
+ detector needs.
61
+ """
62
+
63
+ provider: str
64
+ output_tokens: int
65
+ has_tool_use: bool
66
+ """Whether the response contained at least one tool_use block."""
67
+ request_had_tools: bool
68
+ """Whether the request included a tools[] array (context for tool_silence)."""
69
+ stop_reason: str | None
70
+ """Anthropic stop_reason: 'end_turn' / 'tool_use' / 'max_tokens' / None."""
71
+ is_error: bool = False
72
+ """True if the attempt ended in provider-failed / provider-failed-midstream."""
73
+ stream: bool = False
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Thresholds
78
+ # ---------------------------------------------------------------------------
79
+
80
+
81
+ @dataclass(frozen=True, slots=True)
82
+ class DriftThresholds:
83
+ """Threshold set for drift detection.
84
+
85
+ Each field is the value above which (or below which for length_collapse)
86
+ the corresponding signal is considered anomalous.
87
+ """
88
+
89
+ # Rate thresholds (signal > threshold → anomaly)
90
+ empty_response_rate: float = 0.3
91
+ """Fraction of responses with output_tokens == 0 to trigger."""
92
+ stop_anomaly_rate: float = 0.4
93
+ """Fraction of responses with unexpected stop_reason."""
94
+ error_rate: float = 0.25
95
+ """Fraction of failed attempts."""
96
+ tool_silence_rate: float = 0.7
97
+ """Fraction of tool-eligible responses missing tool_use."""
98
+
99
+ # Ratio threshold (recent_median / earlier_median < threshold → collapse)
100
+ length_collapse_ratio: float = 0.5
101
+ """If recent half median is < 50% of earlier half median → collapse."""
102
+
103
+ # Minimum observations before detection fires
104
+ min_window_fill: int = 6
105
+ """Don't trigger until at least this many observations in the window."""
106
+
107
+
108
+ # Presets
109
# Preset threshold bundles, from most tolerant to most aggressive.
THRESHOLDS_LOW = DriftThresholds(
    empty_response_rate=0.5,
    stop_anomaly_rate=0.6,
    error_rate=0.4,
    tool_silence_rate=0.8,
    length_collapse_ratio=0.3,
    min_window_fill=10,
)

# All-default thresholds — the recommended starting point.
THRESHOLDS_NORMAL = DriftThresholds()

THRESHOLDS_HIGH = DriftThresholds(
    empty_response_rate=0.2,
    stop_anomaly_rate=0.3,
    error_rate=0.15,
    tool_silence_rate=0.5,
    length_collapse_ratio=0.7,
    min_window_fill=4,
)

# Lookup by configured sensitivity name ("low" / "normal" / "high").
SENSITIVITY_PRESETS: dict[str, DriftThresholds] = {
    "low": THRESHOLDS_LOW,
    "normal": THRESHOLDS_NORMAL,
    "high": THRESHOLDS_HIGH,
}
134
+
135
+
136
+ # ---------------------------------------------------------------------------
137
+ # Verdict
138
+ # ---------------------------------------------------------------------------
139
+
140
+
141
+ @dataclass(frozen=True, slots=True)
142
+ class DriftVerdict:
143
+ """Result of drift detection for a single provider window."""
144
+
145
+ drifted: bool
146
+ severity: Literal["none", "mild", "severe"]
147
+ signals: dict[str, float]
148
+ """Signal name → computed value for observability."""
149
+ reason: str
150
+ """Human-readable explanation of why drift was detected (empty if none)."""
151
+
152
+
153
+ _NO_DRIFT = DriftVerdict(drifted=False, severity="none", signals={}, reason="")
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Detector (pure function)
158
+ # ---------------------------------------------------------------------------
159
+
160
+
161
def detect_drift(
    window: list[ResponseObservation],
    thresholds: DriftThresholds | None = None,
) -> DriftVerdict:
    """Analyze a window of observations and return a drift verdict.

    Pure function — no I/O, no side effects. Safe to call from any context.

    Parameters
    ----------
    window:
        List of recent :class:`ResponseObservation` for a single provider,
        ordered oldest-first.
    thresholds:
        Detection thresholds. Defaults to ``THRESHOLDS_NORMAL``.

    Returns
    -------
    DriftVerdict with severity ``none`` / ``mild`` / ``severe``.
    """
    th = thresholds if thresholds is not None else THRESHOLDS_NORMAL

    # Too few observations: stay silent rather than guess.
    if len(window) < th.min_window_fill:
        return _NO_DRIFT

    signals: dict[str, float] = {}
    mild: list[str] = []
    severe: list[str] = []

    ok_obs = [o for o in window if not o.is_error]

    # --- Signal 1: empty responses among successful attempts (severe) ---
    if ok_obs:
        empty_rate = sum(o.output_tokens == 0 for o in ok_obs) / len(ok_obs)
        signals["empty_response_rate"] = round(empty_rate, 3)
        if empty_rate > th.empty_response_rate:
            severe.append(f"empty_response_rate={empty_rate:.2f}")

    # --- Signal 2: output length collapse, median of recent half vs. earlier (severe) ---
    lengths = [o.output_tokens for o in ok_obs]
    if len(lengths) >= 4:  # need at least two points per half for a meaningful median
        split = len(lengths) // 2
        earlier_median = statistics.median(lengths[:split])
        recent_median = statistics.median(lengths[split:])
        if earlier_median > 0:  # ratio undefined (and meaningless) otherwise
            ratio = recent_median / earlier_median
            signals["length_collapse_ratio"] = round(ratio, 3)
            if ratio < th.length_collapse_ratio:
                severe.append(
                    f"length_collapse={ratio:.2f}"
                    f" (recent_median={recent_median:.0f}, earlier={earlier_median:.0f})"
                )

    # --- Signal 3: tool silence among tool-eligible successes (mild) ---
    eligible = [o for o in ok_obs if o.request_had_tools]
    if len(eligible) >= 3:  # rate is too noisy on fewer samples
        silence_rate = sum(not o.has_tool_use for o in eligible) / len(eligible)
        signals["tool_silence_rate"] = round(silence_rate, 3)
        if silence_rate > th.tool_silence_rate:
            mild.append(f"tool_silence_rate={silence_rate:.2f}")

    # --- Signal 4: unexpected stop_reason (None counts as anomalous) (mild) ---
    expected_stops = {"end_turn", "tool_use", "max_tokens"}
    if ok_obs:
        anomaly_rate = sum(o.stop_reason not in expected_stops for o in ok_obs) / len(ok_obs)
        signals["stop_anomaly_rate"] = round(anomaly_rate, 3)
        if anomaly_rate > th.stop_anomaly_rate:
            mild.append(f"stop_anomaly_rate={anomaly_rate:.2f}")

    # --- Signal 5: outright failures across the whole window (mild) ---
    err_rate = sum(o.is_error for o in window) / len(window)
    signals["error_rate"] = round(err_rate, 3)
    if err_rate > th.error_rate:
        mild.append(f"error_rate={err_rate:.2f}")

    # --- Severity synthesis: any severe flag, or two mild flags, escalates ---
    if severe or len(mild) >= 2:
        severity: Literal["none", "mild", "severe"] = "severe"
    elif mild:
        severity = "mild"
    else:
        # Clean window — still surface the computed signals for observability.
        return DriftVerdict(drifted=False, severity="none", signals=signals, reason="")

    return DriftVerdict(
        drifted=True,
        severity=severity,
        signals=signals,
        reason=", ".join(severe + mild),
    )
264
+
265
+
266
+ # ---------------------------------------------------------------------------
267
+ # Window manager
268
+ # ---------------------------------------------------------------------------
269
+
270
+
271
@dataclass
class DriftWindow:
    """Rolling per-provider store of recent response observations.

    Needs no locking as long as all access happens on a single asyncio
    event loop; if CodeRouter ever goes multi-threaded, add a Lock.
    """

    max_size: int = 20  # observations retained per provider
    _windows: dict[str, deque[ResponseObservation]] = field(default_factory=dict)

    def record(self, obs: ResponseObservation) -> None:
        """Append *obs* to its provider's window, creating one on first use."""
        try:
            self._windows[obs.provider].append(obs)
        except KeyError:
            fresh: deque[ResponseObservation] = deque(maxlen=self.max_size)
            fresh.append(obs)
            self._windows[obs.provider] = fresh

    def get_window(self, provider: str) -> list[ResponseObservation]:
        """Return a snapshot of the provider's window, oldest-first ([] if unknown)."""
        return list(self._windows.get(provider, ()))

    def clear(self, provider: str) -> None:
        """Drop a provider's window entirely (e.g. after recovery)."""
        self._windows.pop(provider, None)

    def clear_all(self) -> None:
        """Forget every provider's window."""
        self._windows.clear()

    def __len__(self) -> int:
        """Total observations held across all providers."""
        return sum(map(len, self._windows.values()))