coderouter-cli 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/cli.py +219 -0
- coderouter/config/schemas.py +235 -2
- coderouter/guards/__init__.py +6 -4
- coderouter/guards/backend_health.py +34 -0
- coderouter/guards/continuous_probe.py +349 -0
- coderouter/guards/drift_actions.py +111 -0
- coderouter/guards/drift_detection.py +308 -0
- coderouter/guards/self_healing.py +413 -0
- coderouter/guards/tool_loop.py +71 -0
- coderouter/ingress/anthropic_routes.py +106 -12
- coderouter/ingress/app.py +129 -0
- coderouter/logging.py +370 -0
- coderouter/metrics/collector.py +168 -0
- coderouter/metrics/prometheus.py +141 -0
- coderouter/output_filters.py +95 -4
- coderouter/routing/adaptive.py +23 -0
- coderouter/routing/budget.py +35 -0
- coderouter/routing/fallback.py +496 -5
- coderouter/state/__init__.py +15 -0
- coderouter/state/audit_log.py +269 -0
- coderouter/state/replay.py +316 -0
- coderouter/state/request_log.py +178 -0
- coderouter/state/store.py +212 -0
- coderouter/translation/tool_repair.py +42 -1
- coderouter_cli-2.2.0.dist-info/METADATA +243 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/RECORD +29 -20
- coderouter_cli-2.0.0.dist-info/METADATA +0 -559
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/licenses/LICENSE +0 -0
coderouter/ingress/app.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import contextlib
|
|
5
6
|
import os
|
|
6
7
|
from collections.abc import AsyncIterator
|
|
7
8
|
from contextlib import asynccontextmanager
|
|
@@ -71,7 +72,135 @@ def create_app(config_path: str | None = None) -> FastAPI:
|
|
|
71
72
|
# chronological order. Non-fatal — the chain still works, just
|
|
72
73
|
# potentially sub-optimally for the agentic harness.
|
|
73
74
|
check_claude_code_chain_suitability(config, logger=logger)
|
|
75
|
+
|
|
76
|
+
# v2.0-K: attach persistent state store + audit/request log if configured.
|
|
77
|
+
state_store = None
|
|
78
|
+
audit_handler = None
|
|
79
|
+
request_log_handler = None
|
|
80
|
+
if config.state_dir:
|
|
81
|
+
import logging as _logging
|
|
82
|
+
from pathlib import Path
|
|
83
|
+
|
|
84
|
+
from coderouter.state.audit_log import AuditLogHandler
|
|
85
|
+
from coderouter.state.store import StateStore
|
|
86
|
+
|
|
87
|
+
state_path = Path(config.state_dir).expanduser()
|
|
88
|
+
state_store = StateStore(state_path / "coderouter.db")
|
|
89
|
+
engine.attach_state_store(state_store)
|
|
90
|
+
|
|
91
|
+
# Restore MetricsCollector state from the store.
|
|
92
|
+
from coderouter.metrics import get_collector
|
|
93
|
+
|
|
94
|
+
collector = get_collector()
|
|
95
|
+
if collector is not None:
|
|
96
|
+
metrics_state = state_store.get("metrics", "state")
|
|
97
|
+
if metrics_state is not None:
|
|
98
|
+
with contextlib.suppress(Exception):
|
|
99
|
+
collector.load_state(metrics_state) # type: ignore[arg-type]
|
|
100
|
+
|
|
101
|
+
logger.info(
|
|
102
|
+
"state-store-attached",
|
|
103
|
+
extra={"state_dir": str(state_path)},
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
if config.audit_log == "active":
|
|
107
|
+
audit_handler = AuditLogHandler(
|
|
108
|
+
state_path / "audit.jsonl",
|
|
109
|
+
max_bytes=config.audit_log_max_bytes,
|
|
110
|
+
)
|
|
111
|
+
_logging.getLogger().addHandler(audit_handler)
|
|
112
|
+
logger.info(
|
|
113
|
+
"audit-log-started",
|
|
114
|
+
extra={
|
|
115
|
+
"path": str(state_path / "audit.jsonl"),
|
|
116
|
+
"max_bytes": config.audit_log_max_bytes,
|
|
117
|
+
},
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
if config.request_log == "active":
|
|
121
|
+
from coderouter.state.request_log import RequestLogHandler
|
|
122
|
+
|
|
123
|
+
request_log_handler = RequestLogHandler(
|
|
124
|
+
state_path / "requests.jsonl",
|
|
125
|
+
max_bytes=config.request_log_max_bytes,
|
|
126
|
+
)
|
|
127
|
+
_logging.getLogger().addHandler(request_log_handler)
|
|
128
|
+
logger.info(
|
|
129
|
+
"request-log-started",
|
|
130
|
+
extra={
|
|
131
|
+
"path": str(state_path / "requests.jsonl"),
|
|
132
|
+
"max_bytes": config.request_log_max_bytes,
|
|
133
|
+
},
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# v2.0-I: launch continuous probe background task if configured.
|
|
137
|
+
probe_task = None
|
|
138
|
+
shutdown_event = None
|
|
139
|
+
if config.continuous_probe == "active":
|
|
140
|
+
import asyncio
|
|
141
|
+
|
|
142
|
+
from coderouter.guards.continuous_probe import probe_loop
|
|
143
|
+
from coderouter.routing.capability import get_default_registry
|
|
144
|
+
|
|
145
|
+
shutdown_event = asyncio.Event()
|
|
146
|
+
probe_task = asyncio.create_task(
|
|
147
|
+
probe_loop(
|
|
148
|
+
config.providers,
|
|
149
|
+
record_fn=engine.backend_health.record_attempt,
|
|
150
|
+
interval_s=config.probe_interval_s,
|
|
151
|
+
timeout_s=config.probe_timeout_s,
|
|
152
|
+
probe_paid=config.probe_paid,
|
|
153
|
+
shutdown_event=shutdown_event,
|
|
154
|
+
registry=get_default_registry(),
|
|
155
|
+
)
|
|
156
|
+
)
|
|
157
|
+
logger.info(
|
|
158
|
+
"continuous-probe-started",
|
|
159
|
+
extra={
|
|
160
|
+
"interval_s": config.probe_interval_s,
|
|
161
|
+
"probe_paid": config.probe_paid,
|
|
162
|
+
"providers": len(config.providers),
|
|
163
|
+
},
|
|
164
|
+
)
|
|
165
|
+
|
|
74
166
|
yield
|
|
167
|
+
|
|
168
|
+
# Graceful shutdown of probe task
|
|
169
|
+
if probe_task is not None and shutdown_event is not None:
|
|
170
|
+
shutdown_event.set()
|
|
171
|
+
with contextlib.suppress(Exception):
|
|
172
|
+
await probe_task
|
|
173
|
+
|
|
174
|
+
# v2.0-J: graceful shutdown of recovery probe tasks.
|
|
175
|
+
with contextlib.suppress(Exception):
|
|
176
|
+
await engine.shutdown_recovery_probes()
|
|
177
|
+
|
|
178
|
+
# v2.0-K: persist state and close audit log on shutdown.
|
|
179
|
+
if state_store is not None:
|
|
180
|
+
with contextlib.suppress(Exception):
|
|
181
|
+
engine.save_all_state()
|
|
182
|
+
# Save MetricsCollector state.
|
|
183
|
+
from coderouter.metrics import get_collector
|
|
184
|
+
|
|
185
|
+
collector = get_collector()
|
|
186
|
+
if collector is not None:
|
|
187
|
+
with contextlib.suppress(Exception):
|
|
188
|
+
state_store.put("metrics", "state", collector.save_state())
|
|
189
|
+
with contextlib.suppress(Exception):
|
|
190
|
+
state_store.close()
|
|
191
|
+
if audit_handler is not None:
|
|
192
|
+
import logging as _logging
|
|
193
|
+
|
|
194
|
+
with contextlib.suppress(Exception):
|
|
195
|
+
_logging.getLogger().removeHandler(audit_handler)
|
|
196
|
+
audit_handler.close()
|
|
197
|
+
if request_log_handler is not None:
|
|
198
|
+
import logging as _logging
|
|
199
|
+
|
|
200
|
+
with contextlib.suppress(Exception):
|
|
201
|
+
_logging.getLogger().removeHandler(request_log_handler)
|
|
202
|
+
request_log_handler.close()
|
|
203
|
+
|
|
75
204
|
logger.info("coderouter-shutdown")
|
|
76
205
|
|
|
77
206
|
app = FastAPI(
|
coderouter/logging.py
CHANGED
|
@@ -595,6 +595,114 @@ def log_demote_unhealthy_provider(
|
|
|
595
595
|
logger.info("demote-unhealthy-provider", extra=payload)
|
|
596
596
|
|
|
597
597
|
|
|
598
|
+
# ---------------------------------------------------------------------------
|
|
599
|
+
# v2.0-J: self-healing log shapes
|
|
600
|
+
# ---------------------------------------------------------------------------
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
class SelfHealingExcludePayload(TypedDict):
|
|
604
|
+
"""Structured shape of the ``self-healing-exclude`` log record."""
|
|
605
|
+
|
|
606
|
+
provider: str
|
|
607
|
+
profile: str
|
|
608
|
+
consecutive_failures: int
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
class SelfHealingRestorePayload(TypedDict):
|
|
612
|
+
"""Structured shape of the ``self-healing-restore`` log record."""
|
|
613
|
+
|
|
614
|
+
provider: str
|
|
615
|
+
profile: str
|
|
616
|
+
excluded_duration_s: float
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
class SelfHealingRestartPayload(TypedDict):
|
|
620
|
+
"""Structured shape of the ``self-healing-restart`` log record."""
|
|
621
|
+
|
|
622
|
+
provider: str
|
|
623
|
+
command: str
|
|
624
|
+
success: bool
|
|
625
|
+
error: str | None
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
class SelfHealingRecoveryProbePayload(TypedDict):
|
|
629
|
+
"""Structured shape of the ``self-healing-recovery-probe`` log record."""
|
|
630
|
+
|
|
631
|
+
provider: str
|
|
632
|
+
success: bool
|
|
633
|
+
next_interval_s: float
|
|
634
|
+
latency_ms: float
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def log_self_healing_exclude(
|
|
638
|
+
logger: logging.Logger,
|
|
639
|
+
*,
|
|
640
|
+
provider: str,
|
|
641
|
+
profile: str,
|
|
642
|
+
consecutive_failures: int,
|
|
643
|
+
) -> None:
|
|
644
|
+
"""Emit when a provider is excluded from the chain by self-healing."""
|
|
645
|
+
payload: SelfHealingExcludePayload = {
|
|
646
|
+
"provider": provider,
|
|
647
|
+
"profile": profile,
|
|
648
|
+
"consecutive_failures": consecutive_failures,
|
|
649
|
+
}
|
|
650
|
+
logger.warning("self-healing-exclude", extra=payload)
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def log_self_healing_restore(
|
|
654
|
+
logger: logging.Logger,
|
|
655
|
+
*,
|
|
656
|
+
provider: str,
|
|
657
|
+
profile: str,
|
|
658
|
+
excluded_duration_s: float,
|
|
659
|
+
) -> None:
|
|
660
|
+
"""Emit when a previously excluded provider is restored to the chain."""
|
|
661
|
+
payload: SelfHealingRestorePayload = {
|
|
662
|
+
"provider": provider,
|
|
663
|
+
"profile": profile,
|
|
664
|
+
"excluded_duration_s": round(excluded_duration_s, 1),
|
|
665
|
+
}
|
|
666
|
+
logger.info("self-healing-restore", extra=payload)
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def log_self_healing_restart(
|
|
670
|
+
logger: logging.Logger,
|
|
671
|
+
*,
|
|
672
|
+
provider: str,
|
|
673
|
+
command: str,
|
|
674
|
+
success: bool,
|
|
675
|
+
error: str | None = None,
|
|
676
|
+
) -> None:
|
|
677
|
+
"""Emit after attempting to restart a provider's backend process."""
|
|
678
|
+
payload: SelfHealingRestartPayload = {
|
|
679
|
+
"provider": provider,
|
|
680
|
+
"command": command,
|
|
681
|
+
"success": success,
|
|
682
|
+
"error": error,
|
|
683
|
+
}
|
|
684
|
+
level = logging.INFO if success else logging.WARNING
|
|
685
|
+
logger.log(level, "self-healing-restart", extra=payload)
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
def log_self_healing_recovery_probe(
|
|
689
|
+
logger: logging.Logger,
|
|
690
|
+
*,
|
|
691
|
+
provider: str,
|
|
692
|
+
success: bool,
|
|
693
|
+
next_interval_s: float,
|
|
694
|
+
latency_ms: float,
|
|
695
|
+
) -> None:
|
|
696
|
+
"""Emit after each recovery probe attempt for an excluded provider."""
|
|
697
|
+
payload: SelfHealingRecoveryProbePayload = {
|
|
698
|
+
"provider": provider,
|
|
699
|
+
"success": success,
|
|
700
|
+
"next_interval_s": round(next_interval_s, 1),
|
|
701
|
+
"latency_ms": round(latency_ms, 1),
|
|
702
|
+
}
|
|
703
|
+
logger.info("self-healing-recovery-probe", extra=payload)
|
|
704
|
+
|
|
705
|
+
|
|
598
706
|
# ---------------------------------------------------------------------------
|
|
599
707
|
# v1.0-A: output-filter-applied log shape
|
|
600
708
|
#
|
|
@@ -1101,3 +1209,265 @@ def log_context_budget_trimmed(
|
|
|
1101
1209
|
"max_context_tokens": max_context_tokens,
|
|
1102
1210
|
}
|
|
1103
1211
|
logger.info("context-budget-trimmed", extra=payload)
|
|
1212
|
+
|
|
1213
|
+
|
|
1214
|
+
# ---------------------------------------------------------------------------
|
|
1215
|
+
# v2.0-G (L4): Drift detection logging
|
|
1216
|
+
# ---------------------------------------------------------------------------
|
|
1217
|
+
|
|
1218
|
+
|
|
1219
|
+
def log_drift_detected(
|
|
1220
|
+
logger: logging.Logger,
|
|
1221
|
+
*,
|
|
1222
|
+
provider: str,
|
|
1223
|
+
profile: str,
|
|
1224
|
+
severity: str,
|
|
1225
|
+
reason: str,
|
|
1226
|
+
action: str,
|
|
1227
|
+
signals: dict[str, float],
|
|
1228
|
+
) -> None:
|
|
1229
|
+
"""Emit a ``drift-detected`` warning line.
|
|
1230
|
+
|
|
1231
|
+
Fired when the drift detector finds quality degradation in the
|
|
1232
|
+
provider's rolling response window.
|
|
1233
|
+
"""
|
|
1234
|
+
logger.warning(
|
|
1235
|
+
"drift-detected",
|
|
1236
|
+
extra={
|
|
1237
|
+
"provider": provider,
|
|
1238
|
+
"profile": profile,
|
|
1239
|
+
"severity": severity,
|
|
1240
|
+
"reason": reason,
|
|
1241
|
+
"action": action,
|
|
1242
|
+
"signals": signals,
|
|
1243
|
+
},
|
|
1244
|
+
)
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
def log_drift_promoted(
|
|
1248
|
+
logger: logging.Logger,
|
|
1249
|
+
*,
|
|
1250
|
+
provider: str,
|
|
1251
|
+
profile: str,
|
|
1252
|
+
demoted_to_rank: int,
|
|
1253
|
+
cooldown_s: int,
|
|
1254
|
+
) -> None:
|
|
1255
|
+
"""Emit a ``drift-promoted`` info line.
|
|
1256
|
+
|
|
1257
|
+
Fired when a drifted provider is demoted in the chain and a
|
|
1258
|
+
different provider takes over as primary.
|
|
1259
|
+
"""
|
|
1260
|
+
logger.info(
|
|
1261
|
+
"drift-promoted",
|
|
1262
|
+
extra={
|
|
1263
|
+
"provider": provider,
|
|
1264
|
+
"profile": profile,
|
|
1265
|
+
"demoted_to_rank": demoted_to_rank,
|
|
1266
|
+
"cooldown_s": cooldown_s,
|
|
1267
|
+
},
|
|
1268
|
+
)
|
|
1269
|
+
|
|
1270
|
+
|
|
1271
|
+
def log_drift_reload_attempted(
|
|
1272
|
+
logger: logging.Logger,
|
|
1273
|
+
*,
|
|
1274
|
+
provider: str,
|
|
1275
|
+
success: bool,
|
|
1276
|
+
) -> None:
|
|
1277
|
+
"""Emit a ``drift-reload-attempted`` info line.
|
|
1278
|
+
|
|
1279
|
+
Fired after attempting an Ollama KV cache flush (keep_alive=0).
|
|
1280
|
+
"""
|
|
1281
|
+
logger.info(
|
|
1282
|
+
"drift-reload-attempted",
|
|
1283
|
+
extra={
|
|
1284
|
+
"provider": provider,
|
|
1285
|
+
"success": success,
|
|
1286
|
+
},
|
|
1287
|
+
)
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
def log_drift_recovered(
|
|
1291
|
+
logger: logging.Logger,
|
|
1292
|
+
*,
|
|
1293
|
+
provider: str,
|
|
1294
|
+
profile: str,
|
|
1295
|
+
after_s: float,
|
|
1296
|
+
) -> None:
|
|
1297
|
+
"""Emit a ``drift-recovered`` info line.
|
|
1298
|
+
|
|
1299
|
+
Fired when a previously-drifted provider's cooldown expires and its
|
|
1300
|
+
rank is restored.
|
|
1301
|
+
"""
|
|
1302
|
+
logger.info(
|
|
1303
|
+
"drift-recovered",
|
|
1304
|
+
extra={
|
|
1305
|
+
"provider": provider,
|
|
1306
|
+
"profile": profile,
|
|
1307
|
+
"after_s": round(after_s, 1),
|
|
1308
|
+
},
|
|
1309
|
+
)
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
# ---------------------------------------------------------------------------
|
|
1313
|
+
# v2.0-H (L6): Partial stitch surfaced logging
|
|
1314
|
+
# ---------------------------------------------------------------------------
|
|
1315
|
+
|
|
1316
|
+
|
|
1317
|
+
def log_partial_stitch_surfaced(
|
|
1318
|
+
logger: logging.Logger,
|
|
1319
|
+
*,
|
|
1320
|
+
provider: str,
|
|
1321
|
+
profile: str,
|
|
1322
|
+
text_blocks: int,
|
|
1323
|
+
text_length: int,
|
|
1324
|
+
) -> None:
|
|
1325
|
+
"""Emit a ``partial-stitch-surfaced`` info line.
|
|
1326
|
+
|
|
1327
|
+
Fired when a mid-stream failure is gracefully terminated with partial
|
|
1328
|
+
content delivered to the client (partial_stitch_action=surface).
|
|
1329
|
+
"""
|
|
1330
|
+
logger.info(
|
|
1331
|
+
"partial-stitch-surfaced",
|
|
1332
|
+
extra={
|
|
1333
|
+
"provider": provider,
|
|
1334
|
+
"profile": profile,
|
|
1335
|
+
"text_blocks": text_blocks,
|
|
1336
|
+
"text_length": text_length,
|
|
1337
|
+
},
|
|
1338
|
+
)
|
|
1339
|
+
|
|
1340
|
+
|
|
1341
|
+
# ---------------------------------------------------------------------------
|
|
1342
|
+
# v2.0-I: Continuous probe log shapes
|
|
1343
|
+
#
|
|
1344
|
+
# Two event lanes mirror the backend-health triplet:
|
|
1345
|
+
# * ``probe-completed`` — info: a single provider probe finished
|
|
1346
|
+
# (success or failure). Quiet in normal
|
|
1347
|
+
# operation; operators grep for these to
|
|
1348
|
+
# diagnose individual backend issues.
|
|
1349
|
+
# * ``probe-round-completed`` — info: one full sweep across all probed
|
|
1350
|
+
# providers finished. Summary counter for
|
|
1351
|
+
# dashboards to render "probes/min" and
|
|
1352
|
+
# "failure ratio per round".
|
|
1353
|
+
# ---------------------------------------------------------------------------
|
|
1354
|
+
|
|
1355
|
+
|
|
1356
|
+
class ProbeCompletedPayload(TypedDict):
|
|
1357
|
+
"""Structured shape of the ``probe-completed`` log record.
|
|
1358
|
+
|
|
1359
|
+
Fields
|
|
1360
|
+
provider: the provider that was probed.
|
|
1361
|
+
success: whether the 1-token probe request succeeded.
|
|
1362
|
+
latency_ms: round-trip time of the probe in milliseconds.
|
|
1363
|
+
error: short error message (truncated) when success=False;
|
|
1364
|
+
None on success.
|
|
1365
|
+
model_name: model name extracted from the probe response,
|
|
1366
|
+
if available.
|
|
1367
|
+
"""
|
|
1368
|
+
|
|
1369
|
+
provider: str
|
|
1370
|
+
success: bool
|
|
1371
|
+
latency_ms: float
|
|
1372
|
+
error: str | None
|
|
1373
|
+
model_name: str | None
|
|
1374
|
+
|
|
1375
|
+
|
|
1376
|
+
class ProbeRoundCompletedPayload(TypedDict):
|
|
1377
|
+
"""Structured shape of the ``probe-round-completed`` log record.
|
|
1378
|
+
|
|
1379
|
+
Fields
|
|
1380
|
+
providers_probed: number of providers probed in this round.
|
|
1381
|
+
failures: number of probe failures in this round.
|
|
1382
|
+
"""
|
|
1383
|
+
|
|
1384
|
+
providers_probed: int
|
|
1385
|
+
failures: int
|
|
1386
|
+
|
|
1387
|
+
|
|
1388
|
+
def log_probe_completed(
|
|
1389
|
+
logger: logging.Logger,
|
|
1390
|
+
*,
|
|
1391
|
+
provider: str,
|
|
1392
|
+
success: bool,
|
|
1393
|
+
latency_ms: float,
|
|
1394
|
+
error: str | None = None,
|
|
1395
|
+
model_name: str | None = None,
|
|
1396
|
+
) -> None:
|
|
1397
|
+
"""Emit a ``probe-completed`` info line for one provider probe.
|
|
1398
|
+
|
|
1399
|
+
Single chokepoint mirroring :func:`log_capability_degraded`. Info
|
|
1400
|
+
level — individual probes are diagnostic noise at normal operation;
|
|
1401
|
+
operators filter on ``success=False`` for alerting.
|
|
1402
|
+
"""
|
|
1403
|
+
payload: ProbeCompletedPayload = {
|
|
1404
|
+
"provider": provider,
|
|
1405
|
+
"success": success,
|
|
1406
|
+
"latency_ms": round(latency_ms, 1),
|
|
1407
|
+
"error": error,
|
|
1408
|
+
"model_name": model_name,
|
|
1409
|
+
}
|
|
1410
|
+
logger.info("probe-completed", extra=payload)
|
|
1411
|
+
|
|
1412
|
+
|
|
1413
|
+
def log_probe_round_completed(
|
|
1414
|
+
logger: logging.Logger,
|
|
1415
|
+
*,
|
|
1416
|
+
providers_probed: int,
|
|
1417
|
+
failures: int,
|
|
1418
|
+
) -> None:
|
|
1419
|
+
"""Emit a ``probe-round-completed`` info line summarizing one sweep.
|
|
1420
|
+
|
|
1421
|
+
Info level — fires once per probe interval (default 60s). Dashboards
|
|
1422
|
+
render this as "probes/min" and "failure ratio" without needing to
|
|
1423
|
+
aggregate the individual ``probe-completed`` lines.
|
|
1424
|
+
"""
|
|
1425
|
+
payload: ProbeRoundCompletedPayload = {
|
|
1426
|
+
"providers_probed": providers_probed,
|
|
1427
|
+
"failures": failures,
|
|
1428
|
+
}
|
|
1429
|
+
logger.info("probe-round-completed", extra=payload)
|
|
1430
|
+
|
|
1431
|
+
|
|
1432
|
+
class ProbeCapabilitiesDriftPayload(TypedDict):
|
|
1433
|
+
"""Structured shape of the ``probe-capabilities-drift`` log record.
|
|
1434
|
+
|
|
1435
|
+
Fields
|
|
1436
|
+
provider: the provider whose probe response model mismatched.
|
|
1437
|
+
configured_model: the model string in providers.yaml.
|
|
1438
|
+
observed_model: the model name returned by the probe response.
|
|
1439
|
+
in_registry: whether the observed model has an entry in the
|
|
1440
|
+
capability registry.
|
|
1441
|
+
"""
|
|
1442
|
+
|
|
1443
|
+
provider: str
|
|
1444
|
+
configured_model: str
|
|
1445
|
+
observed_model: str
|
|
1446
|
+
in_registry: bool
|
|
1447
|
+
|
|
1448
|
+
|
|
1449
|
+
def log_probe_capabilities_drift(
|
|
1450
|
+
logger: logging.Logger,
|
|
1451
|
+
*,
|
|
1452
|
+
provider: str,
|
|
1453
|
+
configured_model: str,
|
|
1454
|
+
observed_model: str,
|
|
1455
|
+
in_registry: bool,
|
|
1456
|
+
) -> None:
|
|
1457
|
+
"""Emit a ``probe-capabilities-drift`` warning line.
|
|
1458
|
+
|
|
1459
|
+
Fired when the continuous probe detects that the model name returned
|
|
1460
|
+
by the provider differs from the configured model. This can indicate
|
|
1461
|
+
a model swap (Ollama auto-updated), a misconfiguration, or a new
|
|
1462
|
+
model that the capability registry doesn't know about yet.
|
|
1463
|
+
|
|
1464
|
+
Warn level — model drift can cause subtle behavior changes (e.g. a
|
|
1465
|
+
model that supports thinking being replaced by one that doesn't).
|
|
1466
|
+
"""
|
|
1467
|
+
payload: ProbeCapabilitiesDriftPayload = {
|
|
1468
|
+
"provider": provider,
|
|
1469
|
+
"configured_model": configured_model,
|
|
1470
|
+
"observed_model": observed_model,
|
|
1471
|
+
"in_registry": in_registry,
|
|
1472
|
+
}
|
|
1473
|
+
logger.warning("probe-capabilities-drift", extra=payload)
|