coderouter-cli 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coderouter/ingress/app.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import contextlib
5
6
  import os
6
7
  from collections.abc import AsyncIterator
7
8
  from contextlib import asynccontextmanager
@@ -71,7 +72,135 @@ def create_app(config_path: str | None = None) -> FastAPI:
71
72
  # chronological order. Non-fatal — the chain still works, just
72
73
  # potentially sub-optimally for the agentic harness.
73
74
  check_claude_code_chain_suitability(config, logger=logger)
75
+
76
+ # v2.0-K: attach persistent state store + audit/request log if configured.
77
+ state_store = None
78
+ audit_handler = None
79
+ request_log_handler = None
80
+ if config.state_dir:
81
+ import logging as _logging
82
+ from pathlib import Path
83
+
84
+ from coderouter.state.audit_log import AuditLogHandler
85
+ from coderouter.state.store import StateStore
86
+
87
+ state_path = Path(config.state_dir).expanduser()
88
+ state_store = StateStore(state_path / "coderouter.db")
89
+ engine.attach_state_store(state_store)
90
+
91
+ # Restore MetricsCollector state from the store.
92
+ from coderouter.metrics import get_collector
93
+
94
+ collector = get_collector()
95
+ if collector is not None:
96
+ metrics_state = state_store.get("metrics", "state")
97
+ if metrics_state is not None:
98
+ with contextlib.suppress(Exception):
99
+ collector.load_state(metrics_state) # type: ignore[arg-type]
100
+
101
+ logger.info(
102
+ "state-store-attached",
103
+ extra={"state_dir": str(state_path)},
104
+ )
105
+
106
+ if config.audit_log == "active":
107
+ audit_handler = AuditLogHandler(
108
+ state_path / "audit.jsonl",
109
+ max_bytes=config.audit_log_max_bytes,
110
+ )
111
+ _logging.getLogger().addHandler(audit_handler)
112
+ logger.info(
113
+ "audit-log-started",
114
+ extra={
115
+ "path": str(state_path / "audit.jsonl"),
116
+ "max_bytes": config.audit_log_max_bytes,
117
+ },
118
+ )
119
+
120
+ if config.request_log == "active":
121
+ from coderouter.state.request_log import RequestLogHandler
122
+
123
+ request_log_handler = RequestLogHandler(
124
+ state_path / "requests.jsonl",
125
+ max_bytes=config.request_log_max_bytes,
126
+ )
127
+ _logging.getLogger().addHandler(request_log_handler)
128
+ logger.info(
129
+ "request-log-started",
130
+ extra={
131
+ "path": str(state_path / "requests.jsonl"),
132
+ "max_bytes": config.request_log_max_bytes,
133
+ },
134
+ )
135
+
136
+ # v2.0-I: launch continuous probe background task if configured.
137
+ probe_task = None
138
+ shutdown_event = None
139
+ if config.continuous_probe == "active":
140
+ import asyncio
141
+
142
+ from coderouter.guards.continuous_probe import probe_loop
143
+ from coderouter.routing.capability import get_default_registry
144
+
145
+ shutdown_event = asyncio.Event()
146
+ probe_task = asyncio.create_task(
147
+ probe_loop(
148
+ config.providers,
149
+ record_fn=engine.backend_health.record_attempt,
150
+ interval_s=config.probe_interval_s,
151
+ timeout_s=config.probe_timeout_s,
152
+ probe_paid=config.probe_paid,
153
+ shutdown_event=shutdown_event,
154
+ registry=get_default_registry(),
155
+ )
156
+ )
157
+ logger.info(
158
+ "continuous-probe-started",
159
+ extra={
160
+ "interval_s": config.probe_interval_s,
161
+ "probe_paid": config.probe_paid,
162
+ "providers": len(config.providers),
163
+ },
164
+ )
165
+
74
166
  yield
167
+
168
+ # Graceful shutdown of probe task
169
+ if probe_task is not None and shutdown_event is not None:
170
+ shutdown_event.set()
171
+ with contextlib.suppress(Exception):
172
+ await probe_task
173
+
174
+ # v2.0-J: graceful shutdown of recovery probe tasks.
175
+ with contextlib.suppress(Exception):
176
+ await engine.shutdown_recovery_probes()
177
+
178
+ # v2.0-K: persist state and close audit log on shutdown.
179
+ if state_store is not None:
180
+ with contextlib.suppress(Exception):
181
+ engine.save_all_state()
182
+ # Save MetricsCollector state.
183
+ from coderouter.metrics import get_collector
184
+
185
+ collector = get_collector()
186
+ if collector is not None:
187
+ with contextlib.suppress(Exception):
188
+ state_store.put("metrics", "state", collector.save_state())
189
+ with contextlib.suppress(Exception):
190
+ state_store.close()
191
+ if audit_handler is not None:
192
+ import logging as _logging
193
+
194
+ with contextlib.suppress(Exception):
195
+ _logging.getLogger().removeHandler(audit_handler)
196
+ audit_handler.close()
197
+ if request_log_handler is not None:
198
+ import logging as _logging
199
+
200
+ with contextlib.suppress(Exception):
201
+ _logging.getLogger().removeHandler(request_log_handler)
202
+ request_log_handler.close()
203
+
75
204
  logger.info("coderouter-shutdown")
76
205
 
77
206
  app = FastAPI(
coderouter/logging.py CHANGED
@@ -595,6 +595,114 @@ def log_demote_unhealthy_provider(
595
595
  logger.info("demote-unhealthy-provider", extra=payload)
596
596
 
597
597
 
598
+ # ---------------------------------------------------------------------------
599
+ # v2.0-J: self-healing log shapes
600
+ # ---------------------------------------------------------------------------
601
+
602
+
603
class SelfHealingExcludePayload(TypedDict):
    """Fields carried by a ``self-healing-exclude`` log record."""

    # Provider that was excluded.
    provider: str
    # Profile supplied by the caller alongside the provider.
    profile: str
    # Failure count reported by the caller at exclusion time.
    consecutive_failures: int
609
+
610
+
611
class SelfHealingRestorePayload(TypedDict):
    """Fields carried by a ``self-healing-restore`` log record."""

    # Provider that was restored.
    provider: str
    # Profile supplied by the caller alongside the provider.
    profile: str
    # How long the provider stayed excluded, in seconds.
    excluded_duration_s: float
617
+
618
+
619
+ class SelfHealingRestartPayload(TypedDict):
620
+ """Structured shape of the ``self-healing-restart`` log record."""
621
+
622
+ provider: str
623
+ command: str
624
+ success: bool
625
+ error: str | None
626
+
627
+
628
class SelfHealingRecoveryProbePayload(TypedDict):
    """Fields carried by a ``self-healing-recovery-probe`` log record."""

    # Provider that was probed.
    provider: str
    # Outcome flag of the probe attempt.
    success: bool
    # Delay before the next probe, in seconds.
    next_interval_s: float
    # Probe latency, in milliseconds.
    latency_ms: float
635
+
636
+
637
def log_self_healing_exclude(
    logger: logging.Logger,
    *,
    provider: str,
    profile: str,
    consecutive_failures: int,
) -> None:
    """Log a ``self-healing-exclude`` warning for an excluded provider.

    Args:
        logger: Logger the record is emitted on.
        provider: Provider that self-healing excluded from the chain.
        profile: Profile passed through to the record.
        consecutive_failures: Failure count passed through to the record.
    """
    fields: SelfHealingExcludePayload = {
        "provider": provider,
        "profile": profile,
        "consecutive_failures": consecutive_failures,
    }
    # The structured fields travel as LogRecord attributes via ``extra``.
    logger.warning("self-healing-exclude", extra=fields)
651
+
652
+
653
def log_self_healing_restore(
    logger: logging.Logger,
    *,
    provider: str,
    profile: str,
    excluded_duration_s: float,
) -> None:
    """Log a ``self-healing-restore`` info line for a restored provider.

    Args:
        logger: Logger the record is emitted on.
        provider: Previously excluded provider that is back in the chain.
        profile: Profile passed through to the record.
        excluded_duration_s: Seconds the provider was excluded; logged
            rounded to one decimal place.
    """
    rounded_duration = round(excluded_duration_s, 1)
    fields: SelfHealingRestorePayload = {
        "provider": provider,
        "profile": profile,
        "excluded_duration_s": rounded_duration,
    }
    logger.info("self-healing-restore", extra=fields)
667
+
668
+
669
+ def log_self_healing_restart(
670
+ logger: logging.Logger,
671
+ *,
672
+ provider: str,
673
+ command: str,
674
+ success: bool,
675
+ error: str | None = None,
676
+ ) -> None:
677
+ """Emit after attempting to restart a provider's backend process."""
678
+ payload: SelfHealingRestartPayload = {
679
+ "provider": provider,
680
+ "command": command,
681
+ "success": success,
682
+ "error": error,
683
+ }
684
+ level = logging.INFO if success else logging.WARNING
685
+ logger.log(level, "self-healing-restart", extra=payload)
686
+
687
+
688
def log_self_healing_recovery_probe(
    logger: logging.Logger,
    *,
    provider: str,
    success: bool,
    next_interval_s: float,
    latency_ms: float,
) -> None:
    """Log a ``self-healing-recovery-probe`` info line for one probe attempt.

    Called after each recovery probe of an excluded provider.

    Args:
        logger: Logger the record is emitted on.
        provider: Excluded provider that was probed.
        success: Outcome of the probe, passed through to the record.
        next_interval_s: Seconds until the next probe; logged rounded to
            one decimal place.
        latency_ms: Probe latency in milliseconds; logged rounded to one
            decimal place.
    """
    rounded_interval = round(next_interval_s, 1)
    rounded_latency = round(latency_ms, 1)
    fields: SelfHealingRecoveryProbePayload = {
        "provider": provider,
        "success": success,
        "next_interval_s": rounded_interval,
        "latency_ms": rounded_latency,
    }
    logger.info("self-healing-recovery-probe", extra=fields)
704
+
705
+
598
706
  # ---------------------------------------------------------------------------
599
707
  # v1.0-A: output-filter-applied log shape
600
708
  #
@@ -1101,3 +1209,265 @@ def log_context_budget_trimmed(
1101
1209
  "max_context_tokens": max_context_tokens,
1102
1210
  }
1103
1211
  logger.info("context-budget-trimmed", extra=payload)
1212
+
1213
+
1214
+ # ---------------------------------------------------------------------------
1215
+ # v2.0-G (L4): Drift detection logging
1216
+ # ---------------------------------------------------------------------------
1217
+
1218
+
1219
def log_drift_detected(
    logger: logging.Logger,
    *,
    provider: str,
    profile: str,
    severity: str,
    reason: str,
    action: str,
    signals: dict[str, float],
) -> None:
    """Emit a ``drift-detected`` warning line.

    Fired when the drift detector finds quality degradation in the
    provider's rolling response window.

    Args:
        logger: Logger the record is emitted on.
        provider: Provider the drift was detected on.
        profile: Profile passed through to the record.
        severity: Severity label passed through to the record.
        reason: Reason text passed through to the record.
        action: Action label passed through to the record.
        signals: Mapping of signal name to value. A shallow copy is
            attached to the record, so the caller may keep mutating its
            own dict after this call returns.
    """
    logger.warning(
        "drift-detected",
        extra={
            "provider": provider,
            "profile": profile,
            "severity": severity,
            "reason": reason,
            "action": action,
            # Snapshot rather than alias: the LogRecord can outlive this
            # call (e.g. queued or buffered handlers), and a record that
            # shares the caller's dict would report whatever the dict
            # holds at emit time instead of at detection time.
            "signals": dict(signals),
        },
    )
1245
+
1246
+
1247
def log_drift_promoted(
    logger: logging.Logger,
    *,
    provider: str,
    profile: str,
    demoted_to_rank: int,
    cooldown_s: int,
) -> None:
    """Log a ``drift-promoted`` info line.

    Emitted when a drifted provider is demoted in the chain and another
    provider takes over as primary.

    Args:
        logger: Logger the record is emitted on.
        provider: Provider named in the record.
        profile: Profile passed through to the record.
        demoted_to_rank: Rank value passed through to the record.
        cooldown_s: Cooldown in seconds, passed through to the record.
    """
    fields = {
        "provider": provider,
        "profile": profile,
        "demoted_to_rank": demoted_to_rank,
        "cooldown_s": cooldown_s,
    }
    logger.info("drift-promoted", extra=fields)
1269
+
1270
+
1271
def log_drift_reload_attempted(
    logger: logging.Logger,
    *,
    provider: str,
    success: bool,
) -> None:
    """Log a ``drift-reload-attempted`` info line.

    Emitted after an Ollama KV cache flush (keep_alive=0) was attempted.
    The record is written at INFO regardless of the outcome.

    Args:
        logger: Logger the record is emitted on.
        provider: Provider the reload was attempted on.
        success: Outcome flag, passed through to the record.
    """
    fields = {"provider": provider, "success": success}
    logger.info("drift-reload-attempted", extra=fields)
1288
+
1289
+
1290
def log_drift_recovered(
    logger: logging.Logger,
    *,
    provider: str,
    profile: str,
    after_s: float,
) -> None:
    """Log a ``drift-recovered`` info line.

    Emitted when the cooldown of a previously-drifted provider expires
    and its rank is restored.

    Args:
        logger: Logger the record is emitted on.
        provider: Provider named in the record.
        profile: Profile passed through to the record.
        after_s: Elapsed seconds; logged rounded to one decimal place.
    """
    elapsed = round(after_s, 1)
    fields = {
        "provider": provider,
        "profile": profile,
        "after_s": elapsed,
    }
    logger.info("drift-recovered", extra=fields)
1310
+
1311
+
1312
+ # ---------------------------------------------------------------------------
1313
+ # v2.0-H (L6): Partial stitch surfaced logging
1314
+ # ---------------------------------------------------------------------------
1315
+
1316
+
1317
def log_partial_stitch_surfaced(
    logger: logging.Logger,
    *,
    provider: str,
    profile: str,
    text_blocks: int,
    text_length: int,
) -> None:
    """Log a ``partial-stitch-surfaced`` info line.

    Emitted when a mid-stream failure is ended gracefully and the partial
    content is delivered to the client (partial_stitch_action=surface).

    Args:
        logger: Logger the record is emitted on.
        provider: Provider named in the record.
        profile: Profile passed through to the record.
        text_blocks: Text block count, passed through to the record.
        text_length: Text length, passed through to the record.
    """
    fields = {
        "provider": provider,
        "profile": profile,
        "text_blocks": text_blocks,
        "text_length": text_length,
    }
    logger.info("partial-stitch-surfaced", extra=fields)
1339
+
1340
+
1341
+ # ---------------------------------------------------------------------------
1342
+ # v2.0-I: Continuous probe log shapes
1343
+ #
1344
+ # Two event lanes mirror the backend-health triplet:
1345
+ # * ``probe-completed`` — info: a single provider probe finished
1346
+ # (success or failure). Quiet in normal
1347
+ # operation; operators grep for these to
1348
+ # diagnose individual backend issues.
1349
+ # * ``probe-round-completed`` — info: one full sweep across all probed
1350
+ # providers finished. Summary counter for
1351
+ # dashboards to render "probes/min" and
1352
+ # "failure ratio per round".
1353
+ # ---------------------------------------------------------------------------
1354
+
1355
+
1356
+ class ProbeCompletedPayload(TypedDict):
1357
+ """Structured shape of the ``probe-completed`` log record.
1358
+
1359
+ Fields
1360
+ provider: the provider that was probed.
1361
+ success: whether the 1-token probe request succeeded.
1362
+ latency_ms: round-trip time of the probe in milliseconds.
1363
+ error: short error message (truncated) when success=False;
1364
+ None on success.
1365
+ model_name: model name extracted from the probe response,
1366
+ if available.
1367
+ """
1368
+
1369
+ provider: str
1370
+ success: bool
1371
+ latency_ms: float
1372
+ error: str | None
1373
+ model_name: str | None
1374
+
1375
+
1376
class ProbeRoundCompletedPayload(TypedDict):
    """Fields carried by a ``probe-round-completed`` log record."""

    # How many providers were probed in the round.
    providers_probed: int
    # How many of those probes failed.
    failures: int
1386
+
1387
+
1388
+ def log_probe_completed(
1389
+ logger: logging.Logger,
1390
+ *,
1391
+ provider: str,
1392
+ success: bool,
1393
+ latency_ms: float,
1394
+ error: str | None = None,
1395
+ model_name: str | None = None,
1396
+ ) -> None:
1397
+ """Emit a ``probe-completed`` info line for one provider probe.
1398
+
1399
+ Single chokepoint mirroring :func:`log_capability_degraded`. Info
1400
+ level — individual probes are diagnostic noise at normal operation;
1401
+ operators filter on ``success=False`` for alerting.
1402
+ """
1403
+ payload: ProbeCompletedPayload = {
1404
+ "provider": provider,
1405
+ "success": success,
1406
+ "latency_ms": round(latency_ms, 1),
1407
+ "error": error,
1408
+ "model_name": model_name,
1409
+ }
1410
+ logger.info("probe-completed", extra=payload)
1411
+
1412
+
1413
def log_probe_round_completed(
    logger: logging.Logger,
    *,
    providers_probed: int,
    failures: int,
) -> None:
    """Log a ``probe-round-completed`` info line summarizing one sweep.

    Emitted at INFO once per probe interval (default 60s), so dashboards
    can render "probes/min" and "failure ratio" without aggregating the
    individual ``probe-completed`` lines.

    Args:
        logger: Logger the record is emitted on.
        providers_probed: Number of providers probed in the round.
        failures: Number of failed probes in the round.
    """
    fields: ProbeRoundCompletedPayload = {
        "providers_probed": providers_probed,
        "failures": failures,
    }
    logger.info("probe-round-completed", extra=fields)
1430
+
1431
+
1432
class ProbeCapabilitiesDriftPayload(TypedDict):
    """Fields carried by a ``probe-capabilities-drift`` log record."""

    # Provider whose probe response reported a mismatching model.
    provider: str
    # Model string configured in providers.yaml.
    configured_model: str
    # Model name returned by the probe response.
    observed_model: str
    # Whether the observed model has a capability-registry entry.
    in_registry: bool
1447
+
1448
+
1449
def log_probe_capabilities_drift(
    logger: logging.Logger,
    *,
    provider: str,
    configured_model: str,
    observed_model: str,
    in_registry: bool,
) -> None:
    """Log a ``probe-capabilities-drift`` warning line.

    Emitted when the continuous probe sees a model name in the provider's
    response that differs from the configured one. Possible causes are a
    model swap (Ollama auto-updated), a misconfiguration, or a model the
    capability registry does not know about yet.

    WARNING level is used because model drift can change behavior in
    subtle ways, e.g. a thinking-capable model replaced by one without
    that capability.

    Args:
        logger: Logger the record is emitted on.
        provider: Provider whose response mismatched.
        configured_model: Configured model string, passed through.
        observed_model: Model name from the probe response, passed through.
        in_registry: Registry-membership flag, passed through.
    """
    fields: ProbeCapabilitiesDriftPayload = {
        "provider": provider,
        "configured_model": configured_model,
        "observed_model": observed_model,
        "in_registry": in_registry,
    }
    logger.warning("probe-capabilities-drift", extra=fields)