coderouter-cli 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. coderouter/__init__.py +17 -0
  2. coderouter/__main__.py +6 -0
  3. coderouter/adapters/__init__.py +23 -0
  4. coderouter/adapters/anthropic_native.py +502 -0
  5. coderouter/adapters/base.py +220 -0
  6. coderouter/adapters/openai_compat.py +395 -0
  7. coderouter/adapters/registry.py +17 -0
  8. coderouter/cli.py +345 -0
  9. coderouter/cli_stats.py +751 -0
  10. coderouter/config/__init__.py +10 -0
  11. coderouter/config/capability_registry.py +339 -0
  12. coderouter/config/env_file.py +295 -0
  13. coderouter/config/loader.py +73 -0
  14. coderouter/config/schemas.py +515 -0
  15. coderouter/data/__init__.py +7 -0
  16. coderouter/data/model-capabilities.yaml +86 -0
  17. coderouter/doctor.py +1596 -0
  18. coderouter/env_security.py +434 -0
  19. coderouter/errors.py +29 -0
  20. coderouter/ingress/__init__.py +5 -0
  21. coderouter/ingress/anthropic_routes.py +205 -0
  22. coderouter/ingress/app.py +144 -0
  23. coderouter/ingress/dashboard_routes.py +493 -0
  24. coderouter/ingress/metrics_routes.py +92 -0
  25. coderouter/ingress/openai_routes.py +153 -0
  26. coderouter/logging.py +315 -0
  27. coderouter/metrics/__init__.py +39 -0
  28. coderouter/metrics/collector.py +471 -0
  29. coderouter/metrics/prometheus.py +221 -0
  30. coderouter/output_filters.py +407 -0
  31. coderouter/routing/__init__.py +13 -0
  32. coderouter/routing/auto_router.py +244 -0
  33. coderouter/routing/capability.py +285 -0
  34. coderouter/routing/fallback.py +611 -0
  35. coderouter/translation/__init__.py +57 -0
  36. coderouter/translation/anthropic.py +204 -0
  37. coderouter/translation/convert.py +1291 -0
  38. coderouter/translation/tool_repair.py +236 -0
  39. coderouter_cli-1.7.0.dist-info/METADATA +509 -0
  40. coderouter_cli-1.7.0.dist-info/RECORD +43 -0
  41. coderouter_cli-1.7.0.dist-info/WHEEL +4 -0
  42. coderouter_cli-1.7.0.dist-info/entry_points.txt +2 -0
  43. coderouter_cli-1.7.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,471 @@
1
+ """In-memory metrics collector — a ``logging.Handler`` that taps structured logs.
2
+
3
+ Design (plan.md §12.3):
4
+ - v1.5-A ships the collector + ``/metrics.json`` endpoint only.
5
+ Prometheus exposition, JSONL persistence, CLI TUI, and HTML dashboard
6
+ land in v1.5-B/C/D.
7
+ - Collection is a **pure log tap**: we install ourselves on the root
8
+ logger via :func:`install_collector` and inspect each ``LogRecord``.
9
+ Adapter / routing code stays untouched — v0.5's ``capability-
10
+ degraded`` gate, v0.6's ``chain-paid-gate-blocked`` warn, v0.7's
11
+ ``output-filter-applied`` info line are all already structured with
12
+ typed extras, so the collector just dispatches on ``record.msg``.
13
+ - Storage is in-memory only: counters (``collections.Counter``),
14
+ per-provider last-error snapshots, and a fixed-size ``deque`` of
15
+ recent events. Re-start clears the state — JSONL persistence lands
16
+ in v1.5-B (``CODEROUTER_EVENTS_PATH``).
17
+
18
+ Thread safety
19
+ ``logging.Handler.emit`` can be invoked from any thread (Python's
20
+ logging module acquires ``handler.lock`` itself). We additionally
21
+ guard the mutable state with an ``RLock`` so ``snapshot()`` — which
22
+ the FastAPI event loop calls — sees a consistent view. The lock is
23
+ held only for the small mutation sites.
24
+
25
+ Event inventory (dispatch table in :meth:`MetricsCollector._dispatch`)
26
+ ``try-provider`` → ``requests_total`` + ``provider_attempts``
27
+ ``provider-ok`` → ``provider_outcomes[provider]["ok"]``
28
+ ``provider-failed`` → ``provider_outcomes[provider]["failed"]``
29
+ + last_error[provider]
30
+ ``provider-failed-midstream``→ ``provider_outcomes[...]["failed_midstream"]``
31
+ ``skip-paid-provider`` → ``provider_skipped_paid``
32
+ ``skip-unknown-provider`` → ``provider_skipped_unknown``
33
+ ``capability-degraded`` → ``capability_degraded[capability]``
34
+ ``output-filter-applied`` → ``output_filter_applied[filter]``
35
+ ``chain-paid-gate-blocked`` → ``chain_paid_gate_blocked_total``
36
+ ``chain-uniform-auth-failure``→ ``chain_uniform_auth_failure_total``
37
+ ``auto-router-fallthrough`` → ``auto_router_fallthrough_total``
38
+ ``coderouter-startup`` → ``startup_info`` (stored for the UI header)
39
+
40
+ Unrecognized events are ignored (forward-compat: adding a new log
41
+ event never breaks the collector).
42
+ """
43
+
44
+ from __future__ import annotations
45
+
46
+ import contextlib
47
+ import logging
48
+ import os
49
+ import threading
50
+ import time
51
+ from collections import Counter, deque
52
+ from datetime import UTC, datetime
53
+ from pathlib import Path
54
+ from typing import Any, Final
55
+
56
+ from coderouter.logging import JsonLineFormatter
57
+
58
# Default ring-buffer size for the recent-events deque. Chosen to match a
# ~2-second refresh at 100 RPS without blowing memory; overridable via the
# :class:`MetricsCollector` constructor for tests.
_DEFAULT_RING_SIZE: Final[int] = 256

# Truncate ``error`` strings stored in the last-error snapshot. The raw
# log already truncates at 500 chars; 200 is plenty for dashboard display
# and keeps the snapshot dict small when many providers have errors.
_LAST_ERROR_MAX_CHARS: Final[int] = 200
67
+
68
+
69
+ def _utc_now_iso() -> str:
70
+ """Current UTC time as ``YYYY-MM-DDTHH:MM:SS`` (no microseconds, no TZ suffix).
71
+
72
+ Matches the format :class:`coderouter.logging.JsonLineFormatter` uses
73
+ for its ``ts`` field, so the recent-events ring reads the same way as
74
+ the stderr log stream.
75
+ """
76
+ return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%S")
77
+
78
+
79
class MetricsCollector(logging.Handler):
    """``logging.Handler`` subclass that aggregates metrics from log records.

    One instance per process (see :func:`install_collector`). Thread-safe
    mutation of the internal counters/ring buffer via ``self._lock``.
    ``emit()`` is the hot path — it runs on every log record — so it
    stays branchless outside the event dispatch table.
    """

    def __init__(self, *, ring_size: int = _DEFAULT_RING_SIZE) -> None:
        """Construct an empty collector.

        ``ring_size`` is the maximum number of recent events retained for
        the dashboard's "Recent Requests" panel. Older events roll off
        FIFO. Default 256 balances "enough history for a human glance" vs
        memory; tests pass smaller values to keep assertions tight.
        """
        super().__init__(level=logging.DEBUG)
        # Reentrant lock: distinct from the ``self.lock`` the Handler base
        # creates. Guards every counter/ring mutation and the snapshot read.
        self._lock = threading.RLock()
        self._started_monotonic: float = time.monotonic()
        self._started_at: str = _utc_now_iso()

        # Counters — monotone, process-lifetime.
        self._requests_total: int = 0
        self._provider_attempts: Counter[str] = Counter()
        # nested: provider -> outcome -> count
        self._provider_outcomes: dict[str, Counter[str]] = {}
        self._provider_skipped_paid: Counter[str] = Counter()
        self._provider_skipped_unknown: Counter[str] = Counter()
        self._capability_degraded: Counter[str] = Counter()
        self._output_filter_applied: Counter[str] = Counter()
        self._chain_paid_gate_blocked_total: int = 0
        self._chain_uniform_auth_failure_total: int = 0
        # v1.6-B: classifier ran, no user rule matched, and the
        # ``default_rule_profile`` was returned instead. Surfaced as its own
        # Prometheus counter so operators can watch the fall-through rate as
        # a stability signal on custom rulesets.
        self._auto_router_fallthrough_total: int = 0

        # Last-error snapshot per provider (overwrites previous). Enables the
        # dashboard's "last error" column without scanning the ring.
        self._last_error: dict[str, dict[str, Any]] = {}

        # Recent events ring. Each entry is a flat dict ready for JSON.
        self._recent: deque[dict[str, Any]] = deque(maxlen=ring_size)

        # Populated by coderouter-startup — lets /metrics.json surface
        # "which providers does this server know about" without re-reading
        # YAML.
        self._startup_info: dict[str, Any] = {}

    # ------------------------------------------------------------------
    # Handler API
    # ------------------------------------------------------------------

    def emit(self, record: logging.LogRecord) -> None:
        """Dispatch a log record into the counter/ring updates.

        Unknown event names are silently ignored so adding a new log line
        elsewhere in the codebase never breaks metrics. Exceptions inside
        dispatch are swallowed via :meth:`handleError` per the Handler
        contract (we must never let metrics blow up a log call).
        """
        try:
            self._dispatch(record)
        except Exception:  # pragma: no cover - defensive
            self.handleError(record)

    def _dispatch(self, record: logging.LogRecord) -> None:
        """Map an event name to its counter/ring mutation.

        Acquires ``self._lock`` itself around the mutation site so
        :meth:`snapshot` (called from the FastAPI event loop) always sees
        a consistent view. Non-string ``record.msg`` values are skipped —
        counter keys and event names must be ``str``.
        """
        event = record.msg
        if not isinstance(event, str):
            return
        # Structured extras are attached straight onto the record, so the
        # record's ``__dict__`` doubles as the payload mapping.
        extras = record.__dict__
        with self._lock:
            if event == "try-provider":
                self._requests_total += 1
                provider = _str(extras.get("provider"))
                self._provider_attempts[provider] += 1
                self._push_recent(event, extras, record)
            elif event == "provider-ok":
                provider = _str(extras.get("provider"))
                self._provider_outcomes.setdefault(provider, Counter())["ok"] += 1
                self._push_recent(event, extras, record)
            elif event == "provider-failed":
                provider = _str(extras.get("provider"))
                self._provider_outcomes.setdefault(provider, Counter())["failed"] += 1
                self._last_error[provider] = _make_last_error(extras, record)
                self._push_recent(event, extras, record)
            elif event == "provider-failed-midstream":
                provider = _str(extras.get("provider"))
                self._provider_outcomes.setdefault(provider, Counter())[
                    "failed_midstream"
                ] += 1
                self._last_error[provider] = _make_last_error(extras, record)
                self._push_recent(event, extras, record)
            elif event == "skip-paid-provider":
                # Skips are counted but NOT pushed to the recent ring.
                provider = _str(extras.get("provider"))
                self._provider_skipped_paid[provider] += 1
            elif event == "skip-unknown-provider":
                provider = _str(extras.get("provider"))
                self._provider_skipped_unknown[provider] += 1
            elif event == "capability-degraded":
                # ``dropped`` is a list of capability names; count each one.
                dropped = extras.get("dropped") or []
                if isinstance(dropped, list):
                    for cap in dropped:
                        if isinstance(cap, str):
                            self._capability_degraded[cap] += 1
            elif event == "output-filter-applied":
                filters = extras.get("filters") or []
                if isinstance(filters, list):
                    for name in filters:
                        if isinstance(name, str):
                            self._output_filter_applied[name] += 1
            elif event == "chain-paid-gate-blocked":
                self._chain_paid_gate_blocked_total += 1
            elif event == "chain-uniform-auth-failure":
                self._chain_uniform_auth_failure_total += 1
            elif event == "auto-router-fallthrough":
                # Every call into ``classify()`` that exits via the
                # default-rule branch (no user/bundled rule matched, or
                # ``auto_router.disabled: true``) bumps this counter.
                self._auto_router_fallthrough_total += 1
            elif event == "coderouter-startup":
                # Snapshot a subset — startup payload contains lists that are
                # safe to surface to /metrics.json. Version / providers /
                # profiles / default_profile is all the dashboard needs.
                self._startup_info = {
                    "version": _str(extras.get("version")),
                    "providers": list(extras.get("providers") or []),
                    "profiles": list(extras.get("profiles") or []),
                    "default_profile": _str(extras.get("default_profile")),
                    "allow_paid": bool(extras.get("allow_paid")),
                    "mode_source": _str(extras.get("mode_source")),
                }

    def _push_recent(
        self, event: str, extras: dict[str, Any], record: logging.LogRecord
    ) -> None:
        """Append a minimal record to the ring buffer.

        Called only from :meth:`_dispatch`, which already holds
        ``self._lock``. We keep the shape flat and only surface
        whitelisted fields — this is what the dashboard renders, and
        avoids leaking transient log attributes (``msecs``, ``thread``,
        etc.) that would just bloat the payload.
        """
        entry: dict[str, Any] = {
            "ts": _record_ts_iso(record),
            "event": event,
        }
        for key in ("provider", "stream", "status", "retryable"):
            if key in extras and extras[key] is not None:
                entry[key] = extras[key]
        self._recent.append(entry)

    # ------------------------------------------------------------------
    # Read API
    # ------------------------------------------------------------------

    def snapshot(self) -> dict[str, Any]:
        """Return the current metrics as a JSON-safe dict.

        Shape is stable within v1.5 (may evolve with a semver-compatible
        additive bump). Keys absent from one install but present in
        another just mean "that event never fired this process lifetime".
        """
        with self._lock:
            # Union of every provider we have ever attempted, resolved, or
            # errored on — skipped-only providers do not get a row here.
            providers = sorted(
                set(self._provider_attempts)
                | set(self._provider_outcomes)
                | set(self._last_error)
            )
            provider_rows = [
                {
                    "name": name,
                    "attempts": self._provider_attempts.get(name, 0),
                    "outcomes": dict(self._provider_outcomes.get(name, Counter())),
                    "last_error": self._last_error.get(name),
                }
                for name in providers
            ]
            # Copy every mutable container so callers can't mutate live state.
            return {
                "uptime_s": round(time.monotonic() - self._started_monotonic, 3),
                "started_at": self._started_at,
                "startup": dict(self._startup_info),
                "counters": {
                    "requests_total": self._requests_total,
                    "chain_paid_gate_blocked_total": self._chain_paid_gate_blocked_total,
                    "chain_uniform_auth_failure_total": self._chain_uniform_auth_failure_total,
                    "auto_router_fallthrough_total": self._auto_router_fallthrough_total,
                    "provider_attempts": dict(self._provider_attempts),
                    "provider_outcomes": {
                        name: dict(counter)
                        for name, counter in self._provider_outcomes.items()
                    },
                    "provider_skipped_paid": dict(self._provider_skipped_paid),
                    "provider_skipped_unknown": dict(self._provider_skipped_unknown),
                    "capability_degraded": dict(self._capability_degraded),
                    "output_filter_applied": dict(self._output_filter_applied),
                },
                "providers": provider_rows,
                "recent": list(self._recent),
            }

    # ------------------------------------------------------------------
    # Test hook
    # ------------------------------------------------------------------

    def reset(self) -> None:
        """Drop all accumulated state. Tests call this between runs.

        Not part of the ingress contract — production code should never
        need to reset live metrics; a bounce of the process is the right
        seam when operators want a clean slate.
        """
        with self._lock:
            self._requests_total = 0
            self._provider_attempts.clear()
            self._provider_outcomes.clear()
            self._provider_skipped_paid.clear()
            self._provider_skipped_unknown.clear()
            self._capability_degraded.clear()
            self._output_filter_applied.clear()
            self._chain_paid_gate_blocked_total = 0
            self._chain_uniform_auth_failure_total = 0
            self._auto_router_fallthrough_total = 0
            self._last_error.clear()
            self._recent.clear()
            self._startup_info.clear()
            # Restart the uptime clock too, so uptime_s reads as time since
            # reset rather than process start.
            self._started_monotonic = time.monotonic()
            self._started_at = _utc_now_iso()
310
+
311
+
312
+ # ---------------------------------------------------------------------------
313
+ # Module-level singleton
314
+ # ---------------------------------------------------------------------------
315
+
316
# Guards creation/teardown of the process-wide singleton below.
_collector_lock: Final[threading.RLock] = threading.RLock()
_collector: MetricsCollector | None = None

# v1.5-B: optional JSONL mirror. Env-gated via ``CODEROUTER_EVENTS_PATH``.
# Stored as a module global so ``uninstall_collector`` can detach it in
# tandem with the MetricsCollector (keeps test isolation honest).
_JSONL_ENV_VAR: Final[str] = "CODEROUTER_EVENTS_PATH"
_jsonl_handler: logging.FileHandler | None = None
324
+
325
+
326
def install_collector(*, ring_size: int = _DEFAULT_RING_SIZE) -> MetricsCollector:
    """Attach a :class:`MetricsCollector` to the root logger. Idempotent.

    Called from :func:`coderouter.ingress.app.create_app` at lifespan
    startup. Repeat calls hand back the same instance, so tests that
    build several FastAPI apps never stack duplicate handlers. The
    collector sits next to the existing :class:`JsonLineFormatter`
    stderr handler; stderr logging is untouched.

    v1.5-B side-effect: if ``$CODEROUTER_EVENTS_PATH`` is set when the
    singleton is first built, a JSONL mirror handler is attached as well
    (see :func:`_install_jsonl_mirror`). The env var is read exactly
    once — toggling it mid-process requires a restart, consistent with
    the "restart to reset" lifecycle policy.
    """
    global _collector
    with _collector_lock:
        if _collector is None:
            _collector = MetricsCollector(ring_size=ring_size)
            root = logging.getLogger()
            root.addHandler(_collector)
            _install_jsonl_mirror()
        return _collector
349
+
350
+
351
def get_collector() -> MetricsCollector:
    """Return the installed collector, auto-installing if absent.

    Lets ``/metrics.json`` answer even before the ingress lifespan has
    fired (e.g. under FastAPI's TestClient, where the async lifespan may
    not have run before the first request). Auto-install behaves exactly
    like an explicit ``install_collector()`` call.
    """
    collector = install_collector()
    return collector
360
+
361
+
362
def uninstall_collector() -> None:
    """Detach the collector from the root logger. Tests use this for isolation.

    Clears the module-level singleton so the next
    :func:`install_collector` builds a fresh instance. Production code
    never calls this — a process restart is the right seam there. The
    JSONL mirror (v1.5-B) is detached and closed in the same call so
    file handles don't leak between tests.
    """
    global _collector
    with _collector_lock:
        handler = _collector
        if handler is not None:
            _collector = None
            with contextlib.suppress(ValueError):  # pragma: no cover - already detached
                logging.getLogger().removeHandler(handler)
        _uninstall_jsonl_mirror()
378
+
379
+
380
def _install_jsonl_mirror() -> None:
    """Attach a JSONL file handler if ``$CODEROUTER_EVENTS_PATH`` is set.

    The env var is read once, at install time. The mirror uses the same
    :class:`JsonLineFormatter` as the stderr handler, so the file is
    byte-for-byte what the operator sees on stderr. Rotation is left to
    external ``logrotate`` — stdlib's ``RotatingFileHandler`` was
    rejected as extra complexity for v1.5-B per plan.md §12.3.3.

    Path expansion:
    - ``~`` is expanded via :func:`os.path.expanduser`.
    - Missing parent directories are created (``parents=True``).

    Idempotency lives at the outer :func:`install_collector` seam (the
    module-level ``_collector`` check); this helper assumes a clean
    slate when called.
    """
    global _jsonl_handler
    configured = os.environ.get(_JSONL_ENV_VAR, "").strip()
    if not configured:
        return
    target = Path(os.path.expanduser(configured))
    target.parent.mkdir(parents=True, exist_ok=True)
    mirror = logging.FileHandler(target, mode="a", encoding="utf-8", delay=True)
    mirror.setLevel(logging.DEBUG)
    mirror.setFormatter(JsonLineFormatter())
    logging.getLogger().addHandler(mirror)
    _jsonl_handler = mirror
409
+
410
+
411
def _uninstall_jsonl_mirror() -> None:
    """Detach + close the JSONL handler if one is attached.

    Called by :func:`uninstall_collector` for test isolation. A no-op
    when no handler is attached.
    """
    global _jsonl_handler
    mirror = _jsonl_handler
    if mirror is None:
        return
    _jsonl_handler = None
    with contextlib.suppress(ValueError):  # pragma: no cover - already detached
        logging.getLogger().removeHandler(mirror)
    with contextlib.suppress(Exception):  # pragma: no cover - best-effort cleanup
        mirror.close()
425
+
426
+
427
+ # ---------------------------------------------------------------------------
428
+ # Internal helpers
429
+ # ---------------------------------------------------------------------------
430
+
431
+
432
+ def _str(value: Any) -> str:
433
+ """Coerce a possibly-``None`` log extra to a string.
434
+
435
+ Log extras are typed ``str`` by convention (see
436
+ :class:`coderouter.logging.CapabilityDegradedPayload` and friends),
437
+ but the handler contract lets us receive anything — coerce
438
+ defensively so counter keys stay hashable ``str``.
439
+ """
440
+ return "" if value is None else str(value)
441
+
442
+
443
+ def _record_ts_iso(record: logging.LogRecord) -> str:
444
+ """Format the record's timestamp in the same shape as JsonLineFormatter.
445
+
446
+ Uses the record's ``created`` epoch-seconds instead of calling
447
+ ``datetime.now()`` so the recent-events ring and the stderr log line
448
+ for the same event carry identical timestamps.
449
+ """
450
+ return datetime.fromtimestamp(record.created, tz=UTC).strftime(
451
+ "%Y-%m-%dT%H:%M:%S"
452
+ )
453
+
454
+
455
def _make_last_error(extras: dict[str, Any], record: logging.LogRecord) -> dict[str, Any]:
    """Build the per-provider last-error snapshot.

    Trims the error message to ``_LAST_ERROR_MAX_CHARS`` (the raw log
    already truncates at 500, but dashboard real estate is tight).
    ``status`` is kept only when it is an ``int``; ``retryable`` keeps
    ``None`` as "unknown" and coerces everything else to ``bool``.
    """
    message = _str(extras.get("error"))
    if len(message) > _LAST_ERROR_MAX_CHARS:
        message = message[:_LAST_ERROR_MAX_CHARS] + "…"
    status = extras.get("status")
    retryable = extras.get("retryable")
    return {
        "ts": _record_ts_iso(record),
        "status": status if isinstance(status, int) else None,
        "retryable": None if retryable is None else bool(retryable),
        "error": message,
    }
@@ -0,0 +1,221 @@
1
+ """Prometheus text exposition format (v1.5-B).
2
+
3
+ Reads a :class:`MetricsCollector` snapshot dict and renders it in the
4
+ text exposition format documented at
5
+ https://prometheus.io/docs/instrumenting/exposition_formats/ —
6
+ specifically the 0.0.4 variant that ``promtool check metrics`` validates.
7
+
8
+ Why a hand-roll instead of ``prometheus_client``
9
+ plan.md §12.3.4: ~30 lines of format logic vs a 100kB+ dependency that
10
+ also wants to install its own metric objects (double bookkeeping with
11
+ our log-tap Collector). The format is stable (spec last changed 2017)
12
+ and promtool gives us E2E validation at zero lib cost.
13
+
14
+ Metric naming
15
+ Counters end in ``_total`` per convention. Gauges are plain names.
16
+ All CodeRouter metrics are prefixed ``coderouter_`` to avoid
17
+ collision when an operator already has app metrics on the same
18
+ Prometheus target.
19
+
20
+ Label escaping
21
+ Backslash, double-quote, and newline must be escaped inside label
22
+ values (per spec). Metric names / label names are constructed
23
+ internally and don't need escaping. Provider / profile / filter /
24
+ capability names come from user config and ARE passed through the
25
+ escape routine in case they contain funky characters.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ from typing import Any
31
+
32
# ``_total`` is the canonical Prometheus convention for monotone counters.
# ``coderouter_`` prefix keeps us from colliding with other apps when an
# operator scrapes multiple services onto one Prometheus target.
_PREFIX = "coderouter_"
36
+
37
+
38
def format_prometheus(snapshot: dict[str, Any]) -> str:
    """Render a MetricsCollector snapshot as Prometheus text exposition.

    Pure function over the dict returned by
    :meth:`coderouter.metrics.MetricsCollector.snapshot`, so unit tests
    can feed canned data without spinning up the handler. Returns a
    ``str`` terminated by a single newline — Prometheus parsers accept
    either trailing-newline or not, but ending on ``\\n`` keeps
    ``promtool`` happy.
    """
    lines: list[str] = []
    # Missing keys fall back to zeros — a metric's HELP/TYPE lines are
    # still emitted even before its first event fires (see _counter).
    counters = snapshot.get("counters", {})

    # ---- Gauges ----------------------------------------------------------
    lines.extend(
        _gauge(
            name="uptime_seconds",
            help_text="Seconds since the CodeRouter process started.",
            value=snapshot.get("uptime_s", 0.0),
        )
    )

    # ---- Counters (scalar) ----------------------------------------------
    lines.extend(
        _counter(
            name="requests_total",
            help_text="Total requests dispatched to the fallback engine (``try-provider`` events).",
            samples=[((), counters.get("requests_total", 0))],
        )
    )
    lines.extend(
        _counter(
            name="chain_paid_gate_blocked_total",
            help_text="Chains where ALLOW_PAID=false filtered every provider out.",
            samples=[((), counters.get("chain_paid_gate_blocked_total", 0))],
        )
    )
    lines.extend(
        _counter(
            name="chain_uniform_auth_failure_total",
            help_text="Chains where every provider returned the same 401/403 auth failure.",
            samples=[((), counters.get("chain_uniform_auth_failure_total", 0))],
        )
    )
    lines.extend(
        _counter(
            name="auto_router_fallthrough_total",
            help_text=(
                "v1.6-A auto_router calls that exited via ``default_rule_profile`` "
                "(no user/bundled rule matched, or auto_router.disabled=true)."
            ),
            samples=[((), counters.get("auto_router_fallthrough_total", 0))],
        )
    )

    # ---- Counters (per-provider) ----------------------------------------
    # sorted() everywhere below keeps the exposition deterministic, so
    # successive scrapes diff cleanly.
    lines.extend(
        _counter(
            name="provider_attempts_total",
            help_text="``try-provider`` log events, broken down by provider.",
            samples=[
                ((("provider", p),), v)
                for p, v in sorted(counters.get("provider_attempts", {}).items())
            ],
        )
    )
    outcome_samples: list[tuple[tuple[tuple[str, str], ...], int]] = []
    for provider, outcomes in sorted(counters.get("provider_outcomes", {}).items()):
        for outcome, count in sorted(outcomes.items()):
            outcome_samples.append(
                ((("provider", provider), ("outcome", outcome)), count)
            )
    lines.extend(
        _counter(
            name="provider_outcomes_total",
            help_text="Per-provider outcomes: ok | failed | failed_midstream.",
            samples=outcome_samples,
        )
    )

    # Paid and unknown skips merge into one metric, distinguished by the
    # ``reason`` label.
    skipped_samples: list[tuple[tuple[tuple[str, str], ...], int]] = []
    for provider, count in sorted(counters.get("provider_skipped_paid", {}).items()):
        skipped_samples.append(
            ((("provider", provider), ("reason", "paid")), count)
        )
    for provider, count in sorted(counters.get("provider_skipped_unknown", {}).items()):
        skipped_samples.append(
            ((("provider", provider), ("reason", "unknown")), count)
        )
    lines.extend(
        _counter(
            name="provider_skipped_total",
            help_text="Providers skipped before a call was attempted, by reason.",
            samples=skipped_samples,
        )
    )

    # ---- Counters (per-capability / per-filter) -------------------------
    lines.extend(
        _counter(
            name="capability_degraded_total",
            help_text="Capability gate degradations, by dropped capability (thinking | cache_control | reasoning).",
            samples=[
                ((("capability", c),), v)
                for c, v in sorted(counters.get("capability_degraded", {}).items())
            ],
        )
    )
    lines.extend(
        _counter(
            name="output_filter_applied_total",
            help_text="Output-filter firings, by filter name (strip_thinking | strip_stop_markers).",
            samples=[
                ((("filter", f),), v)
                for f, v in sorted(counters.get("output_filter_applied", {}).items())
            ],
        )
    )
    return "\n".join(lines) + "\n"
157
+
158
+
159
+ # ---------------------------------------------------------------------------
160
+ # Internal helpers — compose HELP / TYPE / sample triples
161
+ # ---------------------------------------------------------------------------
162
+
163
+
164
def _counter(
    *,
    name: str,
    help_text: str,
    samples: list[tuple[tuple[tuple[str, str], ...], int]],
) -> list[str]:
    """Build HELP + TYPE lines plus one sample line per (labels, value).

    Prometheus allows a counter with zero samples — the HELP/TYPE pair
    still makes the metric discoverable in target metadata, so a
    dashboard knows the metric exists before its first event fires.
    """
    full_name = f"{_PREFIX}{name}"
    out = [
        f"# HELP {full_name} {help_text}",
        f"# TYPE {full_name} counter",
    ]
    out.extend(
        f"{full_name}{_fmt_labels(labels)} {value}" for labels, value in samples
    )
    return out
185
+
186
+
187
def _gauge(*, name: str, help_text: str, value: float) -> list[str]:
    """HELP + TYPE + a single sample line for a scalar gauge.

    All v1.5-B gauges are scalar (no labels). If labeled gauges arrive
    later (e.g. per-provider last-tok/s), this helper will grow a
    ``samples`` parameter mirroring :func:`_counter`.
    """
    metric = f"{_PREFIX}{name}"
    header = [
        f"# HELP {metric} {help_text}",
        f"# TYPE {metric} gauge",
    ]
    return [*header, f"{metric} {value}"]
200
+
201
+
202
def _fmt_labels(pairs: tuple[tuple[str, str], ...]) -> str:
    """Render (key, value) pairs as ``{k="v",k2="v2"}``, or ``""`` for none.

    An empty tuple yields the empty string — Prometheus permits
    unlabeled samples. Values are escaped via
    :func:`_escape_label_value`; keys are trusted (built internally).
    """
    if not pairs:
        return ""
    rendered = [f'{key}="{_escape_label_value(val)}"' for key, val in pairs]
    return "{" + ",".join(rendered) + "}"
213
+
214
+
215
+ def _escape_label_value(value: str) -> str:
216
+ r"""Escape a label value per the Prometheus text format spec.
217
+
218
+ From the spec: ``\`` → ``\\``, ``"`` → ``\"``, newline → ``\n``.
219
+ Everything else (including dashes, dots, colons) is literal.
220
+ """
221
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")