coderouter-cli 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/__init__.py +17 -0
- coderouter/__main__.py +6 -0
- coderouter/adapters/__init__.py +23 -0
- coderouter/adapters/anthropic_native.py +502 -0
- coderouter/adapters/base.py +220 -0
- coderouter/adapters/openai_compat.py +395 -0
- coderouter/adapters/registry.py +17 -0
- coderouter/cli.py +345 -0
- coderouter/cli_stats.py +751 -0
- coderouter/config/__init__.py +10 -0
- coderouter/config/capability_registry.py +339 -0
- coderouter/config/env_file.py +295 -0
- coderouter/config/loader.py +73 -0
- coderouter/config/schemas.py +515 -0
- coderouter/data/__init__.py +7 -0
- coderouter/data/model-capabilities.yaml +86 -0
- coderouter/doctor.py +1596 -0
- coderouter/env_security.py +434 -0
- coderouter/errors.py +29 -0
- coderouter/ingress/__init__.py +5 -0
- coderouter/ingress/anthropic_routes.py +205 -0
- coderouter/ingress/app.py +144 -0
- coderouter/ingress/dashboard_routes.py +493 -0
- coderouter/ingress/metrics_routes.py +92 -0
- coderouter/ingress/openai_routes.py +153 -0
- coderouter/logging.py +315 -0
- coderouter/metrics/__init__.py +39 -0
- coderouter/metrics/collector.py +471 -0
- coderouter/metrics/prometheus.py +221 -0
- coderouter/output_filters.py +407 -0
- coderouter/routing/__init__.py +13 -0
- coderouter/routing/auto_router.py +244 -0
- coderouter/routing/capability.py +285 -0
- coderouter/routing/fallback.py +611 -0
- coderouter/translation/__init__.py +57 -0
- coderouter/translation/anthropic.py +204 -0
- coderouter/translation/convert.py +1291 -0
- coderouter/translation/tool_repair.py +236 -0
- coderouter_cli-1.7.0.dist-info/METADATA +509 -0
- coderouter_cli-1.7.0.dist-info/RECORD +43 -0
- coderouter_cli-1.7.0.dist-info/WHEEL +4 -0
- coderouter_cli-1.7.0.dist-info/entry_points.txt +2 -0
- coderouter_cli-1.7.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
"""In-memory metrics collector — a ``logging.Handler`` that taps structured logs.
|
|
2
|
+
|
|
3
|
+
Design (plan.md §12.3):
|
|
4
|
+
- v1.5-A ships the collector + ``/metrics.json`` endpoint only.
|
|
5
|
+
Prometheus exposition, JSONL persistence, CLI TUI, and HTML dashboard
|
|
6
|
+
land in v1.5-B/C/D.
|
|
7
|
+
- Collection is a **pure log tap**: we install ourselves on the root
|
|
8
|
+
logger via :func:`install_collector` and inspect each ``LogRecord``.
|
|
9
|
+
Adapter / routing code stays untouched — v0.5's ``capability-
|
|
10
|
+
degraded`` gate, v0.6's ``chain-paid-gate-blocked`` warn, v0.7's
|
|
11
|
+
``output-filter-applied`` info line are all already structured with
|
|
12
|
+
typed extras, so the collector just dispatches on ``record.msg``.
|
|
13
|
+
- Storage is in-memory only: counters (``collections.Counter``),
|
|
14
|
+
per-provider last-error snapshots, and a fixed-size ``deque`` of
|
|
15
|
+
recent events. Re-start clears the state — JSONL persistence lands
|
|
16
|
+
in v1.5-B (``CODEROUTER_EVENTS_PATH``).
|
|
17
|
+
|
|
18
|
+
Thread safety
|
|
19
|
+
``logging.Handler.emit`` can be invoked from any thread (Python's
|
|
20
|
+
logging module acquires ``handler.lock`` itself). We additionally
|
|
21
|
+
guard the mutable state with an ``RLock`` so ``snapshot()`` — which
|
|
22
|
+
the FastAPI event loop calls — sees a consistent view. The lock is
|
|
23
|
+
held only for the small mutation sites.
|
|
24
|
+
|
|
25
|
+
Event inventory (dispatch table in :meth:`MetricsCollector._dispatch`)
|
|
26
|
+
``try-provider`` → ``requests_total`` + ``provider_attempts``
|
|
27
|
+
``provider-ok`` → ``provider_outcomes[provider]["ok"]``
|
|
28
|
+
``provider-failed`` → ``provider_outcomes[provider]["failed"]``
|
|
29
|
+
+ last_error[provider]
|
|
30
|
+
``provider-failed-midstream``→ ``provider_outcomes[...]["failed_midstream"]``
|
|
31
|
+
``skip-paid-provider`` → ``provider_skipped_paid``
|
|
32
|
+
``skip-unknown-provider`` → ``provider_skipped_unknown``
|
|
33
|
+
``capability-degraded`` → ``capability_degraded[capability]``
|
|
34
|
+
``output-filter-applied`` → ``output_filter_applied[filter]``
|
|
35
|
+
``chain-paid-gate-blocked`` → ``chain_paid_gate_blocked_total``
|
|
36
|
+
``chain-uniform-auth-failure``→ ``chain_uniform_auth_failure_total``
|
|
37
|
+
``auto-router-fallthrough`` → ``auto_router_fallthrough_total``
|
|
38
|
+
``coderouter-startup`` → ``startup_info`` (stored for the UI header)
|
|
39
|
+
|
|
40
|
+
Unrecognized events are ignored (forward-compat: adding a new log
|
|
41
|
+
event never breaks the collector).
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
from __future__ import annotations
|
|
45
|
+
|
|
46
|
+
import contextlib
|
|
47
|
+
import logging
|
|
48
|
+
import os
|
|
49
|
+
import threading
|
|
50
|
+
import time
|
|
51
|
+
from collections import Counter, deque
|
|
52
|
+
from datetime import UTC, datetime
|
|
53
|
+
from pathlib import Path
|
|
54
|
+
from typing import Any, Final
|
|
55
|
+
|
|
56
|
+
from coderouter.logging import JsonLineFormatter
|
|
57
|
+
|
|
58
|
+
# Default capacity of the recent-events ring buffer. Chosen to match a
# ~2-second refresh at 100 RPS without blowing memory. It is the default
# value of the ``ring_size`` keyword on both :class:`MetricsCollector` and
# :func:`install_collector`, which is how tests override it.
_DEFAULT_RING_SIZE: Final[int] = 256
|
|
62
|
+
|
|
63
|
+
# Maximum number of characters of the ``error`` string kept in a provider's
# last-error snapshot; ``_make_last_error`` cuts longer messages to this
# length and appends an ellipsis. The raw log already truncates at 500
# chars; 200 is plenty for dashboard display and keeps the snapshot dict
# small when many providers have errors.
_LAST_ERROR_MAX_CHARS: Final[int] = 200
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _utc_now_iso() -> str:
|
|
70
|
+
"""Current UTC time as ``YYYY-MM-DDTHH:MM:SS`` (no microseconds, no TZ suffix).
|
|
71
|
+
|
|
72
|
+
Matches the format :class:`coderouter.logging.JsonLineFormatter` uses
|
|
73
|
+
for its ``ts`` field, so the recent-events ring reads the same way as
|
|
74
|
+
the stderr log stream.
|
|
75
|
+
"""
|
|
76
|
+
return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%S")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class MetricsCollector(logging.Handler):
|
|
80
|
+
"""``logging.Handler`` subclass that aggregates metrics from log records.
|
|
81
|
+
|
|
82
|
+
One instance per process (see :func:`install_collector`). Thread-safe
|
|
83
|
+
mutation of the internal counters/ring buffer via ``self._lock``.
|
|
84
|
+
``emit()`` is the hot path — it runs on every log record — so it
|
|
85
|
+
stays branchless outside the event dispatch table.
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
def __init__(self, *, ring_size: int = _DEFAULT_RING_SIZE) -> None:
|
|
89
|
+
"""Construct an empty collector.
|
|
90
|
+
|
|
91
|
+
``ring_size`` is the maximum number of recent events retained for
|
|
92
|
+
the dashboard's "Recent Requests" panel. Older events roll off
|
|
93
|
+
FIFO. Default 256 balances "enough history for a human glance" vs
|
|
94
|
+
memory; tests pass smaller values to keep assertions tight.
|
|
95
|
+
"""
|
|
96
|
+
super().__init__(level=logging.DEBUG)
|
|
97
|
+
self._lock = threading.RLock()
|
|
98
|
+
self._started_monotonic: float = time.monotonic()
|
|
99
|
+
self._started_at: str = _utc_now_iso()
|
|
100
|
+
|
|
101
|
+
# Counters — monotone, process-lifetime.
|
|
102
|
+
self._requests_total: int = 0
|
|
103
|
+
self._provider_attempts: Counter[str] = Counter()
|
|
104
|
+
# nested: provider -> outcome -> count
|
|
105
|
+
self._provider_outcomes: dict[str, Counter[str]] = {}
|
|
106
|
+
self._provider_skipped_paid: Counter[str] = Counter()
|
|
107
|
+
self._provider_skipped_unknown: Counter[str] = Counter()
|
|
108
|
+
self._capability_degraded: Counter[str] = Counter()
|
|
109
|
+
self._output_filter_applied: Counter[str] = Counter()
|
|
110
|
+
self._chain_paid_gate_blocked_total: int = 0
|
|
111
|
+
self._chain_uniform_auth_failure_total: int = 0
|
|
112
|
+
# v1.6-B: classifier ran, no user rule matched, and the
|
|
113
|
+
# ``default_rule_profile`` was returned instead. Surfaced as its own
|
|
114
|
+
# Prometheus counter so operators can watch the fall-through rate as
|
|
115
|
+
# a stability signal on custom rulesets.
|
|
116
|
+
self._auto_router_fallthrough_total: int = 0
|
|
117
|
+
|
|
118
|
+
# Last-error snapshot per provider (overwrites previous). Enables the
|
|
119
|
+
# dashboard's "last error" column without scanning the ring.
|
|
120
|
+
self._last_error: dict[str, dict[str, Any]] = {}
|
|
121
|
+
|
|
122
|
+
# Recent events ring. Each entry is a flat dict ready for JSON.
|
|
123
|
+
self._recent: deque[dict[str, Any]] = deque(maxlen=ring_size)
|
|
124
|
+
|
|
125
|
+
# Populated by coderouter-startup — lets /metrics.json surface
|
|
126
|
+
# "which providers does this server know about" without re-reading
|
|
127
|
+
# YAML.
|
|
128
|
+
self._startup_info: dict[str, Any] = {}
|
|
129
|
+
|
|
130
|
+
# ------------------------------------------------------------------
|
|
131
|
+
# Handler API
|
|
132
|
+
# ------------------------------------------------------------------
|
|
133
|
+
|
|
134
|
+
def emit(self, record: logging.LogRecord) -> None:
|
|
135
|
+
"""Dispatch a log record into the counter/ring updates.
|
|
136
|
+
|
|
137
|
+
Unknown event names are silently ignored so adding a new log line
|
|
138
|
+
elsewhere in the codebase never breaks metrics. Exceptions inside
|
|
139
|
+
dispatch are swallowed via :meth:`handleError` per the Handler
|
|
140
|
+
contract (we must never let metrics blow up a log call).
|
|
141
|
+
"""
|
|
142
|
+
try:
|
|
143
|
+
self._dispatch(record)
|
|
144
|
+
except Exception: # pragma: no cover - defensive
|
|
145
|
+
self.handleError(record)
|
|
146
|
+
|
|
147
|
+
def _dispatch(self, record: logging.LogRecord) -> None:
|
|
148
|
+
"""Event name → counter/ring mutation. Called under ``self._lock``."""
|
|
149
|
+
event = record.msg
|
|
150
|
+
if not isinstance(event, str):
|
|
151
|
+
return
|
|
152
|
+
extras = record.__dict__
|
|
153
|
+
with self._lock:
|
|
154
|
+
if event == "try-provider":
|
|
155
|
+
self._requests_total += 1
|
|
156
|
+
provider = _str(extras.get("provider"))
|
|
157
|
+
self._provider_attempts[provider] += 1
|
|
158
|
+
self._push_recent(event, extras, record)
|
|
159
|
+
elif event == "provider-ok":
|
|
160
|
+
provider = _str(extras.get("provider"))
|
|
161
|
+
self._provider_outcomes.setdefault(provider, Counter())["ok"] += 1
|
|
162
|
+
self._push_recent(event, extras, record)
|
|
163
|
+
elif event == "provider-failed":
|
|
164
|
+
provider = _str(extras.get("provider"))
|
|
165
|
+
self._provider_outcomes.setdefault(provider, Counter())["failed"] += 1
|
|
166
|
+
self._last_error[provider] = _make_last_error(extras, record)
|
|
167
|
+
self._push_recent(event, extras, record)
|
|
168
|
+
elif event == "provider-failed-midstream":
|
|
169
|
+
provider = _str(extras.get("provider"))
|
|
170
|
+
self._provider_outcomes.setdefault(provider, Counter())[
|
|
171
|
+
"failed_midstream"
|
|
172
|
+
] += 1
|
|
173
|
+
self._last_error[provider] = _make_last_error(extras, record)
|
|
174
|
+
self._push_recent(event, extras, record)
|
|
175
|
+
elif event == "skip-paid-provider":
|
|
176
|
+
provider = _str(extras.get("provider"))
|
|
177
|
+
self._provider_skipped_paid[provider] += 1
|
|
178
|
+
elif event == "skip-unknown-provider":
|
|
179
|
+
provider = _str(extras.get("provider"))
|
|
180
|
+
self._provider_skipped_unknown[provider] += 1
|
|
181
|
+
elif event == "capability-degraded":
|
|
182
|
+
dropped = extras.get("dropped") or []
|
|
183
|
+
if isinstance(dropped, list):
|
|
184
|
+
for cap in dropped:
|
|
185
|
+
if isinstance(cap, str):
|
|
186
|
+
self._capability_degraded[cap] += 1
|
|
187
|
+
elif event == "output-filter-applied":
|
|
188
|
+
filters = extras.get("filters") or []
|
|
189
|
+
if isinstance(filters, list):
|
|
190
|
+
for name in filters:
|
|
191
|
+
if isinstance(name, str):
|
|
192
|
+
self._output_filter_applied[name] += 1
|
|
193
|
+
elif event == "chain-paid-gate-blocked":
|
|
194
|
+
self._chain_paid_gate_blocked_total += 1
|
|
195
|
+
elif event == "chain-uniform-auth-failure":
|
|
196
|
+
self._chain_uniform_auth_failure_total += 1
|
|
197
|
+
elif event == "auto-router-fallthrough":
|
|
198
|
+
# Every call into ``classify()`` that exits via the
|
|
199
|
+
# default-rule branch (no user/bundled rule matched, or
|
|
200
|
+
# ``auto_router.disabled: true``) bumps this counter.
|
|
201
|
+
self._auto_router_fallthrough_total += 1
|
|
202
|
+
elif event == "coderouter-startup":
|
|
203
|
+
# Snapshot a subset — startup payload contains lists that are
|
|
204
|
+
# safe to surface to /metrics.json. Version / providers /
|
|
205
|
+
# profiles / default_profile is all the dashboard needs.
|
|
206
|
+
self._startup_info = {
|
|
207
|
+
"version": _str(extras.get("version")),
|
|
208
|
+
"providers": list(extras.get("providers") or []),
|
|
209
|
+
"profiles": list(extras.get("profiles") or []),
|
|
210
|
+
"default_profile": _str(extras.get("default_profile")),
|
|
211
|
+
"allow_paid": bool(extras.get("allow_paid")),
|
|
212
|
+
"mode_source": _str(extras.get("mode_source")),
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
def _push_recent(
|
|
216
|
+
self, event: str, extras: dict[str, Any], record: logging.LogRecord
|
|
217
|
+
) -> None:
|
|
218
|
+
"""Append a minimal record to the ring buffer.
|
|
219
|
+
|
|
220
|
+
We keep the shape flat and only surface whitelisted fields — this
|
|
221
|
+
is what the dashboard renders, and avoids leaking transient log
|
|
222
|
+
attributes (``msecs``, ``thread``, etc.) that would just bloat
|
|
223
|
+
the payload.
|
|
224
|
+
"""
|
|
225
|
+
entry: dict[str, Any] = {
|
|
226
|
+
"ts": _record_ts_iso(record),
|
|
227
|
+
"event": event,
|
|
228
|
+
}
|
|
229
|
+
for key in ("provider", "stream", "status", "retryable"):
|
|
230
|
+
if key in extras and extras[key] is not None:
|
|
231
|
+
entry[key] = extras[key]
|
|
232
|
+
self._recent.append(entry)
|
|
233
|
+
|
|
234
|
+
# ------------------------------------------------------------------
|
|
235
|
+
# Read API
|
|
236
|
+
# ------------------------------------------------------------------
|
|
237
|
+
|
|
238
|
+
def snapshot(self) -> dict[str, Any]:
|
|
239
|
+
"""Return the current metrics as a JSON-safe dict.
|
|
240
|
+
|
|
241
|
+
Shape is stable within v1.5 (may evolve with a semver-compatible
|
|
242
|
+
additive bump). Keys absent from one install but present in
|
|
243
|
+
another just mean "that event never fired this process lifetime".
|
|
244
|
+
"""
|
|
245
|
+
with self._lock:
|
|
246
|
+
providers = sorted(
|
|
247
|
+
set(self._provider_attempts)
|
|
248
|
+
| set(self._provider_outcomes)
|
|
249
|
+
| set(self._last_error)
|
|
250
|
+
)
|
|
251
|
+
provider_rows = [
|
|
252
|
+
{
|
|
253
|
+
"name": name,
|
|
254
|
+
"attempts": self._provider_attempts.get(name, 0),
|
|
255
|
+
"outcomes": dict(self._provider_outcomes.get(name, Counter())),
|
|
256
|
+
"last_error": self._last_error.get(name),
|
|
257
|
+
}
|
|
258
|
+
for name in providers
|
|
259
|
+
]
|
|
260
|
+
return {
|
|
261
|
+
"uptime_s": round(time.monotonic() - self._started_monotonic, 3),
|
|
262
|
+
"started_at": self._started_at,
|
|
263
|
+
"startup": dict(self._startup_info),
|
|
264
|
+
"counters": {
|
|
265
|
+
"requests_total": self._requests_total,
|
|
266
|
+
"chain_paid_gate_blocked_total": self._chain_paid_gate_blocked_total,
|
|
267
|
+
"chain_uniform_auth_failure_total": self._chain_uniform_auth_failure_total,
|
|
268
|
+
"auto_router_fallthrough_total": self._auto_router_fallthrough_total,
|
|
269
|
+
"provider_attempts": dict(self._provider_attempts),
|
|
270
|
+
"provider_outcomes": {
|
|
271
|
+
name: dict(counter)
|
|
272
|
+
for name, counter in self._provider_outcomes.items()
|
|
273
|
+
},
|
|
274
|
+
"provider_skipped_paid": dict(self._provider_skipped_paid),
|
|
275
|
+
"provider_skipped_unknown": dict(self._provider_skipped_unknown),
|
|
276
|
+
"capability_degraded": dict(self._capability_degraded),
|
|
277
|
+
"output_filter_applied": dict(self._output_filter_applied),
|
|
278
|
+
},
|
|
279
|
+
"providers": provider_rows,
|
|
280
|
+
"recent": list(self._recent),
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
# ------------------------------------------------------------------
|
|
284
|
+
# Test hook
|
|
285
|
+
# ------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
def reset(self) -> None:
|
|
288
|
+
"""Drop all accumulated state. Tests call this between runs.
|
|
289
|
+
|
|
290
|
+
Not part of the ingress contract — production code should never
|
|
291
|
+
need to reset live metrics; a bounce of the process is the right
|
|
292
|
+
seam when operators want a clean slate.
|
|
293
|
+
"""
|
|
294
|
+
with self._lock:
|
|
295
|
+
self._requests_total = 0
|
|
296
|
+
self._provider_attempts.clear()
|
|
297
|
+
self._provider_outcomes.clear()
|
|
298
|
+
self._provider_skipped_paid.clear()
|
|
299
|
+
self._provider_skipped_unknown.clear()
|
|
300
|
+
self._capability_degraded.clear()
|
|
301
|
+
self._output_filter_applied.clear()
|
|
302
|
+
self._chain_paid_gate_blocked_total = 0
|
|
303
|
+
self._chain_uniform_auth_failure_total = 0
|
|
304
|
+
self._auto_router_fallthrough_total = 0
|
|
305
|
+
self._last_error.clear()
|
|
306
|
+
self._recent.clear()
|
|
307
|
+
self._startup_info.clear()
|
|
308
|
+
self._started_monotonic = time.monotonic()
|
|
309
|
+
self._started_at = _utc_now_iso()
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
# Module-level singleton
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
# Serialises creation and removal of the process-wide collector; taken by
# ``install_collector`` and ``uninstall_collector``.
_collector_lock: Final[threading.RLock] = threading.RLock()
# The installed collector, or ``None`` while nothing is installed.
_collector: MetricsCollector | None = None

# v1.5-B: optional JSONL mirror. Env-gated via ``CODEROUTER_EVENTS_PATH``.
# Stored as a module global so ``uninstall_collector`` can detach it in
# tandem with the MetricsCollector (keeps test isolation honest).
_JSONL_ENV_VAR: Final[str] = "CODEROUTER_EVENTS_PATH"
# File handler created by ``_install_jsonl_mirror``; ``None`` when no mirror
# is attached.
_jsonl_handler: logging.FileHandler | None = None
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def install_collector(*, ring_size: int = _DEFAULT_RING_SIZE) -> MetricsCollector:
    """Install the process-wide :class:`MetricsCollector` and return it.

    The first call builds a collector with ``ring_size``, attaches it to the
    root logger and then runs :func:`_install_jsonl_mirror`, which adds the
    JSONL file handler when ``$CODEROUTER_EVENTS_PATH`` is set. Every later
    call returns that same instance untouched — ``ring_size`` is ignored and
    the environment is not consulted again — so repeated installs never
    stack duplicate handlers.
    """
    global _collector
    with _collector_lock:
        if _collector is not None:
            return _collector
        # Publish the singleton before attaching anything, so the module
        # global is already set if a later step raises.
        _collector = MetricsCollector(ring_size=ring_size)
        logging.getLogger().addHandler(_collector)
        _install_jsonl_mirror()
        return _collector
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def get_collector() -> MetricsCollector:
    """Return the process-wide collector, installing it on first use.

    Delegates to :func:`install_collector` with default arguments, so a
    caller that runs before any explicit install (the original note cites
    ``/metrics.json`` being hit before the ingress lifespan has fired) still
    gets a working collector.
    """
    collector = install_collector()
    return collector
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def uninstall_collector() -> None:
    """Remove the collector and the JSONL mirror from the root logger.

    Clears the module-level singleton so the next
    :func:`install_collector` call builds a fresh instance, and calls
    :func:`_uninstall_jsonl_mirror` so the mirror's file handle is closed in
    the same step. Described in the module as a test-isolation hook.
    """
    global _collector
    with _collector_lock:
        installed = _collector
        if installed is not None:
            root = logging.getLogger()
            with contextlib.suppress(ValueError):  # pragma: no cover - already detached
                root.removeHandler(installed)
            _collector = None
        _uninstall_jsonl_mirror()
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _install_jsonl_mirror() -> None:
    """Attach a JSONL file handler to the root logger when configured.

    Reads ``$CODEROUTER_EVENTS_PATH``; an unset, empty or whitespace-only
    value means "no mirror" and nothing happens. Otherwise:

    - ``~`` in the path is expanded via :func:`os.path.expanduser`;
    - missing parent directories are created;
    - a ``DEBUG``-level :class:`logging.FileHandler` in append mode, UTF-8,
      formatted by :class:`JsonLineFormatter`, is added to the root logger
      and remembered in ``_jsonl_handler``. The file itself is opened
      lazily (``delay=True``).

    This helper does not check for an existing mirror; the caller
    (:func:`install_collector`) only invokes it once per install.
    """
    global _jsonl_handler
    configured = os.environ.get(_JSONL_ENV_VAR, "").strip()
    if configured:
        target = Path(os.path.expanduser(configured))
        target.parent.mkdir(parents=True, exist_ok=True)
        mirror = logging.FileHandler(target, mode="a", encoding="utf-8", delay=True)
        mirror.setFormatter(JsonLineFormatter())
        mirror.setLevel(logging.DEBUG)
        logging.getLogger().addHandler(mirror)
        _jsonl_handler = mirror
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _uninstall_jsonl_mirror() -> None:
    """Detach and close the JSONL mirror handler, if there is one.

    A no-op when no mirror is attached. A ``ValueError`` from detaching and
    any exception from closing are suppressed; the module global is reset to
    ``None`` afterwards.
    """
    global _jsonl_handler
    if _jsonl_handler is not None:
        root = logging.getLogger()
        with contextlib.suppress(ValueError):  # pragma: no cover - already detached
            root.removeHandler(_jsonl_handler)
        with contextlib.suppress(Exception):  # pragma: no cover - best-effort cleanup
            _jsonl_handler.close()
        _jsonl_handler = None
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
# ---------------------------------------------------------------------------
|
|
428
|
+
# Internal helpers
|
|
429
|
+
# ---------------------------------------------------------------------------
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _str(value: Any) -> str:
|
|
433
|
+
"""Coerce a possibly-``None`` log extra to a string.
|
|
434
|
+
|
|
435
|
+
Log extras are typed ``str`` by convention (see
|
|
436
|
+
:class:`coderouter.logging.CapabilityDegradedPayload` and friends),
|
|
437
|
+
but the handler contract lets us receive anything — coerce
|
|
438
|
+
defensively so counter keys stay hashable ``str``.
|
|
439
|
+
"""
|
|
440
|
+
return "" if value is None else str(value)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _record_ts_iso(record: logging.LogRecord) -> str:
|
|
444
|
+
"""Format the record's timestamp in the same shape as JsonLineFormatter.
|
|
445
|
+
|
|
446
|
+
Uses the record's ``created`` epoch-seconds instead of calling
|
|
447
|
+
``datetime.now()`` so the recent-events ring and the stderr log line
|
|
448
|
+
for the same event carry identical timestamps.
|
|
449
|
+
"""
|
|
450
|
+
return datetime.fromtimestamp(record.created, tz=UTC).strftime(
|
|
451
|
+
"%Y-%m-%dT%H:%M:%S"
|
|
452
|
+
)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def _make_last_error(extras: dict[str, Any], record: logging.LogRecord) -> dict[str, Any]:
    """Build the last-error snapshot stored for a provider.

    The returned dict has four keys:

    - ``ts``: the record's timestamp via :func:`_record_ts_iso`;
    - ``status``: the ``status`` extra when it is an ``int``, else ``None``;
    - ``retryable``: the ``retryable`` extra coerced to ``bool``, or ``None``
      when the extra is missing or ``None``;
    - ``error``: the ``error`` extra as text, cut to
      ``_LAST_ERROR_MAX_CHARS`` characters with an ellipsis appended when it
      was longer.
    """
    message = _str(extras.get("error"))
    if len(message) > _LAST_ERROR_MAX_CHARS:
        message = f"{message[:_LAST_ERROR_MAX_CHARS]}…"

    raw_status = extras.get("status")
    raw_retryable = extras.get("retryable")

    snapshot: dict[str, Any] = {"ts": _record_ts_iso(record)}
    snapshot["status"] = raw_status if isinstance(raw_status, int) else None
    snapshot["retryable"] = None if raw_retryable is None else bool(raw_retryable)
    snapshot["error"] = message
    return snapshot
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Prometheus text exposition format (v1.5-B).
|
|
2
|
+
|
|
3
|
+
Reads a :class:`MetricsCollector` snapshot dict and renders it in the
|
|
4
|
+
text exposition format documented at
|
|
5
|
+
https://prometheus.io/docs/instrumenting/exposition_formats/ —
|
|
6
|
+
specifically the 0.0.4 variant that ``promtool check metrics`` validates.
|
|
7
|
+
|
|
8
|
+
Why a hand-roll instead of ``prometheus_client``
|
|
9
|
+
plan.md §12.3.4: ~30 lines of format logic vs a 100kB+ dependency that
|
|
10
|
+
also wants to install its own metric objects (double bookkeeping with
|
|
11
|
+
our log-tap Collector). The format is stable (spec last changed 2017)
|
|
12
|
+
and promtool gives us E2E validation at zero lib cost.
|
|
13
|
+
|
|
14
|
+
Metric naming
|
|
15
|
+
Counters end in ``_total`` per convention. Gauges are plain names.
|
|
16
|
+
All CodeRouter metrics are prefixed ``coderouter_`` to avoid
|
|
17
|
+
collision when an operator already has app metrics on the same
|
|
18
|
+
Prometheus target.
|
|
19
|
+
|
|
20
|
+
Label escaping
|
|
21
|
+
Backslash, double-quote, and newline must be escaped inside label
|
|
22
|
+
values (per spec). Metric names / label names are constructed
|
|
23
|
+
internally and don't need escaping. Provider / profile / filter /
|
|
24
|
+
capability names come from user config and ARE passed through the
|
|
25
|
+
escape routine in case they contain funky characters.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
# Prepended to every metric name built by ``_counter`` and ``_gauge``.
# ``_total`` is the canonical Prometheus convention for monotone counters.
# ``coderouter_`` prefix keeps us from colliding with other apps when an
# operator scrapes multiple services onto one Prometheus target.
_PREFIX = "coderouter_"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def format_prometheus(snapshot: dict[str, Any]) -> str:
    """Render a collector snapshot in the Prometheus text exposition format.

    ``snapshot`` is a dict shaped like the return value of
    :meth:`coderouter.metrics.MetricsCollector.snapshot`; only ``uptime_s``
    and ``counters`` are read, and every missing key falls back to zero or
    an empty mapping. Nothing else is touched, so canned dicts work as
    input. Metrics are emitted in a fixed order with labelled samples sorted
    by label value, and the returned text ends with exactly one newline.
    """
    counters = snapshot.get("counters", {})

    def labelled(label: str, source_key: str) -> list[tuple[tuple[tuple[str, str], ...], int]]:
        # One single-label sample per entry of counters[source_key], sorted.
        return [
            (((label, key),), count)
            for key, count in sorted(counters.get(source_key, {}).items())
        ]

    # ---- Gauges ----------------------------------------------------------
    out: list[str] = _gauge(
        name="uptime_seconds",
        help_text="Seconds since the CodeRouter process started.",
        value=snapshot.get("uptime_s", 0.0),
    )

    # ---- Counters (scalar) ----------------------------------------------
    # For these four, the metric name equals the key in the counters dict.
    scalar_counters = (
        (
            "requests_total",
            "Total requests dispatched to the fallback engine (``try-provider`` events).",
        ),
        (
            "chain_paid_gate_blocked_total",
            "Chains where ALLOW_PAID=false filtered every provider out.",
        ),
        (
            "chain_uniform_auth_failure_total",
            "Chains where every provider returned the same 401/403 auth failure.",
        ),
        (
            "auto_router_fallthrough_total",
            "v1.6-A auto_router calls that exited via ``default_rule_profile`` "
            "(no user/bundled rule matched, or auto_router.disabled=true).",
        ),
    )
    for metric, description in scalar_counters:
        out += _counter(
            name=metric,
            help_text=description,
            samples=[((), counters.get(metric, 0))],
        )

    # ---- Counters (per-provider) ----------------------------------------
    out += _counter(
        name="provider_attempts_total",
        help_text="``try-provider`` log events, broken down by provider.",
        samples=labelled("provider", "provider_attempts"),
    )

    outcome_samples: list[tuple[tuple[tuple[str, str], ...], int]] = [
        ((("provider", provider), ("outcome", outcome)), count)
        for provider, outcomes in sorted(counters.get("provider_outcomes", {}).items())
        for outcome, count in sorted(outcomes.items())
    ]
    out += _counter(
        name="provider_outcomes_total",
        help_text="Per-provider outcomes: ok | failed | failed_midstream.",
        samples=outcome_samples,
    )

    # Paid skips come first, then unknown skips; each group sorted by provider.
    skipped_samples: list[tuple[tuple[tuple[str, str], ...], int]] = [
        ((("provider", provider), ("reason", reason)), count)
        for reason, source_key in (
            ("paid", "provider_skipped_paid"),
            ("unknown", "provider_skipped_unknown"),
        )
        for provider, count in sorted(counters.get(source_key, {}).items())
    ]
    out += _counter(
        name="provider_skipped_total",
        help_text="Providers skipped before a call was attempted, by reason.",
        samples=skipped_samples,
    )

    # ---- Counters (per-capability / per-filter) -------------------------
    out += _counter(
        name="capability_degraded_total",
        help_text="Capability gate degradations, by dropped capability (thinking | cache_control | reasoning).",
        samples=labelled("capability", "capability_degraded"),
    )
    out += _counter(
        name="output_filter_applied_total",
        help_text="Output-filter firings, by filter name (strip_thinking | strip_stop_markers).",
        samples=labelled("filter", "output_filter_applied"),
    )
    return "\n".join(out) + "\n"
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ---------------------------------------------------------------------------
|
|
160
|
+
# Internal helpers — compose HELP / TYPE / sample triples
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _counter(
    *,
    name: str,
    help_text: str,
    samples: list[tuple[tuple[tuple[str, str], ...], int]],
) -> list[str]:
    """Build the exposition lines for one counter metric.

    Returns ``# HELP``, ``# TYPE ... counter`` and then one sample line per
    ``(labels, value)`` pair in ``samples``, in the order given. ``name`` is
    prefixed with ``_PREFIX``; labels are rendered by :func:`_fmt_labels`.

    An empty ``samples`` list still yields the HELP/TYPE pair, so the metric
    is announced before its first event fires.

    The text format requires backslash and line feed to be escaped inside
    HELP text, so ``help_text`` is escaped here; an unescaped newline would
    otherwise split the comment line and corrupt the output. Help strings
    without those characters are emitted unchanged.
    """
    full_name = f"{_PREFIX}{name}"
    # Backslash first, so the backslash introduced for "\n" is not doubled.
    safe_help = help_text.replace("\\", "\\\\").replace("\n", "\\n")
    lines = [
        f"# HELP {full_name} {safe_help}",
        f"# TYPE {full_name} counter",
    ]
    lines.extend(
        f"{full_name}{_fmt_labels(labels)} {value}" for labels, value in samples
    )
    return lines
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _gauge(*, name: str, help_text: str, value: float) -> list[str]:
    """Build the exposition lines for one unlabelled gauge metric.

    Returns exactly three lines: ``# HELP``, ``# TYPE ... gauge`` and a
    single sample carrying ``value``. ``name`` is prefixed with ``_PREFIX``.

    The text format requires backslash and line feed to be escaped inside
    HELP text, so ``help_text`` is escaped here; an unescaped newline would
    otherwise split the comment line and corrupt the output. Help strings
    without those characters are emitted unchanged.
    """
    full_name = f"{_PREFIX}{name}"
    # Backslash first, so the backslash introduced for "\n" is not doubled.
    safe_help = help_text.replace("\\", "\\\\").replace("\n", "\\n")
    return [
        f"# HELP {full_name} {safe_help}",
        f"# TYPE {full_name} gauge",
        f"{full_name} {value}",
    ]
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _fmt_labels(pairs: tuple[tuple[str, str], ...]) -> str:
|
|
203
|
+
"""Render a tuple of (key, value) pairs as ``{k="v",k2="v2"}`` or ``""``.
|
|
204
|
+
|
|
205
|
+
Empty tuple → empty string (Prometheus permits unlabeled samples).
|
|
206
|
+
Values pass through :func:`_escape_label_value`; keys are trusted
|
|
207
|
+
(constructed internally).
|
|
208
|
+
"""
|
|
209
|
+
if not pairs:
|
|
210
|
+
return ""
|
|
211
|
+
body = ",".join(f'{k}="{_escape_label_value(v)}"' for k, v in pairs)
|
|
212
|
+
return "{" + body + "}"
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _escape_label_value(value: str) -> str:
|
|
216
|
+
r"""Escape a label value per the Prometheus text format spec.
|
|
217
|
+
|
|
218
|
+
From the spec: ``\`` → ``\\``, ``"`` → ``\"``, newline → ``\n``.
|
|
219
|
+
Everything else (including dashes, dots, colons) is literal.
|
|
220
|
+
"""
|
|
221
|
+
return value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|