dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/exporter.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"""exporter — drain the verdict journal outward to an observability backend (docs/266).
|
|
2
|
+
|
|
3
|
+
> **DOS records its own disbelief (docs/262) — but the record is a local island.**
|
|
4
|
+
> The verdict journal (`verdict_journal.py`) lands every adjudication a fleet makes
|
|
5
|
+
> as a `run_id`-correlated `VerdictEvent` in an append-only JSONL file, and `dos
|
|
6
|
+
> observe` reads it back. What was MISSING is the last hop: nothing ships that stream
|
|
7
|
+
> to where an operator's dashboards/alerts already live — Datadog, Grafana/Loki,
|
|
8
|
+
> Honeycomb, an OTLP collector, a syslog file. This seam is that outward connector.
|
|
9
|
+
> **No transport name appears in this module.**
|
|
10
|
+
|
|
11
|
+
Why this seam, not `notify`
|
|
12
|
+
===========================
|
|
13
|
+
|
|
14
|
+
`notify` (docs/225) pushes a *projection snapshot* (`decisions` = "what needs a human
|
|
15
|
+
now", `top` = "what's running now") to a transport — it answers "page me on the
|
|
16
|
+
current state." That is the wrong shape for observability:
|
|
17
|
+
|
|
18
|
+
* `notify` sends ONE rendered `Notification` from a point-in-time snapshot; an exporter
|
|
19
|
+
ships a STREAM of structured events — every `VerdictEvent`, counts intact — so a
|
|
20
|
+
time-series backend can chart "liveness STALLED rate over 24h."
|
|
21
|
+
* `notify`'s payload is human-facing prose; an exporter's is machine-facing structure
|
|
22
|
+
(a metric point, an OTLP log record, a JSON line) keyed for aggregation.
|
|
23
|
+
* `notify` is operator-triggered / cron-cadence; an exporter is DRAIN-shaped — it
|
|
24
|
+
follows the journal forward and flushes new events, the way a log shipper tails a file.
|
|
25
|
+
|
|
26
|
+
So the exporter is the kernel's FIFTH pure-protocol + by-name-resolver seam — after
|
|
27
|
+
`judges` (the JUDGE rung), `overlap_policy` (the disjointness scorer), `hook_dialect`
|
|
28
|
+
(the host-hook renderer), and `notify` (the projection-delivery side) — on a new axis:
|
|
29
|
+
the verdict *stream*, drained outward. The shape is byte-for-byte `notify.py`'s: a pure
|
|
30
|
+
Protocol + a frozen result type + an unshadowable built-in + a by-name resolver + a
|
|
31
|
+
fail-soft wrapper. Every real connector (which names a transport as code — an OTLP
|
|
32
|
+
shipper is inherently OTLP-specific) lives in a driver and registers through the
|
|
33
|
+
`dos.exporters` entry-point group.
|
|
34
|
+
|
|
35
|
+
The neutral record is already there — `VerdictEvent`
|
|
36
|
+
=====================================================
|
|
37
|
+
|
|
38
|
+
The exporter does NOT invent a payload type. `verdict_journal.VerdictEvent` is already
|
|
39
|
+
the transport-agnostic, byte-clean fact: its `detail` carries the
|
|
40
|
+
environment-authored evidence counts the verdict was computed from (tokens, work,
|
|
41
|
+
ages), NEVER the agent's narration (the docs/138 invariant the journal enforces). An
|
|
42
|
+
exporter takes a batch of `VerdictEvent`s and ships them. The hard part — a clean,
|
|
43
|
+
correlated, forgeable-narration-free record — is done (docs/262).
|
|
44
|
+
|
|
45
|
+
Failure direction = fail-SOFT
|
|
46
|
+
=============================
|
|
47
|
+
|
|
48
|
+
Observability must NEVER crash the thing it observes — the same rule as `notify` and
|
|
49
|
+
the journal itself (`record()` logs-and-drops). So `export_safely` converts any
|
|
50
|
+
transport raise into a non-exported `ExportResult`. A *resolve* of an unknown exporter
|
|
51
|
+
name still raises (operator config error, surfaced at config time, the
|
|
52
|
+
`resolve_notifier` rule); a *send* never does. A down collector, a bad endpoint, an
|
|
53
|
+
absent optional extra → a non-exported result, never an exception into the drain loop.
|
|
54
|
+
|
|
55
|
+
The advisory floor (docs/99)
|
|
56
|
+
============================
|
|
57
|
+
|
|
58
|
+
The exporter REPORTS; it never acts on the fleet. It reads the journal only — takes no
|
|
59
|
+
lease, mints no belief, stops no run, mutates no DOS state. It is a pure
|
|
60
|
+
read-of-the-journal → ship. The `decisions`/`observe`/`notify` read-only posture,
|
|
61
|
+
extended across the network boundary.
|
|
62
|
+
|
|
63
|
+
Pure-stdlib. The resolver is the unit-test surface; the only I/O in the whole spine is
|
|
64
|
+
inside a driver's `export` (and a `cmd_export` boundary that reads the journal).
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
from __future__ import annotations
|
|
68
|
+
|
|
69
|
+
import sys
|
|
70
|
+
from dataclasses import dataclass
|
|
71
|
+
from typing import TYPE_CHECKING, Protocol, Sequence, runtime_checkable
|
|
72
|
+
|
|
73
|
+
if TYPE_CHECKING: # pragma: no cover - typing only; never imported at runtime
|
|
74
|
+
from dos.verdict_journal import VerdictEvent
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# The result a driver returns — fail-soft: exported=0 is normal, never a raise.
|
|
79
|
+
# The `NotifyResult` analogue.
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass(frozen=True)
class ExportResult:
    """What an `export` call produced — handed back as a value, never raised.

    Fields:

    * `exported` — how many of the handed events the transport accepted. Zero
      is an ordinary outcome (a `null` sink, a `--dry-run`, a down collector).
    * `detail` — a one-line human-readable reason, e.g. `"wrote 12 lines to …"`,
      `"dry-run"`, `"null sink"` or `"error: …"`.
    * `cursor` — the highest journal `seq` shipped, as a string, so a later
      `--since <cursor>` can resume the forward drain without re-shipping.
      It is "" when no event was shipped.
    """

    exported: int
    detail: str = ""
    cursor: str = ""

    def to_dict(self) -> dict:
        """Return the three fields as a plain dict, in declaration order."""
        payload = {}
        for key in ("exported", "detail", "cursor"):
            payload[key] = getattr(self, key)
        return payload
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# The Exporter protocol + the unshadowable built-in null sink.
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@runtime_checkable
class Exporter(Protocol):
    """Structural interface for a transport that ships `VerdictEvent` batches outward.

    An implementation carries a `name` (the registered name, such as `"file"`,
    `"statsd"` or `"otlp"`) and an `export` method that takes a batch and
    answers with an `ExportResult`. How the batch is shipped — JSONL lines, one
    metric per event, POSTed OTLP records — is the driver's decision, not the
    kernel's. Implementations are expected to be fail-soft (not raise on a
    transport failure); `export_safely` is the outer safety net either way.
    """

    # The name this exporter is registered / resolved under.
    name: str

    def export(self, events: "Sequence[VerdictEvent]") -> ExportResult:
        """Ship `events` and report the outcome as an `ExportResult`."""
        ...
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class NullExporter:
    """The built-in no-op sink: it counts the batch and ships nothing.

    This is the default exporter, so a bare `dos export` is a safe no-op that
    says how many events WOULD have shipped and sends them nowhere. It is the
    always-resolvable baseline that cannot be shadowed (the `NullNotifier` /
    `AbstainJudge` / `prefix`-floor analogue); a host opts in to a real
    transport by naming one (`--to file`).

    The result still carries a `cursor` (the highest seq seen), so a `--since`
    drain advances even against `null` — handy for marking events as seen
    without shipping them.
    """

    name = "null"

    def export(self, events: "Sequence[VerdictEvent]") -> ExportResult:
        """Report `exported=0` plus the batch size and the highest-seq cursor."""
        seen = len(events)
        return ExportResult(
            exported=0,
            detail=f"null sink (no transport configured) — {seen} event(s) not shipped",
            cursor=_max_seq_cursor(events),
        )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Name -> class for the exporters shipped with the kernel. `resolve_exporter`
# checks this table before looking at any entry-point plugin.
_BUILT_IN_EXPORTERS: dict[str, type] = {"null": NullExporter}
|
|
149
|
+
|
|
150
|
+
# Entry-point group name handed to `importlib.metadata.entry_points` when
# discovering third-party exporter plugins.
EXPORTER_ENTRY_POINT_GROUP = "dos.exporters"
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _max_seq_cursor(events: "Sequence[VerdictEvent]") -> str:
    """Return the highest `seq` found in `events` as a string cursor.

    A small pure helper drivers reuse to fill `ExportResult.cursor`
    consistently: the resumable-drain offset is the journal's own `seq`, so
    "ship past the last shipped seq" is `--since <cursor>`.

    Args:
        events: the batch to scan; each item's `seq` attribute is read with
            `getattr`, so a missing or falsy `seq` counts as 0.

    Returns:
        "" when the batch is empty, otherwise `str(max_seq)`. A `seq` that
        cannot be converted to an int (a non-numeric string, an arbitrary
        object, NaN, or an infinite float) degrades to 0 instead of raising.
        The running maximum starts at 0, so a batch holding only
        non-positive/unusable seqs yields "0".
    """
    if not events:
        return ""
    highest = 0
    for event in events:
        try:
            seq = int(getattr(event, "seq", 0) or 0)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")) — still "a non-int seq", so it
            # degrades to 0 like the other unusable values.
            seq = 0
        highest = max(highest, seq)
    return str(highest)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
# Resolver + discovery — built-ins first, then the `dos.exporters` plugins. The
|
|
172
|
+
# `resolve_notifier` / `resolve_judge` shape; discovery I/O at the call boundary.
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _discover_entry_point_exporters(*, _stderr=None) -> list[tuple[str, "Exporter"]]:
    """List every `dos.exporters` plugin as `(name, exporter)`, ordered by name.

    An entry point whose loaded object is a class is instantiated with no
    arguments; any other object is used as-is. A plugin that fails to load (or
    to instantiate) is skipped with a one-line warning written to `_stderr`
    (default: `sys.stderr`) rather than crashing the caller — a broken
    third-party plugin is the operator's to fix, not a kernel fault. If the
    entry-point machinery itself is unavailable or fails, an empty list is
    returned.

    This performs entry-point I/O, so it is meant to be called at a call
    boundary rather than repeatedly on a hot path.
    """
    sink = sys.stderr if _stderr is None else _stderr
    try:
        from importlib.metadata import entry_points
    except Exception:  # pragma: no cover - importlib.metadata always present py3.11+
        return []
    try:
        candidates = entry_points(group=EXPORTER_ENTRY_POINT_GROUP)
    except TypeError:  # pragma: no cover - py<3.10 selectable-API fallback
        candidates = entry_points().get(EXPORTER_ENTRY_POINT_GROUP, [])  # type: ignore[attr-defined]
    except Exception:  # pragma: no cover - defensive: never let discovery crash a call
        return []

    loaded: list[tuple[str, Exporter]] = []
    for entry in sorted(candidates, key=lambda item: item.name):
        try:
            target = entry.load()
            instance = target() if isinstance(target, type) else target
        except Exception as exc:  # pragma: no cover - depends on third-party plugin
            print(
                f"warning: exporter plugin {entry.name!r} failed to load ({exc}); skipping",
                file=sink,
            )
        else:
            loaded.append((entry.name, instance))
    return loaded
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _accepted_kwargs(ctor: type, kwargs: dict) -> dict:
    """Narrow `kwargs` down to the parameters the signature of `ctor` declares.

    The export CLI builds one superset bag (path/host/port/endpoint/dry_run/
    root) and hands it to whichever transport was named, so it does not have
    to branch per driver. A constructor would raise on a keyword it does not
    declare, so only the declared names are passed through.

    Returns:
        * `kwargs` itself, unfiltered, when the signature declares a
          `**kwargs`-style (`VAR_KEYWORD`) parameter, or when no signature can
          be obtained at all;
        * otherwise a new dict holding only the entries whose key names a
          declared parameter.

    Pure introspection; no I/O. The `notify._accepted_kwargs` twin.
    """
    import inspect

    try:
        declared = inspect.signature(ctor).parameters
    except (TypeError, ValueError):  # pragma: no cover - builtins without a signature
        return kwargs

    for param in declared.values():
        if param.kind is param.VAR_KEYWORD:
            # The constructor absorbs arbitrary keywords: forward everything.
            return kwargs
    return {key: kwargs[key] for key in kwargs if key in declared}
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def resolve_exporter(name: str, *, _stderr=None, **kwargs) -> "Exporter":
    """Look up an exporter by name — built-ins first, then `dos.exporters` plugins.

    Built-ins (`null`) are consulted FIRST, so a plugin cannot shadow them (the
    trusted-fallback guarantee, as in `resolve_notifier`).

    Args:
        name: the exporter name to resolve (e.g. the value of `--to`).
        _stderr: stream for discovery warnings; defaults to `sys.stderr`.
        **kwargs: constructor arguments (e.g. `path=…`, `host=…`, `port=…`,
            `endpoint=…`, `dry_run=…`). They are filtered to what the
            constructor accepts, so the CLI can hand the same superset to any
            transport. A plugin that exposes a pre-built instance ignores them.

    Returns:
        The resolved exporter: a freshly constructed instance when the occupant
        is a class, otherwise the plugin's own pre-built object.

    Raises:
        ValueError: when the named plugin fails to load, or when the name is
            unknown (the message lists the known names). A typo'd `--to` is an
            operator error and never silently degrades to `null`, which would
            quietly drop every event.
    """
    if name in _BUILT_IN_EXPORTERS:
        builtin = _BUILT_IN_EXPORTERS[name]
        usable = _accepted_kwargs(builtin, kwargs)
        if usable:
            return builtin(**usable)
        return builtin()

    # Not a built-in: find the matching ENTRY POINT. A class is constructed
    # with the (filtered) kwargs; a pre-built instance is returned untouched.
    sink = sys.stderr if _stderr is None else _stderr
    try:
        from importlib.metadata import entry_points
    except Exception:  # pragma: no cover
        entry_points = None  # type: ignore[assignment]

    target: object | None = None
    if entry_points is not None:
        try:
            candidates = entry_points(group=EXPORTER_ENTRY_POINT_GROUP)
        except TypeError:  # pragma: no cover - py<3.10
            candidates = entry_points().get(EXPORTER_ENTRY_POINT_GROUP, [])  # type: ignore[attr-defined]
        except Exception:  # pragma: no cover
            candidates = []
        # Only the first entry point carrying this name is considered.
        match = next((ep for ep in candidates if ep.name == name), None)
        if match is not None:
            try:
                target = match.load()
            except Exception as e:  # pragma: no cover - third-party
                raise ValueError(
                    f"exporter {name!r} failed to load: {e}"
                ) from e

    if target is not None:
        if not isinstance(target, type):
            return target  # a pre-built instance ignores kwargs
        return target(**_accepted_kwargs(target, kwargs))  # type: ignore[return-value]

    plugin_names = {n for n, _ in _discover_entry_point_exporters(_stderr=sink)}
    known = sorted(plugin_names.union(_BUILT_IN_EXPORTERS))
    raise ValueError(f"unknown exporter {name!r}; known: {', '.join(known)}")
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def active_exporters(*, _stderr=None) -> list[tuple[str, "Exporter"]]:
    """Return every resolvable exporter as `(name, exporter)` pairs.

    Built-ins come first (each instantiated with no arguments), followed by the
    discovered entry-point plugins — the order `dos doctor` would list. This
    triggers entry-point discovery (I/O), so it is a call-boundary helper and
    is not meant to be called from inside an adapter (the `active_notifiers`
    rule). `_stderr` is passed through to the discovery step.
    """
    listing: list[tuple[str, Exporter]] = []
    for builtin_name, builtin_cls in _BUILT_IN_EXPORTERS.items():
        listing.append((builtin_name, builtin_cls()))
    listing.extend(_discover_entry_point_exporters(_stderr=_stderr))
    return listing
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def active_exporter_names(*, _stderr=None) -> list[str]:
    """Return just the names of the active exporters, built-ins before plugins."""
    names: list[str] = []
    for exporter_name, _exporter in active_exporters(_stderr=_stderr):
        names.append(exporter_name)
    return names
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# ---------------------------------------------------------------------------
|
|
297
|
+
# export_safely — the fail-soft wrapper. An export NEVER crashes the drain.
|
|
298
|
+
# ---------------------------------------------------------------------------
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def export_safely(exporter: "Exporter", events: "Sequence[VerdictEvent]") -> ExportResult:
    """Run `exporter.export(events)` and never let a failure escape.

    This is the fail-soft floor (`notify.send_safely`, re-aimed at the verdict
    stream): observability must not take down what it observes, so a transport
    that raises — a down collector, a bad endpoint, a buggy plugin — does not
    propagate into the drain loop.

    Returns:
        * the exporter's own `ExportResult` when the call succeeds and returns
          one;
        * `ExportResult(exported=0, detail="error: …")` when the call raises an
          `Exception`;
        * `ExportResult(exported=0, detail="exporter returned a
          non-ExportResult")` when the call returns anything else.

    Contrast `resolve_exporter`, which DOES raise on an unknown name: that is a
    config-time operator error, surfaced before any drain.
    """
    try:
        outcome = exporter.export(events)
    except Exception as exc:  # noqa: BLE001 - observability must not crash the observed
        return ExportResult(exported=0, detail=f"error: {exc}")
    if not isinstance(outcome, ExportResult):
        # A misbehaving occupant handed back some other shape; report a soft
        # failure rather than trusting an unknown object downstream.
        return ExportResult(exported=0, detail="exporter returned a non-ExportResult")
    return outcome
|
dos/firing_label.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""firing-label — turn each detector FIRING into a labeled (signal, ground-truth) point (docs/179).
|
|
2
|
+
|
|
3
|
+
> **The kernel already mints one git-authored label per phase (`oracle.is_shipped`).
|
|
4
|
+
> This module mints a SECOND, different kind of label for free: it joins a detector
|
|
5
|
+
> firing (an env/agent-authored event) to the run's git-minted outcome (a fact the
|
|
6
|
+
> judged agent did not author), producing a `(signal, was-it-real)` point the
|
|
7
|
+
> detector line is scored on — lift + false-alarm. It is the one fold in the
|
|
8
|
+
> docs/179 set that mints NEW ground truth, because it joins two independently-authored
|
|
9
|
+
> facts that were never compared before.**
|
|
10
|
+
|
|
11
|
+
The detector line (`tool_stream` / `terminal_error` / `dangling` / `precursor`,
|
|
12
|
+
docs/145/158/173) is scored by LIFT and FALSE-ALARM rate, not recall (a 76%-fail
|
|
13
|
+
bench makes recall meaningless — see the docs/159 naive-baseline result). Those two
|
|
14
|
+
numbers need LABELED firings: each time a detector fired, was it a true catch or a
|
|
15
|
+
false alarm? Today those labels come from hand-curated offline benchmark replays
|
|
16
|
+
(docs/158-163, 174, 177) — a tiny, expensive, static set. This module turns every
|
|
17
|
+
live run into a small batch of such labels, drawn from data the kernel already has.
|
|
18
|
+
|
|
19
|
+
The fold, stated plainly
|
|
20
|
+
=========================
|
|
21
|
+
|
|
22
|
+
A `DetectorFiring` is "detector D fired signal S at step N of run R." Its label is
|
|
23
|
+
the run's GIT-MINTED outcome read off `trace.TraceFrame` — never the agent's
|
|
24
|
+
`claimed` self-report (the docs/138 byte-author invariant; `StepRow.claimed_sha`
|
|
25
|
+
is shown by `trace` but is NEVER read here). `label_firings` joins each firing to
|
|
26
|
+
its run's frame and emits one `LabeledPoint` with a closed `LabelOutcome`:
|
|
27
|
+
|
|
28
|
+
* **TRUE_POSITIVE** — the detector fired AND the run's git-minted outcome
|
|
29
|
+
confirms the no-progress the detector accused: the run has
|
|
30
|
+
a non-empty residual (declared steps the kernel never
|
|
31
|
+
verified) and produced no commits since its start_sha. The
|
|
32
|
+
stall was real; the run did not recover. A true catch.
|
|
33
|
+
* **FALSE_ALARM** — the detector fired BUT the run's git-minted outcome shows
|
|
34
|
+
progress: the run verified at least one declared step OR
|
|
35
|
+
landed at least one commit since start. The loop was not
|
|
36
|
+
terminally stuck (a legitimate poll, an eventual-consistency
|
|
37
|
+
wait that resolved, a stall the agent recovered from). The
|
|
38
|
+
false-alarm count the detector is penalized on.
|
|
39
|
+
* **UNVERIFIABLE** — the firing joined to a run, but the run carries NO
|
|
40
|
+
git-minted ground truth to judge against: no INTENT
|
|
41
|
+
declared (nothing to have a residual of) and no commits.
|
|
42
|
+
Refuse-don't-guess — we will NOT call an unjudgeable firing
|
|
43
|
+
a TP or an FP (the §5a optimism trap, inverted).
|
|
44
|
+
* **BROKEN_LINK** — the firing carries no `run_id` (a pre-docs/179 record, or a
|
|
45
|
+
hook fired outside a DOS spine), so it cannot be joined to a
|
|
46
|
+
frame at all. Counted, never guessed onto a run by time (the
|
|
47
|
+
docs/118/137 "fail toward no-match" rule). The honest
|
|
48
|
+
coverage tally.
|
|
49
|
+
|
|
50
|
+
The ground-truth rule is deliberately RUN-TERMINAL, and its bias is named, not
|
|
51
|
+
buried: it judges a firing against the run's eventual verified-vs-declared state,
|
|
52
|
+
so it is most meaningful on runs that reached a terminal verdict (a long-lived run
|
|
53
|
+
still in flight reads as UNVERIFIABLE until it declares/verifies/commits). That
|
|
54
|
+
selection bias toward terminal runs is reported (`LabelSummary.unverifiable`), not
|
|
55
|
+
hidden — the docs/159 "no silent caps" discipline. A future phase can sharpen TP/FP
|
|
56
|
+
with a TIMESTAMP join (did progress land BEFORE or AFTER the firing's step), using
|
|
57
|
+
the `ts` the Phase-0 record already stamps; v1 is the conservative terminal rule.
|
|
58
|
+
|
|
59
|
+
Why the multiplier is honest (1-3×, not 5-15×)
|
|
60
|
+
==============================================
|
|
61
|
+
|
|
62
|
+
A single REPEATING→STALLED run on the SAME stuck step is ONE firing, not many: the
|
|
63
|
+
Phase-0 sensor stamps `verdict_state` on a record only when it fired, and a run of
|
|
64
|
+
identical steps is the same `(tool, args, result)` triple — `dedupe_firings`
|
|
65
|
+
collapses consecutive same-`(run_id, signal, args/result identity)` firings to one
|
|
66
|
+
labeled point. So the audited `8bd8c736` read-loop (22 identical reads) mints
|
|
67
|
+
EXACTLY ONE `LabeledPoint`, not 22 — re-counting one stall as 22 labels would be
|
|
68
|
+
the consistency-not-grounding sin (one env fact counted many times is fake data).
|
|
69
|
+
The honest yield is ~1 label per DISTINCT detector-fired step that has a verified
|
|
70
|
+
side — typically 1-3 per run. That is still a real, free gain over the 1-label/phase
|
|
71
|
+
baseline, and every point has clean provenance.
|
|
72
|
+
|
|
73
|
+
⚓ Kernel discipline (the litmus): PURE Layer-1 leaf — `label_firings`/`dedupe_firings`
|
|
74
|
+
are state-in / frozen-verdict-out, zero I/O (the firings + the `TraceFrame` are
|
|
75
|
+
gathered at the caller boundary, exactly as `liveness.classify` takes a pre-read
|
|
76
|
+
`ProgressEvidence`). It imports only `trace` (for the `TraceFrame`/`StepRow` types it
|
|
77
|
+
folds) + stdlib, names no host/driver, carries no policy. The label is read off the
|
|
78
|
+
git-minted columns of `TraceFrame`; `claimed_sha` is never consulted.
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
from __future__ import annotations
|
|
82
|
+
|
|
83
|
+
import enum
|
|
84
|
+
from dataclasses import dataclass, field
|
|
85
|
+
from typing import Iterable, Optional
|
|
86
|
+
|
|
87
|
+
from dos.trace import TraceFrame
|
|
88
|
+
|
|
89
|
+
# The durable_schema floor (docs/116 §6): a LabeledPoint is a record the detector
|
|
90
|
+
# eval reads, so it carries a schema tag. Additive fields do not bump it.
|
|
91
|
+
# Integer schema tag for labeled-point records; currently version 1.
FIRING_LABEL_SCHEMA: int = 1
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
# The firing — one detector event, the agent/env-authored INPUT to the join.
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
@dataclass(frozen=True)
class DetectorFiring:
    """A single detector event: `detector` fired `signal` at step `step_index`.

    This is the INPUT side of the join. Every field is collected at the
    boundary from the durable firing record the Phase-0 sensor stamps
    (`posttool_sensor._step_entry`'s `run_id`/`step_index`/`verdict_state`), or
    from another detector's equivalent record. None of them is a ground-truth
    LABEL — they record what the detector SAID; the label comes from the run's
    git-minted outcome, joined in `label_firings`.

    Fields:

    * `run_id`     — the spine id tying this firing to its run's `TraceFrame`.
                     Empty/None means `BROKEN_LINK` (it cannot be joined and is
                     never guessed onto a run by time).
    * `detector`   — which detector fired ("tool_stream", "terminal_error", …).
    * `signal`     — the detector's verdict value ("REPEATING"/"STALLED"/…).
    * `step_index` — 0-based ordinal within the run's stream where it fired
                     (the durable position, used for dedup and a future
                     timestamp join); -1 when not supplied.
    * `identity`   — optional opaque repeat-identity (e.g. the env-authored
                     `result_digest`) so two firings on the SAME stuck step
                     collapse to one labeled point. When empty, the dedup key
                     falls back to the step index.
    """

    run_id: str
    detector: str
    signal: str
    step_index: int = -1
    identity: str = ""

    def _dedup_key(self) -> tuple[str, str, str, str]:
        """Return the tuple two firings must share to count as the same firing.

        The last element is the repeat-identity when one is set (so a 22-read
        stall yields one key regardless of step_index); otherwise it is
        `"@<step_index>"`, which keeps distinct steps distinct.
        """
        if self.identity:
            marker = self.identity
        else:
            marker = f"@{self.step_index}"
        return (self.run_id, self.detector, self.signal, marker)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# ---------------------------------------------------------------------------
|
|
133
|
+
# The label — the closed OUTCOME vocabulary + the labeled point.
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
class LabelOutcome(str, enum.Enum):
    """The closed outcome of joining a firing to its run's git-minted ground truth.

    A label, never an optimism: `UNVERIFIABLE`/`BROKEN_LINK` are first-class refusals
    (we decline to call an unjudgeable firing a catch), the same fail-toward-no-match
    posture the kernel takes everywhere it lacks evidence.

    Members mix in `str`, so each one compares equal to its own name as a plain
    string and `.value` is that same string.
    """

    # The run had ground truth but showed no verified step and no commit
    # (the final rung of `label_one`).
    TRUE_POSITIVE = "TRUE_POSITIVE"
    # The run verified at least one step or landed at least one commit.
    FALSE_ALARM = "FALSE_ALARM"
    # The run declared no intent and landed no commits — nothing to judge against.
    UNVERIFIABLE = "UNVERIFIABLE"
    # The firing has no run_id, or no frame was found for its run.
    BROKEN_LINK = "BROKEN_LINK"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass(frozen=True)
class LabeledPoint:
    """A firing paired with the git-minted verdict on it — one calibration row.

    Fields:
      * `firing`       — the `DetectorFiring` being judged (what the detector said).
      * `outcome`      — the `LabelOutcome` assigned to it.
      * `reason`       — human-readable provenance: which labeling rung produced
                         the outcome, and on what numbers.
      * `ground_truth` — compact evidence behind the label (verified / declared
                         step counts, residual size, commit count) so the eval can
                         be audited. Left empty when no frame was consulted.
      * `schema`       — durable-record schema tag; defaults to
                         `FIRING_LABEL_SCHEMA`.
    """

    firing: DetectorFiring
    outcome: LabelOutcome
    reason: str = ""
    ground_truth: dict = field(default_factory=dict)
    schema: int = FIRING_LABEL_SCHEMA

    def to_dict(self) -> dict:
        """Flatten the point into a plain dict of primitives.

        The firing's `run_id`, `detector`, `signal` and `step_index` are hoisted to
        the top level, `outcome` is emitted as its string value, and `ground_truth`
        is shallow-copied so the caller cannot mutate this point's evidence through
        the result. The firing's `identity` is not part of the emitted record.
        """
        fired = self.firing
        return dict(
            schema=self.schema,
            run_id=fired.run_id,
            detector=fired.detector,
            signal=fired.signal,
            step_index=fired.step_index,
            outcome=self.outcome.value,
            reason=self.reason,
            ground_truth=dict(self.ground_truth),
        )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@dataclass(frozen=True)
class LabelSummary:
    """Confusion-grid roll-up of a batch of `LabeledPoint`s — the eval headline.

    Counts are recomputed from `points` on every access; nothing is cached.
    Both ratios refuse to invent a number for an empty denominator:

      * `false_alarm_rate` — FP / (TP + FP), i.e. over JUDGEABLE points only
        (an UNVERIFIABLE or BROKEN_LINK firing is neither a catch nor a false
        alarm); `None` when nothing was judgeable.
      * `coverage` — judgeable / total, the share of firings that received a real
        label at all (the selection bias is reported, not hidden); `None` when
        there are no points.
    """

    points: tuple[LabeledPoint, ...]

    def _count(self, outcome: LabelOutcome) -> int:
        """Number of points whose `outcome` is the given enum member (by identity)."""
        return len([pt for pt in self.points if pt.outcome is outcome])

    @property
    def true_positives(self) -> int:
        """How many points are labeled `TRUE_POSITIVE`."""
        return self._count(LabelOutcome.TRUE_POSITIVE)

    @property
    def false_alarms(self) -> int:
        """How many points are labeled `FALSE_ALARM`."""
        return self._count(LabelOutcome.FALSE_ALARM)

    @property
    def unverifiable(self) -> int:
        """How many points are labeled `UNVERIFIABLE`."""
        return self._count(LabelOutcome.UNVERIFIABLE)

    @property
    def broken_links(self) -> int:
        """How many points are labeled `BROKEN_LINK`."""
        return self._count(LabelOutcome.BROKEN_LINK)

    @property
    def judgeable(self) -> int:
        """TP + FP — the points that carry a real label; the rates' denominator."""
        return self.true_positives + self.false_alarms

    @property
    def false_alarm_rate(self) -> Optional[float]:
        """FP / (TP + FP), or `None` when no point was judgeable (no 0/0)."""
        denominator = self.judgeable
        if not denominator:
            return None
        return self.false_alarms / denominator

    @property
    def coverage(self) -> Optional[float]:
        """Judgeable / total points, or `None` when there are no points."""
        if not self.points:
            return None
        return self.judgeable / len(self.points)

    def to_dict(self) -> dict:
        """Return the total plus every count and ratio as a plain dict."""
        summary: dict = {"total": len(self.points)}
        # Each remaining key is served by the property of the same name.
        for name in (
            "true_positives",
            "false_alarms",
            "unverifiable",
            "broken_links",
            "judgeable",
            "false_alarm_rate",
            "coverage",
        ):
            summary[name] = getattr(self, name)
        return summary
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# ---------------------------------------------------------------------------
|
|
238
|
+
# The pure fold — firings + a TraceFrame in, labeled points out. No I/O.
|
|
239
|
+
# ---------------------------------------------------------------------------
|
|
240
|
+
def dedupe_firings(firings: Iterable[DetectorFiring]) -> tuple[DetectorFiring, ...]:
    """Keep only the FIRST firing for each `_dedup_key`, in input order. PURE.

    Firings whose `_dedup_key()` values are equal — same run, detector, signal and
    repeat-identity (or step index when no identity is set) — are treated as one
    event observed more than once, e.g. the many reads of a single loop. Only the
    earliest is kept, so one stall contributes one labeled point rather than N
    (the honest-multiplier guard). Firings that differ in any key component,
    including `signal`, are all preserved.

    The input is consumed exactly once, so a one-shot iterator is acceptable.
    """
    first_by_key: dict[tuple[str, str, str, str], DetectorFiring] = {}
    for firing in firings:
        # setdefault never overwrites, so the earliest firing per key wins, and
        # dict insertion order gives the order-preserving result.
        first_by_key.setdefault(firing._dedup_key(), firing)
    return tuple(first_by_key.values())
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _ground_truth(trace: TraceFrame) -> dict:
    """Summarize the evidence a label rests on as a small dict of counts. PURE.

    Reads only `has_intent`, `steps` (each step's `state`), `residual` and
    `commits` from the frame; the `claimed_sha` column is never read (the
    byte-author invariant).

    Returns a dict with, in order: `has_intent` (bool), `verified_steps` (steps
    whose state equals "VERIFIED"), `declared_steps` (all steps), `residual`
    (its length) and `commits_since_start` (number of commits).
    """
    n_verified = 0
    for step in trace.steps:
        if step.state == "VERIFIED":
            n_verified += 1
    return dict(
        has_intent=bool(trace.has_intent),
        verified_steps=n_verified,
        declared_steps=len(trace.steps),
        residual=len(trace.residual),
        commits_since_start=len(trace.commits),
    )
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def label_one(firing: DetectorFiring, trace: Optional[TraceFrame]) -> LabeledPoint:
    """Assign a `LabelOutcome` to one firing from its run's git-minted state. PURE.

    The rungs are tried in order, refusing before guessing:

      1. The firing has no `run_id`                → BROKEN_LINK (cannot join).
      2. `trace` is None or `trace.found` is falsy → BROKEN_LINK (no surface).
      3. No intent AND zero commits                → UNVERIFIABLE (nothing to
         judge the firing against).
      4. A verified step OR a commit exists        → FALSE_ALARM (the run
         demonstrably advanced, so it was not terminally stuck).
      5. Anything else                             → TRUE_POSITIVE (the run had
         ground truth, verified nothing and committed nothing).

    Rungs 3-5 look only at `trace.has_intent` and the counts produced by
    `_ground_truth`, never at anything the agent claimed. Points from rungs 3-5
    carry that evidence dict as `ground_truth`; BROKEN_LINK points carry none.

    NOTE(review): the residual count is only reported in the TRUE_POSITIVE reason;
    it is never tested. A frame with intent but nothing outstanding would still land
    on rung 5 — confirm that is intended.
    """
    if not firing.run_id:
        return LabeledPoint(
            firing,
            LabelOutcome.BROKEN_LINK,
            reason=(
                "firing carries no run_id — cannot join to a run "
                "(pre-docs/179 record or non-spine hook)"
            ),
        )

    if trace is None or not trace.found:
        missing_surface = (
            f"no surface found for run {firing.run_id} "
            "(no run.json / intent ledger / WAL event)"
        )
        return LabeledPoint(firing, LabelOutcome.BROKEN_LINK, reason=missing_surface)

    evidence = _ground_truth(trace)
    n_verified = evidence["verified_steps"]
    n_commits = evidence["commits_since_start"]
    n_residual = evidence["residual"]

    if not trace.has_intent and n_commits == 0:
        verdict = LabelOutcome.UNVERIFIABLE
        why = (
            "run declared no intent and landed no commits — no git-minted "
            "ground truth to judge the firing against (refuse, don't guess)"
        )
    elif n_verified > 0 or n_commits > 0:
        verdict = LabelOutcome.FALSE_ALARM
        why = (
            f"run made git-minted progress ({n_verified} step(s) verified, "
            f"{n_commits} commit(s) since start) — the loop was not terminally "
            "stuck; the firing was a false alarm"
        )
    else:
        verdict = LabelOutcome.TRUE_POSITIVE
        why = (
            "run verified 0 of its declared steps and landed 0 commits; "
            f"{n_residual} step(s) remain unverified — the no-progress the detector "
            "accused is confirmed by git (a true catch)"
        )
    return LabeledPoint(firing, verdict, reason=why, ground_truth=evidence)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def label_firings(
    firings: Iterable[DetectorFiring],
    frame_for,
    *,
    dedupe: bool = True,
) -> tuple[LabeledPoint, ...]:
    """Label every firing in a batch via `label_one`. PURE apart from `frame_for`.

    `frame_for` is a caller-supplied callable `run_id -> TraceFrame | None`; it is
    where the frame-building I/O lives, keeping this fold state-in / verdict-out.
    It is invoked at most once per distinct truthy `run_id` (results are memoized
    for the duration of the call) and never for a firing without a `run_id`.
    Returning None marks the run as unknown, which `label_one` turns into
    BROKEN_LINK.

    `dedupe` (default True) first collapses same-firing duplicates with
    `dedupe_firings`; pass False only to inspect the raw firings.

    Returns one `LabeledPoint` per (deduped) firing, in order. Wrap the result in
    `LabelSummary` for the confusion grid.
    """
    batch = tuple(firings) if not dedupe else dedupe_firings(firings)
    frames: dict[str, Optional[TraceFrame]] = {}

    def _frame(run_id):
        # Resolve each run once; a falsy run_id is never looked up and yields None.
        if run_id and run_id not in frames:
            frames[run_id] = frame_for(run_id)
        return frames.get(run_id)

    return tuple(label_one(firing, _frame(firing.run_id)) for firing in batch)
|