dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/picker_oracle.py ADDED
@@ -0,0 +1,1037 @@
1
+ """Picker oracle — provable ground truth for /next-up picker quality.
2
+
3
+ Background
4
+ ==========
5
+
6
+ `/next-up` and `/dispatch` emit `verdict=LIVE | WEDGE | DRAIN` per run.
7
+ That single token conflates several outcomes:
8
+
9
+ * **LIVE** — picker found work, child2 shipped it
10
+ * **NO-PICK true** — picker found no work AND none existed (correct DRAIN)
11
+ * **NO-PICK fake** — picker found no work BUT work existed (picker bug)
12
+ sub-causes: stale-claim ghost, regex false-pos,
13
+ misroute, renderer regression, unverified soak
14
+
15
+ The dangerous case is NO-PICK fake. Today it's invisible: every WEDGE looks
16
+ the same to the loop's stop signal, but the cost is real (~$2-5 per WEDGE
17
+ iter × 5-10 iters/day × ~$0 ships).
18
+
19
+ This module reconstructs ground truth for each historical dispatch:
20
+
21
+ 1. Read the picker's self-reported verdict envelope
22
+ (`output/next-up/.verdict-<tag>.json`)
23
+ 2. Read the lane state at run time (`execution-state.yaml` snapshot via
24
+ git, or current state for recent runs)
25
+ 3. Cross-check the picker's stated cause against on-disk facts:
26
+ * STALE_CLAIM claimed → is the colliding claim actually stale?
27
+ * OPERATOR_GATE claimed → is the soak deadline really open?
28
+ * MISROUTE claimed → does the pick really belong elsewhere?
29
+ * TRUE_DRAIN claimed → are all in-scope plans really `remaining:[]`?
30
+ 4. Emit a typed `PickerVerdict` with `oracle_disagrees: bool` flag.
31
+
32
+ The flag is what `/replan` consumes to route picker-bug findings.
33
+
34
+ Falsifiable metrics
35
+ -------------------
36
+
37
+ For any time window:
38
+
39
+ precision = picks_shipped / picks_emitted
40
+ recall = (oracle_live_picks - missed_picks) / oracle_live_picks
41
+ cost_per_ship = sum(dispatch_cost) / picks_shipped
42
+
43
+ `recall` is the hidden metric — today there's no number. Once the oracle
44
+ exists, it's a CI invariant.
45
+
46
+ CLI
47
+ ===
48
+
49
+ python scripts/picker_oracle.py classify <run_ts>
50
+ python scripts/picker_oracle.py sweep --since 7d
51
+ python scripts/picker_oracle.py report --window 24h
52
+ python scripts/picker_oracle.py check --min-recall 0.7
53
+
54
+ `classify` is the per-run primitive; `sweep` is the idempotent bulk driver
55
+ (skips runs already classified); `report` emits the human-readable audit
56
+ markdown; `check` is the CI gate.
57
+
58
+ Repeatability
59
+ =============
60
+
61
+ Outputs live at `docs/_picker_audits/<window>/oracle.jsonl` (append-only,
62
+ keyed by `(run_ts, child_idx)` — re-run is a no-op). The audit report at
63
+ `docs/_picker_audits/<window>/picker_recall_audit.md`.
64
+
65
+ Wired callers (post-ship):
66
+ * `/dispatch-loop` archive step — calls `classify` per iter
67
+ * `/replan` sweep — calls `sweep --since 24h`; routes oracle_disagrees=true
68
+ rows to `findings-followup-queue.md` as picker-bug findings
69
+ """
70
+ from __future__ import annotations
71
+
72
+ import argparse
73
+ import dataclasses
74
+ import datetime as dt
75
+ import enum
76
+ import json
77
+ import re
78
+ import sys
79
+ from pathlib import Path
80
+
81
+ import os
82
+
83
+ # The closed `reason_class` vocabulary lives in `dos.wedge_reason` — the single
84
+ # source of truth the producer emits against. Importing it here keeps the
85
+ # oracle's recognition set in lockstep so a new WEDGE token is verifiable the
86
+ # moment it is emittable. (DOS makes this a clean package import; the origin
87
+ # repo needed a `sys.path.insert` because scripts ran as bare files.)
88
+ from dos import wedge_reason
89
+ from dos import config as _config
90
+
91
+ # Path coupling resolves against the ACTIVE WORKSPACE (separation refactor).
92
+ # The pure `classify(...)` takes `state` explicitly; only the I/O loaders +
93
+ # the sweep/report commands read these.
94
+
95
+
def _state_path() -> Path:
    """Locate the execution-state file for the active workspace.

    An environment override wins: `JOB_FANOUT_STATE_PATH` is consulted first,
    then `DISPATCH_STATE_PATH`; an unset or empty variable is ignored. With no
    override the path comes from the active config.
    """
    for var_name in ("JOB_FANOUT_STATE_PATH", "DISPATCH_STATE_PATH"):
        override = os.environ.get(var_name)
        if override:
            return Path(override)
    return _config.active().paths.execution_state
99
+
100
+
def _chained_runs() -> Path:
    """Return the `chained_runs` path of the active workspace config."""
    workspace_paths = _config.active().paths
    return workspace_paths.chained_runs
103
+
104
+
def _next_up_dir() -> Path:
    """Return the `next_packets` path of the active workspace config."""
    workspace_paths = _config.active().paths
    return workspace_paths.next_packets
107
+
108
+
def _audits_dir() -> Path:
    """Return the `picker_audits` path of the active workspace config."""
    workspace_paths = _config.active().paths
    return workspace_paths.picker_audits
111
+
112
+
113
+ SCHEMA_VERSION = 1
114
+
115
+
116
+ # ---------------------------------------------------------------------------
117
+ # Verdict taxonomy
118
+ # ---------------------------------------------------------------------------
119
+
120
+
class PickerOutcome(str, enum.Enum):
    """Top-level outcome of one picker run.

    Members mix in `str`; each member's value equals its name.
    """

    # Picker emitted >=1 pick AND >=1 shipped.
    LIVE = "LIVE"
    # Picker emitted picks, none shipped (downstream issue).
    LIVE_DIRTY = "LIVE_DIRTY"
    # Picker emitted no usable packet.
    NO_PICK = "NO_PICK"
    # Envelope missing/malformed.
    UNKNOWN = "UNKNOWN"
126
+
127
+
class NoPickCause(str, enum.Enum):
    """Canonical reason a run produced no pick.

    Members mix in `str`; each member's value equals its name.
    """

    # All in-scope plans `remaining:[]`, no findings.
    TRUE_DRAIN = "TRUE_DRAIN"
    # Soak open / operator-attended / env-flag-gated.
    OPERATOR_GATE = "OPERATOR_GATE"
    # Collision with claim > stale_threshold_h old.
    STALE_CLAIM = "STALE_CLAIM"
    # Finding routed to wrong lane.
    MISROUTE = "MISROUTE"
    # Scope-filter regex false-positive.
    REGEX_FP = "REGEX_FP"
    # Packet rendered but soft-claim skipped.
    RENDERER_BUG = "RENDERER_BUG"
    # Legacy envelope, no reason_class.
    UNCLASSIFIED = "UNCLASSIFIED"

    @property
    def is_picker_bug(self) -> bool:
        """True for the causes this module treats as picker-bug categories.

        Those are STALE_CLAIM, MISROUTE, REGEX_FP and RENDERER_BUG; every other
        member yields False.
        """
        cls = type(self)
        bug_causes = (
            cls.STALE_CLAIM,
            cls.MISROUTE,
            cls.REGEX_FP,
            cls.RENDERER_BUG,
        )
        return self in bug_causes
146
+
147
+
148
+ # Mapping from the picker's `reason_class` token to our canonical NoPickCause.
149
+ #
150
+ # The LANE_* tokens come from the CLOSED `wedge_reason.WedgeReason` set — the
151
+ # single source of truth the producer (`next_up_render`) now emits against. Before
152
+ # 2026-05-31 these were LLM-prose-only and NONE of them were in this map, so every
153
+ # WEDGE classified `UNCLASSIFIED` ("cannot verify") — the oracle's blind spot that
154
+ # made `recall_proxy=1.0`/`oracle_disagrees=0` meaningless (it could not classify
155
+ # the rows that mattered, [[feedback_wedge_verdict_is_llm_prose_not_code]]). We
156
+ # derive the LANE_* half of the map FROM `wedge_reason.REASON_TO_CATEGORY` so the
157
+ # two can never drift again — a new WedgeReason member is recognised here the moment
158
+ # it is added there. The legacy free-form aliases below stay for older envelopes.
159
+ # Unknown tokens still fall through to UNCLASSIFIED so the oracle is forward-compat.
160
+ _LEGACY_REASON_ALIASES: dict[str, NoPickCause] = {
161
+ "MIS_ROUTED_FINDING": NoPickCause.MISROUTE,
162
+ "MISROUTED": NoPickCause.MISROUTE,
163
+ "STALE_CLAIM_COLLISION": NoPickCause.STALE_CLAIM,
164
+ "FOREIGN_COLLISION": NoPickCause.STALE_CLAIM,
165
+ "OPERATOR_GATED": NoPickCause.OPERATOR_GATE,
166
+ "SOAK_OPEN": NoPickCause.OPERATOR_GATE,
167
+ "DRAIN": NoPickCause.TRUE_DRAIN,
168
+ "REGEX_FP": NoPickCause.REGEX_FP,
169
+ "RENDERER_SKIP": NoPickCause.RENDERER_BUG,
170
+ }
171
+
172
+
def _build_reason_class_map() -> dict[str, NoPickCause]:
    """Build the token -> `NoPickCause` lookup table.

    Every `(reason, category)` pair in `wedge_reason.REASON_TO_CATEGORY`
    contributes `reason.value -> NoPickCause[category.value]`, i.e. the category's
    value string is used as a `NoPickCause` member NAME. The legacy aliases are
    merged in afterwards, so an alias would win over a derived entry with the
    same key.
    """
    derived = {
        reason.value: NoPickCause[category.value]
        for reason, category in wedge_reason.REASON_TO_CATEGORY.items()
    }
    return {**derived, **_LEGACY_REASON_ALIASES}
186
+
187
+
188
+ REASON_CLASS_MAP: dict[str, NoPickCause] = _build_reason_class_map()
189
+
190
+
191
+ # The recognizer-ladder rungs `resolve_cause_with_source` reports — the
192
+ # reason-class analogue of `verify`'s `source` (registry/grep/none). A cause that
193
+ # came from a fuzzy MORPHOLOGICAL match must NOT masquerade as one from an EXACT
194
+ # declared token: the cross-check downstream is weaker evidence, and the decisions
195
+ # queue / `oracle_disagrees` routing must be able to see the rung (`docs/105` §3.1).
196
+ CAUSE_SOURCE_EXACT = "exact" # frozen map or workspace ReasonRegistry
197
+ CAUSE_SOURCE_MORPHOLOGICAL = "morphological" # rung-2 substring recognizer
198
+ CAUSE_SOURCE_NONE = "none" # UNCLASSIFIED floor — nothing recognized
199
+
200
+
def resolve_cause_with_source(
    reason_class: str | None,
) -> tuple[NoPickCause, str, str]:
    """Resolve a `reason_class` token to `(cause, cause_source, matched)`.

    The token is upper-cased and stripped, then tried against each rung in
    order; the first rung that answers wins and names itself:

    1. **exact** — the frozen `REASON_CLASS_MAP`, then the active config's
       `reasons.category_for(token)` (any answer other than the string
       `'UNCLASSIFIED'` counts). `matched` is the normalised token.
    2. **morphological** — the active config's `reason_morphology.classify(token)`,
       which yields `None` or a `(category, matched)` pair. `matched` is whatever
       the ruleset reports as having fired.
    3. **none** — `(UNCLASSIFIED, "none", "")`. Also returned for a falsy
       `reason_class`.

    Any exception raised while consulting the config in rung 1b or rung 2
    (including an unrecognised category value) is swallowed and resolution
    falls through to the next rung, so this function does not raise on a
    missing or broken config.
    """
    floor = (NoPickCause.UNCLASSIFIED, CAUSE_SOURCE_NONE, "")
    if not reason_class:
        return floor
    token = reason_class.upper().strip()

    # Rung 1a — frozen map (built-ins + aliases).
    builtin = REASON_CLASS_MAP.get(token)
    if builtin is not None:
        return builtin, CAUSE_SOURCE_EXACT, token

    # Rung 1b — workspace ReasonRegistry (a host-declared exact token).
    try:
        declared = _config.active().reasons.category_for(token)
        if declared != "UNCLASSIFIED":
            return NoPickCause(declared), CAUSE_SOURCE_EXACT, token
    except Exception:
        pass  # best-effort: fall through to the next rung

    # Rung 2 — morphological recognizer (the legible-tail rung).
    try:
        shape_hit = _config.active().reason_morphology.classify(token)
        if shape_hit is not None:
            category, fired = shape_hit
            return NoPickCause(category), CAUSE_SOURCE_MORPHOLOGICAL, fired
    except Exception:
        pass  # best-effort: fall through to the floor

    # Rung 3 — the honest floor.
    return floor
254
+
255
+
def resolve_cause(reason_class: str | None) -> NoPickCause:
    """Return only the `NoPickCause` for a `reason_class` token.

    Back-compat convenience for callers that do not need the rung: delegates to
    `resolve_cause_with_source` and discards the `cause_source` and `matched`
    parts of its result.
    """
    return resolve_cause_with_source(reason_class)[0]
266
+
267
+
@dataclasses.dataclass(frozen=True)
class PickerVerdict:
    """Immutable record of the oracle's outcome for one dispatch run.

    `oracle_disagrees=True` marks a run where the picker's stated cause was
    contradicted by on-disk evidence — the picker-bug signal /replan routes.
    """

    # Run timestamp, e.g. "20260526T182233Z".
    run_ts: str
    # Lane name, e.g. "tailor" | "UP" | "apply".
    lane: str
    # Picker tag, e.g. "next-up-2026-05-26-3".
    tag: str
    outcome: PickerOutcome
    no_pick_cause: NoPickCause | None
    oracle_disagrees: bool
    picks_emitted: int
    picks_shipped: int
    cost_usd: float | None
    # Human-readable rationale lines.
    evidence: tuple[str, ...]
    # Picker's own free-text reason (truncated).
    picker_reason: str
    # Recognizer rung behind `no_pick_cause`: "exact" (declared/built-in token),
    # "morphological" (rung-2 substring match on a compound token), or "none"
    # (UNCLASSIFIED floor / not a NO_PICK). A morphological cause is weaker
    # evidence than an exact one, so downstream routing needs to see the rung.
    # Defaulted so construction sites without a cause need not pass it.
    cause_source: str = CAUSE_SOURCE_NONE

    def to_dict(self) -> dict:
        """Flatten the verdict into a plain dict.

        Enum fields are emitted as their `.value` (a missing cause as `None`),
        `evidence` as a list, and the module's `SCHEMA_VERSION` is stamped in
        under `schema_version`.
        """
        cause = self.no_pick_cause
        cause_value = cause.value if cause else None
        evidence_lines = list(self.evidence)
        return {
            "schema_version": SCHEMA_VERSION,
            "run_ts": self.run_ts,
            "lane": self.lane,
            "tag": self.tag,
            "outcome": self.outcome.value,
            "no_pick_cause": cause_value,
            "cause_source": self.cause_source,
            "oracle_disagrees": self.oracle_disagrees,
            "picks_emitted": self.picks_emitted,
            "picks_shipped": self.picks_shipped,
            "cost_usd": self.cost_usd,
            "evidence": evidence_lines,
            "picker_reason": self.picker_reason,
        }
311
+
312
+
313
+ # ---------------------------------------------------------------------------
314
+ # Inputs
315
+ # ---------------------------------------------------------------------------
316
+
317
+
def _load_yaml(path: Path) -> dict:
    """Best-effort YAML load that always yields a mapping.

    Returns `{}` when the file does not exist, when PyYAML is not importable,
    when reading or parsing fails, or when the document's top level is not a
    mapping (empty file, list, scalar). Callers index the result with `.get`,
    so a non-dict document must not leak through.

    Args:
        path: YAML file to read (decoded as UTF-8).

    Returns:
        The parsed top-level mapping, or `{}` on any failure.
    """
    if not path.exists():
        return {}
    try:
        import yaml  # type: ignore
    except ImportError:
        return {}
    try:
        loaded = yaml.safe_load(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberate best-effort: unreadable or malformed state degrades to {}.
        return {}
    return loaded if isinstance(loaded, dict) else {}
330
+
331
+
def _load_verdict_envelope(tag: str) -> dict | None:
    """Load the picker's verdict envelope `.verdict-<tag>.json`.

    The file is looked up under `_next_up_dir()`.

    Args:
        tag: Picker tag used to build the file name.

    Returns:
        The parsed JSON object, or `None` when the file is missing, unreadable,
        not valid JSON, or valid JSON whose top level is not an object.
        Consumers call `.get` on the result, so only a dict is passed on.
    """
    envelope_path = _next_up_dir() / f".verdict-{tag}.json"
    if not envelope_path.exists():
        return None
    try:
        payload = json.loads(envelope_path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberate best-effort: a broken envelope counts as "no envelope".
        return None
    return payload if isinstance(payload, dict) else None
340
+
341
+
def _load_dispatch_envelope(run_ts: str) -> dict | None:
    """Load a run's dispatch result envelope.

    Reads `<chained_runs>/<run_ts>/result_envelopes/next-up.json`.

    Args:
        run_ts: Run timestamp naming the run directory.

    Returns:
        The parsed JSON object, or `None` when the file is missing, unreadable,
        not valid JSON, or valid JSON whose top level is not an object.
        Consumers call `.get` on the result, so only a dict is passed on.
    """
    envelope_path = _chained_runs() / run_ts / "result_envelopes" / "next-up.json"
    if not envelope_path.exists():
        return None
    try:
        payload = json.loads(envelope_path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberate best-effort: a broken envelope counts as "no envelope".
        return None
    return payload if isinstance(payload, dict) else None
350
+
351
+
def _load_dispatch_readme(run_ts: str) -> str:
    """Return the text of `<chained_runs>/<run_ts>/README.md`, or `""`.

    The file is decoded as UTF-8 with undecodable bytes replaced. A missing
    file or any read error yields the empty string.
    """
    readme_path = _chained_runs().joinpath(run_ts, "README.md")
    if readme_path.exists():
        try:
            return readme_path.read_text(encoding="utf-8", errors="replace")
        except Exception:
            pass  # best-effort: fall through to the empty string
    return ""
360
+
361
+
# README line patterns. `_ARGS_SCOPE` captures X from a `- Args: --scope X` line;
# `_LANE_LEASE` captures the lane from a `- Lane lease: lane=...` line and serves
# as the fallback when the Args line carries no scope. `_CHILD1_COST` matches any
# `$N` / `$N.NN` figure.
_ARGS_SCOPE = re.compile(r"^- Args: --scope (\S+)", re.M)
_LANE_LEASE = re.compile(r"^- Lane lease: lane=(\S+)", re.M)
_CHILD1_COST = re.compile(r"\$(\d+(?:\.\d+)?)", re.M)


def _infer_lane(readme: str) -> str:
    """Derive the lane name from dispatch README text.

    The `--scope` value on the Args line is preferred, then the lane named on
    the lane-lease line; `"unknown"` when neither line is present.
    """
    for pattern in (_ARGS_SCOPE, _LANE_LEASE):
        found = pattern.search(readme)
        if found:
            return found.group(1)
    return "unknown"


def _extract_cost(readme: str) -> float | None:
    """Add up the first two dollar figures in the README (best-effort).

    Returns `None` when the text holds no `$N` / `$N.NN` figure; with a single
    figure, that figure alone is returned.
    """
    figures = _CHILD1_COST.findall(readme)[:2]
    if not figures:
        return None
    try:
        return sum(map(float, figures))
    except ValueError:
        return None
388
+
389
+
# The producer prints `verdict=WEDGE … reason_class=LANE_… route=/replan` into the
# headless session's free-text result (the dispatch README echoes it), yet the
# structured `.verdict-<tag>.json` envelope written alongside does NOT always carry
# `reason_class` as a field. Measured on job's corpus (2026-06-02): of the 62
# NO-PICKs the oracle could not classify, 29 (47%) had a recoverable token in the
# dispatch `result` prose that a field-only read missed — recall was vacuous over
# them for a plumbing reason, not a real one. This pattern lifts that
# emitted-but-unlifted token so the oracle can grade the decision. The
# uppercase-token shape mirrors the closed WedgeReason vocabulary; an unknown
# token still resolves to UNCLASSIFIED via `resolve_cause`.
_PROSE_REASON_CLASS = re.compile(r"reason_class=([A-Z][A-Z0-9_]*)")


def _recover_reason_class(*texts: str) -> str:
    """Pull the first `reason_class=` token out of free-text prose.

    Pure. The texts are scanned in the order given (callers pass the dispatch
    `result` first, then the README); empty texts are skipped. Returns the token
    from the first text that contains one, or `""` when none does. Meant as the
    fallback for envelopes that did not carry the structured field.
    """
    hits = (_PROSE_REASON_CLASS.search(chunk) for chunk in texts if chunk)
    return next((hit.group(1) for hit in hits if hit), "")
417
+
418
+
419
+ # ---------------------------------------------------------------------------
420
+ # Cross-check rules — these are what make the oracle disagree
421
+ # ---------------------------------------------------------------------------
422
+
423
+
STALE_CLAIM_THRESHOLD_HOURS = 48  # claims older than this are presumed orphan


def _check_stale_claim_real(state: dict, evidence: list[str]) -> bool:
    """If the picker said STALE_CLAIM, does any claim actually look orphaned?

    Without the specific claim ID we can only judge the overall shape: if every
    dated claim is younger than `STALE_CLAIM_THRESHOLD_HOURS`, the "stale claim"
    story does not hold.

    Claims are read from `state["active_claims"]`, falling back to
    `state["hard_claims"]`. Each claim's age is taken from the first present of
    `claimed_at`, `heartbeat_at`, `created_at`, measured against the current UTC
    time. Entries that are not dicts, carry no timestamp, or carry an
    unparseable one are ignored. A timestamp without a UTC offset is treated as
    UTC rather than dropped.

    Args:
        state: Parsed execution state.
        evidence: Rationale list; appended to in place.

    Returns:
        True if the cause is BELIEVABLE (at least one claim is at or past the
        threshold) or cannot be verified (no claims / no usable timestamps —
        abstain). False if the oracle DISAGREES (all dated claims are fresh).
    """
    claims = state.get("active_claims") or state.get("hard_claims") or []
    if not claims:
        evidence.append("oracle: no active hard claims found in state — STALE_CLAIM claim is unverifiable, abstaining")
        return True
    now = dt.datetime.now(dt.timezone.utc)
    ages = []
    for claim in claims if isinstance(claims, list) else []:
        if not isinstance(claim, dict):
            continue  # malformed entry: skip it instead of crashing on .get
        ts = claim.get("claimed_at") or claim.get("heartbeat_at") or claim.get("created_at")
        if not ts:
            continue
        try:
            stamp = dt.datetime.fromisoformat(str(ts).replace("Z", "+00:00"))
            if stamp.tzinfo is None:
                # Aware-minus-naive raises TypeError; assume UTC so the claim
                # is still counted.
                stamp = stamp.replace(tzinfo=dt.timezone.utc)
            ages.append((now - stamp).total_seconds() / 3600)
        except Exception:
            continue  # best-effort: an unparseable timestamp is ignored
    if not ages:
        return True
    stale_count = sum(1 for age in ages if age >= STALE_CLAIM_THRESHOLD_HOURS)
    evidence.append(
        f"oracle: {stale_count}/{len(ages)} active claims older than {STALE_CLAIM_THRESHOLD_HOURS}h "
        f"(claims < threshold suggest STALE_CLAIM claim is suspect)"
    )
    return stale_count > 0
461
+
462
+
def _check_operator_gate_real(
    state: dict,
    picker_reason: str,
    evidence: list[str],
    *,
    today: dt.date | None = None,
) -> bool:
    """Verify that a soak deadline cited by the picker is still open.

    Looks for `thru|until|through <YYYY-MM-DD>` (case-insensitive) in
    `picker_reason` and compares that date with the reference date.

    Args:
        state: Parsed execution state (not consulted by this check).
        picker_reason: The picker's free-text reason.
        evidence: Rationale list; appended to in place.
        today: Reference date for the comparison. Defaults to the current local
            date, which is NOT the run date: when grading a historical run,
            pass the run's date, otherwise a gate that was open at run time
            but has since lapsed is reported as expired.

    Returns:
        True when the gate is confirmed open, or when there is nothing to
        verify (no deadline cited, or the cited text is not a real calendar
        date). False when the cited deadline lies before the reference date.
    """
    deadline_match = re.search(
        r"(?:thru|until|through)\s+(\d{4}-\d{2}-\d{2})", picker_reason or "", re.I
    )
    if not deadline_match:
        return True  # nothing to verify; abstain in favor of picker
    deadline_text = deadline_match.group(1)
    try:
        deadline = dt.date.fromisoformat(deadline_text)
    except ValueError:
        # e.g. "2026-13-45": looks like a date but is not one — do not compare.
        evidence.append(
            f"oracle: cited soak deadline {deadline_text} is not a valid date — cannot verify, abstaining"
        )
        return True
    as_of = today if today is not None else dt.date.today()
    as_of_text = as_of.isoformat()
    if deadline >= as_of:
        evidence.append(f"oracle: soak deadline {deadline_text} >= today {as_of_text} — OPERATOR_GATE confirmed")
        return True
    evidence.append(f"oracle: soak deadline {deadline_text} < today {as_of_text} — gate has EXPIRED, picker should have re-picked")
    return False
476
+
477
+
def _check_true_drain_real(state: dict, scope_plan_ids: list[str], evidence: list[str]) -> bool:
    """If the picker said TRUE_DRAIN, does any in-scope plan still have work?

    Each id in `scope_plan_ids` is looked up among the dict entries of
    `state["plans"]` by its `id`; a plan counts as not drained when its
    `remaining` value is non-empty.

    Args:
        state: Parsed execution state.
        scope_plan_ids: Plan ids the picker had in scope.
        evidence: Rationale list; appended to in place.

    Returns:
        False when at least one in-scope plan has non-empty `remaining`
        (TRUE_DRAIN is contradicted). True otherwise — including the abstaining
        cases where no scope ids were given (no evidence added) or none of them
        exist in state (an "unverifiable" line is added instead of a false
        "confirmed").
    """
    if not scope_plan_ids:
        return True
    plans = state.get("plans") or []
    by_id = {p.get("id"): p for p in plans if isinstance(p, dict)}
    found: list[str] = []
    missing: list[str] = []
    non_drained: list[str] = []
    for pid in scope_plan_ids:
        plan = by_id.get(pid)
        if not plan:
            # Unknown plan: its drain status is unknown, not "drained".
            missing.append(str(pid))
            continue
        found.append(str(pid))
        remaining = plan.get("remaining") or []
        if remaining:
            non_drained.append(f"{pid}({len(remaining)})")
    if non_drained:
        evidence.append(
            f"oracle: in-scope plans with non-empty remaining: {', '.join(non_drained)} — "
            f"TRUE_DRAIN is FALSE"
        )
        return False
    if not found:
        evidence.append(
            f"oracle: none of the in-scope plans ({', '.join(missing)}) were found in state — "
            f"TRUE_DRAIN is unverifiable, abstaining"
        )
        return True
    if missing:
        evidence.append(
            f"oracle: in-scope plans found in state ({', '.join(found)}) have remaining:[]; "
            f"not found in state: {', '.join(missing)} — TRUE_DRAIN confirmed for known plans only"
        )
        return True
    evidence.append(f"oracle: all in-scope plans ({', '.join(found)}) have remaining:[] — TRUE_DRAIN confirmed")
    return True
500
+
501
+
def _check_misroute_real(envelope: dict, evidence: list[str]) -> bool:
    """Check that a MISROUTE claim at least names where the pick belongs.

    The envelope's `reason` text is searched (case-insensitively) for
    `belong(s) to <word>` or `actually <word>`. A missing, null or non-string
    `reason` is treated as empty text rather than crashing the regex.

    Args:
        envelope: The picker's verdict envelope.
        evidence: Rationale list; appended to in place.

    Returns:
        True when a target is named (claim is believable); False when none is,
        i.e. the claim is unfounded.
    """
    reason = str(envelope.get("reason") or "")
    has_target = bool(re.search(r"belongs?\s+to\s+(\w+)|actually\s+(\w+)", reason, re.I))
    if has_target:
        evidence.append("oracle: MISROUTE claim names a target lane — believable")
        return True
    evidence.append("oracle: MISROUTE claim has no target lane named — unverifiable, picker may be hand-waving")
    return False
513
+
514
+
515
+ # ---------------------------------------------------------------------------
516
+ # Classification — pure function on assembled inputs
517
+ # ---------------------------------------------------------------------------
518
+
519
+
520
+ def classify(
521
+ *,
522
+ run_ts: str,
523
+ verdict_env: dict | None,
524
+ dispatch_env: dict | None,
525
+ readme: str,
526
+ state: dict,
527
+ ) -> PickerVerdict:
528
+ """Pure function — produces a PickerVerdict from assembled inputs."""
529
+ lane = _infer_lane(readme)
530
+ cost = _extract_cost(readme)
531
+
532
+ # Tag — prefer verdict envelope's own tag; fall back to extracting from readme.
533
+ tag = (verdict_env or {}).get("tag") or ""
534
+ if not tag and readme:
535
+ m = re.search(r"next-up-\d{4}-\d{2}-\d{2}-\d+", readme)
536
+ if m:
537
+ tag = m.group(0)
538
+
539
+ picks_total = (verdict_env or {}).get("picks") or []
540
+ picks_emitted = len(picks_total) if isinstance(picks_total, list) else 0
541
+ picks_shipped = _extract_picks_shipped(readme)
542
+
543
+ if not verdict_env:
544
+ return PickerVerdict(
545
+ run_ts=run_ts,
546
+ lane=lane,
547
+ tag=tag,
548
+ outcome=PickerOutcome.UNKNOWN,
549
+ no_pick_cause=None,
550
+ oracle_disagrees=False,
551
+ picks_emitted=0,
552
+ picks_shipped=picks_shipped,
553
+ cost_usd=cost,
554
+ evidence=("oracle: no verdict envelope found",),
555
+ picker_reason="",
556
+ )
557
+
558
+ picker_verdict = (verdict_env.get("verdict") or "").upper()
559
+ picker_reason = (verdict_env.get("reason") or "")[:500]
560
+
561
+ # LIVE shape: envelope has all_clear=true and picks emitted (round 0 is
562
+ # the picker's "pre-veto-clean" round before refinement; both round 0
563
+ # and final-round all_clear envelopes count as LIVE-shaped).
564
+ if verdict_env.get("all_clear") and picks_emitted > 0:
565
+ if picks_shipped > 0:
566
+ outcome = PickerOutcome.LIVE
567
+ else:
568
+ outcome = PickerOutcome.LIVE_DIRTY
569
+ return PickerVerdict(
570
+ run_ts=run_ts,
571
+ lane=lane,
572
+ tag=tag,
573
+ outcome=outcome,
574
+ no_pick_cause=None,
575
+ oracle_disagrees=False,
576
+ picks_emitted=picks_emitted,
577
+ picks_shipped=picks_shipped,
578
+ cost_usd=cost,
579
+ evidence=(f"oracle: picker emitted {picks_emitted} ACCEPT picks, {picks_shipped} shipped",),
580
+ picker_reason=picker_reason,
581
+ )
582
+
583
+ # NO_PICK shape — verdict is WEDGE/DRAIN/(missing)/blocked. `resolve_cause`
584
+ # is registry-aware: a workspace-declared reason class resolves to its
585
+ # category here, so a custom reason is verifiable, not just emittable.
586
+ evidence: list[str] = []
587
+ reason_class = (verdict_env.get("reason_class") or "").upper().strip()
588
+ # Prose fallback: the producer emits `reason_class=` into the dispatch result
589
+ # (and README) even when the structured envelope drops the field. Recover it
590
+ # so a genuinely-emitted token is graded, not lost to UNCLASSIFIED. Only fires
591
+ # when the field is absent — the structured value always wins.
592
+ if not reason_class:
593
+ recovered = _recover_reason_class(
594
+ str((dispatch_env or {}).get("result") or ""), readme
595
+ )
596
+ if recovered:
597
+ reason_class = recovered.upper().strip()
598
+ evidence.append(
599
+ f"oracle: recovered reason_class={reason_class} from dispatch prose "
600
+ f"(structured envelope dropped the field)"
601
+ )
602
+ cause, cause_source, matched = resolve_cause_with_source(reason_class)
603
+ if cause == NoPickCause.UNCLASSIFIED and picker_verdict == "DRAIN":
604
+ cause = NoPickCause.TRUE_DRAIN
605
+ cause_source = CAUSE_SOURCE_EXACT # the DRAIN verdict itself is the signal
606
+ if cause_source == CAUSE_SOURCE_MORPHOLOGICAL:
607
+ # The cause came from the rung-2 shape recognizer, not a declared token —
608
+ # record which substring fired so the (weaker) basis is auditable.
609
+ evidence.append(
610
+ f"oracle: classified reason_class={reason_class} as {cause.value} via "
611
+ f"morphological rung (matched {matched!r}; weaker than an exact token)"
612
+ )
613
+
614
+ # Cross-check: does on-disk state support the claim?
615
+ disagrees = False
616
+ # `scope` is the renderer's structured `{plan_ids: [...]}` block, but an
617
+ # LLM-written no-pick envelope sometimes writes a free-text label string
618
+ # (e.g. "tailor -> AR, IF, TS"). Only read plan_ids off a dict; a string
619
+ # scope carries no machine-readable ids, so abstain (empty list -> the
620
+ # TRUE_DRAIN cross-check abstains in favor of the picker).
621
+ scope_raw = verdict_env.get("scope")
622
+ scope_plan_ids = (
623
+ (scope_raw.get("plan_ids") or []) if isinstance(scope_raw, dict) else []
624
+ )
625
+
626
+ if cause == NoPickCause.STALE_CLAIM:
627
+ believable = _check_stale_claim_real(state, evidence)
628
+ if not believable:
629
+ disagrees = True
630
+ elif cause == NoPickCause.OPERATOR_GATE:
631
+ believable = _check_operator_gate_real(state, picker_reason, evidence)
632
+ if not believable:
633
+ disagrees = True
634
+ elif cause == NoPickCause.TRUE_DRAIN:
635
+ believable = _check_true_drain_real(state, scope_plan_ids, evidence)
636
+ if not believable:
637
+ disagrees = True
638
+ elif cause == NoPickCause.MISROUTE:
639
+ believable = _check_misroute_real(verdict_env, evidence)
640
+ if not believable:
641
+ disagrees = True
642
+ elif cause == NoPickCause.UNCLASSIFIED:
643
+ evidence.append("oracle: legacy envelope w/o reason_class; cannot verify — recommend backfill")
644
+
645
+ return PickerVerdict(
646
+ run_ts=run_ts,
647
+ lane=lane,
648
+ tag=tag,
649
+ outcome=PickerOutcome.NO_PICK,
650
+ no_pick_cause=cause,
651
+ oracle_disagrees=disagrees,
652
+ picks_emitted=picks_emitted,
653
+ picks_shipped=picks_shipped,
654
+ cost_usd=cost,
655
+ evidence=tuple(evidence),
656
+ picker_reason=picker_reason,
657
+ cause_source=cause_source,
658
+ )
659
+
660
+
661
+ # Order matters — `2/2 picks shipped` first, then bullet `Picks shipped: 2`, then last
662
+ # fallback `0 shipped`.
663
+ # Capture BOTH numerator and denominator so an inverted/oversized ratio
664
+ # (`315/82 shipped` — prose, not a per-run pick count) can be rejected: you
665
+ # cannot ship more picks than were dispatched. Seen 2026-05-28 inflating
666
+ # precision to 8.13 (data-trust-floor / DD axiom violation).
667
+ _RATIO_SHIPPED = re.compile(
668
+ r"(\d+)\s*/\s*(\d+)\s+(?:chain phases?\s+)?(?:picks?\s+)?shipped", re.I
669
+ )
670
+ _BULLET_SHIPPED = re.compile(r"Picks shipped:\s+(\d+)", re.I)
671
+
672
+ # A single /next-up packet caps at 5 picks (next_up_render max_picks=5); a
673
+ # chained-depth slot can add a few hops. Anything beyond this from a README
674
+ # scrape is a cross-run / prose false-match, not a real per-run ship count.
675
+ _MAX_PER_RUN_SHIPPED = 12
676
+
677
+
678
+ def _extract_picks_shipped(readme: str) -> int:
679
+ """Best-effort scan of dispatch README for ship count.
680
+
681
+ Recognises (in order):
682
+ * `2/2 picks shipped clean` (rejected when numerator > denominator)
683
+ * `Picks shipped: 2`
684
+ * `Picks shipped: none` / `0 shipped`
685
+ * `Picks shipped: GH3 (...), FQ-348 (...)` — count parenthesised SHAs as a list
686
+
687
+ Every recognised count is clamped to `_MAX_PER_RUN_SHIPPED`: a value past
688
+ that is a cross-run or prose false-match, never a real per-run ship count.
689
+ """
690
+ m = _RATIO_SHIPPED.search(readme)
691
+ if m:
692
+ try:
693
+ num, denom = int(m.group(1)), int(m.group(2))
694
+ # Reject inverted/oversized ratios — `315/82 shipped` is prose,
695
+ # not "315 of 82 picks". A real per-run ratio has num <= denom.
696
+ if num <= denom and num <= _MAX_PER_RUN_SHIPPED:
697
+ return num
698
+ except ValueError:
699
+ pass
700
+ m = _BULLET_SHIPPED.search(readme)
701
+ if m:
702
+ try:
703
+ return min(int(m.group(1)), _MAX_PER_RUN_SHIPPED)
704
+ except ValueError:
705
+ pass
706
+ # `Picks shipped: <list>` shape — count entries with parenthesised commits
707
+ list_match = re.search(r"Picks shipped:\s+([^\n]+)", readme, re.I)
708
+ if list_match:
709
+ candidates = list_match.group(1)
710
+ # count parenthesised entries `(... <sha>)` — heuristic but precise enough
711
+ n = len(re.findall(r"\([^)]+`?[0-9a-f]{6,}`?\)", candidates))
712
+ if n > 0:
713
+ return min(n, _MAX_PER_RUN_SHIPPED)
714
+ lowered = readme.lower()
715
+ if "picks shipped: none" in lowered or "0 shipped" in lowered or "none (lane drained)" in lowered:
716
+ return 0
717
+ return 0
718
+
719
+
720
+ # ---------------------------------------------------------------------------
721
+ # Sweep / report / check (the CLI surface)
722
+ # ---------------------------------------------------------------------------
723
+
724
+
725
def _list_recent_runs(since_iso: str | None) -> list[str]:
    """List run-directory names under the chained-runs root, oldest first.

    Only sub-directories whose name starts with a ``YYYYMMDDTHHMMSSZ`` stamp
    are kept. When *since_iso* is truthy, names that compare lexicographically
    below it are dropped. Returns an empty list when the root does not exist.
    """
    root = _chained_runs()
    if not root.exists():
        return []
    stamp = re.compile(r"^\d{8}T\d{6}Z")
    return [
        entry.name
        for entry in sorted(root.iterdir())
        if entry.is_dir()
        and stamp.match(entry.name)
        and not (since_iso and entry.name < since_iso)
    ]
740
+
741
+
742
+ def _parse_since(s: str | None) -> str | None:
743
+ """Convert '7d' / '24h' / ISO -> 'YYYYMMDDTHHMMSSZ' string for comparison."""
744
+ if not s:
745
+ return None
746
+ now = dt.datetime.now(dt.timezone.utc)
747
+ m = re.fullmatch(r"(\d+)([dh])", s)
748
+ if m:
749
+ n, unit = int(m.group(1)), m.group(2)
750
+ delta = dt.timedelta(days=n) if unit == "d" else dt.timedelta(hours=n)
751
+ threshold = now - delta
752
+ return threshold.strftime("%Y%m%dT%H%M%SZ")
753
+ # assume ISO date
754
+ try:
755
+ d = dt.date.fromisoformat(s)
756
+ return d.strftime("%Y%m%dT000000Z")
757
+ except ValueError:
758
+ return None
759
+
760
+
761
def _audit_dir(window: str) -> Path:
    """Return the audit directory for *window*: ``_audits_dir()`` joined with its name."""
    audits_root = _audits_dir()
    return audits_root / window
763
+
764
+
765
def _load_existing(window: str) -> dict[tuple[str, str], dict]:
    """Idempotency: load already-classified rows so re-runs are no-ops.

    Reads ``oracle.jsonl`` from the audit directory of *window* and indexes
    each row by ``(run_ts, tag)``. A later row with the same key replaces an
    earlier one.

    Blank lines, lines that are not valid JSON, and lines whose JSON value is
    not an object are skipped rather than raised on.

    Returns:
        Mapping of ``(run_ts, tag)`` to the decoded row; empty when the file
        does not exist.
    """
    path = _audit_dir(window) / "oracle.jsonl"
    if not path.exists():
        return {}
    out: dict[tuple[str, str], dict] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        # json.loads happily returns lists/scalars; only objects have the
        # `.get` the key lookup below relies on.
        if not isinstance(row, dict):
            continue
        out[(row.get("run_ts", ""), row.get("tag", ""))] = row
    return out
781
+
782
+
783
def _classify_one(run_ts: str, state: dict) -> PickerVerdict:
    """Load the artefacts of one dispatch run and hand them to ``classify``.

    The tag comes from the dispatch envelope when it carries a string ``tag``,
    otherwise from the first ``next-up-YYYY-MM-DD-N`` token in the README.
    The verdict envelope is only loaded when a tag was found.
    """
    readme = _load_dispatch_readme(run_ts)
    dispatch_env = _load_dispatch_envelope(run_ts)

    # Tag resolution — prefer dispatch envelope, else readme.
    envelope_tag = dispatch_env.get("tag") if dispatch_env else None
    tag = envelope_tag if isinstance(envelope_tag, str) else ""
    if not tag and readme:
        found = re.search(r"next-up-\d{4}-\d{2}-\d{2}-\d+", readme)
        if found:
            tag = found.group(0)

    verdict_env = None
    if tag:
        verdict_env = _load_verdict_envelope(tag)

    # Envelope-clobber guard. The dispatch's own `result` field is the
    # truth-bearer because it was written by the child's stdout at exit
    # time — the on-disk `.verdict-<tag>.json` can be overwritten by a
    # later same-tag dispatch (real bug seen in 20260526T155903Z's tag
    # `next-up-2026-05-26-2`). If dispatch `result` mentions verdict=WEDGE
    # but the verdict envelope claims all_clear, the envelope is stale —
    # synthesize a minimal verdict_env so classification doesn't misread.
    if dispatch_env and verdict_env:
        result_text = str(dispatch_env.get("result") or "")
        stale = "verdict=WEDGE" in result_text and verdict_env.get("all_clear")
        if stale:
            # Minimal synthetic envelope reflecting the true outcome.
            verdict_env = dict(
                [
                    ("tag", tag),
                    ("verdict", "WEDGE"),
                    ("all_clear", False),
                    ("blocked", True),
                    ("picks", []),
                    ("reason", result_text[:500]),
                    ("_synthesized", True),
                    (
                        "_clobber_reason",
                        "on-disk verdict envelope was overwritten by a later dispatch sharing the tag",
                    ),
                ]
            )

    return classify(
        run_ts=run_ts,
        verdict_env=verdict_env,
        dispatch_env=dispatch_env,
        readme=readme,
        state=state,
    )
825
+
826
+
827
def cmd_classify(args: argparse.Namespace) -> int:
    """``classify`` subcommand: classify ``args.run_ts`` and print the verdict as JSON.

    Always returns exit code 0.
    """
    verdict = _classify_one(args.run_ts, _load_yaml(_state_path()))
    rendered = json.dumps(verdict.to_dict(), indent=2)
    print(rendered)
    return 0
832
+
833
+
834
def cmd_sweep(args: argparse.Namespace) -> int:
    """``sweep`` subcommand: classify every run in the window into ``oracle.jsonl``.

    Runs are selected with ``args.since``; the output directory is named by
    ``args.window``, falling back to ``args.since`` and then ``"all"``. Rows
    already present for a ``(run_ts, tag)`` key are kept as-is unless
    ``args.force`` is set. The merged rows are written back sorted by
    ``(run_ts, tag)``.

    Returns:
        Always 0.
    """
    since_iso = _parse_since(args.since)
    runs = _list_recent_runs(since_iso)
    window = args.window or (args.since or "all")
    audit_dir = _audit_dir(window)
    audit_dir.mkdir(parents=True, exist_ok=True)
    out_path = audit_dir / "oracle.jsonl"
    existing = _load_existing(window)
    state = _load_yaml(_state_path())
    n_new = 0
    n_skip = 0
    rows: dict[tuple[str, str], dict] = dict(existing)
    for run_ts in runs:
        # The tag is only known after classification, so the run has to be
        # classified before the already-seen check can be made.
        v = _classify_one(run_ts, state)
        key = (run_ts, v.tag)
        if key in existing and not args.force:
            n_skip += 1
            continue
        rows[key] = v.to_dict()
        n_new += 1
    rows_sorted = sorted(rows.values(), key=lambda r: (r.get("run_ts", ""), r.get("tag", "")))
    # Rewrite the file atomically (small enough, append-only-shape but
    # idempotent): write a sibling temp file, then swap it into place so a
    # crash mid-write cannot leave a truncated oracle.jsonl behind.
    payload = "\n".join(json.dumps(r) for r in rows_sorted) + "\n"
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(out_path)
    print(f"sweep: {n_new} new, {n_skip} skipped, {len(rows_sorted)} total -> {out_path}")
    return 0
862
+
863
+
864
def cmd_report(args: argparse.Namespace) -> int:
    """``report`` subcommand: render ``picker_recall_audit.md`` for ``args.window``.

    Returns 2 (with a hint on stderr) when the window has no ``oracle.jsonl``
    yet, otherwise writes the markdown next to it and returns 0.
    """
    window = args.window
    audit_dir = _audit_dir(window)
    rows_path = audit_dir / "oracle.jsonl"
    if not rows_path.exists():
        print(f"no oracle.jsonl at {rows_path}; run `sweep --window {window}` first", file=sys.stderr)
        return 2

    rows = []
    for line in rows_path.read_text(encoding="utf-8").splitlines():
        if line.strip():
            rows.append(json.loads(line))

    out_path = audit_dir / "picker_recall_audit.md"
    out_path.write_text(_render_report(window, rows), encoding="utf-8")
    print(f"report: {len(rows)} rows -> {out_path}")
    return 0
877
+
878
+
879
def cmd_check(args: argparse.Namespace) -> int:
    """CI gate. Exit 0 if recall floor met, 1 if not, 2 if there is nothing to gate.

    Reads ``oracle.jsonl`` for ``args.window``, prints the computed metrics as
    JSON, and compares ``recall_proxy`` against ``args.min_recall``.

    Returns:
        0 when ``recall_proxy >= args.min_recall``; 1 when it is below the
        floor; 2 when the file is missing or holds no rows.
    """
    window = args.window
    rows_path = _audit_dir(window) / "oracle.jsonl"
    if not rows_path.exists():
        print(f"no oracle.jsonl at {rows_path}", file=sys.stderr)
        return 2
    rows = [json.loads(line) for line in rows_path.read_text(encoding="utf-8").splitlines() if line.strip()]
    metrics = _compute_metrics(rows)
    print(json.dumps(metrics, indent=2))
    recall = metrics["recall_proxy"]
    if recall is None:
        # No rows -> no recall figure. Comparing None against the floor would
        # raise TypeError, so report it the same way as a missing file.
        print(f"no rows in {rows_path}; nothing to gate", file=sys.stderr)
        return 2
    if recall < args.min_recall:
        print(f"FAIL: recall_proxy {recall:.2f} < floor {args.min_recall:.2f}", file=sys.stderr)
        return 1
    print(f"PASS: recall_proxy {recall:.2f} >= floor {args.min_recall:.2f}")
    return 0
894
+
895
+
896
+ def _compute_metrics(rows: list[dict]) -> dict:
897
+ n = len(rows)
898
+ if n == 0:
899
+ return {"n": 0, "precision": None, "recall_proxy": None, "cost_per_ship": None}
900
+ picks_emitted = sum(r.get("picks_emitted", 0) for r in rows)
901
+ total_cost = sum((r.get("cost_usd") or 0.0) for r in rows)
902
+ # Precision counts ships ONLY from rows that actually emitted picks (LIVE /
903
+ # LIVE_DIRTY). A README ship-count scraped off an UNKNOWN row (no verdict
904
+ # envelope, picks_emitted=0) is a cross-run artefact — including it pushes
905
+ # precision above 1.0 (the data-trust-floor violation seen 2026-05-28).
906
+ emitted_rows = [r for r in rows if r.get("picks_emitted", 0) > 0]
907
+ picks_shipped = sum(r.get("picks_shipped", 0) for r in emitted_rows)
908
+ # recall_proxy: 1 - (oracle-disagrees fraction of no-pick runs).
909
+ no_picks = [r for r in rows if r.get("outcome") == "NO_PICK"]
910
+ picker_bugs = sum(1 for r in no_picks if r.get("oracle_disagrees"))
911
+ # UNCLASSIFIED no-pick rows are NOT verified clean — they're unverifiable
912
+ # (legacy envelope, no reason_class). Surface the count so a green
913
+ # recall_proxy is honest about how much of the no-pick set it could not
914
+ # check. The proxy itself stays conservative (unverifiable ≠ bug) so the CI
915
+ # floor doesn't flap, but the operator sees the unverified denominator.
916
+ unverifiable = sum(
917
+ 1 for r in no_picks if r.get("no_pick_cause") == "UNCLASSIFIED"
918
+ )
919
+ verified_no_pick = len(no_picks) - unverifiable
920
+ recall_proxy = 1.0 - (picker_bugs / max(len(no_picks), 1)) if no_picks else 1.0
921
+ precision = (picks_shipped / picks_emitted) if picks_emitted else None
922
+ cost_per_ship = (total_cost / picks_shipped) if picks_shipped else None
923
+ return {
924
+ "n": n,
925
+ "picks_emitted": picks_emitted,
926
+ "picks_shipped": picks_shipped,
927
+ "total_cost_usd": round(total_cost, 2),
928
+ "precision": round(precision, 3) if precision is not None else None,
929
+ "recall_proxy": round(recall_proxy, 3),
930
+ "cost_per_ship": round(cost_per_ship, 2) if cost_per_ship is not None else None,
931
+ "picker_bug_count": picker_bugs,
932
+ "no_pick_count": len(no_picks),
933
+ "unverifiable_no_pick_count": unverifiable,
934
+ "verified_no_pick_count": verified_no_pick,
935
+ }
936
+
937
+
938
def _render_report(window: str, rows: list[dict]) -> str:
    """Render the markdown recall-audit report for *window*.

    The report has four parts: headline metrics (from ``_compute_metrics``),
    a histogram of ``no_pick_cause`` values sorted by descending count, a
    table of rows flagged ``oracle_disagrees`` (newest ``run_ts`` first), and
    fixed explanatory notes.

    Every metric and row field is read with a default, so an empty window or
    a row missing a field renders instead of raising ``KeyError``.

    Args:
        window: Audit window name, echoed into the title and the regenerate hint.
        rows: Decoded ``oracle.jsonl`` rows.

    Returns:
        The full markdown document, newline-terminated.
    """
    metrics = _compute_metrics(rows)
    # Cause histogram
    causes: dict[str, int] = {}
    for r in rows:
        c = r.get("no_pick_cause") or ""
        if c:
            causes[c] = causes.get(c, 0) + 1
    # Top oracle-disagreement rows
    bugs = [r for r in rows if r.get("oracle_disagrees")]
    bugs_sorted = sorted(bugs, key=lambda r: r.get("run_ts", ""), reverse=True)

    lines = [
        f"# Picker recall audit — window `{window}`",
        "",
        f"_Generated by `scripts/picker_oracle.py report --window {window}`._",
        "",
        "## Headline metrics",
        "",
        f"- **N dispatches:** {metrics.get('n', 0)}",
        f"- **Picks emitted:** {metrics.get('picks_emitted', 0)}",
        f"- **Picks shipped:** {metrics.get('picks_shipped', 0)}",
        f"- **Precision** (shipped / emitted): `{metrics.get('precision')}`",
        f"- **Recall proxy** (1 − picker-bug / no-pick): `{metrics.get('recall_proxy')}`",
        f"- **No-pick verified / unverifiable:** "
        f"{metrics.get('verified_no_pick_count', 0)} / "
        f"{metrics.get('unverifiable_no_pick_count', 0)} "
        f"(recall_proxy is honest only over the verified set)",
        f"- **Total cost:** `${metrics.get('total_cost_usd', 0.0)}`",
        f"- **Cost per ship:** `${metrics.get('cost_per_ship')}`",
        f"- **Picker-bug NO-PICKs:** {metrics.get('picker_bug_count', 0)} / {metrics.get('no_pick_count', 0)}",
        "",
        "## NO-PICK cause histogram",
        "",
        "| Cause | Count |",
        "|---|---|",
    ]
    for cause, count in sorted(causes.items(), key=lambda kv: -kv[1]):
        lines.append(f"| `{cause}` | {count} |")
    lines.append("")
    lines.append("## Oracle-disagrees rows (picker bugs)")
    lines.append("")
    if not bugs_sorted:
        lines.append("_None._")
    else:
        lines.append("| run_ts | lane | tag | cause | picker reason (truncated) |")
        lines.append("|---|---|---|---|---|")
        for r in bugs_sorted:
            # Escape pipes and flatten newlines so the reason stays in one table cell.
            reason = (r.get("picker_reason") or "").replace("|", "\\|").replace("\n", " ")[:120]
            lines.append(
                f"| `{r.get('run_ts','')}` | {r.get('lane','')} | `{r.get('tag','')}` | "
                f"`{r.get('no_pick_cause','')}` | {reason} |"
            )
    lines.append("")
    lines.append("## Notes")
    lines.append("")
    lines.append("- `recall_proxy` is a proxy because we can't enumerate the true set of pickable ")
    lines.append("  work without a parallel-universe picker run. Instead, we measure NO-PICK rows ")
    lines.append("  where on-disk state contradicts the picker's claim — those are *known* missed ")
    lines.append("  picks. A floor of 0.7 means at most 30% of NO-PICKs may be unverifiable; tune ")
    lines.append("  upward as cross-check coverage grows.")
    lines.append("- Backfill: legacy envelopes (`UNCLASSIFIED`) inflate the noise floor. As more ")
    lines.append("  envelopes carry `reason_class`, recall_proxy gets sharper.")
    return "\n".join(lines) + "\n"
1002
+
1003
+
1004
+ # ---------------------------------------------------------------------------
1005
+ # CLI entrypoint
1006
+ # ---------------------------------------------------------------------------
1007
+
1008
+
1009
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the chosen subcommand handler.

    Subcommands: ``classify``, ``sweep``, ``report`` and ``check``. Returns
    whatever exit code the handler returns.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    classify_cmd = commands.add_parser("classify", help="classify a single dispatch run")
    classify_cmd.add_argument("run_ts", help="e.g. 20260526T182233Z")
    classify_cmd.set_defaults(func=cmd_classify)

    sweep_cmd = commands.add_parser("sweep", help="classify all dispatches in window; idempotent")
    sweep_cmd.add_argument("--since", default="7d", help="e.g. 7d, 24h, or ISO date")
    sweep_cmd.add_argument("--window", default=None, help="output dir name (default: --since value)")
    sweep_cmd.add_argument("--force", action="store_true", help="re-classify already-classified runs")
    sweep_cmd.set_defaults(func=cmd_sweep)

    report_cmd = commands.add_parser("report", help="render picker_recall_audit.md")
    report_cmd.add_argument("--window", required=True)
    report_cmd.set_defaults(func=cmd_report)

    check_cmd = commands.add_parser("check", help="CI gate: exit non-zero if recall floor not met")
    check_cmd.add_argument("--window", required=True)
    check_cmd.add_argument("--min-recall", type=float, default=0.7)
    check_cmd.set_defaults(func=cmd_check)

    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)


if __name__ == "__main__":
    sys.exit(main())