dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
@@ -0,0 +1,375 @@
1
+ """dos.drivers.self_improve — the self-improving-loop ENGINE (docs/280).
2
+
3
+ The driver half of the first self-improving work loop for DOS. The kernel leaf
4
+ (`dos.improve`) is a PURE keep-gate — `classify(CandidateEvidence, policy) ->
5
+ KEEP/REVERT/ESCALATE`. This module is the layer-4 driver that does the I/O the
6
+ kernel refuses to: it runs the test suite, runs the truth syscall, measures the
7
+ host's improvement metric, counts the tokens a candidate spent, calls the kernel,
8
+ and carries out the verdict (merge / discard / escalate).
9
+
10
+ THE DELIBERATE BOUNDARY — the engine proposes NOTHING
11
+ =====================================================
12
+
13
+ The intelligent, non-deterministic part of a self-improving loop is *proposing a
14
+ candidate change*. That part lives ENTIRELY outside this engine — in the
15
+ `self-improve` skill's subagent brief — for the same reason `llm_judge` is the
16
+ only non-deterministic rung and it is a driver: the kernel (and this engine)
17
+ contribute ZERO intelligence to the proposal, only the refusal to keep an
18
+ unwitnessed one. The engine takes the candidate as an injected callback
19
+ (`propose`/`apply`), so:
20
+
21
+ * the engine is fully DETERMINISTIC and unit-testable on a fake proposer (no
22
+ model, no network), and
23
+ * the keep-decision is provably a function of env-authored facts, never of
24
+ whatever the proposer narrated.
25
+
26
+ This is the `propose → verify → measure → keep-or-revert` cycle with the verify /
27
+ measure / keep-or-revert steps mechanized here and the propose step left to a
28
+ capable agent the engine treats as untrusted.
29
+
30
+ THE WORKTREE ISOLATION — the kernel adjudicating is not the kernel rewritten
31
+ ============================================================================
32
+
33
+ A candidate edit to DOS is the `SELF_MODIFY` / `global`-lane hazard (docs/89,
34
+ [[self-modification-hazard]]): editing the kernel's own running path is exactly
35
+ what the arbiter refuses. So a candidate is applied + measured in an ISOLATED git
36
+ worktree (the host supplies the worktree paths in `CycleContext`), never the live
37
+ tree the loop is running from. The kernel that adjudicates the candidate is not
38
+ the kernel being rewritten by it — the engine reads the verdict from a clean
39
+ process, then merges only on KEEP.
40
+
41
+ This module names no host beyond the `SubstrateConfig` seam and reads the metric
42
+ through an injected callback, so it is domain-free: the host names *what
43
+ improvement means* (the metric) and *how to propose* (the callback); the engine
44
+ owns the loop skeleton + the witness-gather.
45
+ """
46
+
47
+ from __future__ import annotations
48
+
49
+ import enum
50
+ from dataclasses import dataclass, field, replace
51
+ from typing import Callable, Optional
52
+
53
+ from dos import improve
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # The injected boundary — what the host supplies per loop.
58
+ # ---------------------------------------------------------------------------
59
+
60
+
61
class CycleAction(str, enum.Enum):
    """The actuation the engine carried out for a candidate in one cycle.

    `improve.Candidate` names the kernel's verdict; this enum names what the
    engine then DID about it, so a loop record reads as a log of acts rather
    than a log of verdicts:

        MERGED    — the candidate was KEPT: its worktree commit was merged onto
                    the lane and the baseline was raised (the loop ratchets).
        DISCARDED — the candidate was REVERTED: its worktree was thrown away and
                    the live tree is untouched; the breaker count was bumped.
        ESCALATED — the breaker OPENed: the engine stopped and filed a human
                    decision.
        SKIPPED   — the proposer returned no candidate (nothing to judge); this
                    is neither a fault nor a revert.

    Members are `str` subclasses, so they compare equal to their lowercase
    string values.
    """

    MERGED = "merged"
    DISCARDED = "discarded"
    ESCALATED = "escalated"
    SKIPPED = "skipped"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare value ("merged"), not "CycleAction.MERGED".
        rendered: str = self.value
        return rendered
83
+
84
+
85
@dataclass(frozen=True)
class Candidate:
    """Descriptor of one proposed self-improvement, returned by the proposer.

    The proposer is a capable agent OUTSIDE the engine's trust. It applies a
    single scoped change inside the isolated worktree and hands back this
    immutable record. Everything the engine later relies on for the
    keep-decision is re-measured from the worktree rather than read from here.

    Attributes:
        present:  Whether a candidate was actually produced this cycle. False
                  is the proposer's honest "I have nothing" — the engine SKIPs
                  instead of reverting.
        commit:   Worktree commit SHA the candidate landed, used for the merge
                  on KEEP and for the truth syscall. May be "" when `present`
                  is False.
        narrated: The proposer's own description of what it did. It is carried
                  to the operator surface and into the kernel's `narrated`
                  field and is parsed for NOTHING (docs/234); it is the only
                  field that is the proposer's word.
        tokens:   Tokens spent producing the candidate (the efficiency rung).
                  The host is expected to take this from the provider usage
                  record, not from the proposer's claim.
    """

    # NOTE: distinct from `improve.Candidate`, which is the kernel's verdict enum.
    present: bool
    commit: str = ""
    narrated: str = ""
    tokens: int = 0
112
+
113
+
114
@dataclass(frozen=True)
class WitnessReadback:
    """Environment-authored facts gathered for one candidate.

    Produced by the host's `gather` callback (see `CycleContext`) AFTER the
    candidate has been applied in the worktree. No field comes from the
    proposer — this is the docs/138 invariant the keep-bit rests on.

    Attributes:
        suite_passed: True iff the test suite the host runs exited 0 on the
                      worktree. Authored by the test runner.
        truth_clean:  True iff `dos verify` / `dos commit-audit` agreed over
                      git ancestry for the candidate commit. Authored by the
                      oracle.
        work:         The host's improvement metric, measured on the worktree
                      after the candidate was applied.
    """

    suite_passed: bool
    truth_clean: bool
    work: int
133
+
134
+
135
@dataclass(frozen=True)
class CycleContext:
    """The host-injected I/O the engine needs to run ONE cycle.

    The engine owns the loop skeleton and the witness → kernel → actuation
    wiring; the host owns every side-effecting step and supplies it here as a
    plain callable. That keeps the engine testable on fakes (no model, no git,
    no subprocess in a unit test).

    Attributes:
        propose:       `() -> Candidate`. Apply ONE scoped self-improvement in
                       the isolated worktree and return its descriptor, or
                       `Candidate(present=False)` to skip. The capable,
                       untrusted step — the only place intelligence enters.
        gather:        `(Candidate) -> WitnessReadback`. Run the suite, run the
                       truth syscall and measure the metric on the worktree.
                       Called only when a candidate is present.
        merge:         `(Candidate) -> None`. KEEP actuation: merge the
                       candidate's worktree commit onto the lane.
        discard:       `(Candidate) -> None`. REVERT actuation: throw the
                       worktree candidate away, leaving the live tree alone.
        escalate:      `(improve.CandidateVerdict) -> None`. ESCALATE
                       actuation: file a `dos decisions` entry for a human.
        baseline_work: The host metric on the GREEN baseline tree at the start
                       of this cycle — the `work` a candidate must strictly
                       beat. Raised after a KEEP so the loop ratchets.
        policy:        The `improve.ImprovePolicy` thresholds (the host's
                       `dos.toml [improve]`); defaults to a fresh
                       `ImprovePolicy()`.
    """

    # --- injected callbacks (the seam) ---
    propose: Callable[[], Candidate]
    gather: Callable[[Candidate], WitnessReadback]
    merge: Callable[[Candidate], None]
    discard: Callable[[Candidate], None]
    escalate: Callable[["improve.CandidateVerdict"], None]

    # --- per-cycle state and thresholds ---
    baseline_work: int
    policy: improve.ImprovePolicy = field(default_factory=improve.ImprovePolicy)
173
+
174
+
175
@dataclass(frozen=True)
class CycleResult:
    """Outcome of ONE cycle: the act taken, the verdict, and the carry-forward.

    Attributes:
        action:                   What the engine DID (a `CycleAction`).
        next_baseline:            Baseline to thread into the NEXT cycle —
                                  raised on a KEEP (the ratchet), unchanged
                                  otherwise.
        next_consecutive_reverts: The kernel's carried breaker count for the
                                  next cycle.
        should_stop:              True iff the loop must halt now (an ESCALATE).
        verdict:                  The kernel's `CandidateVerdict`, or None on a
                                  SKIP (nothing was judged).
        candidate:                The descriptor that was judged, or None on a
                                  SKIP.
    """

    action: CycleAction
    next_baseline: int
    next_consecutive_reverts: int
    should_stop: bool
    verdict: "Optional[improve.CandidateVerdict]" = None
    candidate: Optional[Candidate] = None

    @property
    def reason(self) -> str:
        """One-line operator-facing summary for the loop record's tally row.

        Returns the kernel verdict's own `reason` when a candidate was judged,
        and a fixed "skipped" sentence when no verdict exists.
        """
        ruling = self.verdict
        return (
            "no candidate proposed this cycle — skipped (nothing to judge)"
            if ruling is None
            else ruling.reason
        )
201
+
202
+
203
def run_cycle(ctx: CycleContext, consecutive_reverts: int = 0) -> CycleResult:
    """Run ONE self-improvement cycle: propose → gather → classify → actuate.

    The skeleton is deterministic; the injected proposer is the only
    non-deterministic step.

      1. PROPOSE  — ask `ctx.propose()` for a candidate. If none is present the
                    cycle is a SKIP: nothing is gathered or classified and the
                    breaker count is passed through untouched.
      2. GATHER   — `ctx.gather(candidate)` returns the env-authored witnesses
                    (suite, truth syscall, metric).
      3. CLASSIFY — the witnesses, the baseline, the candidate's token count
                    and the carried breaker count go to `improve.classify`.
                    The proposer's narration rides along in `narrated` only.
      4. ACTUATE  — KEEP → `merge`, and the measured `work` becomes the next
                    baseline; ESCALATE → `discard`, then `escalate`, then stop;
                    anything else (REVERT) → `discard`.

    Args:
        ctx: The host-injected callbacks, baseline and policy for this cycle.
        consecutive_reverts: The breaker count carried in from earlier cycles.

    Returns:
        A `CycleResult` with the act taken, the verdict, and the baseline and
        breaker count to thread into the next cycle.
    """
    # 1. PROPOSE — the one untrusted, intelligent step.
    proposal = ctx.propose()
    if not proposal.present:
        return CycleResult(
            action=CycleAction.SKIPPED,
            next_baseline=ctx.baseline_work,
            next_consecutive_reverts=consecutive_reverts,
            should_stop=False,
        )

    # 2. GATHER — env-authored witnesses, measured on the worktree.
    witnesses = ctx.gather(proposal)

    # 3. CLASSIFY — the kernel keep-gate decides; the engine only wires inputs.
    ruling = improve.classify(
        improve.CandidateEvidence(
            suite_passed=witnesses.suite_passed,
            truth_clean=witnesses.truth_clean,
            work=witnesses.work,
            baseline_work=ctx.baseline_work,
            tokens=proposal.tokens,
            consecutive_reverts=consecutive_reverts,
            narrated=proposal.narrated,
        ),
        ctx.policy,
    )

    # 4. ACTUATE — `improve.Candidate` here is the kernel's verdict enum, not
    #    the `Candidate` descriptor dataclass defined in this module.
    decision = ruling.verdict
    if decision is improve.Candidate.KEEP:
        ctx.merge(proposal)
        # The ratchet: the next candidate must beat the work just measured.
        act, carried_baseline, halt = CycleAction.MERGED, witnesses.work, False
    elif decision is improve.Candidate.ESCALATE:
        # Discard the candidate that tipped the breaker, then surface to a human.
        ctx.discard(proposal)
        ctx.escalate(ruling)
        act, carried_baseline, halt = CycleAction.ESCALATED, ctx.baseline_work, True
    else:
        # REVERT — throw the worktree candidate away; the baseline is unchanged.
        ctx.discard(proposal)
        act, carried_baseline, halt = CycleAction.DISCARDED, ctx.baseline_work, False

    return CycleResult(
        action=act,
        next_baseline=carried_baseline,
        next_consecutive_reverts=ruling.next_consecutive_reverts,
        should_stop=halt,
        verdict=ruling,
        candidate=proposal,
    )
286
+
287
+
288
@dataclass(frozen=True)
class LoopOutcome:
    """Final tally of a bounded run of cycles.

    Attributes:
        cycles:         Per-cycle `CycleResult` records, in execution order.
        kept:           Number of cycles whose candidate was merged.
        reverted:       Number of cycles whose candidate was discarded.
        skipped:        Number of cycles in which nothing was proposed.
        escalated:      True iff the loop stopped on an ESCALATE.
        final_baseline: The metric after the last KEEP — the ratchet's
                        high-water mark.
        stop_reason:    One-line summary of why the loop ended.
    """

    cycles: tuple[CycleResult, ...]
    kept: int
    reverted: int
    skipped: int
    escalated: bool
    final_baseline: int
    stop_reason: str
306
+
307
+
308
def run_loop(
    ctx: CycleContext,
    *,
    max_cycles: int,
    consecutive_reverts: int = 0,
    on_cycle: "Optional[Callable[[CycleResult], None]]" = None,
) -> LoopOutcome:
    """Run up to `max_cycles` cycles, ratcheting the baseline between them.

    Two pieces of state are threaded from one cycle to the next: the (possibly
    raised) baseline and the carried breaker count. After a KEEP the next
    candidate must therefore beat the improved tree, not the original one.

    The loop ends on the first of:
      * a cycle whose `should_stop` is set (an ESCALATE), or
      * `max_cycles` cycles having run (the backstop).
    A run of SKIPs simply burns cycles up to the cap.

    Args:
        ctx: Host callbacks, policy and the starting baseline. It is not
            mutated; each cycle receives a copy carrying the current baseline.
        max_cycles: Upper bound on the number of cycles to run.
        consecutive_reverts: Breaker count carried in from before this run.
        on_cycle: Optional sink called once per cycle with its result, before
            that cycle is tallied. The engine itself writes nothing.

    Returns:
        A `LoopOutcome`. The escalating cycle appears in `cycles` and sets
        `escalated`, but is not counted in `kept`, `reverted` or `skipped`.
    """
    history: list[CycleResult] = []
    n_kept = 0
    n_reverted = 0
    n_skipped = 0
    current_baseline = ctx.baseline_work
    breaker_count = consecutive_reverts
    hit_breaker = False
    # Default explanation; overwritten only if a cycle escalates.
    why_stopped = f"reached the {max_cycles}-cycle cap"

    for index in range(max_cycles):
        outcome = run_cycle(
            replace(ctx, baseline_work=current_baseline),
            consecutive_reverts=breaker_count,
        )
        history.append(outcome)
        if on_cycle is not None:
            on_cycle(outcome)

        act = outcome.action
        if act is CycleAction.MERGED:
            n_kept += 1
        elif act is CycleAction.DISCARDED:
            n_reverted += 1
        elif act is CycleAction.SKIPPED:
            n_skipped += 1

        # Carry the ratchet and the breaker into the next cycle.
        current_baseline = outcome.next_baseline
        breaker_count = outcome.next_consecutive_reverts

        if not outcome.should_stop:
            continue

        hit_breaker = True
        why_stopped = (
            f"ESCALATED to a human after {breaker_count} candidates in a row that "
            f"nothing accepted (cycle {index + 1})"
        )
        break

    return LoopOutcome(
        cycles=tuple(history),
        kept=n_kept,
        reverted=n_reverted,
        skipped=n_skipped,
        escalated=hit_breaker,
        final_baseline=current_baseline,
        stop_reason=why_stopped,
    )
@@ -0,0 +1,249 @@
1
+ """dos.drivers.similarity_judge — the DISTANCE adjudicator (outside the kernel line).
2
+
3
+ Why this exists (read `docs/76` first — the flexibility geometry)
4
+ =================================================================
5
+
6
+ The kernel's truth surface is **byte-exact on purpose**: `verify()` asks "is this
7
+ *identical* to an un-forgeable effect?" and `tool_stream` asks "did the env return
8
+ the *byte-identical* result N times?" — both measured facts no agent can forge in its
9
+ own favor. The recurring operator question is "why so rigid — what about *fuzzy* /
10
+ *distance-based* matching, where 'close enough' counts?"
11
+
12
+ The answer the layering contract gives (CLAUDE.md, `docs/76`): flexibility is welcome,
13
+ but it moves UP, out of the kernel verdict and into a **JUDGE driver** — because a
14
+ distance *threshold* is a tunable dial, and a tunable dial deciding "is this claim
15
+ true?" is exactly the forgeable knob the kernel is built to NOT have
16
+ (`flexibility-geometry`: "Anti-pattern ruled out: a `confidence: float` ... INSIDE the
17
+ kernel"). So this driver is where "close enough" is allowed to live:
18
+
19
+ * It runs ONLY on the residue the deterministic oracle ABSTAINED on (deterministic-
20
+ first is the composition's job — `judge_eval.compose_deterministic_first` /
21
+ `decisions._resolver_for` hand a judge only what the oracle could not settle).
22
+ * It is **advisory-only** — it returns a `JudgeVerdict`, mutates nothing.
23
+ * It **fails to ABSTAIN, never to AGREE** — below threshold, no evidence, or any
24
+ error punts to a human; it can never auto-clear a claim by being uncertain.
25
+
26
+ The byte-inequality discipline, kept (the load-bearing subtlety)
27
+ ================================================================
28
+
29
+ A naive "similarity judge" is a TRAP: if it scored the agent's `claim_text` against the
30
+ agent's own `stated_reason` (narration), it would be re-deriving the agent's OWN bytes —
31
+ **consistency, not grounding** (the [[consistency-is-not-grounding]] / mirror-verifier
32
+ disease, docs/141 §5a). Two strings the same author wrote being similar proves nothing.
33
+
34
+ So the comparison here is **structural, not against narration**: it scores `claim_text`
35
+ distance against the `Claim.evidence` tuple — the forgery-resistant, *env/git-authored*
36
+ bytes the kernel gathered (`Claim`'s docstring: "git lines, file state, a diff"). And it
37
+ **ABSTAINS when there is no evidence** — it will not agree off narration alone. The
38
+ distance is fuzzy; the *thing it is fuzzy against* is still un-authored by the judged
39
+ agent. That is the whole trick: flexibility on the MATCH, never on the PROVENANCE.
40
+
41
+ Purity & the optional embedding seam
42
+ ====================================
43
+
44
+ The default scorer is **pure stdlib** — `difflib.SequenceMatcher`, a normalized
45
+ token-overlap ratio — so the package ships with ZERO new dependency and the judge is
46
+ always usable (the near-stdlib-kernel discipline, applied to a driver). A heavier
47
+ semantic scorer (sentence-embeddings cosine) is reachable through ONE guarded seam,
48
+ `_embedding_similarity`, gated on `$DOS_SIMILARITY_CMD` — the same env-configured,
49
+ never-raises provider shape as `llm_judge._call_provider`. With no command wired the
50
+ seam returns None and the judge falls back to the lexical scorer; it never hard-depends
51
+ on an embedding library. The coupling lives in the operator's env, not the code.
52
+
53
+ Register it under the `dos.judges` entry-point group (it is discoverable, not a
54
+ built-in — only the `abstain` baseline is unshadowable):
55
+
56
+ [project.entry-points."dos.judges"]
57
+ similarity = "dos.drivers.similarity_judge:SimilarityJudge"
58
+ """
59
+
60
+ from __future__ import annotations
61
+
62
+ import difflib
63
+ import os
64
+ import re
65
+ import subprocess
66
+
67
+
68
+ # The env var naming an OPTIONAL embedding-similarity command. It must read two
69
+ # texts on stdin separated by a NUL byte (\x00) and write a single float in [0,1]
70
+ # (cosine similarity) on stdout. With it unset the judge uses the pure-stdlib
71
+ # lexical scorer — so this is a strict ENHANCEMENT seam, never a dependency.
72
+ ENV_SIMILARITY_CMD = "DOS_SIMILARITY_CMD"
73
+
74
+ # The env var overriding the default agree-threshold (a float in [0,1]). The
75
+ # threshold is DATA, declared by the operator — never a constant baked into a
76
+ # kernel verdict. Default below.
77
+ ENV_SIMILARITY_THRESHOLD = "DOS_SIMILARITY_THRESHOLD"
78
+
79
+ # The default agree-threshold. Deliberately HIGH (0.82): a judge that clears a
80
+ # claim is the one dangerous outcome the seam guards, so "close enough to agree"
81
+ # must mean *very* close. There is no separate abstain-floor: with usable
82
+ # evidence, ANY score below the threshold is a DISAGREE (low overlap =
83
+ # unsupported). The ABSTAIN cases are "no evidence to score against" and "empty
84
+ # claim_text", never "the score was middling" — a middling score is a real
85
+ # DISAGREE signal, not an I-can't-tell.
86
+ DEFAULT_THRESHOLD = 0.82
87
+
88
+
89
+ _TOKEN_RE = re.compile(r"\w+", re.UNICODE)
90
+
91
+
92
+ def _tokens(text: str) -> list[str]:
93
+ """Lowercased word tokens of `text` — the unit the lexical scorer compares.
94
+
95
+ Pure. Casefolded so 'AUTH2' and 'auth2' match; `\\w+` drops punctuation/quoting
96
+ so a claim and an evidence line that differ only in formatting still score high.
97
+ """
98
+ return _TOKEN_RE.findall(text.casefold())
99
+
100
+
101
def _lexical_similarity(claim_text: str, evidence_blob: str) -> float:
    """Score, in [0,1], how well `evidence_blob` supports `claim_text`.

    Pure stdlib, no I/O. The result is the larger of two signals, because
    either kind of match counts as support:

      * wording — `difflib.SequenceMatcher(...).ratio()` over the casefolded
        raw strings; high when the claim is quoted back near-verbatim.
      * token recall — the fraction of the claim's distinct tokens that also
        occur among the evidence's tokens; high when every key term is
        witnessed even though the surrounding prose differs.

    Returns 0.0 when either input is empty. When the claim contains no word
    tokens at all (e.g. punctuation only), the wording ratio alone is returned.
    """
    if not (claim_text and evidence_blob):
        return 0.0

    # NOTE(review): SequenceMatcher is used with its default `autojunk`
    # heuristic, which difflib applies once the second sequence reaches 200
    # items — confirm that is intended for long evidence blobs.
    matcher = difflib.SequenceMatcher(
        None, claim_text.casefold(), evidence_blob.casefold()
    )
    wording_ratio = matcher.ratio()

    wanted = set(_tokens(claim_text))
    if not wanted:
        return wording_ratio

    witnessed = wanted.intersection(_tokens(evidence_blob))
    return max(wording_ratio, len(witnessed) / len(wanted))
125
+
126
+
127
def _embedding_similarity(claim_text: str, evidence_blob: str) -> float | None:
    """The OPTIONAL semantic-similarity seam. Returns a score in [0,1], or None.

    Honors `$DOS_SIMILARITY_CMD`: a shell command that reads
    `claim\\x00evidence` (UTF-8) on stdin and writes one float on stdout. Only
    the first whitespace-separated field of stdout is parsed.

    Never raises. None is returned — so the caller falls back to the lexical
    scorer — when the command is unset or empty, cannot be started, times out
    (120 s), exits non-zero, or prints something that is not a FINITE float.
    The payload failing to encode, or the command string being rejected by
    `subprocess`, also returns None.

    Why non-finite output is rejected rather than clamped: `float()` happily
    parses "nan" and "inf", and `max(0.0, min(1.0, nan))` evaluates to 1.0
    because every comparison against NaN is False. Clamping would therefore
    turn provider noise into a perfect score, i.e. an AGREE.

    Args:
        claim_text: The claim being judged.
        evidence_blob: The evidence text to score the claim against.

    Returns:
        The provider's score clamped into [0,1], or None on any failure.
    """
    import math  # function-scope: this module does not import math at top level

    cmd = os.environ.get(ENV_SIMILARITY_CMD)
    if not cmd:
        return None

    try:
        payload = (claim_text + "\x00" + evidence_blob).encode("utf-8")
        # shell=True is deliberate: the command is operator-supplied
        # configuration from the environment and is documented as a shell
        # command. The judged texts travel on stdin, never on the command line.
        p = subprocess.run(
            cmd, shell=True, input=payload, capture_output=True, timeout=120,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # ValueError covers UnicodeEncodeError (e.g. lone surrogates in the
        # texts) and subprocess rejecting the command string itself.
        return None
    if p.returncode != 0:
        return None

    out = (p.stdout or b"").decode("utf-8", errors="replace").strip()
    if not out:
        return None
    try:
        val = float(out.split()[0])
    except ValueError:
        return None
    if not math.isfinite(val):
        # "nan" / "inf" / an overflowing literal: noise, not a score.
        return None

    # Clamp into [0,1] — a provider that returns a cosine in [-1,1] can never
    # push the score past the bounds the threshold logic assumes.
    return max(0.0, min(1.0, val))
160
+
161
+
162
def _threshold() -> float:
    """Return the agree-threshold from `$DOS_SIMILARITY_THRESHOLD` or the default.

    The threshold is operator data, so a typo must degrade safely instead of
    taking down adjudication:

      * unset or empty               → `DEFAULT_THRESHOLD`
      * not parseable as a float     → `DEFAULT_THRESHOLD`
      * parses to NaN                → `DEFAULT_THRESHOLD` (NaN has no position
        on the [0,1] scale; left to the clamp it would come out as 1.0 only
        because comparisons against NaN are always False)
      * any other value, including ±inf → clamped into [0,1]

    Returns:
        A float in [0,1].
    """
    import math  # function-scope: this module does not import math at top level

    raw = os.environ.get(ENV_SIMILARITY_THRESHOLD)
    if not raw:
        return DEFAULT_THRESHOLD
    try:
        val = float(raw)
    except ValueError:
        return DEFAULT_THRESHOLD
    if math.isnan(val):
        # float("nan") parses without raising, but it is still a malformed
        # threshold — treat it like any other typo.
        return DEFAULT_THRESHOLD
    return max(0.0, min(1.0, val))
176
+
177
+
178
class SimilarityJudge:
    """A DISTANCE-based occupant of the JUDGE rung — a `dos.judges.Judge`.

    Rules on a generic `Claim` by scoring how well its `claim_text` matches the
    forgery-resistant `evidence` (NOT the agent's narration — that would be a
    mirror). Fuzzy on the match, strict on the provenance, advisory-only:

      * **no evidence** (none supplied, or only blank entries) → ABSTAIN. It
        refuses to agree off narration alone — the byte-inequality floor.
      * **evidence that is not text**, or an **empty claim** → ABSTAIN.
      * score **≥ threshold** → AGREE. The one clearing verdict, reachable only
        on a high, *measured* overlap with un-authored bytes.
      * score **< threshold** (with evidence present) → DISAGREE. A middling
        score is a real "unsupported" signal, not an "I can't tell."

    The threshold is DATA (`$DOS_SIMILARITY_THRESHOLD`, default 0.82). The
    scorer is pure stdlib by default; an embedding scorer is an opt-in env seam
    (`$DOS_SIMILARITY_CMD`). With nothing wired the judge is fully usable — it
    just uses the lexical scorer.
    """

    name = "similarity"

    def rule(self, claim, config):
        """Score `claim.claim_text` against `claim.evidence` and return a verdict.

        Args:
            claim: Object exposing `evidence` (an iterable of text entries, or
                a falsy value) and `claim_text` (a string, or a falsy value).
            config: Accepted for the judge interface; not read here.

        Returns:
            A `JudgeVerdict` built with `abstain`, `agree` or `disagree`.
            Nothing is mutated.
        """
        from dos.judges import JudgeVerdict

        evidence = tuple(claim.evidence or ())
        try:
            evidence_blob = "\n".join(evidence)
        except TypeError:
            # Non-text evidence cannot be scored. Fail to ABSTAIN instead of
            # letting the error escape the judge.
            return JudgeVerdict.abstain(
                "evidence is not text — a distance judge cannot score the claim "
                "against it; routing this claim to a human.",
            )

        # The byte-inequality floor: with no evidence there are no un-authored
        # bytes to score against. Entries that are empty or whitespace-only
        # carry no bytes either, so they count as "no evidence" — otherwise a
        # wired embedding provider would be asked to score the claim against
        # nothing and could still return an agreeing number.
        if not evidence_blob.strip():
            return JudgeVerdict.abstain(
                "no evidence to score the claim against — a distance judge will not "
                "agree off narration alone (that would re-derive the agent's own "
                "bytes); routing this claim to a human.",
            )

        claim_text = (claim.claim_text or "").strip()
        if not claim_text:
            return JudgeVerdict.abstain(
                "empty claim_text — nothing to match against the evidence; abstaining.",
            )

        threshold = _threshold()

        # Prefer the semantic seam if wired; else the pure lexical scorer. The
        # seam returns None on any failure, which selects the fallback.
        embedded = _embedding_similarity(claim_text, evidence_blob)
        if embedded is not None:
            score = embedded
            scorer = "embedding"
        else:
            score = _lexical_similarity(claim_text, evidence_blob)
            scorer = "lexical"

        detail = f"{scorer} similarity {score:.3f} vs threshold {threshold:.2f}"
        ev = (f"similarity: {detail}",)

        if score >= threshold:
            return JudgeVerdict.agree(
                f"claim is witnessed by the evidence ({detail}) — near-verbatim match "
                f"to un-authored bytes the agent did not write.",
                evidence=ev,
            )
        return JudgeVerdict.disagree(
            f"claim is NOT supported by the evidence ({detail}) — the gathered "
            f"un-authored bytes do not match the assertion.",
            evidence=ev,
        )