dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""dos.drivers.self_improve — the self-improving-loop ENGINE (docs/280).
|
|
2
|
+
|
|
3
|
+
The driver half of the first self-improving work loop for DOS. The kernel leaf
|
|
4
|
+
(`dos.improve`) is a PURE keep-gate — `classify(CandidateEvidence, policy) ->
|
|
5
|
+
KEEP/REVERT/ESCALATE`. This module is the layer-4 driver that does the I/O the
|
|
6
|
+
kernel refuses to: it runs the test suite, runs the truth syscall, measures the
|
|
7
|
+
host's improvement metric, counts the tokens a candidate spent, calls the kernel,
|
|
8
|
+
and carries out the verdict (merge / discard / escalate).
|
|
9
|
+
|
|
10
|
+
THE DELIBERATE BOUNDARY — the engine proposes NOTHING
|
|
11
|
+
=====================================================
|
|
12
|
+
|
|
13
|
+
The intelligent, non-deterministic part of a self-improving loop is *proposing a
|
|
14
|
+
candidate change*. That part lives ENTIRELY outside this engine — in the
|
|
15
|
+
`self-improve` skill's subagent brief — for the same reason `llm_judge` is the
|
|
16
|
+
only non-deterministic rung and it is a driver: the kernel (and this engine)
|
|
17
|
+
contribute ZERO intelligence to the proposal, only the refusal to keep an
|
|
18
|
+
unwitnessed one. The engine takes the candidate as an injected callback
|
|
19
|
+
(`propose`/`apply`), so:
|
|
20
|
+
|
|
21
|
+
* the engine is fully DETERMINISTIC and unit-testable on a fake proposer (no
|
|
22
|
+
model, no network), and
|
|
23
|
+
* the keep-decision is provably a function of env-authored facts, never of
|
|
24
|
+
whatever the proposer narrated.
|
|
25
|
+
|
|
26
|
+
This is the `propose → verify → measure → keep-or-revert` cycle with the verify /
|
|
27
|
+
measure / keep-or-revert steps mechanized here and the propose step left to a
|
|
28
|
+
capable agent the engine treats as untrusted.
|
|
29
|
+
|
|
30
|
+
THE WORKTREE ISOLATION — the kernel adjudicating is not the kernel rewritten
|
|
31
|
+
============================================================================
|
|
32
|
+
|
|
33
|
+
A candidate edit to DOS is the `SELF_MODIFY` / `global`-lane hazard (docs/89,
|
|
34
|
+
[[self-modification-hazard]]): editing the kernel's own running path is exactly
|
|
35
|
+
what the arbiter refuses. So a candidate is applied + measured in an ISOLATED git
|
|
36
|
+
worktree (the host supplies the worktree paths in `CycleContext`), never the live
|
|
37
|
+
tree the loop is running from. The kernel that adjudicates the candidate is not
|
|
38
|
+
the kernel being rewritten by it — the engine reads the verdict from a clean
|
|
39
|
+
process, then merges only on KEEP.
|
|
40
|
+
|
|
41
|
+
This module names no host beyond the `SubstrateConfig` seam and reads the metric
|
|
42
|
+
through an injected callback, so it is domain-free: the host names *what
|
|
43
|
+
improvement means* (the metric) and *how to propose* (the callback); the engine
|
|
44
|
+
owns the loop skeleton + the witness-gather.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
from __future__ import annotations
|
|
48
|
+
|
|
49
|
+
import enum
|
|
50
|
+
from dataclasses import dataclass, field, replace
|
|
51
|
+
from typing import Callable, Optional
|
|
52
|
+
|
|
53
|
+
from dos import improve
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# The injected boundary — what the host supplies per loop.
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class CycleAction(str, enum.Enum):
    """The actuation the engine carried out for one cycle's candidate.

    `improve.Candidate` names the kernel's verdict; this enum names the ACT the
    engine performed in response, so a loop record reads as a log of acts
    rather than a list of verdicts:

      MERGED    — the candidate was KEPT: its worktree commit was merged onto
                  the lane and the baseline was raised (the loop ratchets).
      DISCARDED — the candidate was REVERTED: its worktree was thrown away and
                  the live tree left untouched; the breaker count was bumped.
      ESCALATED — the breaker opened: the engine stopped and filed a human
                  decision.
      SKIPPED   — the proposer returned no candidate, so nothing was judged;
                  neither a fault nor a revert, the engine just moves on.
    """

    MERGED = "merged"
    DISCARDED = "discarded"
    ESCALATED = "escalated"
    SKIPPED = "skipped"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare value ("merged"), not "CycleAction.MERGED".
        rendered: str = self.value
        return rendered
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass(frozen=True)
class Candidate:
    """Descriptor of one proposed self-improvement, returned by the proposer.

    The proposer is a capable agent OUTSIDE the engine's trust. It applies a
    single scoped change inside the isolated worktree and hands back this
    record. The facts the keep-decision rests on are gathered separately from
    the worktree, not read from the proposer's narration.
    """

    # Whether a candidate was actually produced this cycle. False makes the
    # engine SKIP (nothing to judge) — the proposer's honest "I have nothing",
    # which is not a revert.
    present: bool
    # Worktree commit SHA the candidate landed as (used for the merge on KEEP
    # and for the truth syscall). May be "" when `present` is False.
    commit: str = ""
    # The proposer's own description of what it did. Carried to the operator
    # surface and the kernel's `narrated` field and parsed for NOTHING
    # (docs/234): it touches no part of the keep-decision.
    narrated: str = ""
    # Tokens spent producing the candidate (for the efficiency rung). The host
    # reads this from the provider usage record, not the proposer's claim.
    tokens: int = 0
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass(frozen=True)
class WitnessReadback:
    """Env-authored witnesses gathered for one candidate.

    Produced by the host's gather function (see `CycleContext`) AFTER the
    candidate has been applied in the worktree. Each field is authored by the
    environment, never by the proposer — the docs/138 invariant the keep-bit
    rests on.
    """

    # True iff the suite the host runs exited 0 on the worktree; authored by
    # the test runner.
    suite_passed: bool
    # True iff `dos verify` / `dos commit-audit` agreed over git ancestry for
    # the candidate commit; authored by the oracle.
    truth_clean: bool
    # The host's improvement metric, measured on the worktree AFTER the
    # candidate was applied.
    work: int
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass(frozen=True)
class CycleContext:
    """The host-injected I/O the engine needs to run ONE cycle.

    The callbacks are the seam: the engine owns the loop skeleton and the
    witness → kernel → actuation wiring, while the host owns every
    side-effecting step. Everything here is a plain callable (or plain data),
    so the engine can be exercised on fakes — no model, no git, no subprocess
    in a unit test.
    """

    # () -> Candidate. Apply ONE scoped self-improvement in the isolated
    # worktree and return its descriptor, or `Candidate(present=False)` to
    # skip. The capable, untrusted step — the only place intelligence enters.
    propose: Callable[[], Candidate]
    # (Candidate) -> WitnessReadback. Run the suite, run the truth syscall and
    # measure the metric on the worktree. Called ONLY when a candidate is
    # present.
    gather: Callable[[Candidate], WitnessReadback]
    # (Candidate) -> None. KEEP actuation: merge the candidate's worktree
    # commit onto the lane. The engine calls this only on a KEEP verdict.
    merge: Callable[[Candidate], None]
    # (Candidate) -> None. REVERT actuation: throw the worktree's candidate
    # away, leaving the live tree untouched.
    discard: Callable[[Candidate], None]
    # (improve.CandidateVerdict) -> None. ESCALATE actuation: file a
    # `dos decisions` entry for a human; the engine calls it once, then stops.
    escalate: Callable[["improve.CandidateVerdict"], None]
    # Host metric measured on the GREEN baseline tree at the start of this
    # cycle — the `work` a candidate must strictly beat. Raised after a KEEP
    # so the loop ratchets.
    baseline_work: int
    # Thresholds for the keep-gate (the host's `dos.toml [improve]`).
    policy: improve.ImprovePolicy = field(default_factory=improve.ImprovePolicy)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@dataclass(frozen=True)
class CycleResult:
    """Outcome of ONE cycle: the verdict, the act taken, and the carry-forward.

    `next_baseline` and `next_consecutive_reverts` are the state the driver
    threads into the NEXT cycle.
    """

    # What the engine DID this cycle (a `CycleAction`).
    action: CycleAction
    # Raised on a KEEP (the ratchet); unchanged otherwise.
    next_baseline: int
    # The kernel's carried breaker count.
    next_consecutive_reverts: int
    # True iff the loop must halt now (an ESCALATE).
    should_stop: bool
    # The kernel's `CandidateVerdict`; None on a SKIP, where nothing was judged.
    verdict: "Optional[improve.CandidateVerdict]" = None
    # The descriptor that was judged; None on a SKIP.
    candidate: Optional[Candidate] = None

    @property
    def reason(self) -> str:
        """Return a one-line, operator-facing summary for the tally row."""
        judged = self.verdict
        if judged is not None:
            return judged.reason
        # No verdict means the cycle was skipped before anything was judged.
        return "no candidate proposed this cycle — skipped (nothing to judge)"
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def run_cycle(ctx: CycleContext, consecutive_reverts: int = 0) -> CycleResult:
    """Run ONE self-improvement cycle: propose → gather → classify → actuate.

    The skeleton is deterministic; the injected proposer is the only
    non-deterministic step.

      1. PROPOSE  — ask `ctx.propose` for one candidate. If none is present the
         cycle is a SKIP: nothing is judged and the breaker count is returned
         unchanged.
      2. GATHER   — `ctx.gather` produces the env-authored witnesses (suite,
         truth syscall, metric) for the candidate.
      3. CLASSIFY — the witnesses plus the carried breaker count go to the
         kernel keep-gate, `improve.classify`. The proposer's narration is
         passed along in `narrated` only.
      4. ACTUATE  — KEEP → merge, and the measured `work` becomes the next
         baseline; REVERT → discard; ESCALATE → discard, then file a human
         decision and stop.

    Returns a `CycleResult` with the verdict, the act performed, and the state
    to thread into the next cycle. Thresholds come from `ctx.policy` and every
    side effect goes through `ctx`'s callbacks.
    """
    # 1. PROPOSE — the one untrusted, intelligent step.
    proposed = ctx.propose()
    if not proposed.present:
        return CycleResult(
            action=CycleAction.SKIPPED,
            next_baseline=ctx.baseline_work,
            next_consecutive_reverts=consecutive_reverts,
            should_stop=False,
        )

    # 2. GATHER — witnesses measured on the worktree.
    witnesses = ctx.gather(proposed)

    # 3. CLASSIFY — the kernel keep-gate decides; `narrated` only rides along.
    ruling = improve.classify(
        improve.CandidateEvidence(
            suite_passed=witnesses.suite_passed,
            truth_clean=witnesses.truth_clean,
            work=witnesses.work,
            baseline_work=ctx.baseline_work,
            tokens=proposed.tokens,
            consecutive_reverts=consecutive_reverts,
            narrated=proposed.narrated,
        ),
        ctx.policy,
    )

    # 4. ACTUATE — carry out the kernel's verdict.
    if ruling.verdict is improve.Candidate.KEEP:
        ctx.merge(proposed)
        act = CycleAction.MERGED
        carried_baseline = witnesses.work  # the ratchet: the next candidate must beat THIS
        halt = False
    else:
        # ESCALATE and REVERT both throw the worktree candidate away first;
        # the live tree is untouched either way.
        ctx.discard(proposed)
        if ruling.verdict is improve.Candidate.ESCALATE:
            # The breaker tipped: surface to a human and stop the loop.
            ctx.escalate(ruling)
            act = CycleAction.ESCALATED
            halt = True
        else:
            act = CycleAction.DISCARDED
            halt = False
        carried_baseline = ctx.baseline_work  # unchanged — nothing was kept

    return CycleResult(
        action=act,
        next_baseline=carried_baseline,
        next_consecutive_reverts=ruling.next_consecutive_reverts,
        should_stop=halt,
        verdict=ruling,
        candidate=proposed,
    )
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@dataclass(frozen=True)
class LoopOutcome:
    """Final tally for a bounded run of cycles."""

    # The per-cycle records, in the order the cycles ran.
    cycles: tuple[CycleResult, ...]
    # Counts of cycles that ended MERGED / DISCARDED / SKIPPED respectively.
    kept: int
    reverted: int
    skipped: int
    # True iff the loop stopped on an ESCALATE.
    escalated: bool
    # The metric after the last KEEP — the ratchet's high-water mark, i.e. the
    # measure of how much the loop improved DOS.
    final_baseline: int
    # One-line summary of why the loop ended.
    stop_reason: str
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def run_loop(
    ctx: CycleContext,
    *,
    max_cycles: int,
    consecutive_reverts: int = 0,
    on_cycle: "Optional[Callable[[CycleResult], None]]" = None,
) -> LoopOutcome:
    """Run up to `max_cycles` self-improvement cycles, ratcheting the baseline.

    This is the outer-loop skeleton. Two pieces of state are threaded from one
    cycle to the next: the (possibly raised) `baseline_work` and the carried
    breaker count. After a KEEP the next candidate must therefore beat the
    improved tree, not the original one.

    The loop ends on the FIRST of:
      * a cycle whose `CycleResult.should_stop` is set (an ESCALATE), or
      * `max_cycles` cycles having run (the backstop).

    A run of SKIPs still consumes cycles up to the cap, so a host expecting a
    shallow well may want a smaller cap.

    `on_cycle`, when given, is called once per cycle with that cycle's result
    (e.g. the host's run-record writer). The engine itself writes nothing.
    """
    history: list[CycleResult] = []
    merged_count = 0
    discarded_count = 0
    skipped_count = 0
    high_water = ctx.baseline_work
    breaker = consecutive_reverts
    hit_breaker = False
    # Default explanation; overwritten only when an ESCALATE ends the loop.
    why_stopped = f"reached the {max_cycles}-cycle cap"

    for cycle_no in range(1, max_cycles + 1):
        # Each cycle sees the current high-water baseline, not the original.
        outcome = run_cycle(
            replace(ctx, baseline_work=high_water),
            consecutive_reverts=breaker,
        )
        history.append(outcome)
        if on_cycle is not None:
            on_cycle(outcome)

        # An ESCALATED cycle is recorded in `history` but not in these counts.
        if outcome.action is CycleAction.MERGED:
            merged_count += 1
        elif outcome.action is CycleAction.DISCARDED:
            discarded_count += 1
        elif outcome.action is CycleAction.SKIPPED:
            skipped_count += 1

        high_water = outcome.next_baseline
        breaker = outcome.next_consecutive_reverts

        if not outcome.should_stop:
            continue
        hit_breaker = True
        why_stopped = (
            f"ESCALATED to a human after {breaker} candidates in a row that "
            f"nothing accepted (cycle {cycle_no})"
        )
        break

    return LoopOutcome(
        cycles=tuple(history),
        kept=merged_count,
        reverted=discarded_count,
        skipped=skipped_count,
        escalated=hit_breaker,
        final_baseline=high_water,
        stop_reason=why_stopped,
    )
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""dos.drivers.similarity_judge — the DISTANCE adjudicator (outside the kernel line).
|
|
2
|
+
|
|
3
|
+
Why this exists (read `docs/76` first — the flexibility geometry)
|
|
4
|
+
=================================================================
|
|
5
|
+
|
|
6
|
+
The kernel's truth surface is **byte-exact on purpose**: `verify()` asks "is this
|
|
7
|
+
*identical* to an un-forgeable effect?" and `tool_stream` asks "did the env return
|
|
8
|
+
the *byte-identical* result N times?" — both measured facts no agent can forge in its
|
|
9
|
+
own favor. The recurring operator question is "why so rigid — what about *fuzzy* /
|
|
10
|
+
*distance-based* matching, where 'close enough' counts?"
|
|
11
|
+
|
|
12
|
+
The answer the layering contract gives (CLAUDE.md, `docs/76`): flexibility is welcome,
|
|
13
|
+
but it moves UP, out of the kernel verdict and into a **JUDGE driver** — because a
|
|
14
|
+
distance *threshold* is a tunable dial, and a tunable dial deciding "is this claim
|
|
15
|
+
true?" is exactly the forgeable knob the kernel is built to NOT have
|
|
16
|
+
(`flexibility-geometry`: "Anti-pattern ruled out: a `confidence: float` ... INSIDE the
|
|
17
|
+
kernel"). So this driver is where "close enough" is allowed to live:
|
|
18
|
+
|
|
19
|
+
* It runs ONLY on the residue the deterministic oracle ABSTAINED on (deterministic-
|
|
20
|
+
first is the composition's job — `judge_eval.compose_deterministic_first` /
|
|
21
|
+
`decisions._resolver_for` hand a judge only what the oracle could not settle).
|
|
22
|
+
* It is **advisory-only** — it returns a `JudgeVerdict`, mutates nothing.
|
|
23
|
+
* It **fails to ABSTAIN, never to AGREE** — below threshold, no evidence, or any
|
|
24
|
+
error punts to a human; it can never auto-clear a claim by being uncertain.
|
|
25
|
+
|
|
26
|
+
The byte-inequality discipline, kept (the load-bearing subtlety)
|
|
27
|
+
================================================================
|
|
28
|
+
|
|
29
|
+
A naive "similarity judge" is a TRAP: if it scored the agent's `claim_text` against the
|
|
30
|
+
agent's own `stated_reason` (narration), it would be re-deriving the agent's OWN bytes —
|
|
31
|
+
**consistency, not grounding** (the [[consistency-is-not-grounding]] / mirror-verifier
|
|
32
|
+
disease, docs/141 §5a). Two strings the same author wrote being similar proves nothing.
|
|
33
|
+
|
|
34
|
+
So the comparison here is **structural, not against narration**: it scores `claim_text`
|
|
35
|
+
distance against the `Claim.evidence` tuple — the forgery-resistant, *env/git-authored*
|
|
36
|
+
bytes the kernel gathered (`Claim`'s docstring: "git lines, file state, a diff"). And it
|
|
37
|
+
**ABSTAINS when there is no evidence** — it will not agree off narration alone. The
|
|
38
|
+
distance is fuzzy; the *thing it is fuzzy against* is still un-authored by the judged
|
|
39
|
+
agent. That is the whole trick: flexibility on the MATCH, never on the PROVENANCE.
|
|
40
|
+
|
|
41
|
+
Purity & the optional embedding seam
|
|
42
|
+
====================================
|
|
43
|
+
|
|
44
|
+
The default scorer is **pure stdlib** — `difflib.SequenceMatcher` maxed with a normalized
|
|
45
|
+
token-overlap ratio — so the package ships with ZERO new dependency and the judge is
|
|
46
|
+
always usable (the near-stdlib-kernel discipline, applied to a driver). A heavier
|
|
47
|
+
semantic scorer (sentence-embeddings cosine) is reachable through ONE guarded seam,
|
|
48
|
+
`_embedding_similarity`, gated on `$DOS_SIMILARITY_CMD` — the same env-configured,
|
|
49
|
+
never-raises provider shape as `llm_judge._call_provider`. With no command wired the
|
|
50
|
+
seam returns None and the judge falls back to the lexical scorer; it never hard-depends
|
|
51
|
+
on an embedding library. The coupling lives in the operator's env, not the code.
|
|
52
|
+
|
|
53
|
+
Register it under the `dos.judges` entry-point group (it is discoverable, not a
|
|
54
|
+
built-in — only the `abstain` baseline is unshadowable):
|
|
55
|
+
|
|
56
|
+
[project.entry-points."dos.judges"]
|
|
57
|
+
similarity = "dos.drivers.similarity_judge:SimilarityJudge"
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
from __future__ import annotations
|
|
61
|
+
|
|
62
|
+
import difflib
|
|
63
|
+
import os
|
|
64
|
+
import re
|
|
65
|
+
import subprocess
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# The env var naming an OPTIONAL embedding-similarity command. It must read two
|
|
69
|
+
# texts on stdin separated by a NUL byte (\x00) and write a single float in [0,1]
|
|
70
|
+
# (cosine similarity) on stdout. With it unset the judge uses the pure-stdlib
|
|
71
|
+
# lexical scorer — so this is a strict ENHANCEMENT seam, never a dependency.
|
|
72
|
+
ENV_SIMILARITY_CMD = "DOS_SIMILARITY_CMD"
|
|
73
|
+
|
|
74
|
+
# The env var overriding the default agree-threshold (a float in [0,1]). The
|
|
75
|
+
# threshold is DATA, declared by the operator — never a constant baked into a
|
|
76
|
+
# kernel verdict. Default below.
|
|
77
|
+
ENV_SIMILARITY_THRESHOLD = "DOS_SIMILARITY_THRESHOLD"
|
|
78
|
+
|
|
79
|
+
# The default agree-threshold. Deliberately HIGH (0.82): a judge that clears a
|
|
80
|
+
# claim is the one dangerous outcome the seam guards, so "close enough to agree"
|
|
81
|
+
# must mean *very* close. Below this AND above the abstain-floor → DISAGREE; below
|
|
82
|
+
# the abstain-floor with usable evidence → still DISAGREE (low overlap = unsupported);
|
|
83
|
+
# the ABSTAIN cases are "no evidence to score against" and "scorer errored," never
|
|
84
|
+
# "the score was middling" — a middling score is a real DISAGREE signal, not an
|
|
85
|
+
# I-can't-tell.
|
|
86
|
+
DEFAULT_THRESHOLD = 0.82
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
_TOKEN_RE = re.compile(r"\w+", re.UNICODE)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _tokens(text: str) -> list[str]:
    """Return the casefolded word tokens of `text`, in order of appearance.

    These are the units the lexical scorer compares. Casefolding makes 'AUTH2'
    and 'auth2' the same token, and the `\\w+` pattern drops punctuation and
    quoting, so texts that differ only in formatting tokenize alike. Pure.
    """
    folded = text.casefold()
    return [hit.group(0) for hit in _TOKEN_RE.finditer(folded)]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _lexical_similarity(claim_text: str, evidence_blob: str) -> float:
    """Score a claim against the evidence blob in [0,1], stdlib only.

    The result is the larger of two cheap signals, because either kind of match
    is treated as sufficient support:

      * wording — the `difflib.SequenceMatcher` ratio over the two casefolded
        raw strings, which is high for near-verbatim phrasing;
      * token-recall — the fraction of the claim's distinct tokens that also
        occur among the evidence's tokens, which is high when the claim's key
        terms are all witnessed even though the surrounding prose differs.

    Returns 0.0 when either input is empty, and the wording score alone when
    the claim yields no tokens. PURE — no I/O, no clock.
    """
    if not (claim_text and evidence_blob):
        return 0.0

    matcher = difflib.SequenceMatcher(
        None, claim_text.casefold(), evidence_blob.casefold()
    )
    wording_score = matcher.ratio()

    claim_vocab = set(_tokens(claim_text))
    if not claim_vocab:
        # Nothing to compute recall over (e.g. a punctuation-only claim).
        return wording_score

    evidence_vocab = set(_tokens(evidence_blob))
    witnessed = claim_vocab.intersection(evidence_vocab)
    term_recall = len(witnessed) / len(claim_vocab)
    return max(wording_score, term_recall)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _embedding_similarity(claim_text: str, evidence_blob: str) -> float | None:
    """The OPTIONAL semantic-similarity seam. Returns a score in [0,1], or None.

    Honors the command named by the `ENV_SIMILARITY_CMD` environment variable
    (documented as `$DOS_SIMILARITY_CMD`): a shell command that reads the claim,
    a NUL byte, then the evidence on stdin (UTF-8) and writes one float as the
    first whitespace-separated token on stdout.

    Never raises. Returns None, so the caller falls back to the lexical scorer,
    when:

    * the command is unset or empty;
    * the payload cannot be encoded or the command cannot be launched;
    * the command times out (120 s) or exits non-zero;
    * stdout is empty, does not start with a float, or the float is not finite
      (`nan`, `inf`, `-inf`).

    A finite value is clamped into [0,1], so a provider returning a cosine in
    [-1,1] can never push the score outside the bounds the threshold logic
    assumes.
    """
    import math  # local import keeps this block self-contained

    cmd = os.environ.get(ENV_SIMILARITY_CMD)
    if not cmd:
        return None
    try:
        payload = (claim_text + "\x00" + evidence_blob).encode("utf-8")
        # NOTE: `cmd` is operator-supplied configuration and is intentionally run
        # through the shell; the claim/evidence only ever travel via stdin.
        p = subprocess.run(
            cmd, shell=True, input=payload, capture_output=True, timeout=120,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # ValueError covers UnicodeEncodeError (lone surrogates in the payload)
        # and an embedded NUL in the command string, both of which would
        # otherwise escape and break the "never raises" contract.
        return None
    if p.returncode != 0:
        return None
    out = (p.stdout or b"").decode("utf-8", errors="replace").strip()
    if not out:
        return None
    try:
        val = float(out.split()[0])
    except (ValueError, IndexError):
        return None
    # `float()` happily parses "nan"/"inf". Without this guard the clamp below
    # turns NaN into 1.0 (because `min(1.0, nan)` is 1.0), i.e. garbage output
    # would become a perfect score. Treat non-finite output as a failure instead.
    if not math.isfinite(val):
        return None
    return max(0.0, min(1.0, val))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _threshold() -> float:
    """The agree-threshold, read from the environment or the default.

    Reads the variable named by `ENV_SIMILARITY_THRESHOLD` (documented as
    `$DOS_SIMILARITY_THRESHOLD`). An unset, empty, unparseable or NaN value falls
    back to `DEFAULT_THRESHOLD` rather than crashing — the threshold is operator
    data, and a typo should degrade safely, not take down adjudication.

    Any other value, including `inf`/`-inf`, is clamped into [0,1].
    """
    import math  # local import keeps this block self-contained

    raw = os.environ.get(ENV_SIMILARITY_THRESHOLD)
    if not raw:
        return DEFAULT_THRESHOLD
    try:
        val = float(raw)
    except ValueError:
        return DEFAULT_THRESHOLD
    # `float("nan")` parses without error, and `max(0.0, min(1.0, nan))` would
    # silently yield 1.0. NaN is not a usable threshold, so treat it as malformed.
    if math.isnan(val):
        return DEFAULT_THRESHOLD
    return max(0.0, min(1.0, val))
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class SimilarityJudge:
    """A DISTANCE-based occupant of the JUDGE rung — a `dos.judges.Judge`.

    Rules on a generic `Claim` by scoring how well its `claim_text` matches the
    forgery-resistant `evidence` (NOT the agent's narration — that would be a mirror).
    Fuzzy on the match, strict on the provenance, advisory-only, fail-to-abstain:

    * **no evidence** → ABSTAIN. It refuses to agree off narration alone — the
      byte-inequality floor (you cannot confirm a claim with the claimant's bytes).
      Evidence that is present but entirely blank (empty or whitespace-only
      entries) counts as no evidence.
    * **empty claim_text** → ABSTAIN. There is nothing to match.
    * score **≥ threshold** → AGREE (the claim is near-verbatim witnessed by the
      evidence). The one clearing verdict, reachable only on a high, *measured*
      overlap with un-authored bytes.
    * score **< threshold** (with evidence present) → DISAGREE (the evidence does
      not support the claim). A middling score is a real "unsupported" signal, not
      an "I can't tell."

    The threshold is DATA (`$DOS_SIMILARITY_THRESHOLD`, default 0.82), never a knob
    inside a kernel verdict. The scorer is pure stdlib by default; an embedding scorer
    is an opt-in env seam (`$DOS_SIMILARITY_CMD`). With nothing wired it is fully
    usable — it just uses the lexical scorer — so it is always safe to register and
    `dos judge-eval`.
    """

    name = "similarity"

    def rule(self, claim, config):
        """Return a `JudgeVerdict` for `claim` based on claim/evidence similarity.

        Reads `claim.evidence` (an iterable of strings, or None) and
        `claim.claim_text`. `config` is accepted for interface compatibility and
        is not consulted here. The verdict's `evidence` carries a one-line
        summary naming the scorer used, the score, and the threshold.
        """
        from dos.judges import JudgeVerdict

        no_evidence_reason = (
            "no evidence to score the claim against — a distance judge will not "
            "agree off narration alone (that would re-derive the agent's own "
            "bytes); routing this claim to a human."
        )

        # The byte-inequality floor: with no evidence there are no un-authored bytes
        # to score against. Agreeing here would mean believing the agent's own
        # narration — the mirror-verifier trap. ABSTAIN (route to a human).
        evidence = tuple(claim.evidence or ())
        if not evidence:
            return JudgeVerdict.abstain(no_evidence_reason)

        claim_text = (claim.claim_text or "").strip()
        if not claim_text:
            return JudgeVerdict.abstain(
                "empty claim_text — nothing to match against the evidence; abstaining.",
            )

        evidence_blob = "\n".join(evidence)
        # A non-empty tuple of blank strings still carries zero un-authored bytes.
        # Without this guard such a claim would be scored (and a wired embedding
        # provider could even return a clearing score against nothing) instead of
        # being routed to a human like any other evidence-free claim.
        if not evidence_blob.strip():
            return JudgeVerdict.abstain(no_evidence_reason)

        threshold = _threshold()

        # Prefer the semantic seam if wired; else the pure lexical scorer. The seam
        # returns None on failure, which selects the lexical fallback.
        embedded = _embedding_similarity(claim_text, evidence_blob)
        if embedded is not None:
            score = embedded
            scorer = "embedding"
        else:
            score = _lexical_similarity(claim_text, evidence_blob)
            scorer = "lexical"

        detail = f"{scorer} similarity {score:.3f} vs threshold {threshold:.2f}"
        ev = (f"similarity: {detail}",)

        if score >= threshold:
            return JudgeVerdict.agree(
                f"claim is witnessed by the evidence ({detail}) — near-verbatim match "
                f"to un-authored bytes the agent did not write.",
                evidence=ev,
            )
        return JudgeVerdict.disagree(
            f"claim is NOT supported by the evidence ({detail}) — the gathered "
            f"un-authored bytes do not match the assertion.",
            evidence=ev,
        )
|