dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
|
@@ -0,0 +1,859 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dos-kernel
|
|
3
|
+
Version: 0.22.0
|
|
4
|
+
Summary: Dispatch Operating System — the domain-free trust substrate for fleets of autonomous agents (verdict spine, ship oracle, structured refusal, lane arbiter, correlation spine).
|
|
5
|
+
Author: Anthony Chaudhary
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/anthony-chaudhary/dos-kernel
|
|
8
|
+
Project-URL: Documentation, https://github.com/anthony-chaudhary/dos-kernel/blob/master/docs/QUICKSTART.md
|
|
9
|
+
Project-URL: Repository, https://github.com/anthony-chaudhary/dos-kernel
|
|
10
|
+
Project-URL: Issues, https://github.com/anthony-chaudhary/dos-kernel/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/anthony-chaudhary/dos-kernel/tree/master/docs/releases
|
|
12
|
+
Keywords: agents,ai-agents,llm,multi-agent,agent-orchestration,orchestration,dispatch,scheduler,oracle,leases,verification,trust,mcp
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
22
|
+
Classifier: Topic :: Software Development :: Version Control :: Git
|
|
23
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Dist: pyyaml>=6.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: wheel>=0.43; extra == "dev"
|
|
33
|
+
Requires-Dist: ruff>=0.13; extra == "dev"
|
|
34
|
+
Requires-Dist: mypy>=1.17; extra == "dev"
|
|
35
|
+
Requires-Dist: hypothesis>=6.100; extra == "dev"
|
|
36
|
+
Provides-Extra: mcp
|
|
37
|
+
Requires-Dist: mcp>=1.2; extra == "mcp"
|
|
38
|
+
Provides-Extra: tui
|
|
39
|
+
Requires-Dist: rich>=13; extra == "tui"
|
|
40
|
+
Requires-Dist: windows-curses>=2.3; sys_platform == "win32" and extra == "tui"
|
|
41
|
+
Provides-Extra: notify-slack
|
|
42
|
+
Requires-Dist: slack-helpers>=0.2; extra == "notify-slack"
|
|
43
|
+
Provides-Extra: export-otlp
|
|
44
|
+
Requires-Dist: opentelemetry-sdk>=1.20; extra == "export-otlp"
|
|
45
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20; extra == "export-otlp"
|
|
46
|
+
Provides-Extra: paper
|
|
47
|
+
Requires-Dist: arxiv-latex-cleaner>=1.0; extra == "paper"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
# DOS — the Dispatch Operating System
|
|
51
|
+
|
|
52
|
+
> ### Catch your AI agents when they lie about what they shipped.
|
|
53
|
+
|
|
54
|
+
<!-- PyPI / Python-version badges land with the PyPI release — until dos-kernel is
|
|
55
|
+
on PyPI they'd render broken (404 / unknown version). -->
|
|
56
|
+
[![CI](https://github.com/anthony-chaudhary/dos-kernel/actions/workflows/ci.yml/badge.svg)](https://github.com/anthony-chaudhary/dos-kernel/actions/workflows/ci.yml)
|
|
57
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
58
|
+
|
|
59
|
+
<p align="center">
|
|
60
|
+
<img src="docs/assets/loop-hero.svg" alt="Two agent fleets side by side. Left, no referee: agents all report 'done!', every report is believed, and silent corruption (lies, collisions, spin) piles up into a codebase that 'sorta works' and can't be changed. Right, DOS adjudicates: dos verify reads git and the run branches to SHIPPED (exit 0, land it) or NOT_SHIPPED (exit 1, re-dispatch — caught), and that verdict steers the next step." width="100%">
|
|
61
|
+
<br>
|
|
62
|
+
<em>Run a fleet of agents on one repo. The left loop just feels like progress; the right one you can steer.
|
|
63
|
+
The only difference is a verdict DOS reads from the real world — here, git — never the agent's word.</em>
|
|
64
|
+
</p>
|
|
65
|
+
|
|
66
|
+
**An AI agent will tell you it finished. DOS checks the real world instead of
|
|
67
|
+
taking its word** — and the nearest piece of the real world is your git history.
|
|
68
|
+
|
|
69
|
+
That's the whole idea. An agent says it shipped the login endpoint. Did it? You
|
|
70
|
+
run one command — `dos verify` — and it answers from the **artifacts the work
|
|
71
|
+
actually left behind** (here, git history), not from what the agent typed. If a
|
|
72
|
+
commit backs the claim, you get `SHIPPED` and exit code `0`. If nothing landed,
|
|
73
|
+
you get `NOT_SHIPPED` and exit code `1`. The agent's story never enters into it.
|
|
74
|
+
(Git is just the first witness DOS reads; the file tree, the clock, a CI status, a
|
|
75
|
+
test environment's own state are others — anything the agent didn't author.)
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
dos verify AUTH AUTH1 # → SHIPPED AUTH AUTH1 e62f74d (exit 0)
|
|
79
|
+
dos verify AUTH AUTH2 # → NOT_SHIPPED AUTH AUTH2 (exit 1)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
That's the smallest version. It scales up too: point a dozen agents at one repo —
|
|
83
|
+
in CI, in a fleet, racing on the same files — and DOS also tells you which ones
|
|
84
|
+
are **stepping on each other**, which one is **spinning in circles**, and which
|
|
85
|
+
claim of "done" is **real**. Every answer comes from the artifacts (git, the file
|
|
86
|
+
tree, the clock), never the narration. It works on a plain `git` repo with **zero
|
|
87
|
+
config**, and the only thing you ever have to install is one small Python package.
|
|
88
|
+
|
|
89
|
+
> ⏱️ **Want to try it right now?** Jump to **[Try it in 60 seconds](#try-it-in-60-seconds)**
|
|
90
|
+
> — one command, real output, then come back for the why.
|
|
91
|
+
|
|
92
|
+
<sub>**v0.22.0** · 3900+ tests · CI: Python 3.11–3.13 on Linux + a Windows 3.13
|
|
93
|
+
smoke run · the only runtime dependency is **PyYAML** · **MIT**.</sub>
|
|
94
|
+
|
|
95
|
+
<details>
|
|
96
|
+
<summary><strong>The 30-second mental model</strong> (one paragraph, plain words) — click to expand</summary>
|
|
97
|
+
|
|
98
|
+
> Coding agents narrate everything: *what they shipped, which files they touched,
|
|
99
|
+
> whether they're still making progress.* DOS treats all of that as a **claim**, not
|
|
100
|
+
> a fact, and hands you a **verdict** built from what actually happened. Under the
|
|
101
|
+
> hood it's a small, deterministic **kernel** — the part that decides ground truth
|
|
102
|
+
> across a crowd of unreliable workers and keeps their edits from colliding. Nothing
|
|
103
|
+
> about it is coding-specific: your repo declares its own rules (which file regions
|
|
104
|
+
> each agent may touch, how a commit signals "done") as data in one `dos.toml`, and
|
|
105
|
+
> the kernel supplies only the machinery. You reach that machinery through small,
|
|
106
|
+
> do-one-thing commands — `verify`, `arbitrate`, `liveness`, `refuse` — from the
|
|
107
|
+
> `dos` CLI, an MCP server wired into the agent host you already run, or straight
|
|
108
|
+
> from Python.
|
|
109
|
+
|
|
110
|
+
</details>
|
|
111
|
+
|
|
112
|
+
> **Reading this as an AI agent?** Start with **[AGENTS.md](AGENTS.md)** — a short
|
|
113
|
+
> orientation written for you: what DOS is in three lines, how to build/test/check
|
|
114
|
+
> your work, the ~5 files actually worth reading, and the architecture rules a
|
|
115
|
+
> change must satisfy.
|
|
116
|
+
|
|
117
|
+
## Try it in 60 seconds
|
|
118
|
+
|
|
119
|
+
Got 60 seconds and a terminal? Run the whole aha-moment in a throwaway repo. This
|
|
120
|
+
one command — `dos quickstart`, the last line below — scaffolds a repo, makes a real commit, verifies it, and cleans up
|
|
121
|
+
after itself:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
git clone https://github.com/anthony-chaudhary/dos-kernel.git && cd dos-kernel
|
|
125
|
+
pip install -e . # from the clone — PyYAML is the only runtime dep
|
|
126
|
+
dos quickstart # → SHIPPED AUTH AUTH1 … then NOT_SHIPPED AUTH AUTH2
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
That's it. One `SHIPPED`, one `NOT_SHIPPED` — the first is a claim git can back,
|
|
130
|
+
the second is a claim nothing landed for. **That contrast is the whole product in
|
|
131
|
+
one command.** (Add `--keep ./demo` to keep the repo and poke at it. No clone
|
|
132
|
+
wanted? `uvx --from git+https://github.com/anthony-chaudhary/dos-kernel dos
|
|
133
|
+
quickstart` runs the same demo ephemerally — nothing left behind.)
|
|
134
|
+
|
|
135
|
+
<details>
|
|
136
|
+
<summary><strong>Prefer to watch the gears turn?</strong> The same thing, by hand, in 5 lines — click to expand</summary>
|
|
137
|
+
|
|
138
|
+
A *plan* (`AUTH`) groups *phases* (`AUTH1`, `AUTH2`); `dos verify` takes
|
|
139
|
+
`<plan> <phase>`, and a commit whose subject starts `AUTH1:` is what stamps that
|
|
140
|
+
phase shipped.
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
mkdir hello-dos && cd hello-dos
|
|
144
|
+
dos init . # writes one dos.toml
|
|
145
|
+
git init -q
|
|
146
|
+
git config user.email you@example.com # skip if you have a global git identity
|
|
147
|
+
git config user.name "You"
|
|
148
|
+
echo 'def login(): ...' > login.py
|
|
149
|
+
git add -A
|
|
150
|
+
git commit -m "AUTH1: ship the login endpoint" # stamp AUTH1 shipped: <PHASE-ID>: <message>
|
|
151
|
+
|
|
152
|
+
dos verify --workspace . AUTH AUTH1 # → SHIPPED AUTH AUTH1 e389e8b (via grep-subject) exit 0
|
|
153
|
+
dos verify --workspace . AUTH AUTH2 # → NOT_SHIPPED AUTH AUTH2 (via none) exit 1
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
An agent can **claim** `AUTH2` is done all day long; `verify` just reports what the
|
|
157
|
+
artifacts say — and they say it isn't. The `via grep-subject` / `via none` tag tells
|
|
158
|
+
you *how it knows*: it found the phase token in a commit subject, or it found it
|
|
159
|
+
nowhere. The full walkthrough is in **[docs/QUICKSTART.md](docs/QUICKSTART.md)**.
|
|
160
|
+
|
|
161
|
+
</details>
|
|
162
|
+
|
|
163
|
+
<p align="center">
|
|
164
|
+
<img src="examples/demo/verify-moment.svg" alt="The dos verify money-moment. Two equally-confident agent claims, checked against git. Left, what the agent claims (forgeable): 'Shipped AUTH1 — the login endpoint is done' and 'AUTH2 is done too — all work completed!'. Right, what git actually records: one real commit e389e8b 'AUTH1: ship the login endpoint', and no commit anywhere mentions AUTH2. The two verdicts: dos verify AUTH AUTH1 finds the token in a real commit subject → SHIPPED, exit 0, via grep-subject; dos verify AUTH AUTH2 finds it nowhere → NOT_SHIPPED, exit 1, via none. The confident AUTH2 claim collapses the instant no commit backs it." width="100%">
|
|
165
|
+
<br>
|
|
166
|
+
<sub><em>The money-moment: two equally-confident claims, one verdict each — <code>SHIPPED</code> for the one git can back, <code>NOT_SHIPPED</code> for the one nothing landed for. Every string is verbatim output of <a href="examples/demo/verify_demo.sh"><code>examples/demo/verify_demo.sh</code></a>. <a href="examples/demo/verify_visual.html">Step through it locally</a> for the click-through version (it's an HTML file — clone the repo and open it in a browser; GitHub shows its source, not the running page).</em></sub>
|
|
167
|
+
</p>
|
|
168
|
+
|
|
169
|
+
**The smallest real win:** in a CI step or dispatch loop, replace the line that
|
|
170
|
+
trusts an agent's "done" with `dos verify PLAN PHASE` and branch on its exit code
|
|
171
|
+
(`0` shipped / `1` not). No parsing, no plan, no config — the
|
|
172
|
+
[CI integration cookbook](examples/playbooks/cookbook-ci-integration.md) walks it
|
|
173
|
+
end-to-end. To run it on a repo shaped like yours, start with
|
|
174
|
+
[Onboard a repo in 10 minutes](examples/playbooks/01_onboard-a-repo.md).
|
|
175
|
+
|
|
176
|
+
## What goes wrong in a fleet — and what catches it
|
|
177
|
+
|
|
178
|
+
Now the *why*. Run a pile of agents at once with nobody refereeing, and here's how
|
|
179
|
+
it goes. Each worker reports its own success, and you believe the reports — what
|
|
180
|
+
else is there to go on? Meanwhile the unchecked problems pile up quietly: a lie
|
|
181
|
+
here, two agents clobbering the same file there, a little scope-creep, one worker
|
|
182
|
+
spinning in circles. Eventually the codebase *sorta* works and nobody can safely
|
|
183
|
+
change it.
|
|
184
|
+
|
|
185
|
+
The trouble is you launched the agents, they graded their own homework, and you
|
|
186
|
+
have no signal you trust to steer on. DOS gives you that missing signal — a verdict
|
|
187
|
+
from ground truth — so the loop closes. Here is the same fleet under both regimes:
|
|
188
|
+
|
|
189
|
+
<details open>
|
|
190
|
+
<summary>The two regimes as a flowchart — <strong>left:</strong> you believe the narration; <strong>right:</strong> you steer on a verdict</summary>
|
|
191
|
+
|
|
192
|
+
```mermaid
|
|
193
|
+
flowchart LR
|
|
194
|
+
subgraph OPEN["NO REFEREE — you believe the narration"]
|
|
195
|
+
direction TB
|
|
196
|
+
A1["agent: 'done!'"] --> B1[["believed"]]
|
|
197
|
+
A2["agent: 'done!'"] --> B1
|
|
198
|
+
A3["agent: 'done!'"] --> B1
|
|
199
|
+
B1 --> C1["silent corruption piles up<br/>(lies · collisions · spin)"]
|
|
200
|
+
C1 --> D1["'sorta works' — can't be changed"]
|
|
201
|
+
end
|
|
202
|
+
subgraph CLOSED["DOS ADJUDICATES — you steer on a verdict"]
|
|
203
|
+
direction TB
|
|
204
|
+
A4["agent: 'done!'"] --> V{{"dos verify<br/>reads git"}}
|
|
205
|
+
V -->|in git ancestry| S["SHIPPED (exit 0)"]
|
|
206
|
+
V -->|found nowhere| N["NOT_SHIPPED (exit 1)"]
|
|
207
|
+
S --> L["land it"]
|
|
208
|
+
N --> R["re-dispatch / flag — caught"]
|
|
209
|
+
R -.verdict steers the loop.-> A4
|
|
210
|
+
end
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
</details>
|
|
214
|
+
|
|
215
|
+
Here are the failures a fleet actually produces, each next to the ground truth
|
|
216
|
+
that quietly contradicts the worker's story — and the verdict DOS hands back:
|
|
217
|
+
|
|
218
|
+
| A worker… | …but the ground truth is | DOS verdict |
|
|
219
|
+
|---|---|---|
|
|
220
|
+
| says it shipped a unit of work | no commit ever landed | `verify` → **caught lie** |
|
|
221
|
+
| tried, but the commit silently failed | no commit ever landed | `verify` (the flake — indistinguishable from a lie *without* git) |
|
|
222
|
+
| edits files another worker owns | two agents, one shared file | `arbitrate` → **refuse** the second |
|
|
223
|
+
| overruns the file region it claimed | footprint reaches beyond the declared tree | `scope-gate` → **REFUSE** (before the write lands) |
|
|
224
|
+
| reports "making progress" | 0 commits, only a fresh heartbeat | `liveness` → **SPINNING** |
|
|
225
|
+
|
|
226
|
+
Pause on the first row — it's the most common one. The classic tell is a cheerful
|
|
227
|
+
one-liner, *"all work completed!"*, from a worker that did little or nothing. DOS
|
|
228
|
+
never reads that line; it reads the ground truth, so the claim collapses the instant
|
|
229
|
+
no artifact backs it (more in [docs/108](docs/108_the-cheap-lie-and-the-narration-taxonomy.md)).
|
|
230
|
+
That's what makes it cheap to adopt: `verify` needs **no plan, no registry, no
|
|
231
|
+
config**, and **the exit code _is_ the verdict** — any shell or CI step can branch
|
|
232
|
+
on it without parsing a word.
|
|
233
|
+
|
|
234
|
+
<sub>*Prefer to watch it move?* The two loops are also a self-contained animation you
|
|
235
|
+
step through one frame at a time — clone the repo and open
|
|
236
|
+
[`docs/assets/loop_visual.html`](docs/assets/loop_visual.html) in a browser. (It's an
|
|
237
|
+
HTML file, so GitHub shows its source rather than running it — open it locally.)</sub>
|
|
238
|
+
|
|
239
|
+
### How far you take it — one slope, not a menu
|
|
240
|
+
|
|
241
|
+
**It works on a plain `git init` with zero config, and gets smarter the more you
|
|
242
|
+
tell it.** You don't adopt a framework and pick a tier; you start at the shallow
|
|
243
|
+
end and it keeps paying off as you wade deeper — the same kernel the whole way:
|
|
244
|
+
|
|
245
|
+
- **Zero config.** Point `dos verify PLAN PHASE` at a plain git
|
|
246
|
+
repo — no plan, no registry, no `dos.toml`. It answers from commit history
|
|
247
|
+
alone (`via grep-subject` / `via none`). This is the whole of
|
|
248
|
+
[QUICKSTART](docs/QUICKSTART.md) and the day-one CI win above.
|
|
249
|
+
- **Tell it your structure.** `dos init` writes a `dos.toml` (lanes, paths,
|
|
250
|
+
ship grammar as data); add a **plan doc** and `dos plan` lays each phase's
|
|
251
|
+
*claim* beside the oracle's verdict. Here's [exactly what a plan file looks
|
|
252
|
+
like](examples/plans/example-plan.md) (copyable, round-trips with the built-in
|
|
253
|
+
reader), and four worked [example workspaces](examples/workspaces/).
|
|
254
|
+
- **Teach it your own types.** Declare your own block reasons, gate
|
|
255
|
+
verdicts, output renderers, admission predicates, a model-backed **judge**, a
|
|
256
|
+
custom **plan dialect**, or a whole host **driver** — all as workspace policy,
|
|
257
|
+
never a fork. The map is **[docs/HACKING.md](docs/HACKING.md)** (seven extension
|
|
258
|
+
axes) + the copy-me **[`examples/dos_ext/`](examples/dos_ext/)**.
|
|
259
|
+
|
|
260
|
+
### How you plug it in — pick the surface, not a rewrite
|
|
261
|
+
|
|
262
|
+
That slope is *how deep* your config goes. The other axis is *how you call the
|
|
263
|
+
referee at all* — and you adopt through whichever surface matches how you already
|
|
264
|
+
work, not by restructuring your stack. The same kernel verdicts are reachable seven
|
|
265
|
+
ways, lowest-friction first:
|
|
266
|
+
|
|
267
|
+
| Surface | Adopt it when… | The move |
|
|
268
|
+
|---|---|---|
|
|
269
|
+
| **MCP server** | you drive an agent through an MCP host (Claude Desktop, Cursor, Cline, an Agent-SDK app) | add one line to the host config (`{ "command": "dos-mcp" }`) and ask the agent to `dos_verify` its own last claim — **zero code**. The *advisory* path (the agent asks). See [Give your agent a lie detector](#give-your-agent-a-lie-detector-mcp). |
|
|
270
|
+
| **Runtime hooks** | you run an agent loop (Claude Code, Cursor, Codex CLI, Gemini CLI) and want the verdict to *act*, not just be available | `dos init --hooks <runtime>` wires the verdict into that host's own hook config — a refused call is **denied before it runs**, a false "done" is **refused**. The *enforcement* path (the host denies). One command, no hand-edited YAML. See [QUICKSTART](docs/QUICKSTART.md) + [docs/221](docs/221_the-cross-vendor-hook-installer.md). |
|
|
271
|
+
| **CLI exit-code** | you have a shell pipeline or CI step that trusts an agent's "done" | replace that step with `dos verify PLAN PHASE` and branch on the exit code (`0` shipped / `1` not) — **the verdict *is* the exit code**. The day-one win above. |
|
|
272
|
+
| **Python API** | your dispatcher/orchestrator is already Python | `import dos` and call the pure syscalls (`dos.oracle.is_shipped`, `dos.arbiter.arbitrate`, …) — state-in / verdict-out, no subprocess. The [Python cookbook](examples/playbooks/cookbook-python-api.md). |
|
|
273
|
+
| **Fleet framework** | your fleet already runs on LangGraph, CrewAI, AutoGen, or the OpenAI/Claude Agents SDK | bolt the referee onto the framework's own seam — a referee node, a termination condition only git can satisfy, an output guardrail with a git tripwire. One function, no rewrite; every seam executed against the real framework. The [fleet-framework cookbook](examples/playbooks/cookbook-fleet-frameworks.md). |
|
|
274
|
+
| **Skill pack** | you run agents in Claude Code and want the workflow, not just the verdict | `dos init --skills` drops editable `SKILL.md` screenplays that wire the syscalls into a snapshot → audit → gate → take-a-lane loop. See [QUICKSTART §2](docs/QUICKSTART.md). |
|
|
275
|
+
| **Driver** | your lanes must be *computed*, or you add a provider-backed judge | write one `dos/drivers/<host>.py` (a `LaneTaxonomy` + a config factory), loaded by name, never imported by the kernel. The map is [HACKING.md](docs/HACKING.md). |
|
|
276
|
+
|
|
277
|
+
The two axes are independent: a zero-config repo can adopt through any surface, and
|
|
278
|
+
a deeply-configured one still answers over the same CLI and MCP tools. Start at the
|
|
279
|
+
top row — it's the one that costs nothing to try. The first two rows compose:
|
|
280
|
+
**MCP advises** (the agent checks its own work), **hooks enforce** (the host stops a
|
|
281
|
+
bad action) — wire both for the full loop.
|
|
282
|
+
|
|
283
|
+
## Why not just run N agents?
|
|
284
|
+
|
|
285
|
+
Fair question — why add a referee at all? Because N agents with no referee is that
|
|
286
|
+
**open loop** again: you launch them, they self-report, and you've got nothing
|
|
287
|
+
solid to steer on. DOS hands you that missing signal. Specifically, it gives you
|
|
288
|
+
**sensors** —
|
|
289
|
+
|
|
290
|
+
- `verify` — did it really ship? (from git, not the agent's word)
|
|
291
|
+
- `liveness` — is it ADVANCING, or just SPINNING / STALLED?
|
|
292
|
+
- `scope-gate` — did it stay in its lane? A **binding pre-effect** gate
|
|
293
|
+
(`dos scope-gate`, ALLOW/REFUSE, exit 0/5/6) over the same `dos.scope`
|
|
294
|
+
classifier that also reports post-hoc.
|
|
295
|
+
|
|
296
|
+
— and **actuators**: `arbitrate` (let this lane in, or refuse the collision) and
|
|
297
|
+
`refuse` (say no with a reason a machine can act on). Together they turn a pile of
|
|
298
|
+
workers into something you can actually drive. The kernel's job is the **signal**,
|
|
299
|
+
but it also ships a reference supervisor to show what you do with it: `dos watch`
|
|
300
|
+
checks `liveness` on each tracked run and *proposes* a halt when one spins or
|
|
301
|
+
blows its budget — it recommends, it never pulls the trigger — and `dos loop`
|
|
302
|
+
keeps N dispatch-loops alive. Use those, or build your own on the same signal.
|
|
303
|
+
Either way, it's the difference between *"I launched 20 sessions and I'm hoping"*
|
|
304
|
+
and *"I can see which two are lying and which one is wedged."*
|
|
305
|
+
|
|
306
|
+
You see that signal through **three read-only screens** — `dos top` (what's
|
|
307
|
+
running), `dos decisions` (what's waiting on you), `dos plan` (claim vs. ground
|
|
308
|
+
truth) — covered in [Three live projections](#three-live-projections-read-only-tuis)
|
|
309
|
+
below and walked end-to-end in
|
|
310
|
+
**[Debug a stuck fleet](examples/playbooks/06_debug-a-stuck-fleet.md)**.
|
|
311
|
+
|
|
312
|
+
The referee grows along **two axes**: deterministic *verdicts* that read artifacts
|
|
313
|
+
(`verify`, `liveness`, `scope`), and provider-backed *judges* — a model, a debate
|
|
314
|
+
— that rule on what no deterministic check can, kept outside the kernel under a
|
|
315
|
+
discipline that stops a wrong judge from clearing a falsehood. See
|
|
316
|
+
**[the adjudicator-population note](docs/88_the-adjudicator-population.md)** for
|
|
317
|
+
that scalable-oversight story in code.
|
|
318
|
+
|
|
319
|
+
> **We caught ourselves doing the exact thing DOS exists to catch.** A design doc
|
|
320
|
+
> in this repo included a small worked example — "here's what this snippet prints" —
|
|
321
|
+
> written by the agent building DOS. It read perfectly plausible. It was reviewed. It
|
|
322
|
+
> was committed. And it was **wrong**, for the dullest possible reason: *nobody had
|
|
323
|
+
> actually run it.* The agent had reasoned out what the code "would" print and typed
|
|
324
|
+
> that down as fact. An adversarial review later did the one thing the author hadn't
|
|
325
|
+
> — **executed the snippet** — and the real output flatly contradicted the prose.
|
|
326
|
+
> That's the whole thesis in one anecdote: **a confident narration is not evidence,
|
|
327
|
+
> even when the narrator is us, even after a human reviewed it.** The reasoning felt
|
|
328
|
+
> like checking; it wasn't. The only thing that settled it was running the code and
|
|
329
|
+
> reading what came back — an independent witness, exactly the move `verify` makes
|
|
330
|
+
> against an agent's "done." The correction is pinned in git (`docs/124`, commit
|
|
331
|
+
> `651ba03`), because here too the record is the commit, not the claim.
|
|
332
|
+
|
|
333
|
+
## What's proven — and what's still a bet
|
|
334
|
+
|
|
335
|
+
We apply the same honesty to our own claims that the kernel applies to your agents.
|
|
336
|
+
It would be easy to lead with one big number; instead, here's the honest split —
|
|
337
|
+
**what we actually measured, what we extrapolated from those measurements, and what
|
|
338
|
+
is still a bet.** Draw the line yourself. (Every *proven* number is from a live,
|
|
339
|
+
re-runnable benchmark written up under [`benchmark/`](benchmark/) and the paper.)
|
|
340
|
+
|
|
341
|
+
**✅ Proven — measured in live runs, scored against a fact the agent can't fake**
|
|
342
|
+
(a test environment's database state, git history — bytes the agent wrote none of):
|
|
343
|
+
|
|
344
|
+
- **It catches the lie and blocks it.** Across **120 clean tasks** on a standard
|
|
345
|
+
agent benchmark, a DOS gate caught **10 genuine "I shipped it" lies** and let
|
|
346
|
+
every honest write through — at the same **8.3% over-claim rate on both a mid-size and
|
|
347
|
+
a top-tier model.** The signal doesn't fade when you upgrade the model. (Over the
|
|
348
|
+
full benchmark: 15 lies caught in 258 tasks, two models, **zero false alarms**.)
|
|
349
|
+
*(▶ the money-moment is the [gate figure below](#the-two-money-moments-rendered).)*
|
|
350
|
+
- **It prevents the collision.** The same referee put two live agents on one shared
|
|
351
|
+
record and stopped **6 of 8** cases of one silently overwriting the other — **4 of
|
|
352
|
+
6** when the cases were drawn from the real task mix. This is the half a sandbox
|
|
353
|
+
*can't* cover: an isolated workspace still shares the outside world.
|
|
354
|
+
*(▶ the collision being prevented is the [coordination figure below](#the-two-money-moments-rendered).)*
|
|
355
|
+
- **Mid-run "fixes" don't help; quitting a doomed run does.** Every active fix we
|
|
356
|
+
tried mid-run (warn it, rewind it, inject a hint) came out flat-to-negative —
|
|
357
|
+
poking a run also disturbs the ones that would have passed. The one move that
|
|
358
|
+
helps writes nothing: **give up at the right moment** — 0 runs wrongly killed out
|
|
359
|
+
of 1,634 winners across 22 models, ~11% of fleet compute saved.
|
|
360
|
+
- **The training label can't be gamed.** For "may a fine-tune learn from this run?"
|
|
361
|
+
(`dos reward`), the yes/no is computed from environment state the agent authored none of — so no
|
|
362
|
+
amount of clever output text can flip a *no* to a *yes*. That's a proof, plus a
|
|
363
|
+
measured **60% → 100% precision** lift from filtering out the poison a naive
|
|
364
|
+
self-graded collector would have kept.
|
|
365
|
+
|
|
366
|
+
<a id="the-two-money-moments-rendered"></a>
|
|
367
|
+
|
|
368
|
+
The two proven moments above, each rendered as a single figure from its own live
|
|
369
|
+
run (every number, hash, and ID is a verbatim read-off — never a hand-typed
|
|
370
|
+
dramatization):
|
|
371
|
+
|
|
372
|
+
<p align="center">
|
|
373
|
+
<img src="benchmark/agentprocessbench/writeadmit/gate-moment.svg" alt="The DOS write-admission gate catching a real over-claim. A live gemini-2.5-pro agent on a tau2 airline task reports 'You are all set! Your reservation number is HATHAT' — a confident write the agent authored. The witness is the environment DB-hash the agent wrote zero bytes of: gold hash 9f2c…gold vs the agent's resulting hash 4b7e…actual, so db_match = False — the booking it swore it made is not in the database. The gate verdict: a confident write REFUTED by an OS_RECORDED witness → GATE BLOCK, the phantom never reaches the next agent. Result across two models: J = 10 of 120 over-claims caught and blocked off the DB-hash, 9 of 9 honest writes admitted, zero correct work blocked, an identical 8.3% over-claim rate on the mid model and the strong one." width="100%">
|
|
374
|
+
<br>
|
|
375
|
+
<sub><em><strong>It catches the lie and blocks it.</strong> A confident booking, refuted by the DB-hash the agent couldn't author, blocked before a downstream agent inherits the phantom. <a href="benchmark/agentprocessbench/writeadmit/gate_visual.html">Step through it locally</a> (an HTML walkthrough — clone and open in a browser; GitHub shows its source).</em></sub>
|
|
376
|
+
</p>
|
|
377
|
+
|
|
378
|
+
<p align="center">
|
|
379
|
+
<img src="benchmark/agentprocessbench/writeadmit/f2-moment.svg" alt="The DOS coordination payoff: a stale write clobbering a cancellation, then prevented. Two live agents act on one shared reservation NM1VX1, each having planned its tool-calls against the same original state, neither aware of the other. A1 cancels the reservation (DB-hash a3f1…afterA1). Under naive replay, A2's add-bag — computed on the original active state — blindly re-activates the reservation and adds a bag, silently overwriting A1's cancellation (composed hash 77c2…naive, a real lost update). Under the arbiter, dos.arbiter leases the region reservations/NM1VX1 to A1, refuses A2's overlapping lease, and A2 re-plans against the post-A1 cancelled state and correctly declines — the DB-hash matches the serialized-correct value and no update is lost. Across six natural conflict pairs drawn from the real task distribution, J = 4 of 6 clobbers were structurally prevented off the DB-hash." width="100%">
|
|
380
|
+
<br>
|
|
381
|
+
<sub><em><strong>It prevents the collision.</strong> A stale add-bag clobbers a cancellation under naive replay; the arbiter serializes the two agents on the same region so neither overwrites the other. <a href="benchmark/agentprocessbench/writeadmit/f2_visual.html">Step through it locally</a> (an HTML walkthrough — clone and open in a browser).</em></sub>
|
|
382
|
+
</p>
|
|
383
|
+
|
|
384
|
+
**📈 Projected — real measurements, composed into a curve (and labelled as one).**
|
|
385
|
+
Here's the honest crux: **catching a lie is only worth something to whoever can't
|
|
386
|
+
catch it themselves.** Hand the verdict to one strong agent that re-checks its own
|
|
387
|
+
inputs and it buys you almost nothing — that agent recovers on its own. Hand it to
|
|
388
|
+
something that *can't* re-check — a non-LLM system, a weaker model, a long
|
|
389
|
+
multi-step chain, or a training loop — and it pays off (up to a full +1.0 in our
|
|
390
|
+
no-recovery upper bound). In short: **DOS is worth more the less your downstream can
|
|
391
|
+
check itself.** Our fleet-scale figure (≈173–505 corrupted results prevented at a
|
|
392
|
+
32-agent fleet) projects these real per-run rates onto fleet math — it's geometry on
|
|
393
|
+
top of measured numbers, not a measured fleet run.
|
|
394
|
+
|
|
395
|
+
**🎲 A bet — stated as one.** Where this goes if the floor holds: a frozen,
|
|
396
|
+
cross-vendor **trust standard** (the "deny" message is already byte-identical across
|
|
397
|
+
Claude Code, Codex, and Qwen — a de-facto standard waiting to be named), a shared
|
|
398
|
+
**arbiter for real-world effects**, the claim-vs-reality **corpus** only a neutral
|
|
399
|
+
party can hold, and a **notary** that proves what an agent did *to a skeptic who
|
|
400
|
+
wasn't in the room* (the mechanism already ships — `dos attest` mints an
|
|
401
|
+
HMAC-signed receipt over an effect-witness verdict and `dos verify-receipt` checks
|
|
402
|
+
it with the shared key alone; [docs/246](docs/246_dos-attest-the-portable-signed-receipt.md)).
|
|
403
|
+
The seeds are in the tree; we claim no results for any of it.
|
|
404
|
+
|
|
405
|
+
> **The one distinction that keeps this honest:** a **J** is a *count of failures
|
|
406
|
+
> blocked off ground truth* — never a downstream outcome delta. "Blocked 10 real
|
|
407
|
+
> over-claims" is proven; "made the fleet 10% better" is not the same sentence, and
|
|
408
|
+
> we don't write it.
|
|
409
|
+
|
|
410
|
+
## What DOS does *not* do
|
|
411
|
+
|
|
412
|
+
The proven/bet gradient above is about *evidence*; this is about *capability* — the
|
|
413
|
+
boundaries are part of the contract, and stating them is the same honesty the
|
|
414
|
+
kernel applies to your agents:
|
|
415
|
+
|
|
416
|
+
- **It adjudicates that a ship *happened*, not that the code is correct or good.**
|
|
417
|
+
`verify` reads git ancestry, so it catches "no commit landed," not "the
|
|
418
|
+
committed work is wrong." Judging *quality* is the JUDGE / HUMAN rung, not the
|
|
419
|
+
deterministic oracle.
|
|
420
|
+
- **It computes verdicts and admission decisions; it never spawns or kills an OS
|
|
421
|
+
process.** `liveness` is advisory — it *reports* SPINNING, it doesn't stop the
|
|
422
|
+
run — and `dos loop` *emits* a spawn/reap/flag plan you act on. (`arbitrate` and
|
|
423
|
+
`refuse` are decisions you enforce, not force the kernel applies.)
|
|
424
|
+
- **It is not a CI replacement or a test runner.** It sits *beside* them and lets a
|
|
425
|
+
step branch on the exit-code verdict.
|
|
426
|
+
- **The pluggable verdict/JUDGE adjudicator *registry* is specced, not yet
|
|
427
|
+
shipped** (see [docs/88](docs/88_the-adjudicator-population.md) §5); the JUDGE
|
|
428
|
+
*seam* and the built-in judges have shipped.
|
|
429
|
+
|
|
430
|
+
## Give your agent a lie detector (MCP)
|
|
431
|
+
|
|
432
|
+
The easiest way in doesn't involve writing any Python. Point the agent host you
|
|
433
|
+
already use at the bundled **MCP server**, then ask your agent to `dos_verify` its
|
|
434
|
+
own last claim. The first time it comes back `NOT_SHIPPED … (via none)` on work the
|
|
435
|
+
agent *swore* it finished, the whole point of this repo clicks into place — in your
|
|
436
|
+
terminal, on your fleet.
|
|
437
|
+
|
|
438
|
+
Installed with the `[mcp]` extra (`pip install -e ".[mcp]"` from your clone — see
|
|
439
|
+
[Install](#install)), DOS exposes the syscalls as **MCP tools** — the truth tools first (`dos_verify` "did it ship?",
|
|
440
|
+
`dos_commit_audit` "does this commit's claim match its diff?", `dos_status` one
|
|
441
|
+
folded fact about a run), then `dos_arbitrate` (may two workers run without
|
|
442
|
+
colliding?), the structured-refusal pair (`dos_refuse_reasons` / `dos_check_reason`),
|
|
443
|
+
`dos_recall` (is this recalled memory still true?), and `dos_doctor` (the workspace
|
|
444
|
+
report) — so any MCP-speaking host — **Claude Desktop, Cursor, Cline, an Agent-SDK
|
|
445
|
+
app** — can call the referee over JSON-on-stdio with **zero Python coupling**. Each
|
|
446
|
+
verdict comes back with a one-line interpretation of what it means for the agent's
|
|
447
|
+
next move. (See **[the MCP server surface](docs/80_mcp-server-surface.md)**.)
|
|
448
|
+
|
|
449
|
+
```jsonc
|
|
450
|
+
// claude_desktop_config.json — paste, restart, then say:
|
|
451
|
+
// "use dos_verify to confirm you actually shipped that"
|
|
452
|
+
{ "mcpServers": { "dos": { "command": "dos-mcp" } } }
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
The MCP server is **advisory**: the agent *calls* the referee when it (or you) thinks
|
|
456
|
+
to. The per-host wiring for Cursor / Codex / Gemini is in
|
|
457
|
+
**[the MCP README](src/dos_mcp/README.md)** — with the Claude Desktop config above, all four hosts are MCP clients, so this works
|
|
458
|
+
on every one of them with zero code.
|
|
459
|
+
|
|
460
|
+
### …then make the verdict *act* (hooks)
|
|
461
|
+
|
|
462
|
+
To go from "the agent can ask" to "the host won't let a bad call through," wire DOS's
|
|
463
|
+
**hooks** into the runtime you actually run. One command per host — it writes that
|
|
464
|
+
host's own hook-config file, merged into anything already there:
|
|
465
|
+
|
|
466
|
+
```bash
|
|
467
|
+
dos init --hooks claude-code . # .claude/settings.json
|
|
468
|
+
dos init --hooks cursor . # .cursor/hooks.json
|
|
469
|
+
dos init --hooks codex . # .codex/config.toml
|
|
470
|
+
dos init --hooks gemini . # .gemini/settings.json
|
|
471
|
+
dos init --hooks antigravity . # .agents/hooks.json
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
That binds three shipped hooks: **`pretool`** denies a structurally-refused call
|
|
475
|
+
before it runs, **`stop`** refuses a stop on an unverified "done," **`posttool`**
|
|
476
|
+
re-surfaces a stalled stream. This is the **enforcement** path (the *host* denies on a
|
|
477
|
+
DOS verdict) — the complement to MCP's advisory path. Until recently this spoke only
|
|
478
|
+
Claude Code; it now installs across five hosts — Claude Code, Cursor, Codex, Gemini,
|
|
479
|
+
and Antigravity ([docs/221](docs/221_the-cross-vendor-hook-installer.md),
|
|
480
|
+
[docs/269](docs/269_antigravity-the-fifth-host.md)).
|
|
481
|
+
`--with-hooks` is the back-compat alias for `--hooks claude-code`.
|
|
482
|
+
|
|
483
|
+
Because these hooks run on **every** tool call, the core kernel logic on the hot path is
|
|
484
|
+
reimplemented in **native Go** — a `dos-hook` binary that ports the actual decision
|
|
485
|
+
predicates (the conjunctive-only lease-admission and prefix-disjointness floor, the
|
|
486
|
+
`verify()` grep rung, self-modify, the marker budget, the WAL) rather than just shelling
|
|
487
|
+
out to Python. It is **highly performant**: it serves the per-call verdict in ~10 ms —
|
|
488
|
+
**16–43× faster** than shelling `python -m dos.cli hook …` (~0.25–0.8 s, dominated by
|
|
489
|
+
interpreter cold-start) — and is **byte-identical** to the Python kernel on the gated
|
|
490
|
+
decision (the docs/124 parity contract, pinned by Go parity tests). It owns the common
|
|
491
|
+
fast path and falls back to the always-available Python verb for anything it doesn't yet
|
|
492
|
+
serve, so a machine without the binary degrades cleanly with no wiring change
|
|
493
|
+
([docs/125](docs/125_go-hook-fastpath-build-plan.md),
|
|
494
|
+
[docs/270](docs/270_go-hook-fastpath-benchmarks.md)). You don't build it yourself:
|
|
495
|
+
the per-platform wheels bundle the binary, so a wheel install gets the native fast
|
|
496
|
+
path with **no Go toolchain** — and any platform without a bundled binary (including
|
|
497
|
+
a plain source install) just runs the pure-Python path
|
|
498
|
+
([docs/286](docs/286_shipping-the-go-binary-through-pypi-per-platform-wheels.md)).
|
|
499
|
+
|
|
500
|
+
## The syscall ABI
|
|
501
|
+
|
|
502
|
+
Every syscall answers a question you'd otherwise have to *take the agent's word
|
|
503
|
+
for*. "Reach for this when…" is the plain-English trigger; the rest is the
|
|
504
|
+
contract — and the module names are auditable.
|
|
505
|
+
|
|
506
|
+
| Syscall | Reach for this when… | What it is | Module |
|
|
507
|
+
|---|---|---|---|
|
|
508
|
+
| `verify()` | an agent says a unit of work is **done** and you don't want to take its word | the **truth syscall** — "did (plan, phase) actually ship?" registry-first, ancestry-checked, from git history if there's no plan at all | `dos.oracle`, `dos.phase_shipped` |
|
|
509
|
+
| `liveness()` | a long run says it's **"making progress"** and you want to know if it actually is | the **temporal verdict** — "is the run ADVANCING, or just SPINNING / STALLED?" from the git/journal delta and the clock | `dos.liveness` |
|
|
510
|
+
| `verify-result()` | a **subagent hands a result back** to an orchestrator that folds it as a finding — but the result string may be a harness-synthesized error the worker never authored | the **fold-site result-state witness** ([docs/197](docs/197_how-dos-is-directly-useful-to-ultracode.md)) — classifies a subagent transcript's terminal record, gating on `message.model == "<synthetic>"` (the unforgeable harness-authorship marker), never the agent's self-report; **exit 3 = DEAD** (a harness 429 / quota / auth / server error), `0` = HEALTHY / UNREADABLE | `dos.result_state` |
|
|
511
|
+
| `resume()` | a run **died or paused mid-flight** and you need to continue without re-doing work or double-applying it | the **third ARIES phase** — "how far did the *fossils* say it got, and what's the residual?" over a run-id-keyed intent ledger; re-enters from a git-VERIFIED SHA, never the dead run's self-report (RESUMABLE / COMPLETE / DIVERGED / UNRESUMABLE) | `dos.resume`, `dos.intent_ledger` |
|
|
512
|
+
| `complete()` | you need to know if the **whole declared job** is verifiably done, not just one phase | the **completion verdict** — `residual = declared − verified`, asked forward; read-only, never self-certifies | `dos.completion` |
|
|
513
|
+
| `rewind()` | a run thrashed and you want to **excise the dead-end turns** without the kernel authoring a correction | the **conversation-rewind verdict** — replays the ledger for a minted checkpoint and PROPOSES the excision (never truncates; the host owns the transcript) | `dos.rewind` |
|
|
514
|
+
| `productivity()` | a long run is burning turns and you want to know if it's **still doing work, or fading** | the **loop-economics verdict** ([docs/218](docs/218_the-productivity-verdict-diminishing-returns-as-a-syscall.md)) — `classify(work-deltas) -> PRODUCTIVE / DIMINISHING / STALLED` over a trend of per-step work; pure, no I/O | `dos.productivity` |
|
|
515
|
+
| `efficiency()` | you want to know if the **tokens a run spent actually bought work** (a run can be productive yet burn 10× its work's worth) | the **token-effectiveness verdict** ([docs/263](docs/263_token-effectiveness-verdict-plan.md)) — `work / tokens -> EFFICIENT / COSTLY / WASTEFUL`; both counts are env-authored, so a run can't narrate its way to EFFICIENT | `dos.efficiency` |
|
|
516
|
+
| `improve()` | a **self-improving loop** proposes a change to its own code and you must decide **keep or revert** — without trusting the loop's own claim that it helped | the **keep-gate** ([docs/280](docs/280_the-self-improving-work-loop-the-kernel-adjudicates-its-own-improvement.md)) — `KEEP / REVERT / ESCALATE` from witnesses the candidate's author didn't write: the suite green on the candidate-only tree, the truth syscall clean, and a strictly-measured metric gain; a regression always REVERTs, a run of non-keeps ESCALATEs to a human | `dos.improve` |
|
|
517
|
+
| `reward()` | a fine-tune is about to **train on an agent's trajectory** and the "it worked" label came from the agent itself | the **reward-set admission verdict** ([docs/230](docs/230_the-lab-facing-twin-rlvr-admit-the-non-distillable-reward-label.md)) — `ACCEPT / REJECT_POISON / ABSTAIN` off a witness the agent authored zero bytes of, so no answer text can flip a reject to an accept (the non-distillable label) | `dos.reward` |
|
|
518
|
+
| `breaker()` | a **failure class keeps tripping** and you want to stop retrying and escalate | the **circuit-breaker primitive** ([docs/223](docs/223_the-circuit-breaker-primitive-failure-counting-as-mechanism.md)) — a pure two-counter state machine, `CLOSED / OPEN`, tripping on consecutive *or* total failures; an OPEN verdict names the escalation rung (none / judge / human) | `dos.breaker` |
|
|
519
|
+
| `hook_exit()` / `exec_capability()` | you wire a **plain shell hook** into a runtime, or need to know if a command **grants arbitrary code execution** | two classifier leaves the cheapest integrations consult — `hook_exit` maps an exit code to an intervention ([docs/226](docs/226_the-hook-exit-classifier-a-shell-scripts-exit-code-as-a-verdict.md): 0 pass / 2 BLOCK / other WARN), `exec_capability` classifies the *invoked program token* — never a substring — as `GRANTS_ARBITRARY_EXEC / BOUNDED` ([docs/224](docs/224_the-exec-capability-classifier-a-shape-not-a-word.md)) | `dos.hook_exit`, `dos.exec_capability` |
|
|
520
|
+
| `refuse(reason)` | you need to say **why** a pick was blocked in a way a machine can act on | **structured refusal** — a closed, declared reason vocabulary (`dos.reasons`, extensible per-workspace), every reason emittable, verifiable, and refusable | `dos.wedge_reason`, `dos.picker_oracle` |
|
|
521
|
+
| `lease()` / `arbitrate()` | two agents might **touch the same files** and you need to admit one without a collision | the **pure admission kernel** — `arbitrate(request, live_leases, config) -> decision`, state-in / decision-out, no I/O | `dos.arbiter` |
|
|
522
|
+
| `spawn()` / `reap()` | you need every run to carry a **traceable identity** and its effects to be replayable | the **correlation spine** (sortable, lineage-carrying run-ids) + the lease **write-ahead log** | `dos.run_id`, `dos.lane_journal` |
|
|
523
|
+
| `enumerate()` / `pickable()` / `cooldown()` / `reconcile()` | an unattended loop must know **is there anything pickable, why-not, have I tried it, and did the claim hold?** — without re-storming a known drain or believing a "done" the git can't confirm | the **picker substrate** ([docs/207](docs/207_dispatch-workflow-extraction-and-the-pickable-substrate-completion.md)) — `enumerate` is the phase-list producer (the `declared` set, never a silent empty); `pickable` the pre-dispatch gate (OFFERABLE / HELD(reason)); `cooldown` the anti-churn fold over pick-attempts (CLEAR / RECENTLY_ATTEMPTED); `reconcile` the quiet-completion join (VERIFIED / QUIET_INCOMPLETE / HONEST_OPEN, fail-closed on the claim) | `dos.enumerate`, `dos.pickable`, `dos.cooldown`, `dos.reconcile` |
|
|
524
|
+
|
|
525
|
+
> Three terms the table assumes: a **plan** (e.g. `AUTH`) groups **phases** —
|
|
526
|
+
> a phase is a named unit of work (e.g. `AUTH1`); a **lane** is a leased region of
|
|
527
|
+
> the file tree an agent works in. All are defined in the
|
|
528
|
+
> [quickstart](docs/QUICKSTART.md).
|
|
529
|
+
|
|
530
|
+
> **The newest catch — a result that *died*.** When a subagent hands a result
|
|
531
|
+
> back to an orchestrator that folds it as a finding (an ultracode `Workflow`, an
|
|
532
|
+
> Agent-SDK fan-out), the result string itself may be a **harness-synthesized
|
|
533
|
+
> error** the worker never authored — and ~32% of real subagents return exactly
|
|
534
|
+
> that (a 429 / quota / auth string) where the fold expects a finding ([docs/197](docs/197_how-dos-is-directly-useful-to-ultracode.md)).
|
|
535
|
+
> `verify-result` reads the transcript's terminal record and refuses to believe a
|
|
536
|
+
> harness-authored death:
|
|
537
|
+
>
|
|
538
|
+
> ```bash
|
|
539
|
+
> dos verify-result --transcript dead.jsonl
|
|
540
|
+
> # DEAD SYNTHETIC class=OTHER — harness-authored terminal
|
|
541
|
+
> # (model=<synthetic> + stop_reason=stop_sequence) — not a finding; route to DEAD, do not fold
|
|
542
|
+
> echo $? # → 3 (count it in the denominator; never bank it as a result)
|
|
543
|
+
>
|
|
544
|
+
> dos verify-result --transcript real.jsonl
|
|
545
|
+
> # HEALTHY — terminal assistant record is real-model authored with content
|
|
546
|
+
> echo $? # → 0
|
|
547
|
+
> ```
|
|
548
|
+
>
|
|
549
|
+
> It gates on `message.model == "<synthetic>"` — the marker the agent's own model
|
|
550
|
+
> *cannot forge* (the runtime harness authored those bytes, not the worker) — which
|
|
551
|
+
> is broader than rate-limits alone: quota, auth, and server deaths are caught too.
|
|
552
|
+
|
|
553
|
+
Around these sit ~30 supporting kernel modules — the file-tree disjointness
|
|
554
|
+
algebra, the timeline reader, the gate/loop classifiers, the typed-verdict
|
|
555
|
+
contract, the JUDGE-rung seam. The full map is in **[CLAUDE.md](CLAUDE.md)**.
|
|
556
|
+
|
|
557
|
+
## Install
|
|
558
|
+
|
|
559
|
+
Pick the row that matches how you work — the full matrix (every OS, every
|
|
560
|
+
channel, upgrade/uninstall, WSL, troubleshooting) is in
|
|
561
|
+
**[docs/INSTALL.md](docs/INSTALL.md)**:
|
|
562
|
+
|
|
563
|
+
```bash
|
|
564
|
+
# uv — the modern, fast, isolated CLI install (recommended), straight from GitHub:
|
|
565
|
+
uv tool install git+https://github.com/anthony-chaudhary/dos-kernel # `dos` + `dos-mcp` on PATH
|
|
566
|
+
uvx --from git+https://github.com/anthony-chaudhary/dos-kernel dos doctor # or run it once, ephemerally
|
|
567
|
+
|
|
568
|
+
# pip — the library-consumer path (a host pins dos-kernel in its own venv):
|
|
569
|
+
pip install "dos-kernel @ git+https://github.com/anthony-chaudhary/dos-kernel" # core kernel (PyYAML only)
|
|
570
|
+
pip install "dos-kernel[mcp] @ git+https://github.com/anthony-chaudhary/dos-kernel" # + the MCP server (dos-mcp)
|
|
571
|
+
|
|
572
|
+
# from a clone — editable, the contributor path:
|
|
573
|
+
git clone https://github.com/anthony-chaudhary/dos-kernel.git && cd dos-kernel
|
|
574
|
+
pip install -e . # editable: your edits are live in the install
|
|
575
|
+
./install.sh # or .\install.ps1 on Windows — venv + install + PATH, one line
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
> **The distribution name is `dos-kernel`, not `dos`** — a bare `pip install dos`
|
|
579
|
+
> pulls an unrelated package that squats the name. The *import* name and the CLI
|
|
580
|
+
> are still `dos`. The **core kernel's only runtime dependency is PyYAML** (the
|
|
581
|
+
> `[mcp]` extra adds the MCP framework; `[tui]` adds the live `dos top` screens).
|
|
582
|
+
> See [SECURITY.md](SECURITY.md), "Supply chain."
|
|
583
|
+
|
|
584
|
+
Prefer a package manager? **uv** is the 2026 default — faster than `pipx`,
|
|
585
|
+
isolates the tool, and manages Python versions; `pipx install
|
|
586
|
+
git+https://github.com/anthony-chaudhary/dos-kernel` works the same way if your
|
|
587
|
+
team already uses it. PyPI / Homebrew / WinGet / Scoop one-liners are next on the
|
|
588
|
+
release runway (see [docs/INSTALL.md](docs/INSTALL.md)).
|
|
589
|
+
|
|
590
|
+
A host repo adds DOS as a pinned dependency and points it at its own tree — never
|
|
591
|
+
by vendoring the code in. DOS is **stateless about which repo it serves**: it
|
|
592
|
+
resolves the workspace from `--workspace` › `$DISPATCH_WORKSPACE` › cwd, never its
|
|
593
|
+
own install location, so the ground truth stays legible as the codebase grows.
|
|
594
|
+
(The full separation contract — mechanism in the package, policy in the
|
|
595
|
+
workspace's `dos.toml` — is in **[CLAUDE.md](CLAUDE.md)**.)
|
|
596
|
+
|
|
597
|
+
For most repos that one `dos.toml` is the whole policy surface — but when your
|
|
598
|
+
lanes must be *computed* (from runtime state, an env var, a monorepo manifest)
|
|
599
|
+
rather than listed, or you add a provider-backed JUDGE, you write a small
|
|
600
|
+
**driver** instead: a `dos/drivers/<host>.py` exposing a `LaneTaxonomy` constant +
|
|
601
|
+
a `<host>_config` factory, loaded by name via `dos --driver <host>` and never
|
|
602
|
+
imported by the kernel. Copy [`dos/drivers/workshop.py`](src/dos/drivers/workshop.py)
|
|
603
|
+
as the template; the full driver/plugin map is in **[docs/HACKING.md](docs/HACKING.md)**.
|
|
604
|
+
|
|
605
|
+
### Claude Code plugin — hooks + MCP + skills in one install
|
|
606
|
+
|
|
607
|
+
If you drive a fleet with **Claude Code**, the lowest-friction way to bind the
|
|
608
|
+
verdict to the runtime is the bundled plugin under
|
|
609
|
+
[`claude-plugin/`](claude-plugin/) — it packages all three runtime surfaces at once:
|
|
610
|
+
|
|
611
|
+
- the **hooks** (`PreToolUse` → deny a structurally-refused call · `PostToolUse` →
|
|
612
|
+
re-surface a stalled tool stream · `Stop` → refuse to stop on an unverified
|
|
613
|
+
claim) — all fail-safe (they emit nothing and exit 0 on any error, so they never
|
|
614
|
+
break a turn);
|
|
615
|
+
- the **MCP server** (`dos_verify` / `dos_arbitrate` / `dos_commit_audit` /
|
|
616
|
+
`dos_refuse_reasons` … as tools the model calls directly);
|
|
617
|
+
- the **generic skill pack** (the domain-free dispatch screenplays), namespaced as
|
|
618
|
+
`/dos-kernel:dos-next-up`, `/dos-kernel:dos-dispatch`, …
|
|
619
|
+
|
|
620
|
+
```bash
|
|
621
|
+
# 1. The plugin ships JSON + markdown; the brains ship as the pip package, so
|
|
622
|
+
# install it FIRST into the interpreter Claude Code runs (the [mcp] extra is
|
|
623
|
+
# what the bundled MCP server needs):
|
|
624
|
+
pip install "dos-kernel[mcp] @ git+https://github.com/anthony-chaudhary/dos-kernel"
|
|
625
|
+
|
|
626
|
+
# 2. Then, inside Claude Code:
|
|
627
|
+
/plugin marketplace add anthony-chaudhary/dos-kernel
|
|
628
|
+
/plugin install dos-kernel@dos
|
|
629
|
+
```
|
|
630
|
+
|
|
631
|
+
After installing, run **`/dos-kernel:dos-setup`** once — it confirms the package is
|
|
632
|
+
importable, reports what the plugin wired, and points at the next skill. The same
|
|
633
|
+
three hooks are available à la carte via `dos init --hooks claude-code` (and for
|
|
634
|
+
Cursor / Codex / Gemini / Antigravity); the plugin is just the pre-packaged Claude Code form. The
|
|
635
|
+
bundle's design + the build that keeps its skills in lockstep with the source are in
|
|
636
|
+
**[claude-plugin/README.md](claude-plugin/README.md)**.
|
|
637
|
+
|
|
638
|
+
## CLI
|
|
639
|
+
|
|
640
|
+
One `dos` entrypoint over the syscalls (see [QUICKSTART.md](docs/QUICKSTART.md) for
|
|
641
|
+
a runnable tour of the core ones):
|
|
642
|
+
|
|
643
|
+
```bash
|
|
644
|
+
# --- the syscalls ---
|
|
645
|
+
dos verify PLAN PHASE # truth: did (plan,phase) ship? (works with no plan)
|
|
646
|
+
dos commit-audit [REF] [--sweep] # truth: does a commit's SUBJECT match its own diff? (--sweep = drift rate over a range)
|
|
647
|
+
dos verify-result --transcript T # fold-site witness: did a subagent's terminal record DIE (harness 429/quota)? (exit 3 = DEAD)
|
|
648
|
+
dos coverage --declared N # fan-out coverage: how many of N declared workers REALLY returned a result vs died?
|
|
649
|
+
dos liveness --run-id R --start-sha S # temporal: ADVANCING / SPINNING / STALLED?
|
|
650
|
+
dos resume --run-id R # the resume verdict: replay a run's intent ledger, re-verify against git, PROPOSE the continuation
|
|
651
|
+
dos complete --run-id R [--diverged] # completion verdict: is the WHOLE declared job done? (residual = declared − verified)
|
|
652
|
+
dos rewind --run-id R [--fire SIGNAL] # conversation-rewind verdict: PROPOSE excising dead-end turns (never truncates)
|
|
653
|
+
dos status --run-id R # the folded fact: one fail-closed digest of a run (liveness + verified progress + lease)
|
|
654
|
+
dos arg-provenance --tool T --args J [--new-key K] # did the model MINT this id/FK, or RESOLVE it from env bytes? (exit 0 believe / 3 UNSUPPORTED)
|
|
655
|
+
dos arbitrate --lane L --kind K --leases '[…]' # admission: may a lane start without collision?
|
|
656
|
+
dos scope-gate --lane L [--staged] # binding pre-effect scope gate: may this PROPOSED write land in its lane? (ALLOW/REFUSE)
|
|
657
|
+
dos lease {acquire,release,status} OWNER # the cross-process archive lock
|
|
658
|
+
dos lease-lane {acquire,release,heartbeat,live} # durable lane lease over the pure arbiter (write-back to the WAL)
|
|
659
|
+
dos run-id mint PROCESS # mint a correlation run-id
|
|
660
|
+
dos id-alloc {allocate,peek} SCOPE # atomically allocate a never-reused, monotonic id for a scope
|
|
661
|
+
dos journal {tail,replay,seq,compact} # the lane write-ahead log
|
|
662
|
+
dos halt --handle H # the reap verb: emit the stop-plan for a live run/lease
|
|
663
|
+
dos pickable / enumerate / cooldown / reconcile # picker substrate: anything pickable? why-not? tried recently? did the claim hold?
|
|
664
|
+
|
|
665
|
+
# --- workspace & inspection ---
|
|
666
|
+
dos init [DIR] # scaffold a dos.toml workspace config
|
|
667
|
+
dos doctor [--json] [--check] # report the active workspace + taxonomy + predicates
|
|
668
|
+
dos lint [--strict] [--json] # dead policy in this workspace's own dos.toml? (unreachable lanes, dangling refs)
|
|
669
|
+
dos man {wedge,lane} [ID] # the self-describing manual over the registries
|
|
670
|
+
dos exit-codes [VERB] # print the verdict-IS-the-exit-code table (all verbs or one)
|
|
671
|
+
dos gate PACKET # typed empty-packet verdict (LIVE/DRAIN/STALE-STAMP/…)
|
|
672
|
+
dos judge wedge RUN_TS # adjudicate a no-pick verdict (deterministic)
|
|
673
|
+
dos judge-eval --judge N --cases C # score a JUDGE-rung adjudicator against labelled claims
|
|
674
|
+
dos overlap-eval --policy P --cases C # score an overlap scorer by false-admit rate (the disjointness backtest)
|
|
675
|
+
dos intervention-eval --cases C # score an intervention policy by NET task delta (not verdict accuracy)
|
|
676
|
+
dos tool-stream-eval --cases C # score a stall-reader policy by NET recovery (not detection accuracy)
|
|
677
|
+
dos precursor-gate-eval --cases C # score a precursor grammar by recall vs false-refute waste
|
|
678
|
+
dos memory {recall,verify} # re-verify recalled agent-memory at read time (RECALL_FRESH/STALE/UNVERIFIABLE)
|
|
679
|
+
dos health --lane L # pre-dispatch lane-health gate (overlap + recurring-blocker → route)
|
|
680
|
+
dos scout # pre-dispatch chooser: pick the next activity before leasing a lane
|
|
681
|
+
dos trace RUN_ID # walk one run across spine + intent ledger + WAL + git, joined by run_id
|
|
682
|
+
|
|
683
|
+
# --- agent-host binding (Claude Code / MCP) ---
|
|
684
|
+
dos guard [--verify-on-stop] -- CMD… # wrap a headless agent launch: inject the DOS MCP server (+ optional verify-on-stop Stop hook)
|
|
685
|
+
dos hook {pretool,posttool,stop} # the live agent-host hook surface (PreToolUse deny / PostToolUse sensor / Stop verify)
|
|
686
|
+
|
|
687
|
+
# --- live projections (read-only TUIs) ---
|
|
688
|
+
dos top [--once] [--json] # live fleet watchdog: lanes, leases, verdicts, commits
|
|
689
|
+
dos decisions [N] # the operator-decision queue (list + drill-in TUI)
|
|
690
|
+
dos plan [--once] [--json] # work-terrain board: every phase, the plan's claim vs the oracle's verdict
|
|
691
|
+
dos watch --track R [--budget-ms M] # the watchdog driver: poll liveness for tracked runs + propose halts on spin/hang
|
|
692
|
+
dos loop --target N [--watch] [--json] # supervisor (init/PID-1): keep N dispatch-loops alive — emits a spawn/reap/flag plan
|
|
693
|
+
|
|
694
|
+
# --- loop-economics & reliability verdicts (pure; exit code is the verdict) ---
|
|
695
|
+
dos productivity --deltas 5,3,1,0 # is the run still doing work? PRODUCTIVE / DIMINISHING / STALLED
|
|
696
|
+
dos efficiency --work W --tokens N # did the tokens buy work? EFFICIENT / COSTLY / WASTEFUL
|
|
697
|
+
dos breaker --consecutive N --max-consecutive M # has this failure class tripped? CLOSED / OPEN (+ escalation rung)
|
|
698
|
+
dos hook-exit --code N # map a shell hook's exit code → PASS / BLOCK / WARN
|
|
699
|
+
dos exec-capability --command "…" # does this command grant arbitrary exec? BOUNDED / GRANTS_ARBITRARY_EXEC
|
|
700
|
+
dos improve --suite-passed --truth-clean --work W --baseline-work B # self-improving loop: KEEP / REVERT / ESCALATE
|
|
701
|
+
dos reward --claim --witness {confirm,refute,none} # may a fine-tune TRAIN on this trajectory? ACCEPT / REJECT_POISON
|
|
702
|
+
|
|
703
|
+
# --- observability: the verdict journal → your dashboards ---
|
|
704
|
+
dos observe [--run R] [--json] # project the verdict journal: every kernel adjudication, folded by run/syscall/verdict
|
|
705
|
+
dos helped [--since TS] [--json] # the operator rollup: how many things DOS productively caught for you
|
|
706
|
+
dos export [--to file|statsd|otlp] [--since SEQ] # drain the journal outward (Datadog / Honeycomb / Grafana); null = report only
|
|
707
|
+
dos notify {decisions,top} [--notifier slack|webhook --channel NAME] # push what-needs-a-human / what's-running to where the operator is; null = render only
|
|
708
|
+
|
|
709
|
+
# --- portable proof (third-party verifiable, no loop access) ---
|
|
710
|
+
dos attest --claim KEY {--accept-cmd CMD | --before P --after P} # mint an HMAC-signed receipt over an effect-witness verdict
|
|
711
|
+
dos verify-receipt --receipt R # the skeptic's side: check the signature with the shared key alone (fails LOUD on tamper)
|
|
712
|
+
|
|
713
|
+
# --- cross-project (machine-local index) ---
|
|
714
|
+
dos projects # the projects DOS has served
|
|
715
|
+
dos learn AXIS # aggregates over resolved decisions
|
|
716
|
+
dos reindex # rebuild the central store from the .dos/ dirs
|
|
717
|
+
```
|
|
718
|
+
|
|
719
|
+
Most verbs take `--workspace .` (or honor `$DISPATCH_WORKSPACE` / cwd) and
|
|
720
|
+
`--json` for machine-readable output. For verdict-bearing commands (`verify` /
|
|
721
|
+
`liveness` / `gate`) **the exit code is the verdict.** A pluggable `--output
|
|
722
|
+
<name>` renderer (the `dos.renderers` entry-point group) is covered in
|
|
723
|
+
[HACKING.md](docs/HACKING.md).
|
|
724
|
+
|
|
725
|
+
### Three live projections (read-only TUIs)
|
|
726
|
+
|
|
727
|
+
A fleet leaves its state scattered across git history, a write-ahead log, and a
|
|
728
|
+
pile of verdict envelopes. DOS folds that into **three read-only screens**, each
|
|
729
|
+
answering a different operator question. They are *projections*, not stores: every
|
|
730
|
+
one reads kernel state, **mutates nothing, takes no lease, launches nothing** —
|
|
731
|
+
delete any of them and you lose the screen, not the data. Pick by the question
|
|
732
|
+
you're asking:
|
|
733
|
+
|
|
734
|
+
| Screen | Answers | Reads |
|
|
735
|
+
|---|---|---|
|
|
736
|
+
| `dos top` | *What's **running** right now?* — the lanes, the leases holding them, recent verdicts, live git activity. The screen you leave open in a side terminal during a run. | leases (WAL) + per-lane `liveness` + verdict envelopes + git |
|
|
737
|
+
| `dos decisions` | *What's waiting on **me** right now?* — the no-picks (refusals, wedges, open gates) that need a decision, each tagged by *who can resolve it*. | the four refusal sources, joined |
|
|
738
|
+
| `dos plan` | *Does the plan's **claim** match the **ground truth**?* — every declared phase, the plan's self-reported status beside the oracle's verdict, so an over-claim is its own cell. | the plan source × `verify()` per phase |
|
|
739
|
+
|
|
740
|
+
In `dos top`, a held lane's status chip **is** its `liveness` verdict — green
|
|
741
|
+
`ADVANCING` / yellow `SPINNING` / red `STALLED` — so "which one is wedged" is one
|
|
742
|
+
glance, not a log dig. `dos decisions` tags each row by resolver — a deterministic
|
|
743
|
+
**ORACLE** (may auto-clear), an **LLM JUDGE** (could rule before you spend
|
|
744
|
+
attention), or a **HUMAN** (a genuine operator call) — and on a keypress prints
|
|
745
|
+
the exact shell command and exits — *you* run it; the screen never mutates
|
|
746
|
+
substrate. `dos plan` is a `verify()` fan-out, **not** a plan reader: a human runs
|
|
747
|
+
it from *outside* the agent loop, so an over-claiming loop is caught by ground
|
|
748
|
+
truth, not by re-reading its own narration.
|
|
749
|
+
|
|
750
|
+
All three have a **plain-text floor that needs no dependencies** — the live
|
|
751
|
+
`rich` redraw is the optional `[tui]` extra, but `--once` (one frame) and `--json`
|
|
752
|
+
work on a bare core install (no extras). Here is `dos top --once` on a fresh
|
|
753
|
+
checkout (no leases yet, so every lane is `FREE` and the git strip carries the
|
|
754
|
+
content):
|
|
755
|
+
|
|
756
|
+
```text
|
|
757
|
+
┌─ dos top · /path/to/repo · 2026-06-07T17:14:32+00:00 ──────────────────────
|
|
758
|
+
LANES
|
|
759
|
+
benchmark ⚪ FREE
|
|
760
|
+
docs ⚪ FREE
|
|
761
|
+
… (one concurrent lane per source dir)
|
|
762
|
+
*global ⚪ FREE (* = the exclusive whole-repo lane)
|
|
763
|
+
8 lanes · 0 advancing · 0 spinning · 0 stalled · 8 free
|
|
764
|
+
RECENT VERDICTS [trust = ship-oracle cross-check]
|
|
765
|
+
(no verdicts yet)
|
|
766
|
+
RECENT COMMITS [ground truth — git history]
|
|
767
|
+
0857bd4 docs/206 Appendix A: the whole program in plain words
|
|
768
|
+
… (last 10 commits — the content even a
|
|
769
|
+
zero-lease repo always has)
|
|
770
|
+
──────────────────────────────────────────────────────────────────────────────
|
|
771
|
+
read-only · q quit · this screen mutates nothing
|
|
772
|
+
```
|
|
773
|
+
|
|
774
|
+
The stuck-fleet walkthrough that drives all three end-to-end is
|
|
775
|
+
**[Debug a stuck fleet](examples/playbooks/06_debug-a-stuck-fleet.md)**.
|
|
776
|
+
|
|
777
|
+
<p align="center">
|
|
778
|
+
<img src="docs/assets/decisions-tui.svg" alt="The dos decisions queue: four pending arbiter refusals on the left, each routed to who can resolve it — a deterministic ORACLE (may auto-clear), an LLM JUDGE (could rule), or a HUMAN (your call) — and on the right the selected SELF_MODIFY decision expanded with its meaning, typical fix, and the exact commands to run." width="100%">
|
|
779
|
+
</p>
|
|
780
|
+
|
|
781
|
+
### Observability — the verdict journal, drained to where dashboards live
|
|
782
|
+
|
|
783
|
+
Those three screens read a fleet's *running* state. Underneath, every verdict the
|
|
784
|
+
kernel computes — each `verify` / `liveness` / `efficiency` / `breaker` / `reward`
|
|
785
|
+
/ hook decision — also lands in a **verdict journal**: a `run_id`-correlated
|
|
786
|
+
write-ahead log of the kernel's *own* adjudications
|
|
787
|
+
([docs/262](docs/262_the-verdict-journal-observability-as-a-first-class-surface.md)).
|
|
788
|
+
Two verbs make it useful. `dos observe` is the read-only projection — fold the
|
|
789
|
+
journal by run, syscall, or verdict, or replay one run's verdict history. `dos
|
|
790
|
+
export` is the **delivery seam**: it drains the journal outward to an
|
|
791
|
+
observability backend through the `dos.exporters` entry-point group, with three
|
|
792
|
+
shipped transports — `file` (JSONL), `statsd` (DogStatsD counters), and `otlp`
|
|
793
|
+
(OpenTelemetry log records → Datadog / Honeycomb / Grafana); the default, `null`,
|
|
794
|
+
only reports ([docs/266](docs/266_the-verdict-exporter-shipping-the-journal-to-where-dashboards-live.md)).
|
|
795
|
+
So "how often did the fleet over-claim this week, and on which lanes?" becomes a
|
|
796
|
+
dashboard panel, not a log grep — and adding a transport is a driver, never a
|
|
797
|
+
kernel edit (the same kernel/driver split as judges and notifiers).
|
|
798
|
+
|
|
799
|
+
## Hacking it
|
|
800
|
+
|
|
801
|
+
DOS is built to be extended **without forking the package** — add your own block
|
|
802
|
+
reasons, gate verdicts, admission/safety predicates, output renderers (the
|
|
803
|
+
`dos.renderers` entry-point group), and your own **judge** for the JUDGE rung
|
|
804
|
+
(`dos.judges`, scored by `dos judge-eval`), all as *workspace policy*, not package
|
|
805
|
+
edits. The block-reason vocabulary is fully data-driven: declare a reason in four
|
|
806
|
+
lines of `dos.toml` and it becomes emittable, verifiable, refusable, and `dos man
|
|
807
|
+
wedge`-documented through the same kernel calls a built-in uses. See
|
|
808
|
+
**[docs/HACKING.md](docs/HACKING.md)** for the seven extension axes and the plugin
|
|
809
|
+
model, and **[`examples/dos_ext/`](examples/dos_ext/)** for a copy-me skeleton.
|
|
810
|
+
|
|
811
|
+
## Documentation
|
|
812
|
+
|
|
813
|
+
- **[docs/QUICKSTART.md](docs/QUICKSTART.md)** — runnable 5-minute hello-world. Start here.
|
|
814
|
+
- **[docs/README.md](docs/README.md)** — the docs index (guides vs. design notes
|
|
815
|
+
vs. the dated build-journal; the numbers are chronology, not a reading order).
|
|
816
|
+
- **[docs/HACKING.md](docs/HACKING.md)** — extend DOS without forking it.
|
|
817
|
+
- **[CLAUDE.md](CLAUDE.md)** / **[CONTRIBUTING.md](CONTRIBUTING.md)** — the
|
|
818
|
+
architecture contract and how to send a change.
|
|
819
|
+
- **[docs/releases/](docs/releases/)** — per-version release notes (the changelog).
|
|
820
|
+
|
|
821
|
+
## Playbooks & examples
|
|
822
|
+
|
|
823
|
+
**[`examples/playbooks/`](examples/playbooks/)** walks the syscalls end-to-end on
|
|
824
|
+
anonymized real-world repo shapes — every command was run and its output pasted
|
|
825
|
+
back verbatim:
|
|
826
|
+
|
|
827
|
+
- **[Onboard a repo in 10 minutes](examples/playbooks/01_onboard-a-repo.md)** —
|
|
828
|
+
`pip install` → first verified ship, on any repo.
|
|
829
|
+
- Four archetypes — a [polyglot web-service fleet](examples/playbooks/02_polyglot-web-service.md)
|
|
830
|
+
(concurrent lanes), an [OSS library release](examples/playbooks/03_oss-library-release.md)
|
|
831
|
+
(the stamp grammar), a [data/ML pipeline](examples/playbooks/04_data-ml-pipeline.md)
|
|
832
|
+
(liveness), and an [infra monorepo](examples/playbooks/05_infra-monorepo.md) (refusals).
|
|
833
|
+
- [**Debug a stuck fleet** + FAQ](examples/playbooks/06_debug-a-stuck-fleet.md) —
|
|
834
|
+
symptom → the one command that diagnoses it.
|
|
835
|
+
- Two cookbooks: [from Python](examples/playbooks/cookbook-python-api.md) and
|
|
836
|
+
[CI / MCP integration](examples/playbooks/cookbook-ci-integration.md).
|
|
837
|
+
- Runnable [`examples/workspaces/`](examples/workspaces/) — `cd` in and run `dos`
|
|
838
|
+
against a realistic lane taxonomy.
|
|
839
|
+
|
|
840
|
+
## Citation
|
|
841
|
+
|
|
842
|
+
The ideas here are written up in a paper — *"Verification Is All You Need — But
|
|
843
|
+
Not Where You Think"* — on the out-of-loop referee for agent fleets. A built PDF
|
|
844
|
+
lives at [`paper/releases/`](paper/releases/); the arXiv preprint is in
|
|
845
|
+
preparation. Until the arXiv ID lands, cite the repository:
|
|
846
|
+
|
|
847
|
+
```bibtex
|
|
848
|
+
@misc{dos_kernel,
|
|
849
|
+
title = {Verification Is All You Need --- But Not Where You Think},
|
|
850
|
+
author = {Chaudhary, Anthony},
|
|
851
|
+
howpublished = {\url{https://github.com/anthony-chaudhary/dos-kernel}},
|
|
852
|
+
note = {DOS --- the Dispatch Operating System; arXiv preprint in preparation},
|
|
853
|
+
year = {2026}
|
|
854
|
+
}
|
|
855
|
+
```
|
|
856
|
+
|
|
857
|
+
## License
|
|
858
|
+
|
|
859
|
+
MIT — see [LICENSE](LICENSE).
|