dos-kernel 0.22.0__py3-none-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/drivers/watchdog.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""dos.drivers.watchdog — the push-model supervisor that polls `liveness()`.
|
|
2
|
+
|
|
3
|
+
`liveness.classify` (docs/82) mints the in-flight verdict — is THIS run ADVANCING,
|
|
4
|
+
SPINNING, or STALLED? — but it is a *pull* verdict: something has to ask it. The
|
|
5
|
+
self-stop seam (`loop_decide.StopReason.SPINNING`) lets a loop ask it about itself;
|
|
6
|
+
the stop-recorder (`lane_lease.halt`) lets a verb record a stop decision. What was
|
|
7
|
+
still missing is the actor that asks the question **on a timer, from outside the
|
|
8
|
+
watched run's own process** — and acts on the answer. This driver is that actor.
|
|
9
|
+
|
|
10
|
+
It directly answers the most expensive incident in the historical record
|
|
11
|
+
(docs/99 §2.1): eight jobs hung ~4.4 h each because the wall-clock budget fired
|
|
12
|
+
2.2 h late — the orchestrator loop stalled inside a long poll, so the timer meant
|
|
13
|
+
to kill the stuck run never got a turn. The fix is structural: a poller in its OWN
|
|
14
|
+
process, whose clock keeps ticking no matter what the watched runs do. That is why
|
|
15
|
+
the watchdog is a separate long-lived process, not a callback the dispatch loop
|
|
16
|
+
runs on itself (the thing that already failed).
|
|
17
|
+
|
|
18
|
+
## Why this is a DIFFERENT driver from `drivers/supervisor.py`
|
|
19
|
+
|
|
20
|
+
Two axes, deliberately kept apart (docs/101 §1):
|
|
21
|
+
|
|
22
|
+
* `supervisor.py` — the POPULATION axis. `supervise()` → is the roster full?
|
|
23
|
+
SPAWN free lanes / REAP STALLED *leases* / FLAG spinners. It frees a lane so a
|
|
24
|
+
replacement can take it; it does NOTHING about a spinner beyond FLAG, because a
|
|
25
|
+
spinner still holds a live lease and the supervisor has no standing to halt a
|
|
26
|
+
peer's control flow (docs/99 §3.1).
|
|
27
|
+
* `watchdog.py` (THIS) — the PER-RUN-HEALTH axis. `liveness.classify` → is THIS
|
|
28
|
+
run moving? A SPINNING / hung-past-budget run → record an `OP_HALT` and propose
|
|
29
|
+
the stop command. The operator delegated the watchdog to watch a NAMED set of
|
|
30
|
+
runs, so (unlike the supervisor over a peer) it has standing to record the stop
|
|
31
|
+
decision and propose the kill.
|
|
32
|
+
|
|
33
|
+
The §2.1 incident is a per-run-health failure, not a population one: the roster was
|
|
34
|
+
*full* (eight workers alive); a supervisor would have reported AT_TARGET. Each of
|
|
35
|
+
those runs was hung, and the timer was asleep inside their loop. The watchdog,
|
|
36
|
+
independent by construction, is immune to that.
|
|
37
|
+
|
|
38
|
+
## The actuation boundary holds (docs/99 §3, §5)
|
|
39
|
+
|
|
40
|
+
"Auto-halt-record" means the watchdog itself calls `lane_lease.halt` to RECORD the
|
|
41
|
+
`OP_HALT` and EMIT the host-supplied stop command — so the proposed stop is one
|
|
42
|
+
paste away (in the journal + the `dos decisions` queue). It does NOT mean the
|
|
43
|
+
watchdog kills anything: `lane_lease.halt` records intent and proposes a command
|
|
44
|
+
and NEVER signals a process, because *delivering* the signal requires knowing what
|
|
45
|
+
the opaque `handle` IS (a pid? a container? a remote task?), and that domain
|
|
46
|
+
knowledge is a driver's, never a domain-free kernel's. The watchdog (a driver)
|
|
47
|
+
*could* in principle carry that knowledge — but it deliberately does not: it stops
|
|
48
|
+
at the propose line, exactly where the supervisor stops (journal the decision, let
|
|
49
|
+
a human/driver enact). Enacting the kill is a separate, even-more-host-specific
|
|
50
|
+
act left to the operator's paste or a further driver that consumes `OP_HALT`. This
|
|
51
|
+
driver NEVER calls `os.kill`/`subprocess`/`TaskStop` (pinned by
|
|
52
|
+
`test_watchdog_proposes_does_not_signal`).
|
|
53
|
+
|
|
54
|
+
## Structure (testable without real I/O — the supervisor-driver idiom)
|
|
55
|
+
|
|
56
|
+
`assess_run(cfg, tracked, *, now_ms)` is NEAR-PURE: it gathers this run's evidence
|
|
57
|
+
by calling the SAME boundary helpers `cmd_liveness` uses (`cli._git_delta_count`,
|
|
58
|
+
`cli._journal_delta`, `run_id.ts_ms_of`) and returns `liveness.classify(...)` — NO
|
|
59
|
+
effects, and no re-implementation of the git/journal rungs (the LVN-1b no-drift
|
|
60
|
+
rule: the watchdog's verdict can never diverge from `dos liveness`). `tick(...)`
|
|
61
|
+
calls `assess_run` per run, applies the verdict→action map, and records an
|
|
62
|
+
`OP_HALT` (via the injectable `halt`) for each run that warrants one. `run(...)`
|
|
63
|
+
loops `tick` + sleep on a long cadence. Tests drive `assess_run`/`tick` with the
|
|
64
|
+
evidence helpers and `halt` monkeypatched, so no real git, no real journal, no
|
|
65
|
+
real `claude`, and `os.kill`/`Popen` can be made to raise to prove they are never
|
|
66
|
+
called.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
from __future__ import annotations
|
|
70
|
+
|
|
71
|
+
import subprocess # noqa: F401 — imported so a test can monkeypatch it to prove we never Popen
|
|
72
|
+
import time
|
|
73
|
+
from dataclasses import dataclass, field
|
|
74
|
+
from typing import Optional
|
|
75
|
+
|
|
76
|
+
from dos import config as _config
|
|
77
|
+
from dos import lane_lease, liveness, run_id
|
|
78
|
+
|
|
79
|
+
# Seconds between ticks. A watchdog wakes rarely — it is not a busy-poll.
DEFAULT_INTERVAL_S = 5 * 60.0

# Minimum gap between two halt proposals for the same run. A SPINNING run stays
# SPINNING across many ticks, so without this memory every tick would append
# another OP_HALT. A run that recovers to ADVANCING is dropped from `proposed`,
# so a later re-spin earns a fresh proposal. Long by default — a halt proposal
# is not something to spam.
DEFAULT_REPROPOSE_MS = 30 * 60 * 1000  # 30 min
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass(frozen=True)
class TrackedRun:
    """A single run under watch.

    Carries the inputs `liveness.classify` needs for this run, plus the opaque
    stop handle and stop command that are echoed onto a halt proposal. Only
    `run_id` is required; instances are immutable.
    """

    # The CID token; `run_started_ms` is decoded from it. REQUIRED — a run
    # whose run-id is not a valid token is skipped.
    run_id: str

    # Git SHA the run started at (the commit-rung floor). "" means the commit
    # rung is silent (0 commits) and the run is judged on the journal rung alone.
    start_sha: str = ""

    # The lease's `(loop_ts, lane)` identity. Both must be non-empty for the
    # journal rung to be attributed to this run; both are also carried onto
    # the OP_HALT for correlation.
    lane: str = ""
    loop_ts: str = ""

    # OPAQUE stop handle (a pid string / container id / task token). Recorded
    # verbatim, never interpreted; "" is acceptable.
    handle: str = ""

    # Wall-clock budget in ms. A STALLED run younger than this is not halted
    # yet; None means no budget, so any STALLED run counts as past-budget.
    budget_ms: Optional[int] = None

    # Host-supplied stop command echoed in the OP_HALT proposal. "" records
    # the proposal without a command (the operator supplies the kill by hand).
    stop_command: str = ""
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass
class WatchActions:
    """Audit record of one tick: which run-ids landed in which bucket."""

    # Run-ids an OP_HALT was recorded for during this tick.
    proposed_halts: list[str] = field(default_factory=list)
    # Run-ids classified ADVANCING.
    advancing: list[str] = field(default_factory=list)
    # Run-ids classified SPINNING.
    spinning: list[str] = field(default_factory=list)
    # Run-ids classified STALLED but still too young to halt.
    stalled_within_budget: list[str] = field(default_factory=list)
    # Run-ids that could not be classified (bad run-id).
    skipped: list[str] = field(default_factory=list)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def assess_run(cfg, tracked: TrackedRun, *, now_ms: int) -> Optional[liveness.LivenessVerdict]:
    """Return the liveness verdict for ONE tracked run, or None if it cannot be timed.

    The evidence comes from the boundary helpers `run_id.ts_ms_of`,
    `cli._git_delta_count` and `cli._journal_delta`, and the result is whatever
    `liveness.classify` returns for it. Per the module's no-drift rule, these
    are the same helpers `dos liveness` uses, so the two verdicts cannot
    diverge. This function itself records nothing.

    Args:
        cfg: the substrate config handed through to the evidence helpers.
        tracked: the run to classify.
        now_ms: the tick's "now", in epoch milliseconds.

    Returns:
        The verdict from `liveness.classify`, or None when `tracked.run_id`
        does not decode to a start time.
    """
    from dos import cli  # consumer→consumer import (a driver may import the CLI)

    run_started_ms = run_id.ts_ms_of(tracked.run_id)
    if run_started_ms is None:
        return None

    # Commit rung: without a start SHA there is no commit-delta floor, so the
    # rung stays silent (0) and the journal rung carries the signal.
    commit_count = 0
    if tracked.start_sha:
        commit_count = cli._git_delta_count(tracked.start_sha, cfg)

    # Journal rung: only attributable to this run when BOTH halves of the
    # lease identity are present; otherwise the helper is called unscoped-by-
    # lease (lease_key=None) and the commit rung + age decide.
    has_identity = bool(tracked.lane and tracked.loop_ts)
    journal = cli._journal_delta(
        cfg,
        started_ms=run_started_ms,
        now_ms=now_ms,
        lease_key=(tracked.loop_ts, tracked.lane) if has_identity else None,
    )

    evidence = liveness.ProgressEvidence(
        run_started_ms=run_started_ms,
        now_ms=now_ms,
        commits_since_start=commit_count,
        journal_events_since=journal.events_since_start,
        last_heartbeat_age_ms=journal.newest_heartbeat_age_ms,
        tokens_spent_since=None,
    )
    return liveness.classify(evidence)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _run_age_ms(tracked: TrackedRun, now_ms: int) -> Optional[int]:
    """Age of the run in ms (`now_ms` minus its decoded start), never negative.

    Returns None when the run-id does not decode to a start time.
    """
    origin_ms = run_id.ts_ms_of(tracked.run_id)
    return None if origin_ms is None else max(0, now_ms - origin_ms)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _warrants_halt(tracked: TrackedRun, verdict: liveness.Liveness, *, now_ms: int) -> bool:
    """Decide whether this run warrants an OP_HALT on THIS tick.

    The verdict→action map:

        SPINNING                          -> True  (alive but landing no delta)
        STALLED with no budget            -> True  (a hung run is still hung)
        STALLED, age >= budget_ms         -> True  (hung past budget)
        STALLED, age <  budget_ms         -> False (too young — the grace guard)
        ADVANCING / any other verdict     -> False (never auto-halt on it)
    """
    if verdict == liveness.Liveness.SPINNING:
        return True
    if verdict != liveness.Liveness.STALLED:
        # ADVANCING, or a verdict this map does not know about.
        return False

    budget = tracked.budget_ms
    if budget is None:
        return True

    elapsed = _run_age_ms(tracked, now_ms)
    # An un-ageable run fails toward halt (not expected once assess_run has
    # already accepted the run-id).
    return elapsed is None or elapsed >= budget
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def tick(
    cfg,
    tracked_runs,
    *,
    now_ms: int,
    proposed: dict,
    repropose_ms: int = DEFAULT_REPROPOSE_MS,
    halt=lane_lease.halt,
) -> "tuple[dict, WatchActions]":
    """Run one watchdog tick over `tracked_runs`.

    Each run is classified with `assess_run`; a run that `_warrants_halt` gets
    one `halt(...)` call (the OP_HALT record + proposed stop command), unless
    one was already recorded for it within the last `repropose_ms`.

    Args:
        cfg: substrate config, passed through to `assess_run` and `halt`.
        tracked_runs: iterable of `TrackedRun`.
        now_ms: the tick's "now", in epoch milliseconds.
        proposed: run-id → ms of the last recorded proposal. MUTATED in place:
            a successful proposal stores `now_ms`; a run seen ADVANCING is
            removed so a later re-spin can be proposed again.
        repropose_ms: minimum gap between two proposals for the same run.
        halt: the recorder to call; injectable so tests can assert the
            proposal without a real journal write.

    Returns:
        `(verdicts, actions)` — verdicts maps run-id to the verdict object
        returned by `assess_run`; actions is the `WatchActions` tally.

    A `halt` call that raises is non-fatal: the failure is logged, the run is
    NOT remembered in `proposed`, and the proposal is retried on the next tick.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)
    actions = WatchActions()
    verdicts: dict = {}

    for tracked in tracked_runs:
        verdict = assess_run(cfg, tracked, now_ms=now_ms)
        if verdict is None:
            actions.skipped.append(tracked.run_id)
            continue
        verdicts[tracked.run_id] = verdict
        v = verdict.verdict

        # 1. Tally the verdict + handle the ADVANCING (recovered) case.
        if v == liveness.Liveness.ADVANCING:
            actions.advancing.append(tracked.run_id)
            # Recovered — forget any earlier proposal so a later re-spin
            # earns a fresh one.
            proposed.pop(tracked.run_id, None)
            continue
        if v == liveness.Liveness.SPINNING:
            actions.spinning.append(tracked.run_id)

        # 2. The warrant decision. A STALLED run too young for its budget is
        #    tallied as within-budget; any other non-warranting verdict is
        #    simply passed over.
        if not _warrants_halt(tracked, v, now_ms=now_ms):
            if v == liveness.Liveness.STALLED:
                actions.stalled_within_budget.append(tracked.run_id)
            continue

        # 3. Idempotence: at most one proposal per run per repropose window.
        last = proposed.get(tracked.run_id)
        if last is not None and (now_ms - last) < repropose_ms:
            continue

        reason = (
            f"watchdog: {v.value} "
            f"({'no forward delta' if v == liveness.Liveness.SPINNING else 'hung past budget'})"
        )
        try:
            halt(
                cfg,
                handle=tracked.handle,
                lane=tracked.lane,
                loop_ts=tracked.loop_ts,
                owner="watchdog",
                reason=reason,
                run_id=tracked.run_id,
                command=tracked.stop_command or None,
            )
        except Exception:  # noqa: BLE001 — a failed record is non-fatal; retry next tick
            # Leave `proposed` untouched so the next tick retries, but make
            # the failure visible instead of swallowing it silently.
            log.warning(
                "watchdog: failed to record OP_HALT for run %s; will retry next tick",
                tracked.run_id,
                exc_info=True,
            )
        else:
            proposed[tracked.run_id] = now_ms
            actions.proposed_halts.append(tracked.run_id)

    return verdicts, actions
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def discover_tracked_runs(cfg, *, budget_ms: Optional[int] = None) -> "list[TrackedRun]":
    """Build the tracked-run list from the live leases (the --discover mode).

    Reads `lane_lease.live_leases(cfg)` and maps each lease to a `TrackedRun`
    whose `lane`, `loop_ts` and `handle` (the lease `pid`) come from the lease.
    A discovered run never carries a start SHA or a stop command, so it is
    judged without the commit rung.

    The run-id is the lease's `run_id` when that decodes via `run_id.ts_ms_of`;
    otherwise the lease's `loop_ts` when THAT decodes; otherwise the lease is
    skipped because it cannot be timed.

    Args:
        cfg: substrate config passed to `lane_lease.live_leases`.
        budget_ms: wall-clock budget applied to every discovered run.

    Returns:
        The discovered runs; an empty list when reading the leases raises.
    """
    try:
        leases = lane_lease.live_leases(cfg)
    except Exception:  # noqa: BLE001 — a bad journal yields no discovered runs
        return []

    discovered: list[TrackedRun] = []
    for lease in leases:
        lease_loop_ts = str(lease.get("loop_ts") or "")
        # Prefer an explicit run_id on the lease; fall back to loop_ts only
        # when it decodes as a timeable token.
        candidate = str(lease.get("run_id") or "")
        if run_id.ts_ms_of(candidate) is None:
            candidate = "" if run_id.ts_ms_of(lease_loop_ts) is None else lease_loop_ts
        if not candidate:
            continue

        discovered.append(
            TrackedRun(
                run_id=candidate,
                start_sha="",  # the honest floor: a lease records no start SHA
                lane=str(lease.get("lane") or ""),
                loop_ts=lease_loop_ts,
                handle=str(lease.get("pid") or ""),
                budget_ms=budget_ms,
                stop_command="",
            )
        )
    return discovered
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def run(
    config=None,
    *,
    tracked_runs,
    interval: float = DEFAULT_INTERVAL_S,
    max_ticks: Optional[int] = None,
    repropose_ms: int = DEFAULT_REPROPOSE_MS,
    clock_ms=None,
    sleep=time.sleep,
    halt=lane_lease.halt,
) -> int:
    """Loop `tick` + sleep until `max_ticks` is reached or the operator interrupts.

    Args:
        config: substrate config; when None, `_config.active()` is used.
        tracked_runs: iterable of `TrackedRun`. It is materialized once with
            `list(...)`, so the watched set is fixed for the life of this call
            (re-launch `run` to change it).
        interval: seconds slept between ticks.
        max_ticks: stop after this many ticks; None loops until interrupted.
        repropose_ms: forwarded to `tick`.
        clock_ms: zero-arg callable returning "now" in epoch ms; defaults to
            the wall clock (`time.time()` in ms).
        sleep: callable taking seconds; defaults to `time.sleep`.
        halt: forwarded to `tick`.

    Returns:
        0, both on reaching `max_ticks` and on `KeyboardInterrupt`.

    The proposal memory handed to `tick` lives only inside this call. There is
    no sleep after the final tick.
    """
    cfg = _config.active() if config is None else config
    watched = list(tracked_runs)
    proposal_memory: dict = {}

    if clock_ms is not None:
        read_clock = clock_ms
    else:
        def read_clock():
            return int(time.time() * 1000)

    bounded = max_ticks is not None
    completed = 0
    try:
        while not bounded or completed < max_ticks:
            tick_now_ms = read_clock()
            tick(
                cfg,
                watched,
                now_ms=tick_now_ms,
                proposed=proposal_memory,
                repropose_ms=repropose_ms,
                halt=halt,
            )
            completed += 1
            if bounded and completed >= max_ticks:
                break  # last tick done — skip the trailing sleep
            sleep(interval)
    except KeyboardInterrupt:
        pass
    return 0
|
dos/drivers/workshop.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""dos.drivers.workshop — a generic, self-contained reference host policy pack.
|
|
2
|
+
|
|
3
|
+
This is the **copy-me template** for adding a new host to DOS. It is a driver
|
|
4
|
+
(layer 4): the *policy* a particular host workload supplies on top of the kernel
|
|
5
|
+
*mechanism*. Where `dos.drivers.job` (the kernel's first userland app) delegates
|
|
6
|
+
its taxonomy back to `dos.config` for backward-compatibility, `workshop` declares
|
|
7
|
+
everything it needs **inline, in this one file** — so a new host can read a single
|
|
8
|
+
module and see the whole shape of "what a driver is."
|
|
9
|
+
|
|
10
|
+
The "workshop" frame: a shop where two benches build distinct parts of one product
|
|
11
|
+
*concurrently*, and a single release bench *exclusively* ships it. It names no
|
|
12
|
+
company, no challenge, no real product — it is a deliberately generic stand-in
|
|
13
|
+
whose lanes are evocative enough to host real-looking trees.
|
|
14
|
+
|
|
15
|
+
A driver is two things, the same two `job` has:
|
|
16
|
+
|
|
17
|
+
* a `LaneTaxonomy` constant (`WORKSHOP_LANE_TAXONOMY`) — the concurrency policy
|
|
18
|
+
as pure data, and
|
|
19
|
+
* a `<name>_config(workspace)` factory (`workshop_config`) — binds that taxonomy
|
|
20
|
+
to a workspace root and returns a `SubstrateConfig`.
|
|
21
|
+
|
|
22
|
+
The factory name matches the module stem (`workshop` → `workshop_config`), which
|
|
23
|
+
is the **by-convention contract** the generic `dos --driver <name>` CLI loader
|
|
24
|
+
resolves (`dos.drivers.<name>.<name>_config`), exactly as `job` → `job_config`.
|
|
25
|
+
Adding a host = a module like this one; the kernel/CLI never learns its name.
|
|
26
|
+
|
|
27
|
+
## The lane taxonomy — why these lanes, and the four things it teaches
|
|
28
|
+
|
|
29
|
+
Two **concurrent** cluster lanes, `frontend` and `backend`, plus an **exclusive**
|
|
30
|
+
`release` lane and the catch-all exclusive `global` (the same escape hatch the
|
|
31
|
+
generic `default_config` and `job_config` carry — keeping the taxonomy a clean
|
|
32
|
+
superset of the default).
|
|
33
|
+
|
|
34
|
+
1. **Concurrent + tree-disjoint.** `frontend` (`app/`, `web/`, `ui/`) and
|
|
35
|
+
`backend` (`service/`, `api/`, `worker/`) touch provably disjoint file trees,
|
|
36
|
+
so the arbiter (`dos.arbiter` + `dos.lane_overlap`) admits a `backend` request
|
|
37
|
+
*alongside* a live `frontend` lease — two build agents run at once. No prefix of
|
|
38
|
+
one tree is a prefix of the other, which is the whole disjointness rule.
|
|
39
|
+
|
|
40
|
+
2. **The docs-prefix distinction trick.** Both clusters also own a doc tree under
|
|
41
|
+
the SAME `docs/` directory, kept disjoint by FILENAME PREFIX: `frontend` owns
|
|
42
|
+
`docs/UI-*`, `backend` owns `docs/SVC-*`. `dos._tree.norm_tree_prefix` truncates
|
|
43
|
+
a glob at its first `*` but keeps the literal before it — so `docs/UI-*` →
|
|
44
|
+
`docs/UI-` and `docs/SVC-*` → `docs/SVC-`, which do NOT collide (neither
|
|
45
|
+
`startswith` the other). A bare `docs/` would normalize to `docs/` and collide,
|
|
46
|
+
defeating concurrency — so this is the load-bearing teaching point: two lanes can
|
|
47
|
+
share a parent directory and still run concurrently if their globs discriminate.
|
|
48
|
+
|
|
49
|
+
3. **Exclusive `release`.** While `release` is held, every other request refuses;
|
|
50
|
+
a deploy / version-cut never races a build. NOTE the honesty of its tree:
|
|
51
|
+
`**/VERSION` normalizes to the *universal* (empty) prefix, so `release`'s blast
|
|
52
|
+
radius really is the whole repo — which is exactly WHY it must run alone. An
|
|
53
|
+
exclusive lane is admitted/refused on liveness (is another lease live?), never on
|
|
54
|
+
tree-disjointness, so this whole-repo glob is correct, not a bug. (One consequence
|
|
55
|
+
worth knowing: because `**/VERSION` collides with the kernel's own source files,
|
|
56
|
+
a `release` request arbitrated through the workspace-blind PURE path would trip
|
|
57
|
+
the SELF_MODIFY guard; the CLI's `dos arbitrate` scopes the guard to files that
|
|
58
|
+
actually exist under the served workspace, so in a foreign repo `release` admits.)
|
|
59
|
+
|
|
60
|
+
4. **`--lane` keyword aliases.** A request can say `--lane ui` / `--lane api` /
|
|
61
|
+
`--lane ship` and reach the canonical lane; `aliases` routes keyword → named lane.
|
|
62
|
+
|
|
63
|
+
The lane trees are the discriminating *path prefixes* the kernel normalizes a glob
|
|
64
|
+
to (`dos._tree.norm_tree_prefix`), so `docs/UI-` and `docs/SVC-` stay distinct even
|
|
65
|
+
though both live under `docs/`.
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
from __future__ import annotations
|
|
69
|
+
|
|
70
|
+
from pathlib import Path
|
|
71
|
+
|
|
72
|
+
from dos.config import (
|
|
73
|
+
LaneTaxonomy,
|
|
74
|
+
PathLayout,
|
|
75
|
+
SubstrateConfig,
|
|
76
|
+
gather_workspace_facts,
|
|
77
|
+
resolve_workspace_root,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# The workshop's concurrency policy, as data. `frontend` ∩ `backend` is provably
|
|
81
|
+
# tree-disjoint (`app/` vs `service/`; `docs/UI-` vs `docs/SVC-`), so the two build
|
|
82
|
+
# agents run concurrently; `release`/`global` are exclusive so a deploy/version-cut
|
|
83
|
+
# runs alone.
|
|
84
|
+
WORKSHOP_LANE_TAXONOMY = LaneTaxonomy(
    concurrent=("frontend", "backend"),
    exclusive=("release", "global"),
    autopick=("frontend", "backend"),
    trees={
        # UI half: its source trees plus its docs (docs/UI-*).
        "frontend": ("app/**/*", "web/**/*", "ui/**/*", "docs/UI-*"),
        # Service half: API + workers plus its docs (docs/SVC-*).
        "backend": ("service/**/*", "api/**/*", "worker/**/*", "docs/SVC-*"),
        # Exclusive deploy / version-cut ceremony. `**/VERSION` is a
        # whole-repo glob, which is why this lane is exclusive.
        "release": (
            "deploy/**/*",
            ".github/workflows/**/*",
            "docs/REL-*",
            "**/VERSION",
        ),
        # Catch-all exclusive lane (the kernel default's escape hatch).
        "global": ("**/*",),
    },
    # Keyword routing: `--lane ui` / `--lane api` / `--lane ship` reach the
    # canonical lane. Built from (lane, keywords) groups; the resulting dict
    # has one `keyword -> lane` entry per keyword, in the order listed.
    aliases={
        keyword: lane
        for lane, keywords in (
            ("frontend", ("ui", "web", "frontend")),
            ("backend", ("svc", "api", "service", "backend")),
            ("release", ("ship", "deploy", "release")),
        )
        for keyword in keywords
    },
)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def workshop_config(workspace: Path | str | None = None) -> SubstrateConfig:
    """Build the workshop reference config for ``workspace``.

    The workspace root is resolved with `resolve_workspace_root`, then bound to
    this driver's lane taxonomy, the default `PathLayout` for that root, and
    the facts gathered by `gather_workspace_facts`.

    The facts are gathered here on purpose: per this module's design notes, a
    config without them forces the SELF_MODIFY guard onto its conservative
    static set, which would refuse the exclusive `release` lane's whole-repo
    `**/VERSION` glob even in a foreign repo.

    Args:
        workspace: explicit workspace root, or None to let
            `resolve_workspace_root` apply its own precedence.

    Returns:
        A `SubstrateConfig` carrying `WORKSHOP_LANE_TAXONOMY`.
    """
    workspace_root = resolve_workspace_root(workspace)
    layout = PathLayout.for_root(workspace_root)
    facts = gather_workspace_facts(workspace_root)
    return SubstrateConfig(
        lanes=WORKSHOP_LANE_TAXONOMY,
        paths=layout,
        workspace=facts,
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
__all__ = ["WORKSHOP_LANE_TAXONOMY", "workshop_config"]
|