dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/breaker.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""BRK — the circuit breaker: *this keeps failing; stop, and escalate the rung.*
|
|
2
|
+
|
|
3
|
+
docs/223 — idea **H2** from the Claude Code source audit (docs/189). A circuit
|
|
4
|
+
breaker is the oldest pattern in reliability engineering: count failures, and when
|
|
5
|
+
they pile up, *stop trying the same thing* — open the circuit so the caller does
|
|
6
|
+
something else instead of hammering a broken path. DOS already has this pattern
|
|
7
|
+
**six times over**, hand-coded inline in `loop_decide`: `consecutive_unclear`,
|
|
8
|
+
`consecutive_overloaded`, `consecutive_dirty_zero`, `consecutive_stale_stamp` —
|
|
9
|
+
each is the same ~15 lines (bump a counter, compare to a max, stop if reached,
|
|
10
|
+
reset on a clean outcome), differing only in *which* counter, *what* threshold, and
|
|
11
|
+
*what to do when it trips*. That repetition is the smell this module removes: the
|
|
12
|
+
control logic is **mechanism** (identical everywhere), and the counter / threshold /
|
|
13
|
+
trip-action are **policy** (different everywhere). Lift the mechanism into one pure
|
|
14
|
+
leaf; make the policy data.
|
|
15
|
+
|
|
16
|
+
This is the `malloc` move, stated plainly. `malloc` is in every C program because
|
|
17
|
+
it is mechanism (hand out bytes) with policy (what you allocate) pushed out. A
|
|
18
|
+
breaker hard-wired to "stop the dispatch loop after 3 UNCLEAR iterations" can never
|
|
19
|
+
be universal — the 3, the UNCLEAR, and the dispatch loop are someone's policy baked
|
|
20
|
+
into the mechanism. A breaker that knows only "this failure class has now happened N
|
|
21
|
+
times consecutively (or M times total); the policy says that's too many" *can* be,
|
|
22
|
+
because the caller names the class, the thresholds, and the response. The kernel
|
|
23
|
+
counts; it never knows what failed.
|
|
24
|
+
|
|
25
|
+
It is `liveness`/`productivity`'s shape — a pure verdict over already-gathered
|
|
26
|
+
state — but for a different question. Where those ask "is the run moving / still
|
|
27
|
+
productive?", BRK asks "has this *kind of thing* failed too many times to keep
|
|
28
|
+
trying?":
|
|
29
|
+
|
|
30
|
+
liveness.classify (ProgressEvidence, policy) -> LivenessVerdict
|
|
31
|
+
productivity.classify (WorkHistory, policy) -> ProductivityVerdict
|
|
32
|
+
breaker.record_failure (BreakerCounts, policy) -> BreakerTransition
|
|
33
|
+
^ THIS module
|
|
34
|
+
|
|
35
|
+
**Two counters, lifted faithfully from CC** (`denialTracking.ts`). CC tracks
|
|
36
|
+
`consecutiveDenials` (reset on a success) AND `totalDenials` (never reset), and
|
|
37
|
+
trips on *either* (`shouldFallbackToPrompting`). The two catch different
|
|
38
|
+
pathologies, and you need both:
|
|
39
|
+
|
|
40
|
+
- **consecutive** catches a *sustained* failure — N in a row with no recovery, a
|
|
41
|
+
path that is simply broken right now. Resets the moment something succeeds (the
|
|
42
|
+
incident cleared).
|
|
43
|
+
- **total** catches a *flapping* failure — fail, succeed, fail, succeed… — which
|
|
44
|
+
never trips a consecutive-only breaker but is still pathological (the path is
|
|
45
|
+
unreliable, just not consistently down). The cumulative count, which never
|
|
46
|
+
resets, is the only thing that sees it.
|
|
47
|
+
|
|
48
|
+
A consecutive-only breaker (today's `loop_decide` shape) is blind to flapping; BRK
|
|
49
|
+
fixes that by carrying both, exactly as CC does.
|
|
50
|
+
|
|
51
|
+
**The DOS addition: the trip ESCALATES a rung, it doesn't just stop** (idea H3,
|
|
52
|
+
folded in). CC's breaker falls back from the classifier to *prompting the human*.
|
|
53
|
+
DOS already has a richer ladder for "who adjudicates when the cheaper mechanism is
|
|
54
|
+
stuck" — the trust ladder ORACLE → JUDGE → HUMAN (`docs/86`, `dos.judges`). So an
|
|
55
|
+
open breaker does not just say STOP; it names *where to escalate* — keep going on
|
|
56
|
+
the same rung (NONE — the breaker is advisory and the caller may continue), kick the
|
|
57
|
+
decision up to a non-deterministic JUDGE, or surface to a HUMAN. "Don't keep
|
|
58
|
+
refusing identically — escalate the rung." The kernel computes the *trip*; the host
|
|
59
|
+
decides what an escalation MEANS (re-dispatch under a judge, queue an operator
|
|
60
|
+
decision) — the same advisory line `liveness`/`productivity` hold: BRK reports, it
|
|
61
|
+
never kills a process or refuses a lease.
|
|
62
|
+
|
|
63
|
+
**Byte-clean / no-I/O / no-policy-names.** The state is two integers the caller
|
|
64
|
+
threads through its own loop; `record_failure` / `record_success` are pure folds —
|
|
65
|
+
no clock, no file, no host vocabulary. The breaker never sees the failure's
|
|
66
|
+
*identity* (it is handed a count, not an UNCLEAR token), so it cannot smuggle in a
|
|
67
|
+
host assumption. A workspace declares the thresholds + escalation in `dos.toml
|
|
68
|
+
[breaker]` per failure class (the closed-config-as-data pattern); the defaults are
|
|
69
|
+
the CC constants (3 consecutive / 20 total).
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
from __future__ import annotations
|
|
73
|
+
|
|
74
|
+
import enum
|
|
75
|
+
from dataclasses import dataclass, replace
|
|
76
|
+
from typing import Optional
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class BreakerState(str, enum.Enum):
    """The two positions a circuit can be in: CLOSED or OPEN.

    Every member is also a ``str``, so its value can be emitted as a CLI token
    or used as a mapping key without a separate lookup table.

    There is deliberately no HALF_OPEN member: letting a probe request through
    to test for recovery is an action the host takes, not a verdict this
    module reports.
    """

    # Failures are below every enabled limit; the path may still be used.
    CLOSED = "CLOSED"
    # A limit was reached; stop retrying this path and escalate.
    OPEN = "OPEN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        """Return the bare member value, e.g. ``"OPEN"``."""
        return str(self.value)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class Escalation(str, enum.Enum):
    """The trust-ladder rung an OPEN breaker points the caller at.

    Rather than a single fixed fallback, the policy for a failure class names
    one of these rungs and the verdict echoes it back. What "escalating" to a
    rung actually involves is left to the host; this enum only names the
    destination. Members are also ``str`` instances, so they serialise as
    plain tokens.
    """

    # Advisory only: report OPEN and let the caller decide (the default).
    NONE = "NONE"
    # Hand the stuck decision to a non-deterministic adjudicator.
    JUDGE = "JUDGE"
    # Surface the situation to a human operator.
    HUMAN = "HUMAN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        """Return the bare member value, e.g. ``"JUDGE"``."""
        return str(self.value)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass(frozen=True)
class BreakerPolicy:
    """Thresholds and escalation rung for ONE failure class (policy, not mechanism).

    The breaker itself only counts; this object is the caller-supplied data
    that says how many failures are too many and where an OPEN verdict should
    point. The defaults are 3 consecutive / 20 total with no escalation.

    Attributes:
        max_consecutive: Trip once this many failures have occurred in a row
            (the run is reset by any success). ``0`` disables this rung.
        max_total: Trip once this many failures have occurred in total over
            the breaker's life (never reset). ``0`` disables this rung.
        on_trip: The ``Escalation`` rung an OPEN verdict names. A raw value
            that matches a member (e.g. the string ``"JUDGE"`` read from a
            config file) is accepted and normalised to the member.

    Raises:
        ValueError: If either threshold is negative, if both thresholds are
            ``0`` (such a breaker could never trip, which is almost certainly
            a configuration mistake), or if ``on_trip`` is not a valid
            ``Escalation`` member or value.
    """

    max_consecutive: int = 3  # N-in-a-row limit (sustained outage)
    max_total: int = 20  # cumulative cap (flapping failure)
    on_trip: Escalation = Escalation.NONE

    def __post_init__(self) -> None:
        if self.max_consecutive < 0 or self.max_total < 0:
            raise ValueError("breaker thresholds must be non-negative")
        if self.max_consecutive == 0 and self.max_total == 0:
            raise ValueError(
                "a breaker with both thresholds 0 can never trip — enable at least "
                "one rung (max_consecutive or max_total)"
            )
        # Fail fast on a bad rung. Without this check a non-Escalation value
        # (for example a plain string from a config file) is accepted here and
        # only blows up later, at trip time, when the classifier reads
        # ``on_trip.value``.
        if not isinstance(self.on_trip, Escalation):
            try:
                rung = Escalation(self.on_trip)
            except (TypeError, ValueError):
                valid = ", ".join(member.value for member in Escalation)
                raise ValueError(
                    f"on_trip must be one of {valid}; got {self.on_trip!r}"
                ) from None
            # The dataclass is frozen, so bypass its __setattr__ guard to
            # store the normalised member.
            object.__setattr__(self, "on_trip", rung)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Module-wide default policy: 3 consecutive / 20 total failures, escalation NONE.
# It is the default `policy` argument of record_failure / record_success / classify;
# sharing one instance is safe because BreakerPolicy is a frozen dataclass.
DEFAULT_POLICY = BreakerPolicy()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@dataclass(frozen=True)
class BreakerCounts:
    """The complete carried state of one breaker: two failure tallies.

    The caller threads an instance through its own loop. Instances are
    immutable; each transition function returns a fresh ``BreakerCounts``
    rather than mutating the one it was given.

    Attributes:
        consecutive: Failures seen since the most recent success.
        total: Failures seen over the breaker's whole life.

    Raises:
        ValueError: If either tally is negative.
    """

    consecutive: int = 0
    total: int = 0

    def __post_init__(self) -> None:
        # Checked in field order; any negative tally is rejected.
        if any(tally < 0 for tally in (self.consecutive, self.total)):
            raise ValueError("breaker counts must be non-negative")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@dataclass(frozen=True)
class BreakerVerdict:
    """The outcome of classifying a breaker: state, escalation rung, and why.

    Attributes:
        state: ``BreakerState.CLOSED`` or ``BreakerState.OPEN``.
        escalation: The rung an OPEN verdict names. The classifier in this
            module always emits ``Escalation.NONE`` for a CLOSED verdict.
        reason: One-line, operator-facing explanation of the verdict.
        tripped_on: Which rung fired: ``"consecutive"``, ``"total"``, or
            ``None`` when the circuit is CLOSED. Lets a consumer tell a
            sustained outage from a flapping path.
    """

    state: BreakerState
    escalation: Escalation
    reason: str
    tripped_on: Optional[str] = None  # "consecutive" | "total" | None (CLOSED)

    @property
    def is_open(self) -> bool:
        """True when the verdict's state is ``BreakerState.OPEN``."""
        return BreakerState.OPEN is self.state

    def to_dict(self) -> dict:
        """Return a plain dict with both enums flattened to their string values."""
        return dict(
            state=self.state.value,
            escalation=self.escalation.value,
            reason=self.reason,
            tripped_on=self.tripped_on,
        )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@dataclass(frozen=True)
class BreakerTransition:
    """Return value of ``record_failure`` / ``record_success``.

    It pairs the state to carry forward with the decision to act on, so a
    caller gets both from a single call and does not have to classify the new
    counts separately.

    Attributes:
        counts: The updated ``BreakerCounts`` to thread into the next call.
        verdict: The ``BreakerVerdict`` computed from those updated counts.
    """

    counts: BreakerCounts
    verdict: BreakerVerdict
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _classify(counts: BreakerCounts, policy: BreakerPolicy) -> BreakerVerdict:
    """Turn already-tallied counts into a verdict. Pure: no I/O, no mutation.

    The circuit is OPEN if EITHER enabled rung has reached its threshold. The
    consecutive rung is tested first, so when both would fire the verdict
    reports ``tripped_on="consecutive"``. A rung whose threshold is ``0`` is
    disabled and never fires.

    Args:
        counts: The tallies to judge.
        policy: Thresholds and the escalation rung to name on a trip.

    Returns:
        An OPEN verdict carrying ``policy.on_trip`` and the rung that fired,
        or a CLOSED verdict with ``Escalation.NONE`` and ``tripped_on=None``.
    """

    def _opened(rung: str, headline: str) -> BreakerVerdict:
        # Built only when a rung has fired, so the escalation suffix (and the
        # ``on_trip.value`` read) is never evaluated for a CLOSED verdict.
        if policy.on_trip is Escalation.NONE:
            tail = ""
        else:
            tail = f"; escalate to {policy.on_trip.value}"
        return BreakerVerdict(
            state=BreakerState.OPEN,
            escalation=policy.on_trip,
            tripped_on=rung,
            reason=headline + tail,
        )

    # Consecutive rung: an unbroken run of failures with no recovery.
    if 0 < policy.max_consecutive <= counts.consecutive:
        return _opened(
            "consecutive",
            f"{counts.consecutive} consecutive failures "
            f"(>= max {policy.max_consecutive}) — a sustained failure, open the "
            f"circuit",
        )

    # Total rung: cumulative failures that a consecutive count would miss.
    if 0 < policy.max_total <= counts.total:
        return _opened(
            "total",
            f"{counts.total} total failures (>= max {policy.max_total}) — a "
            f"flapping/unreliable path, open the circuit",
        )

    # Neither rung fired: the circuit stays CLOSED.
    closed_reason = (
        f"{counts.consecutive} consecutive / {counts.total} total failures — "
        f"under the limits (consecutive {policy.max_consecutive}, total "
        f"{policy.max_total}); circuit closed"
    )
    return BreakerVerdict(
        state=BreakerState.CLOSED,
        escalation=Escalation.NONE,
        tripped_on=None,
        reason=closed_reason,
    )
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def record_failure(
    counts: BreakerCounts, policy: BreakerPolicy = DEFAULT_POLICY
) -> BreakerTransition:
    """Fold one failure into the counts and classify the result. Pure: no I/O.

    Both tallies are incremented by one, then the new counts are classified
    against ``policy``. Recording further failures after the breaker has
    tripped is safe: the tallies only grow, so the verdict remains OPEN and
    callers need no "already tripped" special case.

    Args:
        counts: The tallies carried from the previous step.
        policy: Thresholds and escalation; defaults to ``DEFAULT_POLICY``.

    Returns:
        A ``BreakerTransition`` holding the incremented counts and the verdict
        computed from them.
    """
    after = replace(
        counts,
        total=counts.total + 1,
        consecutive=counts.consecutive + 1,
    )
    verdict = _classify(after, policy)
    return BreakerTransition(counts=after, verdict=verdict)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def record_success(
    counts: BreakerCounts, policy: BreakerPolicy = DEFAULT_POLICY
) -> BreakerTransition:
    """Fold one success into the counts and classify the result. Pure: no I/O.

    Only the consecutive tally is reset to zero; the total is left untouched
    so that intermittent successes cannot hide a flapping path. As a result a
    success can close a breaker that tripped on the consecutive rung, but it
    cannot close one that tripped on the total rung.

    Args:
        counts: The tallies carried from the previous step.
        policy: Thresholds and escalation; defaults to ``DEFAULT_POLICY``.

    Returns:
        A ``BreakerTransition`` holding the counts with ``consecutive=0`` and
        the verdict computed from them.
    """
    after = replace(counts, consecutive=0)
    verdict = _classify(after, policy)
    return BreakerTransition(counts=after, verdict=verdict)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def classify(
    counts: BreakerCounts, policy: BreakerPolicy = DEFAULT_POLICY
) -> BreakerVerdict:
    """Report the verdict for the counts as they stand. Pure: no I/O.

    This is the read-only "peek": it answers "given these tallies, is the
    circuit open?" without recording a failure or a success. Use
    ``record_failure`` / ``record_success`` to advance the state.

    Args:
        counts: The tallies to judge; returned to the caller unchanged.
        policy: Thresholds and escalation; defaults to ``DEFAULT_POLICY``.

    Returns:
        The ``BreakerVerdict`` for ``counts`` under ``policy``.
    """
    verdict = _classify(counts, policy)
    return verdict
|
dos/churn.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""churn — the pure "should this no-op archive coalesce into the prior commit?" fold.
|
|
2
|
+
|
|
3
|
+
THE PROBLEM (measured 2026-06-04, the operator's "dispatch is still mega
|
|
4
|
+
churning"). The dispatch family already gates the *push* surface — a repeated
|
|
5
|
+
0-pick `BLOCKED`/`DRAIN` archive classifies NOOP and never reaches `origin`
|
|
6
|
+
(`event_severity` + the per-sink `JOB_DISPATCH_*` thresholds). But every such
|
|
7
|
+
iteration still writes its OWN local commit (the archive is unconditional by
|
|
8
|
+
design — "archive always" so downstream tools see a terminal envelope). So
|
|
9
|
+
`git log` fills with a limit cycle the operator stares at:
|
|
10
|
+
|
|
11
|
+
archive … verdict=BLOCKED, /replan recommended
|
|
12
|
+
replan … quiet sweep (0 closed, 0 added)
|
|
13
|
+
archive … verdict=BLOCKED, /replan recommended <- same cause, again
|
|
14
|
+
archive … verdict=BLOCKED, /replan recommended <- and again
|
|
15
|
+
…
|
|
16
|
+
|
|
17
|
+
Measured: 199 commits → 24 picks shipped (8.3 commits/pick); the single phrase
|
|
18
|
+
`child2 skipped (/replan recommended)` recurred 22× over the window — a
|
|
19
|
+
BLOCKED → no-op-replan → BLOCKED cycle that never converges. The push gate keeps
|
|
20
|
+
peers clean; it does nothing for the LOCAL commit flood, which IS the churn.
|
|
21
|
+
|
|
22
|
+
THE FIX (this kernel). When the current archive is a NOOP (a 0-pick
|
|
23
|
+
blocker/drain) AND the immediately-prior commit on the branch is the SAME-family,
|
|
24
|
+
SAME-cause NOOP archive, the write step should **amend** the prior commit instead
|
|
25
|
+
of adding a new one — folding this run's README into it and bumping a recurrence
|
|
26
|
+
count in the subject. The 22-commit cycle collapses to ~1 commit that says
|
|
27
|
+
`blocked ×4 (recurring, coalesced)`; the full per-run audit still lives in the
|
|
28
|
+
README tree (every run dir's `README.md` is preserved in the amended commit's
|
|
29
|
+
pathspec), so nothing is lost — only the redundant `git log` rows.
|
|
30
|
+
|
|
31
|
+
⚓ Pure kernel, I/O on the edge (the dos idiom — mirrors `classify_event` /
|
|
32
|
+
`classify_recurring_wedge`): `decide_coalesce(ChurnState) -> CoalesceVerdict` is a
|
|
33
|
+
frozen dataclass in, a frozen verdict out. The caller reduces the two facts the
|
|
34
|
+
decision needs — *this* event and the *prior commit* — to scalars at the write
|
|
35
|
+
step (one `git log -1` read + the `event_severity` classification it already
|
|
36
|
+
runs), then hands them in frozen. No subprocess, no git/clock/file call here.
|
|
37
|
+
|
|
38
|
+
⚓ Reuse the severity + cause vocabulary, never re-list it. The coalesce decision
|
|
39
|
+
is layered ON TOP of `event_severity.classify_event` (only a NOOP coalesces) and
|
|
40
|
+
keys recurrence on the SAME opaque `cause_key` string the host's
|
|
41
|
+
`unstick_audit.classify_cause` / `dos.recurring_wedge` already produce. This
|
|
42
|
+
module adds the *commit-shaping* rule; it does not re-derive severity or re-match
|
|
43
|
+
cues.
|
|
44
|
+
|
|
45
|
+
WHY a separate leaf and not a branch inside `event_severity`: severity answers
|
|
46
|
+
"what operator value does this event carry?" (a push/report/terminal question).
|
|
47
|
+
Coalescing answers "given the PRIOR commit, should this one merge into it?" (a
|
|
48
|
+
git-history-shaping question that needs a second input — the prior commit — that
|
|
49
|
+
severity never sees). Different question, different input, separate leaf — the
|
|
50
|
+
`recurring_wedge`-vs-`wedge_reason` split pattern.
|
|
51
|
+
"""
|
|
52
|
+
from __future__ import annotations
|
|
53
|
+
|
|
54
|
+
from dataclasses import dataclass
|
|
55
|
+
|
|
56
|
+
from .event_severity import EventState, Severity, classify_event
|
|
57
|
+
from .tokens import normalize_token
|
|
58
|
+
|
|
59
|
+
# The minimum recurrence at which we coalesce. The FIRST no-op archive of a cause
|
|
60
|
+
# always stands alone (it may be a genuine one-off the operator should see in the
|
|
61
|
+
# log); the SECOND consecutive same-cause no-op is where the cycle starts and
|
|
62
|
+
# coalescing kicks in. Mirrors `recurring_wedge.DEFAULT_MIN_RECURRENCE` (a cause
|
|
63
|
+
# is "recurring" at 2 occurrences) so the two thresholds agree by construction.
|
|
64
|
+
# Used as the default value of `ChurnState.min_coalesce_run`.
DEFAULT_MIN_COALESCE_RUN = 2
|
|
65
|
+
|
|
66
|
+
# Families whose archives carry the per-run no-op cycle. `next-up` / `replan`
|
|
67
|
+
# bookkeeping has its own quiet-sweep shape (already NOOP-gated for push) but does
|
|
68
|
+
# NOT form the same prior-commit-amend cycle — a replan's commit legitimately
|
|
69
|
+
# follows a dispatch archive and must not absorb it. So only the two dispatch
|
|
70
|
+
# archive families coalesce.
|
|
71
|
+
# Family names are compared against this set after `.strip().lower()` normalisation.
_COALESCING_FAMILIES = frozenset({"dispatch", "dispatch-loop"})
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
class PriorCommit:
    """Scalar facts about the single commit that precedes the current archive.

    The caller parses these from the prior commit at the I/O edge and hands
    them in frozen; nothing in this module reads git. ``coalesce_count`` is
    what lets a further consecutive no-op extend an already-coalesced commit
    (×2 becomes ×3) instead of starting a new coalesced commit beside it.
    """

    # Dispatch family the prior subject led with ("" if the commit is not ours).
    family: str
    # Severity value the prior event classified to ("" if unknown).
    severity: str
    # Opaque cause the prior no-op carried ("" if none / not ours).
    cause_key: str
    # Whether the prior commit was itself an ×N coalesced archive.
    is_coalesced: bool = False
    # The ×N already folded into the prior commit (expected to be >= 1).
    coalesce_count: int = 1
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass(frozen=True)
class ChurnState:
    """All inputs to the coalesce decision, frozen by the caller.

    Attributes:
        event: The current archive's ``EventState``; its severity is derived
            from it with ``classify_event`` when the decision runs.
        cause_key: The current event's opaque cause, or ``""`` when the host
            did not classify one. An unkeyed no-op never coalesces, because
            it cannot be shown to share the prior commit's cause.
        prior: The parsed prior commit, or ``None`` when there is no prior
            commit (for example the first archive on a fresh branch).
        min_coalesce_run: The recurrence count at which coalescing starts;
            defaults to ``DEFAULT_MIN_COALESCE_RUN``.
    """

    event: EventState
    cause_key: str
    prior: PriorCommit | None
    min_coalesce_run: int = DEFAULT_MIN_COALESCE_RUN
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass(frozen=True)
class CoalesceVerdict:
    """The coalesce decision: whether to amend, and what to stamp if so.

    Attributes:
        coalesce: The field the write step branches on. ``True`` means fold
            this run into the prior commit (an amend); ``False`` means make
            an ordinary new commit.
        recurrence: The ×N count, including the current occurrence.
        subject_suffix: Ready-made tail for the amended subject, of the form
            ``×N (recurring, coalesced) [cause:<key>]``; empty when not
            coalescing.
        reason: Operator-facing explanation of the decision.
    """

    coalesce: bool
    recurrence: int
    subject_suffix: str
    reason: str
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _is_noop_dispatch_archive(ev: EventState) -> bool:
    """Return True only for a dispatch-family event that classifies as NOOP.

    That is the only kind of event eligible to coalesce. The family is
    compared case-insensitively with surrounding whitespace ignored, and a
    missing family counts as no family. Anything that ``classify_event`` rates
    other than ``Severity.NOOP`` keeps its own standalone commit.
    """
    family = (ev.family or "").strip().lower()
    # Short-circuits: events outside the coalescing families are not classified.
    return family in _COALESCING_FAMILIES and classify_event(ev) is Severity.NOOP
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _stand_alone(reason: str, recurrence: int = 1) -> CoalesceVerdict:
    """Build the "do not coalesce" verdict: a normal new commit with no suffix."""
    return CoalesceVerdict(
        coalesce=False, recurrence=recurrence, subject_suffix="", reason=reason
    )


def decide_coalesce(state: ChurnState) -> CoalesceVerdict:
    """Decide whether the current archive should fold into the prior commit.

    Pure: no I/O. Severity comes from the shared ``classify_event``, so this
    decision uses the same notion of NOOP as any other caller of it.

    The rules, applied in order (the first one that fails means "stand alone"):

      1. The current event must be a NOOP archive from one of the coalescing
         dispatch families.
      2. It must carry a non-empty ``cause_key``; without one it cannot be
         shown to share the prior commit's cause.
      3. There must be a prior commit, and it must be a NOOP archive of the
         SAME family with the SAME ``cause_key``.
      4. The recurrence (the prior's folded count + 1) must reach
         ``state.min_coalesce_run``.

    Rule 3 compares the prior family with the current event's family for
    equality. Membership of the prior family in the coalescing set alone is
    not enough: that would let a ``dispatch`` no-op be amended into a
    ``dispatch-loop`` commit (or the reverse), which contradicts the
    same-family contract. Refusing to merge on a mismatch is the fail-safe
    direction, since a separate commit is always correct, only noisier.

    NOTE(review): the recurrence is derived only from ``prior.coalesce_count``.
    If the host reports 1 for every non-coalesced prior commit, then with
    ``min_coalesce_run > 2`` the recurrence stays at 2 and coalescing never
    starts. Confirm how the host populates ``coalesce_count`` before raising
    the threshold above the default.

    Args:
        state: The frozen inputs (current event, its cause, the prior commit,
            and the recurrence threshold).

    Returns:
        A ``CoalesceVerdict``. ``coalesce`` is True only when all four rules
        hold; otherwise it is False with an empty ``subject_suffix``.
    """
    ev = state.event
    cause = (state.cause_key or "").strip()

    if not _is_noop_dispatch_archive(ev):
        return _stand_alone(
            "not a no-op dispatch archive — stands alone (operator-relevant)"
        )
    if not cause:
        return _stand_alone(
            "no-op carries no cause_key — cannot prove same-cause, stands alone"
        )

    prior = state.prior
    if prior is None:
        return _stand_alone(
            "no prior commit to coalesce into — first archive stands alone"
        )

    # Rule 1 has already established that `fam` is a coalescing family, so
    # equality with it also implies the prior family is one.
    fam = (ev.family or "").strip().lower()
    prior_fam = (prior.family or "").strip().lower()
    prior_cause = (prior.cause_key or "").strip()
    prior_sev = normalize_token(prior.severity) or ""

    same_family = prior_fam == fam
    same_cause = bool(prior_cause) and prior_cause == cause
    prior_is_noop = prior_sev == Severity.NOOP.value

    if not (same_family and prior_is_noop and same_cause):
        return _stand_alone(
            "prior commit is not a same-family same-cause no-op archive "
            f"(prior family={prior_fam or 'none'} sev={prior_sev or 'none'} "
            f"cause={prior_cause or 'none'} vs this family={fam} "
            f"cause={cause}) - stands alone"
        )

    # The run continues: this is the (prior count + 1)-th consecutive
    # same-cause no-op. A prior count below 1 is treated as 1.
    recurrence = max(1, prior.coalesce_count) + 1
    if recurrence < state.min_coalesce_run:
        return _stand_alone(
            f"recurrence {recurrence} < min_coalesce_run "
            f"{state.min_coalesce_run} — stands alone",
            recurrence=recurrence,
        )

    return CoalesceVerdict(
        coalesce=True,
        recurrence=recurrence,
        # The `[cause:<key>]` token keeps the coalesced subject
        # self-describing: once the subject collapses to an ×N headline, the
        # cause is still present in it for whoever parses this commit as the
        # prior of the next no-op.
        subject_suffix=f"×{recurrence} (recurring, coalesced) [cause:{cause}]",
        reason=(
            f"same-cause no-op '{cause}' repeats (×{recurrence}) — "
            "amend prior commit instead of adding a new one"
        ),
    )
|