dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""The precursor-gate evaluation harness — score a `PrecursorGrammar` by its RECALL vs WASTE.
|
|
2
|
+
|
|
3
|
+
docs/147 §5/§9.2 — the per-axis eval, the `tool_stream_eval` / `intervention_eval` /
|
|
4
|
+
`overlap_eval` / `judge_eval` discipline re-aimed at the precursor-presence axis. Every DOS
|
|
5
|
+
axis ships an eval that turns its config from a hunch into a measured, per-deployment decision
|
|
6
|
+
(the research-friendliness thesis, docs/90 §2). The precursor gate's config — the
|
|
7
|
+
hand-authored `requires` grammar + the `aliases` allow-list — needs exactly that instrument: a
|
|
8
|
+
backtest that answers **"on this deployment's real call streams, does the gate catch the
|
|
9
|
+
prerequisite-skips that matter without false-REFUTING a precursor that fired under an unlisted
|
|
10
|
+
alias?"**
|
|
11
|
+
|
|
12
|
+
The decisive numbers (the dual of `tool_stream_eval`'s recovered/false-resurface pair):
|
|
13
|
+
|
|
14
|
+
* **missed_precursor_recall** — of the calls that ACTUALLY skipped a required precursor (a
|
|
15
|
+
real Missing-Prerequisite-Lookup), the fraction the gate fired REFUTED on. Recall-of-action:
|
|
16
|
+
a grammar that covers too few mutating tools scores low here — it never fires, never catches
|
|
17
|
+
(the grammar-coverage bound docs/147 §1 names). This is the number that tells a host how
|
|
18
|
+
much of *its* mutating surface the declared grammar reaches.
|
|
19
|
+
* **false_refute_rate** — of the calls whose precursor ACTUALLY fired (the lookup was done),
|
|
20
|
+
the fraction the gate WRONGLY fired REFUTED on (because it fired under a name the grammar did
|
|
21
|
+
not list as the precursor or an alias). The dangerous cell — the §3 residual made
|
|
22
|
+
measurable. A false REFUTED is *harmless by design* (the intervention is a WARN that
|
|
23
|
+
preserves the turn — re-surfacing a requirement the agent already met is a no-op nudge), but
|
|
24
|
+
a high rate means the `aliases` allow-list is incomplete and the host should grow it (the
|
|
25
|
+
calibration the R3 rung performs, docs/147 §6).
|
|
26
|
+
|
|
27
|
+
The honesty stance (the same as the sibling evals)
|
|
28
|
+
==================================================
|
|
29
|
+
|
|
30
|
+
The labels are the RESEARCHER's ground truth, derived from EXECUTED replay, never from the gate:
|
|
31
|
+
|
|
32
|
+
* `precursor_required` — did this mutating call ACTUALLY require a precursor per the policy
|
|
33
|
+
PROSE (read by a human / the scorer, not the grammar)? The `overlap_eval.collided` "did it
|
|
34
|
+
actually collide" discipline — the ground truth the grammar is graded AGAINST, never derived
|
|
35
|
+
from the grammar under test.
|
|
36
|
+
* `precursor_actually_fired` — did the agent ACTUALLY call a satisfying precursor (under ANY
|
|
37
|
+
name, listed or not) before this call? The false_refute denominator's truth. A call that
|
|
38
|
+
required a precursor AND fired one is a *correctly-sequenced* call; a REFUTED on it is a
|
|
39
|
+
false fire (the lookup happened under an alias the grammar missed).
|
|
40
|
+
* `mattered_to_score` — did this prerequisite feed a verifier the run was scored on? Carried so
|
|
41
|
+
a host can weight recall by what actually moves the score (the `intervention_eval`
|
|
42
|
+
mattered-axis), never scored directly here.
|
|
43
|
+
|
|
44
|
+
Everything is **pure**: it consumes already-built `PrecursorCase`s, runs each through the SAME
|
|
45
|
+
`precursor_gate.classify_call` the consumer takes (so the grid reflects what would actually fire
|
|
46
|
+
— the "score under the floor" discipline), and counts in one pass. No I/O, no host names — it
|
|
47
|
+
sits in the kernel layer beside `precursor_gate`.
|
|
48
|
+
|
|
49
|
+
⚠ This is NOT `arg_provenance`'s detector eval and NOT `intervention_eval`. It measures the
|
|
50
|
+
GRAMMAR specifically — does this declared precursor map catch the real skips without
|
|
51
|
+
false-REFUTING on an unlisted alias — an axis orthogonal to the mint detector and the actuation
|
|
52
|
+
ladder.
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
from __future__ import annotations
|
|
56
|
+
|
|
57
|
+
from dataclasses import dataclass
|
|
58
|
+
from typing import Iterable
|
|
59
|
+
|
|
60
|
+
from dos.evidence import EvidenceStance
|
|
61
|
+
from dos.precursor_gate import (
|
|
62
|
+
CallStream,
|
|
63
|
+
MutatingCall,
|
|
64
|
+
PrecursorGrammar,
|
|
65
|
+
PrecursorPolicy,
|
|
66
|
+
classify_call,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# A labelled example — one replayed (mutating call, stream) + ground-truth labels.
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
@dataclass(frozen=True)
class PrecursorCase:
    """A single labelled replay example: one mutating call, its prior stream, and the truth.

    No gate stance is kept on the case. `score()` computes the stance by running the
    classifier over `call` / `stream` with the grammar being graded, so a stored stance can
    never disagree with what the gate would actually assign. The boolean fields are
    researcher-supplied ground truth and are never derived from the grammar under test.

    Fields:
      call                     — the `MutatingCall` being judged.
      stream                   — the `CallStream` of calls that preceded it.
      precursor_required       — ground truth: the call really did need a mandated precursor.
                                 Cases where this is false belong to neither the real-skip
                                 nor the correctly-sequenced population.
      precursor_actually_fired — ground truth: a satisfying precursor ran before the call,
                                 under any name. Combined with `precursor_required` it
                                 separates a correctly-sequenced call (fired) from a real
                                 skip (not fired).
      mattered_to_score        — ground truth: the prerequisite fed a scored verifier. Only
                                 consulted for REFUTED real skips, to weight the catches.
      label                    — free-form human handle; carried along, never counted.
    """

    # The replayed inputs handed to the classifier.
    call: MutatingCall
    stream: CallStream
    # Ground-truth labels (required, no defaults).
    precursor_required: bool
    precursor_actually_fired: bool
    # Optional labels.
    mattered_to_score: bool = False
    label: str = ""
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# The report — frozen, @property rates with div-guard, to_dict (mirror tool_stream_eval).
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
@dataclass(frozen=True)
class PrecursorEvalReport:
    """Tallies from scoring one `PrecursorGrammar` over a set of labelled cases.

    Two groups of counters are kept apart: the ground-truth grid, which depends only on the
    case labels, and the firing ledger, which records where the grammar assigned REFUTED.
    `n_refuted_fired` is the cell to watch — REFUTED landing on a call whose precursor did
    in fact run. Every derived rate returns 0.0 when its denominator is zero.
    """

    # Total number of cases scored.
    n: int
    # --- ground-truth grid (does not depend on the grammar) ---
    # Required a precursor and none fired: the population recall is measured over.
    n_real_skip: int
    # Required a precursor and one fired: the denominator for the false-refute rate.
    n_correctly_sequenced: int
    # --- firing ledger (what the grammar flagged) ---
    # Cases that received REFUTED.
    n_refuted: int
    # REFUTED on a real skip (a useful catch).
    n_refuted_skip: int
    # REFUTED on a correctly-sequenced call (the dangerous cell).
    n_refuted_fired: int
    # Useful catches whose case was also labelled `mattered_to_score`.
    n_refuted_skip_mattered: int

    # --- derived rates; each guards its own zero denominator ---

    @property
    def missed_precursor_recall(self) -> float:
        """Share of real prerequisite-skips that received REFUTED — the headline number.

        Returns 0.0 when there are no real skips.
        """
        if not self.n_real_skip:
            return 0.0
        return self.n_refuted_skip / self.n_real_skip

    @property
    def false_refute_rate(self) -> float:
        """Share of correctly-sequenced calls that nevertheless received REFUTED.

        Returns 0.0 when there are no correctly-sequenced calls.
        """
        if not self.n_correctly_sequenced:
            return 0.0
        return self.n_refuted_fired / self.n_correctly_sequenced

    @property
    def fire_precision(self) -> float:
        """Share of all REFUTED verdicts that landed on a real skip.

        Returns 0.0 when nothing was REFUTED.
        """
        if not self.n_refuted:
            return 0.0
        return self.n_refuted_skip / self.n_refuted

    @property
    def mattered_recall(self) -> float:
        """Share of real skips that were both caught and labelled as mattering to the score.

        Returns 0.0 when there are no real skips.
        """
        if not self.n_real_skip:
            return 0.0
        return self.n_refuted_skip_mattered / self.n_real_skip

    @property
    def net_positive(self) -> bool:
        """Whether useful catches strictly outnumber false REFUTEDs.

        A tie (including the all-zero report) is not net-positive.
        """
        return self.n_refuted_fired < self.n_refuted_skip

    def to_dict(self) -> dict:
        """Return the report as a nested plain dict; rates are rounded to 4 decimals."""
        grid = {
            "real_skip": self.n_real_skip,
            "correctly_sequenced": self.n_correctly_sequenced,
        }
        firing = {
            "refuted": self.n_refuted,
            "refuted_skip": self.n_refuted_skip,
            "refuted_fired": self.n_refuted_fired,
            "refuted_skip_mattered": self.n_refuted_skip_mattered,
        }
        rates = {
            "missed_precursor_recall": round(self.missed_precursor_recall, 4),
            "false_refute_rate": round(self.false_refute_rate, 4),
            "fire_precision": round(self.fire_precision, 4),
            "mattered_recall": round(self.mattered_recall, 4),
        }
        return {
            "n": self.n,
            "grid": grid,
            "firing": firing,
            "rates": rates,
            "net_positive": self.net_positive,
        }
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def score(
    grammar: PrecursorGrammar,
    cases: Iterable[PrecursorCase],
    policy: PrecursorPolicy = PrecursorPolicy(),
    *,
    _classify=classify_call,
) -> PrecursorEvalReport:
    """Classify every labelled case under `grammar` and tally the results into a report.

    Each case is passed to `_classify(case.call, case.stream, grammar, policy)`; a case
    counts as "fired" only when the returned verdict's `stance` is
    `EvidenceStance.REFUTED`. `cases` is consumed exactly once, so a one-shot iterable
    (e.g. a generator) is acceptable.

    Args:
        grammar: the precursor grammar being graded.
        cases: labelled replay examples.
        policy: forwarded unchanged to the classifier.
        _classify: the classifier callable; defaults to `classify_call`.

    Returns:
        A `PrecursorEvalReport` holding the ground-truth grid and the firing ledger.

    Because the per-population firing counters are only bumped inside the REFUTED branch,
    neither `n_refuted_skip` nor `n_refuted_fired` can exceed `n_refuted`, and
    `n_refuted_skip_mattered` can never exceed `n_refuted_skip`.
    """
    total = 0
    skips = sequenced = 0
    fires = fires_on_skip = fires_on_sequenced = fires_on_skip_mattered = 0

    # enumerate keeps `total` equal to the number of cases seen; it stays 0 for empty input.
    for total, case in enumerate(cases, start=1):
        stance = _classify(case.call, case.stream, grammar, policy).stance
        refuted = stance is EvidenceStance.REFUTED

        if refuted:
            fires += 1

        # Cases that never required a precursor sit outside both ground-truth populations;
        # they can still add to `fires` above.
        if not case.precursor_required:
            continue

        if case.precursor_actually_fired:
            # Correctly sequenced: a REFUTED here is a false fire.
            sequenced += 1
            if refuted:
                fires_on_sequenced += 1
        else:
            # Real skip: a REFUTED here is a useful catch.
            skips += 1
            if refuted:
                fires_on_skip += 1
                if case.mattered_to_score:
                    fires_on_skip_mattered += 1

    return PrecursorEvalReport(
        n=total,
        n_real_skip=skips,
        n_correctly_sequenced=sequenced,
        n_refuted=fires,
        n_refuted_skip=fires_on_skip,
        n_refuted_fired=fires_on_sequenced,
        n_refuted_skip_mattered=fires_on_skip_mattered,
    )
|