dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
@@ -0,0 +1,239 @@
1
+ """The precursor-gate evaluation harness — score a `PrecursorGrammar` by its RECALL vs WASTE.
2
+
3
+ docs/147 §5/§9.2 — the per-axis eval, the `tool_stream_eval` / `intervention_eval` /
4
+ `overlap_eval` / `judge_eval` discipline re-aimed at the precursor-presence axis. Every DOS
5
+ axis ships an eval that turns its config from a hunch into a measured, per-deployment decision
6
+ (the research-friendliness thesis, docs/90 §2). The precursor gate's config — the
7
+ hand-authored `requires` grammar + the `aliases` allow-list — needs exactly that instrument: a
8
+ backtest that answers **"on this deployment's real call streams, does the gate catch the
9
+ prerequisite-skips that matter without false-REFUTING a precursor that fired under an unlisted
10
+ alias?"**
11
+
12
+ The decisive numbers (the dual of `tool_stream_eval`'s recovered/false-resurface pair):
13
+
14
+ * **missed_precursor_recall** — of the calls that ACTUALLY skipped a required precursor (a
15
+ real Missing-Prerequisite-Lookup), the fraction the gate fired REFUTED on. Recall-of-action:
16
+ a grammar that covers too few mutating tools scores low here — it never fires, never catches
17
+ (the grammar-coverage bound docs/147 §1 names). This is the number that tells a host how
18
+ much of *its* mutating surface the declared grammar reaches.
19
+ * **false_refute_rate** — of the calls that required a precursor AND whose precursor ACTUALLY fired (the lookup was done),
20
+ the fraction the gate WRONGLY fired REFUTED on (because it fired under a name the grammar did
21
+ not list as the precursor or an alias). The dangerous cell — the §3 residual made
22
+ measurable. A false REFUTED is *harmless by design* (the intervention is a WARN that
23
+ preserves the turn — re-surfacing a requirement the agent already met is a no-op nudge), but
24
+ a high rate means the `aliases` allow-list is incomplete and the host should grow it (the
25
+ calibration the R3 rung performs, docs/147 §6).
26
+
27
+ The honesty stance (the same as the sibling evals)
28
+ ==================================================
29
+
30
+ The labels are the RESEARCHER's ground truth, derived from EXECUTED replay, never from the gate:
31
+
32
+ * `precursor_required` — did this mutating call ACTUALLY require a precursor per the policy
33
+ PROSE (read by a human / the scorer, not the grammar)? The `overlap_eval.collided` "did it
34
+ actually collide" discipline — the ground truth the grammar is graded AGAINST, never derived
35
+ from the grammar under test.
36
+ * `precursor_actually_fired` — did the agent ACTUALLY call a satisfying precursor (under ANY
37
+ name, listed or not) before this call? The false_refute denominator's truth. A call that
38
+ required a precursor AND fired one is a *correctly-sequenced* call; a REFUTED on it is a
39
+ false fire (the lookup happened under an alias the grammar missed).
40
+ * `mattered_to_score` — did this prerequisite feed a verifier the run was scored on? Carried so
41
+ a host can weight recall by what actually moves the score (the `intervention_eval`
42
+ mattered-axis), never scored directly here.
43
+
44
+ Everything is **pure**: it consumes already-built `PrecursorCase`s, runs each through the SAME
45
+ `precursor_gate.classify_call` the consumer takes (so the grid reflects what would actually fire
46
+ — the "score under the floor" discipline), and counts in one pass. No I/O, no host names — it
47
+ sits in the kernel layer beside `precursor_gate`.
48
+
49
+ ⚠ This is NOT `arg_provenance`'s detector eval and NOT `intervention_eval`. It measures the
50
+ GRAMMAR specifically — does this declared precursor map catch the real skips without
51
+ false-REFUTING on an unlisted alias — an axis orthogonal to the mint detector and the actuation
52
+ ladder.
53
+ """
54
+
55
+ from __future__ import annotations
56
+
57
+ from dataclasses import dataclass
58
+ from typing import Iterable
59
+
60
+ from dos.evidence import EvidenceStance
61
+ from dos.precursor_gate import (
62
+ CallStream,
63
+ MutatingCall,
64
+ PrecursorGrammar,
65
+ PrecursorPolicy,
66
+ classify_call,
67
+ )
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # A labelled example — one replayed (mutating call, stream) + ground-truth labels.
72
+ # ---------------------------------------------------------------------------
73
@dataclass(frozen=True)
class PrecursorCase:
    """A single labelled replay example: a mutating call, its prior stream, and the truth labels.

    No gate stance is stored on the case. `score()` obtains the stance by running the
    classifier over `call` / `stream` with the grammar under test, so a hand-written stance can
    never disagree with what the gate would actually assign. The boolean fields are
    researcher-supplied ground truth taken from a replay; they are inputs to the scorer and are
    never computed from the grammar being graded.

    Attributes:
        call: The mutating `MutatingCall` being judged.
        stream: The `CallStream` of calls that preceded it.
        precursor_required: Ground truth read from the policy prose (not from the grammar):
            did this call require a mandated precursor?
        precursor_actually_fired: Ground truth: did a satisfying precursor fire before this
            call under any name, whether or not the grammar lists that name?
        mattered_to_score: Ground truth: did the prerequisite feed a scored verifier? Used
            only to weight catches, never scored on its own.
        label: Optional human-readable handle; carried along, never scored.
    """

    # --- the replayed input handed to the classifier ---
    call: MutatingCall
    stream: CallStream

    # --- ground-truth labels (required, positional) ---
    precursor_required: bool
    precursor_actually_fired: bool

    # --- optional annotations ---
    mattered_to_score: bool = False
    label: str = ""
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # The report — frozen, @property rates with div-guard, to_dict (mirror tool_stream_eval).
107
+ # ---------------------------------------------------------------------------
108
@dataclass(frozen=True)
class PrecursorEvalReport:
    """The tallies produced by scoring one `PrecursorGrammar` over a set of labelled cases.

    Two groups of counters are kept apart: the ground-truth grid, which depends only on the
    case labels, and the firing ledger, which records where the grammar assigned REFUTED. The
    cell to watch is `n_refuted_fired`: a REFUTED on a call whose precursor did fire (the
    lookup happened under a name the grammar does not list).

    Every derived rate returns 0.0 when its denominator is zero.
    """

    # Total number of cases scored.
    n: int

    # Ground-truth grid (independent of the grammar).
    n_real_skip: int  # precursor required and NOT fired: the recoverable population
    n_correctly_sequenced: int  # precursor required and fired: the false-fire denominator

    # Firing ledger (what the grammar did).
    n_refuted: int  # cases assigned REFUTED
    n_refuted_skip: int  # REFUTED on a real skip: a useful catch
    n_refuted_fired: int  # REFUTED although the precursor fired: the dangerous cell
    n_refuted_skip_mattered: int  # useful catches that also fed a scored verifier

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Return `numerator / denominator`, or 0.0 when the denominator is zero."""
        if not denominator:
            return 0.0
        return numerator / denominator

    # --- derived rates ---

    @property
    def missed_precursor_recall(self) -> float:
        """Headline number: the share of real prerequisite-skips the gate fired REFUTED on.

        A grammar that covers too few mutating tools never fires and so scores near zero;
        extending `requires` is what raises it (docs/147 §1).
        """
        return self._ratio(self.n_refuted_skip, self.n_real_skip)

    @property
    def false_refute_rate(self) -> float:
        """Dangerous-cell rate: the share of correctly-sequenced calls wrongly given REFUTED.

        The analogue of `tool_stream_eval.false_resurface_rate` and
        `intervention_eval.wasted_disruption_rate`. A false REFUTED is only a WARN, but a high
        rate signals an incomplete `aliases` allow-list that should be grown (docs/147 §6).
        """
        return self._ratio(self.n_refuted_fired, self.n_correctly_sequenced)

    @property
    def fire_precision(self) -> float:
        """The share of all REFUTED fires that landed on a real skip."""
        return self._ratio(self.n_refuted_skip, self.n_refuted)

    @property
    def mattered_recall(self) -> float:
        """The share of real skips that were caught AND fed a scored verifier."""
        return self._ratio(self.n_refuted_skip_mattered, self.n_real_skip)

    @property
    def net_positive(self) -> bool:
        """Whether useful catches strictly outnumber false REFUTEDs.

        The friendly-direction counterpart of `net_harmful`, suitable as the boolean behind a
        `dos precursor-gate-eval` exit code. Ties, including the all-zero report, are False.
        """
        return self.n_refuted_fired < self.n_refuted_skip

    def to_dict(self) -> dict:
        """Return a nested plain-dict view: raw counts, rates rounded to 4 places, and the flag."""
        ground_truth = {
            "real_skip": self.n_real_skip,
            "correctly_sequenced": self.n_correctly_sequenced,
        }
        ledger = {
            "refuted": self.n_refuted,
            "refuted_skip": self.n_refuted_skip,
            "refuted_fired": self.n_refuted_fired,
            "refuted_skip_mattered": self.n_refuted_skip_mattered,
        }
        derived = {
            "missed_precursor_recall": round(self.missed_precursor_recall, 4),
            "false_refute_rate": round(self.false_refute_rate, 4),
            "fire_precision": round(self.fire_precision, 4),
            "mattered_recall": round(self.mattered_recall, 4),
        }
        return {
            "n": self.n,
            "grid": ground_truth,
            "firing": ledger,
            "rates": derived,
            "net_positive": self.net_positive,
        }
186
+
187
+
188
def score(
    grammar: PrecursorGrammar,
    cases: Iterable[PrecursorCase],
    policy: PrecursorPolicy = PrecursorPolicy(),
    *,
    _classify=classify_call,
) -> PrecursorEvalReport:
    """Classify every labelled case under `grammar` and tally the results into a report.

    Each case is passed through `_classify(case.call, case.stream, grammar, policy)`, which
    defaults to the same `classify_call` the consumer's gate uses, so the tallies reflect what
    would actually fire. A case counts as a fire only when the returned stance is
    `EvidenceStance.REFUTED`; any other stance is a non-fire.

    Args:
        grammar: The precursor grammar under evaluation.
        cases: Labelled cases; consumed exactly once, in order.
        policy: Policy forwarded unchanged to the classifier.
        _classify: Classifier callable, keyword-only (presumably a seam for injecting a stub
            in tests; confirm against callers).

    Returns:
        A `PrecursorEvalReport` holding the case count, the ground-truth grid and the firing
        ledger.

    Invariant: a fire is attributed to `n_refuted_skip` or `n_refuted_fired` only when the case
    also carries the matching ground-truth label. The two labels are mutually exclusive, so
    `n_refuted_skip + n_refuted_fired <= n_refuted` and
    `n_refuted_skip_mattered <= n_refuted_skip`. Cases with `precursor_required` false can
    still raise `n_refuted`, but land in neither ground-truth cell.
    """
    total = 0
    skips = sequenced = 0
    fires = fires_on_skip = fires_on_sequenced = fires_on_skip_mattered = 0

    # `total` ends as the number of cases seen; it stays 0 for an empty iterable.
    for total, case in enumerate(cases, start=1):
        outcome = _classify(case.call, case.stream, grammar, policy)
        gate_fired = outcome.stance is EvidenceStance.REFUTED
        if gate_fired:
            fires += 1

        # A call that needed no precursor belongs to neither ground-truth cell.
        if not case.precursor_required:
            continue

        if case.precursor_actually_fired:
            # Correctly sequenced: any fire here is a false REFUTED.
            sequenced += 1
            if gate_fired:
                fires_on_sequenced += 1
        else:
            # Real skip: a fire here is a useful catch.
            skips += 1
            if gate_fired:
                fires_on_skip += 1
                if case.mattered_to_score:
                    fires_on_skip_mattered += 1

    return PrecursorEvalReport(
        n=total,
        n_real_skip=skips,
        n_correctly_sequenced=sequenced,
        n_refuted=fires,
        n_refuted_skip=fires_on_skip,
        n_refuted_fired=fires_on_sequenced,
        n_refuted_skip_mattered=fires_on_skip_mattered,
    )