PyPI - dos-kernel - Versions diffs - 0.22.0__py3-none-win_amd64.whl - Mend

dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (178) hide show

dos/__init__.py +261 -0
dos/_bin/dos-hook.exe +0 -0
dos/_filelock.py +255 -0
dos/_job_policy.py +97 -0
dos/_tree.py +145 -0
dos/admission.py +433 -0
dos/answer_shape.py +299 -0
dos/arbiter.py +859 -0
dos/archive_lock.py +266 -0
dos/arg_provenance.py +814 -0
dos/attest.py +472 -0
dos/breaker.py +311 -0
dos/churn.py +226 -0
dos/claim_extract.py +229 -0
dos/claim_ttl.py +150 -0
dos/cli.py +8721 -0
dos/commit_audit.py +666 -0
dos/completion.py +466 -0
dos/concurrency_class.py +154 -0
dos/config.py +1380 -0
dos/config_lint.py +464 -0
dos/cooldown.py +390 -0
dos/coverage.py +387 -0
dos/dangling_intent.py +287 -0
dos/data_class.py +397 -0
dos/decisions.py +1274 -0
dos/decisions_tui.py +251 -0
dos/dispatch_top.py +740 -0
dos/dispatch_top_tui.py +116 -0
dos/drivers/__init__.py +40 -0
dos/drivers/ci_status.py +630 -0
dos/drivers/citation_resolve.py +703 -0
dos/drivers/decision_stop.py +98 -0
dos/drivers/export_file.py +173 -0
dos/drivers/export_otlp.py +275 -0
dos/drivers/export_statsd.py +242 -0
dos/drivers/hook_dialects.py +391 -0
dos/drivers/job.py +47 -0
dos/drivers/llm_judge.py +360 -0
dos/drivers/memory_recall.py +1231 -0
dos/drivers/notify_slack.py +373 -0
dos/drivers/notify_webhook.py +251 -0
dos/drivers/operator_judge.py +114 -0
dos/drivers/os_acceptance.py +228 -0
dos/drivers/paste_log.py +132 -0
dos/drivers/plan_scope.py +133 -0
dos/drivers/self_improve.py +375 -0
dos/drivers/similarity_judge.py +249 -0
dos/drivers/state_diff.py +274 -0
dos/drivers/supervisor.py +347 -0
dos/drivers/watchdog.py +363 -0
dos/drivers/workshop.py +160 -0
dos/durable_schema.py +344 -0
dos/effect_witness.py +393 -0
dos/efficiency.py +318 -0
dos/enforce.py +414 -0
dos/enumerate.py +776 -0
dos/env_print.py +378 -0
dos/event_severity.py +258 -0
dos/evidence.py +692 -0
dos/exec_capability.py +256 -0
dos/export_cursor.py +143 -0
dos/exporter.py +320 -0
dos/firing_label.py +353 -0
dos/fleet_roll.py +226 -0
dos/gate_classify.py +827 -0
dos/gh4_coverage.py +179 -0
dos/git_delta.py +122 -0
dos/guard.py +215 -0
dos/health.py +552 -0
dos/help_summary.py +519 -0
dos/home.py +934 -0
dos/hook_binary.py +194 -0
dos/hook_dialect.py +271 -0
dos/hook_exit.py +191 -0
dos/hook_install.py +437 -0
dos/id_alloc.py +304 -0
dos/improve.py +499 -0
dos/intent_ledger.py +635 -0
dos/interpret.py +176 -0
dos/intervention.py +769 -0
dos/intervention_eval.py +371 -0
dos/journal_delta.py +308 -0
dos/judge_eval.py +328 -0
dos/judges.py +366 -0
dos/lane_infer.py +127 -0
dos/lane_journal.py +1001 -0
dos/lane_lease.py +952 -0
dos/lane_overlap.py +228 -0
dos/lease_health.py +282 -0
dos/lifecycle.py +211 -0
dos/liveness.py +352 -0
dos/lock_modes.py +185 -0
dos/log_source.py +395 -0
dos/loop_decide.py +1746 -0
dos/marker_gate.py +254 -0
dos/marker_sensor.py +396 -0
dos/noop_streak.py +280 -0
dos/notify.py +479 -0
dos/observe.py +175 -0
dos/oracle.py +1661 -0
dos/overlap_eval.py +214 -0
dos/overlap_policy.py +342 -0
dos/packet_sidecar.py +267 -0
dos/phase_shipped.py +1985 -0
dos/pick_priority.py +225 -0
dos/pickable.py +369 -0
dos/picker_oracle.py +1037 -0
dos/plan_board.py +513 -0
dos/plan_board_tui.py +113 -0
dos/plan_source.py +455 -0
dos/posttool_sensor.py +528 -0
dos/precursor_gate.py +499 -0
dos/precursor_gate_eval.py +239 -0
dos/preflight.py +825 -0
dos/pretool_sensor.py +490 -0
dos/proc_delta.py +181 -0
dos/productivity.py +296 -0
dos/provider_limit.py +242 -0
dos/py.typed +4 -0
dos/reason_morphology.py +299 -0
dos/reasons.py +449 -0
dos/reconcile.py +173 -0
dos/recurring_wedge.py +206 -0
dos/render.py +393 -0
dos/result_state.py +468 -0
dos/resume.py +578 -0
dos/resume_evidence.py +293 -0
dos/retention.py +344 -0
dos/reward.py +372 -0
dos/rewind.py +587 -0
dos/rewind_evidence.py +168 -0
dos/rewind_tokens.py +252 -0
dos/run_id.py +342 -0
dos/scope.py +520 -0
dos/scope_source.py +382 -0
dos/scout.py +982 -0
dos/self_modify.py +209 -0
dos/sibling_scan.py +569 -0
dos/skills/EXAMPLES.md +584 -0
dos/skills/dos-class-cycle/SKILL.md +107 -0
dos/skills/dos-dispatch/SKILL.md +177 -0
dos/skills/dos-dispatch-loop/SKILL.md +254 -0
dos/skills/dos-goal-gate/SKILL.md +269 -0
dos/skills/dos-next-up/SKILL.md +231 -0
dos/skills/dos-promote/SKILL.md +114 -0
dos/skills/dos-replan/SKILL.md +159 -0
dos/skills/dos-replan-loop/SKILL.md +114 -0
dos/skills/dos-self-improve/SKILL.md +213 -0
dos/skills/dos-supervise-loop/SKILL.md +180 -0
dos/skills/dos-unstick/SKILL.md +108 -0
dos/skills/dos-witness-claim/SKILL.md +251 -0
dos/stamp.py +1002 -0
dos/state_health.py +387 -0
dos/status.py +114 -0
dos/stop_policy.py +334 -0
dos/supervise.py +1014 -0
dos/testwitness.py +392 -0
dos/timeline.py +1027 -0
dos/tokens.py +485 -0
dos/tool_stream.py +393 -0
dos/tool_stream_eval.py +226 -0
dos/trace.py +524 -0
dos/verdict.py +140 -0
dos/verdict_cli.py +189 -0
dos/verdict_journal.py +497 -0
dos/verdict_rollup.py +217 -0
dos/verdicts.py +181 -0
dos/wedge_reason.py +282 -0
dos_kernel-0.22.0.dist-info/METADATA +859 -0
dos_kernel-0.22.0.dist-info/RECORD +178 -0
dos_kernel-0.22.0.dist-info/WHEEL +5 -0
dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
dos_mcp/__init__.py +52 -0
dos_mcp/py.typed +2 -0
dos_mcp/server.py +779 -0

dos/efficiency.py ADDED Viewed

@@ -0,0 +1,318 @@
+"""EFF — the token-effectiveness verdict: *did the tokens this run spent buy work?*
+The **token-economics completion of `productivity()`**. The kernel already has two
+loop-economics verdicts and a clean gap between them:
+    liveness.classify      (ProgressEvidence, policy)   -> did state move AT ALL?      (a binary, lifetime count)
+    productivity.classify  (WorkHistory, policy)        -> is the work-per-step RATE fading?  (a trend over steps)
+    efficiency.classify    (EfficiencyEvidence, policy) -> did the tokens buy WORK?     (a ratio: work per token)
+                           ^ THIS module
+`liveness` reads a single since-start count; `productivity` reads a *trend* of
+per-step work deltas; neither relates the work to its **price**. A run can be
+ADVANCING (it committed) AND PRODUCTIVE (each step lands work) and still be
+spending ten times the tokens that work was worth — the gap between *did it do
+work?* and *was the work worth what it cost?* That gap is **efficiency**, and it
+is the question an operator means by "token effectiveness": not "is the run
+moving" but "is the run **spending well**." EFF is `productivity`'s lateral
+sibling, re-aimed from a trend over time onto a **ratio**: `work / tokens`.
+**Byte-clean by construction (the docs/138 invariant).** Both inputs are counts
+the *runtime / environment* authors, never the judged agent's narration:
+  * `tokens` — what the model API billed this run (the usage record the provider
+    returns), the same env-authored counter `liveness.tokens_spent_since` reads.
+  * `work` — a count of ground-truth work the environment measured: commits
+    landed, bytes diffed, tests passed — the same `productivity` work-unit, the
+    same kind of thing `verify` confirms off git. Whatever unit the host chooses.
+So WASTEFUL is "the environment recorded N tokens spent and ~0 work landed,"
+never "the agent says it was inefficient." A quantity, not a self-report — and
+crucially **non-forgeable in the direction that matters**: an agent cannot move
+the verdict toward EFFICIENT by *narrating* productivity, because the numerator
+is work the environment witnessed (a commit the git machinery authored, a test
+the runner authored), not a claim the agent emitted. This is the same reason
+`reward.admit` trusts the read-back and not the answer text: the bytes that move
+the verdict are bytes the claimant did not write.
+**EFF reports a price, never a quality.** Like `productivity` says the *rate*
+fell (never that the work was *wrong*), EFF says the *cost per unit of work* is
+high — it never says the work was bad. A run can be perfectly correct and
+WASTEFUL (it burned tokens deliberating, re-reading, marker-spinning); it can be
+EFFICIENT and wrong (cheap garbage). Quality is an advisory judge's call
+(`llm_judge`), never this deterministic verb — the distrust-state / distrust-
+judgment line the whole temporal-verdict family draws.
+**Withhold the accusation until there is enough spend to judge.** The whole
+reason EFF has a `min_tokens` floor is the `productivity.min_steps` reason: a run
+that has barely started has spent too little to have an honest ratio (3 tokens
+and 0 work is not a wasteful run, it is a run that has not done anything yet).
+Below the floor EFF returns EFFICIENT-benign ("not enough spend to judge") — the
+young-and-alive guard, lateral. The accusation (COSTLY / WASTEFUL) fires only
+once the run has spent enough that a low ratio is real signal.
+**No-telemetry / no-plan discipline** (the `test_verify_no_plan` sibling, the
+strongest of the verdict family alongside `productivity`): EFF needs *nothing*
+but the two counts the caller already has. No git, no registry, no journal, no
+clock — `classify()` makes no I/O at all (EFF is timeless, like `productivity`;
+it reads two numbers, not ages). A caller with a work count and a token count
+gets a verdict; a caller with too few tokens gets the honest "not enough spend to
+judge" (EFFICIENT-benign).
+"""
+from __future__ import annotations
+import enum
+from dataclasses import dataclass
+class Efficiency(str, enum.Enum):
+    """Token-effectiveness verdict: exactly one of three mutually exclusive states.
+    Mixing in `str` makes each member equal to its own string value, so a member
+    can be emitted as a CLI stdout token or used as a key in an exit-code map
+    without any lookup table in between (mirrors `productivity.Productivity` and
+    `liveness.Liveness`).
+    """
+    # Work-per-token is at/above the floor, or too little was spent to judge.
+    EFFICIENT = "EFFICIENT"
+    # Nonzero work landed, but the work-per-token ratio is under the floor.
+    COSTLY = "COSTLY"
+    # Meaningful tokens were spent and ~0 work landed: the tokens bought nothing.
+    WASTEFUL = "WASTEFUL"
+    def __str__(self) -> str:  # pragma: no cover - trivial
+        # Print as the bare value string.
+        return self.value
+@dataclass(frozen=True)
+class EfficiencyPolicy:
+    """The thresholds that separate EFFICIENT / COSTLY / WASTEFUL — policy, not mechanism.
+    The same "mechanism is kernel, thresholds are config" split as
+    `productivity`'s `min_steps`/`floor` and `liveness`'s windows. The defaults
+    are GENERIC; a workspace declares its own in `dos.toml [efficiency]` (the
+    closed-config-as-data pattern, the forward-looking seam `productivity` also
+    documents).
+      min_tokens — the **minimum tokens spent** before EFF will accuse a run of
+                   being COSTLY / WASTEFUL. Below it the run has spent too little
+                   to have an honest ratio (a handful of tokens and no work is a
+                   run that has barely started, not a wasteful one), and the
+                   verdict withholds the accusation. The token analogue of
+                   `productivity.min_steps` — the `liveness.grace_ms` guard,
+                   measured in spend instead of steps or time.
+      floor      — the **work-per-token efficiency floor**: the minimum ratio
+                   `work / tokens` a run must clear to be EFFICIENT. Below it (but
+                   with nonzero work) the run is COSTLY — it is doing work, but
+                   paying a lot per unit. The UNIT of `work` is the host's
+                   (commits, changed bytes, passed tests); the kernel only
+                   compares the ratio to the floor. A float, because work-per-
+                   token is normally « 1 (one commit might be tens of thousands of
+                   tokens → a floor like 0.00002 commits/token, or — far more
+                   legibly — the host counts work in a coarser unit so the floor
+                   is a readable number).
+    Defaults: `min_tokens=1000` (a run that has spent under ~1k tokens has barely
+    started — too little to judge), `floor=0.0` (DISABLED by default — see below).
+    **Why the default floor is 0.0 (disabled), not a guessed number.** Unlike
+    `productivity`, which could lift a real constant from Claude Code's own loop
+    (`tokenBudget.ts`'s 500-token diminishing threshold), there is no universal
+    "good" work-per-token ratio — it depends entirely on what the host counts as a
+    work unit (a ratio sensible for "changed bytes" is meaningless for "commits").
+    Shipping a guessed floor would manufacture COSTLY verdicts out of a unit
+    mismatch (the docs/235 slice-must-have-power lesson: a threshold that fires for
+    the wrong reason is worse than none). So the default floor is 0.0 — every
+    nonzero-work run is EFFICIENT until the host declares a floor that means
+    something for *its* unit. The one verdict EFF always gives for free, no floor
+    needed, is **WASTEFUL** (zero work for meaningful spend), because "tokens
+    bought literally nothing" is unit-independent: 0 work is 0 work whatever the
+    unit. That is the cost-free, always-correct half of the verdict; COSTLY is the
+    opt-in half a host arms by setting a floor.
+    Raises:
+        ValueError: if `min_tokens` or `floor` is negative or NaN. NaN is rejected
+            because every comparison against it is false, so a NaN threshold would
+            silently switch its rung off instead of failing loudly.
+    """
+    min_tokens: int = 1000   # below this spend, withhold the accusation (the productivity.min_steps analogue)
+    floor: float = 0.0       # work-per-token floor; 0.0 = disabled (only WASTEFUL fires) — see docstring
+    def __post_init__(self) -> None:
+        # Phrased as `not (x >= 0)` instead of `x < 0` on purpose: `nan < 0` is
+        # False, so the `<` form would wave a NaN threshold straight through.
+        if not self.min_tokens >= 0:
+            raise ValueError("min_tokens must be non-negative")
+        if not self.floor >= 0:
+            raise ValueError("the work-per-token floor must be non-negative")
+# Shared module-level default: generic thresholds with the COSTLY floor disabled.
+DEFAULT_POLICY = EfficiencyPolicy()
+@dataclass(frozen=True)
+class EfficiencyEvidence:
+    """Frozen pair of counts that `classify()` judges; the CALLER measures both.
+    Nothing in this class performs I/O or reads a clock — it only holds two
+    numbers. The caller's boundary (the `dos efficiency` evidence-gather, or a
+    loop reading the provider usage record plus its own git delta) measures the
+    work and the spend and freezes them here.
+      work   — how many ground-truth **work units** the environment measured for
+               the run (commits landed, bytes diffed, tests passed — whichever
+               unit the host counts). Never negative: the host passes a
+               magnitude, not a signed regression, and a run that landed nothing
+               passes 0.
+      tokens — how many **tokens** the run spent (the provider usage record).
+               Never negative. Zero is the degenerate "no spend yet" case, for
+               which `ratio` reports 0.0 instead of dividing by zero.
+    Both counts are meant to be environment-authored (what git / the test runner
+    witnessed, what the API billed), never the agent's own narration. The
+    quotient `work / tokens` is the run's efficiency.
+    Raises:
+        ValueError: if `work` or `tokens` is negative (`work` is checked first).
+    """
+    work: int = 0
+    tokens: int = 0
+    def __post_init__(self) -> None:
+        # Ordered so that `work` is validated before `tokens`.
+        checks = (
+            (self.work, "work must be non-negative (a count of work done)"),
+            (self.tokens, "tokens must be non-negative (a count of tokens spent)"),
+        )
+        for count, complaint in checks:
+            if count < 0:
+                raise ValueError(complaint)
+    @property
+    def ratio(self) -> float:
+        """Efficiency as work per token; 0.0 when no tokens were spent.
+        The zero-spend case short-circuits to 0.0 so this never divides by zero."""
+        return 0.0 if self.tokens <= 0 else self.work / self.tokens
+    @classmethod
+    def of(cls, work: int, tokens: int) -> "EfficiencyEvidence":
+        """Alternate constructor: wrap a work count and a token count."""
+        return cls(tokens=tokens, work=work)
+@dataclass(frozen=True)
+class EfficiencyVerdict:
+    """What `classify()` hands back: the verdict plus the facts that drove it.
+      verdict  — the typed `Efficiency` state.
+      reason   — a one-line, operator-facing summary (the tally-row string).
+      evidence — the `EfficiencyEvidence` the call was made on, carried along so
+                 `dos efficiency --json` can emit the verdict *and the facts
+                 behind it* in one object: the operator sees not just WASTEFUL
+                 but *why* (e.g. 80,000 tokens spent, 0 work landed), and not
+                 just COSTLY but the ratio that fell under the floor.
+    `to_dict` produces the json shape.
+    """
+    verdict: Efficiency
+    reason: str
+    evidence: EfficiencyEvidence
+    def to_dict(self) -> dict:
+        """Return the verdict as a plain dict: verdict value, reason, and evidence."""
+        facts = self.evidence
+        # The ratio is derived from the two counts but echoed explicitly so a
+        # consumer of the dict does not have to recompute it.
+        evidence_payload = {
+            "work": facts.work,
+            "tokens": facts.tokens,
+            "ratio": facts.ratio,
+        }
+        return {
+            "verdict": self.verdict.value,
+            "reason": self.reason,
+            "evidence": evidence_payload,
+        }
+def classify(
+    evidence: EfficiencyEvidence, policy: EfficiencyPolicy = DEFAULT_POLICY
+) -> EfficiencyVerdict:
+    """Classify a run's token effectiveness from its work and its spend. PURE — no I/O.
+    Reads the ladder top to bottom (this function IS the answer to "did the tokens
+    buy work?"):
+      1. EFFICIENT (too little spend) — fewer than `min_tokens` tokens spent (or
+         zero): the run has barely started; there is not enough spend to have an
+         honest ratio, so withhold the COSTLY/WASTEFUL accusation (the
+         `productivity` young-and-alive guard, lateral). Checked FIRST so a
+         just-launched run with one token and no commit is never mislabelled
+         WASTEFUL on a spend technicality.
+      2. WASTEFUL — meaningful tokens spent (`tokens >= min_tokens`) AND zero work
+         landed (`work == 0`): the tokens bought nothing — the degenerate floor of
+         inefficiency, the unit-independent half of the verdict (0 work is 0 work
+         whatever the unit, so this fires with NO floor needed). Named distinctly
+         from COSTLY (a fading-but-nonzero ratio) because zero is the operator's
+         clearest "the spend was pure overhead" signal — the marker-storm /
+         spin-without-shipping rung. Checked before COSTLY so an exact zero is
+         named precisely.
+      3. COSTLY — meaningful spend AND nonzero work AND the ratio under `floor`:
+         the run is doing work but paying a lot per unit (fading efficiency, but
+         not pure waste). The opt-in half of the verdict — fires only when the host
+         has armed a `floor` that means something for its work unit (with the
+         default `floor=0.0` this rung never fires; every nonzero-work run is
+         EFFICIENT). The efficiency analogue of `productivity.DIMINISHING`.
+      4. EFFICIENT — none of the above: the ratio is at/above the floor (or the
+         floor is disabled and work is nonzero). The tokens bought their work.
+    The COSTLY test is `ratio < floor` (only a ratio strictly under the floor is
+    costly), so a ratio exactly AT the floor is EFFICIENT — the floor is the
+    minimum acceptable efficiency, inclusive. With the default `floor=0.0`, no
+    nonzero-work ratio is under it, so only WASTEFUL ever fires without an
+    explicit floor.
+    Args:
+        evidence: the frozen work/token counts to judge.
+        policy: the thresholds to judge against; defaults to `DEFAULT_POLICY`.
+    Returns:
+        An `EfficiencyVerdict` carrying the typed verdict, a one-line reason, and
+        the same `evidence` object that was passed in.
+    """
+    tokens = evidence.tokens
+    work = evidence.work
+    # 1. EFFICIENT (too little spend) — not enough tokens spent to judge a ratio.
+    #    Withhold the COSTLY/WASTEFUL accusation; report the benign verdict. A run
+    #    that has spent nothing at all also lands here (no spend, no problem yet).
+    if tokens < policy.min_tokens or tokens == 0:
+        if tokens < policy.min_tokens:
+            spend_note = f"{tokens} token(s) spent (< min {policy.min_tokens})"
+        else:
+            # Only reachable with tokens == 0 and min_tokens == 0. "< min 0" would
+            # be a false statement there, so describe the zero-spend case as what
+            # it is instead.
+            spend_note = f"{tokens} token(s) spent (no spend yet)"
+        return EfficiencyVerdict(
+            verdict=Efficiency.EFFICIENT,
+            reason=(
+                f"{spend_note} — not enough "
+                f"spend to judge token effectiveness; no efficiency problem yet"
+            ),
+            evidence=evidence,
+        )
+    # 2. WASTEFUL — meaningful spend bought ZERO work. The pure-overhead rung, named
+    #    distinctly from a merely-low ratio so the operator's clearest signal
+    #    ("the tokens bought nothing") is not blurred into COSTLY. Unit-independent:
+    #    fires with no floor, because 0 work is 0 work whatever the host's unit.
+    if work == 0:
+        return EfficiencyVerdict(
+            verdict=Efficiency.WASTEFUL,
+            reason=(
+                f"{tokens} tokens spent and 0 work units landed — the spend bought "
+                f"nothing (pure overhead)"
+            ),
+            evidence=evidence,
+        )
+    ratio = evidence.ratio
+    # 3. COSTLY — a low-but-nonzero efficiency: the run is doing work but paying a
+    #    lot per unit. The opt-in half — fires only when the host armed a floor that
+    #    means something for its work unit. With the default floor=0.0 this never
+    #    fires (no nonzero ratio is < 0.0). The productivity.DIMINISHING analogue.
+    if ratio < policy.floor:
+        return EfficiencyVerdict(
+            verdict=Efficiency.COSTLY,
+            reason=(
+                f"{work} work units for {tokens} tokens — {ratio:.6g} work/token, "
+                f"under the {policy.floor:.6g} floor (doing work, but spending a lot "
+                f"per unit)"
+            ),
+            evidence=evidence,
+        )
+    # 4. EFFICIENT — the ratio cleared the floor (or the floor is disabled and work
+    #    is nonzero). The tokens bought their work.
+    return EfficiencyVerdict(
+        verdict=Efficiency.EFFICIENT,
+        reason=(
+            f"{work} work units for {tokens} tokens — {ratio:.6g} work/token "
+            f"(at/above the {policy.floor:.6g} floor) — the spend bought its work"
+        ),
+        evidence=evidence,
+    )