npm - @possumtech/rummy - Versions diffs - 2.0.1 → 2.1.0 - Mend

@possumtech/rummy 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

package/.env.example +12 -7
package/BENCH_ENVIRONMENT.md +230 -0
package/CLIENT_INTERFACE.md +396 -0
package/PLUGINS.md +93 -1
package/SPEC.md +305 -28
package/bin/postinstall.js +2 -2
package/bin/rummy.js +2 -2
package/last_run.txt +5617 -0
package/migrations/001_initial_schema.sql +2 -1
package/package.json +6 -2
package/scriptify/cache_probe.js +66 -0
package/scriptify/cache_probe_grok.js +74 -0
package/service.js +22 -11
package/src/agent/AgentLoop.js +33 -139
package/src/agent/ContextAssembler.js +2 -9
package/src/agent/Entries.js +36 -101
package/src/agent/ProjectAgent.js +2 -9
package/src/agent/TurnExecutor.js +45 -83
package/src/agent/XmlParser.js +247 -273
package/src/agent/budget.js +5 -28
package/src/agent/config.js +38 -0
package/src/agent/errors.js +7 -13
package/src/agent/httpStatus.js +1 -19
package/src/agent/known_store.sql +7 -2
package/src/agent/materializeContext.js +12 -17
package/src/agent/pathEncode.js +5 -0
package/src/agent/rummyHome.js +9 -0
package/src/agent/runs.sql +18 -0
package/src/agent/tokens.js +2 -8
package/src/hooks/HookRegistry.js +1 -16
package/src/hooks/Hooks.js +8 -33
package/src/hooks/PluginContext.js +3 -21
package/src/hooks/RpcRegistry.js +1 -4
package/src/hooks/RummyContext.js +2 -16
package/src/hooks/ToolRegistry.js +5 -15
package/src/llm/LlmProvider.js +28 -23
package/src/llm/errors.js +41 -4
package/src/llm/openaiStream.js +125 -0
package/src/llm/retry.js +61 -15
package/src/plugins/budget/budget.js +14 -81
package/src/plugins/cli/README.md +87 -0
package/src/plugins/cli/bin.js +61 -0
package/src/plugins/cli/cli.js +120 -0
package/src/plugins/env/README.md +2 -1
package/src/plugins/env/env.js +4 -6
package/src/plugins/env/envDoc.md +2 -2
package/src/plugins/error/error.js +23 -23
package/src/plugins/file/file.js +2 -22
package/src/plugins/get/get.js +12 -34
package/src/plugins/get/getDoc.md +5 -3
package/src/plugins/hedberg/edits.js +1 -11
package/src/plugins/hedberg/hedberg.js +3 -26
package/src/plugins/hedberg/normalize.js +1 -5
package/src/plugins/hedberg/patterns.js +4 -15
package/src/plugins/hedberg/sed.js +1 -7
package/src/plugins/helpers.js +28 -20
package/src/plugins/index.js +25 -41
package/src/plugins/instructions/README.md +18 -0
package/src/plugins/instructions/instructions.js +13 -76
package/src/plugins/instructions/instructions.md +19 -18
package/src/plugins/instructions/instructions_104.md +5 -4
package/src/plugins/instructions/instructions_105.md +16 -15
package/src/plugins/instructions/instructions_106.md +15 -14
package/src/plugins/instructions/instructions_107.md +13 -6
package/src/plugins/known/README.md +26 -6
package/src/plugins/known/known.js +36 -34
package/src/plugins/log/README.md +2 -2
package/src/plugins/log/log.js +6 -33
package/src/plugins/ollama/ollama.js +50 -66
package/src/plugins/openai/openai.js +26 -44
package/src/plugins/openrouter/openrouter.js +28 -52
package/src/plugins/policy/README.md +8 -2
package/src/plugins/policy/policy.js +8 -21
package/src/plugins/prompt/README.md +22 -0
package/src/plugins/prompt/prompt.js +8 -16
package/src/plugins/rm/rm.js +5 -2
package/src/plugins/rm/rmDoc.md +4 -4
package/src/plugins/rpc/README.md +2 -1
package/src/plugins/rpc/rpc.js +51 -47
package/src/plugins/set/README.md +5 -1
package/src/plugins/set/set.js +23 -33
package/src/plugins/set/setDoc.md +1 -1
package/src/plugins/sh/README.md +2 -1
package/src/plugins/sh/sh.js +5 -11
package/src/plugins/sh/shDoc.md +2 -2
package/src/plugins/stream/README.md +6 -5
package/src/plugins/stream/stream.js +6 -35
package/src/plugins/telemetry/telemetry.js +26 -19
package/src/plugins/think/think.js +4 -7
package/src/plugins/unknown/unknown.js +8 -13
package/src/plugins/update/update.js +36 -35
package/src/plugins/update/updateDoc.md +3 -3
package/src/plugins/xai/xai.js +30 -20
package/src/plugins/yolo/yolo.js +8 -41
package/src/server/ClientConnection.js +17 -47
package/src/server/SocketServer.js +14 -14
package/src/server/protocol.js +1 -10
package/src/sql/functions/slugify.js +5 -7
package/src/sql/v_model_context.sql +4 -11
package/turns/cli_1777462658211/turn_001.txt +772 -0
package/turns/cli_1777462658211/turn_002.txt +606 -0
package/turns/cli_1777462658211/turn_003.txt +667 -0
package/turns/cli_1777462658211/turn_004.txt +297 -0
package/turns/cli_1777462658211/turn_005.txt +301 -0
package/turns/cli_1777462658211/turn_006.txt +262 -0
package/turns/cli_1777465095132/turn_001.txt +715 -0
package/turns/cli_1777465095132/turn_002.txt +236 -0
package/turns/cli_1777465095132/turn_003.txt +287 -0
package/turns/cli_1777465095132/turn_004.txt +694 -0
package/turns/cli_1777465095132/turn_005.txt +422 -0
package/turns/cli_1777465095132/turn_006.txt +365 -0
package/turns/cli_1777465095132/turn_007.txt +885 -0
package/turns/cli_1777465095132/turn_008.txt +1277 -0
package/turns/cli_1777465095132/turn_009.txt +736 -0

package/PLUGINS.md CHANGED Viewed

@@ -241,11 +241,49 @@ ctx = {
     toolSet,           // Set<string> of active tool names for this loop
     contextSize,       // Model context window size
     lastContextTokens, // Actual API tokens from the prior turn (0 on turn 1)
-    demoted,           // Mutable array — plugins push paths they summarized
     turn,              // Current turn number
 }
 ```
+#### Filter Priority Bands {#plugins_filter_bands}
+Filters run in ascending priority order. The packet renders in
+top-to-bottom order matching that — lower priority appears earlier in
+the message. Current `assembly.user` registrations:
+| Priority | Block | Plugin | Mutates per turn? |
+|---|---|---|---|
+| 50 | `<summarized>` | `known.js` | Slow — only on new entry |
+| 75 | `<visible>` | `known.js` | Fast — on every promote/demote |
+| 100 | `<log>` | `log.js` | Always — appends per action |
+| 200 | `<unknowns>` | `unknown.js` | On unknown lifecycle |
+| 250 | `<instructions>` | `instructions.js` | On phase transition |
+| 275 | `<budget>` | `budget.js` | Every turn (live) |
+| 300 | `<prompt>` | `prompt.js` | Stable within a loop |
+**Recommended ranges for new plugins** (for cache-friendly placement
+and predictable rendering position):
+| Range | Position | Use for |
+|---|---|---|
+| `0–49` | Top of user | Reserved (stable identity-tier blocks above `<summarized>`) |
+| `50–99` | Codebase data surface | Don't add here — owned by `known.js` |
+| `100–149` | History tier | Action history, timeline-style content |
+| `150–199` | Open slot | Inter-history blocks (e.g. recent-decisions, tracked progress) |
+| `200–249` | State tier | Model state (open questions, work-in-progress) |
+| `250–299` | Phase + budget | Avoid; current phase / budget arithmetic owned here |
+| `300–349` | Task | Reserved for prompt-tier content |
+| `350–999` | Bottom | Append-after-prompt content (rare; usually wrong) |
+Within a band, lower priority = renders higher. Pick the smallest
+priority that lands you in the right band and leaves room above and
+below.
+`assembly.system` currently has no registrations — system message is
+the static identity surface (instructions base + tool docs). Adding
+to `assembly.system` invalidates the system-prefix cache on whatever
+provider you target; reserve for content that's truly stable per-run.
 ### Tool Docs {#plugins_tool_docs}
 Each tool plugin has a `*Doc.js` file with annotated line arrays.
@@ -282,6 +320,60 @@ entry = {
 Multiple handlers per scheme. Lower priority runs first. Return
 `false` to stop the chain.
+#### Reporting outcomes {#plugins_handler_outcomes}
+**The action entry IS its outcome.** Your handler finalizes the action's
+own log entry at `entry.resultPath`. Success and failure are two values
+of the same shape — body, state, outcome. The model sees both through
+the same channel under your tool's scheme:
+```js
+async handler(entry, rummy) {
+    const { entries: store, runId, turn, loopId } = rummy;
+    const result = await runMyTool(entry.attributes);
+    if (result.failed) {
+        await store.set({
+            runId, turn, loopId,
+            path: entry.resultPath,
+            body: result.failureMessage,
+            state: "failed",
+            outcome: result.label,    // "not_found", "validation", etc.
+        });
+        return;
+    }
+    await store.set({
+        runId, turn, loopId,
+        path: entry.resultPath,
+        body: result.output,
+        state: "resolved",
+    });
+}
+```
+That's the whole failure-reporting surface. Body is the result on
+success, the failure message on failure. State labels the verdict
+(`resolved` / `failed`). Outcome is a short machine-readable label.
+The framework reads the post-handler state of every recorded entry
+each turn; any `state="failed"` result counts as a strike toward
+`MAX_STRIKES`. You don't need to do anything else to make the strike
+fire — write the entry's outcome and the framework follows.
+You do **not** call `hooks.error.log.emit` from a tool handler. That
+hook is reserved for the framework's actionless-failure cases (parser
+warnings, dispatch crashes, runtime watchdog, budget overflow) — none
+of which a third-party plugin should be writing.
+If your handler throws, the framework catches and emits a status-500
+error entry on your behalf. That's the one case where the framework
+writes for you. Throw with intent; don't try-catch your own handler
+just to avoid a stack trace.
+See SPEC [failure_reporting](SPEC.md#failure_reporting) for the
+full contract and the rationale.
 ### full(entry) / summary(entry) {#plugins_views}
 Returns the string the model sees for this tool's entries at the

package/SPEC.md CHANGED Viewed

@@ -7,6 +7,31 @@ everything else.
 ---
+## Glossary {#glossary}
+Canonical meanings. When a doc, comment, test name, or commit message
+uses one of these words, it should mean exactly what's written here.
+| Term | Meaning |
+|---|---|
+| **run** | The alias-keyed lifetime of one project-agent invocation. Begins on `set run://{alias}` with a prompt; ends at terminal status (200/204/422/499/500). One run per alias; aliases are unique per project. |
+| **loop** | One `ask` or `act` invocation and all its continuation turns until terminal `<update>`, abandonment, or abort. A run can contain multiple loops if a fresh prompt arrives on an existing run. |
+| **turn** | One round-trip with the LLM: one assembled prompt sent, one response parsed. A loop is a sequence of turns. |
+| **mode** | `ask` (read-only — no proposals, no `<sh>`, no edits) or `act` (full tool surface). Per loop, set at the entry point. |
+| **phase** | (Primary, FCRM sense.) One of five FCRM states selected by `<update status="1XY">`: 104=Definition, 105=Discovery, 106=Demotion, 107=Deployment, 108=Verification. Maps to `instructions_10N.md` rendered in `<instructions>`. **The model-facing instructions call these "stages"** — same concept, dual vocabulary kept for the model's surface stability. Two non-FCRM uses of "phase" coexist in the codebase and AGENTS.md: (1) "two-phase turn execution" refers to RECORD→DISPATCH within a single turn; (2) AGENTS.md "Phase 1 / Phase 2 / ..." entries refer to project-development milestones (Schema, Primitives, etc.) — neither is the FCRM phase. Context disambiguates; if it doesn't, it's a doc bug. |
+| **stage** | Model-facing synonym for **phase**. Lives in `instructions_*.md` and tooldocs. |
+| **proposal** | A tool-call entry at status 202 awaiting client resolution (accept/reject). Side-effecting actions (`<sh>`, `<env>`, file `<set>`, file `<rm>`/`<mv>`/`<cp>`, `<ask_user>`) emit proposals. YOLO mode auto-accepts. |
+| **verdict** | The end-of-turn ruling from `hooks.error.verdict` (owned by the error plugin). Returns `{continue, status, reason}`. Decides whether the loop continues to another turn or terminates. |
+| **strike** | A turn whose verdict counts toward `MAX_STRIKES`. A strike fires when any action entry recorded that turn ends with `state="failed"`, when `turnErrors > 0` (any `error.log` entry that turn), or when cycle detection trips silently. The streak counter resets on a clean turn (no failed actions, no errors, no cycle); when it reaches `MAX_STRIKES`, the loop abandons at 499. |
+| **resolution** | Client's accept/reject of a proposal via `run/resolve` RPC. |
+| **dispatch** | The DISPATCH phase of a turn — actually executing recorded action entries. |
+**Hierarchy:** project ⊃ run ⊃ loop ⊃ turn. A turn is the smallest
+unit of model interaction. A strike is a per-turn property that
+accumulates across turns within a loop.
+---
 ## The Contract
 Rummy has one contract. Every actor speaks it.
@@ -227,7 +252,7 @@ Every entry plays one of four roles:
 | Role | Category | Section | Description |
 |------|----------|---------|-------------|
-| **Data** | `data` | `<context>` | Entries the model works with — persistent state and captured payload |
+| **Data** | `data` | `<summarized>` + `<visible>` | Entries the model works with — persistent state and captured payload. Summary line in `<summarized>` for visible+summarized tiers; full body in `<visible>` only when promoted. |
 | **Logging** | `logging` | `<log>` | Records of what happened — tool results, lifecycle signals |
 | **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
 | **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
@@ -262,10 +287,11 @@ across two namespaces as a direct consequence:
   scheme=`log`, category=`logging`. Renders in `<log>`.
 - **Payload channels** live in `{action}://turn_N/{slug}_N` —
   scheme=`{action}` (registered as `category: "data"`). Render in
-  `<context>`.
+  `<summarized>` (always, while tracked) and `<visible>` (when
+  promoted).
 This keeps `<log>` a terse audit trail (what happened, exit code,
-paths) while `<context>` carries the actual streamed bytes the model
+paths) while `<visible>` carries the actual streamed bytes the model
 reads. Conflating the two — e.g., writing channels under `log://...` —
 mislabels payload as audit and pollutes the logging section with
 multi-line command output. See [streaming_entries](#streaming_entries).
@@ -434,6 +460,32 @@ policy filtering, abort cascade). Plugin-tier convenience verbs
 don't invoke the handler chain. Plugin code that wants full handler
 semantics calls `hooks.tools.dispatch` directly.
+**Two-phase turn execution.** Model output flows through
+`TurnExecutor.execute` in strict order:
+1. **RECORD** — every parsed command is materialized as a
+   `log://turn_N/action/slug` audit entry via `#record()`. Each
+   tool's parser shape surfaces exactly one of `path` / `command` /
+   `question` as its addressable target; absent fields are treated
+   as empty so the validation gate catches bad shapes rather than
+   letting `undefined` propagate. Targets longer than 512 chars or
+   containing control characters are rejected as likely reasoning
+   bleed (the model's chain-of-thought leaking into a tool path).
+   Plugins can validate or transform via the `entry.recording`
+   filter before the row is committed.
+2. **DISPATCH** — recorded entries fire sequentially via
+   `hooks.tools.dispatch`. Each tool runs to completion before the
+   next starts. A failed entry sets `abortAfter`; subsequent
+   entries record as `outcome="aborted"`. Crashes inside dispatch
+   route through `hooks.error.log` at status 500 and trigger the
+   same abort cascade. After each entry, `proposal.prepare` lets
+   plugins materialize pending 202 proposals (e.g. `set`'s
+   search/replace revisions) from the just-recorded entry.
+Narration outside tags is fine when the turn also emitted at least
+one command — "OK", "Let me check:", reasoning prefixes are natural
+and don't trigger the no-actionable-tags error path.
 **Tool dispatch:** Commands are dispatched sequentially in the order
 the model emitted them. Each tool either succeeds (200), fails (400+),
 or proposes (202). On failure, all remaining tools are aborted. On
@@ -508,6 +560,68 @@ export default class Rm {
 A plugin can be multiple types. Known is a tool AND an assembly plugin.
+### Failure Reporting {#failure_reporting}
+**The action entry IS its outcome.** Every action plugin's handler
+finalizes the action's own log entry (`log://turn_N/{action}/{slug}`)
+with body, state, and outcome. Success and failure are two values of
+the same shape — only the field values change. The model sees both
+through the same channel, rendered under the action's scheme.
+```
+<get path="src/x.js" status="200">…file body…</get>          # success
+<get path="src/x.js" state="failed" outcome="not_found">     # failure
+  src/x.js not found
+</get>
+```
+State + outcome label the verdict; body is the result — file content
+on success, failure message on failure. No separate error entry is
+written for action-level failures; the model finds the failure exactly
+where it would find the success: at the action's own log path.
+**Strike attribution.** `error.js#verdict` looks up the post-handler
+state of every recorded entry on each turn. Any `state="failed"`
+result counts as a strike. Plugin authors write their action entry
+once with the right state; the strike machinery follows. They never
+call `error.log.emit` for action-level failures.
+**`error.log.emit` is for actionless failures** — failures that have
+no corresponding action entry to attach to:
+- Dispatch crash — the framework caught an exception thrown from inside
+  a handler before the handler had a chance to write its own entry.
+- Parser-level failures — malformed XML warnings, no-actionable-tags
+  responses, fired before any action entry could be recorded.
+- Runtime watchdog firings — `ContextExceededError`, RPC timeout,
+  stream timeout — not bound to a specific action.
+- Budget overflow — pre-dispatch rejection.
+`error.log.emit` writes a `log://turn_N/error/<slug>` entry and
+increments `state.turnErrors`, which also feeds strike accumulation.
+Both channels (action-entry state=failed and `error.log.emit`)
+contribute to the strike streak; either path advances it.
+**Recording-filter rejection.** Plugins on the `entry.recording` filter
+chain (e.g. `policy`) can return an entry with `state="failed"`. The
+framework writes that entry to the store before returning from
+`#record`, and dispatch skips it. The model sees the rejection at the
+action's own log path, exactly like any other action-level failure.
+Cycle detection is **silent** — it does not call `error.log.emit`.
+The strike accumulates internally via `state.turnErrors++`; on
+`MAX_STRIKES` the run abandons at 499 with a telemetry-side reason.
+The model sees no special signal, because telling the model "you're
+looping" invites superficial evasion (vary an attribute to bust the
+fingerprint) without addressing the underlying confusion.
+**Plugin author contract.** Your handler does one job: finalize the
+action's own log entry with the right body/state/outcome. That's the
+whole API for failure reporting. You do not call `error.log.emit`.
+If your handler throws, the framework catches and routes through
+`error.log.emit` at status 500 — that's the only situation where the
+framework writes on your behalf.
 ### Mode Enforcement {#mode_enforcement}
 Two mechanisms, operating at different layers:
@@ -520,9 +634,11 @@ Two mechanisms, operating at different layers:
 2. **Per-invocation filtering** — the `policy` plugin subscribes to
    `entry.recording` and inspects individual emissions for ask-mode
    violations that the tool-list alone can't catch (file-scheme `<set>`
-   edits, file `<rm>`, file-destination `<mv>`/`<cp>`). Rejects with
-   status 403 and emits `error://`. The tool remains advertised; the
-   specific invocation is blocked.
+   edits, file `<rm>`, file-destination `<mv>`/`<cp>`). Rejects by
+   marking the action entry `state="failed"`, `outcome="permission"`
+   with a body describing the rejection. Per the failure-reporting
+   contract — see [failure_reporting](#failure_reporting). The tool
+   remains advertised; the specific invocation is blocked.
 ### YOLO Mode {#yolo_mode}
@@ -634,11 +750,13 @@ log://turn_N/{action}/{slug}    scheme=log       category=logging   status=202
 {action}://turn_N/{slug}_1      scheme={action}  category=data      status=102 → 200/500
                                 body: primary stream (stdout for shell)
                                 summary="{command}" visibility=summarized
-                                (renders in <context>)
+                                (line in <summarized>; full body in
+                                 <visible> when promoted)
 {action}://turn_N/{slug}_2      scheme={action}  category=data      status=102 → 200/500
                                 body: alt stream (stderr for shell)
-                                (renders in <context>, often empty)
+                                (line in <summarized>; full body in
+                                 <visible> when promoted, often empty)
 ```
 `{action}` is the producer plugin's name (`sh`, `env`, future: `search`,
@@ -653,6 +771,16 @@ channels. Non-process producers (search, fetch) map their streams onto
 the same numeric space: `_1` for the primary data stream, `_2` for
 anomalies/errors, `_3`+ for auxiliary streams.
+**Search prefetch.** The `search` producer (provided by `rummy.web`
+when wired) may prefetch its result URLs as separate `<https>` data
+entries before the model emits any `<get>`. The model sees those
+pages as already-summarized data without having explicitly loaded
+them. Auditors reading dumps should be aware: the absence of a
+corresponding `log://turn_N/get/` for a URL does **not** mean the
+URL wasn't loaded — it may have arrived via search prefetch. The
+prefetch policy is the search plugin's implementation detail; the
+data entries themselves obey the streaming-producer shape above.
 **Status 102 ("Processing") marks an entry in mid-stream:** body is
 partial, will change; tokens grow as chunks arrive. Agents reading a
 102 entry use `<get>` with `line`/`limit` (including negative `line`
@@ -684,11 +812,26 @@ Two messages per turn. System = stable truth. User = active task.
     instructions text
         (instructions.md base template + tool docs injected via
          instructions.toolDocs filter; optional persona appended)
-    <context>
-        all category=data entries (knowns, files, http/https),
-        wrapped by known.js on assembly.system at priority 100
-    </context>
 [user message]
+    <summarized>
+        one entry per category=data entry whose visibility is visible
+        or summarized; plus the named carve-out (archived prompts pass
+        through with visibility="archived" so the model can <get> the
+        active prompt back). Each entry renders under its scheme tag
+        with its summarized projection as the tag body — this is the
+        compact-but-informative view produced by the plugin's summary()
+        hook (e.g. truncated knowns, code symbols for files, page
+        abstracts for URLs). Identity-keyed, slow-mutating: only grows
+        when a new entry lands. (known.js, assembly.user priority 50)
+    </summarized>
+    <visible>
+        each category=data entry whose visibility is visible, rendered
+        under its scheme tag with its visible projection as the tag
+        body (full body per the plugin's visible() hook). Working-set:
+        append on promote, remove on demote. A visible entry exists in
+        BOTH blocks — summary projection up top, full body below.
+        (known.js, assembly.user priority 75)
+    </visible>
     <log>
         action history — log:// entries + pre-latest prompts
         (log.js, assembly.user priority 100)
@@ -705,13 +848,23 @@ Two messages per turn. System = stable truth. User = active task.
 ```
 **System** = stable world state the model operates within (identity,
-tools, tool docs, reference context). Stable across turns within a
-run, which keeps prompt caching intact. **User** = active work (what
-the model is doing right now): history, open questions, current
-phase, and current prompt. The phase-specific `<instructions>` block
-lives in the user message precisely *because* it changes between
-turns — putting it in system would invalidate the cache on every
-phase transition.
+tools, tool docs). Stable across turns within a run, which keeps
+prompt caching intact. **User** = active work (what the model is
+doing right now): the project's data surface, history, open questions,
+current phase, and current prompt. Both phase-specific
+`<instructions>` and the codebase blocks (`<summarized>` / `<visible>`)
+live in the user message because they change turn-to-turn — putting
+mutable state in system would invalidate the cache on every promote
+or phase transition.
+**Why two blocks instead of one `<context>`.** Promote/demote is the
+dominant intra-phase operation. The former single-block `<context>` render
+invalidated the entire data surface on every promote/demote. With the split,
+`<summarized>` mutates only when a new entry lands (slow); `<visible>`
+mutates on every promote/demote (fast). Ordering slow-above-fast
+preserves the prefix cache for `<summarized>` across the common case.
+Cognitively: `<summarized>` is "what I know exists" (identity);
+`<visible>` is "what I'm reading right now" (working memory).
 The `<prompt>` tag is present on every turn — first turn and
 continuations alike. The model always sees its task. The active prompt
@@ -896,6 +1049,22 @@ and emits a 413 error via `hooks.error.log.emit` with the descriptive
 body (what was demoted, the 50% rule for the next turn). The model
 sees the `error://` entry next turn and adjusts.
+**Delta-from-actual prediction.** Post-dispatch uses
+`predictNextPacket = lastContextTokens + Σ countTokens(body) for rows added this turn`,
+not the conservative measureMessages estimator. Reason: a 60%+
+divergence between the pre-call `<prompt tokenUsage>` (real API
+prompt_tokens) and the post-check estimator made the model dismiss
+the budget as janky and stop following demote rules. The two numbers
+must live on the same scale.
+**Prior-turn-pressure fallback.** If post-dispatch finds nothing to
+demote in the current turn but the packet still overflows, the
+pressure is coming from prior-turn promotions the model never demoted
+itself. Demotion widens to all currently-visible entries in the run
+and the prompt is also demoted. Without this fallback, observed
+behavior was strikes accumulating on runs whose base context had
+drifted over ceiling through no fault of the current turn.
 **LLM-reported context exceeded.** If the LLM rejects the request
 with a "context too long" error (detected via the regex in
 `src/llm/errors.js`), the LlmProvider raises `ContextExceededError`
@@ -1075,6 +1244,16 @@ is raw JSON; parse client-side. Mid-turn emissions have `telemetry:
 null`; the final emission of each turn includes the full telemetry
 block (token usage, context distribution, cost).
+**Telemetry completeness guarantee.** Every `run/state` emission
+computes a real budget from real numbers — never undefined, never
+synthesized. When no fresh turn result is available
+(abort/max-turns/crash paths fire before any turn executed, or after
+a turn that produced no tokens), `AgentLoop.#emitRunState` reads the
+last turn's `context_tokens` from the DB. Absent means no turn ran
+yet; zero is the truth, not a fallback. The shape and the math are
+the same on every code path so the client's renderer never needs to
+discriminate by emission cause.
 `stream/cancelled` payload: `{ run, path, reason }`. Server has
 already transitioned the entries to 499 (`Client Closed Request`);
 client should stop sending `stream` chunks for that path.
@@ -1089,6 +1268,30 @@ client should stop sending `stream` chunks for that path.
 | accept | neither | `running` — healer decides |
 | error | any | `running` — error state, model retries |
+**RPC ack vs run terminal status.** `resolve` and `inject` return the
+*current* run status (typically 102 mid-run), not 200. The client's
+dispatch handler must distinguish the synchronous RPC ack from the
+asynchronous `run/state` notification that carries real terminal
+state at end-of-turn — otherwise an HTTP-style 200 ack on a
+successful resolve would prematurely close the document.
+**Proposal hook chain.** Resolution flows through three hook stages —
+two filters, then a pair of events — that plugins can subscribe to:
+- `proposal.accepting` (filter) — first plugin to return
+  `{ allow: false, outcome, body }` vetoes acceptance. The entry
+  resolves to `state="failed"` with the plugin-supplied outcome and
+  body. Used by `policy` for read-only enforcement and similar
+  guards. First veto wins; later filters don't run.
+- `proposal.content` (filter) — when acceptance proceeds, plugins
+  override the resolved body. Default is `output ?? ""`. The `set`
+  plugin uses this to prefer the proposed body it already staged
+  on the audit entry over whatever literal body the client passed
+  through `resolve`.
+- `proposal.accepted` / `proposal.rejected` (events) — fired after
+  the resolution is committed; plugins side-effect on either
+  outcome.
 ---
 ## Plugin System {#plugin_system}
@@ -1165,12 +1368,12 @@ one unified mechanism.
 ### Pattern Distribution
-Hedbergian pattern matching (globs, body filters, preview) is taught
+Hedbergian pattern matching (globs, body filters, manifest) is taught
 across multiple tools, not concentrated in one. `get` shows content
-filtering. `cp` shows glob batch operations. `rm` shows preview safety.
+filtering. `cp` shows glob batch operations. `rm` shows manifest safety.
 Each tool reinforces the pattern vocabulary from a different angle.
 A model that sees `path="known://*"` in get, `path="known://plan_*"` in
-cp, and `path="known://temp_*" preview` in rm learns that patterns
+cp, and `path="known://temp_*" manifest` in rm learns that patterns
 are universal — not a feature of any single tool.
 ---
@@ -1205,11 +1408,14 @@ Termination protocol:
   (the claim of doneness is refuted by the failures)
 - `<update status="102">` → run continues
 - Multiple `<update>` → last one wins
-- No `<update>` + investigation tools → stall counter (RUMMY_MAX_STALLS)
 - No `<update>` + action-only tools → healer infers terminal from body
 - No `<update>` + plain text → healer infers terminal from body
-- Repeated commands → cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
-- Repeated update text without non-update work → stall (RUMMY_MAX_UPDATE_REPEATS)
+- Repeated turn fingerprints (commands, attributes, or empty turns) →
+  cycle detection (`RUMMY_MIN_CYCLES`, `RUMMY_MAX_CYCLE_PERIOD`); after
+  detection, strikes accumulate up to `RUMMY_MAX_STRIKES` then close 499.
+- Hard ceiling: `RUMMY_MAX_LOOP_TURNS` caps turns within a single loop,
+  regardless of any other guard. There is no per-run cap; a run may
+  comprise many loops.
 Format normalization:
 - Gemma `\`\`\`tool_code` fences → stripped before parsing
@@ -1218,6 +1424,78 @@ Format normalization:
 - Mistral `[TOOL_CALLS]` → normalized to XML
 - Sed alternate delimiters (`s|old|new|`) → parsed like `s/old/new/`
+### XML Parser {#xml_parser}
+`src/agent/XmlParser.js` is the syntax layer between raw model output
+and the dispatch pipeline. Models routinely emit malformed XML —
+unclosed tags, missing slashes, mismatched closes, unterminated
+attribute values, embedded code-fences, training-format tool calls.
+The parser's contract is: never throw, never silently drop a tool
+call, surface every recovery as a warning so error.log can route it.
+**Pre-flight repair pipeline** (order is load-bearing):
+1. `#normalizeToolCalls` — translate native training formats (gemma
+   `\`\`\`tool_code\n<xml>\n\`\`\``, Qwen `<|tool_call>call:NAME{...}`,
+   OpenAI `{"name":"...","arguments":{...}}`, Anthropic
+   `<tool_use><name>...</name><input>{...}</input></tool_use>`,
+   Mistral `[TOOL_CALLS] [{...}]`, harmony role/channel pseudo-tags
+   `<|channel>` / `<channel|>`). Catch-all malformed `<|tool_call>`
+   tokens become `<error>` blocks (in prose — never with literal
+   `<get>`/`<set>`/etc. tags, which would re-enter the parser as
+   phantom tool calls).
+2. `#neutralizeCodeSpans` — entity-encode tag brackets inside
+   backtick spans (`` `<get/>` `` → `` `&lt;get/&gt;` ``). Models
+   quote instructions; quoted tool names must not parse.
+3. `#correctMismatchedCloses` — at outer tool depth (stack=1),
+   rewrite `</WRONG>` to `</RIGHT>`. htmlparser2 silently drops
+   unmatched closes, which would make the explicit recovery path
+   unreachable and absorb every sibling command as body text.
+   Conservative: only outermost depth; nested mismatches inside
+   tool bodies are left alone (bodies are opaque, see below).
+4. `#balanceAttrQuotes` — close `ATTR="..."` values that never
+   quote-close before the next tag. Without this repair,
+   htmlparser2 consumes the rest of input as one giant attribute
+   value and silently drops every subsequent tool call. Triggers
+   only when the value contains no quote and no `>`, and is followed
+   by another tag open or close.
+**Body opacity.** Tool bodies are opaque text, not nested XML. A
+plan the model writes with `<get/>` examples in it, SEARCH/REPLACE
+markers in `<set>`, and XML samples in `<known>` all need to survive
+intact. Nested tag opens push onto a per-tool stack; matching closes
+pop. Orphan closes that don't match the stack but match a known tool
+name are treated as recovery (likely typo); unknown orphan closes
+are kept as body text.
+**Empty-body recovery.** A new tool tag opens while the current tool
+has no body content yet — the model meant the current tag to
+self-close but typed it paired, or emitted a mismatched close that
+htmlparser2 dropped. Close current, open new, emit recovery warning.
+**Per-tool attr-vs-body resolution** (`resolveCommand`). Tools accept
+attributes on the open tag *and* body text inside the tag. If the
+canonical attribute is missing, the body silently fills it. The
+shape per tool:
+- `set` — structured edit detection (merge-conflict markers, udiff,
+  Claude `<old_text>` XML, JSON `{search,replace}`, sed `s/.../.../`,
+  attribute-mode `search=`/`replace=`, body-as-search-when-`body=`
+  attr-set, plain write).
+- `update` — body fills `body`, status defaults to 102 if absent.
+- `get` / `rm` — attr `path` or body fills target. Spread `a` so
+  `line` / `limit` / `visibility` / future attrs reach the handler.
+- `search` — attr `path` or body fills target; `results` numeric.
+- `mv` / `cp` — attr `path` (source); attr `to` or body fills dest.
+  Spread `a` so `visibility` reaches the handler for batch
+  visibility changes.
+- `sh` / `env` — attr `command` or body fills the command.
+- `ask_user` — attr `question`; attr `options` or body for options.
+**Tool-call cap.** `RUMMY_MAX_COMMANDS` caps the number of tool
+calls per turn. When hit, remaining commands drop with a warning;
+the model sees one structured error so it can adjust on the next
+turn rather than rediscovering silent truncation.
 ---
 ## Testing
@@ -1381,10 +1659,9 @@ Full reference is `.env.example` — these are the load-bearing vars.
 | Var | Default | Purpose |
 |-----|---------|---------|
-| `RUMMY_MAX_TURNS` | 15 | Hard loop iteration cap |
+| `RUMMY_MAX_LOOP_TURNS` | 99 | Per-loop turn cap (no per-run cap) |
 | `RUMMY_MAX_COMMANDS` | 99 | Max parsed tool calls per turn |
-| `RUMMY_MAX_STALLS` | 3 | Turns without `<update>` before force-complete |
-| `RUMMY_MAX_UPDATE_REPEATS` | 3 | Same-text repeat threshold without progress |
+| `RUMMY_MAX_STRIKES` | 3 | Strikes (errors or detected cycles) before close at 499 |
 | `RUMMY_MIN_CYCLES` | 3 | Consecutive repetitions to trigger cycle detection |
 | `RUMMY_MAX_CYCLE_PERIOD` | 4 | Max cycle period checked by healer |
 | `RUMMY_RETENTION_DAYS` | 31 | Days of completed/aborted runs kept |

package/bin/postinstall.js CHANGED Viewed

@@ -1,13 +1,13 @@
 import { existsSync, mkdirSync, copyFileSync } from "node:fs";
 import { join, dirname } from "node:path";
 import { fileURLToPath } from "node:url";
-import { homedir } from "node:os";
+import resolveRummyHome from "../src/agent/rummyHome.js";
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const packageRoot = join(__dirname, "..");
 const envExample = join(packageRoot, ".env.example");
-const rummyHome = process.env.RUMMY_HOME || join(homedir(), ".rummy");
+const rummyHome = resolveRummyHome();
 if (!existsSync(rummyHome)) {
 	mkdirSync(rummyHome, { recursive: true });

package/bin/rummy.js CHANGED Viewed

@@ -3,12 +3,12 @@
 import { existsSync } from "node:fs";
 import { isAbsolute, join, dirname } from "node:path";
 import { fileURLToPath } from "node:url";
-import { homedir } from "node:os";
+import resolveRummyHome from "../src/agent/rummyHome.js";
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const packageRoot = join(__dirname, "..");
-const rummyHome = process.env.RUMMY_HOME || join(homedir(), ".rummy");
+const rummyHome = resolveRummyHome();
 // Base dir for env files: cwd if it has .env.example, else $RUMMY_HOME.
 // The package's own .env.example is never consulted — silent package-