npm - @possumtech/rummy - Versions diffs - 0.2.8 → 0.3.1 - Mend

@possumtech/rummy 0.2.8 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

package/.env.example +13 -2
package/EXCEPTIONS.md +46 -0
package/PLUGINS.md +422 -188
package/SPEC.md +440 -106
package/migrations/001_initial_schema.sql +5 -3
package/package.json +17 -5
package/service.js +5 -3
package/src/agent/AgentLoop.js +252 -55
package/src/agent/ContextAssembler.js +20 -4
package/src/agent/KnownStore.js +82 -25
package/src/agent/ProjectAgent.js +4 -1
package/src/agent/ResponseHealer.js +86 -32
package/src/agent/TurnExecutor.js +542 -207
package/src/agent/XmlParser.js +77 -41
package/src/agent/known_store.sql +68 -4
package/src/agent/schemes.sql +3 -0
package/src/agent/tokens.js +7 -21
package/src/agent/turns.sql +15 -1
package/src/hooks/HookRegistry.js +7 -0
package/src/hooks/Hooks.js +15 -0
package/src/hooks/PluginContext.js +14 -1
package/src/hooks/RummyContext.js +16 -4
package/src/hooks/ToolRegistry.js +77 -19
package/src/llm/LlmProvider.js +27 -8
package/src/llm/OpenAiClient.js +20 -0
package/src/llm/OpenRouterClient.js +24 -2
package/src/llm/XaiClient.js +47 -2
package/src/plugins/ask_user/README.md +4 -4
package/src/plugins/ask_user/ask_user.js +5 -5
package/src/plugins/ask_user/ask_userDoc.js +29 -0
package/src/plugins/budget/README.md +31 -0
package/src/plugins/budget/budget.js +55 -0
package/src/plugins/cp/README.md +5 -4
package/src/plugins/cp/cp.js +10 -6
package/src/plugins/cp/cpDoc.js +29 -0
package/src/plugins/engine/engine.sql +1 -8
package/src/plugins/engine/turn_context.sql +4 -9
package/src/plugins/env/README.md +3 -4
package/src/plugins/env/env.js +5 -5
package/src/plugins/env/envDoc.js +29 -0
package/src/plugins/file/README.md +9 -12
package/src/plugins/file/file.js +34 -35
package/src/plugins/get/README.md +2 -2
package/src/plugins/get/get.js +77 -6
package/src/plugins/get/getDoc.js +51 -0
package/src/plugins/hedberg/hedberg.js +2 -1
package/src/plugins/hedberg/matcher.js +10 -29
package/src/plugins/hedberg/normalize.js +28 -0
package/src/plugins/hedberg/patterns.js +25 -27
package/src/plugins/hedberg/sed.js +17 -10
package/src/plugins/index.js +66 -14
package/src/plugins/instructions/README.md +6 -2
package/src/plugins/instructions/instructions.js +20 -4
package/src/plugins/instructions/preamble.md +19 -5
package/src/plugins/known/README.md +10 -7
package/src/plugins/known/known.js +23 -17
package/src/plugins/known/knownDoc.js +34 -0
package/src/plugins/mv/README.md +5 -4
package/src/plugins/mv/mv.js +27 -6
package/src/plugins/mv/mvDoc.js +45 -0
package/src/plugins/performed/README.md +15 -0
package/src/plugins/performed/performed.js +45 -0
package/src/plugins/persona/persona.js +78 -0
package/src/plugins/previous/README.md +3 -2
package/src/plugins/previous/previous.js +33 -24
package/src/plugins/progress/README.md +1 -2
package/src/plugins/progress/progress.js +33 -21
package/src/plugins/prompt/README.md +5 -5
package/src/plugins/prompt/prompt.js +15 -17
package/src/plugins/rm/README.md +4 -4
package/src/plugins/rm/rm.js +32 -20
package/src/plugins/rm/rmDoc.js +30 -0
package/src/plugins/rpc/README.md +15 -28
package/src/plugins/rpc/rpc.js +42 -77
package/src/plugins/set/README.md +13 -12
package/src/plugins/set/set.js +107 -16
package/src/plugins/set/setDoc.js +49 -0
package/src/plugins/sh/README.md +4 -4
package/src/plugins/sh/sh.js +5 -5
package/src/plugins/sh/shDoc.js +29 -0
package/src/plugins/{skills/skills.js → skill/skill.js} +10 -51
package/src/plugins/summarize/README.md +6 -5
package/src/plugins/summarize/summarize.js +7 -6
package/src/plugins/summarize/summarizeDoc.js +33 -0
package/src/plugins/telemetry/telemetry.js +16 -9
package/src/plugins/think/README.md +20 -0
package/src/plugins/think/think.js +5 -0
package/src/plugins/unknown/README.md +6 -5
package/src/plugins/unknown/unknown.js +12 -9
package/src/plugins/unknown/unknownDoc.js +31 -0
package/src/plugins/update/README.md +3 -8
package/src/plugins/update/update.js +7 -6
package/src/plugins/update/updateDoc.js +33 -0
package/src/server/ClientConnection.js +59 -45
package/src/server/RpcRegistry.js +52 -4
package/src/sql/v_model_context.sql +10 -25
package/src/plugins/ask_user/docs.md +0 -2
package/src/plugins/cp/docs.md +0 -2
package/src/plugins/current/README.md +0 -14
package/src/plugins/current/current.js +0 -47
package/src/plugins/env/docs.md +0 -4
package/src/plugins/get/docs.md +0 -10
package/src/plugins/known/docs.md +0 -3
package/src/plugins/mv/docs.md +0 -2
package/src/plugins/rm/docs.md +0 -6
package/src/plugins/set/docs.md +0 -6
package/src/plugins/sh/docs.md +0 -2
package/src/plugins/skills/README.md +0 -25
package/src/plugins/store/README.md +0 -20
package/src/plugins/store/docs.md +0 -6
package/src/plugins/store/store.js +0 -63
package/src/plugins/summarize/docs.md +0 -4
package/src/plugins/unknown/docs.md +0 -5
package/src/plugins/update/docs.md +0 -4

package/SPEC.md CHANGED Viewed

@@ -15,8 +15,8 @@ that thread a value through subscribers in priority order).
 **Every `<tag>` the model sees is a plugin.** The `<known>` section
 of the system message is rendered by the known plugin. The `<progress>`
-section is rendered by the progress plugin. The `<ask>` tag is rendered
-by the prompt plugin. No monolithic assembler decides what goes where.
+section is rendered by the progress plugin. The `<prompt>` tag is
+rendered by the prompt plugin. No monolithic assembler decides what goes where.
 Each plugin filters for its own data from the shared row set, renders
 its section, and returns.
@@ -42,7 +42,8 @@ body, attributes, and state.
 ```sql
 known_entries (
-    id, run_id, turn, path, body, scheme, state, hash,
+    id, run_id, loop_id, turn, path, body, scheme,
+    status INTEGER, fidelity TEXT, hash,
     attributes, tokens, tokens_full, refs, write_count,
     created_at, updated_at
 )
@@ -50,58 +51,62 @@ known_entries (
 | Column | Purpose |
 |--------|---------|
-| `path` | Entry identity. Bare paths (`src/app.js`) or URIs (`known://auth`) |
+| `path` | Entry identity. Bare paths (`src/app.js`) or URIs (`known://auth`). Max 2048 chars. |
 | `body` | Tag body text. File content, tool output, skill docs. |
 | `attributes` | Tag attributes as JSON. Handler-private workspace. `CHECK (json_valid)` |
 | `scheme` | Generated from path via `schemeOf()`. Drives dispatch and view routing |
-| `state` | Lifecycle stage. Determines model visibility |
+| `status` | HTTP status code (200, 202, 400, 413, etc.) |
+| `fidelity` | Visibility level: full, summary, index, archive |
 | `hash` | SHA-256 for file change detection |
-| `tokens` | Context cost at current state |
+| `tokens` | Display-only token count at current fidelity. NEVER used for budget. |
 | `tokens_full` | Cost of raw body at full fidelity |
 | `turn` | Freshness — when was this entry last touched |
-### 1.2 Schemes & States
+### 1.2 Schemes, Status & Fidelity
-Paths use URI scheme syntax. Bare paths (no `://`) are files.
-**Files** (`scheme IS NULL`):
-| State | Model sees |
-|-------|-----------|
-| `full` | File content in code fence |
-| `index` | Path listed in File Index |
-| `stored` | Invisible, retrievable via `<get>` |
-**Knowledge** (`known://`, `unknown://`):
-| State | Model sees |
-|-------|-----------|
-| `full` | Key — value in bullet list |
-| `stored` | Key listed, no value |
-**Tool results** (`set://`, `sh://`, `env://`, `rm://`, `ask_user://`,
-`mv://`, `cp://`, `search://`, `get://`, `store://`):
-All start at `full` state when recorded. Handlers set the final state:
-`proposed`, `pass`, `rejected`, `error`, `pattern`, `read`, `stored`, `info`.
-**Skills** (`skill://`): `full` or `stored`. Rendered in system message.
+Every entry has two independent dimensions: **status** (HTTP integer)
+and **fidelity** (visibility level). These are separate concerns.
-**Tools** (`tool://`): `full`, `model_visible = 0`. Internal plugin metadata.
+**Status** (lifecycle): 200 (OK), 202 (proposed), 400 (bad request),
+404 (not found), 409 (conflict), 413 (too large), 499 (aborted),
+500 (error).
-**URLs** (`http://`, `https://`): `full`, `summary`, `stored`.
+**Fidelity** (visibility): `full` (body visible), `summary`
+(model-authored summary), `index` (path only), `archive` (invisible,
+retrievable via `<get>`).
-**Structural** (`summarize://`, `update://`): Status signals.
-**Audit** (`system://`, `prompt://`, `ask://`, `act://`, `progress://`,
-`reasoning://`, `model://`, `error://`, `user://`, `assistant://`,
-`content://`): `info` state, `model_visible = 0` (hidden from model).
-### 1.3 State Validation
+Paths use URI scheme syntax. Bare paths (no `://`) are files.
-The `schemes` table is a bootstrap registry — 30 rows of static config.
-INSERT/UPDATE triggers validate state against `schemes.valid_states`.
-Plugins cannot bypass this (circular dependency prevents schemes as entries).
+Every model-visible entry plays one of four roles (entries in the `audit` category have `model_visible = 0` and are never rendered):
+| Role | Category | Section | Description |
+|------|----------|---------|-------------|
+| **Data** | `data` | `<knowns>` | Entries the model works with — persistent state |
+| **Logging** | `logging` | `<performed>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
+| **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
+| **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
+`logging` is the default category. Plugins opt into `data` explicitly.
+| Scheme | Category | Description |
+|--------|----------|-------------|
+| `NULL` (bare path) | data | File content. JOINs via `COALESCE(scheme, 'file')`. `file://` prefix stripped by hedberg. |
+| `known://` | data | Model-registered knowledge. One fact per entry. |
+| `skill://` | data | Skill docs. Rendered in system message. |
+| `http://`, `https://` | data | Web content. |
+| `unknown://` | unknown | Unresolved questions. |
+| `prompt://` | prompt | User prompt with `mode` attribute (`ask`/`act`). |
+| `set://`, `get://`, `sh://`, `env://`, `rm://`, `mv://`, `cp://`, `ask_user://`, `search://` | logging | Tool result entries. |
+| `summarize://`, `update://` | logging | Lifecycle signals. |
+| `tool://` | audit | Internal plugin metadata. `model_visible = 0`. |
+| `system://`, `reasoning://`, `model://`, `error://`, `user://`, `assistant://`, `content://` | audit | Audit entries. `model_visible = 0`. |
+### 1.3 Scheme Registry
+The `schemes` table is a bootstrap registry — static rows of
+`(name, model_visible, category)`. Plugins register their scheme
+via `core.registerScheme()` in the constructor. The `model_visible`
+flag controls whether entries appear in `v_model_context`.
 ### 1.4 UPSERT Semantics
@@ -117,13 +122,21 @@ The K/V store is the memory. Relational tables are the skeleton.
 ```sql
 projects (id, name UNIQUE, project_root, config_path, created_at)
 models   (id, alias UNIQUE, actual, context_length, created_at)
-runs     (id, project_id, parent_run_id, model, alias UNIQUE, status,
-          temperature, persona, context_limit, next_turn, created_at)
-turns    (id, run_id, sequence, prompt_tokens, completion_tokens,
-          total_tokens, cost, created_at)
+runs     (id, project_id, parent_run_id, model, alias UNIQUE,
+          status INTEGER, temperature, persona, context_limit,
+          next_turn, next_loop, created_at)
+loops    (id, run_id, sequence, mode, model, prompt, status INTEGER,
+          config JSON, result JSON, created_at)
+turns    (id, run_id, loop_id, sequence, context_tokens,
+          reasoning_content, prompt_tokens, cached_tokens,
+          completion_tokens, reasoning_tokens, total_tokens, cost,
+          created_at)
 file_constraints (id, project_id, pattern, visibility, created_at)
-prompt_queue     (id, run_id, mode, model, prompt, config, status, result)
+  -- Project-level config. NOT tool dispatch. See §2.3.
+turn_context     (id, run_id, loop_id, turn, ordinal, path, scheme,
+                  status, fidelity, body, tokens, attributes,
+                  category, source_turn)
 rpc_log          (id, project_id, method, rpc_id, params, result, error)
 ```
@@ -136,19 +149,39 @@ client picks for every run.
 ### 2.1 Run State Machine
+All status fields are HTTP integer codes:
 ```
-queued → running → proposed → running → completed
-                → completed
-                → failed → running
-                → aborted → running
+100 (queued) → 200 (running) → 202 (proposed) → 200 (running) → 200 (completed)
+                              → 200 (completed)
+                              → 500 (failed) → 200 (running)
+                              → 499 (aborted) → 200 (running)
 ```
 All terminal states allow transition back to `running`. Runs are long-lived.
-### 2.2 Prompt Queue
+### 2.2 Loops Table
+The loops table IS the prompt queue. Each `ask`/`act` creates a loop.
+FIFO per run (ordered by sequence). One active at a time. Abort stops
+the current loop; pending loops survive. Hierarchy: projects > runs > loops > turns.
+### 2.3 File Constraints
+The `file_constraints` table is project-level configuration — it
+defines which files a project cares about. This is backbone, not tool
+dispatch. Constraints have three visibilities: `active` (promoted to
+full), `readonly` (promoted but not editable), `ignore` (demoted).
+**Boundary:** Setting a constraint (`File.setConstraint`) is a
+project-config write. Promoting/demoting the matching entries is tool
+dispatch that goes through the handler chain with budget enforcement.
+These are separate operations: constraint persists across runs, entry
+promotion is scoped to a run and subject to the same budget rules as
+a model `<get>`.
-All prompts flow through `prompt_queue`. FIFO per run. One active at a time.
-Abort stops the current prompt; pending prompts survive.
+`store` RPC manages constraints directly — it is not a model tool.
+`get` RPC with `persist` sets the constraint AND dispatches promotion.
 ---
@@ -169,13 +202,17 @@ object is the same shape at every tier.
 | Method | Model | Client | Plugin |
 |--------|-------|--------|--------|
-| `get`, `set`, `rm`, `mv`, `cp`, `sh`, `env`, `store` | ✓ | ✓ | ✓ |
+| `get`, `set`, `rm`, `mv`, `cp`, `sh`, `env`, `search` | ✓ | ✓ | ✓ |
 | `known`, `unknown`, `ask_user`, `summarize`, `update` | ✓ | ✓ | ✓ |
 | `ask`, `act`, `resolve`, `abort`, `startRun` | — | ✓ | ✓ |
 | `getRuns`, `getModels`, `getEntries` | — | ✓ | ✓ |
 | `on()`, `filter()`, db/store access | — | — | ✓ |
-Model tier restrictions enforced by mode (ask removes act-only tools).
+Model tier restrictions enforced by unified `resolveForLoop(mode, flags)`.
+Ask mode excludes `sh`. Flags: `noInteraction` excludes `ask_user`,
+`noWeb` excludes `search`, `noProposals` excludes `ask_user`/`env`/`sh`.
+13 model tools: get, set, known, unknown, env, sh, rm, cp, mv, search,
+summarize, update, ask_user.
 Client tier requires project init. Plugin tier has no restrictions.
 ### 3.2 Dispatch Path
@@ -188,6 +225,14 @@ Client: JSON-RPC  → { method, params }   → #record() → dispatch(scheme, en
 Plugin: rummy.rm({ path })               → #record() → dispatch(scheme, entry, rummy)
 ```
+**Lifecycle/action split:** Commands are classified as lifecycle signals
+(`summarize`, `update`, `unknown`, `known`) or action commands (everything
+else). Lifecycle signals always dispatch — they are state declarations that
+cannot be 409'd by sequential dispatch. Action commands dispatch sequentially;
+a 202 proposal or error aborts subsequent actions. If the model sends
+`<summarize>` but actions in the same turn failed, the summarize is
+overridden to an update (the model's assertion that it's done is false).
 ### 3.3 Plugin Convention
 A plugin is an instantiated class. The class name matches the file name.
@@ -247,35 +292,35 @@ Two messages per turn. System = stable truth. User = active task.
         [persona/]
         [skills/]
     [/instructions]
-    <knowledge>
+    <knowns>
         ...entries sorted by fidelity (index, summary, full), then by scheme
-    </knowledge>
+    </knowns>
     <previous>
-        (pre-loop user prompt, model responses, agent warnings, and tools used, in order)
+        (pre-loop entries, each with turn, status, summary, fidelity, tokens)
     </previous>
-    <unknowns></unknowns>
+    <unknowns>
+        (open questions, each with path, turn, fidelity, tokens)
+    </unknowns>
 [/system]
 [user]
-    <current>
-        (current loop model responses, agent warnings, and tools used, in order)
-    </current>
-    <progress>the above actions have been performed on this user prompt:</progress>
-    <ask tools="..." warn="...">user prompt</ask>
-    — OR —
-    <act tools="...">user prompt</act>
+    <performed>
+        (current loop entries, each with turn, status, summary, fidelity, tokens)
+    </performed>
+    <progress turn="N">token budget, fidelity stats, causal bridge</progress>
+    <prompt mode="ask|act" tools="...">user prompt</prompt>
 [/user]
 ```
 **System** contains everything the model needs to know.
 **User** contains everything the model needs to do.
-The `<ask>`/`<act>` tag is present on every turn — first turn and
+The `<prompt>` tag is present on every turn — first turn and
 continuations alike. The model always sees its task. The active prompt
 is extracted from its chronological position and placed last for maximum
 recency. `<progress>` bridges the gap, narrating the causal relationship
-between `<current>` (the work) and the prompt (the cause).
+between `<performed>` (the work) and the prompt (the cause).
-### 4.2 Loops, Previous, and Current
+### 4.2 Loops, Previous, and Performed
 A **loop** is one `ask` or `act` invocation and all its continuation
 turns until summarize, fail, or abort.
@@ -285,14 +330,14 @@ responses, tool results, agent warnings — the full chronicle in order.
 Lives in the system message as established history. Omitted on the
 first turn of the first loop.
-**Current** = the active loop's work so far. Model responses, tool
+**Performed** = the active loop's work so far. Model responses, tool
 results, agent warnings — in order. Does NOT include the user prompt
-(one per loop, extracted to `<ask>`/`<act>`). Lives in the user
+(one per loop, extracted to `<prompt>`). Lives in the user
 message as immediate context. Empty on the first turn of a loop.
 When a new prompt arrives on an existing run, the prior loop's
-`<current>` content plus its prompt move to `<previous>`. When a loop
-continues (next turn), new results append to `<current>`.
+`<performed>` content plus its prompt move to `<previous>`. When a loop
+continues (next turn), new results append to `<performed>`.
 ### 4.3 Key Entries
@@ -313,7 +358,7 @@ text from body + attributes.
 Each turn:
 1. Write `instructions://system` (empty body, attributes = { persona })
-2. Run plugin hooks (`onTurn`) — plugins modify entries before the model sees them
+2. Emit `turn.started` — plugins write prompt/instructions entries
 3. Project `instructions://system` → instructions text
 4. Query `v_model_context` VIEW → visible entries
 5. Project each entry through its tool's `full`/`summary` projection
@@ -323,23 +368,171 @@ Each turn:
    - Previous plugin (priority 200) → `<previous>` section
    - Unknown plugin (priority 300) → `<unknowns>` section
 8. Invoke `assembly.user` filter chain (empty string as base):
-   - Current plugin (priority 100) → `<current>` section
+   - Performed plugin (priority 100) → `<performed>` section
    - Progress plugin (priority 200) → `<progress>` section
-   - Prompt plugin (priority 300) → `<ask>`/`<act>` section
+   - Prompt plugin (priority 300) → `<prompt>` section
 9. Store as `system://N` and `user://N` audit entries
-The VIEW determines visibility. State IS fidelity:
+The VIEW determines visibility from `fidelity` and `status`:
 - `full` → body visible
-- `summary` → body visible
+- `summary` → summary visible (model-authored `summary` attribute if set)
 - `index` → path listed, no content
-- `stored` → invisible
-- `proposed` → invisible (pending client)
+- `archive` → invisible (retrievable via `<get>`)
+**Partial read:** `<get path="..." line="N" limit="M"/>` returns lines N through
+N+M−1 of the entry body as the log item without changing fidelity or promoting
+the entry to context. Use after reading `summary` fidelity (which gives line
+numbers via repomap) to target a specific symbol. Single-path only — glob or
+body filter with `line`/`limit` is a 400 error.
+- `status = 202` → invisible (proposed, pending client)
 - `model_visible = 0` → invisible (audit, tool, instructions)
-### 4.5 progress:// as Entry
+Model controls fidelity via `<set>` attributes: `archive`, `summary`,
+`index`, `full`. The `summary="..."` attribute attaches a description
+(<= 80 chars) that persists across fidelity changes.
+### 4.5 Budget Enforcement
+The model owns its context. The system enforces a hard ceiling and
+provides advisory warnings — it does not automatically manage entries.
+**Pre-LLM check:** The budget plugin measures `countTokens()` on the
+assembled messages. If assembled tokens exceed `contextSize`, the turn
+returns 413 without calling the LLM. This triggers panic mode (see
+§4.6).
+**Write-layer gate:** BudgetGuard on KnownStore gates every write
+during dispatch. `upsert()`, `promoteByPattern()`, and
+`updateBodyByPattern()` check token delta against remaining headroom.
+Exceeding the budget throws `BudgetExceeded` — the tool 413s, the
+guard trips, and all subsequent tools in the turn fail.
+BudgetGuard ceiling = `floor(contextSize × 0.9) − 500`. The 500-token
+buffer below the enforce ceiling absorbs two sources of overhead that
+BudgetGuard cannot see: (a) `#record()`-phase writes that bypass the
+guard (~15 tokens per command), and (b) loop transition overhead —
+when a loop completes and a new one starts, entries shift from
+`<performed>` to `<previous>` format, adding ~200–300 tokens to the
+next assembly. Without this buffer, the base context can accumulate
+to exactly the enforce ceiling, making it impossible for the panic
+loop to start (panic prompt + loop overhead > ceiling).
+**Exemptions:** `status >= 400` entries (error results), `model_visible
+= 0` entries (audit), `fidelity = "archive"` entries (not in context).
+**Size gate:** Known entries exceeding 500 tokens are rejected with
+413, forcing atomic entries.
+**Advisory warnings** (progress plugin):
+- 50%: "You may free space by lowering the fidelity of entries"
+- 75%: "YOU MUST free space... or the run will fail"
+**Token math:** `Math.ceil(text.length / RUMMY_TOKEN_DIVISOR)`. One
+formula, one file (`src/agent/tokens.js`), env-configurable. No
+external dependencies. `contextSize` is the ceiling. Over = 413.
+Under = 200. No margins.
+**Three token measures — never conflate them:**
+| Measure | Source | Scope | Use |
+|---|---|---|---|
+| SQL entry tokens | `known_entries.tokens` = `ceil(chars / DIVISOR)` | Per entry | Model decision-making: "this entry costs N tokens" |
+| Assembled estimate | `measureMessages(messages)` = sum of entry projections | Full packet | First-turn budget fallback only |
+| Actual API tokens | `turns.context_tokens` = `usage.input_tokens` back-filled from LLM | Per turn | Budget enforcement on turns 2+; ground truth |
+`budget.enforce` uses the **actual API tokens** (`get_last_context_tokens`) when
+available (turn 2+) and falls back to the assembled estimate on turn 1. The
+estimate can be 3–7× off for XML/JSON-heavy content — do not rely on it for
+anything that matters.
+**`context_tokens` vs `prompt_tokens` in step telemetry:**
+- `context_tokens` in the step JSON = `turns.context_tokens` for that turn =
+  per-turn actual input tokens from the LLM API (e.g. 7900 tokens sent this turn)
+- `prompt_tokens` in the step JSON = `SUM(turns.prompt_tokens)` for the run =
+  **cumulative** total across all turns (cost tracking, not a context size)
+These two will diverge rapidly on any multi-turn run. A run at turn 50 might show
+`context_tokens: 8000` (context under control) and `prompt_tokens: 400000`
+(total input tokens billed across the whole run). They are measuring orthogonal things.
+### 4.6 Panic Mode
+**The invariant.** A panic is only ever triggered because the
+assembled context was under the ceiling — and the new prompt pushed
+it over. The existing context fit; the incoming prompt did not.
+Panic mode replaces that too-large incoming prompt with a small
+panic prompt on the same context. Therefore: the first turn of a
+panic loop cannot 413. If it does, it is a bug.
+**Trigger.** `TurnExecutor.execute()` assembles the full packet
+(context + incoming prompt) before calling the LLM. If
+`assembledTokens > contextSize`, it returns 413 without calling
+the LLM. `#drainQueue` intercepts this and enters panic mode.
+**Flow.**
+1. Complete the failed loop with status 413 (audit trail).
+2. Enqueue a panic loop (`mode = "panic"`, `noRepo = true`,
+   `prompt = panicPrompt`, `panicTarget` in config).
+3. Re-enqueue the original loop with `panicAttempted: true` in
+   its config JSON. This flag persists across drain cycles.
+4. `continue` — the drain loop claims the panic loop next.
+After panic completes (model freed enough space), the retry loop
+runs. If the retry also 413s, hard-fail to client. One panic
+attempt per drain cycle — `panicAttempted` is checked both as a
+local variable and on the re-enqueued loop's config.
+**Panic target.** The model must compress context to below:
+```
+panicTarget = MIN(contextSize × 0.75, contextSize − incomingTokens) − cushion
+```
-The continuation prompt is a `progress://N` entry. Plugins can modify its
-body before materialization.
+`incomingTokens` is the raw token count of the original prompt.
+`cushion` is a small safety margin (500 tokens) to absorb
+materialization overhead. The target is expressed in materialized
+token units — the same unit the system uses to measure completion
+(see "Two token contexts" below).
+**Two token contexts.**
+The model reasons in *per-entry SQL tokens* — the token counts
+visible in `<knowns>` entries. These are the granular unit the model
+uses to decide which entries to target: "this entry is 200 tokens;
+if I archive it, I save 200 tokens."
+The system makes decisions using *actual API tokens* —
+`turns.context_tokens` back-filled from `usage.input_tokens` after
+each LLM call. SQL token sums do not equal actual API counts because
+projections, assembly overhead, and fidelity transforms alter the
+output; and the SQL estimate (`ceil(chars / DIVISOR)`) can be 3–7×
+off for structured content. **Never use SQL token sums for ceiling or
+budget decisions.** See "Three token measures" in §4.5 for the full breakdown.
+**Strike system.** After each panic turn, compare
+`result.assembledTokens` (materialized) with `_lastPanicTokens`
+(previous turn's materialized total):
+- Decreased → reset strike counter to 0.
+- Same or increased → increment strikes.
+- 3 consecutive strikes → return 413 to `#drainQueue` → hard-fail.
+Progress (any reduction) resets the counter. The model has
+unlimited turns as long as it makes progress.
+**Panic success.** After each turn, if `result.assembledTokens
+<= panicTarget`, the panic loop exits with 200. The retry loop
+then runs with the original prompt on the now-compressed context.
+**Tool set.** `resolveForLoop("panic")` includes: get, set, known,
+unknown, rm, mv, cp, summarize, update. Excludes: sh, env, search,
+ask_user. `noRepo: true` — no file scanning during panic.
+**What the model sees.** Turn 1 receives the panic prompt from
+`budget.panicPrompt()`: the assembled token count, the target, and
+the exact number of tokens to free. Turn 2+ receives a continuation
+prompt. The model uses `<set fidelity="archive">`, `<mv
+fidelity="index">`, and similar fidelity operations to free space,
+concluding with `<summarize>` when done or `<update>` while working.
 ---
@@ -369,22 +562,25 @@ JSON-RPC 2.0 over WebSocket. `discover` returns the live catalog.
 | Method | Params |
 |--------|--------|
-| `read` | `{ path, run?, persist?, readonly? }` |
+| `get` | `{ path, run, persist?, readonly? }` |
+| `set` | `{ run, path, body?, attributes? }` |
+| `rm` | `{ run, path }` |
+| `mv` | `{ run, path, to }` |
+| `cp` | `{ run, path, to }` |
 | `store` | `{ path, run?, persist?, ignore?, clear? }` |
-| `write` | `{ run, path, body?, state?, attributes? }` |
-| `delete` | `{ run, path }` |
 | `getEntries` | `{ pattern?, body?, run?, limit?, offset? }` |
-`persist` creates a project-level file constraint (operator privilege).
-Without `persist`, operations dispatch through the handler chain.
+All entry operations dispatch through the handler chain. `persist`
+on `get` also sets a project-level file constraint (operator privilege).
+`store` manages file constraints — not a model tool.
 #### Runs
 | Method | Params |
 |--------|--------|
 | `startRun` | `{ model, temperature?, persona?, contextLimit? }` |
-| `ask` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noContext?, fork? }` |
-| `act` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noContext?, fork? }` |
+| `ask` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noRepo?, noInteraction?, noWeb?, fork? }` |
+| `act` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noRepo?, noInteraction?, noWeb?, fork? }` |
 | `run/resolve` | `{ run, resolution: { path, action, output? } }` |
 | `run/abort` | `{ run }` |
 | `run/rename` | `{ run, name }` |
@@ -392,6 +588,10 @@ Without `persist`, operations dispatch through the handler chain.
 | `run/config` | `{ run, temperature?, persona?, contextLimit?, model? }` |
 `model` is required on `ask`, `act`, and `startRun`. No default.
+`noRepo` disables default project/repo file scanning (files can still
+be added explicitly by the client).
+`noInteraction` removes `ask_user` from the tool list.
+`noWeb` removes `search` from the tool list.
 #### Queries
@@ -445,7 +645,80 @@ Each plugin has its own README at `src/plugins/{name}/README.md`.
 ---
-## 7. Hedberg Editing Syntax
+## 7. Tool Documentation Design
+Tool docs are the most carefully designed text in rummy. Every line
+simultaneously teaches syntax, implies workflow priority, demonstrates
+pattern capabilities, and constrains misuse. Each letter earns its place.
+### Principles
+**Show, don't tell.** Examples ARE the documentation. A model learns
+`<get path="known://*">auth</get>` from seeing it, not from being told
+"you can filter known entries by keyword." Examples are ordered from
+simple to powerful — weak models learn from examples 1-2, strong models
+pick up the pattern from example 3.
+**Lifecycle continuity.** Examples weave stories across tools. The get
+docs end with `<set path="..." fidelity="index"/>`. The known docs
+reference `<get path="known://*">keyword</get>` for recall and
+`<set path="known://..." archive/>` for archiving. The unknown docs
+reference `<get/>` for investigation and `<rm/>` for cleanup. A model
+reading the full tool docs encounters a coherent workflow:
+discover → load → reason → edit → archive → recall.
+**RFC 2119 semantics.** Constraint bullets use YOU MUST, YOU MUST NOT,
+YOU SHOULD, YOU MAY from RFC 2119. Every LLM has extensive pretraining
+on RFC documents where these keywords carry precise semantic weight.
+MUST is absolute. SHOULD is strong advisory. MAY is permissive. This
+is not decorative — it's leveraging the model's existing understanding
+of requirement levels.
+**Consistent structure.** Every tool doc follows: header (syntax), 2+
+examples, 2+ constraint bullets. Inconsistent formatting reads as
+inconsistent importance. A tool with 5 examples and dense bullets feels
+complex; a tool with 1 line feels disposable. Both are wrong — every
+tool is equally real, and each doc is proportional to its tool's surface area.
+### Format
+Tool docs live in `*Doc.js` files as annotated line arrays:
+```js
+const LINES = [
+    ["* Body text filters results by content match",
+        "Generalizes examples 2-3. Body = filter, not just path."],
+];
+export default LINES.map(([text]) => text).join("\n");
+```
+The first element is the model-facing text. The second is the rationale —
+visible only in source. Changing any line requires reading all rationales
+first. This prevents well-intentioned edits from breaking subtle behavioral
+guarantees that adjacent lines depend on.
+### Tool Display Order
+Tools are presented gather → reason → act → communicate. Position in
+the list implies priority. `get` is first. `ask_user` is last. The
+order is defined in `ToolRegistry.TOOL_ORDER` and applied by
+`resolveForLoop()`. The same method handles all tool exclusions —
+mode restrictions, `noInteraction`, `noWeb`, `noProposals` — through
+one unified mechanism.
+### Pattern Distribution
+Hedbergian pattern matching (globs, body filters, preview) is taught
+across multiple tools, not concentrated in one. `get` shows content
+filtering. `cp` shows glob batch operations. `rm` shows preview safety.
+Each tool reinforces the pattern vocabulary from a different angle.
+A model that sees `path="known://*"` in get, `path="known://plan_*"` in
+cp, and `path="known://temp_*" preview` in rm learns that patterns
+are universal — not a feature of any single tool.
+---
+## 8. Hedberg Editing Syntax
 The model picks its preferred edit format. The parser understands all of them:
@@ -460,26 +733,36 @@ The model picks its preferred edit format. The parser understands all of them:
 ---
-## 8. Response Healing
+## 9. Response Healing
-The server never throws on model output. Recovery order:
+The server never throws on model output. "Model behavior" is never an
+acceptable explanation. Recovery order:
 1. Can we recover? Extract the data and continue.
 2. Can we warn? Log structured warnings.
 3. Did our structure cause this? Check formatting, prompts.
-4. Model drift is the LAST answer.
 Termination protocol:
 - `<summarize>` → run terminates
+- `<summarize>` + failed actions → overridden to `<update>` (continue)
 - `<update>` → run continues
-- Both → summarize wins
-- Neither + tools → stall counter
+- Both → update wins (if the model can't decide, it's not done)
+- Neither + investigation tools → stall counter (RUMMY_MAX_STALLS)
+- Neither + action-only tools → healed to summarize
 - Neither + plain text → healed to summarize
-- Repeated commands → loop detection
+- Repeated commands → cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
+- Repeated update text → stall (RUMMY_MAX_UPDATE_REPEATS)
+Format normalization:
+- Gemma ```` ```tool_code ```` fences → stripped before parsing
+- Qwen `<|tool_call>` format → normalized to XML
+- OpenAI function_call JSON → normalized to XML
+- Mistral `[TOOL_CALLS]` → normalized to XML
+- Sed alternate delimiters (`s|old|new|`) → parsed like `s/old/new/`
 ---
-## 9. Testing
+## 10. Testing
 | Tier | Location | LLM? |
 |------|----------|------|
@@ -493,12 +776,12 @@ E2E tests must NEVER mock the LLM. Environment cascade:
 ---
-## 10. SQL Functions
+## 11. SQL Functions
 | Function | Purpose |
 |----------|---------|
 | `schemeOf(path)` | Extract URI scheme |
-| `countTokens(text)` | Token count (tiktoken o200k_base, `ceil(len/4)` fallback) |
+| `countTokens(text)` | Token count (`ceil(len / RUMMY_TOKEN_DIVISOR)`) |
 | `hedmatch(pattern, string)` | Full-string pattern match (paths, equality) |
 | `hedsearch(pattern, string)` | Substring pattern search (content filtering) |
 | `hedreplace(pattern, replacement, string)` | Pattern-based replacement |
@@ -508,15 +791,66 @@ See [PLUGINS.md](PLUGINS.md) for the hedberg pattern type reference.
 ---
-## 11. Configuration
+## 13. Debugging: E2E and Benchmark Results
+### E2E test failures
+E2E tests use a temp DB at `/tmp/rummy_test_<timestamp>_<random>.db` (cleaned up after).
+On failure, `AuditClient.assertRun` calls `dumpRun`, which prints a full turn-by-turn audit
+to stdout. That output is in the background task log:
+```
+/tmp/claude-1000/-home-hyzen-repo-rummy-main/<session-id>/tasks/<task-id>.output
+```
+If the output is oversized, the harness saves it to:
+```
+/home/hyzen/.claude/projects/-home-hyzen-repo-rummy-main/<session-id>/tool-results/<id>.txt
+```
+The dump format is: `scheme:state path {attributes}\n  body (120 chars)` grouped by turn.
+Key things to look for in a dump:
+- **202**: unresolved proposals — model issued `<sh>`, `<rm>`, or `<mv>` that needs approval
+- **413**: budget overflow — assembled context exceeded ceiling before LLM call
+- **BudgetGuard errors**: per-tool rejections mid-turn (`Budget exceeded: N tokens requested`)
+- **`<sh>` in act/panic mode**: model fell back to shell when blocked (doc/prompt gap)
+- Loop sequence: look for `mode` in `instructions://system` attrs to see which loop type ran
+### MAB benchmark
+Results live in `test/mab/results/<ISO-timestamp>/mab.db`. Latest run = most recent dir.
+```js
+// Query a MAB result DB directly:
+import { DatabaseSync } from 'node:sqlite';
+const db = new DatabaseSync('test/mab/results/<timestamp>/mab.db');
+db.prepare('SELECT * FROM questions').all();      // all questions + scores
+db.prepare('SELECT * FROM runs').all();           // individual model runs
+```
+Run with: `npm run test:mab`
+### LME benchmark
+Results live in `test/lme/results/<ISO-timestamp>/lme.db`. Same structure.
+Run with: `npm run test:lme`
+---
+## 12. Configuration
 ```env
 RUMMY_HOME=~/.rummy
-RUMMY_MAX_TURNS=15
+RUMMY_TOKEN_DIVISOR=2
+RUMMY_MAX_TURNS=99
 RUMMY_MAX_STALLS=3
-RUMMY_MAX_REPETITIONS=3
+RUMMY_MIN_CYCLES=3
+RUMMY_MAX_CYCLE_PERIOD=4
+RUMMY_MAX_UPDATE_REPEATS=3
 RUMMY_RETENTION_DAYS=31
-RUMMY_TEMPERATURE=0.7
+RUMMY_TEMPERATURE=0.5
 RUMMY_DEBUG=false
 ```