npm - @possumtech/rummy - Versions diffs - 0.3.0 → 0.4.0 - Mend

@possumtech/rummy 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

package/.env.example +13 -1
package/PLUGINS.md +1 -1
package/README.md +5 -1
package/SPEC.md +211 -54
package/migrations/001_initial_schema.sql +3 -4
package/package.json +7 -3
package/service.js +5 -3
package/src/agent/AgentLoop.js +183 -238
package/src/agent/ContextAssembler.js +2 -0
package/src/agent/KnownStore.js +36 -85
package/src/agent/ResponseHealer.js +65 -31
package/src/agent/TurnExecutor.js +284 -382
package/src/agent/XmlParser.js +28 -4
package/src/agent/known_queries.sql +1 -1
package/src/agent/known_store.sql +32 -34
package/src/agent/runs.sql +2 -2
package/src/agent/tokens.js +1 -0
package/src/agent/turns.sql +5 -0
package/src/hooks/HookRegistry.js +7 -0
package/src/hooks/Hooks.js +2 -4
package/src/hooks/ToolRegistry.js +8 -13
package/src/plugins/ask_user/ask_userDoc.js +3 -8
package/src/plugins/budget/README.md +26 -30
package/src/plugins/budget/budget.js +69 -36
package/src/plugins/budget/recovery.js +47 -0
package/src/plugins/cp/cp.js +1 -1
package/src/plugins/cp/cpDoc.js +5 -10
package/src/plugins/env/envDoc.js +3 -8
package/src/plugins/get/get.js +70 -2
package/src/plugins/get/getDoc.js +19 -16
package/src/plugins/hedberg/matcher.js +10 -29
package/src/plugins/helpers.js +2 -2
package/src/plugins/instructions/instructions.js +3 -2
package/src/plugins/instructions/preamble.md +33 -12
package/src/plugins/known/known.js +66 -17
package/src/plugins/known/knownDoc.js +7 -10
package/src/plugins/mv/mv.js +18 -1
package/src/plugins/mv/mvDoc.js +9 -10
package/src/plugins/{current → performed}/README.md +4 -3
package/src/plugins/{current/current.js → performed/performed.js} +15 -20
package/src/plugins/policy/policy.js +47 -0
package/src/plugins/previous/README.md +2 -1
package/src/plugins/previous/previous.js +31 -25
package/src/plugins/progress/README.md +1 -2
package/src/plugins/progress/progress.js +10 -60
package/src/plugins/prompt/prompt.js +10 -8
package/src/plugins/rm/rm.js +27 -15
package/src/plugins/rm/rmDoc.js +6 -11
package/src/plugins/rpc/rpc.js +3 -1
package/src/plugins/set/set.js +125 -92
package/src/plugins/set/setDoc.js +28 -37
package/src/plugins/sh/shDoc.js +2 -7
package/src/plugins/summarize/summarize.js +7 -0
package/src/plugins/summarize/summarizeDoc.js +6 -11
package/src/plugins/telemetry/telemetry.js +14 -9
package/src/plugins/think/think.js +12 -0
package/src/plugins/think/thinkDoc.js +18 -0
package/src/plugins/unknown/README.md +2 -1
package/src/plugins/unknown/unknown.js +26 -4
package/src/plugins/unknown/unknownDoc.js +9 -14
package/src/plugins/update/update.js +7 -0
package/src/plugins/update/updateDoc.js +6 -11
package/src/server/ClientConnection.js +69 -45
package/src/sql/v_model_context.sql +7 -17
package/src/plugins/budget/BudgetGuard.js +0 -74

package/.env.example CHANGED Viewed

@@ -17,9 +17,11 @@ RUMMY_MMAP_MB=0
 # Agent Loop Limits
 RUMMY_MAX_TURNS=99
+RUMMY_MAX_COMMANDS=15
 RUMMY_MAX_UNKNOWN_WARNINGS=3
 RUMMY_MAX_STALLS=3
-RUMMY_MAX_REPETITIONS=3
+RUMMY_MIN_CYCLES=3
+RUMMY_MAX_CYCLE_PERIOD=4
 RUMMY_MAX_UPDATE_REPEATS=3
 # Hygiene
@@ -33,6 +35,16 @@ RUMMY_FETCH_TIMEOUT=300000
 # Debug
 # RUMMY_DEBUG=true
+# Think tag: 1 = model uses <think> tags for reasoning (default)
+# 0 = disabled, model reasons via API reasoning_content field only
+RUMMY_THINK=1
+# Budget
+# Fraction of context window used as ceiling. 0.9 = 90%, 10% reserved as headroom.
+RUMMY_BUDGET_CEILING=0.9
+# Maximum tokens per known entry. Entries exceeding this are rejected with 413.
+RUMMY_MAX_ENTRY_TOKENS=512
 # Token Estimation
 # Characters per token. Lower = more conservative (more tokens per character).
 # Default 2. Set to 1 for worst-case (1 token per character).

package/PLUGINS.md CHANGED Viewed

@@ -467,7 +467,7 @@ prepended above the plugin's summary view output.
 | `update` | Structural | Signal continued work |
 | `unknown` | Structural + Assembly | Register unknowns, render `<unknowns>` |
 | `previous` | Assembly | Render `<previous>` loop history |
-| `current` | Assembly | Render `<current>` active loop work |
+| `performed` | Assembly | Render `<performed>` active loop work |
 | `progress` | Assembly | Render `<progress>` telemetry + warnings |
 | `prompt` | Assembly | Render `<prompt mode="ask|act">` tag |
 | `hedberg` | Utility | Pattern matching, interpretation, normalization |

package/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # RUMMY: Relational Unknowns Memory Management Yoke
-Rummy is the only LLM agent service inspired by and dedicated to the memory of former Secretary of State Donald "Rummy" Rumsfeld. Our unique fusion of apophatic and hedbergian engineering strategies yields more accurate and efficient results than any other agent. Our client/server and plugin architecture integrates it into more workflows than any other agent. It's also more flexible and lean than any other agent. Our dynamic cache management, model hot-swapping, and flexible router interface make it more affordable than any other agent.
+Rummy is the only LLM agent service inspired by and dedicated to the memory of former Secretary of Defense Donald "Rummy" Rumsfeld. Our unique fusion of apophatic and hedbergian engineering strategies yields more accurate and efficient results than any other agent. Our client/server and plugin architecture integrates it into more workflows than any other agent. It's also more flexible and lean than any other agent. Our dynamic cache management, model hot-swapping, and flexible router interface make it more affordable than any other agent.
 ## Key Features
@@ -10,6 +10,10 @@ Rummy is the only LLM agent service inspired by and dedicated to the memory of f
 - **Hedberg:** The interpretation boundary between stochastic model output and deterministic system operations. Models speak in whatever syntax they were trained on — sed regex, SEARCH/REPLACE blocks, escaped characters. Hedberg normalizes all of it. Available to all plugins via `core.hooks.hedberg`.
+- **Folksonomic Memory:** The model organizes its own knowledge into navigable path hierarchies with searchable summary tags. Not RAG — the model builds and curates its own taxonomy using `<known>` entries with paths like `known://project/architecture`.
+- **Fidelity System:** Every entry has a visibility level: full, summary, index, archive. The model manages its own context by promoting what it needs and demoting what it doesn't. Budget enforcement catches overflow post-dispatch — tools run uninterrupted, demotion happens after.
 - **Plugin Architecture:** Every `<tag>` the model sees is a plugin. Every scheme is registered by its owner. The prompt itself is assembled from plugins. Drop a directory into `~/.rummy/plugins/` or install via npm. See [PLUGINS.md](PLUGINS.md) for the complete plugin API.
 - **Symbols Done Right:** Designed with universal language support in mind. Powered by [@possumtech/antlrmap](https://github.com/possumtech/antlrmap).

package/SPEC.md CHANGED Viewed

@@ -44,7 +44,7 @@ body, attributes, and state.
 known_entries (
     id, run_id, loop_id, turn, path, body, scheme,
     status INTEGER, fidelity TEXT, hash,
-    attributes, tokens, tokens_full, refs, write_count,
+    attributes, tokens, refs, write_count,
     created_at, updated_at
 )
 ```
@@ -56,10 +56,9 @@ known_entries (
 | `attributes` | Tag attributes as JSON. Handler-private workspace. `CHECK (json_valid)` |
 | `scheme` | Generated from path via `schemeOf()`. Drives dispatch and view routing |
 | `status` | HTTP status code (200, 202, 400, 413, etc.) |
-| `fidelity` | Visibility level: full, summary, index, archive |
+| `fidelity` | Visibility level: full, summary, archive |
 | `hash` | SHA-256 for file change detection |
-| `tokens` | Display-only token count at current fidelity. NEVER used for budget. |
-| `tokens_full` | Cost of raw body at full fidelity |
+| `tokens` | Full-body token cost. Never changes on demotion/promotion. |
 | `turn` | Freshness — when was this entry last touched |
 ### 1.2 Schemes, Status & Fidelity
@@ -82,7 +81,7 @@ Every entry plays one of four roles:
 | Role | Category | Section | Description |
 |------|----------|---------|-------------|
 | **Data** | `data` | `<knowns>` | Entries the model works with — persistent state |
-| **Logging** | `logging` | `<current>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
+| **Logging** | `logging` | `<performed>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
 | **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
 | **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
@@ -96,7 +95,6 @@ Every entry plays one of four roles:
 | `http://`, `https://` | data | Web content. |
 | `unknown://` | unknown | Unresolved questions. |
 | `prompt://` | prompt | User prompt with `mode` attribute (`ask`/`act`). |
-| `progress://` | prompt | Continuation prompt. |
 | `set://`, `get://`, `sh://`, `env://`, `rm://`, `mv://`, `cp://`, `ask_user://`, `search://` | logging | Tool result entries. |
 | `summarize://`, `update://` | logging | Lifecycle signals. |
 | `tool://` | audit | Internal plugin metadata. `model_visible = 0`. |
@@ -211,9 +209,9 @@ object is the same shape at every tier.
 Model tier restrictions enforced by unified `resolveForLoop(mode, flags)`.
 Ask mode excludes `sh`. Flags: `noInteraction` excludes `ask_user`,
-`noWeb` excludes `search`, `noBench` excludes `ask_user`/`env`/`sh`.
-13 model tools: get, set, known, unknown, env, sh, rm, cp, mv, search,
-summarize, update, ask_user.
+`noWeb` excludes `search`, `noProposals` excludes `ask_user`/`env`/`sh`.
+14 model tools: think, unknown, known, get, set, env, sh, rm, cp, mv,
+ask_user, update, summarize, search.
 Client tier requires project init. Plugin tier has no restrictions.
 ### 3.2 Dispatch Path
@@ -226,13 +224,28 @@ Client: JSON-RPC  → { method, params }   → #record() → dispatch(scheme, en
 Plugin: rummy.rm({ path })               → #record() → dispatch(scheme, entry, rummy)
 ```
-**Lifecycle/action split:** Commands are classified as lifecycle signals
-(`summarize`, `update`, `unknown`, `known`) or action commands (everything
-else). Lifecycle signals always dispatch — they are state declarations that
-cannot be 409'd by sequential dispatch. Action commands dispatch sequentially;
-a 202 proposal or error aborts subsequent actions. If the model sends
-`<summarize>` but actions in the same turn failed, the summarize is
-overridden to an update (the model's assertion that it's done is false).
+**Tool dispatch:** Commands are dispatched sequentially in the order
+the model emitted them. Each tool either succeeds (200), fails (400+),
+or proposes (202). On failure, all remaining tools are aborted. On
+proposal, dispatch pauses, a notification is pushed to the client
+(same WebSocket push pattern as `run/progress`), the client resolves
+(accept/reject), and dispatch resumes — the proposal becomes 200 or
+400+ like any other tool. The `ask`/`act` RPC response is only sent
+when all tools have completed. Proposals are NOT batched — each is
+sent and resolved inline during dispatch. The model controls tool
+ordering; the system respects it.
+If the model sends `<summarize>` but a preceding action in the same
+turn failed, the summarize is overridden to an update (the model's
+assertion that it's done is false). Both `<summarize>` and `<update>`
+present → last signal wins.
+**Post-dispatch budget check:** After all tools dispatch, the system
+materializes context and checks the budget ceiling. If context exceeds
+the ceiling, Turn Demotion fires — all entries from this turn are
+demoted to summary and a `budget://` entry is written. This is a
+system housekeeping step independent of tool success/failure. The
+tools already ran; their outcomes are settled.
 ### 3.3 Plugin Convention
@@ -294,18 +307,20 @@ Two messages per turn. System = stable truth. User = active task.
         [skills/]
     [/instructions]
     <knowns>
-        ...entries sorted by fidelity (index, summary, full), then by scheme
+        ...entries sorted by fidelity (summary, full), then by scheme
     </knowns>
     <previous>
-        (pre-loop user prompt, model responses, agent warnings, and tools used, in order)
+        (pre-loop entries, each with turn, status, summary, fidelity, tokens)
     </previous>
-    <unknowns></unknowns>
+    <unknowns>
+        (open questions, each with path, turn, fidelity, tokens)
+    </unknowns>
 [/system]
 [user]
-    <current>
-        (current loop model responses, agent warnings, and tools used, in order)
-    </current>
-    <progress>the above actions have been performed on this user prompt:</progress>
+    <performed>
+        (current loop entries, each with turn, status, summary, fidelity, tokens)
+    </performed>
+    <progress turn="N">token budget, fidelity stats, causal bridge</progress>
     <prompt mode="ask|act" tools="...">user prompt</prompt>
 [/user]
 ```
@@ -317,9 +332,9 @@ The `<prompt>` tag is present on every turn — first turn and
 continuations alike. The model always sees its task. The active prompt
 is extracted from its chronological position and placed last for maximum
 recency. `<progress>` bridges the gap, narrating the causal relationship
-between `<current>` (the work) and the prompt (the cause).
+between `<performed>` (the work) and the prompt (the cause).
-### 4.2 Loops, Previous, and Current
+### 4.2 Loops, Previous, and Performed
 A **loop** is one `ask` or `act` invocation and all its continuation
 turns until summarize, fail, or abort.
@@ -329,14 +344,14 @@ responses, tool results, agent warnings — the full chronicle in order.
 Lives in the system message as established history. Omitted on the
 first turn of the first loop.
-**Current** = the active loop's work so far. Model responses, tool
+**Performed** = the active loop's work so far. Model responses, tool
 results, agent warnings — in order. Does NOT include the user prompt
 (one per loop, extracted to `<prompt>`). Lives in the user
 message as immediate context. Empty on the first turn of a loop.
 When a new prompt arrives on an existing run, the prior loop's
-`<current>` content plus its prompt move to `<previous>`. When a loop
-continues (next turn), new results append to `<current>`.
+`<performed>` content plus its prompt move to `<previous>`. When a loop
+continues (next turn), new results append to `<performed>`.
 ### 4.3 Key Entries
@@ -357,7 +372,7 @@ text from body + attributes.
 Each turn:
 1. Write `instructions://system` (empty body, attributes = { persona })
-2. Emit `turn.started` — plugins write prompt/progress/instructions entries
+2. Emit `turn.started` — plugins write prompt/instructions entries
 3. Project `instructions://system` → instructions text
 4. Query `v_model_context` VIEW → visible entries
 5. Project each entry through its tool's `full`/`summary` projection
@@ -367,7 +382,7 @@ Each turn:
    - Previous plugin (priority 200) → `<previous>` section
    - Unknown plugin (priority 300) → `<unknowns>` section
 8. Invoke `assembly.user` filter chain (empty string as base):
-   - Current plugin (priority 100) → `<current>` section
+   - Performed plugin (priority 100) → `<performed>` section
    - Progress plugin (priority 200) → `<progress>` section
    - Prompt plugin (priority 300) → `<prompt>` section
 9. Store as `system://N` and `user://N` audit entries
@@ -377,6 +392,12 @@ The VIEW determines visibility from `fidelity` and `status`:
 - `summary` → summary visible (model-authored `summary` attribute if set)
 - `index` → path listed, no content
 - `archive` → invisible (retrievable via `<get>`)
+**Partial read:** `<get path="..." line="N" limit="M"/>` returns lines N through
+N+M−1 of the entry body as the log item without changing fidelity or promoting
+the entry to context. Use after reading `summary` fidelity (which gives line
+numbers via repomap) to target a specific symbol. Single-path only — glob or
+body filter with `line`/`limit` is a 400 error.
 - `status = 202` → invisible (proposed, pending client)
 - `model_visible = 0` → invisible (audit, tool, instructions)
@@ -400,6 +421,16 @@ during dispatch. `upsert()`, `promoteByPattern()`, and
 Exceeding the budget throws `BudgetExceeded` — the tool 413s, the
 guard trips, and all subsequent tools in the turn fail.
+BudgetGuard ceiling = `floor(contextSize × 0.9) − 500`. The 500-token
+buffer below the enforce ceiling absorbs two sources of overhead that
+BudgetGuard cannot see: (a) `#record()`-phase writes that bypass the
+guard (~15 tokens per command), and (b) loop transition overhead —
+when a loop completes and a new one starts, entries shift from
+`<performed>` to `<previous>` format, adding ~200–300 tokens to the
+next assembly. Without this buffer, the base context can accumulate
+to exactly the enforce ceiling, making it impossible for the panic
+loop to start (panic prompt + loop overhead > ceiling).
 **Exemptions:** `status >= 400` entries (error results), `model_visible
 = 0` entries (audit), `fidelity = "archive"` entries (not in context).
@@ -415,30 +446,107 @@ formula, one file (`src/agent/tokens.js`), env-configurable. No
 external dependencies. `contextSize` is the ceiling. Over = 413.
 Under = 200. No margins.
-### 4.6 Panic Mode
+**Three token measures — never conflate them:**
+| Measure | Source | Scope | Use |
+|---|---|---|---|
+| SQL entry tokens | `known_entries.tokens` = `ceil(chars / DIVISOR)` | Per entry | Model decision-making: "this entry costs N tokens" |
+| Assembled estimate | `measureMessages(messages)` = sum of entry projections | Full packet | First-turn budget fallback only |
+| Actual API tokens | `turns.context_tokens` = `usage.input_tokens` back-filled from LLM | Per turn | Budget enforcement on turns 2+; ground truth |
-When a new prompt arrives and the assembled context exceeds
-`contextSize`, the system enters panic mode instead of failing to
-the client.
+`budget.enforce` uses the **actual API tokens** (`get_last_context_tokens`) when
+available (turn 2+) and falls back to the assembled estimate on turn 1. The
+estimate can be 3–7× off for XML/JSON-heavy content — do not rely on it for
+anything that matters.
-1. The failed loop is completed with 413 (audit trail)
-2. A panic loop is enqueued (`mode = "panic"`, `noRepo = true`)
-3. The original loop is re-enqueued to retry after panic
-4. The model receives a prompt with the exact shortfall in tokens
-5. Tools: get, set, known, unknown, rm, mv, cp, summarize, update
-6. Excluded: sh, env, search, ask_user
+**`context_tokens` vs `prompt_tokens` in step telemetry:**
+- `context_tokens` in the step JSON = `turns.context_tokens` for that turn =
+  per-turn actual input tokens from the LLM API (e.g. 7900 tokens sent this turn)
+- `prompt_tokens` in the step JSON = `SUM(turns.prompt_tokens)` for the run =
+  **cumulative** total across all turns (cost tracking, not a context size)
-**Strike system:** Each turn without context reduction = 1 strike.
-Any reduction resets the counter. 3 consecutive strikes = hard 413
-to client. Unlimited turns as long as the model makes progress.
+These two will diverge rapidly on any multi-turn run. A run at turn 50 might show
+`context_tokens: 8000` (context under control) and `prompt_tokens: 400000`
+(total input tokens billed across the whole run). They are measuring orthogonal things.
-One panic attempt per drain cycle. If the retried original loop also
-413s, hard-fail to the client.
+### 4.6 Panic Mode
+**The invariant.** A panic is only ever triggered because the
+assembled context was under the ceiling — and the new prompt pushed
+it over. The existing context fit; the incoming prompt did not.
+Panic mode replaces that too-large incoming prompt with a small
+panic prompt on the same context. Therefore: the first turn of a
+panic loop cannot 413. If it does, it is a bug.
+**Trigger.** `TurnExecutor.execute()` assembles the full packet
+(context + incoming prompt) before calling the LLM. If
+`assembledTokens > contextSize`, it returns 413 without calling
+the LLM. `#drainQueue` intercepts this and enters panic mode.
+**Flow.**
+1. Complete the failed loop with status 413 (audit trail).
+2. Enqueue a panic loop (`mode = "panic"`, `noRepo = true`,
+   `prompt = panicPrompt`, `panicTarget` in config).
+3. Re-enqueue the original loop with `panicAttempted: true` in
+   its config JSON. This flag persists across drain cycles.
+4. `continue` — the drain loop claims the panic loop next.
+After panic completes (model freed enough space), the retry loop
+runs. If the retry also 413s, hard-fail to client. One panic
+attempt per drain cycle — `panicAttempted` is checked both as a
+local variable and on the re-enqueued loop's config.
+**Panic target.** The model must compress context to below:
+```
+panicTarget = MIN(contextSize × 0.75, contextSize − incomingTokens) − cushion
+```
-**`ToolRegistry.view()`** prepends `attributes.summary` above the
-plugin's summary view output at summary fidelity. The model authors
-summaries (<= 80 chars) via `<set summary="...">`. Summaries persist
-across fidelity changes.
+`incomingTokens` is the raw token count of the original prompt.
+`cushion` is a small safety margin (500 tokens) to absorb
+materialization overhead. The target is expressed in materialized
+token units — the same unit the system uses to measure completion
+(see "Two token contexts" below).
+**Two token contexts.**
+The model reasons in *per-entry SQL tokens* — the token counts
+visible in `<knowns>` entries. These are the granular unit the model
+uses to decide which entries to target: "this entry is 200 tokens;
+if I archive it, I save 200 tokens."
+The system makes decisions using *actual API tokens* —
+`turns.context_tokens` back-filled from `usage.input_tokens` after
+each LLM call. SQL token sums do not equal actual API counts because
+projections, assembly overhead, and fidelity transforms alter the
+output; and the SQL estimate (`ceil(chars / DIVISOR)`) can be 3–7×
+off for structured content. **Never use SQL token sums for ceiling or
+budget decisions.** See §4.5 Token Measures for the full breakdown.
+**Strike system.** After each panic turn, compare
+`result.assembledTokens` (materialized) with `_lastPanicTokens`
+(previous turn's materialized total):
+- Decreased → reset strike counter to 0.
+- Same or increased → increment strikes.
+- 3 consecutive strikes → return 413 to `#drainQueue` → hard-fail.
+Progress (any reduction) resets the counter. The model has
+unlimited turns as long as it makes progress.
+**Panic success.** After each turn, if `result.assembledTokens
+<= panicTarget`, the panic loop exits with 200. The retry loop
+then runs with the original prompt on the now-compressed context.
+**Tool set.** `resolveForLoop("panic")` includes: get, set, known,
+unknown, rm, mv, cp, summarize, update. Excludes: sh, env, search,
+ask_user. `noRepo: true` — no file scanning during panic.
+**What the model sees.** Turn 1 receives the panic prompt from
+`budget.panicPrompt()`: the assembled token count, the target, and
+the exact number of tokens to free. Turn 2+ receives a continuation
+prompt. The model uses `<set fidelity="archive">`, `<mv
+fidelity="summary">`, and similar fidelity operations to free space,
+concluding with `<summarize>` when done or `<update>` while working.
 ---
@@ -566,7 +674,7 @@ simple to powerful — weak models learn from examples 1-2, strong models
 pick up the pattern from example 3.
 **Lifecycle continuity.** Examples weave stories across tools. The get
-docs end with `<set path="..." fidelity="index"/>`. The known docs
+docs end with `<set path="..." fidelity="summary"/>`. The known docs
 reference `<get path="known://*">keyword</get>` for recall and
 `<set path="known://..." archive/>` for archiving. The unknown docs
 reference `<get/>` for investigation and `<rm/>` for cleanup. A model
@@ -609,7 +717,7 @@ Tools are presented gather → reason → act → communicate. Position in
 the list implies priority. `get` is first. `ask_user` is last. The
 order is defined in `ToolRegistry.TOOL_ORDER` and applied by
 `resolveForLoop()`. The same method handles all tool exclusions —
-mode restrictions, `noInteraction`, `noWeb`, `noBench` — through
+mode restrictions, `noInteraction`, `noWeb`, `noProposals` — through
 one unified mechanism.
 ### Pattern Distribution
@@ -652,11 +760,11 @@ Termination protocol:
 - `<summarize>` → run terminates
 - `<summarize>` + failed actions → overridden to `<update>` (continue)
 - `<update>` → run continues
-- Both → update wins (if the model can't decide, it's not done)
+- Both → last signal wins (respects the model's final intent)
 - Neither + investigation tools → stall counter (RUMMY_MAX_STALLS)
 - Neither + action-only tools → healed to summarize
 - Neither + plain text → healed to summarize
-- Repeated commands → loop detection (RUMMY_MAX_REPETITIONS)
+- Repeated commands → cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
 - Repeated update text → stall (RUMMY_MAX_UPDATE_REPEATS)
 Format normalization:
@@ -697,6 +805,54 @@ See [PLUGINS.md](PLUGINS.md) for the hedberg pattern type reference.
 ---
+## 13. Debugging: E2E and Benchmark Results
+### E2E test failures
+E2E tests use a temp DB at `/tmp/rummy_test_<timestamp>_<random>.db` (cleaned up after).
+On failure, `AuditClient.assertRun` calls `dumpRun`, which prints a full turn-by-turn audit
+to stdout. That output is in the background task log:
+```
+/tmp/claude-1000/-home-hyzen-repo-rummy-main/<session-id>/tasks/<task-id>.output
+```
+If oversized, the harness saves to:
+```
+/home/hyzen/.claude/projects/-home-hyzen-repo-rummy-main/<session-id>/tool-results/<id>.txt
+```
+The dump format is: `scheme:state path {attributes}\n  body (120 chars)` grouped by turn.
+Key things to look for in a dump:
+- **202**: unresolved proposals — model issued `<sh>`, `<rm>`, or `<mv>` that needs approval
+- **413**: budget overflow — assembled context exceeded ceiling before LLM call
+- **BudgetGuard errors**: per-tool rejections mid-turn (`Budget exceeded: N tokens requested`)
+- **`<sh>` in act/panic mode**: model fell back to shell when blocked (doc/prompt gap)
+- Loop sequence: look for `mode` in `instructions://system` attrs to see which loop type ran
+### MAB benchmark
+Results live in `test/mab/results/<ISO-timestamp>/mab.db`. Latest run = most recent dir.
+```js
+// Query a MAB result DB directly:
+import { DatabaseSync } from 'node:sqlite';
+const db = new DatabaseSync('test/mab/results/<timestamp>/mab.db');
+db.prepare('SELECT * FROM questions').all();      // all questions + scores
+db.prepare('SELECT * FROM runs').all();           // individual model runs
+```
+Run with: `npm run test:mab`
+### LME benchmark
+Results live in `test/lme/results/<ISO-timestamp>/lme.db`. Same structure.
+Run with: `npm run test:lme`
+---
 ## 12. Configuration
 ```env
@@ -704,7 +860,8 @@ RUMMY_HOME=~/.rummy
 RUMMY_TOKEN_DIVISOR=2
 RUMMY_MAX_TURNS=99
 RUMMY_MAX_STALLS=3
-RUMMY_MAX_REPETITIONS=3
+RUMMY_MIN_CYCLES=3
+RUMMY_MAX_CYCLE_PERIOD=4
 RUMMY_MAX_UPDATE_REPEATS=3
 RUMMY_RETENTION_DAYS=31
 RUMMY_TEMPERATURE=0.5

package/migrations/001_initial_schema.sql CHANGED Viewed

@@ -65,7 +65,7 @@ CREATE TABLE IF NOT EXISTS loops (
 	id INTEGER PRIMARY KEY AUTOINCREMENT
 	, run_id INTEGER NOT NULL REFERENCES runs (id) ON DELETE CASCADE
 	, sequence INTEGER NOT NULL CHECK (sequence >= 1)
-	, mode TEXT NOT NULL CHECK (mode IN ('ask', 'act', 'panic'))
+	, mode TEXT NOT NULL CHECK (mode IN ('ask', 'act'))
 	, model TEXT
 	, prompt TEXT NOT NULL DEFAULT ''
 	, status INTEGER NOT NULL DEFAULT 100 CHECK (status BETWEEN 100 AND 599)
@@ -125,12 +125,11 @@ CREATE TABLE IF NOT EXISTS known_entries (
 	, scheme TEXT GENERATED ALWAYS AS (schemeOf(path)) STORED
 	, status INTEGER NOT NULL DEFAULT 200 CHECK (status BETWEEN 100 AND 599)
 	, fidelity TEXT NOT NULL DEFAULT 'full' CHECK (
-		fidelity IN ('full', 'summary', 'index', 'archive')
+		fidelity IN ('full', 'summary', 'archive')
 	)
 	, hash TEXT
 	, attributes JSON NOT NULL DEFAULT '{}' CHECK (json_valid(attributes))
 	, tokens INTEGER NOT NULL DEFAULT 0 CHECK (tokens >= 0)
-	, tokens_full INTEGER NOT NULL DEFAULT 0 CHECK (tokens_full >= 0)
 	, refs INTEGER NOT NULL DEFAULT 0 CHECK (refs >= 0)
 	, write_count INTEGER NOT NULL DEFAULT 1 CHECK (write_count >= 1)
 	, created_at DATETIME DEFAULT CURRENT_TIMESTAMP
@@ -167,7 +166,7 @@ CREATE TABLE IF NOT EXISTS turn_context (
 	, path TEXT NOT NULL
 	, scheme TEXT GENERATED ALWAYS AS (schemeOf(path)) STORED
 	, status INTEGER NOT NULL DEFAULT 200 CHECK (status BETWEEN 100 AND 599)
-	, fidelity TEXT NOT NULL CHECK (fidelity IN ('full', 'summary', 'index'))
+	, fidelity TEXT NOT NULL CHECK (fidelity IN ('full', 'summary'))
 	, body TEXT NOT NULL DEFAULT ''
 	, tokens INTEGER NOT NULL DEFAULT 0 CHECK (tokens >= 0)
 	, attributes JSON NOT NULL DEFAULT '{}' CHECK (json_valid(attributes))

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@possumtech/rummy",
-	"version": "0.3.0",
+	"version": "0.4.0",
 	"description": "Relational Unknowns Memory Management Yoke",
 	"keywords": [
 		"llm"
@@ -43,9 +43,12 @@
 		"test:live": "mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test --test-concurrency=1 --test-force-exit --test-reporter=spec --test $(find test/live -name '*.test.js') 2>&1 | tee /tmp/rummy_test_diag/live_$(date +%Y%m%dT%H%M%S).log",
 		"test:clean": "rm -rf test/lme/results test/mab/results test/tmp /tmp/rummy_test_diag /tmp/rummy_test_*.db /tmp/rummy_test_*.db-shm /tmp/rummy_test_*.db-wal && echo 'Test artifacts cleaned.'",
 		"test:mab:get": "node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/download.js",
-		"test:mab": "mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/runner.js 2>&1 | tee /tmp/rummy_test_diag/mab_$(date +%Y%m%dT%H%M%S).log",
+		"test:mab": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/runner.js \"$@\" 2>&1 | tee /tmp/rummy_test_diag/mab_$(date +%Y%m%dT%H%M%S).log' --",
+		"test:grok": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test --env-file-if-exists=.env.grok test/mab/runner.js \"$@\" 2>&1 | tee /tmp/rummy_test_diag/mab_grok_$(date +%Y%m%dT%H%M%S).log' --",
+		"test:mab:taxonomy": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/runner.js --split Conflict_Resolution --row 0 --no-questions 2>&1 | tee /tmp/rummy_test_diag/taxonomy_$(date +%Y%m%dT%H%M%S).log' --",
+		"test:grok:taxonomy": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test --env-file-if-exists=.env.grok test/mab/runner.js --split Conflict_Resolution --row 0 --no-questions 2>&1 | tee /tmp/rummy_test_diag/taxonomy_grok_$(date +%Y%m%dT%H%M%S).log' --",
 		"test:lme:get": "node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/lme/download.js",
-		"test:lme": "mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/lme/runner.js 2>&1 | tee /tmp/rummy_test_diag/lme_$(date +%Y%m%dT%H%M%S).log",
+		"test:lme": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/lme/runner.js \"$@\" 2>&1 | tee /tmp/rummy_test_diag/lme_$(date +%Y%m%dT%H%M%S).log' --",
 		"test:mab:clean": "rm -rf test/mab/results/*/",
 		"test:lme:clean": "rm -rf test/lme/results/*/",
 		"test:clear": "rm -rf /tmp/rummy_test_diag /tmp/rummy_test_*.db /tmp/rummy_test_*.db-shm /tmp/rummy_test_*.db-wal /tmp/rummy-stories-*"
@@ -56,6 +59,7 @@
 	"dependencies": {
 		"@possumtech/sqlrite": "^3.1.0",
 		"@xmldom/xmldom": "^0.9.9",
+		"diff": "^8.0.4",
 		"htmlparser2": "^12.0.0",
 		"picomatch": "^4.0.4",
 		"ws": "^8.19.0",

package/service.js CHANGED Viewed

@@ -18,13 +18,13 @@ if (gitCheck.error || gitCheck.status !== 0) {
 	console.warn("[RUMMY] WARNING: 'git' not found. File tracking will use manual activation only.");
 }
-let SqlRite, SocketServer, registerPlugins, createHooks, RpcRegistry;
+let SqlRite, SocketServer, registerPlugins, initPlugins, createHooks, RpcRegistry;
 try {
 	SqlRite = (await import("@possumtech/sqlrite")).default;
 	SocketServer = (await import("./src/server/SocketServer.js")).default;
 	const pluginIndex = await import("./src/plugins/index.js");
 	registerPlugins = pluginIndex.registerPlugins;
-	var initPlugins = pluginIndex.initPlugins;
+	initPlugins = pluginIndex.initPlugins;
 	createHooks = (await import("./src/hooks/Hooks.js")).default;
 	RpcRegistry = (await import("./src/server/RpcRegistry.js")).default;
 } catch (err) {
@@ -81,10 +81,12 @@ async function main() {
 			if (!key.startsWith("RUMMY_MODEL_")) continue;
 			const alias = key.replace("RUMMY_MODEL_", "");
 			const actual = process.env[key];
+			const contextEnv = process.env[`RUMMY_CONTEXT_${alias}`];
+			const context_length = contextEnv ? Number.parseInt(contextEnv, 10) : null;
 			await db.upsert_model.get({
 				alias,
 				actual,
-				context_length: null,
+				context_length,
 			});
 			modelAliases.push(alias);
 		}