@possumtech/rummy 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.env.example +13 -1
  2. package/PLUGINS.md +1 -1
  3. package/README.md +5 -1
  4. package/SPEC.md +211 -54
  5. package/migrations/001_initial_schema.sql +3 -4
  6. package/package.json +7 -3
  7. package/service.js +5 -3
  8. package/src/agent/AgentLoop.js +183 -238
  9. package/src/agent/ContextAssembler.js +2 -0
  10. package/src/agent/KnownStore.js +36 -85
  11. package/src/agent/ResponseHealer.js +65 -31
  12. package/src/agent/TurnExecutor.js +284 -382
  13. package/src/agent/XmlParser.js +28 -4
  14. package/src/agent/known_queries.sql +1 -1
  15. package/src/agent/known_store.sql +32 -34
  16. package/src/agent/runs.sql +2 -2
  17. package/src/agent/tokens.js +1 -0
  18. package/src/agent/turns.sql +5 -0
  19. package/src/hooks/HookRegistry.js +7 -0
  20. package/src/hooks/Hooks.js +2 -4
  21. package/src/hooks/ToolRegistry.js +8 -13
  22. package/src/plugins/ask_user/ask_userDoc.js +3 -8
  23. package/src/plugins/budget/README.md +26 -30
  24. package/src/plugins/budget/budget.js +69 -36
  25. package/src/plugins/budget/recovery.js +47 -0
  26. package/src/plugins/cp/cp.js +1 -1
  27. package/src/plugins/cp/cpDoc.js +5 -10
  28. package/src/plugins/env/envDoc.js +3 -8
  29. package/src/plugins/get/get.js +70 -2
  30. package/src/plugins/get/getDoc.js +19 -16
  31. package/src/plugins/hedberg/matcher.js +10 -29
  32. package/src/plugins/helpers.js +2 -2
  33. package/src/plugins/instructions/instructions.js +3 -2
  34. package/src/plugins/instructions/preamble.md +33 -12
  35. package/src/plugins/known/known.js +66 -17
  36. package/src/plugins/known/knownDoc.js +7 -10
  37. package/src/plugins/mv/mv.js +18 -1
  38. package/src/plugins/mv/mvDoc.js +9 -10
  39. package/src/plugins/{current → performed}/README.md +4 -3
  40. package/src/plugins/{current/current.js → performed/performed.js} +15 -20
  41. package/src/plugins/policy/policy.js +47 -0
  42. package/src/plugins/previous/README.md +2 -1
  43. package/src/plugins/previous/previous.js +31 -25
  44. package/src/plugins/progress/README.md +1 -2
  45. package/src/plugins/progress/progress.js +10 -60
  46. package/src/plugins/prompt/prompt.js +10 -8
  47. package/src/plugins/rm/rm.js +27 -15
  48. package/src/plugins/rm/rmDoc.js +6 -11
  49. package/src/plugins/rpc/rpc.js +3 -1
  50. package/src/plugins/set/set.js +125 -92
  51. package/src/plugins/set/setDoc.js +28 -37
  52. package/src/plugins/sh/shDoc.js +2 -7
  53. package/src/plugins/summarize/summarize.js +7 -0
  54. package/src/plugins/summarize/summarizeDoc.js +6 -11
  55. package/src/plugins/telemetry/telemetry.js +14 -9
  56. package/src/plugins/think/think.js +12 -0
  57. package/src/plugins/think/thinkDoc.js +18 -0
  58. package/src/plugins/unknown/README.md +2 -1
  59. package/src/plugins/unknown/unknown.js +26 -4
  60. package/src/plugins/unknown/unknownDoc.js +9 -14
  61. package/src/plugins/update/update.js +7 -0
  62. package/src/plugins/update/updateDoc.js +6 -11
  63. package/src/server/ClientConnection.js +69 -45
  64. package/src/sql/v_model_context.sql +7 -17
  65. package/src/plugins/budget/BudgetGuard.js +0 -74
package/.env.example CHANGED
@@ -17,9 +17,11 @@ RUMMY_MMAP_MB=0
17
17
 
18
18
  # Agent Loop Limits
19
19
  RUMMY_MAX_TURNS=99
20
+ RUMMY_MAX_COMMANDS=15
20
21
  RUMMY_MAX_UNKNOWN_WARNINGS=3
21
22
  RUMMY_MAX_STALLS=3
22
- RUMMY_MAX_REPETITIONS=3
23
+ RUMMY_MIN_CYCLES=3
24
+ RUMMY_MAX_CYCLE_PERIOD=4
23
25
  RUMMY_MAX_UPDATE_REPEATS=3
24
26
 
25
27
  # Hygiene
@@ -33,6 +35,16 @@ RUMMY_FETCH_TIMEOUT=300000
33
35
  # Debug
34
36
  # RUMMY_DEBUG=true
35
37
 
38
+ # Think tag: 1 = model uses <think> tags for reasoning (default)
39
+ # 0 = disabled, model reasons via API reasoning_content field only
40
+ RUMMY_THINK=1
41
+
42
+ # Budget
43
+ # Fraction of context window used as ceiling. 0.9 = 90%, 10% reserved as headroom.
44
+ RUMMY_BUDGET_CEILING=0.9
45
+ # Maximum tokens per known entry. Entries exceeding this are rejected with 413.
46
+ RUMMY_MAX_ENTRY_TOKENS=512
47
+
36
48
  # Token Estimation
37
49
  # Characters per token. Lower = more conservative (more tokens per character).
38
50
  # Default 2. Set to 1 for worst-case (1 token per character).
package/PLUGINS.md CHANGED
@@ -467,7 +467,7 @@ prepended above the plugin's summary view output.
467
467
  | `update` | Structural | Signal continued work |
468
468
  | `unknown` | Structural + Assembly | Register unknowns, render `<unknowns>` |
469
469
  | `previous` | Assembly | Render `<previous>` loop history |
470
- | `current` | Assembly | Render `<current>` active loop work |
470
+ | `performed` | Assembly | Render `<performed>` active loop work |
471
471
  | `progress` | Assembly | Render `<progress>` telemetry + warnings |
472
472
  | `prompt` | Assembly | Render `<prompt mode="ask|act">` tag |
473
473
  | `hedberg` | Utility | Pattern matching, interpretation, normalization |
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # RUMMY: Relational Unknowns Memory Management Yoke
2
2
 
3
- Rummy is the only LLM agent service inspired by and dedicated to the memory of former Secretary of State Donald "Rummy" Rumsfeld. Our unique fusion of apophatic and hedbergian engineering strategies yields more accurate and efficient results than any other agent. Our client/server and plugin architecture integrates it into more workflows than any other agent. It's also more flexible and lean than any other agent. Our dynamic cache management, model hot-swapping, and flexible router interface make it more affordable than any other agent.
3
+ Rummy is the only LLM agent service inspired by and dedicated to the memory of former Secretary of Defense Donald "Rummy" Rumsfeld. Our unique fusion of apophatic and hedbergian engineering strategies yields more accurate and efficient results than any other agent. Our client/server and plugin architecture integrates it into more workflows than any other agent. It's also more flexible and lean than any other agent. Our dynamic cache management, model hot-swapping, and flexible router interface make it more affordable than any other agent.
4
4
 
5
5
  ## Key Features
6
6
 
@@ -10,6 +10,10 @@ Rummy is the only LLM agent service inspired by and dedicated to the memory of f
10
10
 
11
11
  - **Hedberg:** The interpretation boundary between stochastic model output and deterministic system operations. Models speak in whatever syntax they were trained on — sed regex, SEARCH/REPLACE blocks, escaped characters. Hedberg normalizes all of it. Available to all plugins via `core.hooks.hedberg`.
12
12
 
13
+ - **Folksonomic Memory:** The model organizes its own knowledge into navigable path hierarchies with searchable summary tags. Not RAG — the model builds and curates its own taxonomy using `<known>` entries with paths like `known://project/architecture`.
14
+
15
+ - **Fidelity System:** Every entry has a visibility level: full, summary, archive. The model manages its own context by promoting what it needs and demoting what it doesn't. Budget enforcement catches overflow post-dispatch — tools run uninterrupted, demotion happens after.
16
+
13
17
  - **Plugin Architecture:** Every `<tag>` the model sees is a plugin. Every scheme is registered by its owner. The prompt itself is assembled from plugins. Drop a directory into `~/.rummy/plugins/` or install via npm. See [PLUGINS.md](PLUGINS.md) for the complete plugin API.
14
18
 
15
19
  - **Symbols Done Right:** Designed with universal language support in mind. Powered by [@possumtech/antlrmap](https://github.com/possumtech/antlrmap).
package/SPEC.md CHANGED
@@ -44,7 +44,7 @@ body, attributes, and state.
44
44
  known_entries (
45
45
  id, run_id, loop_id, turn, path, body, scheme,
46
46
  status INTEGER, fidelity TEXT, hash,
47
- attributes, tokens, tokens_full, refs, write_count,
47
+ attributes, tokens, refs, write_count,
48
48
  created_at, updated_at
49
49
  )
50
50
  ```
@@ -56,10 +56,9 @@ known_entries (
56
56
  | `attributes` | Tag attributes as JSON. Handler-private workspace. `CHECK (json_valid)` |
57
57
  | `scheme` | Generated from path via `schemeOf()`. Drives dispatch and view routing |
58
58
  | `status` | HTTP status code (200, 202, 400, 413, etc.) |
59
- | `fidelity` | Visibility level: full, summary, index, archive |
59
+ | `fidelity` | Visibility level: full, summary, archive |
60
60
  | `hash` | SHA-256 for file change detection |
61
- | `tokens` | Display-only token count at current fidelity. NEVER used for budget. |
62
- | `tokens_full` | Cost of raw body at full fidelity |
61
+ | `tokens` | Full-body token cost. Never changes on demotion/promotion. |
63
62
  | `turn` | Freshness — when was this entry last touched |
64
63
 
65
64
  ### 1.2 Schemes, Status & Fidelity
@@ -82,7 +81,7 @@ Every entry plays one of four roles:
82
81
  | Role | Category | Section | Description |
83
82
  |------|----------|---------|-------------|
84
83
  | **Data** | `data` | `<knowns>` | Entries the model works with — persistent state |
85
- | **Logging** | `logging` | `<current>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
84
+ | **Logging** | `logging` | `<performed>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
86
85
  | **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
87
86
  | **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
88
87
 
@@ -96,7 +95,6 @@ Every entry plays one of four roles:
96
95
  | `http://`, `https://` | data | Web content. |
97
96
  | `unknown://` | unknown | Unresolved questions. |
98
97
  | `prompt://` | prompt | User prompt with `mode` attribute (`ask`/`act`). |
99
- | `progress://` | prompt | Continuation prompt. |
100
98
  | `set://`, `get://`, `sh://`, `env://`, `rm://`, `mv://`, `cp://`, `ask_user://`, `search://` | logging | Tool result entries. |
101
99
  | `summarize://`, `update://` | logging | Lifecycle signals. |
102
100
  | `tool://` | audit | Internal plugin metadata. `model_visible = 0`. |
@@ -211,9 +209,9 @@ object is the same shape at every tier.
211
209
 
212
210
  Model tier restrictions enforced by unified `resolveForLoop(mode, flags)`.
213
211
  Ask mode excludes `sh`. Flags: `noInteraction` excludes `ask_user`,
214
- `noWeb` excludes `search`, `noBench` excludes `ask_user`/`env`/`sh`.
215
- 13 model tools: get, set, known, unknown, env, sh, rm, cp, mv, search,
216
- summarize, update, ask_user.
212
+ `noWeb` excludes `search`, `noProposals` excludes `ask_user`/`env`/`sh`.
213
+ 14 model tools: think, unknown, known, get, set, env, sh, rm, cp, mv,
214
+ ask_user, update, summarize, search.
217
215
  Client tier requires project init. Plugin tier has no restrictions.
218
216
 
219
217
  ### 3.2 Dispatch Path
@@ -226,13 +224,28 @@ Client: JSON-RPC → { method, params } → #record() → dispatch(scheme, en
226
224
  Plugin: rummy.rm({ path }) → #record() → dispatch(scheme, entry, rummy)
227
225
  ```
228
226
 
229
- **Lifecycle/action split:** Commands are classified as lifecycle signals
230
- (`summarize`, `update`, `unknown`, `known`) or action commands (everything
231
- else). Lifecycle signals always dispatch — they are state declarations that
232
- cannot be 409'd by sequential dispatch. Action commands dispatch sequentially;
233
- a 202 proposal or error aborts subsequent actions. If the model sends
234
- `<summarize>` but actions in the same turn failed, the summarize is
235
- overridden to an update (the model's assertion that it's done is false).
227
+ **Tool dispatch:** Commands are dispatched sequentially in the order
228
+ the model emitted them. Each tool either succeeds (200), fails (400+),
229
+ or proposes (202). On failure, all remaining tools are aborted. On
230
+ proposal, dispatch pauses, a notification is pushed to the client
231
+ (same WebSocket push pattern as `run/progress`), the client resolves
232
+ (accept/reject), and dispatch resumes — the proposal becomes 200 or
233
+ 400+ like any other tool. The `ask`/`act` RPC response is only sent
234
+ when all tools have completed. Proposals are NOT batched — each is
235
+ sent and resolved inline during dispatch. The model controls tool
236
+ ordering; the system respects it.
237
+
238
+ If the model sends `<summarize>` but a preceding action in the same
239
+ turn failed, the summarize is overridden to an update (the model's
240
+ assertion that it's done is false). Both `<summarize>` and `<update>`
241
+ present → last signal wins.
242
+
243
+ **Post-dispatch budget check:** After all tools dispatch, the system
244
+ materializes context and checks the budget ceiling. If context exceeds
245
+ the ceiling, Turn Demotion fires — all entries from this turn are
246
+ demoted to summary and a `budget://` entry is written. This is a
247
+ system housekeeping step independent of tool success/failure. The
248
+ tools already ran; their outcomes are settled.
236
249
 
237
250
  ### 3.3 Plugin Convention
238
251
 
@@ -294,18 +307,20 @@ Two messages per turn. System = stable truth. User = active task.
294
307
  [skills/]
295
308
  [/instructions]
296
309
  <knowns>
297
- ...entries sorted by fidelity (index, summary, full), then by scheme
310
+ ...entries sorted by fidelity (summary, full), then by scheme
298
311
  </knowns>
299
312
  <previous>
300
- (pre-loop user prompt, model responses, agent warnings, and tools used, in order)
313
+ (pre-loop entries, each with turn, status, summary, fidelity, tokens)
301
314
  </previous>
302
- <unknowns></unknowns>
315
+ <unknowns>
316
+ (open questions, each with path, turn, fidelity, tokens)
317
+ </unknowns>
303
318
  [/system]
304
319
  [user]
305
- <current>
306
- (current loop model responses, agent warnings, and tools used, in order)
307
- </current>
308
- <progress>the above actions have been performed on this user prompt:</progress>
320
+ <performed>
321
+ (current loop entries, each with turn, status, summary, fidelity, tokens)
322
+ </performed>
323
+ <progress turn="N">token budget, fidelity stats, causal bridge</progress>
309
324
  <prompt mode="ask|act" tools="...">user prompt</prompt>
310
325
  [/user]
311
326
  ```
@@ -317,9 +332,9 @@ The `<prompt>` tag is present on every turn — first turn and
317
332
  continuations alike. The model always sees its task. The active prompt
318
333
  is extracted from its chronological position and placed last for maximum
319
334
  recency. `<progress>` bridges the gap, narrating the causal relationship
320
- between `<current>` (the work) and the prompt (the cause).
335
+ between `<performed>` (the work) and the prompt (the cause).
321
336
 
322
- ### 4.2 Loops, Previous, and Current
337
+ ### 4.2 Loops, Previous, and Performed
323
338
 
324
339
  A **loop** is one `ask` or `act` invocation and all its continuation
325
340
  turns until summarize, fail, or abort.
@@ -329,14 +344,14 @@ responses, tool results, agent warnings — the full chronicle in order.
329
344
  Lives in the system message as established history. Omitted on the
330
345
  first turn of the first loop.
331
346
 
332
- **Current** = the active loop's work so far. Model responses, tool
347
+ **Performed** = the active loop's work so far. Model responses, tool
333
348
  results, agent warnings — in order. Does NOT include the user prompt
334
349
  (one per loop, extracted to `<prompt>`). Lives in the user
335
350
  message as immediate context. Empty on the first turn of a loop.
336
351
 
337
352
  When a new prompt arrives on an existing run, the prior loop's
338
- `<current>` content plus its prompt move to `<previous>`. When a loop
339
- continues (next turn), new results append to `<current>`.
353
+ `<performed>` content plus its prompt move to `<previous>`. When a loop
354
+ continues (next turn), new results append to `<performed>`.
340
355
 
341
356
  ### 4.3 Key Entries
342
357
 
@@ -357,7 +372,7 @@ text from body + attributes.
357
372
  Each turn:
358
373
 
359
374
  1. Write `instructions://system` (empty body, attributes = { persona })
360
- 2. Emit `turn.started` — plugins write prompt/progress/instructions entries
375
+ 2. Emit `turn.started` — plugins write prompt/instructions entries
361
376
  3. Project `instructions://system` → instructions text
362
377
  4. Query `v_model_context` VIEW → visible entries
363
378
  5. Project each entry through its tool's `full`/`summary` projection
@@ -367,7 +382,7 @@ Each turn:
367
382
  - Previous plugin (priority 200) → `<previous>` section
368
383
  - Unknown plugin (priority 300) → `<unknowns>` section
369
384
  8. Invoke `assembly.user` filter chain (empty string as base):
370
- - Current plugin (priority 100) → `<current>` section
385
+ - Performed plugin (priority 100) → `<performed>` section
371
386
  - Progress plugin (priority 200) → `<progress>` section
372
387
  - Prompt plugin (priority 300) → `<prompt>` section
373
388
  9. Store as `system://N` and `user://N` audit entries
@@ -377,6 +392,12 @@ The VIEW determines visibility from `fidelity` and `status`:
377
392
  - `summary` → summary visible (model-authored `summary` attribute if set)
378
393
  - `index` → path listed, no content
379
394
  - `archive` → invisible (retrievable via `<get>`)
395
+
396
+ **Partial read:** `<get path="..." line="N" limit="M"/>` returns lines N through
397
+ N+M−1 of the entry body as the log item without changing fidelity or promoting
398
+ the entry to context. Use after reading `summary` fidelity (which gives line
399
+ numbers via repomap) to target a specific symbol. Single-path only — glob or
400
+ body filter with `line`/`limit` is a 400 error.
380
401
  - `status = 202` → invisible (proposed, pending client)
381
402
  - `model_visible = 0` → invisible (audit, tool, instructions)
382
403
 
@@ -400,6 +421,16 @@ during dispatch. `upsert()`, `promoteByPattern()`, and
400
421
  Exceeding the budget throws `BudgetExceeded` — the tool 413s, the
401
422
  guard trips, and all subsequent tools in the turn fail.
402
423
 
424
+ BudgetGuard ceiling = `floor(contextSize × 0.9) − 500`. The 500-token
425
+ buffer below the enforce ceiling absorbs two sources of overhead that
426
+ BudgetGuard cannot see: (a) `#record()`-phase writes that bypass the
427
+ guard (~15 tokens per command), and (b) loop transition overhead —
428
+ when a loop completes and a new one starts, entries shift from
429
+ `<performed>` to `<previous>` format, adding ~200–300 tokens to the
430
+ next assembly. Without this buffer, the base context can accumulate
431
+ to exactly the enforce ceiling, making it impossible for the panic
432
+ loop to start (panic prompt + loop overhead > ceiling).
433
+
403
434
  **Exemptions:** `status >= 400` entries (error results), `model_visible
404
435
  = 0` entries (audit), `fidelity = "archive"` entries (not in context).
405
436
 
@@ -415,30 +446,107 @@ formula, one file (`src/agent/tokens.js`), env-configurable. No
415
446
  external dependencies. `contextSize` is the ceiling. Over = 413.
416
447
  Under = 200. No margins.
417
448
 
418
- ### 4.6 Panic Mode
449
+ **Three token measures — never conflate them:**
450
+
451
+ | Measure | Source | Scope | Use |
452
+ |---|---|---|---|
453
+ | SQL entry tokens | `known_entries.tokens` = `ceil(chars / DIVISOR)` | Per entry | Model decision-making: "this entry costs N tokens" |
454
+ | Assembled estimate | `measureMessages(messages)` = sum of entry projections | Full packet | First-turn budget fallback only |
455
+ | Actual API tokens | `turns.context_tokens` = `usage.input_tokens` back-filled from LLM | Per turn | Budget enforcement on turns 2+; ground truth |
419
456
 
420
- When a new prompt arrives and the assembled context exceeds
421
- `contextSize`, the system enters panic mode instead of failing to
422
- the client.
457
+ `budget.enforce` uses the **actual API tokens** (`get_last_context_tokens`) when
458
+ available (turn 2+) and falls back to the assembled estimate on turn 1. The
459
+ estimate can be 3–7× off for XML/JSON-heavy content — do not rely on it for
460
+ anything that matters.
423
461
 
424
- 1. The failed loop is completed with 413 (audit trail)
425
- 2. A panic loop is enqueued (`mode = "panic"`, `noRepo = true`)
426
- 3. The original loop is re-enqueued to retry after panic
427
- 4. The model receives a prompt with the exact shortfall in tokens
428
- 5. Tools: get, set, known, unknown, rm, mv, cp, summarize, update
429
- 6. Excluded: sh, env, search, ask_user
462
+ **`context_tokens` vs `prompt_tokens` in step telemetry:**
463
+ - `context_tokens` in the step JSON = `turns.context_tokens` for that turn =
464
+ per-turn actual input tokens from the LLM API (e.g. 7900 tokens sent this turn)
465
+ - `prompt_tokens` in the step JSON = `SUM(turns.prompt_tokens)` for the run =
466
+ **cumulative** total across all turns (cost tracking, not a context size)
430
467
 
431
- **Strike system:** Each turn without context reduction = 1 strike.
432
- Any reduction resets the counter. 3 consecutive strikes = hard 413
433
- to client. Unlimited turns as long as the model makes progress.
468
+ These two will diverge rapidly on any multi-turn run. A run at turn 50 might show
469
+ `context_tokens: 8000` (context under control) and `prompt_tokens: 400000`
470
+ (total input tokens billed across the whole run). They are measuring orthogonal things.
434
471
 
435
- One panic attempt per drain cycle. If the retried original loop also
436
- 413s, hard-fail to the client.
472
+ ### 4.6 Panic Mode
473
+
474
+ **The invariant.** A panic is only ever triggered because the
475
+ assembled context was under the ceiling — and the new prompt pushed
476
+ it over. The existing context fit; the incoming prompt did not.
477
+ Panic mode replaces that too-large incoming prompt with a small
478
+ panic prompt on the same context. Therefore: the first turn of a
479
+ panic loop cannot 413. If it does, it is a bug.
480
+
481
+ **Trigger.** `TurnExecutor.execute()` assembles the full packet
482
+ (context + incoming prompt) before calling the LLM. If
483
+ `assembledTokens > contextSize`, it returns 413 without calling
484
+ the LLM. `#drainQueue` intercepts this and enters panic mode.
485
+
486
+ **Flow.**
487
+ 1. Complete the failed loop with status 413 (audit trail).
488
+ 2. Enqueue a panic loop (`mode = "panic"`, `noRepo = true`,
489
+ `prompt = panicPrompt`, `panicTarget` in config).
490
+ 3. Re-enqueue the original loop with `panicAttempted: true` in
491
+ its config JSON. This flag persists across drain cycles.
492
+ 4. `continue` — the drain loop claims the panic loop next.
493
+
494
+ After panic completes (model freed enough space), the retry loop
495
+ runs. If the retry also 413s, hard-fail to client. One panic
496
+ attempt per drain cycle — `panicAttempted` is checked both as a
497
+ local variable and on the re-enqueued loop's config.
498
+
499
+ **Panic target.** The model must compress context to below:
500
+
501
+ ```
502
+ panicTarget = MIN(contextSize × 0.75, contextSize − incomingTokens) − cushion
503
+ ```
437
504
 
438
- **`ToolRegistry.view()`** prepends `attributes.summary` above the
439
- plugin's summary view output at summary fidelity. The model authors
440
- summaries (<= 80 chars) via `<set summary="...">`. Summaries persist
441
- across fidelity changes.
505
+ `incomingTokens` is the raw token count of the original prompt.
506
+ `cushion` is a small safety margin (500 tokens) to absorb
507
+ materialization overhead. The target is expressed in materialized
508
+ token units — the same unit the system uses to measure completion
509
+ (see **Two token contexts** below).
510
+
511
+ **Two token contexts.**
512
+
513
+ The model reasons in *per-entry SQL tokens* — the token counts
514
+ visible in `<knowns>` entries. These are the granular unit the model
515
+ uses to decide which entries to target: "this entry is 200 tokens;
516
+ if I archive it, I save 200 tokens."
517
+
518
+ The system makes decisions using *actual API tokens* —
519
+ `turns.context_tokens` back-filled from `usage.input_tokens` after
520
+ each LLM call. SQL token sums do not equal actual API counts because
521
+ projections, assembly overhead, and fidelity transforms alter the
522
+ output; and the SQL estimate (`ceil(chars / DIVISOR)`) can be 3–7×
523
+ off for structured content. **Never use SQL token sums for ceiling or
524
+ budget decisions.** See §4.5 Token Measures for the full breakdown.
525
+
526
+ **Strike system.** After each panic turn, compare
527
+ `result.assembledTokens` (materialized) with `_lastPanicTokens`
528
+ (previous turn's materialized total):
529
+ - Decreased → reset strike counter to 0.
530
+ - Same or increased → increment strikes.
531
+ - 3 consecutive strikes → return 413 to `#drainQueue` → hard-fail.
532
+
533
+ Progress (any reduction) resets the counter. The model has
534
+ unlimited turns as long as it makes progress.
535
+
536
+ **Panic success.** After each turn, if `result.assembledTokens
537
+ <= panicTarget`, the panic loop exits with 200. The retry loop
538
+ then runs with the original prompt on the now-compressed context.
539
+
540
+ **Tool set.** `resolveForLoop("panic")` includes: get, set, known,
541
+ unknown, rm, mv, cp, summarize, update. Excludes: sh, env, search,
542
+ ask_user. `noRepo: true` — no file scanning during panic.
543
+
544
+ **What the model sees.** Turn 1 receives the panic prompt from
545
+ `budget.panicPrompt()`: the assembled token count, the target, and
546
+ the exact number of tokens to free. Turn 2+ receives a continuation
547
+ prompt. The model uses `<set fidelity="archive">`, `<mv
548
+ fidelity="summary">`, and similar fidelity operations to free space,
549
+ concluding with `<summarize>` when done or `<update>` while working.
442
550
 
443
551
  ---
444
552
 
@@ -566,7 +674,7 @@ simple to powerful — weak models learn from examples 1-2, strong models
566
674
  pick up the pattern from example 3.
567
675
 
568
676
  **Lifecycle continuity.** Examples weave stories across tools. The get
569
- docs end with `<set path="..." fidelity="index"/>`. The known docs
677
+ docs end with `<set path="..." fidelity="summary"/>`. The known docs
570
678
  reference `<get path="known://*">keyword</get>` for recall and
571
679
  `<set path="known://..." archive/>` for archiving. The unknown docs
572
680
  reference `<get/>` for investigation and `<rm/>` for cleanup. A model
@@ -609,7 +717,7 @@ Tools are presented gather → reason → act → communicate. Position in
609
717
  the list implies priority. `get` is first. `ask_user` is last. The
610
718
  order is defined in `ToolRegistry.TOOL_ORDER` and applied by
611
719
  `resolveForLoop()`. The same method handles all tool exclusions —
612
- mode restrictions, `noInteraction`, `noWeb`, `noBench` — through
720
+ mode restrictions, `noInteraction`, `noWeb`, `noProposals` — through
613
721
  one unified mechanism.
614
722
 
615
723
  ### Pattern Distribution
@@ -652,11 +760,11 @@ Termination protocol:
652
760
  - `<summarize>` → run terminates
653
761
  - `<summarize>` + failed actions → overridden to `<update>` (continue)
654
762
  - `<update>` → run continues
655
- - Both → update wins (if the model can't decide, it's not done)
763
+ - Both → last signal wins (respects the model's final intent)
656
764
  - Neither + investigation tools → stall counter (RUMMY_MAX_STALLS)
657
765
  - Neither + action-only tools → healed to summarize
658
766
  - Neither + plain text → healed to summarize
659
- - Repeated commands → loop detection (RUMMY_MAX_REPETITIONS)
767
+ - Repeated commands → cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
660
768
  - Repeated update text → stall (RUMMY_MAX_UPDATE_REPEATS)
661
769
 
662
770
  Format normalization:
@@ -697,6 +805,54 @@ See [PLUGINS.md](PLUGINS.md) for the hedberg pattern type reference.
697
805
 
698
806
  ---
699
807
 
808
+ ## 13. Debugging: E2E and Benchmark Results
809
+
810
+ ### E2E test failures
811
+
812
+ E2E tests use a temp DB at `/tmp/rummy_test_<timestamp>_<random>.db` (cleaned up after).
813
+ On failure, `AuditClient.assertRun` calls `dumpRun`, which prints a full turn-by-turn audit
814
+ to stdout. That output is in the background task log:
815
+
816
+ ```
817
+ /tmp/claude-1000/-home-hyzen-repo-rummy-main/<session-id>/tasks/<task-id>.output
818
+ ```
819
+
820
+ If oversized, the harness saves to:
821
+ ```
822
+ /home/hyzen/.claude/projects/-home-hyzen-repo-rummy-main/<session-id>/tool-results/<id>.txt
823
+ ```
824
+
825
+ The dump format is: `scheme:state path {attributes}\n body (120 chars)` grouped by turn.
826
+
827
+ Key things to look for in a dump:
828
+ - **202**: unresolved proposals — model issued `<sh>`, `<rm>`, or `<mv>` that needs approval
829
+ - **413**: budget overflow — assembled context exceeded ceiling before LLM call
830
+ - **BudgetGuard errors**: per-tool rejections mid-turn (`Budget exceeded: N tokens requested`)
831
+ - **`<sh>` in act/panic mode**: model fell back to shell when blocked (doc/prompt gap)
832
+ - Loop sequence: look for `mode` in `instructions://system` attrs to see which loop type ran
833
+
834
+ ### MAB benchmark
835
+
836
+ Results live in `test/mab/results/<ISO-timestamp>/mab.db`. Latest run = most recent dir.
837
+
838
+ ```js
839
+ // Query a MAB result DB directly:
840
+ import { DatabaseSync } from 'node:sqlite';
841
+ const db = new DatabaseSync('test/mab/results/<timestamp>/mab.db');
842
+ db.prepare('SELECT * FROM questions').all(); // all questions + scores
843
+ db.prepare('SELECT * FROM runs').all(); // individual model runs
844
+ ```
845
+
846
+ Run with: `npm run test:mab`
847
+
848
+ ### LME benchmark
849
+
850
+ Results live in `test/lme/results/<ISO-timestamp>/lme.db`. Same structure.
851
+
852
+ Run with: `npm run test:lme`
853
+
854
+ ---
855
+
700
856
  ## 12. Configuration
701
857
 
702
858
  ```env
@@ -704,7 +860,8 @@ RUMMY_HOME=~/.rummy
704
860
  RUMMY_TOKEN_DIVISOR=2
705
861
  RUMMY_MAX_TURNS=99
706
862
  RUMMY_MAX_STALLS=3
707
- RUMMY_MAX_REPETITIONS=3
863
+ RUMMY_MIN_CYCLES=3
864
+ RUMMY_MAX_CYCLE_PERIOD=4
708
865
  RUMMY_MAX_UPDATE_REPEATS=3
709
866
  RUMMY_RETENTION_DAYS=31
710
867
  RUMMY_TEMPERATURE=0.5
@@ -65,7 +65,7 @@ CREATE TABLE IF NOT EXISTS loops (
65
65
  id INTEGER PRIMARY KEY AUTOINCREMENT
66
66
  , run_id INTEGER NOT NULL REFERENCES runs (id) ON DELETE CASCADE
67
67
  , sequence INTEGER NOT NULL CHECK (sequence >= 1)
68
- , mode TEXT NOT NULL CHECK (mode IN ('ask', 'act', 'panic'))
68
+ , mode TEXT NOT NULL CHECK (mode IN ('ask', 'act'))
69
69
  , model TEXT
70
70
  , prompt TEXT NOT NULL DEFAULT ''
71
71
  , status INTEGER NOT NULL DEFAULT 100 CHECK (status BETWEEN 100 AND 599)
@@ -125,12 +125,11 @@ CREATE TABLE IF NOT EXISTS known_entries (
125
125
  , scheme TEXT GENERATED ALWAYS AS (schemeOf(path)) STORED
126
126
  , status INTEGER NOT NULL DEFAULT 200 CHECK (status BETWEEN 100 AND 599)
127
127
  , fidelity TEXT NOT NULL DEFAULT 'full' CHECK (
128
- fidelity IN ('full', 'summary', 'index', 'archive')
128
+ fidelity IN ('full', 'summary', 'archive')
129
129
  )
130
130
  , hash TEXT
131
131
  , attributes JSON NOT NULL DEFAULT '{}' CHECK (json_valid(attributes))
132
132
  , tokens INTEGER NOT NULL DEFAULT 0 CHECK (tokens >= 0)
133
- , tokens_full INTEGER NOT NULL DEFAULT 0 CHECK (tokens_full >= 0)
134
133
  , refs INTEGER NOT NULL DEFAULT 0 CHECK (refs >= 0)
135
134
  , write_count INTEGER NOT NULL DEFAULT 1 CHECK (write_count >= 1)
136
135
  , created_at DATETIME DEFAULT CURRENT_TIMESTAMP
@@ -167,7 +166,7 @@ CREATE TABLE IF NOT EXISTS turn_context (
167
166
  , path TEXT NOT NULL
168
167
  , scheme TEXT GENERATED ALWAYS AS (schemeOf(path)) STORED
169
168
  , status INTEGER NOT NULL DEFAULT 200 CHECK (status BETWEEN 100 AND 599)
170
- , fidelity TEXT NOT NULL CHECK (fidelity IN ('full', 'summary', 'index'))
169
+ , fidelity TEXT NOT NULL CHECK (fidelity IN ('full', 'summary'))
171
170
  , body TEXT NOT NULL DEFAULT ''
172
171
  , tokens INTEGER NOT NULL DEFAULT 0 CHECK (tokens >= 0)
173
172
  , attributes JSON NOT NULL DEFAULT '{}' CHECK (json_valid(attributes))
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@possumtech/rummy",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "Relational Unknowns Memory Management Yoke",
5
5
  "keywords": [
6
6
  "llm"
@@ -43,9 +43,12 @@
43
43
  "test:live": "mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test --test-concurrency=1 --test-force-exit --test-reporter=spec --test $(find test/live -name '*.test.js') 2>&1 | tee /tmp/rummy_test_diag/live_$(date +%Y%m%dT%H%M%S).log",
44
44
  "test:clean": "rm -rf test/lme/results test/mab/results test/tmp /tmp/rummy_test_diag /tmp/rummy_test_*.db /tmp/rummy_test_*.db-shm /tmp/rummy_test_*.db-wal && echo 'Test artifacts cleaned.'",
45
45
  "test:mab:get": "node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/download.js",
46
- "test:mab": "mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/runner.js 2>&1 | tee /tmp/rummy_test_diag/mab_$(date +%Y%m%dT%H%M%S).log",
46
+ "test:mab": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/runner.js \"$@\" 2>&1 | tee /tmp/rummy_test_diag/mab_$(date +%Y%m%dT%H%M%S).log' --",
47
+ "test:grok": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test --env-file-if-exists=.env.grok test/mab/runner.js \"$@\" 2>&1 | tee /tmp/rummy_test_diag/mab_grok_$(date +%Y%m%dT%H%M%S).log' --",
48
+ "test:mab:taxonomy": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/mab/runner.js --split Conflict_Resolution --row 0 --no-questions 2>&1 | tee /tmp/rummy_test_diag/taxonomy_$(date +%Y%m%dT%H%M%S).log' --",
49
+ "test:grok:taxonomy": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test --env-file-if-exists=.env.grok test/mab/runner.js --split Conflict_Resolution --row 0 --no-questions 2>&1 | tee /tmp/rummy_test_diag/taxonomy_grok_$(date +%Y%m%dT%H%M%S).log' --",
47
50
  "test:lme:get": "node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/lme/download.js",
48
- "test:lme": "mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/lme/runner.js 2>&1 | tee /tmp/rummy_test_diag/lme_$(date +%Y%m%dT%H%M%S).log",
51
+ "test:lme": "bash -c 'mkdir -p /tmp/rummy_test_diag && node --env-file-if-exists=.env.example --env-file-if-exists=.env --env-file-if-exists=.env.test test/lme/runner.js \"$@\" 2>&1 | tee /tmp/rummy_test_diag/lme_$(date +%Y%m%dT%H%M%S).log' --",
49
52
  "test:mab:clean": "rm -rf test/mab/results/*/",
50
53
  "test:lme:clean": "rm -rf test/lme/results/*/",
51
54
  "test:clear": "rm -rf /tmp/rummy_test_diag /tmp/rummy_test_*.db /tmp/rummy_test_*.db-shm /tmp/rummy_test_*.db-wal /tmp/rummy-stories-*"
@@ -56,6 +59,7 @@
56
59
  "dependencies": {
57
60
  "@possumtech/sqlrite": "^3.1.0",
58
61
  "@xmldom/xmldom": "^0.9.9",
62
+ "diff": "^8.0.4",
59
63
  "htmlparser2": "^12.0.0",
60
64
  "picomatch": "^4.0.4",
61
65
  "ws": "^8.19.0",
package/service.js CHANGED
@@ -18,13 +18,13 @@ if (gitCheck.error || gitCheck.status !== 0) {
18
18
  console.warn("[RUMMY] WARNING: 'git' not found. File tracking will use manual activation only.");
19
19
  }
20
20
 
21
- let SqlRite, SocketServer, registerPlugins, createHooks, RpcRegistry;
21
+ let SqlRite, SocketServer, registerPlugins, initPlugins, createHooks, RpcRegistry;
22
22
  try {
23
23
  SqlRite = (await import("@possumtech/sqlrite")).default;
24
24
  SocketServer = (await import("./src/server/SocketServer.js")).default;
25
25
  const pluginIndex = await import("./src/plugins/index.js");
26
26
  registerPlugins = pluginIndex.registerPlugins;
27
- var initPlugins = pluginIndex.initPlugins;
27
+ initPlugins = pluginIndex.initPlugins;
28
28
  createHooks = (await import("./src/hooks/Hooks.js")).default;
29
29
  RpcRegistry = (await import("./src/server/RpcRegistry.js")).default;
30
30
  } catch (err) {
@@ -81,10 +81,12 @@ async function main() {
81
81
  if (!key.startsWith("RUMMY_MODEL_")) continue;
82
82
  const alias = key.replace("RUMMY_MODEL_", "");
83
83
  const actual = process.env[key];
84
+ const contextEnv = process.env[`RUMMY_CONTEXT_${alias}`];
85
+ const context_length = contextEnv ? Number.parseInt(contextEnv, 10) : null;
84
86
  await db.upsert_model.get({
85
87
  alias,
86
88
  actual,
87
- context_length: null,
89
+ context_length,
88
90
  });
89
91
  modelAliases.push(alias);
90
92
  }