@possumtech/rummy 0.2.8 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. package/.env.example +13 -2
  2. package/EXCEPTIONS.md +46 -0
  3. package/PLUGINS.md +422 -188
  4. package/SPEC.md +440 -106
  5. package/migrations/001_initial_schema.sql +5 -3
  6. package/package.json +17 -5
  7. package/service.js +5 -3
  8. package/src/agent/AgentLoop.js +252 -55
  9. package/src/agent/ContextAssembler.js +20 -4
  10. package/src/agent/KnownStore.js +82 -25
  11. package/src/agent/ProjectAgent.js +4 -1
  12. package/src/agent/ResponseHealer.js +86 -32
  13. package/src/agent/TurnExecutor.js +542 -207
  14. package/src/agent/XmlParser.js +77 -41
  15. package/src/agent/known_store.sql +68 -4
  16. package/src/agent/schemes.sql +3 -0
  17. package/src/agent/tokens.js +7 -21
  18. package/src/agent/turns.sql +15 -1
  19. package/src/hooks/HookRegistry.js +7 -0
  20. package/src/hooks/Hooks.js +15 -0
  21. package/src/hooks/PluginContext.js +14 -1
  22. package/src/hooks/RummyContext.js +16 -4
  23. package/src/hooks/ToolRegistry.js +77 -19
  24. package/src/llm/LlmProvider.js +27 -8
  25. package/src/llm/OpenAiClient.js +20 -0
  26. package/src/llm/OpenRouterClient.js +24 -2
  27. package/src/llm/XaiClient.js +47 -2
  28. package/src/plugins/ask_user/README.md +4 -4
  29. package/src/plugins/ask_user/ask_user.js +5 -5
  30. package/src/plugins/ask_user/ask_userDoc.js +29 -0
  31. package/src/plugins/budget/README.md +31 -0
  32. package/src/plugins/budget/budget.js +55 -0
  33. package/src/plugins/cp/README.md +5 -4
  34. package/src/plugins/cp/cp.js +10 -6
  35. package/src/plugins/cp/cpDoc.js +29 -0
  36. package/src/plugins/engine/engine.sql +1 -8
  37. package/src/plugins/engine/turn_context.sql +4 -9
  38. package/src/plugins/env/README.md +3 -4
  39. package/src/plugins/env/env.js +5 -5
  40. package/src/plugins/env/envDoc.js +29 -0
  41. package/src/plugins/file/README.md +9 -12
  42. package/src/plugins/file/file.js +34 -35
  43. package/src/plugins/get/README.md +2 -2
  44. package/src/plugins/get/get.js +77 -6
  45. package/src/plugins/get/getDoc.js +51 -0
  46. package/src/plugins/hedberg/hedberg.js +2 -1
  47. package/src/plugins/hedberg/matcher.js +10 -29
  48. package/src/plugins/hedberg/normalize.js +28 -0
  49. package/src/plugins/hedberg/patterns.js +25 -27
  50. package/src/plugins/hedberg/sed.js +17 -10
  51. package/src/plugins/index.js +66 -14
  52. package/src/plugins/instructions/README.md +6 -2
  53. package/src/plugins/instructions/instructions.js +20 -4
  54. package/src/plugins/instructions/preamble.md +19 -5
  55. package/src/plugins/known/README.md +10 -7
  56. package/src/plugins/known/known.js +23 -17
  57. package/src/plugins/known/knownDoc.js +34 -0
  58. package/src/plugins/mv/README.md +5 -4
  59. package/src/plugins/mv/mv.js +27 -6
  60. package/src/plugins/mv/mvDoc.js +45 -0
  61. package/src/plugins/performed/README.md +15 -0
  62. package/src/plugins/performed/performed.js +45 -0
  63. package/src/plugins/persona/persona.js +78 -0
  64. package/src/plugins/previous/README.md +3 -2
  65. package/src/plugins/previous/previous.js +33 -24
  66. package/src/plugins/progress/README.md +1 -2
  67. package/src/plugins/progress/progress.js +33 -21
  68. package/src/plugins/prompt/README.md +5 -5
  69. package/src/plugins/prompt/prompt.js +15 -17
  70. package/src/plugins/rm/README.md +4 -4
  71. package/src/plugins/rm/rm.js +32 -20
  72. package/src/plugins/rm/rmDoc.js +30 -0
  73. package/src/plugins/rpc/README.md +15 -28
  74. package/src/plugins/rpc/rpc.js +42 -77
  75. package/src/plugins/set/README.md +13 -12
  76. package/src/plugins/set/set.js +107 -16
  77. package/src/plugins/set/setDoc.js +49 -0
  78. package/src/plugins/sh/README.md +4 -4
  79. package/src/plugins/sh/sh.js +5 -5
  80. package/src/plugins/sh/shDoc.js +29 -0
  81. package/src/plugins/{skills/skills.js → skill/skill.js} +10 -51
  82. package/src/plugins/summarize/README.md +6 -5
  83. package/src/plugins/summarize/summarize.js +7 -6
  84. package/src/plugins/summarize/summarizeDoc.js +33 -0
  85. package/src/plugins/telemetry/telemetry.js +16 -9
  86. package/src/plugins/think/README.md +20 -0
  87. package/src/plugins/think/think.js +5 -0
  88. package/src/plugins/unknown/README.md +6 -5
  89. package/src/plugins/unknown/unknown.js +12 -9
  90. package/src/plugins/unknown/unknownDoc.js +31 -0
  91. package/src/plugins/update/README.md +3 -8
  92. package/src/plugins/update/update.js +7 -6
  93. package/src/plugins/update/updateDoc.js +33 -0
  94. package/src/server/ClientConnection.js +59 -45
  95. package/src/server/RpcRegistry.js +52 -4
  96. package/src/sql/v_model_context.sql +10 -25
  97. package/src/plugins/ask_user/docs.md +0 -2
  98. package/src/plugins/cp/docs.md +0 -2
  99. package/src/plugins/current/README.md +0 -14
  100. package/src/plugins/current/current.js +0 -47
  101. package/src/plugins/env/docs.md +0 -4
  102. package/src/plugins/get/docs.md +0 -10
  103. package/src/plugins/known/docs.md +0 -3
  104. package/src/plugins/mv/docs.md +0 -2
  105. package/src/plugins/rm/docs.md +0 -6
  106. package/src/plugins/set/docs.md +0 -6
  107. package/src/plugins/sh/docs.md +0 -2
  108. package/src/plugins/skills/README.md +0 -25
  109. package/src/plugins/store/README.md +0 -20
  110. package/src/plugins/store/docs.md +0 -6
  111. package/src/plugins/store/store.js +0 -63
  112. package/src/plugins/summarize/docs.md +0 -4
  113. package/src/plugins/unknown/docs.md +0 -5
  114. package/src/plugins/update/docs.md +0 -4
package/SPEC.md CHANGED
@@ -15,8 +15,8 @@ that thread a value through subscribers in priority order).
15
15
 
16
16
  **Every `<tag>` the model sees is a plugin.** The `<known>` section
17
17
  of the system message is rendered by the known plugin. The `<progress>`
18
- section is rendered by the progress plugin. The `<ask>` tag is rendered
19
- by the prompt plugin. No monolithic assembler decides what goes where.
18
+ section is rendered by the progress plugin. The `<prompt>` tag is
19
+ rendered by the prompt plugin. No monolithic assembler decides what goes where.
20
20
  Each plugin filters for its own data from the shared row set, renders
21
21
  its section, and returns.
22
22
 
@@ -42,7 +42,8 @@ body, attributes, and state.
42
42
 
43
43
  ```sql
44
44
  known_entries (
45
- id, run_id, turn, path, body, scheme, state, hash,
45
+ id, run_id, loop_id, turn, path, body, scheme,
46
+ status INTEGER, fidelity TEXT, hash,
46
47
  attributes, tokens, tokens_full, refs, write_count,
47
48
  created_at, updated_at
48
49
  )
@@ -50,58 +51,62 @@ known_entries (
50
51
 
51
52
  | Column | Purpose |
52
53
  |--------|---------|
53
- | `path` | Entry identity. Bare paths (`src/app.js`) or URIs (`known://auth`) |
54
+ | `path` | Entry identity. Bare paths (`src/app.js`) or URIs (`known://auth`). Max 2048 chars. |
54
55
  | `body` | Tag body text. File content, tool output, skill docs. |
55
56
  | `attributes` | Tag attributes as JSON. Handler-private workspace. `CHECK (json_valid)` |
56
57
  | `scheme` | Generated from path via `schemeOf()`. Drives dispatch and view routing |
57
- | `state` | Lifecycle stage. Determines model visibility |
58
+ | `status` | HTTP status code (200, 202, 400, 413, etc.) |
59
+ | `fidelity` | Visibility level: full, summary, index, archive |
58
60
  | `hash` | SHA-256 for file change detection |
59
- | `tokens` | Context cost at current state |
61
+ | `tokens` | Display-only token count at current fidelity. NEVER used for budget. |
60
62
  | `tokens_full` | Cost of raw body at full fidelity |
61
63
  | `turn` | Freshness — when was this entry last touched |
62
64
 
63
- ### 1.2 Schemes & States
65
+ ### 1.2 Schemes, Status & Fidelity
64
66
 
65
- Paths use URI scheme syntax. Bare paths (no `://`) are files.
66
-
67
- **Files** (`scheme IS NULL`):
68
-
69
- | State | Model sees |
70
- |-------|-----------|
71
- | `full` | File content in code fence |
72
- | `index` | Path listed in File Index |
73
- | `stored` | Invisible, retrievable via `<get>` |
74
-
75
- **Knowledge** (`known://`, `unknown://`):
76
-
77
- | State | Model sees |
78
- |-------|-----------|
79
- | `full` | Key — value in bullet list |
80
- | `stored` | Key listed, no value |
81
-
82
- **Tool results** (`set://`, `sh://`, `env://`, `rm://`, `ask_user://`,
83
- `mv://`, `cp://`, `search://`, `get://`, `store://`):
84
-
85
- All start at `full` state when recorded. Handlers set the final state:
86
- `proposed`, `pass`, `rejected`, `error`, `pattern`, `read`, `stored`, `info`.
87
-
88
- **Skills** (`skill://`): `full` or `stored`. Rendered in system message.
67
+ Every entry has two independent dimensions: **status** (HTTP integer)
68
+ and **fidelity** (visibility level). These are separate concerns.
89
69
 
90
- **Tools** (`tool://`): `full`, `model_visible = 0`. Internal plugin metadata.
70
+ **Status** (lifecycle): 200 (OK), 202 (proposed), 400 (bad request),
71
+ 404 (not found), 409 (conflict), 413 (too large), 499 (aborted),
72
+ 500 (error).
91
73
 
92
- **URLs** (`http://`, `https://`): `full`, `summary`, `stored`.
74
+ **Fidelity** (visibility): `full` (body visible), `summary`
75
+ (model-authored summary), `index` (path only), `archive` (invisible,
76
+ retrievable via `<get>`).
93
77
 
94
- **Structural** (`summarize://`, `update://`): Status signals.
95
-
96
- **Audit** (`system://`, `prompt://`, `ask://`, `act://`, `progress://`,
97
- `reasoning://`, `model://`, `error://`, `user://`, `assistant://`,
98
- `content://`): `info` state, `model_visible = 0` (hidden from model).
99
-
100
- ### 1.3 State Validation
78
+ Paths use URI scheme syntax. Bare paths (no `://`) are files.
101
79
 
102
- The `schemes` table is a bootstrap registry — 30 rows of static config.
103
- INSERT/UPDATE triggers validate state against `schemes.valid_states`.
104
- Plugins cannot bypass this (circular dependency prevents schemes as entries).
80
+ Every entry plays one of four roles:
81
+
82
+ | Role | Category | Section | Description |
83
+ |------|----------|---------|-------------|
84
+ | **Data** | `data` | `<knowns>` | Entries the model works with — persistent state |
85
+ | **Logging** | `logging` | `<performed>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
86
+ | **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
87
+ | **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
88
+
89
+ `logging` is the default category. Plugins opt into `data` explicitly.
90
+
91
+ | Scheme | Category | Description |
92
+ |--------|----------|-------------|
93
+ | `NULL` (bare path) | data | File content. JOINs via `COALESCE(scheme, 'file')`. `file://` prefix stripped by hedberg. |
94
+ | `known://` | data | Model-registered knowledge. One fact per entry. |
95
+ | `skill://` | data | Skill docs. Rendered in system message. |
96
+ | `http://`, `https://` | data | Web content. |
97
+ | `unknown://` | unknown | Unresolved questions. |
98
+ | `prompt://` | prompt | User prompt with `mode` attribute (`ask`/`act`). |
99
+ | `set://`, `get://`, `sh://`, `env://`, `rm://`, `mv://`, `cp://`, `ask_user://`, `search://` | logging | Tool result entries. |
100
+ | `summarize://`, `update://` | logging | Lifecycle signals. |
101
+ | `tool://` | audit | Internal plugin metadata. `model_visible = 0`. |
102
+ | `system://`, `reasoning://`, `model://`, `error://`, `user://`, `assistant://`, `content://` | audit | Audit entries. `model_visible = 0`. |
103
+
104
+ ### 1.3 Scheme Registry
105
+
106
+ The `schemes` table is a bootstrap registry — static rows of
107
+ `(name, model_visible, category)`. Plugins register their scheme
108
+ via `core.registerScheme()` in the constructor. The `model_visible`
109
+ flag controls whether entries appear in `v_model_context`.
105
110
 
106
111
  ### 1.4 UPSERT Semantics
107
112
 
@@ -117,13 +122,21 @@ The K/V store is the memory. Relational tables are the skeleton.
117
122
  ```sql
118
123
  projects (id, name UNIQUE, project_root, config_path, created_at)
119
124
  models (id, alias UNIQUE, actual, context_length, created_at)
120
- runs (id, project_id, parent_run_id, model, alias UNIQUE, status,
121
- temperature, persona, context_limit, next_turn, created_at)
122
- turns (id, run_id, sequence, prompt_tokens, completion_tokens,
123
- total_tokens, cost, created_at)
125
+ runs (id, project_id, parent_run_id, model, alias UNIQUE,
126
+ status INTEGER, temperature, persona, context_limit,
127
+ next_turn, next_loop, created_at)
128
+ loops (id, run_id, sequence, mode, model, prompt, status INTEGER,
129
+ config JSON, result JSON, created_at)
130
+ turns (id, run_id, loop_id, sequence, context_tokens,
131
+ reasoning_content, prompt_tokens, cached_tokens,
132
+ completion_tokens, reasoning_tokens, total_tokens, cost,
133
+ created_at)
124
134
 
125
135
  file_constraints (id, project_id, pattern, visibility, created_at)
126
- prompt_queue (id, run_id, mode, model, prompt, config, status, result)
136
+ -- Project-level config. NOT tool dispatch. See §2.3.
137
+ turn_context (id, run_id, loop_id, turn, ordinal, path, scheme,
138
+ status, fidelity, body, tokens, attributes,
139
+ category, source_turn)
127
140
  rpc_log (id, project_id, method, rpc_id, params, result, error)
128
141
  ```
129
142
 
@@ -136,19 +149,39 @@ client picks for every run.
136
149
 
137
150
  ### 2.1 Run State Machine
138
151
 
152
+ All status fields are HTTP integer codes:
153
+
139
154
  ```
140
- queued → running → proposed → running → completed
141
- → completed
142
- → failed → running
143
- → aborted → running
155
+ 100 (queued) → 200 (running) → 202 (proposed) → 200 (running) → 200 (completed)
156
+ → 200 (completed)
157
+ → 500 (failed) → 200 (running)
158
+ → 499 (aborted) → 200 (running)
144
159
  ```
145
160
 
146
161
  All terminal states allow transition back to `running`. Runs are long-lived.
147
162
 
148
- ### 2.2 Prompt Queue
163
+ ### 2.2 Loops Table
164
+
165
+ The loops table IS the prompt queue. Each `ask`/`act` creates a loop.
166
+ FIFO per run (ordered by sequence). One active at a time. Abort stops
167
+ the current loop; pending loops survive. Projects > runs > loops > turns.
168
+
169
+ ### 2.3 File Constraints
170
+
171
+ The `file_constraints` table is project-level configuration — it
172
+ defines which files a project cares about. This is backbone, not tool
173
+ dispatch. Constraints have three visibilities: `active` (promoted to
174
+ full), `readonly` (promoted but not editable), `ignore` (demoted).
175
+
176
+ **Boundary:** Setting a constraint (`File.setConstraint`) is a
177
+ project-config write. Promoting/demoting the matching entries is tool
178
+ dispatch that goes through the handler chain with budget enforcement.
179
+ These are separate operations: constraint persists across runs, entry
180
+ promotion is scoped to a run and subject to the same budget rules as
181
+ a model `<get>`.
149
182
 
150
- All prompts flow through `prompt_queue`. FIFO per run. One active at a time.
151
- Abort stops the current prompt; pending prompts survive.
183
+ `store` RPC manages constraints directly — it is not a model tool.
184
+ `get` RPC with `persist` sets the constraint AND dispatches promotion.
152
185
 
153
186
  ---
154
187
 
@@ -169,13 +202,17 @@ object is the same shape at every tier.
169
202
 
170
203
  | Method | Model | Client | Plugin |
171
204
  |--------|-------|--------|--------|
172
- | `get`, `set`, `rm`, `mv`, `cp`, `sh`, `env`, `store` | ✓ | ✓ | ✓ |
205
+ | `get`, `set`, `rm`, `mv`, `cp`, `sh`, `env`, `search` | ✓ | ✓ | ✓ |
173
206
  | `known`, `unknown`, `ask_user`, `summarize`, `update` | ✓ | ✓ | ✓ |
174
207
  | `ask`, `act`, `resolve`, `abort`, `startRun` | — | ✓ | ✓ |
175
208
  | `getRuns`, `getModels`, `getEntries` | — | ✓ | ✓ |
176
209
  | `on()`, `filter()`, db/store access | — | — | ✓ |
177
210
 
178
- Model tier restrictions enforced by mode (ask removes act-only tools).
211
+ Model tier restrictions enforced by unified `resolveForLoop(mode, flags)`.
212
+ Ask mode excludes `sh`. Flags: `noInteraction` excludes `ask_user`,
213
+ `noWeb` excludes `search`, `noProposals` excludes `ask_user`/`env`/`sh`.
214
+ 13 model tools: get, set, known, unknown, env, sh, rm, cp, mv, search,
215
+ summarize, update, ask_user.
179
216
  Client tier requires project init. Plugin tier has no restrictions.
180
217
 
181
218
  ### 3.2 Dispatch Path
@@ -188,6 +225,14 @@ Client: JSON-RPC → { method, params } → #record() → dispatch(scheme, en
188
225
  Plugin: rummy.rm({ path }) → #record() → dispatch(scheme, entry, rummy)
189
226
  ```
190
227
 
228
+ **Lifecycle/action split:** Commands are classified as lifecycle signals
229
+ (`summarize`, `update`, `unknown`, `known`) or action commands (everything
230
+ else). Lifecycle signals always dispatch — they are state declarations that
231
+ cannot be 409'd by sequential dispatch. Action commands dispatch sequentially;
232
+ a 202 proposal or error aborts subsequent actions. If the model sends
233
+ `<summarize>` but actions in the same turn failed, the summarize is
234
+ overridden to an update (the model's assertion that it's done is false).
235
+
191
236
  ### 3.3 Plugin Convention
192
237
 
193
238
  A plugin is an instantiated class. The class name matches the file name.
@@ -247,35 +292,35 @@ Two messages per turn. System = stable truth. User = active task.
247
292
  [persona/]
248
293
  [skills/]
249
294
  [/instructions]
250
- <knowledge>
295
+ <knowns>
251
296
  ...entries sorted by fidelity (index, summary, full), then by scheme
252
- </knowledge>
297
+ </knowns>
253
298
  <previous>
254
- (pre-loop user prompt, model responses, agent warnings, and tools used, in order)
299
+ (pre-loop entries, each with turn, status, summary, fidelity, tokens)
255
300
  </previous>
256
- <unknowns></unknowns>
301
+ <unknowns>
302
+ (open questions, each with path, turn, fidelity, tokens)
303
+ </unknowns>
257
304
  [/system]
258
305
  [user]
259
- <current>
260
- (current loop model responses, agent warnings, and tools used, in order)
261
- </current>
262
- <progress>the above actions have been performed on this user prompt:</progress>
263
- <ask tools="..." warn="...">user prompt</ask>
264
- — OR —
265
- <act tools="...">user prompt</act>
306
+ <performed>
307
+ (current loop entries, each with turn, status, summary, fidelity, tokens)
308
+ </performed>
309
+ <progress turn="N">token budget, fidelity stats, causal bridge</progress>
310
+ <prompt mode="ask|act" tools="...">user prompt</prompt>
266
311
  [/user]
267
312
  ```
268
313
 
269
314
  **System** contains everything the model needs to know.
270
315
  **User** contains everything the model needs to do.
271
316
 
272
- The `<ask>`/`<act>` tag is present on every turn — first turn and
317
+ The `<prompt>` tag is present on every turn — first turn and
273
318
  continuations alike. The model always sees its task. The active prompt
274
319
  is extracted from its chronological position and placed last for maximum
275
320
  recency. `<progress>` bridges the gap, narrating the causal relationship
276
- between `<current>` (the work) and the prompt (the cause).
321
+ between `<performed>` (the work) and the prompt (the cause).
277
322
 
278
- ### 4.2 Loops, Previous, and Current
323
+ ### 4.2 Loops, Previous, and Performed
279
324
 
280
325
  A **loop** is one `ask` or `act` invocation and all its continuation
281
326
  turns until summarize, fail, or abort.
@@ -285,14 +330,14 @@ responses, tool results, agent warnings — the full chronicle in order.
285
330
  Lives in the system message as established history. Omitted on the
286
331
  first turn of the first loop.
287
332
 
288
- **Current** = the active loop's work so far. Model responses, tool
333
+ **Performed** = the active loop's work so far. Model responses, tool
289
334
  results, agent warnings — in order. Does NOT include the user prompt
290
- (one per loop, extracted to `<ask>`/`<act>`). Lives in the user
335
+ (one per loop, extracted to `<prompt>`). Lives in the user
291
336
  message as immediate context. Empty on the first turn of a loop.
292
337
 
293
338
  When a new prompt arrives on an existing run, the prior loop's
294
- `<current>` content plus its prompt move to `<previous>`. When a loop
295
- continues (next turn), new results append to `<current>`.
339
+ `<performed>` content plus its prompt move to `<previous>`. When a loop
340
+ continues (next turn), new results append to `<performed>`.
296
341
 
297
342
  ### 4.3 Key Entries
298
343
 
@@ -313,7 +358,7 @@ text from body + attributes.
313
358
  Each turn:
314
359
 
315
360
  1. Write `instructions://system` (empty body, attributes = { persona })
316
- 2. Run plugin hooks (`onTurn`) — plugins modify entries before the model sees them
361
+ 2. Emit `turn.started` — plugins write prompt/instructions entries
317
362
  3. Project `instructions://system` → instructions text
318
363
  4. Query `v_model_context` VIEW → visible entries
319
364
  5. Project each entry through its tool's `full`/`summary` projection
@@ -323,23 +368,171 @@ Each turn:
323
368
  - Previous plugin (priority 200) → `<previous>` section
324
369
  - Unknown plugin (priority 300) → `<unknowns>` section
325
370
  8. Invoke `assembly.user` filter chain (empty string as base):
326
- - Current plugin (priority 100) → `<current>` section
371
+ - Performed plugin (priority 100) → `<performed>` section
327
372
  - Progress plugin (priority 200) → `<progress>` section
328
- - Prompt plugin (priority 300) → `<ask>`/`<act>` section
373
+ - Prompt plugin (priority 300) → `<prompt>` section
329
374
  9. Store as `system://N` and `user://N` audit entries
330
375
 
331
- The VIEW determines visibility. State IS fidelity:
376
+ The VIEW determines visibility from `fidelity` and `status`:
332
377
  - `full` → body visible
333
- - `summary` → body visible
378
+ - `summary` → summary visible (model-authored `summary` attribute if set)
334
379
  - `index` → path listed, no content
335
- - `stored` → invisible
336
- - `proposed` → invisible (pending client)
380
+ - `archive` → invisible (retrievable via `<get>`)
381
+
382
+ **Partial read:** `<get path="..." line="N" limit="M"/>` returns lines N through
383
+ N+M−1 of the entry body as the log item without changing fidelity or promoting
384
+ the entry to context. Use after reading `summary` fidelity (which gives line
385
+ numbers via repomap) to target a specific symbol. Single-path only — glob or
386
+ body filter with `line`/`limit` is a 400 error.
387
+ - `status = 202` → invisible (proposed, pending client)
337
388
  - `model_visible = 0` → invisible (audit, tool, instructions)
338
389
 
339
- ### 4.5 progress:// as Entry
390
+ Model controls fidelity via `<set>` attributes: `archive`, `summary`,
391
+ `index`, `full`. The `summary="..."` attribute attaches a description
392
+ (<= 80 chars) that persists across fidelity changes.
393
+
394
+ ### 4.5 Budget Enforcement
395
+
396
+ The model owns its context. The system enforces a hard ceiling and
397
+ provides advisory warnings — it does not automatically manage entries.
398
+
399
+ **Pre-LLM check:** The budget plugin measures `countTokens()` on the
400
+ assembled messages. If assembled tokens exceed `contextSize`, the turn
401
+ returns 413 without calling the LLM. This triggers panic mode (see
402
+ §4.6).
403
+
404
+ **Write-layer gate:** BudgetGuard on KnownStore gates every write
405
+ during dispatch. `upsert()`, `promoteByPattern()`, and
406
+ `updateBodyByPattern()` check token delta against remaining headroom.
407
+ Exceeding the budget throws `BudgetExceeded` — the tool 413s, the
408
+ guard trips, and all subsequent tools in the turn fail.
409
+
410
+ BudgetGuard ceiling = `floor(contextSize × 0.9) − 500`. The 500-token
411
+ buffer below the enforce ceiling absorbs two sources of overhead that
412
+ BudgetGuard cannot see: (a) `#record()`-phase writes that bypass the
413
+ guard (~15 tokens per command), and (b) loop transition overhead —
414
+ when a loop completes and a new one starts, entries shift from
415
+ `<performed>` to `<previous>` format, adding ~200–300 tokens to the
416
+ next assembly. Without this buffer, the base context can accumulate
417
+ to exactly the enforce ceiling, making it impossible for the panic
418
+ loop to start (panic prompt + loop overhead > ceiling).
419
+
420
+ **Exemptions:** `status >= 400` entries (error results), `model_visible
421
+ = 0` entries (audit), `fidelity = "archive"` entries (not in context).
422
+
423
+ **Size gate:** Known entries exceeding 500 tokens are rejected with
424
+ 413, forcing atomic entries.
425
+
426
+ **Advisory warnings** (progress plugin):
427
+ - 50%: "You may free space by lowering the fidelity of entries"
428
+ - 75%: "YOU MUST free space... or the run will fail"
429
+
430
+ **Token math:** `Math.ceil(text.length / RUMMY_TOKEN_DIVISOR)`. One
431
+ formula, one file (`src/agent/tokens.js`), env-configurable. No
432
+ external dependencies. `contextSize` is the ceiling. Over = 413.
433
+ Under = 200. No margins.
434
+
435
+ **Three token measures — never conflate them:**
436
+
437
+ | Measure | Source | Scope | Use |
438
+ |---|---|---|---|
439
+ | SQL entry tokens | `known_entries.tokens` = `ceil(chars / DIVISOR)` | Per entry | Model decision-making: "this entry costs N tokens" |
440
+ | Assembled estimate | `measureMessages(messages)` = sum of entry projections | Full packet | First-turn budget fallback only |
441
+ | Actual API tokens | `turns.context_tokens` = `usage.input_tokens` back-filled from LLM | Per turn | Budget enforcement on turns 2+; ground truth |
442
+
443
+ `budget.enforce` uses the **actual API tokens** (`get_last_context_tokens`) when
444
+ available (turn 2+) and falls back to the assembled estimate on turn 1. The
445
+ estimate can be 3–7× off for XML/JSON-heavy content — do not rely on it for
446
+ anything that matters.
447
+
448
+ **`context_tokens` vs `prompt_tokens` in step telemetry:**
449
+ - `context_tokens` in the step JSON = `turns.context_tokens` for that turn =
450
+ per-turn actual input tokens from the LLM API (e.g. 7900 tokens sent this turn)
451
+ - `prompt_tokens` in the step JSON = `SUM(turns.prompt_tokens)` for the run =
452
+ **cumulative** total across all turns (cost tracking, not a context size)
453
+
454
+ These two will diverge rapidly on any multi-turn run. A run at turn 50 might show
455
+ `context_tokens: 8000` (context under control) and `prompt_tokens: 400000`
456
+ (total input tokens billed across the whole run). They are measuring orthogonal things.
457
+
458
+ ### 4.6 Panic Mode
459
+
460
+ **The invariant.** A panic is only ever triggered because the
461
+ assembled context was under the ceiling — and the new prompt pushed
462
+ it over. The existing context fit; the incoming prompt did not.
463
+ Panic mode replaces that too-large incoming prompt with a small
464
+ panic prompt on the same context. Therefore: the first turn of a
465
+ panic loop cannot 413. If it does, it is a bug.
466
+
467
+ **Trigger.** `TurnExecutor.execute()` assembles the full packet
468
+ (context + incoming prompt) before calling the LLM. If
469
+ `assembledTokens > contextSize`, it returns 413 without calling
470
+ the LLM. `#drainQueue` intercepts this and enters panic mode.
471
+
472
+ **Flow.**
473
+ 1. Complete the failed loop with status 413 (audit trail).
474
+ 2. Enqueue a panic loop (`mode = "panic"`, `noRepo = true`,
475
+ `prompt = panicPrompt`, `panicTarget` in config).
476
+ 3. Re-enqueue the original loop with `panicAttempted: true` in
477
+ its config JSON. This flag persists across drain cycles.
478
+ 4. `continue` — the drain loop claims the panic loop next.
479
+
480
+ After panic completes (model freed enough space), the retry loop
481
+ runs. If the retry also 413s, hard-fail to client. One panic
482
+ attempt per drain cycle — `panicAttempted` is checked both as a
483
+ local variable and on the re-enqueued loop's config.
484
+
485
+ **Panic target.** The model must compress context to below:
486
+
487
+ ```
488
+ panicTarget = MIN(contextSize × 0.75, contextSize − incomingTokens) − cushion
489
+ ```
340
490
 
341
- The continuation prompt is a `progress://N` entry. Plugins can modify its
342
- body before materialization.
491
+ `incomingTokens` is the raw token count of the original prompt.
492
+ `cushion` is a small safety margin (500 tokens) to absorb
493
+ materialization overhead. The target is expressed in materialized
494
+ token units — the same unit the system uses to measure completion
495
+ (see "Two token contexts" below).
496
+
497
+ **Two token contexts.**
498
+
499
+ The model reasons in *per-entry SQL tokens* — the token counts
500
+ visible in `<knowns>` entries. These are the granular unit the model
501
+ uses to decide which entries to target: "this entry is 200 tokens;
502
+ if I archive it, I save 200 tokens."
503
+
504
+ The system makes decisions using *actual API tokens* —
505
+ `turns.context_tokens` back-filled from `usage.input_tokens` after
506
+ each LLM call. SQL token sums do not equal actual API counts because
507
+ projections, assembly overhead, and fidelity transforms alter the
508
+ output; and the SQL estimate (`ceil(chars / DIVISOR)`) can be 3–7×
509
+ off for structured content. **Never use SQL token sums for ceiling or
510
+ budget decisions.** See §4.5 Token Measures for the full breakdown.
511
+
512
+ **Strike system.** After each panic turn, compare
513
+ `result.assembledTokens` (materialized) with `_lastPanicTokens`
514
+ (previous turn's materialized total):
515
+ - Decreased → reset strike counter to 0.
516
+ - Same or increased → increment strikes.
517
+ - 3 consecutive strikes → return 413 to `#drainQueue` → hard-fail.
518
+
519
+ Progress (any reduction) resets the counter. The model has
520
+ unlimited turns as long as it makes progress.
521
+
522
+ **Panic success.** After each turn, if `result.assembledTokens
523
+ <= panicTarget`, the panic loop exits with 200. The retry loop
524
+ then runs with the original prompt on the now-compressed context.
525
+
526
+ **Tool set.** `resolveForLoop("panic")` includes: get, set, known,
527
+ unknown, rm, mv, cp, summarize, update. Excludes: sh, env, search,
528
+ ask_user. `noRepo: true` — no file scanning during panic.
529
+
530
+ **What the model sees.** Turn 1 receives the panic prompt from
531
+ `budget.panicPrompt()`: the assembled token count, the target, and
532
+ the exact number of tokens to free. Turn 2+ receives a continuation
533
+ prompt. The model uses `<set fidelity="archive">`, `<mv
534
+ fidelity="index">`, and similar fidelity operations to free space,
535
+ concluding with `<summarize>` when done or `<update>` while working.
343
536
 
344
537
  ---
345
538
 
@@ -369,22 +562,25 @@ JSON-RPC 2.0 over WebSocket. `discover` returns the live catalog.
369
562
 
370
563
  | Method | Params |
371
564
  |--------|--------|
372
- | `read` | `{ path, run?, persist?, readonly? }` |
565
+ | `get` | `{ path, run, persist?, readonly? }` |
566
+ | `set` | `{ run, path, body?, attributes? }` |
567
+ | `rm` | `{ run, path }` |
568
+ | `mv` | `{ run, path, to }` |
569
+ | `cp` | `{ run, path, to }` |
373
570
  | `store` | `{ path, run?, persist?, ignore?, clear? }` |
374
- | `write` | `{ run, path, body?, state?, attributes? }` |
375
- | `delete` | `{ run, path }` |
376
571
  | `getEntries` | `{ pattern?, body?, run?, limit?, offset? }` |
377
572
 
378
- `persist` creates a project-level file constraint (operator privilege).
379
- Without `persist`, operations dispatch through the handler chain.
573
+ All entry operations dispatch through the handler chain. `persist`
574
+ on `get` also sets a project-level file constraint (operator privilege).
575
+ `store` manages file constraints — not a model tool.
380
576
 
381
577
  #### Runs
382
578
 
383
579
  | Method | Params |
384
580
  |--------|--------|
385
581
  | `startRun` | `{ model, temperature?, persona?, contextLimit? }` |
386
- | `ask` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noContext?, fork? }` |
387
- | `act` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noContext?, fork? }` |
582
+ | `ask` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noRepo?, noInteraction?, noWeb?, fork? }` |
583
+ | `act` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noRepo?, noInteraction?, noWeb?, fork? }` |
388
584
  | `run/resolve` | `{ run, resolution: { path, action, output? } }` |
389
585
  | `run/abort` | `{ run }` |
390
586
  | `run/rename` | `{ run, name }` |
@@ -392,6 +588,10 @@ Without `persist`, operations dispatch through the handler chain.
392
588
  | `run/config` | `{ run, temperature?, persona?, contextLimit?, model? }` |
393
589
 
394
590
  `model` is required on `ask`, `act`, and `startRun`. No default.
591
+ `noRepo` disables default project/repo file scanning (files can still
592
+ be added explicitly by the client).
593
+ `noInteraction` removes `ask_user` from the tool list.
594
+ `noWeb` removes `search` from the tool list.
395
595
 
396
596
  #### Queries
397
597
 
@@ -445,7 +645,80 @@ Each plugin has its own README at `src/plugins/{name}/README.md`.
445
645
 
446
646
  ---
447
647
 
448
- ## 7. Hedberg Editing Syntax
648
+ ## 7. Tool Documentation Design
649
+
650
+ Tool docs are the most carefully designed text in rummy. Every line
651
+ simultaneously teaches syntax, implies workflow priority, demonstrates
652
+ pattern capabilities, and constrains misuse. Each letter earns its place.
653
+
654
+ ### Principles
655
+
656
+ **Show, don't tell.** Examples ARE the documentation. A model learns
657
+ `<get path="known://*">auth</get>` from seeing it, not from being told
658
+ "you can filter known entries by keyword." Examples are ordered from
659
+ simple to powerful — weak models learn from examples 1-2, strong models
660
+ pick up the pattern from example 3.
661
+
662
+ **Lifecycle continuity.** Examples weave stories across tools. The get
663
+ docs end with `<set path="..." fidelity="index"/>`. The known docs
664
+ reference `<get path="known://*">keyword</get>` for recall and
665
+ `<set path="known://..." archive/>` for archiving. The unknown docs
666
+ reference `<get/>` for investigation and `<rm/>` for cleanup. A model
667
+ reading the full tool docs encounters a coherent workflow:
668
+ discover → load → reason → edit → archive → recall.
669
+
670
+ **RFC 2119 semantics.** Constraint bullets use YOU MUST, YOU MUST NOT,
671
+ YOU SHOULD, YOU MAY from RFC 2119. Every LLM has extensive pretraining
672
+ on RFC documents where these keywords carry precise semantic weight.
673
+ MUST is absolute. SHOULD is strong advisory. MAY is permissive. This
674
+ is not decorative — it's leveraging the model's existing understanding
675
+ of requirement levels.
676
+
677
+ **Consistent structure.** Every tool doc follows: header (syntax), 2+
678
+ examples, 2+ constraint bullets. Inconsistent formatting reads as
679
+ inconsistent importance. A tool with 5 examples and dense bullets feels
680
+ complex; a tool with 1 line feels disposable. Both are wrong — every
681
+ tool is equally real, and each doc is proportional to the tool's surface area.
682
+
683
+ ### Format
684
+
685
+ Tool docs live in `*Doc.js` files as annotated line arrays:
686
+
687
+ ```js
688
+ const LINES = [
689
+ ["* Body text filters results by content match",
690
+ "Generalizes examples 2-3. Body = filter, not just path."],
691
+ ];
692
+ export default LINES.map(([text]) => text).join("\n");
693
+ ```
694
+
695
+ The first element is the model-facing text. The second is the rationale —
696
+ visible only in source. Changing any line requires reading all rationales
697
+ first. This prevents well-intentioned edits from breaking subtle behavioral
698
+ guarantees that adjacent lines depend on.
699
+
700
+ ### Tool Display Order
701
+
702
+ Tools are presented in the order gather → reason → act → communicate. Position in
703
+ the list implies priority. `get` is first. `ask_user` is last. The
704
+ order is defined in `ToolRegistry.TOOL_ORDER` and applied by
705
+ `resolveForLoop()`. The same method handles all tool exclusions —
706
+ mode restrictions, `noInteraction`, `noWeb`, `noProposals` — through
707
+ one unified mechanism.
708
+
709
+ ### Pattern Distribution
710
+
711
+ Hedbergian pattern matching (globs, body filters, preview) is taught
712
+ across multiple tools, not concentrated in one. `get` shows content
713
+ filtering. `cp` shows glob batch operations. `rm` shows preview safety.
714
+ Each tool reinforces the pattern vocabulary from a different angle.
715
+ A model that sees `path="known://*"` in get, `path="known://plan_*"` in
716
+ cp, and `path="known://temp_*" preview` in rm learns that patterns
717
+ are universal — not a feature of any single tool.
718
+
719
+ ---
720
+
721
+ ## 8. Hedberg Editing Syntax
449
722
 
450
723
  The model picks its preferred edit format. The parser understands all of them:
451
724
 
@@ -460,26 +733,36 @@ The model picks its preferred edit format. The parser understands all of them:
460
733
 
461
734
  ---
462
735
 
463
- ## 8. Response Healing
736
+ ## 9. Response Healing
464
737
 
465
- The server never throws on model output. Recovery order:
738
+ The server never throws on model output. "Model behavior" is never an
739
+ acceptable explanation. Recovery order:
466
740
 
467
741
  1. Can we recover? Extract the data and continue.
468
742
  2. Can we warn? Log structured warnings.
469
743
  3. Did our structure cause this? Check formatting, prompts.
470
- 4. Model drift is the LAST answer.
471
744
 
472
745
  Termination protocol:
473
746
  - `<summarize>` → run terminates
747
+ - `<summarize>` + failed actions → overridden to `<update>` (continue)
474
748
  - `<update>` → run continues
475
- - Both → summarize wins
476
- - Neither + tools → stall counter
749
+ - Both → update wins (if the model can't decide, it's not done)
750
+ - Neither + investigation tools → stall counter (RUMMY_MAX_STALLS)
751
+ - Neither + action-only tools → healed to summarize
477
752
  - Neither + plain text → healed to summarize
478
- - Repeated commands → loop detection
753
+ - Repeated commands → cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
754
+ - Repeated update text → stall (RUMMY_MAX_UPDATE_REPEATS)
755
+
756
+ Format normalization:
757
+ - Gemma `` ```tool_code `` fences → stripped before parsing
758
+ - Qwen `<|tool_call>` format → normalized to XML
759
+ - OpenAI function_call JSON → normalized to XML
760
+ - Mistral `[TOOL_CALLS]` → normalized to XML
761
+ - Sed alternate delimiters (`s|old|new|`) → parsed like `s/old/new/`
479
762
 
480
763
  ---
481
764
 
482
- ## 9. Testing
765
+ ## 10. Testing
483
766
 
484
767
  | Tier | Location | LLM? |
485
768
  |------|----------|------|
@@ -493,12 +776,12 @@ E2E tests must NEVER mock the LLM. Environment cascade:
493
776
 
494
777
  ---
495
778
 
496
- ## 10. SQL Functions
779
+ ## 11. SQL Functions
497
780
 
498
781
  | Function | Purpose |
499
782
  |----------|---------|
500
783
  | `schemeOf(path)` | Extract URI scheme |
501
- | `countTokens(text)` | Token count (tiktoken o200k_base, `ceil(len/4)` fallback) |
784
+ | `countTokens(text)` | Token count (`ceil(len / RUMMY_TOKEN_DIVISOR)`) |
502
785
  | `hedmatch(pattern, string)` | Full-string pattern match (paths, equality) |
503
786
  | `hedsearch(pattern, string)` | Substring pattern search (content filtering) |
504
787
  | `hedreplace(pattern, replacement, string)` | Pattern-based replacement |
@@ -508,15 +791,66 @@ See [PLUGINS.md](PLUGINS.md) for the hedberg pattern type reference.
508
791
 
509
792
  ---
510
793
 
511
- ## 11. Configuration
794
+ ## 13. Debugging: E2E and Benchmark Results
795
+
796
+ ### E2E test failures
797
+
798
+ E2E tests use a temp DB at `/tmp/rummy_test_<timestamp>_<random>.db` (cleaned up after the run).
799
+ On failure, `AuditClient.assertRun` calls `dumpRun`, which prints a full turn-by-turn audit
800
+ to stdout. That output is in the background task log:
801
+
802
+ ```
803
+ /tmp/claude-1000/-home-hyzen-repo-rummy-main/<session-id>/tasks/<task-id>.output
804
+ ```
805
+
806
+ If oversized, the harness saves to:
807
+ ```
808
+ /home/hyzen/.claude/projects/-home-hyzen-repo-rummy-main/<session-id>/tool-results/<id>.txt
809
+ ```
810
+
811
+ The dump format is: `scheme:state path {attributes}\n body (120 chars)` grouped by turn.
812
+
813
+ Key things to look for in a dump:
814
+ - **202**: unresolved proposals — model issued `<sh>`, `<rm>`, or `<mv>` that needs approval
815
+ - **413**: budget overflow — assembled context exceeded ceiling before LLM call
816
+ - **BudgetGuard errors**: per-tool rejections mid-turn (`Budget exceeded: N tokens requested`)
817
+ - **`<sh>` in act/panic mode**: model fell back to shell when blocked (doc/prompt gap)
818
+ - Loop sequence: look for `mode` in `instructions://system` attrs to see which loop type ran
819
+
820
+ ### MAB benchmark
821
+
822
+ Results live in `test/mab/results/<ISO-timestamp>/mab.db`. The latest run is in the most recent directory.
823
+
824
+ ```js
825
+ // Query a MAB result DB directly:
826
+ import { DatabaseSync } from 'node:sqlite';
827
+ const db = new DatabaseSync('test/mab/results/<timestamp>/mab.db');
828
+ db.prepare('SELECT * FROM questions').all(); // all questions + scores
829
+ db.prepare('SELECT * FROM runs').all(); // individual model runs
830
+ ```
831
+
832
+ Run with: `npm run test:mab`
833
+
834
+ ### LME benchmark
835
+
836
+ Results live in `test/lme/results/<ISO-timestamp>/lme.db`. Same structure as the MAB results.
837
+
838
+ Run with: `npm run test:lme`
839
+
840
+ ---
841
+
842
+ ## 12. Configuration
512
843
 
513
844
  ```env
514
845
  RUMMY_HOME=~/.rummy
515
- RUMMY_MAX_TURNS=15
846
+ RUMMY_TOKEN_DIVISOR=2
847
+ RUMMY_MAX_TURNS=99
516
848
  RUMMY_MAX_STALLS=3
517
- RUMMY_MAX_REPETITIONS=3
849
+ RUMMY_MIN_CYCLES=3
850
+ RUMMY_MAX_CYCLE_PERIOD=4
851
+ RUMMY_MAX_UPDATE_REPEATS=3
518
852
  RUMMY_RETENTION_DAYS=31
519
- RUMMY_TEMPERATURE=0.7
853
+ RUMMY_TEMPERATURE=0.5
520
854
  RUMMY_DEBUG=false
521
855
  ```
522
856