@possumtech/rummy 0.2.8 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +13 -2
- package/EXCEPTIONS.md +46 -0
- package/PLUGINS.md +422 -188
- package/SPEC.md +440 -106
- package/migrations/001_initial_schema.sql +5 -3
- package/package.json +17 -5
- package/service.js +5 -3
- package/src/agent/AgentLoop.js +252 -55
- package/src/agent/ContextAssembler.js +20 -4
- package/src/agent/KnownStore.js +82 -25
- package/src/agent/ProjectAgent.js +4 -1
- package/src/agent/ResponseHealer.js +86 -32
- package/src/agent/TurnExecutor.js +542 -207
- package/src/agent/XmlParser.js +77 -41
- package/src/agent/known_store.sql +68 -4
- package/src/agent/schemes.sql +3 -0
- package/src/agent/tokens.js +7 -21
- package/src/agent/turns.sql +15 -1
- package/src/hooks/HookRegistry.js +7 -0
- package/src/hooks/Hooks.js +15 -0
- package/src/hooks/PluginContext.js +14 -1
- package/src/hooks/RummyContext.js +16 -4
- package/src/hooks/ToolRegistry.js +77 -19
- package/src/llm/LlmProvider.js +27 -8
- package/src/llm/OpenAiClient.js +20 -0
- package/src/llm/OpenRouterClient.js +24 -2
- package/src/llm/XaiClient.js +47 -2
- package/src/plugins/ask_user/README.md +4 -4
- package/src/plugins/ask_user/ask_user.js +5 -5
- package/src/plugins/ask_user/ask_userDoc.js +29 -0
- package/src/plugins/budget/README.md +31 -0
- package/src/plugins/budget/budget.js +55 -0
- package/src/plugins/cp/README.md +5 -4
- package/src/plugins/cp/cp.js +10 -6
- package/src/plugins/cp/cpDoc.js +29 -0
- package/src/plugins/engine/engine.sql +1 -8
- package/src/plugins/engine/turn_context.sql +4 -9
- package/src/plugins/env/README.md +3 -4
- package/src/plugins/env/env.js +5 -5
- package/src/plugins/env/envDoc.js +29 -0
- package/src/plugins/file/README.md +9 -12
- package/src/plugins/file/file.js +34 -35
- package/src/plugins/get/README.md +2 -2
- package/src/plugins/get/get.js +77 -6
- package/src/plugins/get/getDoc.js +51 -0
- package/src/plugins/hedberg/hedberg.js +2 -1
- package/src/plugins/hedberg/matcher.js +10 -29
- package/src/plugins/hedberg/normalize.js +28 -0
- package/src/plugins/hedberg/patterns.js +25 -27
- package/src/plugins/hedberg/sed.js +17 -10
- package/src/plugins/index.js +66 -14
- package/src/plugins/instructions/README.md +6 -2
- package/src/plugins/instructions/instructions.js +20 -4
- package/src/plugins/instructions/preamble.md +19 -5
- package/src/plugins/known/README.md +10 -7
- package/src/plugins/known/known.js +23 -17
- package/src/plugins/known/knownDoc.js +34 -0
- package/src/plugins/mv/README.md +5 -4
- package/src/plugins/mv/mv.js +27 -6
- package/src/plugins/mv/mvDoc.js +45 -0
- package/src/plugins/performed/README.md +15 -0
- package/src/plugins/performed/performed.js +45 -0
- package/src/plugins/persona/persona.js +78 -0
- package/src/plugins/previous/README.md +3 -2
- package/src/plugins/previous/previous.js +33 -24
- package/src/plugins/progress/README.md +1 -2
- package/src/plugins/progress/progress.js +33 -21
- package/src/plugins/prompt/README.md +5 -5
- package/src/plugins/prompt/prompt.js +15 -17
- package/src/plugins/rm/README.md +4 -4
- package/src/plugins/rm/rm.js +32 -20
- package/src/plugins/rm/rmDoc.js +30 -0
- package/src/plugins/rpc/README.md +15 -28
- package/src/plugins/rpc/rpc.js +42 -77
- package/src/plugins/set/README.md +13 -12
- package/src/plugins/set/set.js +107 -16
- package/src/plugins/set/setDoc.js +49 -0
- package/src/plugins/sh/README.md +4 -4
- package/src/plugins/sh/sh.js +5 -5
- package/src/plugins/sh/shDoc.js +29 -0
- package/src/plugins/{skills/skills.js → skill/skill.js} +10 -51
- package/src/plugins/summarize/README.md +6 -5
- package/src/plugins/summarize/summarize.js +7 -6
- package/src/plugins/summarize/summarizeDoc.js +33 -0
- package/src/plugins/telemetry/telemetry.js +16 -9
- package/src/plugins/think/README.md +20 -0
- package/src/plugins/think/think.js +5 -0
- package/src/plugins/unknown/README.md +6 -5
- package/src/plugins/unknown/unknown.js +12 -9
- package/src/plugins/unknown/unknownDoc.js +31 -0
- package/src/plugins/update/README.md +3 -8
- package/src/plugins/update/update.js +7 -6
- package/src/plugins/update/updateDoc.js +33 -0
- package/src/server/ClientConnection.js +59 -45
- package/src/server/RpcRegistry.js +52 -4
- package/src/sql/v_model_context.sql +10 -25
- package/src/plugins/ask_user/docs.md +0 -2
- package/src/plugins/cp/docs.md +0 -2
- package/src/plugins/current/README.md +0 -14
- package/src/plugins/current/current.js +0 -47
- package/src/plugins/env/docs.md +0 -4
- package/src/plugins/get/docs.md +0 -10
- package/src/plugins/known/docs.md +0 -3
- package/src/plugins/mv/docs.md +0 -2
- package/src/plugins/rm/docs.md +0 -6
- package/src/plugins/set/docs.md +0 -6
- package/src/plugins/sh/docs.md +0 -2
- package/src/plugins/skills/README.md +0 -25
- package/src/plugins/store/README.md +0 -20
- package/src/plugins/store/docs.md +0 -6
- package/src/plugins/store/store.js +0 -63
- package/src/plugins/summarize/docs.md +0 -4
- package/src/plugins/unknown/docs.md +0 -5
- package/src/plugins/update/docs.md +0 -4
package/SPEC.md
CHANGED
|
@@ -15,8 +15,8 @@ that thread a value through subscribers in priority order).
|
|
|
15
15
|
|
|
16
16
|
**Every `<tag>` the model sees is a plugin.** The `<known>` section
|
|
17
17
|
of the system message is rendered by the known plugin. The `<progress>`
|
|
18
|
-
section is rendered by the progress plugin. The `<
|
|
19
|
-
by the prompt plugin. No monolithic assembler decides what goes where.
|
|
18
|
+
section is rendered by the progress plugin. The `<prompt>` tag is
|
|
19
|
+
rendered by the prompt plugin. No monolithic assembler decides what goes where.
|
|
20
20
|
Each plugin filters for its own data from the shared row set, renders
|
|
21
21
|
its section, and returns.
|
|
22
22
|
|
|
@@ -42,7 +42,8 @@ body, attributes, and state.
|
|
|
42
42
|
|
|
43
43
|
```sql
|
|
44
44
|
known_entries (
|
|
45
|
-
id, run_id, turn, path, body, scheme,
|
|
45
|
+
id, run_id, loop_id, turn, path, body, scheme,
|
|
46
|
+
status INTEGER, fidelity TEXT, hash,
|
|
46
47
|
attributes, tokens, tokens_full, refs, write_count,
|
|
47
48
|
created_at, updated_at
|
|
48
49
|
)
|
|
@@ -50,58 +51,62 @@ known_entries (
|
|
|
50
51
|
|
|
51
52
|
| Column | Purpose |
|
|
52
53
|
|--------|---------|
|
|
53
|
-
| `path` | Entry identity. Bare paths (`src/app.js`) or URIs (`known://auth`) |
|
|
54
|
+
| `path` | Entry identity. Bare paths (`src/app.js`) or URIs (`known://auth`). Max 2048 chars. |
|
|
54
55
|
| `body` | Tag body text. File content, tool output, skill docs. |
|
|
55
56
|
| `attributes` | Tag attributes as JSON. Handler-private workspace. `CHECK (json_valid)` |
|
|
56
57
|
| `scheme` | Generated from path via `schemeOf()`. Drives dispatch and view routing |
|
|
57
|
-
| `
|
|
58
|
+
| `status` | HTTP status code (200, 202, 400, 413, etc.) |
|
|
59
|
+
| `fidelity` | Visibility level: full, summary, index, archive |
|
|
58
60
|
| `hash` | SHA-256 for file change detection |
|
|
59
|
-
| `tokens` |
|
|
61
|
+
| `tokens` | Display-only token count at current fidelity. NEVER used for budget. |
|
|
60
62
|
| `tokens_full` | Cost of raw body at full fidelity |
|
|
61
63
|
| `turn` | Freshness — when was this entry last touched |
|
|
62
64
|
|
|
63
|
-
### 1.2 Schemes &
|
|
65
|
+
### 1.2 Schemes, Status & Fidelity
|
|
64
66
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
**Files** (`scheme IS NULL`):
|
|
68
|
-
|
|
69
|
-
| State | Model sees |
|
|
70
|
-
|-------|-----------|
|
|
71
|
-
| `full` | File content in code fence |
|
|
72
|
-
| `index` | Path listed in File Index |
|
|
73
|
-
| `stored` | Invisible, retrievable via `<get>` |
|
|
74
|
-
|
|
75
|
-
**Knowledge** (`known://`, `unknown://`):
|
|
76
|
-
|
|
77
|
-
| State | Model sees |
|
|
78
|
-
|-------|-----------|
|
|
79
|
-
| `full` | Key — value in bullet list |
|
|
80
|
-
| `stored` | Key listed, no value |
|
|
81
|
-
|
|
82
|
-
**Tool results** (`set://`, `sh://`, `env://`, `rm://`, `ask_user://`,
|
|
83
|
-
`mv://`, `cp://`, `search://`, `get://`, `store://`):
|
|
84
|
-
|
|
85
|
-
All start at `full` state when recorded. Handlers set the final state:
|
|
86
|
-
`proposed`, `pass`, `rejected`, `error`, `pattern`, `read`, `stored`, `info`.
|
|
87
|
-
|
|
88
|
-
**Skills** (`skill://`): `full` or `stored`. Rendered in system message.
|
|
67
|
+
Every entry has two independent dimensions: **status** (HTTP integer)
|
|
68
|
+
and **fidelity** (visibility level). These are separate concerns.
|
|
89
69
|
|
|
90
|
-
**
|
|
70
|
+
**Status** (lifecycle): 200 (OK), 202 (proposed), 400 (bad request),
|
|
71
|
+
404 (not found), 409 (conflict), 413 (too large), 499 (aborted),
|
|
72
|
+
500 (error).
|
|
91
73
|
|
|
92
|
-
**
|
|
74
|
+
**Fidelity** (visibility): `full` (body visible), `summary`
|
|
75
|
+
(model-authored summary), `index` (path only), `archive` (invisible,
|
|
76
|
+
retrievable via `<get>`).
|
|
93
77
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
**Audit** (`system://`, `prompt://`, `ask://`, `act://`, `progress://`,
|
|
97
|
-
`reasoning://`, `model://`, `error://`, `user://`, `assistant://`,
|
|
98
|
-
`content://`): `info` state, `model_visible = 0` (hidden from model).
|
|
99
|
-
|
|
100
|
-
### 1.3 State Validation
|
|
78
|
+
Paths use URI scheme syntax. Bare paths (no `://`) are files.
|
|
101
79
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
80
|
+
Every entry plays one of four roles:
|
|
81
|
+
|
|
82
|
+
| Role | Category | Section | Description |
|
|
83
|
+
|------|----------|---------|-------------|
|
|
84
|
+
| **Data** | `data` | `<knowns>` | Entries the model works with — persistent state |
|
|
85
|
+
| **Logging** | `logging` | `<performed>`/`<previous>` | Records of what happened — tool results, lifecycle signals |
|
|
86
|
+
| **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
|
|
87
|
+
| **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
|
|
88
|
+
|
|
89
|
+
`logging` is the default category. Plugins opt into `data` explicitly.
|
|
90
|
+
|
|
91
|
+
| Scheme | Category | Description |
|
|
92
|
+
|--------|----------|-------------|
|
|
93
|
+
| `NULL` (bare path) | data | File content. JOINs via `COALESCE(scheme, 'file')`. `file://` prefix stripped by hedberg. |
|
|
94
|
+
| `known://` | data | Model-registered knowledge. One fact per entry. |
|
|
95
|
+
| `skill://` | data | Skill docs. Rendered in system message. |
|
|
96
|
+
| `http://`, `https://` | data | Web content. |
|
|
97
|
+
| `unknown://` | unknown | Unresolved questions. |
|
|
98
|
+
| `prompt://` | prompt | User prompt with `mode` attribute (`ask`/`act`). |
|
|
99
|
+
| `set://`, `get://`, `sh://`, `env://`, `rm://`, `mv://`, `cp://`, `ask_user://`, `search://` | logging | Tool result entries. |
|
|
100
|
+
| `summarize://`, `update://` | logging | Lifecycle signals. |
|
|
101
|
+
| `tool://` | audit | Internal plugin metadata. `model_visible = 0`. |
|
|
102
|
+
| `system://`, `reasoning://`, `model://`, `error://`, `user://`, `assistant://`, `content://` | audit | Audit entries. `model_visible = 0`. |
|
|
103
|
+
|
|
104
|
+
### 1.3 Scheme Registry
|
|
105
|
+
|
|
106
|
+
The `schemes` table is a bootstrap registry — static rows of
|
|
107
|
+
`(name, model_visible, category)`. Plugins register their scheme
|
|
108
|
+
via `core.registerScheme()` in the constructor. The `model_visible`
|
|
109
|
+
flag controls whether entries appear in `v_model_context`.
|
|
105
110
|
|
|
106
111
|
### 1.4 UPSERT Semantics
|
|
107
112
|
|
|
@@ -117,13 +122,21 @@ The K/V store is the memory. Relational tables are the skeleton.
|
|
|
117
122
|
```sql
|
|
118
123
|
projects (id, name UNIQUE, project_root, config_path, created_at)
|
|
119
124
|
models (id, alias UNIQUE, actual, context_length, created_at)
|
|
120
|
-
runs (id, project_id, parent_run_id, model, alias UNIQUE,
|
|
121
|
-
temperature, persona, context_limit,
|
|
122
|
-
|
|
123
|
-
|
|
125
|
+
runs (id, project_id, parent_run_id, model, alias UNIQUE,
|
|
126
|
+
status INTEGER, temperature, persona, context_limit,
|
|
127
|
+
next_turn, next_loop, created_at)
|
|
128
|
+
loops (id, run_id, sequence, mode, model, prompt, status INTEGER,
|
|
129
|
+
config JSON, result JSON, created_at)
|
|
130
|
+
turns (id, run_id, loop_id, sequence, context_tokens,
|
|
131
|
+
reasoning_content, prompt_tokens, cached_tokens,
|
|
132
|
+
completion_tokens, reasoning_tokens, total_tokens, cost,
|
|
133
|
+
created_at)
|
|
124
134
|
|
|
125
135
|
file_constraints (id, project_id, pattern, visibility, created_at)
|
|
126
|
-
|
|
136
|
+
-- Project-level config. NOT tool dispatch. See §2.3.
|
|
137
|
+
turn_context (id, run_id, loop_id, turn, ordinal, path, scheme,
|
|
138
|
+
status, fidelity, body, tokens, attributes,
|
|
139
|
+
category, source_turn)
|
|
127
140
|
rpc_log (id, project_id, method, rpc_id, params, result, error)
|
|
128
141
|
```
|
|
129
142
|
|
|
@@ -136,19 +149,39 @@ client picks for every run.
|
|
|
136
149
|
|
|
137
150
|
### 2.1 Run State Machine
|
|
138
151
|
|
|
152
|
+
All status fields are HTTP integer codes:
|
|
153
|
+
|
|
139
154
|
```
|
|
140
|
-
queued → running → proposed → running → completed
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
155
|
+
100 (queued) → 200 (running) → 202 (proposed) → 200 (running) → 200 (completed)
|
|
156
|
+
→ 200 (completed)
|
|
157
|
+
→ 500 (failed) → 200 (running)
|
|
158
|
+
→ 499 (aborted) → 200 (running)
|
|
144
159
|
```
|
|
145
160
|
|
|
146
161
|
All terminal states allow transition back to `running`. Runs are long-lived.
|
|
147
162
|
|
|
148
|
-
### 2.2
|
|
163
|
+
### 2.2 Loops Table
|
|
164
|
+
|
|
165
|
+
The loops table IS the prompt queue. Each `ask`/`act` creates a loop.
|
|
166
|
+
FIFO per run (ordered by sequence). One active at a time. Abort stops
|
|
167
|
+
the current loop; pending loops survive. Containment hierarchy: projects > runs > loops > turns.
|
|
168
|
+
|
|
169
|
+
### 2.3 File Constraints
|
|
170
|
+
|
|
171
|
+
The `file_constraints` table is project-level configuration — it
|
|
172
|
+
defines which files a project cares about. This is backbone, not tool
|
|
173
|
+
dispatch. Constraints have three visibilities: `active` (promoted to
|
|
174
|
+
full), `readonly` (promoted but not editable), `ignore` (demoted).
|
|
175
|
+
|
|
176
|
+
**Boundary:** Setting a constraint (`File.setConstraint`) is a
|
|
177
|
+
project-config write. Promoting/demoting the matching entries is tool
|
|
178
|
+
dispatch that goes through the handler chain with budget enforcement.
|
|
179
|
+
These are separate operations: constraint persists across runs, entry
|
|
180
|
+
promotion is scoped to a run and subject to the same budget rules as
|
|
181
|
+
a model `<get>`.
|
|
149
182
|
|
|
150
|
-
|
|
151
|
-
|
|
183
|
+
The `store` RPC manages constraints directly — it is not a model tool.
|
|
184
|
+
The `get` RPC with `persist` sets the constraint AND dispatches promotion.
|
|
152
185
|
|
|
153
186
|
---
|
|
154
187
|
|
|
@@ -169,13 +202,17 @@ object is the same shape at every tier.
|
|
|
169
202
|
|
|
170
203
|
| Method | Model | Client | Plugin |
|
|
171
204
|
|--------|-------|--------|--------|
|
|
172
|
-
| `get`, `set`, `rm`, `mv`, `cp`, `sh`, `env`, `
|
|
205
|
+
| `get`, `set`, `rm`, `mv`, `cp`, `sh`, `env`, `search` | ✓ | ✓ | ✓ |
|
|
173
206
|
| `known`, `unknown`, `ask_user`, `summarize`, `update` | ✓ | ✓ | ✓ |
|
|
174
207
|
| `ask`, `act`, `resolve`, `abort`, `startRun` | — | ✓ | ✓ |
|
|
175
208
|
| `getRuns`, `getModels`, `getEntries` | — | ✓ | ✓ |
|
|
176
209
|
| `on()`, `filter()`, db/store access | — | — | ✓ |
|
|
177
210
|
|
|
178
|
-
Model tier restrictions enforced by
|
|
211
|
+
Model tier restrictions enforced by unified `resolveForLoop(mode, flags)`.
|
|
212
|
+
Ask mode excludes `sh`. Flags: `noInteraction` excludes `ask_user`,
|
|
213
|
+
`noWeb` excludes `search`, `noProposals` excludes `ask_user`/`env`/`sh`.
|
|
214
|
+
13 model tools: get, set, known, unknown, env, sh, rm, cp, mv, search,
|
|
215
|
+
summarize, update, ask_user.
|
|
179
216
|
Client tier requires project init. Plugin tier has no restrictions.
|
|
180
217
|
|
|
181
218
|
### 3.2 Dispatch Path
|
|
@@ -188,6 +225,14 @@ Client: JSON-RPC → { method, params } → #record() → dispatch(scheme, en
|
|
|
188
225
|
Plugin: rummy.rm({ path }) → #record() → dispatch(scheme, entry, rummy)
|
|
189
226
|
```
|
|
190
227
|
|
|
228
|
+
**Lifecycle/action split:** Commands are classified as lifecycle signals
|
|
229
|
+
(`summarize`, `update`, `unknown`, `known`) or action commands (everything
|
|
230
|
+
else). Lifecycle signals always dispatch — they are state declarations that
|
|
231
|
+
cannot be 409'd by sequential dispatch. Action commands dispatch sequentially;
|
|
232
|
+
a 202 proposal or error aborts subsequent actions. If the model sends
|
|
233
|
+
`<summarize>` but actions in the same turn failed, the summarize is
|
|
234
|
+
overridden to an update (the model's assertion that it's done is false).
|
|
235
|
+
|
|
191
236
|
### 3.3 Plugin Convention
|
|
192
237
|
|
|
193
238
|
A plugin is an instantiated class. The class name matches the file name.
|
|
@@ -247,35 +292,35 @@ Two messages per turn. System = stable truth. User = active task.
|
|
|
247
292
|
[persona/]
|
|
248
293
|
[skills/]
|
|
249
294
|
[/instructions]
|
|
250
|
-
<
|
|
295
|
+
<knowns>
|
|
251
296
|
...entries sorted by fidelity (index, summary, full), then by scheme
|
|
252
|
-
</
|
|
297
|
+
</knowns>
|
|
253
298
|
<previous>
|
|
254
|
-
(pre-loop
|
|
299
|
+
(pre-loop entries, each with turn, status, summary, fidelity, tokens)
|
|
255
300
|
</previous>
|
|
256
|
-
<unknowns
|
|
301
|
+
<unknowns>
|
|
302
|
+
(open questions, each with path, turn, fidelity, tokens)
|
|
303
|
+
</unknowns>
|
|
257
304
|
[/system]
|
|
258
305
|
[user]
|
|
259
|
-
<
|
|
260
|
-
(current loop
|
|
261
|
-
</
|
|
262
|
-
<progress>
|
|
263
|
-
<
|
|
264
|
-
— OR —
|
|
265
|
-
<act tools="...">user prompt</act>
|
|
306
|
+
<performed>
|
|
307
|
+
(current loop entries, each with turn, status, summary, fidelity, tokens)
|
|
308
|
+
</performed>
|
|
309
|
+
<progress turn="N">token budget, fidelity stats, causal bridge</progress>
|
|
310
|
+
<prompt mode="ask|act" tools="...">user prompt</prompt>
|
|
266
311
|
[/user]
|
|
267
312
|
```
|
|
268
313
|
|
|
269
314
|
**System** contains everything the model needs to know.
|
|
270
315
|
**User** contains everything the model needs to do.
|
|
271
316
|
|
|
272
|
-
The `<
|
|
317
|
+
The `<prompt>` tag is present on every turn — first turn and
|
|
273
318
|
continuations alike. The model always sees its task. The active prompt
|
|
274
319
|
is extracted from its chronological position and placed last for maximum
|
|
275
320
|
recency. `<progress>` bridges the gap, narrating the causal relationship
|
|
276
|
-
between `<
|
|
321
|
+
between `<performed>` (the work) and the prompt (the cause).
|
|
277
322
|
|
|
278
|
-
### 4.2 Loops, Previous, and
|
|
323
|
+
### 4.2 Loops, Previous, and Performed
|
|
279
324
|
|
|
280
325
|
A **loop** is one `ask` or `act` invocation and all its continuation
|
|
281
326
|
turns until summarize, fail, or abort.
|
|
@@ -285,14 +330,14 @@ responses, tool results, agent warnings — the full chronicle in order.
|
|
|
285
330
|
Lives in the system message as established history. Omitted on the
|
|
286
331
|
first turn of the first loop.
|
|
287
332
|
|
|
288
|
-
**
|
|
333
|
+
**Performed** = the active loop's work so far. Model responses, tool
|
|
289
334
|
results, agent warnings — in order. Does NOT include the user prompt
|
|
290
|
-
(one per loop, extracted to `<
|
|
335
|
+
(one per loop, extracted to `<prompt>`). Lives in the user
|
|
291
336
|
message as immediate context. Empty on the first turn of a loop.
|
|
292
337
|
|
|
293
338
|
When a new prompt arrives on an existing run, the prior loop's
|
|
294
|
-
`<
|
|
295
|
-
continues (next turn), new results append to `<
|
|
339
|
+
`<performed>` content plus its prompt move to `<previous>`. When a loop
|
|
340
|
+
continues (next turn), new results append to `<performed>`.
|
|
296
341
|
|
|
297
342
|
### 4.3 Key Entries
|
|
298
343
|
|
|
@@ -313,7 +358,7 @@ text from body + attributes.
|
|
|
313
358
|
Each turn:
|
|
314
359
|
|
|
315
360
|
1. Write `instructions://system` (empty body, attributes = { persona })
|
|
316
|
-
2.
|
|
361
|
+
2. Emit `turn.started` — plugins write prompt/instructions entries
|
|
317
362
|
3. Project `instructions://system` → instructions text
|
|
318
363
|
4. Query `v_model_context` VIEW → visible entries
|
|
319
364
|
5. Project each entry through its tool's `full`/`summary` projection
|
|
@@ -323,23 +368,171 @@ Each turn:
|
|
|
323
368
|
- Previous plugin (priority 200) → `<previous>` section
|
|
324
369
|
- Unknown plugin (priority 300) → `<unknowns>` section
|
|
325
370
|
8. Invoke `assembly.user` filter chain (empty string as base):
|
|
326
|
-
-
|
|
371
|
+
- Performed plugin (priority 100) → `<performed>` section
|
|
327
372
|
- Progress plugin (priority 200) → `<progress>` section
|
|
328
|
-
- Prompt plugin (priority 300) → `<
|
|
373
|
+
- Prompt plugin (priority 300) → `<prompt>` section
|
|
329
374
|
9. Store as `system://N` and `user://N` audit entries
|
|
330
375
|
|
|
331
|
-
The VIEW determines visibility
|
|
376
|
+
The VIEW determines visibility from `fidelity` and `status`:
|
|
332
377
|
- `full` → body visible
|
|
333
|
-
- `summary` →
|
|
378
|
+
- `summary` → summary visible (model-authored `summary` attribute if set)
|
|
334
379
|
- `index` → path listed, no content
|
|
335
|
-
- `
|
|
336
|
-
|
|
380
|
+
- `archive` → invisible (retrievable via `<get>`)
|
|
381
|
+
|
|
382
|
+
**Partial read:** `<get path="..." line="N" limit="M"/>` returns lines N through
|
|
383
|
+
N+M−1 of the entry body as the log item without changing fidelity or promoting
|
|
384
|
+
the entry to context. Use after reading `summary` fidelity (which gives line
|
|
385
|
+
numbers via repomap) to target a specific symbol. Single-path only — glob or
|
|
386
|
+
body filter with `line`/`limit` is a 400 error.
|
|
387
|
+
- `status = 202` → invisible (proposed, pending client)
|
|
337
388
|
- `model_visible = 0` → invisible (audit, tool, instructions)
|
|
338
389
|
|
|
339
|
-
|
|
390
|
+
Model controls fidelity via `<set>` attributes: `archive`, `summary`,
|
|
391
|
+
`index`, `full`. The `summary="..."` attribute attaches a description
|
|
392
|
+
(<= 80 chars) that persists across fidelity changes.
|
|
393
|
+
|
|
394
|
+
### 4.5 Budget Enforcement
|
|
395
|
+
|
|
396
|
+
The model owns its context. The system enforces a hard ceiling and
|
|
397
|
+
provides advisory warnings — it does not automatically manage entries.
|
|
398
|
+
|
|
399
|
+
**Pre-LLM check:** The budget plugin measures `countTokens()` on the
|
|
400
|
+
assembled messages. If assembled tokens exceed `contextSize`, the turn
|
|
401
|
+
returns 413 without calling the LLM. This triggers panic mode (see
|
|
402
|
+
§4.6).
|
|
403
|
+
|
|
404
|
+
**Write-layer gate:** BudgetGuard on KnownStore gates every write
|
|
405
|
+
during dispatch. `upsert()`, `promoteByPattern()`, and
|
|
406
|
+
`updateBodyByPattern()` check token delta against remaining headroom.
|
|
407
|
+
Exceeding the budget throws `BudgetExceeded` — the tool 413s, the
|
|
408
|
+
guard trips, and all subsequent tools in the turn fail.
|
|
409
|
+
|
|
410
|
+
BudgetGuard ceiling = `floor(contextSize × 0.9) − 500`. The 500-token
|
|
411
|
+
buffer below the enforce ceiling absorbs two sources of overhead that
|
|
412
|
+
BudgetGuard cannot see: (a) `#record()`-phase writes that bypass the
|
|
413
|
+
guard (~15 tokens per command), and (b) loop transition overhead —
|
|
414
|
+
when a loop completes and a new one starts, entries shift from
|
|
415
|
+
`<performed>` to `<previous>` format, adding ~200–300 tokens to the
|
|
416
|
+
next assembly. Without this buffer, the base context can accumulate
|
|
417
|
+
to exactly the enforce ceiling, making it impossible for the panic
|
|
418
|
+
loop to start (panic prompt + loop overhead > ceiling).
|
|
419
|
+
|
|
420
|
+
**Exemptions:** `status >= 400` entries (error results), `model_visible
|
|
421
|
+
= 0` entries (audit), `fidelity = "archive"` entries (not in context).
|
|
422
|
+
|
|
423
|
+
**Size gate:** Known entries exceeding 500 tokens are rejected with
|
|
424
|
+
413, forcing atomic entries.
|
|
425
|
+
|
|
426
|
+
**Advisory warnings** (progress plugin):
|
|
427
|
+
- 50%: "You may free space by lowering the fidelity of entries"
|
|
428
|
+
- 75%: "YOU MUST free space... or the run will fail"
|
|
429
|
+
|
|
430
|
+
**Token math:** `Math.ceil(text.length / RUMMY_TOKEN_DIVISOR)`. One
|
|
431
|
+
formula, one file (`src/agent/tokens.js`), env-configurable. No
|
|
432
|
+
external dependencies. `contextSize` is the ceiling. Over = 413.
|
|
433
|
+
Under = 200. No margins.
|
|
434
|
+
|
|
435
|
+
**Three token measures — never conflate them:**
|
|
436
|
+
|
|
437
|
+
| Measure | Source | Scope | Use |
|
|
438
|
+
|---|---|---|---|
|
|
439
|
+
| SQL entry tokens | `known_entries.tokens` = `ceil(chars / DIVISOR)` | Per entry | Model decision-making: "this entry costs N tokens" |
|
|
440
|
+
| Assembled estimate | `measureMessages(messages)` = sum of entry projections | Full packet | First-turn budget fallback only |
|
|
441
|
+
| Actual API tokens | `turns.context_tokens` = `usage.input_tokens` back-filled from LLM | Per turn | Budget enforcement on turns 2+; ground truth |
|
|
442
|
+
|
|
443
|
+
`budget.enforce` uses the **actual API tokens** (`get_last_context_tokens`) when
|
|
444
|
+
available (turn 2+) and falls back to the assembled estimate on turn 1. The
|
|
445
|
+
estimate can be 3–7× off for XML/JSON-heavy content — do not rely on it for
|
|
446
|
+
anything that matters.
|
|
447
|
+
|
|
448
|
+
**`context_tokens` vs `prompt_tokens` in step telemetry:**
|
|
449
|
+
- `context_tokens` in the step JSON = `turns.context_tokens` for that turn =
|
|
450
|
+
per-turn actual input tokens from the LLM API (e.g. 7900 tokens sent this turn)
|
|
451
|
+
- `prompt_tokens` in the step JSON = `SUM(turns.prompt_tokens)` for the run =
|
|
452
|
+
**cumulative** total across all turns (cost tracking, not a context size)
|
|
453
|
+
|
|
454
|
+
These two will diverge rapidly on any multi-turn run. A run at turn 50 might show
|
|
455
|
+
`context_tokens: 8000` (context under control) and `prompt_tokens: 400000`
|
|
456
|
+
(total input tokens billed across the whole run). They are measuring orthogonal things.
|
|
457
|
+
|
|
458
|
+
### 4.6 Panic Mode
|
|
459
|
+
|
|
460
|
+
**The invariant.** A panic is only ever triggered because the
|
|
461
|
+
assembled context was under the ceiling — and the new prompt pushed
|
|
462
|
+
it over. The existing context fit; the incoming prompt did not.
|
|
463
|
+
Panic mode replaces that too-large incoming prompt with a small
|
|
464
|
+
panic prompt on the same context. Therefore: the first turn of a
|
|
465
|
+
panic loop cannot 413. If it does, it is a bug.
|
|
466
|
+
|
|
467
|
+
**Trigger.** `TurnExecutor.execute()` assembles the full packet
|
|
468
|
+
(context + incoming prompt) before calling the LLM. If
|
|
469
|
+
`assembledTokens > contextSize`, it returns 413 without calling
|
|
470
|
+
the LLM. `#drainQueue` intercepts this and enters panic mode.
|
|
471
|
+
|
|
472
|
+
**Flow.**
|
|
473
|
+
1. Complete the failed loop with status 413 (audit trail).
|
|
474
|
+
2. Enqueue a panic loop (`mode = "panic"`, `noRepo = true`,
|
|
475
|
+
`prompt = panicPrompt`, `panicTarget` in config).
|
|
476
|
+
3. Re-enqueue the original loop with `panicAttempted: true` in
|
|
477
|
+
its config JSON. This flag persists across drain cycles.
|
|
478
|
+
4. `continue` — the drain loop claims the panic loop next.
|
|
479
|
+
|
|
480
|
+
After panic completes (model freed enough space), the retry loop
|
|
481
|
+
runs. If the retry also 413s, hard-fail to client. One panic
|
|
482
|
+
attempt per drain cycle — `panicAttempted` is checked both as a
|
|
483
|
+
local variable and on the re-enqueued loop's config.
|
|
484
|
+
|
|
485
|
+
**Panic target.** The model must compress context to below:
|
|
486
|
+
|
|
487
|
+
```
|
|
488
|
+
panicTarget = MIN(contextSize × 0.75, contextSize − incomingTokens) − cushion
|
|
489
|
+
```
|
|
340
490
|
|
|
341
|
-
|
|
342
|
-
|
|
491
|
+
`incomingTokens` is the raw token count of the original prompt.
|
|
492
|
+
`cushion` is a small safety margin (500 tokens) to absorb
|
|
493
|
+
materialization overhead. The target is expressed in materialized
|
|
494
|
+
token units — the same unit the system uses to measure completion
|
|
495
|
+
(see **Two token contexts** below and **Token math** in §4.5).
|
|
496
|
+
|
|
497
|
+
**Two token contexts.**
|
|
498
|
+
|
|
499
|
+
The model reasons in *per-entry SQL tokens* — the token counts
|
|
500
|
+
visible in `<knowns>` entries. These are the granular unit the model
|
|
501
|
+
uses to decide which entries to target: "this entry is 200 tokens;
|
|
502
|
+
if I archive it, I save 200 tokens."
|
|
503
|
+
|
|
504
|
+
The system makes decisions using *actual API tokens* —
|
|
505
|
+
`turns.context_tokens` back-filled from `usage.input_tokens` after
|
|
506
|
+
each LLM call. SQL token sums do not equal actual API counts because
|
|
507
|
+
projections, assembly overhead, and fidelity transforms alter the
|
|
508
|
+
output; and the SQL estimate (`ceil(chars / DIVISOR)`) can be 3–7×
|
|
509
|
+
off for structured content. **Never use SQL token sums for ceiling or
|
|
510
|
+
budget decisions.** See §4.5 Token Measures for the full breakdown.
|
|
511
|
+
|
|
512
|
+
**Strike system.** After each panic turn, compare
|
|
513
|
+
`result.assembledTokens` (materialized) with `_lastPanicTokens`
|
|
514
|
+
(previous turn's materialized total):
|
|
515
|
+
- Decreased → reset strike counter to 0.
|
|
516
|
+
- Same or increased → increment strikes.
|
|
517
|
+
- 3 consecutive strikes → return 413 to `#drainQueue` → hard-fail.
|
|
518
|
+
|
|
519
|
+
Progress (any reduction) resets the counter. The model has
|
|
520
|
+
unlimited turns as long as it makes progress.
|
|
521
|
+
|
|
522
|
+
**Panic success.** After each turn, if `result.assembledTokens
|
|
523
|
+
<= panicTarget`, the panic loop exits with 200. The retry loop
|
|
524
|
+
then runs with the original prompt on the now-compressed context.
|
|
525
|
+
|
|
526
|
+
**Tool set.** `resolveForLoop("panic")` includes: get, set, known,
|
|
527
|
+
unknown, rm, mv, cp, summarize, update. Excludes: sh, env, search,
|
|
528
|
+
ask_user. `noRepo: true` — no file scanning during panic.
|
|
529
|
+
|
|
530
|
+
**What the model sees.** Turn 1 receives the panic prompt from
|
|
531
|
+
`budget.panicPrompt()`: the assembled token count, the target, and
|
|
532
|
+
the exact number of tokens to free. Turn 2+ receives a continuation
|
|
533
|
+
prompt. The model uses `<set fidelity="archive">`, `<mv
|
|
534
|
+
fidelity="index">`, and similar fidelity operations to free space,
|
|
535
|
+
concluding with `<summarize>` when done or `<update>` while working.
|
|
343
536
|
|
|
344
537
|
---
|
|
345
538
|
|
|
@@ -369,22 +562,25 @@ JSON-RPC 2.0 over WebSocket. `discover` returns the live catalog.
|
|
|
369
562
|
|
|
370
563
|
| Method | Params |
|
|
371
564
|
|--------|--------|
|
|
372
|
-
| `
|
|
565
|
+
| `get` | `{ path, run, persist?, readonly? }` |
|
|
566
|
+
| `set` | `{ run, path, body?, attributes? }` |
|
|
567
|
+
| `rm` | `{ run, path }` |
|
|
568
|
+
| `mv` | `{ run, path, to }` |
|
|
569
|
+
| `cp` | `{ run, path, to }` |
|
|
373
570
|
| `store` | `{ path, run?, persist?, ignore?, clear? }` |
|
|
374
|
-
| `write` | `{ run, path, body?, state?, attributes? }` |
|
|
375
|
-
| `delete` | `{ run, path }` |
|
|
376
571
|
| `getEntries` | `{ pattern?, body?, run?, limit?, offset? }` |
|
|
377
572
|
|
|
378
|
-
|
|
379
|
-
|
|
573
|
+
All entry operations dispatch through the handler chain. `persist`
|
|
574
|
+
on `get` also sets a project-level file constraint (operator privilege).
|
|
575
|
+
`store` manages file constraints — not a model tool.
|
|
380
576
|
|
|
381
577
|
#### Runs
|
|
382
578
|
|
|
383
579
|
| Method | Params |
|
|
384
580
|
|--------|--------|
|
|
385
581
|
| `startRun` | `{ model, temperature?, persona?, contextLimit? }` |
|
|
386
|
-
| `ask` | `{ prompt, model, run?, temperature?, persona?, contextLimit?,
|
|
387
|
-
| `act` | `{ prompt, model, run?, temperature?, persona?, contextLimit?,
|
|
582
|
+
| `ask` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noRepo?, noInteraction?, noWeb?, fork? }` |
|
|
583
|
+
| `act` | `{ prompt, model, run?, temperature?, persona?, contextLimit?, noRepo?, noInteraction?, noWeb?, fork? }` |
|
|
388
584
|
| `run/resolve` | `{ run, resolution: { path, action, output? } }` |
|
|
389
585
|
| `run/abort` | `{ run }` |
|
|
390
586
|
| `run/rename` | `{ run, name }` |
|
|
@@ -392,6 +588,10 @@ Without `persist`, operations dispatch through the handler chain.
|
|
|
392
588
|
| `run/config` | `{ run, temperature?, persona?, contextLimit?, model? }` |
|
|
393
589
|
|
|
394
590
|
`model` is required on `ask`, `act`, and `startRun`. No default.
|
|
591
|
+
`noRepo` disables default project/repo file scanning (files can still
|
|
592
|
+
be added explicitly by the client).
|
|
593
|
+
`noInteraction` removes `ask_user` from the tool list.
|
|
594
|
+
`noWeb` removes `search` from the tool list.
|
|
395
595
|
|
|
396
596
|
#### Queries
|
|
397
597
|
|
|
@@ -445,7 +645,80 @@ Each plugin has its own README at `src/plugins/{name}/README.md`.
|
|
|
445
645
|
|
|
446
646
|
---
|
|
447
647
|
|
|
448
|
-
## 7.
|
|
648
|
+
## 7. Tool Documentation Design
|
|
649
|
+
|
|
650
|
+
Tool docs are the most carefully designed text in rummy. Every line
|
|
651
|
+
simultaneously teaches syntax, implies workflow priority, demonstrates
|
|
652
|
+
pattern capabilities, and constrains misuse. Each letter earns its place.
|
|
653
|
+
|
|
654
|
+
### Principles
|
|
655
|
+
|
|
656
|
+
**Show, don't tell.** Examples ARE the documentation. A model learns
|
|
657
|
+
`<get path="known://*">auth</get>` from seeing it, not from being told
|
|
658
|
+
"you can filter known entries by keyword." Examples are ordered from
|
|
659
|
+
simple to powerful — weak models learn from examples 1-2, strong models
|
|
660
|
+
pick up the pattern from example 3.
|
|
661
|
+
|
|
662
|
+
**Lifecycle continuity.** Examples weave stories across tools. The get
|
|
663
|
+
docs end with `<set path="..." fidelity="index"/>`. The known docs
|
|
664
|
+
reference `<get path="known://*">keyword</get>` for recall and
|
|
665
|
+
`<set path="known://..." archive/>` for archiving. The unknown docs
|
|
666
|
+
reference `<get/>` for investigation and `<rm/>` for cleanup. A model
|
|
667
|
+
reading the full tool docs encounters a coherent workflow:
|
|
668
|
+
discover → load → reason → edit → archive → recall.
|
|
669
|
+
|
|
670
|
+
**RFC 2119 semantics.** Constraint bullets use YOU MUST, YOU MUST NOT,
|
|
671
|
+
YOU SHOULD, YOU MAY from RFC 2119. Every LLM has extensive pretraining
|
|
672
|
+
on RFC documents where these keywords carry precise semantic weight.
|
|
673
|
+
MUST is absolute. SHOULD is strong advisory. MAY is permissive. This
|
|
674
|
+
is not decorative — it's leveraging the model's existing understanding
|
|
675
|
+
of requirement levels.
|
|
676
|
+
|
|
677
|
+
**Consistent structure.** Every tool doc follows: header (syntax), 2+
|
|
678
|
+
examples, 2+ constraint bullets. Inconsistent formatting reads as
|
|
679
|
+
inconsistent importance. A tool with 5 examples and dense bullets feels
|
|
680
|
+
complex; a tool with 1 line feels disposable. Both are wrong — every
|
|
681
|
+
tool is equally real, and each doc is proportional to the tool's surface area.
|
|
682
|
+
|
|
683
|
+
### Format
|
|
684
|
+
|
|
685
|
+
Tool docs live in `*Doc.js` files as annotated line arrays:
|
|
686
|
+
|
|
687
|
+
```js
|
|
688
|
+
const LINES = [
|
|
689
|
+
["* Body text filters results by content match",
|
|
690
|
+
"Generalizes examples 2-3. Body = filter, not just path."],
|
|
691
|
+
];
|
|
692
|
+
export default LINES.map(([text]) => text).join("\n");
|
|
693
|
+
```
|
|
694
|
+
|
|
695
|
+
The first element is the model-facing text. The second is the rationale —
|
|
696
|
+
visible only in source. Changing any line requires reading all rationales
|
|
697
|
+
first. This prevents well-intentioned edits from breaking subtle behavioral
|
|
698
|
+
guarantees that adjacent lines depend on.
|
|
699
|
+
|
|
700
|
+
### Tool Display Order
|
|
701
|
+
|
|
702
|
+
Tools are presented gather → reason → act → communicate. Position in
|
|
703
|
+
the list implies priority. `get` is first. `ask_user` is last. The
|
|
704
|
+
order is defined in `ToolRegistry.TOOL_ORDER` and applied by
|
|
705
|
+
`resolveForLoop()`. The same method handles all tool exclusions —
|
|
706
|
+
mode restrictions, `noInteraction`, `noWeb`, `noProposals` — through
|
|
707
|
+
one unified mechanism.
|
|
708
|
+
|
|
709
|
+
### Pattern Distribution
|
|
710
|
+
|
|
711
|
+
Hedbergian pattern matching (globs, body filters, preview) is taught
|
|
712
|
+
across multiple tools, not concentrated in one. `get` shows content
|
|
713
|
+
filtering. `cp` shows glob batch operations. `rm` shows preview safety.
|
|
714
|
+
Each tool reinforces the pattern vocabulary from a different angle.
|
|
715
|
+
A model that sees `path="known://*"` in get, `path="known://plan_*"` in
|
|
716
|
+
cp, and `path="known://temp_*" preview` in rm learns that patterns
|
|
717
|
+
are universal — not a feature of any single tool.
|
|
718
|
+
|
|
719
|
+
---
|
|
720
|
+
|
|
721
|
+
## 8. Hedberg Editing Syntax
|
|
449
722
|
|
|
450
723
|
The model picks its preferred edit format. The parser understands all of them:
|
|
451
724
|
|
|
@@ -460,26 +733,36 @@ The model picks its preferred edit format. The parser understands all of them:
|
|
|
460
733
|
|
|
461
734
|
---
|
|
462
735
|
|
|
463
|
-
##
|
|
736
|
+
## 9. Response Healing
|
|
464
737
|
|
|
465
|
-
The server never throws on model output.
|
|
738
|
+
The server never throws on model output. "Model behavior" is never an
|
|
739
|
+
acceptable explanation. Recovery order:
|
|
466
740
|
|
|
467
741
|
1. Can we recover? Extract the data and continue.
|
|
468
742
|
2. Can we warn? Log structured warnings.
|
|
469
743
|
3. Did our structure cause this? Check formatting, prompts.
|
|
470
|
-
4. Model drift is the LAST answer.
|
|
471
744
|
|
|
472
745
|
Termination protocol:
|
|
473
746
|
- `<summarize>` → run terminates
|
|
747
|
+
- `<summarize>` + failed actions → overridden to `<update>` (continue)
|
|
474
748
|
- `<update>` → run continues
|
|
475
|
-
- Both →
|
|
476
|
-
- Neither + tools → stall counter
|
|
749
|
+
- Both → update wins (if the model can't decide, it's not done)
|
|
750
|
+
- Neither + investigation tools → stall counter (RUMMY_MAX_STALLS)
|
|
751
|
+
- Neither + action-only tools → healed to summarize
|
|
477
752
|
- Neither + plain text → healed to summarize
|
|
478
|
-
- Repeated commands →
|
|
753
|
+
- Repeated commands → cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
|
|
754
|
+
- Repeated update text → stall (RUMMY_MAX_UPDATE_REPEATS)
|
|
755
|
+
|
|
756
|
+
Format normalization:
|
|
757
|
+
- Gemma ```` ```tool_code ```` fences → stripped before parsing
|
|
758
|
+
- Qwen `<|tool_call>` format → normalized to XML
|
|
759
|
+
- OpenAI function_call JSON → normalized to XML
|
|
760
|
+
- Mistral `[TOOL_CALLS]` → normalized to XML
|
|
761
|
+
- Sed alternate delimiters (`s|old|new|`) → parsed like `s/old/new/`
|
|
479
762
|
|
|
480
763
|
---
|
|
481
764
|
|
|
482
|
-
##
|
|
765
|
+
## 10. Testing
|
|
483
766
|
|
|
484
767
|
| Tier | Location | LLM? |
|
|
485
768
|
|------|----------|------|
|
|
@@ -493,12 +776,12 @@ E2E tests must NEVER mock the LLM. Environment cascade:
|
|
|
493
776
|
|
|
494
777
|
---
|
|
495
778
|
|
|
496
|
-
##
|
|
779
|
+
## 11. SQL Functions
|
|
497
780
|
|
|
498
781
|
| Function | Purpose |
|
|
499
782
|
|----------|---------|
|
|
500
783
|
| `schemeOf(path)` | Extract URI scheme |
|
|
501
|
-
| `countTokens(text)` | Token count (
|
|
784
|
+
| `countTokens(text)` | Token count (`ceil(len / RUMMY_TOKEN_DIVISOR)`) |
|
|
502
785
|
| `hedmatch(pattern, string)` | Full-string pattern match (paths, equality) |
|
|
503
786
|
| `hedsearch(pattern, string)` | Substring pattern search (content filtering) |
|
|
504
787
|
| `hedreplace(pattern, replacement, string)` | Pattern-based replacement |
|
|
@@ -508,15 +791,66 @@ See [PLUGINS.md](PLUGINS.md) for the hedberg pattern type reference.
|
|
|
508
791
|
|
|
509
792
|
---
|
|
510
793
|
|
|
511
|
-
##
|
|
794
|
+
## 13. Debugging: E2E and Benchmark Results
|
|
795
|
+
|
|
796
|
+
### E2E test failures
|
|
797
|
+
|
|
798
|
+
E2E tests use a temp DB at `/tmp/rummy_test_<timestamp>_<random>.db` (cleaned up after).
|
|
799
|
+
On failure, `AuditClient.assertRun` calls `dumpRun`, which prints a full turn-by-turn audit
|
|
800
|
+
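to stdout. That output is in the background task log (the paths below are examples from one development machine; substitute your own user and checkout directory):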
|
|
801
|
+
|
|
802
|
+
```
|
|
803
|
+
/tmp/claude-1000/-home-hyzen-repo-rummy-main/<session-id>/tasks/<task-id>.output
|
|
804
|
+
```
|
|
805
|
+
|
|
806
|
+
If oversized, the harness saves to:
|
|
807
|
+
```
|
|
808
|
+
/home/hyzen/.claude/projects/-home-hyzen-repo-rummy-main/<session-id>/tool-results/<id>.txt
|
|
809
|
+
```
|
|
810
|
+
|
|
811
|
+
The dump format is: `scheme:state path {attributes}\n body (120 chars)` grouped by turn.
|
|
812
|
+
|
|
813
|
+
Key things to look for in a dump:
|
|
814
|
+
- **202**: unresolved proposals — model issued `<sh>`, `<rm>`, or `<mv>` that needs approval
|
|
815
|
+
- **413**: budget overflow — assembled context exceeded ceiling before LLM call
|
|
816
|
+
- **BudgetGuard errors**: per-tool rejections mid-turn (`Budget exceeded: N tokens requested`)
|
|
817
|
+
- **`<sh>` in act/panic mode**: model fell back to shell when blocked (doc/prompt gap)
|
|
818
|
+
- **Loop sequence**: look for `mode` in the `instructions://system` attributes to see which loop type ran
|
|
819
|
+
|
|
820
|
+
### MAB benchmark
|
|
821
|
+
|
|
822
|
+
Results live in `test/mab/results/<ISO-timestamp>/mab.db`. Latest run = most recent dir.
|
|
823
|
+
|
|
824
|
+
```js
|
|
825
|
+
// Query a MAB result DB directly:
|
|
826
|
+
import { DatabaseSync } from 'node:sqlite';
|
|
827
|
+
const db = new DatabaseSync('test/mab/results/<timestamp>/mab.db');
|
|
828
|
+
db.prepare('SELECT * FROM questions').all(); // all questions + scores
|
|
829
|
+
db.prepare('SELECT * FROM runs').all(); // individual model runs
|
|
830
|
+
```
|
|
831
|
+
|
|
832
|
+
Run with: `npm run test:mab`
|
|
833
|
+
|
|
834
|
+
### LME benchmark
|
|
835
|
+
|
|
836
|
+
Results live in `test/lme/results/<ISO-timestamp>/lme.db`. Same structure.
|
|
837
|
+
|
|
838
|
+
Run with: `npm run test:lme`
|
|
839
|
+
|
|
840
|
+
---
|
|
841
|
+
|
|
842
|
+
## 12. Configuration
|
|
512
843
|
|
|
513
844
|
```env
|
|
514
845
|
RUMMY_HOME=~/.rummy
|
|
515
|
-
|
|
846
|
+
RUMMY_TOKEN_DIVISOR=2
|
|
847
|
+
RUMMY_MAX_TURNS=99
|
|
516
848
|
RUMMY_MAX_STALLS=3
|
|
517
|
-
|
|
849
|
+
RUMMY_MIN_CYCLES=3
|
|
850
|
+
RUMMY_MAX_CYCLE_PERIOD=4
|
|
851
|
+
RUMMY_MAX_UPDATE_REPEATS=3
|
|
518
852
|
RUMMY_RETENTION_DAYS=31
|
|
519
|
-
RUMMY_TEMPERATURE=0.
|
|
853
|
+
RUMMY_TEMPERATURE=0.5
|
|
520
854
|
RUMMY_DEBUG=false
|
|
521
855
|
```
|
|
522
856
|
|