@possumtech/rummy 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. package/.env.example +12 -7
  2. package/BENCH_ENVIRONMENT.md +230 -0
  3. package/CLIENT_INTERFACE.md +396 -0
  4. package/PLUGINS.md +93 -1
  5. package/SPEC.md +305 -28
  6. package/bin/postinstall.js +2 -2
  7. package/bin/rummy.js +2 -2
  8. package/last_run.txt +5617 -0
  9. package/migrations/001_initial_schema.sql +2 -1
  10. package/package.json +6 -2
  11. package/scriptify/cache_probe.js +66 -0
  12. package/scriptify/cache_probe_grok.js +74 -0
  13. package/service.js +22 -11
  14. package/src/agent/AgentLoop.js +33 -139
  15. package/src/agent/ContextAssembler.js +2 -9
  16. package/src/agent/Entries.js +36 -101
  17. package/src/agent/ProjectAgent.js +2 -9
  18. package/src/agent/TurnExecutor.js +45 -83
  19. package/src/agent/XmlParser.js +247 -273
  20. package/src/agent/budget.js +5 -28
  21. package/src/agent/config.js +38 -0
  22. package/src/agent/errors.js +7 -13
  23. package/src/agent/httpStatus.js +1 -19
  24. package/src/agent/known_store.sql +7 -2
  25. package/src/agent/materializeContext.js +12 -17
  26. package/src/agent/pathEncode.js +5 -0
  27. package/src/agent/rummyHome.js +9 -0
  28. package/src/agent/runs.sql +18 -0
  29. package/src/agent/tokens.js +2 -8
  30. package/src/hooks/HookRegistry.js +1 -16
  31. package/src/hooks/Hooks.js +8 -33
  32. package/src/hooks/PluginContext.js +3 -21
  33. package/src/hooks/RpcRegistry.js +1 -4
  34. package/src/hooks/RummyContext.js +2 -16
  35. package/src/hooks/ToolRegistry.js +5 -15
  36. package/src/llm/LlmProvider.js +28 -23
  37. package/src/llm/errors.js +41 -4
  38. package/src/llm/openaiStream.js +125 -0
  39. package/src/llm/retry.js +61 -15
  40. package/src/plugins/budget/budget.js +14 -81
  41. package/src/plugins/cli/README.md +87 -0
  42. package/src/plugins/cli/bin.js +61 -0
  43. package/src/plugins/cli/cli.js +120 -0
  44. package/src/plugins/env/README.md +2 -1
  45. package/src/plugins/env/env.js +4 -6
  46. package/src/plugins/env/envDoc.md +2 -2
  47. package/src/plugins/error/error.js +23 -23
  48. package/src/plugins/file/file.js +2 -22
  49. package/src/plugins/get/get.js +12 -34
  50. package/src/plugins/get/getDoc.md +5 -3
  51. package/src/plugins/hedberg/edits.js +1 -11
  52. package/src/plugins/hedberg/hedberg.js +3 -26
  53. package/src/plugins/hedberg/normalize.js +1 -5
  54. package/src/plugins/hedberg/patterns.js +4 -15
  55. package/src/plugins/hedberg/sed.js +1 -7
  56. package/src/plugins/helpers.js +28 -20
  57. package/src/plugins/index.js +25 -41
  58. package/src/plugins/instructions/README.md +18 -0
  59. package/src/plugins/instructions/instructions.js +13 -76
  60. package/src/plugins/instructions/instructions.md +19 -18
  61. package/src/plugins/instructions/instructions_104.md +5 -4
  62. package/src/plugins/instructions/instructions_105.md +16 -15
  63. package/src/plugins/instructions/instructions_106.md +15 -14
  64. package/src/plugins/instructions/instructions_107.md +13 -6
  65. package/src/plugins/known/README.md +26 -6
  66. package/src/plugins/known/known.js +36 -34
  67. package/src/plugins/log/README.md +2 -2
  68. package/src/plugins/log/log.js +6 -33
  69. package/src/plugins/ollama/ollama.js +50 -66
  70. package/src/plugins/openai/openai.js +26 -44
  71. package/src/plugins/openrouter/openrouter.js +28 -52
  72. package/src/plugins/policy/README.md +8 -2
  73. package/src/plugins/policy/policy.js +8 -21
  74. package/src/plugins/prompt/README.md +22 -0
  75. package/src/plugins/prompt/prompt.js +8 -16
  76. package/src/plugins/rm/rm.js +5 -2
  77. package/src/plugins/rm/rmDoc.md +4 -4
  78. package/src/plugins/rpc/README.md +2 -1
  79. package/src/plugins/rpc/rpc.js +51 -47
  80. package/src/plugins/set/README.md +5 -1
  81. package/src/plugins/set/set.js +23 -33
  82. package/src/plugins/set/setDoc.md +1 -1
  83. package/src/plugins/sh/README.md +2 -1
  84. package/src/plugins/sh/sh.js +5 -11
  85. package/src/plugins/sh/shDoc.md +2 -2
  86. package/src/plugins/stream/README.md +6 -5
  87. package/src/plugins/stream/stream.js +6 -35
  88. package/src/plugins/telemetry/telemetry.js +26 -19
  89. package/src/plugins/think/think.js +4 -7
  90. package/src/plugins/unknown/unknown.js +8 -13
  91. package/src/plugins/update/update.js +36 -35
  92. package/src/plugins/update/updateDoc.md +3 -3
  93. package/src/plugins/xai/xai.js +30 -20
  94. package/src/plugins/yolo/yolo.js +8 -41
  95. package/src/server/ClientConnection.js +17 -47
  96. package/src/server/SocketServer.js +14 -14
  97. package/src/server/protocol.js +1 -10
  98. package/src/sql/functions/slugify.js +5 -7
  99. package/src/sql/v_model_context.sql +4 -11
  100. package/turns/cli_1777462658211/turn_001.txt +772 -0
  101. package/turns/cli_1777462658211/turn_002.txt +606 -0
  102. package/turns/cli_1777462658211/turn_003.txt +667 -0
  103. package/turns/cli_1777462658211/turn_004.txt +297 -0
  104. package/turns/cli_1777462658211/turn_005.txt +301 -0
  105. package/turns/cli_1777462658211/turn_006.txt +262 -0
  106. package/turns/cli_1777465095132/turn_001.txt +715 -0
  107. package/turns/cli_1777465095132/turn_002.txt +236 -0
  108. package/turns/cli_1777465095132/turn_003.txt +287 -0
  109. package/turns/cli_1777465095132/turn_004.txt +694 -0
  110. package/turns/cli_1777465095132/turn_005.txt +422 -0
  111. package/turns/cli_1777465095132/turn_006.txt +365 -0
  112. package/turns/cli_1777465095132/turn_007.txt +885 -0
  113. package/turns/cli_1777465095132/turn_008.txt +1277 -0
  114. package/turns/cli_1777465095132/turn_009.txt +736 -0
package/PLUGINS.md CHANGED
@@ -241,11 +241,49 @@ ctx = {
241
241
  toolSet, // Set<string> of active tool names for this loop
242
242
  contextSize, // Model context window size
243
243
  lastContextTokens, // Actual API tokens from the prior turn (0 on turn 1)
244
- demoted, // Mutable array — plugins push paths they summarized
245
244
  turn, // Current turn number
246
245
  }
247
246
  ```
248
247
 
248
+ #### Filter Priority Bands {#plugins_filter_bands}
249
+
250
+ Filters run in ascending priority order. The packet renders in
251
+ top-to-bottom order matching that — lower priority appears earlier in
252
+ the message. Current `assembly.user` registrations:
253
+
254
+ | Priority | Block | Plugin | Mutates per turn? |
255
+ |---|---|---|---|
256
+ | 50 | `<summarized>` | `known.js` | Slow — only on new entry |
257
+ | 75 | `<visible>` | `known.js` | Fast — on every promote/demote |
258
+ | 100 | `<log>` | `log.js` | Always — appends per action |
259
+ | 200 | `<unknowns>` | `unknown.js` | On unknown lifecycle |
260
+ | 250 | `<instructions>` | `instructions.js` | On phase transition |
261
+ | 275 | `<budget>` | `budget.js` | Every turn (live) |
262
+ | 300 | `<prompt>` | `prompt.js` | Stable within a loop |
263
+
264
+ **Recommended ranges for new plugins** (for cache-friendly placement
265
+ and predictable rendering position):
266
+
267
+ | Range | Position | Use for |
268
+ |---|---|---|
269
+ | `0–49` | Top of user | Reserved (stable identity-tier blocks above `<summarized>`) |
270
+ | `50–99` | Codebase data surface | Don't add here — owned by `known.js` |
271
+ | `100–149` | History tier | Action history, timeline-style content |
272
+ | `150–199` | Open slot | Inter-history blocks (e.g. recent-decisions, tracked progress) |
273
+ | `200–249` | State tier | Model state (open questions, work-in-progress) |
274
+ | `250–299` | Phase + budget | Avoid; current phase / budget arithmetic owned here |
275
+ | `300–349` | Task | Reserved for prompt-tier content |
276
+ | `350–999` | Bottom | Append-after-prompt content (rare; usually wrong) |
277
+
278
+ Within a band, lower priority = renders higher. Pick the smallest
279
+ priority that lands you in the right band and leaves room above and
280
+ below.
281
+
282
+ `assembly.system` currently has no registrations — the system message is
283
+ the static identity surface (instructions base + tool docs). Adding
284
+ to `assembly.system` invalidates the system-prefix cache on whatever
285
+ provider you target; reserve for content that's truly stable per-run.
286
+
249
287
  ### Tool Docs {#plugins_tool_docs}
250
288
 
251
289
  Each tool plugin has a `*Doc.js` file with annotated line arrays.
@@ -282,6 +320,60 @@ entry = {
282
320
  Multiple handlers per scheme. Lower priority runs first. Return
283
321
  `false` to stop the chain.
284
322
 
323
+ #### Reporting outcomes {#plugins_handler_outcomes}
324
+
325
+ **The action entry IS its outcome.** Your handler finalizes the action's
326
+ own log entry at `entry.resultPath`. Success and failure are two values
327
+ of the same shape — body, state, outcome. The model sees both through
328
+ the same channel under your tool's scheme:
329
+
330
+ ```js
331
+ async handler(entry, rummy) {
332
+ const { entries: store, runId, turn, loopId } = rummy;
333
+ const result = await runMyTool(entry.attributes);
334
+
335
+ if (result.failed) {
336
+ await store.set({
337
+ runId, turn, loopId,
338
+ path: entry.resultPath,
339
+ body: result.failureMessage,
340
+ state: "failed",
341
+ outcome: result.label, // "not_found", "validation", etc.
342
+ });
343
+ return;
344
+ }
345
+
346
+ await store.set({
347
+ runId, turn, loopId,
348
+ path: entry.resultPath,
349
+ body: result.output,
350
+ state: "resolved",
351
+ });
352
+ }
353
+ ```
354
+
355
+ That's the whole failure-reporting surface. Body is the result on
356
+ success, the failure message on failure. State labels the verdict
357
+ (`resolved` / `failed`). Outcome is a short machine-readable label.
358
+
359
+ The framework reads the post-handler state of every recorded entry
360
+ each turn; any `state="failed"` result counts as a strike toward
361
+ `MAX_STRIKES`. You don't need to do anything else to make the strike
362
+ fire — write the entry's outcome and the framework follows.
363
+
364
+ You do **not** call `hooks.error.log.emit` from a tool handler. That
365
+ hook is reserved for the framework's actionless-failure cases (parser
366
+ warnings, dispatch crashes, runtime watchdog, budget overflow) — none
367
+ of which a third-party plugin should be writing.
368
+
369
+ If your handler throws, the framework catches and emits a status-500
370
+ error entry on your behalf. That's the one case where the framework
371
+ writes for you. Throw with intent; don't try-catch your own handler
372
+ just to avoid a stack trace.
373
+
374
+ See SPEC [failure_reporting](SPEC.md#failure_reporting) for the
375
+ full contract and the rationale.
376
+
285
377
  ### full(entry) / summary(entry) {#plugins_views}
286
378
 
287
379
  Returns the string the model sees for this tool's entries at the
package/SPEC.md CHANGED
@@ -7,6 +7,31 @@ everything else.
7
7
 
8
8
  ---
9
9
 
10
+ ## Glossary {#glossary}
11
+
12
+ Canonical meanings. When a doc, comment, test name, or commit message
13
+ uses one of these words, it should mean exactly what's written here.
14
+
15
+ | Term | Meaning |
16
+ |---|---|
17
+ | **run** | The alias-keyed lifetime of one project-agent invocation. Begins on `set run://{alias}` with a prompt; ends at terminal status (200/204/422/499/500). One run per alias; aliases are unique per project. |
18
+ | **loop** | One `ask` or `act` invocation and all its continuation turns until terminal `<update>`, abandonment, or abort. A run can contain multiple loops if a fresh prompt arrives on an existing run. |
19
+ | **turn** | One round-trip with the LLM: one assembled prompt sent, one response parsed. A loop is a sequence of turns. |
20
+ | **mode** | `ask` (read-only — no proposals, no `<sh>`, no edits) or `act` (full tool surface). Per loop, set at the entry point. |
21
+ | **phase** | (Primary, FCRM sense.) One of five FCRM states selected by `<update status="1XY">`: 104=Definition, 105=Discovery, 106=Demotion, 107=Deployment, 108=Verification. Maps to `instructions_10N.md` rendered in `<instructions>`. **The model-facing instructions call these "stages"** — same concept, dual vocabulary kept for the model's surface stability. Two non-FCRM uses of "phase" coexist in the codebase and AGENTS.md: (1) "two-phase turn execution" refers to RECORD→DISPATCH within a single turn; (2) AGENTS.md "Phase 1 / Phase 2 / ..." entries refer to project-development milestones (Schema, Primitives, etc.) — neither is the FCRM phase. Context disambiguates; if it doesn't, it's a doc bug. |
22
+ | **stage** | Model-facing synonym for **phase**. Lives in `instructions_*.md` and tooldocs. |
23
+ | **proposal** | A tool-call entry at status 202 awaiting client resolution (accept/reject). Side-effecting actions (`<sh>`, `<env>`, file `<set>`, file `<rm>`/`<mv>`/`<cp>`, `<ask_user>`) emit proposals. YOLO mode auto-accepts. |
24
+ | **verdict** | The end-of-turn ruling from `hooks.error.verdict` (owned by the error plugin). Returns `{continue, status, reason}`. Decides whether the loop continues to another turn or terminates. |
25
+ | **strike** | A turn whose verdict counts toward `MAX_STRIKES`. A strike fires when any recorded action entry ends the turn at `state="failed"`, when `turnErrors > 0` (any `error.log` entry that turn), or when cycle detection trips silently. The streak counter resets on a clean turn (no failures, no errors, no cycle); when it reaches `MAX_STRIKES`, the loop abandons at 499. |
26
+ | **resolution** | Client's accept/reject of a proposal via `run/resolve` RPC. |
27
+ | **dispatch** | The DISPATCH phase of a turn — actually executing recorded action entries. |
28
+
29
+ **Hierarchy:** project ⊃ run ⊃ loop ⊃ turn. A turn is the smallest
30
+ unit of model interaction. A strike is a per-turn property that
31
+ accumulates across turns within a loop.
32
+
33
+ ---
34
+
10
35
  ## The Contract
11
36
 
12
37
  Rummy has one contract. Every actor speaks it.
@@ -227,7 +252,7 @@ Every entry plays one of four roles:
227
252
 
228
253
  | Role | Category | Section | Description |
229
254
  |------|----------|---------|-------------|
230
- | **Data** | `data` | `<context>` | Entries the model works with — persistent state and captured payload |
255
+ | **Data** | `data` | `<summarized>` + `<visible>` | Entries the model works with — persistent state and captured payload. Summary line in `<summarized>` for visible+summarized tiers; full body in `<visible>` only when promoted. |
231
256
  | **Logging** | `logging` | `<log>` | Records of what happened — tool results, lifecycle signals |
232
257
  | **Unknowns** | `unknown` | `<unknowns>` | Open questions the model is tracking |
233
258
  | **Prompt** | `prompt` | `<prompt>` | The task driving the loop |
@@ -262,10 +287,11 @@ across two namespaces as a direct consequence:
262
287
  scheme=`log`, category=`logging`. Renders in `<log>`.
263
288
  - **Payload channels** live in `{action}://turn_N/{slug}_N` —
264
289
  scheme=`{action}` (registered as `category: "data"`). Render in
265
- `<context>`.
290
+ `<summarized>` (always, while tracked) and `<visible>` (when
291
+ promoted).
266
292
 
267
293
  This keeps `<log>` a terse audit trail (what happened, exit code,
268
- paths) while `<context>` carries the actual streamed bytes the model
294
+ paths) while `<visible>` carries the actual streamed bytes the model
269
295
  reads. Conflating the two — e.g., writing channels under `log://...` —
270
296
  mislabels payload as audit and pollutes the logging section with
271
297
  multi-line command output. See [streaming_entries](#streaming_entries).
@@ -434,6 +460,32 @@ policy filtering, abort cascade). Plugin-tier convenience verbs
434
460
  don't invoke the handler chain. Plugin code that wants full handler
435
461
  semantics calls `hooks.tools.dispatch` directly.
436
462
 
463
+ **Two-phase turn execution.** Model output flows through
464
+ `TurnExecutor.execute` in strict order:
465
+
466
+ 1. **RECORD** — every parsed command is materialized as a
467
+ `log://turn_N/action/slug` audit entry via `#record()`. Each
468
+ tool's parser shape surfaces exactly one of `path` / `command` /
469
+ `question` as its addressable target; absent fields are treated
470
+ as empty so the validation gate catches bad shapes rather than
471
+ letting `undefined` propagate. Targets longer than 512 chars or
472
+ containing control characters are rejected as likely reasoning
473
+ bleed (the model's chain-of-thought leaking into a tool path).
474
+ Plugins can validate or transform via the `entry.recording`
475
+ filter before the row is committed.
476
+ 2. **DISPATCH** — recorded entries fire sequentially via
477
+ `hooks.tools.dispatch`. Each tool runs to completion before the
478
+ next starts. A failed entry sets `abortAfter`; subsequent
479
+ entries record as `outcome="aborted"`. Crashes inside dispatch
480
+ route through `hooks.error.log` at status 500 and trigger the
481
+ same abort cascade. After each entry, `proposal.prepare` lets
482
+ plugins materialize pending 202 proposals (e.g. `set`'s
483
+ search/replace revisions) from the just-recorded entry.
484
+
485
+ Narration outside tags is fine when the turn also emitted at least
486
+ one command — "OK", "Let me check:", reasoning prefixes are natural
487
+ and don't trigger the no-actionable-tags error path.
488
+
437
489
  **Tool dispatch:** Commands are dispatched sequentially in the order
438
490
  the model emitted them. Each tool either succeeds (200), fails (400+),
439
491
  or proposes (202). On failure, all remaining tools are aborted. On
@@ -508,6 +560,68 @@ export default class Rm {
508
560
 
509
561
  A plugin can be multiple types. Known is a tool AND an assembly plugin.
510
562
 
563
+ ### Failure Reporting {#failure_reporting}
564
+
565
+ **The action entry IS its outcome.** Every action plugin's handler
566
+ finalizes the action's own log entry (`log://turn_N/{action}/{slug}`)
567
+ with body, state, and outcome. Success and failure are two values of
568
+ the same shape — only the field values change. The model sees both
569
+ through the same channel, rendered under the action's scheme.
570
+
571
+ ```
572
+ <get path="src/x.js" status="200">…file body…</get> # success
573
+ <get path="src/x.js" state="failed" outcome="not_found"> # failure
574
+ src/x.js not found
575
+ </get>
576
+ ```
577
+
578
+ State + outcome label the verdict; body is the result — file content
579
+ on success, failure message on failure. No separate error entry is
580
+ written for action-level failures; the model finds the failure exactly
581
+ where it would find the success: at the action's own log path.
582
+
583
+ **Strike attribution.** `error.js#verdict` looks up the post-handler
584
+ state of every recorded entry on each turn. Any `state="failed"`
585
+ result counts as a strike. Plugin authors write their action entry
586
+ once with the right state; the strike machinery follows. They never
587
+ call `error.log.emit` for action-level failures.
588
+
589
+ **`error.log.emit` is for actionless failures** — failures that have
590
+ no corresponding action entry to attach to:
591
+
592
+ - Dispatch crash — the framework caught an exception thrown from inside
593
+ a handler before the handler had a chance to write its own entry.
594
+ - Parser-level failures — malformed XML warnings, no-actionable-tags
595
+ responses, fired before any action entry could be recorded.
596
+ - Runtime watchdog firings — `ContextExceededError`, RPC timeout,
597
+ stream timeout — not bound to a specific action.
598
+ - Budget overflow — pre-dispatch rejection.
599
+
600
+ `error.log.emit` writes a `log://turn_N/error/<slug>` entry and
601
+ increments `state.turnErrors`, which also feeds strike accumulation.
602
+ Both channels (action-entry state=failed and `error.log.emit`)
603
+ contribute to the strike streak; either path advances it.
604
+
605
+ **Recording-filter rejection.** Plugins on the `entry.recording` filter
606
+ chain (e.g. `policy`) can return an entry with `state="failed"`. The
607
+ framework writes that entry to the store before returning from
608
+ `#record`, and dispatch skips it. The model sees the rejection at the
609
+ action's own log path, exactly like any other action-level failure.
610
+
611
+ Cycle detection is **silent** — it does not call `error.log.emit`.
612
+ The strike accumulates internally via `state.turnErrors++`; on
613
+ `MAX_STRIKES` the run abandons at 499 with a telemetry-side reason.
614
+ The model sees no special signal, because telling the model "you're
615
+ looping" invites superficial evasion (vary an attribute to bust the
616
+ fingerprint) without addressing the underlying confusion.
617
+
618
+ **Plugin author contract.** Your handler does one job: finalize the
619
+ action's own log entry with the right body/state/outcome. That's the
620
+ whole API for failure reporting. You do not call `error.log.emit`.
621
+ If your handler throws, the framework catches and routes through
622
+ `error.log.emit` at status 500 — that's the only situation where the
623
+ framework writes on your behalf.
624
+
511
625
  ### Mode Enforcement {#mode_enforcement}
512
626
 
513
627
  Two mechanisms, operating at different layers:
@@ -520,9 +634,11 @@ Two mechanisms, operating at different layers:
520
634
  2. **Per-invocation filtering** — the `policy` plugin subscribes to
521
635
  `entry.recording` and inspects individual emissions for ask-mode
522
636
  violations that the tool-list alone can't catch (file-scheme `<set>`
523
- edits, file `<rm>`, file-destination `<mv>`/`<cp>`). Rejects with
524
- status 403 and emits `error://`. The tool remains advertised; the
525
- specific invocation is blocked.
637
+ edits, file `<rm>`, file-destination `<mv>`/`<cp>`). Rejects by
638
+ marking the action entry `state="failed"`, `outcome="permission"`
639
+ with a body describing the rejection, per the failure-reporting
640
+ contract — see [failure_reporting](#failure_reporting). The tool
641
+ remains advertised; the specific invocation is blocked.
526
642
 
527
643
  ### YOLO Mode {#yolo_mode}
528
644
 
@@ -634,11 +750,13 @@ log://turn_N/{action}/{slug} scheme=log category=logging status=202
634
750
  {action}://turn_N/{slug}_1 scheme={action} category=data status=102 → 200/500
635
751
  body: primary stream (stdout for shell)
636
752
  summary="{command}" visibility=summarized
637
- (renders in <context>)
753
+ (line in <summarized>; full body in
754
+ <visible> when promoted)
638
755
 
639
756
  {action}://turn_N/{slug}_2 scheme={action} category=data status=102 → 200/500
640
757
  body: alt stream (stderr for shell)
641
- (renders in <context>, often empty)
758
+ (line in <summarized>; full body in
759
+ <visible> when promoted, often empty)
642
760
  ```
643
761
 
644
762
  `{action}` is the producer plugin's name (`sh`, `env`, future: `search`,
@@ -653,6 +771,16 @@ channels. Non-process producers (search, fetch) map their streams onto
653
771
  the same numeric space: `_1` for the primary data stream, `_2` for
654
772
  anomalies/errors, `_3`+ for auxiliary streams.
655
773
 
774
+ **Search prefetch.** The `search` producer (provided by `rummy.web`
775
+ when wired) may prefetch its result URLs as separate `<https>` data
776
+ entries before the model emits any `<get>`. The model sees those
777
+ pages as already-summarized data without having explicitly loaded
778
+ them. Auditors reading dumps should be aware: the absence of a
779
+ corresponding `log://turn_N/get/` for a URL does **not** mean the
780
+ URL wasn't loaded — it may have arrived via search prefetch. The
781
+ prefetch policy is the search plugin's implementation detail; the
782
+ data entries themselves obey the streaming-producer shape above.
783
+
656
784
  **Status 102 ("Processing") marks an entry in mid-stream:** body is
657
785
  partial, will change; tokens grow as chunks arrive. Agents reading a
658
786
  102 entry use `<get>` with `line`/`limit` (including negative `line`
@@ -684,11 +812,26 @@ Two messages per turn. System = stable truth. User = active task.
684
812
  instructions text
685
813
  (instructions.md base template + tool docs injected via
686
814
  instructions.toolDocs filter; optional persona appended)
687
- <context>
688
- all category=data entries (knowns, files, http/https),
689
- wrapped by known.js on assembly.system at priority 100
690
- </context>
691
815
  [user message]
816
+ <summarized>
817
+ one entry per category=data entry whose visibility is visible
818
+ or summarized; plus the named carve-out (archived prompts pass
819
+ through with visibility="archived" so the model can <get> the
820
+ active prompt back). Each entry renders under its scheme tag
821
+ with its summarized projection as the tag body — this is the
822
+ compact-but-informative view produced by the plugin's summary()
823
+ hook (e.g. truncated knowns, code symbols for files, page
824
+ abstracts for URLs). Identity-keyed, slow-mutating: only grows
825
+ when a new entry lands. (known.js, assembly.user priority 50)
826
+ </summarized>
827
+ <visible>
828
+ each category=data entry whose visibility is visible, rendered
829
+ under its scheme tag with its visible projection as the tag
830
+ body (full body per the plugin's visible() hook). Working-set:
831
+ append on promote, remove on demote. A visible entry exists in
832
+ BOTH blocks — summary projection up top, full body below.
833
+ (known.js, assembly.user priority 75)
834
+ </visible>
692
835
  <log>
693
836
  action history — log:// entries + pre-latest prompts
694
837
  (log.js, assembly.user priority 100)
@@ -705,13 +848,23 @@ Two messages per turn. System = stable truth. User = active task.
705
848
  ```
706
849
 
707
850
  **System** = stable world state the model operates within (identity,
708
- tools, tool docs, reference context). Stable across turns within a
709
- run, which keeps prompt caching intact. **User** = active work (what
710
- the model is doing right now): history, open questions, current
711
- phase, and current prompt. The phase-specific `<instructions>` block
712
- lives in the user message precisely *because* it changes between
713
- turns putting it in system would invalidate the cache on every
714
- phase transition.
851
+ tools, tool docs). Stable across turns within a run, which keeps
852
+ prompt caching intact. **User** = active work (what the model is
853
+ doing right now): the project's data surface, history, open questions,
854
+ current phase, and current prompt. Both phase-specific
855
+ `<instructions>` and the codebase blocks (`<summarized>` / `<visible>`)
856
+ live in the user message because they change turn-to-turn putting
857
+ mutable state in system would invalidate the cache on every promote
858
+ or phase transition.
859
+
860
+ **Why two blocks instead of one `<context>`.** Promote/demote is the
861
+ dominant intra-phase operation. A single-block render
862
+ invalidates the entire data surface every time. With the split,
863
+ `<summarized>` mutates only when a new entry lands (slow); `<visible>`
864
+ mutates on every promote/demote (fast). Ordering slow-above-fast
865
+ preserves the prefix cache for `<summarized>` across the common case.
866
+ Cognitively: `<summarized>` is "what I know exists" (identity);
867
+ `<visible>` is "what I'm reading right now" (working memory).
715
868
 
716
869
  The `<prompt>` tag is present on every turn — first turn and
717
870
  continuations alike. The model always sees its task. The active prompt
@@ -896,6 +1049,22 @@ and emits a 413 error via `hooks.error.log.emit` with the descriptive
896
1049
  body (what was demoted, the 50% rule for the next turn). The model
897
1050
  sees the `error://` entry next turn and adjusts.
898
1051
 
1052
+ **Delta-from-actual prediction.** Post-dispatch uses
1053
+ `predictNextPacket = lastContextTokens + Σ countTokens(body) for rows added this turn`,
1054
+ not the conservative measureMessages estimator. Reason: a 60%+
1055
+ divergence between the pre-call `<prompt tokenUsage>` (real API
1056
+ prompt_tokens) and the post-check estimator made the model dismiss
1057
+ the budget as janky and stop following demote rules. The two numbers
1058
+ must live on the same scale.
1059
+
1060
+ **Prior-turn-pressure fallback.** If post-dispatch finds nothing to
1061
+ demote in the current turn but the packet still overflows, the
1062
+ pressure is coming from prior-turn promotions the model never demoted
1063
+ itself. Demotion widens to all currently-visible entries in the run
1064
+ and the prompt is also demoted. Without this fallback, observed
1065
+ behavior was strikes accumulating on runs whose base context had
1066
+ drifted over ceiling through no fault of the current turn.
1067
+
899
1068
  **LLM-reported context exceeded.** If the LLM rejects the request
900
1069
  with a "context too long" error (detected via the regex in
901
1070
  `src/llm/errors.js`), the LlmProvider raises `ContextExceededError`
@@ -1075,6 +1244,16 @@ is raw JSON; parse client-side. Mid-turn emissions have `telemetry:
1075
1244
  null`; the final emission of each turn includes the full telemetry
1076
1245
  block (token usage, context distribution, cost).
1077
1246
 
1247
+ **Telemetry completeness guarantee.** Every `run/state` emission
1248
+ computes a real budget from real numbers — never undefined, never
1249
+ synthesized. When no fresh turn result is available
1250
+ (abort/max-turns/crash paths fire before any turn executed, or after
1251
+ a turn that produced no tokens), `AgentLoop.#emitRunState` reads the
1252
+ last turn's `context_tokens` from the DB. If that value is absent, no turn has run
1253
+ yet; zero is the truth, not a fallback. The shape and the math are
1254
+ the same on every code path so the client's renderer never needs to
1255
+ discriminate by emission cause.
1256
+
1078
1257
  `stream/cancelled` payload: `{ run, path, reason }`. Server has
1079
1258
  already transitioned the entries to 499 (`Client Closed Request`);
1080
1259
  client should stop sending `stream` chunks for that path.
@@ -1089,6 +1268,30 @@ client should stop sending `stream` chunks for that path.
1089
1268
  | accept | neither | `running` — healer decides |
1090
1269
  | error | any | `running` — error state, model retries |
1091
1270
 
1271
+ **RPC ack vs run terminal status.** `resolve` and `inject` return the
1272
+ *current* run status (typically 102 mid-run), not 200. The client's
1273
+ dispatch handler must distinguish the synchronous RPC ack from the
1274
+ asynchronous `run/state` notification that carries real terminal
1275
+ state at end-of-turn — otherwise an HTTP-style 200 ack on a
1276
+ successful resolve would prematurely close the document.
1277
+
1278
+ **Proposal hook chain.** Resolution flows through three filter/event
1279
+ hooks plugins can subscribe to:
1280
+
1281
+ - `proposal.accepting` (filter) — first plugin to return
1282
+ `{ allow: false, outcome, body }` vetoes acceptance. The entry
1283
+ resolves to `state="failed"` with the plugin-supplied outcome and
1284
+ body. Used by `policy` for read-only enforcement and similar
1285
+ guards. First veto wins; later filters don't run.
1286
+ - `proposal.content` (filter) — when acceptance proceeds, plugins
1287
+ override the resolved body. Default is `output ?? ""`. The `set`
1288
+ plugin uses this to prefer the proposed body it already staged
1289
+ on the audit entry over whatever literal body the client passed
1290
+ through `resolve`.
1291
+ - `proposal.accepted` / `proposal.rejected` (events) — fired after
1292
+ the resolution is committed; plugins side-effect on either
1293
+ outcome.
1294
+
1092
1295
  ---
1093
1296
 
1094
1297
  ## Plugin System {#plugin_system}
@@ -1165,12 +1368,12 @@ one unified mechanism.
1165
1368
 
1166
1369
  ### Pattern Distribution
1167
1370
 
1168
- Hedbergian pattern matching (globs, body filters, preview) is taught
1371
+ Hedbergian pattern matching (globs, body filters, manifest) is taught
1169
1372
  across multiple tools, not concentrated in one. `get` shows content
1170
- filtering. `cp` shows glob batch operations. `rm` shows preview safety.
1373
+ filtering. `cp` shows glob batch operations. `rm` shows manifest safety.
1171
1374
  Each tool reinforces the pattern vocabulary from a different angle.
1172
1375
  A model that sees `path="known://*"` in get, `path="known://plan_*"` in
1173
- cp, and `path="known://temp_*" preview` in rm learns that patterns
1376
+ cp, and `path="known://temp_*" manifest` in rm learns that patterns
1174
1377
  are universal — not a feature of any single tool.
1175
1378
 
1176
1379
  ---
@@ -1205,11 +1408,14 @@ Termination protocol:
1205
1408
  (the claim of doneness is refuted by the failures)
1206
1409
  - `<update status="102">` → run continues
1207
1410
  - Multiple `<update>` → last one wins
1208
- - No `<update>` + investigation tools → stall counter (RUMMY_MAX_STALLS)
1209
1411
  - No `<update>` + action-only tools → healer infers terminal from body
1210
1412
  - No `<update>` + plain text → healer infers terminal from body
1211
- - Repeated commands cycle detection (RUMMY_MIN_CYCLES, RUMMY_MAX_CYCLE_PERIOD)
1212
- - Repeated update text without non-update work → stall (RUMMY_MAX_UPDATE_REPEATS)
1413
+ - Repeated turn fingerprints (commands, attributes, or empty turns) →
1414
+ cycle detection (`RUMMY_MIN_CYCLES`, `RUMMY_MAX_CYCLE_PERIOD`); after
1415
+ detection, strikes accumulate up to `RUMMY_MAX_STRIKES` then close 499.
1416
+ - Hard ceiling: `RUMMY_MAX_LOOP_TURNS` caps turns within a single loop,
1417
+ regardless of any other guard. There is no per-run cap; a run may
1418
+ comprise many loops.
1213
1419
 
1214
1420
  Format normalization:
1215
1421
  - Gemma ```` ```tool_code ```` fences → stripped before parsing
@@ -1218,6 +1424,78 @@ Format normalization:
1218
1424
  - Mistral `[TOOL_CALLS]` → normalized to XML
1219
1425
  - Sed alternate delimiters (`s|old|new|`) → parsed like `s/old/new/`
1220
1426
 
1427
+ ### XML Parser {#xml_parser}
1428
+
1429
+ `src/agent/XmlParser.js` is the syntax layer between raw model output
1430
+ and the dispatch pipeline. Models routinely emit malformed XML —
1431
+ unclosed tags, missing slashes, mismatched closes, unterminated
1432
+ attribute values, embedded code-fences, training-format tool calls.
1433
+ The parser's contract is: never throw, never silently drop a tool
1434
+ call, surface every recovery as a warning so error.log can route it.
1435
+
1436
+ **Pre-flight repair pipeline** (order is load-bearing):
1437
+
1438
+ 1. `#normalizeToolCalls` — translate native training formats (gemma
1439
+ ```` ```tool_code\n<xml>\n``` ````, Qwen `<|tool_call>call:NAME{...}`,
1440
+ OpenAI `{"name":"...","arguments":{...}}`, Anthropic
1441
+ `<tool_use><name>...</name><input>{...}</input></tool_use>`,
1442
+ Mistral `[TOOL_CALLS] [{...}]`, harmony role/channel pseudo-tags
1443
+ `<|channel>` / `<channel|>`). Catch-all malformed `<|tool_call>`
1444
+ tokens become `<error>` blocks (in prose — never with literal
1445
+ `<get>`/`<set>`/etc. tags, which would re-enter the parser as
1446
+ phantom tool calls).
1447
+ 2. `#neutralizeCodeSpans` — entity-encode tag brackets inside
1448
+ backtick spans (`` `<get/>` `` → `` `&lt;get/&gt;` ``). Models
1449
+ quote instructions; quoted tool names must not parse.
1450
+ 3. `#correctMismatchedCloses` — at outer tool depth (stack=1),
1451
+ rewrite `</WRONG>` to `</RIGHT>`. htmlparser2 silently drops
1452
+ unmatched closes, which would make the explicit recovery path
1453
+ unreachable and absorb every sibling command as body text.
1454
+ Conservative: only outermost depth; nested mismatches inside
1455
+ tool bodies are left alone (bodies are opaque, see below).
1456
+ 4. `#balanceAttrQuotes` — close `ATTR="..."` values that never
1457
+ quote-close before the next tag. Without this repair,
1458
+ htmlparser2 consumes the rest of input as one giant attribute
1459
+ value and silently drops every subsequent tool call. Triggers
1460
+ only when the value contains no quote, no `>`, and is followed
1461
+ by another tag opening or close.
1462
+
1463
+ **Body opacity.** Tool bodies are opaque text, not nested XML. The
1464
+ model writing a plan with `<get/>` examples in it, SEARCH/REPLACE
1465
+ markers in `<set>`, or XML samples in `<known>` all need to survive
1466
+ intact. Nested tag opens push onto a per-tool stack; matching closes
1467
+ pop. Orphan closes that don't match the stack but match a known tool
1468
+ name are treated as recovery (likely typo); unknown orphan closes
1469
+ are kept as body text.
1470
+
1471
+ **Empty-body recovery.** A new tool tag opens while the current tool
1472
+ has no body content yet — the model meant the current tag to
1473
+ self-close but typed it paired, or emitted a mismatched close that
1474
+ htmlparser2 dropped. Close current, open new, emit recovery warning.
1475
+
1476
+ **Per-tool attr-vs-body resolution** (`resolveCommand`). Tools accept
1477
+ attributes on the open tag *and* body text inside the tag. If the
1478
+ canonical attribute is missing, the body silently fills it. The
1479
+ shape per tool:
1480
+ - `set` — structured edit detection (merge-conflict markers, udiff,
1481
+ Claude `<old_text>` XML, JSON `{search,replace}`, sed `s/.../.../`,
1482
+ attribute-mode `search=`/`replace=`, body-as-search-when-`body=`
1483
+ attr-set, plain write).
1484
+ - `update` — body fills `body`, status defaults to 102 if absent.
1485
+ - `get` / `rm` — attr `path` or body fills target. Spread `a` so
1486
+ `line` / `limit` / `visibility` / future attrs reach the handler.
1487
+ - `search` — attr `path` or body fills target; `results` numeric.
1488
+ - `mv` / `cp` — attr `path` (source); attr `to` or body fills dest.
1489
+ Spread `a` so `visibility` reaches the handler for batch
1490
+ visibility changes.
1491
+ - `sh` / `env` — attr `command` or body fills the command.
1492
+ - `ask_user` — attr `question`; attr `options` or body for options.
1493
+
1494
+ **Tool-call cap.** `RUMMY_MAX_COMMANDS` caps the number of tool
1495
+ calls per turn. When the cap is hit, remaining commands are dropped with a warning;
1496
+ the model sees one structured error so it can adjust on the next
1497
+ turn rather than rediscovering silent truncation.
1498
+
1221
1499
  ---
1222
1500
 
1223
1501
  ## Testing
@@ -1381,10 +1659,9 @@ Full reference is `.env.example` — these are the load-bearing vars.
1381
1659
 
1382
1660
  | Var | Default | Purpose |
1383
1661
  |-----|---------|---------|
1384
- | `RUMMY_MAX_TURNS` | 15 | Hard loop iteration cap |
1662
+ | `RUMMY_MAX_LOOP_TURNS` | 99 | Per-loop turn cap (no per-run cap) |
1385
1663
  | `RUMMY_MAX_COMMANDS` | 99 | Max parsed tool calls per turn |
1386
- | `RUMMY_MAX_STALLS` | 3 | Turns without `<update>` before force-complete |
1387
- | `RUMMY_MAX_UPDATE_REPEATS` | 3 | Same-text repeat threshold without progress |
1664
+ | `RUMMY_MAX_STRIKES` | 3 | Strikes (errors or detected cycles) before close at 499 |
1388
1665
  | `RUMMY_MIN_CYCLES` | 3 | Consecutive repetitions to trigger cycle detection |
1389
1666
  | `RUMMY_MAX_CYCLE_PERIOD` | 4 | Max cycle period checked by healer |
1390
1667
  | `RUMMY_RETENTION_DAYS` | 31 | Days of completed/aborted runs kept |
@@ -1,13 +1,13 @@
1
1
  import { existsSync, mkdirSync, copyFileSync } from "node:fs";
2
2
  import { join, dirname } from "node:path";
3
3
  import { fileURLToPath } from "node:url";
4
- import { homedir } from "node:os";
4
+ import resolveRummyHome from "../src/agent/rummyHome.js";
5
5
 
6
6
  const __dirname = dirname(fileURLToPath(import.meta.url));
7
7
  const packageRoot = join(__dirname, "..");
8
8
  const envExample = join(packageRoot, ".env.example");
9
9
 
10
- const rummyHome = process.env.RUMMY_HOME || join(homedir(), ".rummy");
10
+ const rummyHome = resolveRummyHome();
11
11
 
12
12
  if (!existsSync(rummyHome)) {
13
13
  mkdirSync(rummyHome, { recursive: true });
package/bin/rummy.js CHANGED
@@ -3,12 +3,12 @@
3
3
  import { existsSync } from "node:fs";
4
4
  import { isAbsolute, join, dirname } from "node:path";
5
5
  import { fileURLToPath } from "node:url";
6
- import { homedir } from "node:os";
6
+ import resolveRummyHome from "../src/agent/rummyHome.js";
7
7
 
8
8
  const __dirname = dirname(fileURLToPath(import.meta.url));
9
9
  const packageRoot = join(__dirname, "..");
10
10
 
11
- const rummyHome = process.env.RUMMY_HOME || join(homedir(), ".rummy");
11
+ const rummyHome = resolveRummyHome();
12
12
 
13
13
  // Base dir for env files: cwd if it has .env.example, else $RUMMY_HOME.
14
14
  // The package's own .env.example is never consulted — silent package-