onbuzz 4.8.1 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -481,6 +481,147 @@ describe('TaskManagerTool', () => {
481
481
  expect(result.error).toContain('Invalid status');
482
482
  });
483
483
 
484
+ // ── Destructive-sync guardrail ───────────────────────────────────
485
+ // Real production failure: post-compaction, an agent forgot it had
486
+ // a 9-task plan and called sync with 4 unrelated tasks → all 9
487
+ // were silently dropped. These tests pin the guardrail.
488
+
489
+ describe('REGRESSION: destructive-sync guardrail', () => {
490
+ // Helper: seed the agent with two pending + one in_progress task,
491
+ // so each test can then attempt its own sync against that plan.
492
+ async function setupAgentWithPlan() {
493
+ const { tool, context } = createTestSetup();
494
+ await tool.execute({
495
+ action: 'sync',
496
+ tasks: [
497
+ { title: 'Char select bg', status: 'in_progress', priority: 'high' },
498
+ { title: 'Board art', status: 'pending', priority: 'high' },
499
+ { title: 'Dice animation', status: 'pending', priority: 'medium' },
500
+ ],
501
+ }, context);
502
+ return { tool, context };
503
+ }
504
+
505
+ test('REFUSES sync that would drop pending/in_progress tasks without confirmReplace', async () => {
506
+ const { tool, context } = await setupAgentWithPlan();
507
+ const result = await tool.execute({
508
+ action: 'sync',
509
+ tasks: [
510
+ { title: 'Add Settings UI', status: 'pending', priority: 'high' },
511
+ { title: 'Add Settings logic', status: 'pending', priority: 'high' },
512
+ ],
513
+ }, context);
514
+ expect(result.success).toBe(false);
515
+ expect(result.error).toMatch(/Sync would drop 3 non-terminal task/);
516
+ // Must name the at-risk tasks so the agent can see them.
517
+ expect(result.error).toContain('Char select bg');
518
+ expect(result.error).toContain('Board art');
519
+ expect(result.error).toContain('Dice animation');
520
+ // Must explain the escape hatch.
521
+ expect(result.error).toContain('confirmReplace: true');
522
+ });
523
+
524
+ test('PROCEEDS when confirmReplace=true is explicitly set', async () => {
525
+ const { tool, context } = await setupAgentWithPlan();
526
+ const result = await tool.execute({
527
+ action: 'sync',
528
+ confirmReplace: true,
529
+ tasks: [
530
+ { title: 'Add Settings UI', status: 'pending', priority: 'high' },
531
+ ],
532
+ }, context);
533
+ expect(result.success).toBe(true);
534
+ expect(result.result.summary.total).toBe(1);
535
+ expect(result.result.summary.removed).toBe(3);
536
+ });
537
+
538
+ test('does NOT trigger when the incoming list keeps every open task (rename only)', async () => {
539
+ const { tool, context } = await setupAgentWithPlan();
540
+ // Update statuses, but keep all titles. No drops.
541
+ const result = await tool.execute({
542
+ action: 'sync',
543
+ tasks: [
544
+ { title: 'Char select bg', status: 'completed', priority: 'high' },
545
+ { title: 'Board art', status: 'in_progress', priority: 'high' },
546
+ { title: 'Dice animation', status: 'pending', priority: 'medium' },
547
+ ],
548
+ }, context);
549
+ expect(result.success).toBe(true);
550
+ expect(result.result.summary.updated).toBe(3);
551
+ expect(result.result.summary.removed).toBe(0);
552
+ });
553
+
554
+ test('does NOT trigger when only completed/cancelled tasks would be dropped', async () => {
555
+ const { tool, context } = createTestSetup();
556
+ // Seed with one done task + one pending.
557
+ await tool.execute({
558
+ action: 'sync',
559
+ tasks: [
560
+ { title: 'Already done', status: 'completed', priority: 'low' },
561
+ { title: 'Still going', status: 'pending', priority: 'high' },
562
+ ],
563
+ }, context);
564
+ // New sync drops the completed one but keeps the pending one.
565
+ const result = await tool.execute({
566
+ action: 'sync',
567
+ tasks: [{ title: 'Still going', status: 'in_progress', priority: 'high' }],
568
+ }, context);
569
+ // No guardrail trip — completed task is safe to drop.
570
+ expect(result.success).toBe(true);
571
+ });
572
+
573
+ test('error response includes droppedTasks metadata for programmatic recovery', async () => {
574
+ const { tool, context } = await setupAgentWithPlan();
575
+ const result = await tool.execute({
576
+ action: 'sync',
577
+ tasks: [{ title: 'New thing', status: 'pending', priority: 'high' }],
578
+ }, context);
579
+ expect(result.success).toBe(false);
580
+ // BaseTool's execute() catches the thrown Error; the message
581
+ // carries the human-readable hint. We assert on the hint
582
+ // contents — that's what the agent sees.
583
+ expect(result.error).toMatch(/3 non-terminal task/);
584
+ });
585
+
586
+ test('REGRESSION: the exact Talisman failure scenario is now blocked', async () => {
587
+ // Reproduce the production failure: agent has a 9-task plan
588
+ // reflecting the user's UI revision request. Post-compaction,
589
+ // agent loses context and tries to sync a 4-task Settings plan.
590
+ const { tool, context } = createTestSetup();
591
+ await tool.execute({
592
+ action: 'sync',
593
+ tasks: [
594
+ { title: 'Explore current code and image assets', status: 'in_progress', priority: 'high' },
595
+ { title: 'Generate character select background', status: 'pending', priority: 'high' },
596
+ { title: 'Generate board space art for all nodes', status: 'pending', priority: 'high' },
597
+ { title: 'Generate adventure card art', status: 'pending', priority: 'medium' },
598
+ { title: 'Fix board to rectangular layout', status: 'pending', priority: 'high' },
599
+ { title: 'Fix character select sticky buttons', status: 'pending', priority: 'medium' },
600
+ { title: 'Fix dice animation and combat', status: 'pending', priority: 'high' },
601
+ { title: 'Remove all emojis from UI', status: 'pending', priority: 'medium' },
602
+ { title: 'Test and verify all changes', status: 'pending', priority: 'medium' },
603
+ ],
604
+ }, context);
605
+ // The agent now (mistakenly) tries to sync a Settings plan.
606
+ const result = await tool.execute({
607
+ action: 'sync',
608
+ tasks: [
609
+ { title: 'Add Settings UI to index.html', status: 'pending', priority: 'high' },
610
+ { title: 'Add Settings logic to game.js', status: 'pending', priority: 'high' },
611
+ { title: 'Wire Settings to title screen', status: 'pending', priority: 'medium' },
612
+ { title: 'Test Settings persistence', status: 'pending', priority: 'medium' },
613
+ ],
614
+ }, context);
615
+ expect(result.success).toBe(false);
616
+ // All 9 original tasks must be named; spot-check a few titles plus the count.
617
+ expect(result.error).toContain('Explore current code');
618
+ expect(result.error).toContain('Generate character select background');
619
+ expect(result.error).toContain('Fix dice animation and combat');
620
+ expect(result.error).toContain('Remove all emojis from UI');
621
+ expect(result.error).toMatch(/Sync would drop 9 non-terminal task/);
622
+ });
623
+ });
624
+
484
625
  test('enforces only one in_progress task', async () => {
485
626
  const { tool, agent, context } = createTestSetup();
486
627
  await tool.execute({
@@ -852,14 +852,20 @@ class ToolsRegistry {
852
852
  }
853
853
 
854
854
  if (hasMemory) {
855
- description += '**When you recognize the work is multi-turn or multi-session:**\n';
856
- description += '- Save a `memory` entry with title starting `plan/` (e.g. `plan/refactor-auth`). The content auto-injects into your system prompt every turn under "AGENT WORKING PLAN" until you delete it. This is how you survive compaction — anything important enough to remember next session belongs here.\n';
857
- description += '- Update or delete the plan as the situation changes. A stale plan is worse than no plan.\n\n';
858
- }
859
-
860
- if (hasMemory) {
861
- description += '**When you learn something durable** (a user preference, a non-obvious constraint, an architectural fact the next agent will want):\n';
862
- description += '- Save it as a `memory` (non-`plan/` title). One-shot facts go here, NOT in your reply.\n\n';
855
+ // Concrete event-based write triggers. The previous version asked
856
+ // the agent to "save a memory when you recognize the work is
857
+ // multi-turn" — that's a judgment call, and in practice agents
858
+ // never made the call. Observed in production: 670 messages,
859
+ // 0 memory writes, despite the OPERATING POSTURE block being
860
+ // present in the system prompt. The fix is to replace "when
861
+ // you recognize" with concrete event triggers the model can
862
+ // pattern-match on without judgment.
863
+ description += '**WRITE memory on these events (not "when you think it\'s a good idea" — these are mandatory):**\n';
864
+ description += '- **A new user message contains a numbered list, a multi-bullet ask, OR more than ~30 words of substantive request.** Your VERY NEXT tool call must be `memory` → `add` with title `plan/<short-topic>` and content = the user\'s entire message verbatim. Do this BEFORE any other tool call, including taskmanager. Why mandatory: compaction may later destroy that message and the agent (you, next session) will not remember what the user actually asked for.\n';
865
+ description += '- **You are about to call `taskmanager` → `sync` with more than 3 tasks at once.** First, save a `memory` titled `plan/<feature>` containing the user\'s original request + your rationale for the plan. The task list is fragile (it can be wiped by a later sync); the plan/* memory is the source of truth.\n';
866
+ description += '- **You made a non-obvious decision** (chose approach A over B, fixed a tricky bug, discovered a constraint while exploring code, hit an unexpected error and figured out the cause). Save it as a `memory` with a non-`plan/` title before the next tool call. Format: title = the conclusion, content = the evidence. The next agent will not re-derive it.\n';
867
+ description += '- **The user gave you a preference or rule that should apply to all future work** ("never use emojis", "always use Tailwind", "the API base URL is X"). Save it immediately as a `memory`.\n\n';
868
+ description += '**`plan/*` memories auto-inject into your system prompt every turn under `## AGENT WORKING PLAN` — that\'s how they survive compaction. Update or delete them as the situation evolves; a stale plan is worse than no plan.**\n\n';
863
869
  }
864
870
 
865
871
  description += '**Distinction:**\n';
@@ -219,7 +219,7 @@ ACTIONS:
219
219
 
220
220
  EXAMPLES:
221
221
 
222
- Sync task list (RECOMMENDED):
222
+ Sync task list (RECOMMENDED — but BEWARE: replaces the whole list):
223
223
  \`\`\`json
224
224
  {
225
225
  "toolId": "taskmanager",
@@ -234,6 +234,27 @@ Sync task list (RECOMMENDED):
234
234
  }
235
235
  \`\`\`
236
236
 
237
+ ⚠️ **DESTRUCTIVE-SYNC GUARDRAIL** — sync replaces the entire task list. Any existing pending/in_progress task whose title doesn't match an incoming task is dropped. If you don't intend to drop those tasks, sync will REFUSE the call and tell you which tasks were at risk.
238
+
239
+ To proceed with a destructive sync intentionally, add \`"confirmReplace": true\`:
240
+ \`\`\`json
241
+ {
242
+ "toolId": "taskmanager",
243
+ "actions": [{
244
+ "type": "sync",
245
+ "confirmReplace": true,
246
+ "tasks": [ /* the full new plan */ ]
247
+ }]
248
+ }
249
+ \`\`\`
250
+
251
+ **Before issuing sync, prefer one of these instead** (they don't risk dropping tasks):
252
+ - \`{"type": "list"}\` first — see what tasks already exist before deciding to replace them.
253
+ - \`{"type": "create", "title": "..."}\` — add a single task without touching the rest.
254
+ - \`{"type": "update", "taskId": "...", "status": "in_progress"}\` — change one task.
255
+
256
+ This matters most right after compaction: the conversation history that mentioned your previous sync may have been compressed, but your task list is still there. Always \`list\` before \`sync\` if you suspect context loss.
257
+
237
258
  Create a task:
238
259
  \`\`\`json
239
260
  {
@@ -650,7 +671,7 @@ Always use a detailed task description to provide context for the task, and leve
650
671
  * @private
651
672
  */
652
673
  async syncTasks(agent, params, context) {
653
- let { tasks } = params;
674
+ let { tasks, confirmReplace } = params;
654
675
 
655
676
  // Parse tasks if provided as JSON string
656
677
  if (typeof tasks === 'string') {
@@ -669,6 +690,55 @@ Always use a detailed task description to provide context for the task, and leve
669
690
  throw new Error('Tasks array cannot be empty');
670
691
  }
671
692
 
693
+ // ── DESTRUCTIVE-SYNC GUARDRAIL ───────────────────────────────────
694
+ // Real failure observed in production: an agent (post-compaction)
695
+ // lost track of its existing 9-task plan and called sync with a
696
+ // 4-task list of unrelated work. sync silently dropped all 9
697
+ // → agent built the wrong thing → user had to repeat themselves
698
+ // multiple times before the agent finally re-read the user request.
699
+ //
700
+ // Rule: if this sync would drop ANY pending or in_progress task
701
+ // whose title doesn't match an incoming task, refuse the call
702
+ // unless the agent explicitly passes `confirmReplace: true`.
703
+ // Completed/cancelled tasks can be silently pruned — they're done.
704
+ //
705
+ // Why this isn't too strict: matching is lenient (case-insensitive, trimmed
706
+ // title compare; see findExistingTask below). An honest plan
707
+ // refinement that renames a few tasks will hit this guard, which
708
+ // is correct — the agent should acknowledge it's replacing work.
709
+ const existingTasksForGuard = agent.taskList?.tasks || [];
710
+ const incomingTitles = new Set(
711
+ tasks
712
+ .map(t => (t.title || '').toLowerCase().trim())
713
+ .filter(Boolean)
714
+ );
715
+ const dropped = existingTasksForGuard.filter(t => {
716
+ // Already terminal — safe to drop.
717
+ if (t.status === 'completed' || t.status === 'cancelled') return false;
718
+ // Match against any incoming title — same fuzzy rule used below.
719
+ return !incomingTitles.has((t.title || '').toLowerCase().trim());
720
+ });
721
+
722
+ if (dropped.length > 0 && confirmReplace !== true) {
723
+ const summary = dropped
724
+ .map(t => ` - [${t.status}] ${t.title}`)
725
+ .join('\n');
726
+ const hint = [
727
+ `Sync would drop ${dropped.length} non-terminal task(s) that don't match any incoming title:`,
728
+ summary,
729
+ '',
730
+ 'If this is intentional (you really mean to replace the plan), retry with `confirmReplace: true`.',
731
+ 'If you instead want to ADD tasks without dropping existing ones, use action "create" per task.',
732
+ 'If you want to keep an existing task, include its title verbatim in the incoming list.',
733
+ '',
734
+ 'This guardrail prevents post-compaction context loss from silently destroying in-flight work.',
735
+ ].join('\n');
736
+ const err = new Error(hint);
737
+ err.code = 'SYNC_WOULD_DROP_OPEN_TASKS';
738
+ err.droppedTasks = dropped.map(t => ({ id: t.id, title: t.title, status: t.status, priority: t.priority }));
739
+ throw err;
740
+ }
741
+
672
742
  const timestamp = new Date().toISOString();
673
743
  const existingTasks = agent.taskList.tasks || [];
674
744
  const updatedTasks = [];