onbuzz 4.8.1 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/__tests__/agentPool.test.js +185 -0
- package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
- package/src/core/agentPool.js +307 -0
- package/src/core/agentScheduler.js +42 -0
- package/src/services/__tests__/conversationCompactionService.test.js +141 -0
- package/src/services/conversationCompactionService.js +120 -46
- package/src/tools/__tests__/baseTool.test.js +29 -0
- package/src/tools/__tests__/codeMapTool.test.js +179 -0
- package/src/tools/__tests__/taskManagerTool.test.js +141 -0
- package/src/tools/baseTool.js +14 -8
- package/src/tools/taskManagerTool.js +72 -2
|
@@ -481,6 +481,147 @@ describe('TaskManagerTool', () => {
|
|
|
481
481
|
expect(result.error).toContain('Invalid status');
|
|
482
482
|
});
|
|
483
483
|
|
|
484
|
+
// ── Destructive-sync guardrail ───────────────────────────────────
|
|
485
|
+
// Real production failure: post-compaction, an agent forgot it had
|
|
486
|
+
// a 9-task plan and called sync with 4 unrelated tasks → all 9
|
|
487
|
+
// were silently dropped. These tests pin the guardrail.
|
|
488
|
+
|
|
489
|
+
describe('REGRESSION: destructive-sync guardrail', () => {
|
|
490
|
+
// Helper: seed the agent with two pending + one in_progress task,
|
|
491
|
+
// so each test can then attempt a sync with a different list.
|
|
492
|
+
async function setupAgentWithPlan() {
|
|
493
|
+
const { tool, context } = createTestSetup();
|
|
494
|
+
await tool.execute({
|
|
495
|
+
action: 'sync',
|
|
496
|
+
tasks: [
|
|
497
|
+
{ title: 'Char select bg', status: 'in_progress', priority: 'high' },
|
|
498
|
+
{ title: 'Board art', status: 'pending', priority: 'high' },
|
|
499
|
+
{ title: 'Dice animation', status: 'pending', priority: 'medium' },
|
|
500
|
+
],
|
|
501
|
+
}, context);
|
|
502
|
+
return { tool, context };
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
test('REFUSES sync that would drop pending/in_progress tasks without confirmReplace', async () => {
|
|
506
|
+
const { tool, context } = await setupAgentWithPlan();
|
|
507
|
+
const result = await tool.execute({
|
|
508
|
+
action: 'sync',
|
|
509
|
+
tasks: [
|
|
510
|
+
{ title: 'Add Settings UI', status: 'pending', priority: 'high' },
|
|
511
|
+
{ title: 'Add Settings logic', status: 'pending', priority: 'high' },
|
|
512
|
+
],
|
|
513
|
+
}, context);
|
|
514
|
+
expect(result.success).toBe(false);
|
|
515
|
+
expect(result.error).toMatch(/Sync would drop 3 non-terminal task/);
|
|
516
|
+
// Must name the at-risk tasks so the agent can see them.
|
|
517
|
+
expect(result.error).toContain('Char select bg');
|
|
518
|
+
expect(result.error).toContain('Board art');
|
|
519
|
+
expect(result.error).toContain('Dice animation');
|
|
520
|
+
// Must explain the escape hatch.
|
|
521
|
+
expect(result.error).toContain('confirmReplace: true');
|
|
522
|
+
});
|
|
523
|
+
|
|
524
|
+
test('PROCEEDS when confirmReplace=true is explicitly set', async () => {
|
|
525
|
+
const { tool, context } = await setupAgentWithPlan();
|
|
526
|
+
const result = await tool.execute({
|
|
527
|
+
action: 'sync',
|
|
528
|
+
confirmReplace: true,
|
|
529
|
+
tasks: [
|
|
530
|
+
{ title: 'Add Settings UI', status: 'pending', priority: 'high' },
|
|
531
|
+
],
|
|
532
|
+
}, context);
|
|
533
|
+
expect(result.success).toBe(true);
|
|
534
|
+
expect(result.result.summary.total).toBe(1);
|
|
535
|
+
expect(result.result.summary.removed).toBe(3);
|
|
536
|
+
});
|
|
537
|
+
|
|
538
|
+
test('does NOT trigger when the incoming list keeps every open task (rename only)', async () => {
|
|
539
|
+
const { tool, context } = await setupAgentWithPlan();
|
|
540
|
+
// Update statuses, but keep all titles. No drops.
|
|
541
|
+
const result = await tool.execute({
|
|
542
|
+
action: 'sync',
|
|
543
|
+
tasks: [
|
|
544
|
+
{ title: 'Char select bg', status: 'completed', priority: 'high' },
|
|
545
|
+
{ title: 'Board art', status: 'in_progress', priority: 'high' },
|
|
546
|
+
{ title: 'Dice animation', status: 'pending', priority: 'medium' },
|
|
547
|
+
],
|
|
548
|
+
}, context);
|
|
549
|
+
expect(result.success).toBe(true);
|
|
550
|
+
expect(result.result.summary.updated).toBe(3);
|
|
551
|
+
expect(result.result.summary.removed).toBe(0);
|
|
552
|
+
});
|
|
553
|
+
|
|
554
|
+
test('does NOT trigger when only completed/cancelled tasks would be dropped', async () => {
|
|
555
|
+
const { tool, context } = createTestSetup();
|
|
556
|
+
// Seed with one done task + one pending.
|
|
557
|
+
await tool.execute({
|
|
558
|
+
action: 'sync',
|
|
559
|
+
tasks: [
|
|
560
|
+
{ title: 'Already done', status: 'completed', priority: 'low' },
|
|
561
|
+
{ title: 'Still going', status: 'pending', priority: 'high' },
|
|
562
|
+
],
|
|
563
|
+
}, context);
|
|
564
|
+
// New sync drops the completed one but keeps the pending one.
|
|
565
|
+
const result = await tool.execute({
|
|
566
|
+
action: 'sync',
|
|
567
|
+
tasks: [{ title: 'Still going', status: 'in_progress', priority: 'high' }],
|
|
568
|
+
}, context);
|
|
569
|
+
// No guardrail trip — completed task is safe to drop.
|
|
570
|
+
expect(result.success).toBe(true);
|
|
571
|
+
});
|
|
572
|
+
|
|
573
|
+
test('error response includes droppedTasks metadata for programmatic recovery', async () => {
|
|
574
|
+
const { tool, context } = await setupAgentWithPlan();
|
|
575
|
+
const result = await tool.execute({
|
|
576
|
+
action: 'sync',
|
|
577
|
+
tasks: [{ title: 'New thing', status: 'pending', priority: 'high' }],
|
|
578
|
+
}, context);
|
|
579
|
+
expect(result.success).toBe(false);
|
|
580
|
+
// BaseTool's execute() catches the thrown Error; the message
|
|
581
|
+
// carries the human-readable hint. We assert on the hint
|
|
582
|
+
// contents — that's what the agent sees.
|
|
583
|
+
expect(result.error).toMatch(/3 non-terminal task/);
|
|
584
|
+
});
|
|
585
|
+
|
|
586
|
+
test('REGRESSION: the exact Talisman failure scenario is now blocked', async () => {
|
|
587
|
+
// Reproduce the production failure: agent has a 9-task plan
|
|
588
|
+
// reflecting the user's UI revision request. Post-compaction,
|
|
589
|
+
// agent loses context and tries to sync a 4-task Settings plan.
|
|
590
|
+
const { tool, context } = createTestSetup();
|
|
591
|
+
await tool.execute({
|
|
592
|
+
action: 'sync',
|
|
593
|
+
tasks: [
|
|
594
|
+
{ title: 'Explore current code and image assets', status: 'in_progress', priority: 'high' },
|
|
595
|
+
{ title: 'Generate character select background', status: 'pending', priority: 'high' },
|
|
596
|
+
{ title: 'Generate board space art for all nodes', status: 'pending', priority: 'high' },
|
|
597
|
+
{ title: 'Generate adventure card art', status: 'pending', priority: 'medium' },
|
|
598
|
+
{ title: 'Fix board to rectangular layout', status: 'pending', priority: 'high' },
|
|
599
|
+
{ title: 'Fix character select sticky buttons', status: 'pending', priority: 'medium' },
|
|
600
|
+
{ title: 'Fix dice animation and combat', status: 'pending', priority: 'high' },
|
|
601
|
+
{ title: 'Remove all emojis from UI', status: 'pending', priority: 'medium' },
|
|
602
|
+
{ title: 'Test and verify all changes', status: 'pending', priority: 'medium' },
|
|
603
|
+
],
|
|
604
|
+
}, context);
|
|
605
|
+
// The agent now (mistakenly) tries to sync a Settings plan.
|
|
606
|
+
const result = await tool.execute({
|
|
607
|
+
action: 'sync',
|
|
608
|
+
tasks: [
|
|
609
|
+
{ title: 'Add Settings UI to index.html', status: 'pending', priority: 'high' },
|
|
610
|
+
{ title: 'Add Settings logic to game.js', status: 'pending', priority: 'high' },
|
|
611
|
+
{ title: 'Wire Settings to title screen', status: 'pending', priority: 'medium' },
|
|
612
|
+
{ title: 'Test Settings persistence', status: 'pending', priority: 'medium' },
|
|
613
|
+
],
|
|
614
|
+
}, context);
|
|
615
|
+
expect(result.success).toBe(false);
|
|
616
|
+
// All 9 original tasks must be named so the agent sees them (spot-check four titles, then the count).
|
|
617
|
+
expect(result.error).toContain('Explore current code');
|
|
618
|
+
expect(result.error).toContain('Generate character select background');
|
|
619
|
+
expect(result.error).toContain('Fix dice animation and combat');
|
|
620
|
+
expect(result.error).toContain('Remove all emojis from UI');
|
|
621
|
+
expect(result.error).toMatch(/Sync would drop 9 non-terminal task/);
|
|
622
|
+
});
|
|
623
|
+
});
|
|
624
|
+
|
|
484
625
|
test('enforces only one in_progress task', async () => {
|
|
485
626
|
const { tool, agent, context } = createTestSetup();
|
|
486
627
|
await tool.execute({
|
package/src/tools/baseTool.js
CHANGED
|
@@ -852,14 +852,20 @@ class ToolsRegistry {
|
|
|
852
852
|
}
|
|
853
853
|
|
|
854
854
|
if (hasMemory) {
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
855
|
+
// Concrete event-based write triggers. The previous version asked
|
|
856
|
+
// the agent to "save a memory when you recognize the work is
|
|
857
|
+
// multi-turn" — that's a judgment call, and in practice agents
|
|
858
|
+
// never made the call. Observed in production: 670 messages,
|
|
859
|
+
// 0 memory writes, despite the OPERATING POSTURE block being
|
|
860
|
+
// present in the system prompt. The fix is to replace "when
|
|
861
|
+
// you recognize" with concrete event triggers the model can
|
|
862
|
+
// pattern-match on without judgment.
|
|
863
|
+
description += '**WRITE memory on these events (not "when you think it\'s a good idea" — these are mandatory):**\n';
|
|
864
|
+
description += '- **A new user message contains a numbered list, a multi-bullet ask, OR more than ~30 words of substantive request.** Your VERY NEXT tool call must be `memory` → `add` with title `plan/<short-topic>` and content = the user\'s entire message verbatim. Do this BEFORE any other tool call, including taskmanager. Why mandatory: compaction may later destroy that message and the agent (you, next session) will not remember what the user actually asked for.\n';
|
|
865
|
+
description += '- **You are about to call `taskmanager` → `sync` with more than 3 tasks at once.** First, save a `memory` titled `plan/<feature>` containing the user\'s original request + your rationale for the plan. The task list is fragile (it can be wiped by a later sync); the plan/* memory is the source of truth.\n';
|
|
866
|
+
description += '- **You made a non-obvious decision** (chose approach A over B, fixed a tricky bug, discovered a constraint while exploring code, hit an unexpected error and figured out the cause). Save it as a `memory` with a non-`plan/` title before the next tool call. Format: title = the conclusion, content = the evidence. The next agent will not re-derive it.\n';
|
|
867
|
+
description += '- **The user gave you a preference or rule that should apply to all future work** ("never use emojis", "always use Tailwind", "the API base URL is X"). Save it immediately as a `memory`.\n\n';
|
|
868
|
+
description += '**`plan/*` memories auto-inject into your system prompt every turn under `## AGENT WORKING PLAN` — that\'s how they survive compaction. Update or delete them as the situation evolves; a stale plan is worse than no plan.**\n\n';
|
|
863
869
|
}
|
|
864
870
|
|
|
865
871
|
description += '**Distinction:**\n';
|
|
@@ -219,7 +219,7 @@ ACTIONS:
|
|
|
219
219
|
|
|
220
220
|
EXAMPLES:
|
|
221
221
|
|
|
222
|
-
Sync task list (RECOMMENDED):
|
|
222
|
+
Sync task list (RECOMMENDED — but BEWARE: replaces the whole list):
|
|
223
223
|
\`\`\`json
|
|
224
224
|
{
|
|
225
225
|
"toolId": "taskmanager",
|
|
@@ -234,6 +234,27 @@ Sync task list (RECOMMENDED):
|
|
|
234
234
|
}
|
|
235
235
|
\`\`\`
|
|
236
236
|
|
|
237
|
+
⚠️ **DESTRUCTIVE-SYNC GUARDRAIL** — sync replaces the entire task list. Any existing pending/in_progress task whose title doesn't match an incoming task is dropped. If you don't intend to drop those tasks, sync will REFUSE the call and tell you which tasks were at risk.
|
|
238
|
+
|
|
239
|
+
To proceed with a destructive sync intentionally, add \`"confirmReplace": true\`:
|
|
240
|
+
\`\`\`json
|
|
241
|
+
{
|
|
242
|
+
"toolId": "taskmanager",
|
|
243
|
+
"actions": [{
|
|
244
|
+
"type": "sync",
|
|
245
|
+
"confirmReplace": true,
|
|
246
|
+
"tasks": [ /* the full new plan */ ]
|
|
247
|
+
}]
|
|
248
|
+
}
|
|
249
|
+
\`\`\`
|
|
250
|
+
|
|
251
|
+
**Before issuing sync, prefer one of these instead** (they don't risk dropping tasks):
|
|
252
|
+
- \`{"type": "list"}\` first — see what tasks already exist before deciding to replace them.
|
|
253
|
+
- \`{"type": "create", "title": "..."}\` — add a single task without touching the rest.
|
|
254
|
+
- \`{"type": "update", "taskId": "...", "status": "in_progress"}\` — change one task.
|
|
255
|
+
|
|
256
|
+
This matters most right after compaction: the conversation history that mentioned your previous sync may have been compressed, but your task list is still there. Always \`list\` before \`sync\` if you suspect context loss.
|
|
257
|
+
|
|
237
258
|
Create a task:
|
|
238
259
|
\`\`\`json
|
|
239
260
|
{
|
|
@@ -650,7 +671,7 @@ Always use a detailed task description to provide context for the task, and leve
|
|
|
650
671
|
* @private
|
|
651
672
|
*/
|
|
652
673
|
async syncTasks(agent, params, context) {
|
|
653
|
-
let { tasks } = params;
|
|
674
|
+
let { tasks, confirmReplace } = params;
|
|
654
675
|
|
|
655
676
|
// Parse tasks if provided as JSON string
|
|
656
677
|
if (typeof tasks === 'string') {
|
|
@@ -669,6 +690,55 @@ Always use a detailed task description to provide context for the task, and leve
|
|
|
669
690
|
throw new Error('Tasks array cannot be empty');
|
|
670
691
|
}
|
|
671
692
|
|
|
693
|
+
// ── DESTRUCTIVE-SYNC GUARDRAIL ───────────────────────────────────
|
|
694
|
+
// Real failure observed in production: an agent (post-compaction)
|
|
695
|
+
// lost track of its existing 9-task plan and called sync with a
|
|
696
|
+
// 4-task list of unrelated work. sync silently dropped all 9
|
|
697
|
+
// → agent built the wrong thing → user had to repeat themselves
|
|
698
|
+
// multiple times before the agent finally re-read the user request.
|
|
699
|
+
//
|
|
700
|
+
// Rule: if this sync would drop ANY pending or in_progress task
|
|
701
|
+
// whose title doesn't match an incoming task, refuse the call
|
|
702
|
+
// unless the agent explicitly passes `confirmReplace: true`.
|
|
703
|
+
// Completed/cancelled tasks can be silently pruned — they're done.
|
|
704
|
+
//
|
|
705
|
+
// Why this isn't too strict: matching is lenient, not approximate (case-insensitive
|
|
706
|
+
// title compare; see findExistingTask below). An honest plan
|
|
707
|
+
// refinement that renames a few tasks will hit this guard, which
|
|
708
|
+
// is correct — the agent should acknowledge it's replacing work.
|
|
709
|
+
const existingTasksForGuard = agent.taskList?.tasks || [];
|
|
710
|
+
const incomingTitles = new Set(
|
|
711
|
+
tasks
|
|
712
|
+
.map(t => (t.title || '').toLowerCase().trim())
|
|
713
|
+
.filter(Boolean)
|
|
714
|
+
);
|
|
715
|
+
const dropped = existingTasksForGuard.filter(t => {
|
|
716
|
+
// Already terminal — safe to drop.
|
|
717
|
+
if (t.status === 'completed' || t.status === 'cancelled') return false;
|
|
718
|
+
// Match against any incoming title — same fuzzy rule used below.
|
|
719
|
+
return !incomingTitles.has((t.title || '').toLowerCase().trim());
|
|
720
|
+
});
|
|
721
|
+
|
|
722
|
+
if (dropped.length > 0 && confirmReplace !== true) {
|
|
723
|
+
const summary = dropped
|
|
724
|
+
.map(t => ` - [${t.status}] ${t.title}`)
|
|
725
|
+
.join('\n');
|
|
726
|
+
const hint = [
|
|
727
|
+
`Sync would drop ${dropped.length} non-terminal task(s) that don't match any incoming title:`,
|
|
728
|
+
summary,
|
|
729
|
+
'',
|
|
730
|
+
'If this is intentional (you really mean to replace the plan), retry with `confirmReplace: true`.',
|
|
731
|
+
'If you instead want to ADD tasks without dropping existing ones, use action "create" per task.',
|
|
732
|
+
'If you want to keep an existing task, include its title verbatim in the incoming list.',
|
|
733
|
+
'',
|
|
734
|
+
'This guardrail prevents post-compaction context loss from silently destroying in-flight work.',
|
|
735
|
+
].join('\n');
|
|
736
|
+
const err = new Error(hint);
|
|
737
|
+
err.code = 'SYNC_WOULD_DROP_OPEN_TASKS';
|
|
738
|
+
err.droppedTasks = dropped.map(t => ({ id: t.id, title: t.title, status: t.status, priority: t.priority }));
|
|
739
|
+
throw err;
|
|
740
|
+
}
|
|
741
|
+
|
|
672
742
|
const timestamp = new Date().toISOString();
|
|
673
743
|
const existingTasks = agent.taskList.tasks || [];
|
|
674
744
|
const updatedTasks = [];
|