onbuzz 4.8.1 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/__tests__/agentPool.test.js +185 -0
- package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
- package/src/core/agentPool.js +307 -0
- package/src/core/agentScheduler.js +42 -0
- package/src/services/__tests__/conversationCompactionService.test.js +141 -0
- package/src/services/conversationCompactionService.js +120 -46
- package/src/tools/__tests__/baseTool.test.js +29 -0
- package/src/tools/__tests__/codeMapTool.test.js +179 -0
- package/src/tools/__tests__/taskManagerTool.test.js +141 -0
- package/src/tools/baseTool.js +14 -8
- package/src/tools/taskManagerTool.js +72 -2
package/package.json
CHANGED
|
@@ -420,6 +420,191 @@ describe('AgentPool', () => {
|
|
|
420
420
|
expect(agent.delayEndTime).toBe(pastIso);
|
|
421
421
|
});
|
|
422
422
|
|
|
423
|
+
// ── Auto-save user messages as plan/* memories ────────────────────
|
|
424
|
+
// Talisman case study: agents never voluntarily wrote memories
|
|
425
|
+
// and the user's literal asks vanished in compaction. The pool
|
|
426
|
+
// now auto-saves substantive user messages to the plan/* store
|
|
427
|
+
// as a system-level safety net (the agent itself can still write
|
|
428
|
+
// better-titled plans on top; these auto-saves are cleanup-safe).
|
|
429
|
+
|
|
430
|
+
describe('REGRESSION: auto-save substantive user messages as plan/*', () => {
  // Both tests replace pool._autoSaveUserMessageAsPlan with a spy. The spy is
  // restored in a `finally` so a failing assertion cannot leave the mocked
  // method installed on the pool.

  test('calls _autoSaveUserMessageAsPlan for each user message (best-effort)', async () => {
    const agent = await pool.createAgent(agentCfg());
    const spy = jest.spyOn(pool, '_autoSaveUserMessageAsPlan').mockResolvedValue(undefined);
    try {
      await pool.addUserMessage(agent.id, { content: 'please do a thing that is long enough to count as substantive content here, more than 120 chars total for the heuristic', role: 'user' });
      expect(spy).toHaveBeenCalledTimes(1);
      // First argument is the agent id, second is the queued message.
      expect(spy.mock.calls[0][0]).toBe(agent.id);
      expect(spy.mock.calls[0][1].content).toMatch(/please do a thing/);
    } finally {
      spy.mockRestore();
    }
  });

  test('a memory-service failure does NOT block the message-enqueue path', async () => {
    // Auto-save is best-effort: when it rejects, the user's message must
    // still land on the queue.
    const agent = await pool.createAgent(agentCfg());
    const spy = jest
      .spyOn(pool, '_autoSaveUserMessageAsPlan')
      .mockRejectedValue(new Error('memory store offline'));
    try {
      await pool.addUserMessage(agent.id, { content: 'hello', role: 'user' });
      // Prove the rejecting auto-save was actually exercised; otherwise the
      // queue assertion below would pass without testing the failure path.
      expect(spy).toHaveBeenCalled();
      expect(agent.messageQueues.userMessages).toHaveLength(1);
    } finally {
      spy.mockRestore();
    }
  });
});
|
|
450
|
+
|
|
451
|
+
describe('_looksSubstantive heuristic — pollution audit', () => {
  // Assertion shorthands: the heuristic either keeps a message as a plan
  // candidate or skips it.
  const expectSaved = (message) => expect(pool._looksSubstantive(message)).toBe(true);
  const expectSkipped = (message) => expect(pool._looksSubstantive(message)).toBe(false);

  // ── Messages that must be kept ─────────────────────────────────────
  test('TRUE POSITIVE: numbered request (Talisman literal case)', () => {
    expectSaved(
      [
        'Amazing! a few more things',
        '1. "choose your hero" page - lets put a nice fantasy image at the background.',
        '2. play screen - why is the board still round? plus, not all nodes have their own art.',
        '3. when we roll for combat - there is no dice animation visible',
      ].join('\n'),
    );
  });

  test('TRUE POSITIVE: multi-paragraph feature request', () => {
    expectSaved(
      'Implement OAuth login. Replace the existing session-cookie flow with Google and Microsoft sign-in. Migrate existing users by linking their email to the new auth records on first sign-in. Update the docs.',
    );
  });

  test('TRUE POSITIVE: numbered bug list (descriptions, not paths)', () => {
    expectSaved(
      'Fix these bugs:\n1. login button does nothing on Safari\n2. cart total wrong when discount applied\n3. 500 error on /api/orders/export',
    );
  });

  test('TRUE POSITIVE: bulleted plan with imperative verbs', () => {
    expectSaved(
      'fixes needed:\n- fix bug in login flow that breaks on slow networks\n- add retry logic to the cache layer\n- write a test for the missing error case',
    );
  });

  // ── Messages that must NOT be kept ─────────────────────────────────
  test('POLLUTION: short ack is rejected', () => {
    for (const ack of ['ok thanks', 'thanks!', 'yes please']) {
      expectSkipped(ack);
    }
  });

  test('POLLUTION: question with no imperative is rejected (user wants an answer, not a plan)', () => {
    expectSkipped(
      'what does this function do? I see it referenced in three places but the docs do not explain its purpose at all.',
    );
  });

  test('POLLUTION: numbered list of QUESTIONS is rejected (not a plan)', () => {
    expectSkipped(
      'quick questions:\n1. why is the cache key the user id?\n2. is the rate limit per-user or per-ip?',
    );
  });

  test('POLLUTION: list of file paths is rejected (just references)', () => {
    expectSkipped(
      'look at these:\n- src/auth/login.js\n- src/auth/session.js\n- src/middleware/auth.js\n- tests/auth.test.js',
    );
  });

  test('POLLUTION: long pleasantry with no imperative is rejected', () => {
    expectSkipped(
      'hey, hope you are well today, that was a good run earlier and I am happy with the progress so far!',
    );
  });

  test('POLLUTION: trivial numbered nonsense is rejected (too short under list rule)', () => {
    expectSkipped('1. yes 2. no 3. maybe');
  });

  test('POLLUTION: reading-comprehension feedback is rejected', () => {
    expectSkipped(
      'I do not understand what you wrote in your last message. Can you explain it differently?',
    );
  });

  test('POLLUTION: agreement / confirmation is rejected', () => {
    expectSkipped('yes, please proceed with that plan. it looks correct to me.');
  });

  test('POLLUTION: simple one-shot instruction is rejected (too short)', () => {
    expectSkipped('run the tests again');
  });

  // ── Defensive corner cases ─────────────────────────────────────────
  test('tool-result wrappers are rejected', () => {
    const wrappers = [
      '[Tool Results — 1 result from 1 tool batch: filesystem] {...lots of content...}',
      '[Previous Task — Final Tool Results] [jobdone] {...}',
    ];
    wrappers.forEach(expectSkipped);
  });

  test('non-string content is rejected defensively', () => {
    [null, undefined, { obj: true }].forEach(expectSkipped);
  });
});
|
|
531
|
+
|
|
532
|
+
describe('helpers: question / reference / imperative detection', () => {
  test('_dominatedByQuestions detects majority-question content', () => {
    const expectations = [
      ['what does X do?', true],
      ['1. what?\n2. why?\n3. how?', true],
      ['fix the bug', false],
      // One question next to two commands is not a majority.
      ['why is this slow?\nfix the cache\nadd metrics', false],
    ];
    for (const [text, dominated] of expectations) {
      expect(pool._dominatedByQuestions(text)).toBe(dominated);
    }
  });

  test('_listItemsAreJustReferences detects path-only lists', () => {
    const pathsOnly = ['look:', '- src/a.js', '- src/b.js', '- src/c.js'].join('\n');
    expect(pool._listItemsAreJustReferences(pathsOnly)).toBe(true);
  });

  test('_listItemsAreJustReferences does NOT flag bug descriptions', () => {
    const descriptions = ['bugs:', '- login breaks on Safari', '- cart total wrong with discount'].join('\n');
    expect(pool._listItemsAreJustReferences(descriptions)).toBe(false);
  });

  test('_hasImperativeSignal matches common command verbs', () => {
    const commands = ['fix the login bug', 'add a retry', 'refactor this module'];
    const nonCommands = ['what does this do', 'looks good'];
    commands.forEach((text) => expect(pool._hasImperativeSignal(text)).toBe(true));
    nonCommands.forEach((text) => expect(pool._hasImperativeSignal(text)).toBe(false));
  });
});
|
|
559
|
+
|
|
560
|
+
describe('dedup + cap (Jaccard similarity, AUTO_PLAN_CAP)', () => {
  // Every case starts from pool._tokenize output, so alias it once.
  const tokensOf = (text) => pool._tokenize(text);

  test('_tokenize lowercases + drops short words + strips punctuation', () => {
    const tokens = tokensOf('Fix THE login button on iOS!');
    // 'the' is filtered as a stopword and 'on' is shorter than three
    // characters; the remaining words survive in lowercase.
    for (const kept of ['fix', 'login', 'button', 'ios']) {
      expect(tokens.has(kept)).toBe(true);
    }
    for (const dropped of ['the', 'on']) {
      expect(tokens.has(dropped)).toBe(false);
    }
  });

  test('_jaccard returns 1 for identical text, 0 for disjoint, 1 for both empty', () => {
    const loginFix = tokensOf('fix the login button');
    const sameLoginFix = tokensOf('fix the login button');
    const cacheBuild = tokensOf('build the cache layer');
    expect(pool._jaccard(loginFix, sameLoginFix)).toBe(1);
    expect(pool._jaccard(loginFix, cacheBuild)).toBe(0);
    expect(pool._jaccard(new Set(), new Set())).toBe(1);
  });

  test('_overlapCoefficient catches "I repeat my old message" near-duplicate (containment ≥0.85)', () => {
    // Talisman scenario: the user pasted an earlier request again behind an
    // "I repeat my old message" preamble. The preamble adds new words, which
    // lowers Jaccard; intersection / smaller-set still reports that the
    // original request is contained in the repeat.
    const firstAsk = tokensOf(
      '1. "choose your hero" page - lets put a nice fantasy image at the background. also, lets create a character card art (general)',
    );
    const secondAsk = tokensOf(
      'Amazing! I repeat my old message - a few more things\n\n"choose your hero" page - lets put a nice fantasy image at the background. also, lets create a character card art (general)',
    );
    expect(pool._overlapCoefficient(firstAsk, secondAsk)).toBeGreaterThanOrEqual(0.85);
  });

  test('_overlapCoefficient is 0 for disjoint sets, 1 for one fully contained in the other', () => {
    const subset = tokensOf('fix login button');
    const superset = tokensOf('fix login button additional extra words');
    const unrelated = tokensOf('build cache layer');
    // Every token of `subset` also appears in `superset`.
    expect(pool._overlapCoefficient(subset, superset)).toBe(1);
    // No shared tokens at all.
    expect(pool._overlapCoefficient(subset, unrelated)).toBe(0);
  });
});
|
|
607
|
+
|
|
423
608
|
test('no-ops when delayEndTime is null', async () => {
|
|
424
609
|
const agent = await pool.createAgent(agentCfg());
|
|
425
610
|
agent.delayEndTime = null;
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Contract tests for the per-turn task-list injection in agentScheduler.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists: the Talisman production failure was caused by
|
|
5
|
+
* post-compaction context loss — the agent forgot it already had a
|
|
6
|
+
* 9-task plan and called `taskmanager sync` with 4 unrelated tasks,
|
|
7
|
+
* silently destroying the in-flight work. The destructive-sync
|
|
8
|
+
* guardrail in taskManagerTool catches that AT THE SYNC POINT, but
|
|
9
|
+
* the cleaner fix is to never let the agent forget the task list in
|
|
10
|
+
* the first place — inject it into the system prompt every turn.
|
|
11
|
+
*
|
|
12
|
+
* This test pins the contract by source-grep (matches the existing
|
|
13
|
+
* pattern in agentScheduler.taskLifecycleInstruction.test.js): the
|
|
14
|
+
* scheduler's source must contain the injection block AND its key
|
|
15
|
+
* properties. Reading the file is cheaper than instantiating the
|
|
16
|
+
* full scheduler graph (websocket, compaction service, AI service,
|
|
17
|
+
* etc.) just to assert on prompt content.
|
|
18
|
+
*/
|
|
19
|
+
import { describe, test, expect } from '@jest/globals';
|
|
20
|
+
import { readFileSync } from 'fs';
|
|
21
|
+
import { fileURLToPath } from 'url';
|
|
22
|
+
import { dirname, join } from 'path';
|
|
23
|
+
|
|
24
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
25
|
+
const SRC = readFileSync(join(__dirname, '../agentScheduler.js'), 'utf-8');
|
|
26
|
+
|
|
27
|
+
describe('agentScheduler — per-turn task-list injection', () => {
  test('injection block exists in agentScheduler.js source', () => {
    expect(SRC).toContain('CURRENT TASK LIST');
  });

  test('emits the heading the agent will key off of', () => {
    // The heading is what the agent sees in its system prompt, so it has to
    // be a specific, stable markdown heading.
    expect(SRC).toMatch(/##\s*CURRENT TASK LIST/);
  });

  test('reads from durable agent state (agent.taskList.tasks), not from conversation history', () => {
    // The injection exists to recover from conversation-history loss
    // (compaction), so it must read the live taskList rather than messages.
    expect(SRC).toMatch(/agent\.taskList\??\.tasks/);
  });

  test('the block runs AFTER the plan/* injection (cache-friendly ordering)', () => {
    // Both injections change per turn, but a fixed order keeps the prompt
    // prefix up to plan/* stable when only the task list moves.
    const planIdx = SRC.indexOf('AGENT WORKING PLAN');
    const taskIdx = SRC.indexOf('CURRENT TASK LIST');
    expect(planIdx).toBeGreaterThan(-1);
    expect(taskIdx).toBeGreaterThan(-1);
    expect(taskIdx).toBeGreaterThan(planIdx);
  });

  test('mentions compaction-resistance explicitly', () => {
    // The agent should be told the injection is deliberate, not residue.
    // NOTE: this is a whole-file check. The previous alternation
    // (survives compaction|compaction summarized|compaction) matched exactly
    // the same inputs as /compaction/, so it is written in its honest form.
    expect(SRC).toMatch(/compaction/);
  });

  test('warns the agent about destructive sync inside the injected block', () => {
    // The injection is where the agent gets pointed at the guardrail:
    // do not blindly sync past the existing tasks.
    expect(SRC).toMatch(/(sync|drop).*existing|before issuing.*sync/i);
  });

  test('groups tasks by status (in_progress / pending / completed / cancelled)', () => {
    // Status grouping puts in_progress first — what the agent must act on.
    for (const status of ['in_progress', 'pending', 'completed', 'cancelled']) {
      expect(SRC).toContain(status);
    }
  });

  test('degrades gracefully — never throws on missing taskList', () => {
    // A missing taskList / tasks must be skipped via optional chaining.
    expect(SRC).toMatch(/agent\.taskList\?\.tasks/);

    // The injection must also sit inside try/catch with a warning log.
    // Verify both markers exist and are ordered before slicing: indexOf
    // returns -1 for a missing marker, and slice(start, -1) would silently
    // widen the "block" to almost the rest of the file, letting the checks
    // below pass on unrelated code.
    const blockStart = SRC.indexOf('Auto-inject CURRENT TASK LIST');
    const blockEnd = SRC.indexOf('Check if streaming is enabled');
    expect(blockStart).toBeGreaterThan(-1);
    expect(blockEnd).toBeGreaterThan(blockStart);

    const block = SRC.slice(blockStart, blockEnd);
    expect(block).toMatch(/try\s*{/);
    expect(block).toMatch(/catch\s*\(/);
    expect(block).toMatch(/continuing without/);
  });
});
|
package/src/core/agentPool.js
CHANGED
|
@@ -23,6 +23,17 @@ import DirectoryAccessManager from '../utilities/directoryAccessManager.js';
|
|
|
23
23
|
import { getVisualEditorBridge } from '../services/visualEditorBridge.js';
|
|
24
24
|
|
|
25
25
|
class AgentPool {
|
|
26
|
+
// Stopwords for the _tokenize / _jaccard similarity check used by
|
|
27
|
+
// auto-save-as-plan dedup. Tight list — only words that appear in
|
|
28
|
+
// virtually every English sentence regardless of content, so that
|
|
29
|
+
// their presence in both messages doesn't inflate similarity.
|
|
30
|
+
static _STOPWORDS = new Set([
|
|
31
|
+
'the', 'and', 'for', 'but', 'are', 'was', 'were',
|
|
32
|
+
'has', 'have', 'had', 'this', 'that', 'with', 'will',
|
|
33
|
+
'you', 'your', 'our', 'their', 'them', 'they',
|
|
34
|
+
'can', 'could', 'should', 'would',
|
|
35
|
+
]);
|
|
36
|
+
|
|
26
37
|
constructor(config, logger, stateManager, contextManager, toolsRegistry = null) {
|
|
27
38
|
this.config = config;
|
|
28
39
|
this.logger = logger;
|
|
@@ -1437,6 +1448,23 @@ class AgentPool {
|
|
|
1437
1448
|
this._autoCreateTaskForMessage(agent, queuedMessage, 'user', 'high');
|
|
1438
1449
|
}
|
|
1439
1450
|
|
|
1451
|
+
// ── Auto-save substantive user messages as plan/* memories ───────
|
|
1452
|
+
// Observed in production: across 670-message agent sessions the
|
|
1453
|
+
// agent NEVER wrote a memory voluntarily. Compaction then summarized
|
|
1454
|
+
// away the user's literal asks, the agent paraphrased what was left,
|
|
1455
|
+
// and ended up doing work the user never requested. Belt-and-
|
|
1456
|
+
// suspenders alongside the OPERATING POSTURE prompt nudge: when a
|
|
1457
|
+
// user message looks substantive (long, or contains a numbered/
|
|
1458
|
+
// bulleted multi-part ask), the SYSTEM saves it as `plan/<auto>` so
|
|
1459
|
+
// the system-prompt auto-injection makes the user's words visible
|
|
1460
|
+
// every turn — even if the agent itself never thought to save.
|
|
1461
|
+
// Best-effort: never block the message-enqueue path.
|
|
1462
|
+
this._autoSaveUserMessageAsPlan(agentId, queuedMessage).catch(err => {
|
|
1463
|
+
this.logger.debug?.('Auto-save of user message as plan/* failed (continuing)', {
|
|
1464
|
+
agentId, error: err?.message,
|
|
1465
|
+
});
|
|
1466
|
+
});
|
|
1467
|
+
|
|
1440
1468
|
await this.persistAgentState(agentId);
|
|
1441
1469
|
|
|
1442
1470
|
// If we cleared a delay, surface it on the WS so the delay chip in the
|
|
@@ -1556,6 +1584,285 @@ class AgentPool {
|
|
|
1556
1584
|
* @param {string} priority - Task priority ('high', 'medium', 'low')
|
|
1557
1585
|
* @private
|
|
1558
1586
|
*/
|
|
1587
|
+
/**
|
|
1588
|
+
* Save a substantive user message as a `plan/*` memory automatically.
|
|
1589
|
+
*
|
|
1590
|
+
* Rationale (Talisman case study, May 2026): agents observed in
|
|
1591
|
+
* production never wrote a single memory across hundreds of
|
|
1592
|
+
* messages, even when the OPERATING POSTURE prompt explicitly told
|
|
1593
|
+
* them to. The user's literal ask then got lost in compaction and
|
|
1594
|
+
* the agent went off-course. This system-level safety net puts the
|
|
1595
|
+
* user's message into the durable plan/* store — which the system
|
|
1596
|
+
* prompt auto-injects every turn — without depending on the model
|
|
1597
|
+
* making the call.
|
|
1598
|
+
*
|
|
1599
|
+
* What counts as "substantive":
|
|
1600
|
+
* - Content length ≥ 60 chars (~12 words) — short acks/yes-no don't qualify
|
|
1601
|
+
* - AND any of:
|
|
1602
|
+
* • contains a numbered list ("1.", "2.", "3." …)
|
|
1603
|
+
* • contains a bullet list (-, *, • at line start)
|
|
1604
|
+
* • OR is ≥ 120 chars (longer than a one-line ack)
|
|
1605
|
+
*
|
|
1606
|
+
* What gets saved:
|
|
1607
|
+
* - title: `plan/user-<short-slug>-<timestamp>`
|
|
1608
|
+
* - description: "auto-saved from user message at <iso>"
|
|
1609
|
+
* - content: the verbatim user message
|
|
1610
|
+
*
|
|
1611
|
+
* The agent can rename, consolidate, or delete these later. They
|
|
1612
|
+
* exist as a fail-safe — if the agent does its job and saves its
|
|
1613
|
+
* own better-named plan, these auto-saves can be cleaned up. If
|
|
1614
|
+
* the agent doesn't, at least the user's words survive compaction.
|
|
1615
|
+
*
|
|
1616
|
+
* @param {string} agentId
|
|
1617
|
+
* @param {Object} message - The queued user message
|
|
1618
|
+
* @private
|
|
1619
|
+
*/
|
|
1620
|
+
async _autoSaveUserMessageAsPlan(agentId, message) {
|
|
1621
|
+
const content = typeof message?.content === 'string' ? message.content : '';
|
|
1622
|
+
if (!content) return;
|
|
1623
|
+
if (!this._looksSubstantive(content)) return;
|
|
1624
|
+
|
|
1625
|
+
// Lazy-load to keep agentPool's load order light. The same import
|
|
1626
|
+
// pattern as agentScheduler's plan injection.
|
|
1627
|
+
let memoryService;
|
|
1628
|
+
try {
|
|
1629
|
+
const mod = await import('../services/memoryService.js');
|
|
1630
|
+
memoryService = mod.getMemoryService(this.logger);
|
|
1631
|
+
await memoryService.initialize();
|
|
1632
|
+
} catch (e) {
|
|
1633
|
+
this.logger.debug?.('Auto-save plan: memory service unavailable', { error: e.message });
|
|
1634
|
+
return;
|
|
1635
|
+
}
|
|
1636
|
+
|
|
1637
|
+
// ── Deduplication ────────────────────────────────────────────────
|
|
1638
|
+
// Users repeat themselves ("I repeat my old message", "did you do
|
|
1639
|
+
// it all?" + paste the same thing). Without dedup the auto-saver
|
|
1640
|
+
// would create N copies of essentially the same plan. Load
|
|
1641
|
+
// existing plan/user-* memories and skip when the new content is
|
|
1642
|
+
// ≥70% similar to any of them (Jaccard over normalized word sets).
|
|
1643
|
+
let existingPlans = [];
|
|
1644
|
+
try {
|
|
1645
|
+
const all = await memoryService.loadMemories(agentId);
|
|
1646
|
+
existingPlans = (all || []).filter(m =>
|
|
1647
|
+
typeof m?.title === 'string' && m.title.startsWith('plan/user-')
|
|
1648
|
+
);
|
|
1649
|
+
} catch (e) {
|
|
1650
|
+
// Treat unreadable store as empty — we may still write a fresh entry.
|
|
1651
|
+
this.logger.debug?.('Auto-save plan: existing memories unreadable', { agentId, error: e.message });
|
|
1652
|
+
}
|
|
1653
|
+
|
|
1654
|
+
const newTokens = this._tokenize(content);
|
|
1655
|
+
for (const existing of existingPlans) {
|
|
1656
|
+
const existingTokens = this._tokenize(existing.content || '');
|
|
1657
|
+
const sim = this._jaccard(newTokens, existingTokens);
|
|
1658
|
+
const containment = this._overlapCoefficient(newTokens, existingTokens);
|
|
1659
|
+
// Jaccard catches near-identical reformulations. Containment
|
|
1660
|
+
// catches the "I repeat my old message — <same content>" case
|
|
1661
|
+
// where the user re-pastes the original plus a preamble. Either
|
|
1662
|
+
// signal is enough to suppress the duplicate.
|
|
1663
|
+
if (sim >= 0.7 || containment >= 0.85) {
|
|
1664
|
+
this.logger.info?.('Auto-save plan: skipping near-duplicate of existing plan', {
|
|
1665
|
+
agentId, existingTitle: existing.title,
|
|
1666
|
+
jaccard: sim.toFixed(2), containment: containment.toFixed(2),
|
|
1667
|
+
});
|
|
1668
|
+
return;
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1671
|
+
|
|
1672
|
+
// ── Per-agent cap ────────────────────────────────────────────────
|
|
1673
|
+
// Bound the total auto-saved plans so an active session doesn't
|
|
1674
|
+
// bloat the agent's plan/* namespace indefinitely. Keep the K most
|
|
1675
|
+
// recent; delete the oldest auto-saves beyond that.
|
|
1676
|
+
const AUTO_PLAN_CAP = 8;
|
|
1677
|
+
const existingAutoSaves = existingPlans
|
|
1678
|
+
.filter(m => /^plan\/user-/.test(m.title))
|
|
1679
|
+
.sort((a, b) => String(a.createdAt || '').localeCompare(String(b.createdAt || '')));
|
|
1680
|
+
while (existingAutoSaves.length >= AUTO_PLAN_CAP) {
|
|
1681
|
+
const oldest = existingAutoSaves.shift();
|
|
1682
|
+
try {
|
|
1683
|
+
await memoryService.deleteMemory(agentId, oldest.id);
|
|
1684
|
+
this.logger.info?.('Auto-save plan: retired oldest auto-save to keep cap', {
|
|
1685
|
+
agentId, retiredTitle: oldest.title, cap: AUTO_PLAN_CAP,
|
|
1686
|
+
});
|
|
1687
|
+
} catch (e) {
|
|
1688
|
+
// Non-fatal — if we can't delete the oldest, just skip this entry
|
|
1689
|
+
// and proceed with the write. Worst case the plan list grows
|
|
1690
|
+
// by one beyond the cap — still bounded over time.
|
|
1691
|
+
this.logger.debug?.('Auto-save plan: retire-oldest failed', { agentId, error: e.message });
|
|
1692
|
+
break;
|
|
1693
|
+
}
|
|
1694
|
+
}
|
|
1695
|
+
|
|
1696
|
+
// ── Write the new memory ─────────────────────────────────────────
|
|
1697
|
+
const firstLine = (content.match(/[^\n]+/) || [''])[0].trim();
|
|
1698
|
+
const slug = firstLine
|
|
1699
|
+
.toLowerCase()
|
|
1700
|
+
.replace(/[^a-z0-9]+/g, '-')
|
|
1701
|
+
.replace(/^-+|-+$/g, '')
|
|
1702
|
+
.slice(0, 40) || 'request';
|
|
1703
|
+
const ts = new Date().toISOString().slice(0, 19).replace(/[:T]/g, '-');
|
|
1704
|
+
const title = `plan/user-${slug}-${ts}`;
|
|
1705
|
+
|
|
1706
|
+
try {
|
|
1707
|
+
await memoryService.addMemory(agentId, {
|
|
1708
|
+
title,
|
|
1709
|
+
description: `Auto-saved from user message at ${message.timestamp || new Date().toISOString()}`,
|
|
1710
|
+
content,
|
|
1711
|
+
});
|
|
1712
|
+
this.logger.info?.('Auto-saved user message as plan/* memory', {
|
|
1713
|
+
agentId, title, contentLength: content.length,
|
|
1714
|
+
});
|
|
1715
|
+
} catch (e) {
|
|
1716
|
+
this.logger.debug?.('Auto-save plan: write failed', { agentId, title, error: e.message });
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
|
|
1720
|
+
/**
|
|
1721
|
+
* Tokenize a string into a lowercased word set for similarity checks.
|
|
1722
|
+
* Strips punctuation, drops short words (<3 chars), and drops a
|
|
1723
|
+
* small stopword set so that common words like "the" / "and" don't
|
|
1724
|
+
* inflate similarity scores between otherwise different messages.
|
|
1725
|
+
* @private
|
|
1726
|
+
*/
|
|
1727
|
+
_tokenize(s) {
|
|
1728
|
+
if (typeof s !== 'string') return new Set();
|
|
1729
|
+
return new Set(
|
|
1730
|
+
s.toLowerCase()
|
|
1731
|
+
.replace(/[^a-z0-9\s]+/g, ' ')
|
|
1732
|
+
.split(/\s+/)
|
|
1733
|
+
.filter(w => w.length >= 3 && !AgentPool._STOPWORDS.has(w))
|
|
1734
|
+
);
|
|
1735
|
+
}
|
|
1736
|
+
|
|
1737
|
+
/**
|
|
1738
|
+
* Jaccard similarity over two word sets.
|
|
1739
|
+
* @private
|
|
1740
|
+
*/
|
|
1741
|
+
_jaccard(a, b) {
|
|
1742
|
+
if (a.size === 0 && b.size === 0) return 1;
|
|
1743
|
+
if (a.size === 0 || b.size === 0) return 0;
|
|
1744
|
+
let intersection = 0;
|
|
1745
|
+
for (const w of a) if (b.has(w)) intersection += 1;
|
|
1746
|
+
return intersection / (a.size + b.size - intersection);
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
/**
|
|
1750
|
+
* Overlap coefficient — intersection / size-of-smaller-set.
|
|
1751
|
+
* Returns 1.0 when one set is fully contained in the other,
|
|
1752
|
+
* regardless of how much the other set adds. Catches the "user
|
|
1753
|
+
* re-pastes their request with a preamble" duplicate case where
|
|
1754
|
+
* Jaccard would mark the messages as merely similar.
|
|
1755
|
+
* @private
|
|
1756
|
+
*/
|
|
1757
|
+
_overlapCoefficient(a, b) {
|
|
1758
|
+
if (a.size === 0 || b.size === 0) return 0;
|
|
1759
|
+
let intersection = 0;
|
|
1760
|
+
for (const w of a) if (b.has(w)) intersection += 1;
|
|
1761
|
+
return intersection / Math.min(a.size, b.size);
|
|
1762
|
+
}
|
|
1763
|
+
|
|
1764
|
+
/**
|
|
1765
|
+
* Heuristic — does this user message look like a real request worth
|
|
1766
|
+
* preserving as a plan/*? Errs on the side of saving more (recall
|
|
1767
|
+
* over precision) — a stray auto-save is cheap; a lost user request
|
|
1768
|
+
* is catastrophic.
|
|
1769
|
+
* @private
|
|
1770
|
+
*/
|
|
1771
|
+
_looksSubstantive(text) {
|
|
1772
|
+
if (typeof text !== 'string') return false;
|
|
1773
|
+
const t = text.trim();
|
|
1774
|
+
if (t.length < 30) return false;
|
|
1775
|
+
// Tool-result wrappers and previous-task boundaries are not user voice.
|
|
1776
|
+
if (t.startsWith('[Tool Results') || t.startsWith('[Previous Task')) return false;
|
|
1777
|
+
|
|
1778
|
+
// ── Pollution filter 1: dominated by questions ────────────────────
|
|
1779
|
+
// A message that's mostly questions wants an ANSWER, not a plan.
|
|
1780
|
+
// If the majority of non-empty lines end in '?' (or are
|
|
1781
|
+
// question-shaped), this is a query, not a request.
|
|
1782
|
+
if (this._dominatedByQuestions(t)) return false;
|
|
1783
|
+
|
|
1784
|
+
// ── Pollution filter 2: list items are just refs (paths, urls) ───
|
|
1785
|
+
// A list of file paths / URLs / commit hashes is the user pointing
|
|
1786
|
+
// the agent at things, not a multi-part plan. Save it only if the
|
|
1787
|
+
// surrounding prose carries imperative intent — and even then the
|
|
1788
|
+
// length gate handles that path.
|
|
1789
|
+
const hasList = /^\s*(?:\d+[.)]|[-*•])\s/m.test(t);
|
|
1790
|
+
if (hasList && this._listItemsAreJustReferences(t)) return false;
|
|
1791
|
+
|
|
1792
|
+
// ── Now apply the structural triggers ────────────────────────────
|
|
1793
|
+
// Numbered list — "1." / "1)" at a line start. Multi-part intent.
|
|
1794
|
+
// Require a minimum total length to avoid "1. yes 2. no" nonsense.
|
|
1795
|
+
if (/^\s*\d+[.)]\s/m.test(t) && t.length >= 60) return true;
|
|
1796
|
+
// Bullet list at line start. Same — strong intent signal + length.
|
|
1797
|
+
if (/^\s*[-*•]\s/m.test(t) && t.length >= 60) return true;
|
|
1798
|
+
// Free-form prose with no list markers must be substantial AND
|
|
1799
|
+
// contain an imperative-like signal (a verb you'd give as an
|
|
1800
|
+
// order). Raised from 120 → 150 to skip more pleasantries.
|
|
1801
|
+
if (t.length >= 150 && this._hasImperativeSignal(t)) return true;
|
|
1802
|
+
return false;
|
|
1803
|
+
}
|
|
1804
|
+
|
|
1805
|
+
/**
|
|
1806
|
+
* Heuristic: is this message mostly questions?
|
|
1807
|
+
* @private
|
|
1808
|
+
*/
|
|
1809
|
+
_dominatedByQuestions(t) {
|
|
1810
|
+
// Split into non-empty lines.
|
|
1811
|
+
const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
|
|
1812
|
+
if (lines.length === 0) return false;
|
|
1813
|
+
// Strip leading list markers so we can look at the line's intent.
|
|
1814
|
+
const stripMarker = (l) => l.replace(/^(?:\d+[.)]|[-*•])\s+/, '');
|
|
1815
|
+
let questionLines = 0;
|
|
1816
|
+
for (const raw of lines) {
|
|
1817
|
+
const line = stripMarker(raw);
|
|
1818
|
+
// Ends in '?', OR starts with a question word at the line head.
|
|
1819
|
+
if (/\?\s*$/.test(line) || /^(?:what|why|how|when|where|who|which|is\b|are\b|do\b|does\b|can\b|could\b|should\b|would\b)\b/i.test(line)) {
|
|
1820
|
+
questionLines += 1;
|
|
1821
|
+
}
|
|
1822
|
+
}
|
|
1823
|
+
// Strict-majority rule: more than half of lines are questions.
|
|
1824
|
+
return questionLines * 2 > lines.length;
|
|
1825
|
+
}
|
|
1826
|
+
|
|
1827
|
+
/**
|
|
1828
|
+
* Heuristic: are the list items in this message just references
|
|
1829
|
+
* (file paths, URLs, commit hashes) with no imperative verb of their own?
|
|
1830
|
+
* @private
|
|
1831
|
+
*/
|
|
1832
|
+
_listItemsAreJustReferences(t) {
|
|
1833
|
+
const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
|
|
1834
|
+
const listItems = lines.filter(l => /^(?:\d+[.)]|[-*•])\s/.test(l));
|
|
1835
|
+
if (listItems.length === 0) return false;
|
|
1836
|
+
let refLikeCount = 0;
|
|
1837
|
+
for (const li of listItems) {
|
|
1838
|
+
const body = li.replace(/^(?:\d+[.)]|[-*•])\s+/, '').trim();
|
|
1839
|
+
// Only treat as a "reference" if the line IS the reference —
|
|
1840
|
+
// i.e. a path/URL/hash with no surrounding English. A short bug
|
|
1841
|
+
// description like "login button does nothing on Safari" still
|
|
1842
|
+
// counts as content, not a reference.
|
|
1843
|
+
// Path: contains '/' or '\' OR starts with '.' AND has NO spaces
|
|
1844
|
+
// URL: starts with http(s)://
|
|
1845
|
+
// Hash: 7-40 hex chars only, no spaces
|
|
1846
|
+
const isPath = (/[/\\]/.test(body) || /^\./.test(body)) && !/\s/.test(body);
|
|
1847
|
+
const isUrl = /^https?:\/\//.test(body) && !/\s/.test(body);
|
|
1848
|
+
const isHash = /^[0-9a-f]{7,40}$/i.test(body);
|
|
1849
|
+
if (isPath || isUrl || isHash) refLikeCount += 1;
|
|
1850
|
+
}
|
|
1851
|
+
// Strict-majority of list items are reference-like → ignore.
|
|
1852
|
+
return refLikeCount * 2 > listItems.length;
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
/**
|
|
1856
|
+
* Heuristic: does the message contain a verb that signals "do this"?
|
|
1857
|
+
* Conservative — favors recall over precision.
|
|
1858
|
+
* @private
|
|
1859
|
+
*/
|
|
1860
|
+
_hasImperativeSignal(t) {
|
|
1861
|
+
// Word-boundary match against a set of common imperative verbs.
|
|
1862
|
+
// Order matters only for readability — we check membership.
|
|
1863
|
+
return /\b(?:fix|add|build|implement|create|change|remove|delete|update|refactor|rewrite|migrate|integrate|configure|setup|set\s+up|design|generate|make|write|test|verify|ensure|review|optimize|improve|replace|move|rename|extract|split|merge|deploy|publish|ship|release|debug|investigate|analyze|reproduce|escalate|prioritize|schedule)\b/i.test(t);
|
|
1864
|
+
}
|
|
1865
|
+
|
|
1559
1866
|
_autoCreateTaskForMessage(agent, message, source, priority) {
|
|
1560
1867
|
if (!agent.taskList) {
|
|
1561
1868
|
agent.taskList = { tasks: [], lastUpdated: new Date().toISOString() };
|
|
@@ -2105,6 +2105,48 @@ class AgentScheduler {
|
|
|
2105
2105
|
});
|
|
2106
2106
|
}
|
|
2107
2107
|
|
|
2108
|
+
// ── Auto-inject CURRENT TASK LIST every turn ───────────────────
|
|
2109
|
+
// The task list lives in `agent.taskList.tasks` — durable, never
|
|
2110
|
+
// affected by compaction. But the conversation messages that
|
|
2111
|
+
// CREATED those tasks ARE compacted, so an agent that lost its
|
|
2112
|
+
// recent history may forget the task list exists. That's how
|
|
2113
|
+
// the Talisman bug happened: the agent called sync with a fresh
|
|
2114
|
+
// 4-task plan, silently wiping 9 in-flight tasks the user had
|
|
2115
|
+
// implicitly requested. Surface the current task list to the
|
|
2116
|
+
// agent every turn so it can never "forget" what's already on
|
|
2117
|
+
// the plan. Cheap (a few hundred chars), invariant to
|
|
2118
|
+
// compaction, and a natural deterrent against destructive sync.
|
|
2119
|
+
try {
|
|
2120
|
+
const tasks = agent.taskList?.tasks || [];
|
|
2121
|
+
if (Array.isArray(tasks) && tasks.length > 0) {
|
|
2122
|
+
const lines = ['\n\n## CURRENT TASK LIST (live from agent state — survives compaction)\n'];
|
|
2123
|
+
lines.push('These tasks exist in your durable state RIGHT NOW. If the conversation history doesn\'t mention them, that\'s because compaction summarized that section away — the tasks are still there.\n');
|
|
2124
|
+
lines.push('Before issuing `taskmanager sync`, READ this list. If you sync with a different plan, you will be dropping these.\n');
|
|
2125
|
+
// Compact, scannable. Title + status + priority is enough.
|
|
2126
|
+
const byStatus = { in_progress: [], pending: [], completed: [], cancelled: [] };
|
|
2127
|
+
for (const t of tasks) {
|
|
2128
|
+
const status = t.status || 'pending';
|
|
2129
|
+
(byStatus[status] || (byStatus[status] = [])).push(t);
|
|
2130
|
+
}
|
|
2131
|
+
const order = ['in_progress', 'pending', 'completed', 'cancelled'];
|
|
2132
|
+
for (const status of order) {
|
|
2133
|
+
const group = byStatus[status] || [];
|
|
2134
|
+
if (group.length === 0) continue;
|
|
2135
|
+
lines.push(`\n**${status}** (${group.length}):`);
|
|
2136
|
+
for (const t of group) {
|
|
2137
|
+
const pri = t.priority ? ` [${t.priority}]` : '';
|
|
2138
|
+
lines.push(`- ${t.title}${pri}`);
|
|
2139
|
+
}
|
|
2140
|
+
}
|
|
2141
|
+
enhancedSystemPrompt = (enhancedSystemPrompt || '') + lines.join('\n');
|
|
2142
|
+
}
|
|
2143
|
+
} catch (taskInjectErr) {
|
|
2144
|
+
// Best-effort — never block the turn on this.
|
|
2145
|
+
this.logger.warn(`Task list injection failed for agent ${agentId} (continuing without)`, {
|
|
2146
|
+
error: taskInjectErr?.message,
|
|
2147
|
+
});
|
|
2148
|
+
}
|
|
2149
|
+
|
|
2108
2150
|
// Check if streaming is enabled - consider both agent config and user message preference
|
|
2109
2151
|
// Get the last user message to check for streaming preference
|
|
2110
2152
|
const lastUserMsg = [...conversationHistory].reverse().find(m => m.role === 'user');
|