onbuzz 4.8.1 → 4.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "onbuzz",
3
- "version": "4.8.1",
3
+ "version": "4.8.2",
4
4
  "description": "Loxia OnBuzz - Your AI Fleet",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
@@ -420,6 +420,191 @@ describe('AgentPool', () => {
420
420
  expect(agent.delayEndTime).toBe(pastIso);
421
421
  });
422
422
 
423
+ // ── Auto-save user messages as plan/* memories ────────────────────
424
+ // Talisman case study: agents never voluntarily wrote memories
425
+ // and the user's literal asks vanished in compaction. The pool
426
+ // now auto-saves substantive user messages to the plan/* store
427
+ // as a system-level safety net (the agent itself can still write
428
+ // better-titled plans on top; these auto-saves are cleanup-safe).
429
+
430
describe('REGRESSION: auto-save substantive user messages as plan/*', () => {
  // Both tests replace pool._autoSaveUserMessageAsPlan with a spy. The spy
  // is restored in a `finally` so that a failing assertion cannot leak the
  // mock into later tests that exercise the real method.

  test('calls _autoSaveUserMessageAsPlan for each user message (best-effort)', async () => {
    const agent = await pool.createAgent(agentCfg());
    const spy = jest.spyOn(pool, '_autoSaveUserMessageAsPlan').mockResolvedValue(undefined);
    try {
      await pool.addUserMessage(agent.id, { content: 'please do a thing that is long enough to count as substantive content here, more than 120 chars total for the heuristic', role: 'user' });
      expect(spy).toHaveBeenCalledTimes(1);
      // First argument is the agent id, second is the queued message.
      expect(spy.mock.calls[0][0]).toBe(agent.id);
      expect(spy.mock.calls[0][1].content).toMatch(/please do a thing/);
    } finally {
      spy.mockRestore();
    }
  });

  test('a memory-service failure does NOT block the message-enqueue path', async () => {
    // Auto-save is best-effort. If memoryService blows up, the
    // user's message must still land on the queue.
    const agent = await pool.createAgent(agentCfg());
    const spy = jest
      .spyOn(pool, '_autoSaveUserMessageAsPlan')
      .mockRejectedValue(new Error('memory store offline'));
    try {
      await pool.addUserMessage(agent.id, { content: 'hello', role: 'user' });
      // Prove the rejecting path actually ran; otherwise the queue
      // assertion below would pass vacuously.
      expect(spy).toHaveBeenCalledTimes(1);
      expect(agent.messageQueues.userMessages).toHaveLength(1);
    } finally {
      spy.mockRestore();
    }
  });
});
450
+
451
describe('_looksSubstantive heuristic — pollution audit', () => {
  // Shorthand: run the heuristic under test on a single message.
  const verdict = (message) => pool._looksSubstantive(message);

  // ── Messages that SHOULD be saved ──────────────────────────────
  test('TRUE POSITIVE: numbered request (Talisman literal case)', () => {
    const request = [
      'Amazing! a few more things',
      '1. "choose your hero" page - lets put a nice fantasy image at the background.',
      '2. play screen - why is the board still round? plus, not all nodes have their own art.',
      '3. when we roll for combat - there is no dice animation visible',
    ].join('\n');
    expect(verdict(request)).toBe(true);
  });

  test('TRUE POSITIVE: multi-paragraph feature request', () => {
    const request = 'Implement OAuth login. Replace the existing session-cookie flow with Google and Microsoft sign-in. Migrate existing users by linking their email to the new auth records on first sign-in. Update the docs.';
    expect(verdict(request)).toBe(true);
  });

  test('TRUE POSITIVE: numbered bug list (descriptions, not paths)', () => {
    const request = `Fix these bugs:\n1. login button does nothing on Safari\n2. cart total wrong when discount applied\n3. 500 error on /api/orders/export`;
    expect(verdict(request)).toBe(true);
  });

  test('TRUE POSITIVE: bulleted plan with imperative verbs', () => {
    const request = `fixes needed:\n- fix bug in login flow that breaks on slow networks\n- add retry logic to the cache layer\n- write a test for the missing error case`;
    expect(verdict(request)).toBe(true);
  });

  // ── Messages that should NOT be saved ──────────────────────────
  test('POLLUTION: short ack is rejected', () => {
    for (const ack of ['ok thanks', 'thanks!', 'yes please']) {
      expect(verdict(ack)).toBe(false);
    }
  });

  test('POLLUTION: question with no imperative is rejected (user wants an answer, not a plan)', () => {
    const question = 'what does this function do? I see it referenced in three places but the docs do not explain its purpose at all.';
    expect(verdict(question)).toBe(false);
  });

  test('POLLUTION: numbered list of QUESTIONS is rejected (not a plan)', () => {
    const questions = `quick questions:\n1. why is the cache key the user id?\n2. is the rate limit per-user or per-ip?`;
    expect(verdict(questions)).toBe(false);
  });

  test('POLLUTION: list of file paths is rejected (just references)', () => {
    const pathList = `look at these:\n- src/auth/login.js\n- src/auth/session.js\n- src/middleware/auth.js\n- tests/auth.test.js`;
    expect(verdict(pathList)).toBe(false);
  });

  test('POLLUTION: long pleasantry with no imperative is rejected', () => {
    const pleasantry = 'hey, hope you are well today, that was a good run earlier and I am happy with the progress so far!';
    expect(verdict(pleasantry)).toBe(false);
  });

  test('POLLUTION: trivial numbered nonsense is rejected (too short under list rule)', () => {
    expect(verdict('1. yes 2. no 3. maybe')).toBe(false);
  });

  test('POLLUTION: reading-comprehension feedback is rejected', () => {
    expect(verdict('I do not understand what you wrote in your last message. Can you explain it differently?')).toBe(false);
  });

  test('POLLUTION: agreement / confirmation is rejected', () => {
    expect(verdict('yes, please proceed with that plan. it looks correct to me.')).toBe(false);
  });

  test('POLLUTION: simple one-shot instruction is rejected (too short)', () => {
    expect(verdict('run the tests again')).toBe(false);
  });

  // ── Defensive corner cases ─────────────────────────────────────
  test('tool-result wrappers are rejected', () => {
    const wrappers = [
      '[Tool Results — 1 result from 1 tool batch: filesystem] {...lots of content...}',
      '[Previous Task — Final Tool Results] [jobdone] {...}',
    ];
    for (const wrapper of wrappers) {
      expect(verdict(wrapper)).toBe(false);
    }
  });

  test('non-string content is rejected defensively', () => {
    for (const notAString of [null, undefined, { obj: true }]) {
      expect(verdict(notAString)).toBe(false);
    }
  });
});
531
+
532
describe('helpers: question / reference / imperative detection', () => {
  test('_dominatedByQuestions detects majority-question content', () => {
    // [input, expected] pairs. The last one mixes 1 question with
    // 2 commands, so questions are not a strict majority.
    const cases = [
      ['what does X do?', true],
      ['1. what?\n2. why?\n3. how?', true],
      ['fix the bug', false],
      ['why is this slow?\nfix the cache\nadd metrics', false],
    ];
    for (const [input, expected] of cases) {
      expect(pool._dominatedByQuestions(input)).toBe(expected);
    }
  });

  test('_listItemsAreJustReferences detects path-only lists', () => {
    const pathsOnly = `look:\n- src/a.js\n- src/b.js\n- src/c.js`;
    const result = pool._listItemsAreJustReferences(pathsOnly);
    expect(result).toBe(true);
  });

  test('_listItemsAreJustReferences does NOT flag bug descriptions', () => {
    const descriptions = `bugs:\n- login breaks on Safari\n- cart total wrong with discount`;
    const result = pool._listItemsAreJustReferences(descriptions);
    expect(result).toBe(false);
  });

  test('_hasImperativeSignal matches common command verbs', () => {
    const commands = ['fix the login bug', 'add a retry', 'refactor this module'];
    const nonCommands = ['what does this do', 'looks good'];
    for (const phrase of commands) {
      expect(pool._hasImperativeSignal(phrase)).toBe(true);
    }
    for (const phrase of nonCommands) {
      expect(pool._hasImperativeSignal(phrase)).toBe(false);
    }
  });
});
559
+
560
describe('dedup + cap (Jaccard similarity, AUTO_PLAN_CAP)', () => {
  test('_tokenize lowercases + drops short words + strips punctuation', () => {
    const tokens = pool._tokenize('Fix THE login button on iOS!');
    // 'on' is dropped for being shorter than 3 chars; 'the' is dropped
    // because it is a stopword. 'fix', 'login', 'button', 'ios' remain.
    for (const kept of ['fix', 'login', 'button', 'ios']) {
      expect(tokens.has(kept)).toBe(true);
    }
    for (const dropped of ['the', 'on']) {
      expect(tokens.has(dropped)).toBe(false);
    }
  });

  test('_jaccard returns 1 for identical text, 0 for disjoint, 1 for both empty', () => {
    const base = pool._tokenize('fix the login button');
    const same = pool._tokenize('fix the login button');
    const unrelated = pool._tokenize('build the cache layer');
    expect(pool._jaccard(base, same)).toBe(1);
    expect(pool._jaccard(base, unrelated)).toBe(0);
    expect(pool._jaccard(new Set(), new Set())).toBe(1);
  });

  test('_overlapCoefficient catches "I repeat my old message" near-duplicate (containment ≥0.85)', () => {
    // Talisman scenario: the user pasted an earlier request again behind
    // an "I repeat my old message" preamble. The preamble adds new words,
    // which lowers Jaccard; the overlap coefficient (intersection divided
    // by the smaller set) still reports that the original request is
    // contained in the repeated one.
    const originalText = `1. "choose your hero" page - lets put a nice fantasy image at the background. also, lets create a character card art (general)`;
    const repeatedText = `Amazing! I repeat my old message - a few more things\n\n"choose your hero" page - lets put a nice fantasy image at the background. also, lets create a character card art (general)`;
    const original = pool._tokenize(originalText);
    const repeated = pool._tokenize(repeatedText);
    expect(pool._overlapCoefficient(original, repeated)).toBeGreaterThanOrEqual(0.85);
  });

  test('_overlapCoefficient is 0 for disjoint sets, 1 for one fully contained in the other', () => {
    const small = pool._tokenize('fix login button');
    const superset = pool._tokenize('fix login button additional extra words');
    const disjoint = pool._tokenize('build cache layer');
    // `small` is fully contained in `superset` → 1.0
    expect(pool._overlapCoefficient(small, superset)).toBe(1);
    // no shared tokens → 0
    expect(pool._overlapCoefficient(small, disjoint)).toBe(0);
  });
});
607
+
423
608
  test('no-ops when delayEndTime is null', async () => {
424
609
  const agent = await pool.createAgent(agentCfg());
425
610
  agent.delayEndTime = null;
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Contract tests for the per-turn task-list injection in agentScheduler.
3
+ *
4
+ * Why this exists: the Talisman production failure was caused by
5
+ * post-compaction context loss — the agent forgot it already had a
6
+ * 9-task plan and called `taskmanager sync` with 4 unrelated tasks,
7
+ * silently destroying the in-flight work. The destructive-sync
8
+ * guardrail in taskManagerTool catches that AT THE SYNC POINT, but
9
+ * the cleaner fix is to never let the agent forget the task list in
10
+ * the first place — inject it into the system prompt every turn.
11
+ *
12
+ * This test pins the contract by source-grep (matches the existing
13
+ * pattern in agentScheduler.taskLifecycleInstruction.test.js): the
14
+ * scheduler's source must contain the injection block AND its key
15
+ * properties. Reading the file is cheaper than instantiating the
16
+ * full scheduler graph (websocket, compaction service, AI service,
17
+ * etc.) just to assert on prompt content.
18
+ */
19
+ import { describe, test, expect } from '@jest/globals';
20
+ import { readFileSync } from 'fs';
21
+ import { fileURLToPath } from 'url';
22
+ import { dirname, join } from 'path';
23
+
24
const __dirname = dirname(fileURLToPath(import.meta.url));
const SRC = readFileSync(join(__dirname, '../agentScheduler.js'), 'utf-8');

// Comment markers that bracket the task-list injection in agentScheduler.js.
const BLOCK_START_MARKER = 'Auto-inject CURRENT TASK LIST';
const BLOCK_END_MARKER = 'Check if streaming is enabled';

/**
 * Return the slice of the scheduler source between the two markers.
 *
 * Both markers are asserted to exist, in order, before slicing. An
 * unguarded `indexOf` returning -1 would turn `slice(start, -1)` into
 * "everything up to the last character of the file", letting the
 * block-scoped assertions match unrelated code.
 *
 * @returns {string} Source text of the injection block.
 */
function injectionBlock() {
  const start = SRC.indexOf(BLOCK_START_MARKER);
  expect(start).toBeGreaterThan(-1);
  const end = SRC.indexOf(BLOCK_END_MARKER, start);
  expect(end).toBeGreaterThan(start);
  return SRC.slice(start, end);
}

describe('agentScheduler — per-turn task-list injection', () => {
  test('injection block exists in agentScheduler.js source', () => {
    expect(SRC).toContain('CURRENT TASK LIST');
  });

  test('emits the heading the agent will key off of', () => {
    // The block heading must be specific and stable — the agent's
    // system prompt is the only thing it sees, so the heading needs
    // to be unambiguous.
    expect(injectionBlock()).toMatch(/##\s*CURRENT TASK LIST/);
  });

  test('reads from durable agent state (agent.taskList.tasks), not from conversation history', () => {
    // The whole point of this injection is to recover from a
    // conversation-history loss (compaction). It MUST read the live,
    // durable taskList — not derive from messages. Scoped to the block
    // so another use of agent.taskList elsewhere cannot satisfy it.
    expect(injectionBlock()).toMatch(/agent\.taskList\??\.tasks/);
  });

  test('the block runs AFTER the plan/* injection (cache-friendly ordering)', () => {
    // Stable cacheable system-prompt prefix is critical for the
    // model-side prompt cache. Both injections are dynamic per turn,
    // but ordering must be consistent so the prefix up to plan/*
    // stays stable across turns when only the task list moves.
    const planIdx = SRC.indexOf('AGENT WORKING PLAN');
    const taskIdx = SRC.indexOf('CURRENT TASK LIST');
    expect(planIdx).toBeGreaterThan(-1);
    expect(taskIdx).toBeGreaterThan(-1);
    expect(taskIdx).toBeGreaterThan(planIdx);
  });

  test('mentions compaction-resistance explicitly', () => {
    // The agent reads the injected block. It must know that the
    // injection is a deliberate signal — not just historical residue
    // — so it gives appropriate weight to the listed tasks.
    // A bare /compaction/ alternative would match anywhere in the file
    // and make this assertion vacuous, so only the specific phrases count.
    expect(injectionBlock()).toMatch(/survives compaction|compaction summarized/);
  });

  test('warns the agent about destructive sync inside the injected block', () => {
    // The injection is the natural place to point the agent at the
    // guardrail. Strong message: do not blindly sync past the
    // existing tasks.
    expect(injectionBlock()).toMatch(/(sync|drop).*existing|before issuing.*sync/i);
  });

  test('groups tasks by status (in_progress / pending / completed / cancelled)', () => {
    // A flat list of 30 tasks is hard to scan. Status grouping puts
    // in_progress first — what the agent needs to act on.
    const block = injectionBlock();
    for (const status of ['in_progress', 'pending', 'completed', 'cancelled']) {
      expect(block).toContain(status);
    }
  });

  test('degrades gracefully — never throws on missing taskList', () => {
    // If taskList is undefined or .tasks is missing/empty, the
    // injection should silently skip — never break the turn.
    const block = injectionBlock();
    expect(block).toMatch(/agent\.taskList\?\.tasks/);
    // And the whole block is wrapped in try/catch with a logger warn.
    expect(block).toMatch(/try\s*{/);
    expect(block).toMatch(/catch\s*\(/);
    expect(block).toMatch(/continuing without/);
  });
});
@@ -23,6 +23,17 @@ import DirectoryAccessManager from '../utilities/directoryAccessManager.js';
23
23
  import { getVisualEditorBridge } from '../services/visualEditorBridge.js';
24
24
 
25
25
  class AgentPool {
26
+ // Stopwords for the _tokenize / _jaccard similarity check used by
27
+ // auto-save-as-plan dedup. Tight list — only words that appear in
28
+ // virtually every English sentence regardless of content, so that
29
+ // their presence in both messages doesn't inflate similarity.
30
+ static _STOPWORDS = new Set([
31
+ 'the', 'and', 'for', 'but', 'are', 'was', 'were',
32
+ 'has', 'have', 'had', 'this', 'that', 'with', 'will',
33
+ 'you', 'your', 'our', 'their', 'them', 'they',
34
+ 'can', 'could', 'should', 'would',
35
+ ]);
36
+
26
37
  constructor(config, logger, stateManager, contextManager, toolsRegistry = null) {
27
38
  this.config = config;
28
39
  this.logger = logger;
@@ -1437,6 +1448,23 @@ class AgentPool {
1437
1448
  this._autoCreateTaskForMessage(agent, queuedMessage, 'user', 'high');
1438
1449
  }
1439
1450
 
1451
+ // ── Auto-save substantive user messages as plan/* memories ───────
1452
+ // Observed in production: across 670-message agent sessions the
1453
+ // agent NEVER wrote a memory voluntarily. Compaction then summarized
1454
+ // away the user's literal asks, the agent paraphrased what was left,
1455
+ // and ended up doing work the user never requested. Belt-and-
1456
+ // suspenders alongside the OPERATING POSTURE prompt nudge: when a
1457
+ // user message looks substantive (long, or contains a numbered/
1458
+ // bulleted multi-part ask), the SYSTEM saves it as `plan/<auto>` so
1459
+ // the system-prompt auto-injection makes the user's words visible
1460
+ // every turn — even if the agent itself never thought to save.
1461
+ // Best-effort: never block the message-enqueue path.
1462
+ this._autoSaveUserMessageAsPlan(agentId, queuedMessage).catch(err => {
1463
+ this.logger.debug?.('Auto-save of user message as plan/* failed (continuing)', {
1464
+ agentId, error: err?.message,
1465
+ });
1466
+ });
1467
+
1440
1468
  await this.persistAgentState(agentId);
1441
1469
 
1442
1470
  // If we cleared a delay, surface it on the WS so the delay chip in the
@@ -1556,6 +1584,285 @@ class AgentPool {
1556
1584
  * @param {string} priority - Task priority ('high', 'medium', 'low')
1557
1585
  * @private
1558
1586
  */
1587
+ /**
1588
+ * Save a substantive user message as a `plan/*` memory automatically.
1589
+ *
1590
+ * Rationale (Talisman case study, May 2026): agents observed in
1591
+ * production never wrote a single memory across hundreds of
1592
+ * messages, even when the OPERATING POSTURE prompt explicitly told
1593
+ * them to. The user's literal ask then got lost in compaction and
1594
+ * the agent went off-course. This system-level safety net puts the
1595
+ * user's message into the durable plan/* store — which the system
1596
+ * prompt auto-injects every turn — without depending on the model
1597
+ * making the call.
1598
+ *
1599
+ * What counts as "substantive":
1600
+ * - Content length ≥ 60 chars (~12 words) — short acks/yes-no don't qualify
1601
+ * - AND any of:
1602
+ * • contains a numbered list ("1.", "2.", "3." …)
1603
+ * • contains a bullet list (-, *, • at line start)
1604
+ * • OR is ≥ 120 chars (longer than a one-line ack)
1605
+ *
1606
+ * What gets saved:
1607
+ * - title: `plan/user-<short-slug>-<timestamp>`
1608
+ * - description: "auto-saved from user message at <iso>"
1609
+ * - content: the verbatim user message
1610
+ *
1611
+ * The agent can rename, consolidate, or delete these later. They
1612
+ * exist as a fail-safe — if the agent does its job and saves its
1613
+ * own better-named plan, these auto-saves can be cleaned up. If
1614
+ * the agent doesn't, at least the user's words survive compaction.
1615
+ *
1616
+ * @param {string} agentId
1617
+ * @param {Object} message - The queued user message
1618
+ * @private
1619
+ */
1620
+ async _autoSaveUserMessageAsPlan(agentId, message) {
1621
+ const content = typeof message?.content === 'string' ? message.content : '';
1622
+ if (!content) return;
1623
+ if (!this._looksSubstantive(content)) return;
1624
+
1625
+ // Lazy-load to keep agentPool's load order light. The same import
1626
+ // pattern as agentScheduler's plan injection.
1627
+ let memoryService;
1628
+ try {
1629
+ const mod = await import('../services/memoryService.js');
1630
+ memoryService = mod.getMemoryService(this.logger);
1631
+ await memoryService.initialize();
1632
+ } catch (e) {
1633
+ this.logger.debug?.('Auto-save plan: memory service unavailable', { error: e.message });
1634
+ return;
1635
+ }
1636
+
1637
+ // ── Deduplication ────────────────────────────────────────────────
1638
+ // Users repeat themselves ("I repeat my old message", "did you do
1639
+ // it all?" + paste the same thing). Without dedup the auto-saver
1640
+ // would create N copies of essentially the same plan. Load
1641
+ // existing plan/user-* memories and skip when the new content is
1642
+ // ≥70% similar to any of them (Jaccard over normalized word sets).
1643
+ let existingPlans = [];
1644
+ try {
1645
+ const all = await memoryService.loadMemories(agentId);
1646
+ existingPlans = (all || []).filter(m =>
1647
+ typeof m?.title === 'string' && m.title.startsWith('plan/user-')
1648
+ );
1649
+ } catch (e) {
1650
+ // Treat unreadable store as empty — we may still write a fresh entry.
1651
+ this.logger.debug?.('Auto-save plan: existing memories unreadable', { agentId, error: e.message });
1652
+ }
1653
+
1654
+ const newTokens = this._tokenize(content);
1655
+ for (const existing of existingPlans) {
1656
+ const existingTokens = this._tokenize(existing.content || '');
1657
+ const sim = this._jaccard(newTokens, existingTokens);
1658
+ const containment = this._overlapCoefficient(newTokens, existingTokens);
1659
+ // Jaccard catches near-identical reformulations. Containment
1660
+ // catches the "I repeat my old message — <same content>" case
1661
+ // where the user re-pastes the original plus a preamble. Either
1662
+ // signal is enough to suppress the duplicate.
1663
+ if (sim >= 0.7 || containment >= 0.85) {
1664
+ this.logger.info?.('Auto-save plan: skipping near-duplicate of existing plan', {
1665
+ agentId, existingTitle: existing.title,
1666
+ jaccard: sim.toFixed(2), containment: containment.toFixed(2),
1667
+ });
1668
+ return;
1669
+ }
1670
+ }
1671
+
1672
+ // ── Per-agent cap ────────────────────────────────────────────────
1673
+ // Bound the total auto-saved plans so an active session doesn't
1674
+ // bloat the agent's plan/* namespace indefinitely. Keep the K most
1675
+ // recent; delete the oldest auto-saves beyond that.
1676
+ const AUTO_PLAN_CAP = 8;
1677
+ const existingAutoSaves = existingPlans
1678
+ .filter(m => /^plan\/user-/.test(m.title))
1679
+ .sort((a, b) => String(a.createdAt || '').localeCompare(String(b.createdAt || '')));
1680
+ while (existingAutoSaves.length >= AUTO_PLAN_CAP) {
1681
+ const oldest = existingAutoSaves.shift();
1682
+ try {
1683
+ await memoryService.deleteMemory(agentId, oldest.id);
1684
+ this.logger.info?.('Auto-save plan: retired oldest auto-save to keep cap', {
1685
+ agentId, retiredTitle: oldest.title, cap: AUTO_PLAN_CAP,
1686
+ });
1687
+ } catch (e) {
1688
+ // Non-fatal — if we can't delete the oldest, just skip this entry
1689
+ // and proceed with the write. Worst case the plan list grows
1690
+ // by one beyond the cap — still bounded over time.
1691
+ this.logger.debug?.('Auto-save plan: retire-oldest failed', { agentId, error: e.message });
1692
+ break;
1693
+ }
1694
+ }
1695
+
1696
+ // ── Write the new memory ─────────────────────────────────────────
1697
+ const firstLine = (content.match(/[^\n]+/) || [''])[0].trim();
1698
+ const slug = firstLine
1699
+ .toLowerCase()
1700
+ .replace(/[^a-z0-9]+/g, '-')
1701
+ .replace(/^-+|-+$/g, '')
1702
+ .slice(0, 40) || 'request';
1703
+ const ts = new Date().toISOString().slice(0, 19).replace(/[:T]/g, '-');
1704
+ const title = `plan/user-${slug}-${ts}`;
1705
+
1706
+ try {
1707
+ await memoryService.addMemory(agentId, {
1708
+ title,
1709
+ description: `Auto-saved from user message at ${message.timestamp || new Date().toISOString()}`,
1710
+ content,
1711
+ });
1712
+ this.logger.info?.('Auto-saved user message as plan/* memory', {
1713
+ agentId, title, contentLength: content.length,
1714
+ });
1715
+ } catch (e) {
1716
+ this.logger.debug?.('Auto-save plan: write failed', { agentId, title, error: e.message });
1717
+ }
1718
+ }
1719
+
1720
+ /**
1721
+ * Tokenize a string into a lowercased word set for similarity checks.
1722
+ * Strips punctuation, drops short words (<3 chars), and drops a
1723
+ * small stopword set so that common words like "the" / "and" don't
1724
+ * inflate similarity scores between otherwise different messages.
1725
+ * @private
1726
+ */
1727
+ _tokenize(s) {
1728
+ if (typeof s !== 'string') return new Set();
1729
+ return new Set(
1730
+ s.toLowerCase()
1731
+ .replace(/[^a-z0-9\s]+/g, ' ')
1732
+ .split(/\s+/)
1733
+ .filter(w => w.length >= 3 && !AgentPool._STOPWORDS.has(w))
1734
+ );
1735
+ }
1736
+
1737
+ /**
1738
+ * Jaccard similarity over two word sets.
1739
+ * @private
1740
+ */
1741
+ _jaccard(a, b) {
1742
+ if (a.size === 0 && b.size === 0) return 1;
1743
+ if (a.size === 0 || b.size === 0) return 0;
1744
+ let intersection = 0;
1745
+ for (const w of a) if (b.has(w)) intersection += 1;
1746
+ return intersection / (a.size + b.size - intersection);
1747
+ }
1748
+
1749
+ /**
1750
+ * Overlap coefficient — intersection / size-of-smaller-set.
1751
+ * Returns 1.0 when one set is fully contained in the other,
1752
+ * regardless of how much the other set adds. Catches the "user
1753
+ * re-pastes their request with a preamble" duplicate case where
1754
+ * Jaccard would mark the messages as merely similar.
1755
+ * @private
1756
+ */
1757
+ _overlapCoefficient(a, b) {
1758
+ if (a.size === 0 || b.size === 0) return 0;
1759
+ let intersection = 0;
1760
+ for (const w of a) if (b.has(w)) intersection += 1;
1761
+ return intersection / Math.min(a.size, b.size);
1762
+ }
1763
+
1764
+ /**
1765
+ * Heuristic — does this user message look like a real request worth
1766
+ * preserving as a plan/*? Errs on the side of saving more (recall
1767
+ * over precision) — a stray auto-save is cheap; a lost user request
1768
+ * is catastrophic.
1769
+ * @private
1770
+ */
1771
+ _looksSubstantive(text) {
1772
+ if (typeof text !== 'string') return false;
1773
+ const t = text.trim();
1774
+ if (t.length < 30) return false;
1775
+ // Tool-result wrappers and previous-task boundaries are not user voice.
1776
+ if (t.startsWith('[Tool Results') || t.startsWith('[Previous Task')) return false;
1777
+
1778
+ // ── Pollution filter 1: dominated by questions ────────────────────
1779
+ // A message that's mostly questions wants an ANSWER, not a plan.
1780
+ // If the majority of non-empty lines end in '?' (or are
1781
+ // question-shaped), this is a query, not a request.
1782
+ if (this._dominatedByQuestions(t)) return false;
1783
+
1784
+ // ── Pollution filter 2: list items are just refs (paths, urls) ───
1785
+ // A list of file paths / URLs / commit hashes is the user pointing
1786
+ // the agent at things, not a multi-part plan. Save it only if the
1787
+ // surrounding prose carries imperative intent — and even then the
1788
+ // length gate handles that path.
1789
+ const hasList = /^\s*(?:\d+[.)]|[-*•])\s/m.test(t);
1790
+ if (hasList && this._listItemsAreJustReferences(t)) return false;
1791
+
1792
+ // ── Now apply the structural triggers ────────────────────────────
1793
+ // Numbered list — "1." / "1)" at a line start. Multi-part intent.
1794
+ // Require a minimum total length to avoid "1. yes 2. no" nonsense.
1795
+ if (/^\s*\d+[.)]\s/m.test(t) && t.length >= 60) return true;
1796
+ // Bullet list at line start. Same — strong intent signal + length.
1797
+ if (/^\s*[-*•]\s/m.test(t) && t.length >= 60) return true;
1798
+ // Free-form prose with no list markers must be substantial AND
1799
+ // contain an imperative-like signal (a verb you'd give as an
1800
+ // order). Raised from 120 → 150 to skip more pleasantries.
1801
+ if (t.length >= 150 && this._hasImperativeSignal(t)) return true;
1802
+ return false;
1803
+ }
1804
+
1805
+ /**
1806
+ * Heuristic: is this message mostly questions?
1807
+ * @private
1808
+ */
1809
+ _dominatedByQuestions(t) {
1810
+ // Split into non-empty lines.
1811
+ const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
1812
+ if (lines.length === 0) return false;
1813
+ // Strip leading list markers so we can look at the line's intent.
1814
+ const stripMarker = (l) => l.replace(/^(?:\d+[.)]|[-*•])\s+/, '');
1815
+ let questionLines = 0;
1816
+ for (const raw of lines) {
1817
+ const line = stripMarker(raw);
1818
+ // Ends in '?', OR starts with a question word at the line head.
1819
+ if (/\?\s*$/.test(line) || /^(?:what|why|how|when|where|who|which|is\b|are\b|do\b|does\b|can\b|could\b|should\b|would\b)\b/i.test(line)) {
1820
+ questionLines += 1;
1821
+ }
1822
+ }
1823
+ // Strict-majority rule: more than half of lines are questions.
1824
+ return questionLines * 2 > lines.length;
1825
+ }
1826
+
1827
+ /**
1828
+ * Heuristic: are the list items in this message just references
1829
+ * (file paths, URLs, commit hashes) with no imperative verb of their own?
1830
+ * @private
1831
+ */
1832
+ _listItemsAreJustReferences(t) {
1833
+ const lines = t.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
1834
+ const listItems = lines.filter(l => /^(?:\d+[.)]|[-*•])\s/.test(l));
1835
+ if (listItems.length === 0) return false;
1836
+ let refLikeCount = 0;
1837
+ for (const li of listItems) {
1838
+ const body = li.replace(/^(?:\d+[.)]|[-*•])\s+/, '').trim();
1839
+ // Only treat as a "reference" if the line IS the reference —
1840
+ // i.e. a path/URL/hash with no surrounding English. A short bug
1841
+ // description like "login button does nothing on Safari" still
1842
+ // counts as content, not a reference.
1843
+ // Path: contains '/' or '\' OR starts with '.' AND has NO spaces
1844
+ // URL: starts with http(s)://
1845
+ // Hash: 7-40 hex chars only, no spaces
1846
+ const isPath = (/[/\\]/.test(body) || /^\./.test(body)) && !/\s/.test(body);
1847
+ const isUrl = /^https?:\/\//.test(body) && !/\s/.test(body);
1848
+ const isHash = /^[0-9a-f]{7,40}$/i.test(body);
1849
+ if (isPath || isUrl || isHash) refLikeCount += 1;
1850
+ }
1851
+ // Strict-majority of list items are reference-like → ignore.
1852
+ return refLikeCount * 2 > listItems.length;
1853
+ }
1854
+
1855
+ /**
1856
+ * Heuristic: does the message contain a verb that signals "do this"?
1857
+ * Conservative — favors recall over precision.
1858
+ * @private
1859
+ */
1860
+ _hasImperativeSignal(t) {
1861
+ // Word-boundary match against a set of common imperative verbs.
1862
+ // Order matters only for readability — we check membership.
1863
+ return /\b(?:fix|add|build|implement|create|change|remove|delete|update|refactor|rewrite|migrate|integrate|configure|setup|set\s+up|design|generate|make|write|test|verify|ensure|review|optimize|improve|replace|move|rename|extract|split|merge|deploy|publish|ship|release|debug|investigate|analyze|reproduce|escalate|prioritize|schedule)\b/i.test(t);
1864
+ }
1865
+
1559
1866
  _autoCreateTaskForMessage(agent, message, source, priority) {
1560
1867
  if (!agent.taskList) {
1561
1868
  agent.taskList = { tasks: [], lastUpdated: new Date().toISOString() };
@@ -2105,6 +2105,48 @@ class AgentScheduler {
2105
2105
  });
2106
2106
  }
2107
2107
 
2108
// ── Auto-inject CURRENT TASK LIST every turn ───────────────────
// The task list lives in `agent.taskList.tasks` — durable, never
// affected by compaction. But the conversation messages that
// CREATED those tasks ARE compacted, so an agent that lost its
// recent history may forget the task list exists and replace it
// wholesale via `taskmanager sync`. Surface the current task list
// to the agent every turn so it can never "forget" what's already
// on the plan.
try {
  const rawTasks = agent.taskList?.tasks;
  // Drop null/undefined entries up front so one bad element cannot
  // abort the whole injection.
  const tasks = Array.isArray(rawTasks) ? rawTasks.filter(Boolean) : [];
  if (tasks.length > 0) {
    const lines = ['\n\n## CURRENT TASK LIST (live from agent state — survives compaction)\n'];
    lines.push('These tasks exist in your durable state RIGHT NOW. If the conversation history doesn\'t mention them, that\'s because compaction summarized that section away — the tasks are still there.\n');
    lines.push('Before issuing `taskmanager sync`, READ this list. If you sync with a different plan, you will be dropping these.\n');
    // Compact, scannable. Title + status + priority is enough.
    // A Map (not a plain object) so a status such as "constructor"
    // cannot resolve to an inherited member. Pre-seeding fixes the
    // display order: in_progress first, then the other known statuses.
    const byStatus = new Map([
      ['in_progress', []],
      ['pending', []],
      ['completed', []],
      ['cancelled', []],
    ]);
    for (const t of tasks) {
      const status = t.status || 'pending';
      if (!byStatus.has(status)) byStatus.set(status, []);
      byStatus.get(status).push(t);
    }
    // Maps iterate in insertion order: the four known statuses, then
    // any unrecognized status in first-seen order. Unrecognized
    // statuses are rendered too — omitting them would hide live tasks
    // from the agent, the very failure this block exists to prevent.
    for (const [status, group] of byStatus) {
      if (group.length === 0) continue;
      lines.push(`\n**${status}** (${group.length}):`);
      for (const t of group) {
        const pri = t.priority ? ` [${t.priority}]` : '';
        lines.push(`- ${t.title}${pri}`);
      }
    }
    enhancedSystemPrompt = (enhancedSystemPrompt || '') + lines.join('\n');
  }
} catch (taskInjectErr) {
  // Best-effort — never block the turn on this.
  this.logger.warn(`Task list injection failed for agent ${agentId} (continuing without)`, {
    error: taskInjectErr?.message,
  });
}
2149
+
2108
2150
  // Check if streaming is enabled - consider both agent config and user message preference
2109
2151
  // Get the last user message to check for streaming preference
2110
2152
  const lastUserMsg = [...conversationHistory].reverse().find(m => m.role === 'user');