onbuzz 4.8.0 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/core/__tests__/agentPool.test.js +185 -0
- package/src/core/__tests__/agentScheduler.nativePromptPick.test.js +319 -0
- package/src/core/__tests__/agentScheduler.taskListInjection.test.js +94 -0
- package/src/core/agentPool.js +319 -0
- package/src/core/agentScheduler.js +216 -2
- package/src/services/__tests__/conversationCompactionService.test.js +141 -0
- package/src/services/__tests__/modelRouterNaming.test.js +41 -23
- package/src/services/conversationCompactionService.js +120 -46
- package/src/tools/__tests__/baseTool.test.js +171 -0
- package/src/tools/__tests__/codeMapTool.test.js +179 -0
- package/src/tools/__tests__/taskManagerTool.test.js +141 -0
- package/src/tools/baseTool.js +89 -1
- package/src/tools/openaiFunctionSchemas.js +14 -0
- package/src/tools/skillsTool.js +282 -277
- package/src/tools/taskManagerTool.js +72 -2
- package/src/utilities/constants.js +19 -1
package/package.json
CHANGED
|
@@ -420,6 +420,191 @@ describe('AgentPool', () => {
|
|
|
420
420
|
expect(agent.delayEndTime).toBe(pastIso);
|
|
421
421
|
});
|
|
422
422
|
|
|
423
|
+
// ── Auto-save user messages as plan/* memories ────────────────────
|
|
424
|
+
// Talisman case study: agents never voluntarily wrote memories
|
|
425
|
+
// and the user's literal asks vanished in compaction. The pool
|
|
426
|
+
// now auto-saves substantive user messages to the plan/* store
|
|
427
|
+
// as a system-level safety net (the agent itself can still write
|
|
428
|
+
// better-titled plans on top; these auto-saves are cleanup-safe).
|
|
429
|
+
|
|
430
|
+
describe('REGRESSION: auto-save substantive user messages as plan/*', () => {
  // Both tests replace pool._autoSaveUserMessageAsPlan with a spy. The spy
  // is restored in a `finally` so that a failing assertion (or the absence
  // of an explicit restore) cannot leave the stub installed on `pool`.

  test('calls _autoSaveUserMessageAsPlan for each user message (best-effort)', async () => {
    const agent = await pool.createAgent(agentCfg());
    const spy = jest.spyOn(pool, '_autoSaveUserMessageAsPlan').mockResolvedValue(undefined);
    try {
      await pool.addUserMessage(agent.id, { content: 'please do a thing that is long enough to count as substantive content here, more than 120 chars total for the heuristic', role: 'user' });
      expect(spy).toHaveBeenCalledTimes(1);
      // First argument is the agent id, second is the message object itself.
      expect(spy.mock.calls[0][0]).toBe(agent.id);
      expect(spy.mock.calls[0][1].content).toMatch(/please do a thing/);
    } finally {
      spy.mockRestore();
    }
  });

  test('a memory-service failure does NOT block the message-enqueue path', async () => {
    // Auto-save is best-effort. If memoryService blows up, the
    // user's message must still land on the queue.
    const agent = await pool.createAgent(agentCfg());
    const spy = jest.spyOn(pool, '_autoSaveUserMessageAsPlan').mockRejectedValue(new Error('memory store offline'));
    try {
      await pool.addUserMessage(agent.id, { content: 'hello', role: 'user' });
      expect(agent.messageQueues.userMessages).toHaveLength(1);
    } finally {
      spy.mockRestore();
    }
  });
});
|
|
450
|
+
|
|
451
|
+
describe('_looksSubstantive heuristic — pollution audit', () => {
  // Thin wrappers so each case reads as "input → verdict". `pool` is looked
  // up at call time, i.e. inside the running test.
  const expectSaved = (content) => expect(pool._looksSubstantive(content)).toBe(true);
  const expectRejected = (content) => expect(pool._looksSubstantive(content)).toBe(false);

  // ── TRUE POSITIVES — these SHOULD be saved ─────────────────────
  test('TRUE POSITIVE: numbered request (Talisman literal case)', () => {
    const request = `Amazing! a few more things
1. "choose your hero" page - lets put a nice fantasy image at the background.
2. play screen - why is the board still round? plus, not all nodes have their own art.
3. when we roll for combat - there is no dice animation visible`;
    expectSaved(request);
  });

  test('TRUE POSITIVE: multi-paragraph feature request', () => {
    const request = 'Implement OAuth login. Replace the existing session-cookie flow with Google and Microsoft sign-in. Migrate existing users by linking their email to the new auth records on first sign-in. Update the docs.';
    expectSaved(request);
  });

  test('TRUE POSITIVE: numbered bug list (descriptions, not paths)', () => {
    const request = `Fix these bugs:\n1. login button does nothing on Safari\n2. cart total wrong when discount applied\n3. 500 error on /api/orders/export`;
    expectSaved(request);
  });

  test('TRUE POSITIVE: bulleted plan with imperative verbs', () => {
    const request = `fixes needed:\n- fix bug in login flow that breaks on slow networks\n- add retry logic to the cache layer\n- write a test for the missing error case`;
    expectSaved(request);
  });

  // ── POLLUTION RISKS — these should NOT be saved ────────────────
  test('POLLUTION: short ack is rejected', () => {
    for (const ack of ['ok thanks', 'thanks!', 'yes please']) {
      expectRejected(ack);
    }
  });

  test('POLLUTION: question with no imperative is rejected (user wants an answer, not a plan)', () => {
    const question = 'what does this function do? I see it referenced in three places but the docs do not explain its purpose at all.';
    expectRejected(question);
  });

  test('POLLUTION: numbered list of QUESTIONS is rejected (not a plan)', () => {
    const questions = `quick questions:\n1. why is the cache key the user id?\n2. is the rate limit per-user or per-ip?`;
    expectRejected(questions);
  });

  test('POLLUTION: list of file paths is rejected (just references)', () => {
    const pathList = `look at these:\n- src/auth/login.js\n- src/auth/session.js\n- src/middleware/auth.js\n- tests/auth.test.js`;
    expectRejected(pathList);
  });

  test('POLLUTION: long pleasantry with no imperative is rejected', () => {
    const pleasantry = 'hey, hope you are well today, that was a good run earlier and I am happy with the progress so far!';
    expectRejected(pleasantry);
  });

  test('POLLUTION: trivial numbered nonsense is rejected (too short under list rule)', () => {
    expectRejected('1. yes 2. no 3. maybe');
  });

  test('POLLUTION: reading-comprehension feedback is rejected', () => {
    expectRejected('I do not understand what you wrote in your last message. Can you explain it differently?');
  });

  test('POLLUTION: agreement / confirmation is rejected', () => {
    expectRejected('yes, please proceed with that plan. it looks correct to me.');
  });

  test('POLLUTION: simple one-shot instruction is rejected (too short)', () => {
    expectRejected('run the tests again');
  });

  // ── Defensive corner cases ─────────────────────────────────────
  test('tool-result wrappers are rejected', () => {
    const wrappers = [
      '[Tool Results — 1 result from 1 tool batch: filesystem] {...lots of content...}',
      '[Previous Task — Final Tool Results] [jobdone] {...}',
    ];
    for (const wrapped of wrappers) {
      expectRejected(wrapped);
    }
  });

  test('non-string content is rejected defensively', () => {
    for (const notAString of [null, undefined, { obj: true }]) {
      expectRejected(notAString);
    }
  });
});
|
|
531
|
+
|
|
532
|
+
describe('helpers: question / reference / imperative detection', () => {
  test('_dominatedByQuestions detects majority-question content', () => {
    const questionOnly = ['what does X do?', '1. what?\n2. why?\n3. how?'];
    for (const sample of questionOnly) {
      expect(pool._dominatedByQuestions(sample)).toBe(true);
    }
    expect(pool._dominatedByQuestions('fix the bug')).toBe(false);
    // Mixed: 1 question + 2 commands → not dominated.
    const mixed = 'why is this slow?\nfix the cache\nadd metrics';
    expect(pool._dominatedByQuestions(mixed)).toBe(false);
  });

  test('_listItemsAreJustReferences detects path-only lists', () => {
    // Every bullet is a bare file path.
    const verdict = pool._listItemsAreJustReferences(`look:\n- src/a.js\n- src/b.js\n- src/c.js`);
    expect(verdict).toBe(true);
  });

  test('_listItemsAreJustReferences does NOT flag bug descriptions', () => {
    // Bullets are prose descriptions, not paths.
    const verdict = pool._listItemsAreJustReferences(`bugs:\n- login breaks on Safari\n- cart total wrong with discount`);
    expect(verdict).toBe(false);
  });

  test('_hasImperativeSignal matches common command verbs', () => {
    const commands = ['fix the login bug', 'add a retry', 'refactor this module'];
    const nonCommands = ['what does this do', 'looks good'];
    for (const text of commands) {
      expect(pool._hasImperativeSignal(text)).toBe(true);
    }
    for (const text of nonCommands) {
      expect(pool._hasImperativeSignal(text)).toBe(false);
    }
  });
});
|
|
559
|
+
|
|
560
|
+
describe('dedup + cap (Jaccard similarity, AUTO_PLAN_CAP)', () => {
  test('_tokenize lowercases + drops short words + strips punctuation', () => {
    const tokens = pool._tokenize('Fix THE login button on iOS!');
    // 'on' is under 3 characters. 'the' is exactly 3 — the same length as the
    // retained 'fix' and 'ios' — so it is presumably removed by a stop-word
    // rule rather than the length filter (tokenizer not visible here; confirm).
    for (const kept of ['fix', 'login', 'button', 'ios']) {
      expect(tokens.has(kept)).toBe(true);
    }
    for (const dropped of ['the', 'on']) {
      expect(tokens.has(dropped)).toBe(false);
    }
  });

  test('_jaccard returns 1 for identical text, 0 for disjoint, 1 for both empty', () => {
    const first = pool._tokenize('fix the login button');
    const sameAsFirst = pool._tokenize('fix the login button');
    const unrelated = pool._tokenize('build the cache layer');
    expect(pool._jaccard(first, sameAsFirst)).toBe(1);
    expect(pool._jaccard(first, unrelated)).toBe(0);
    expect(pool._jaccard(new Set(), new Set())).toBe(1);
  });

  test('_overlapCoefficient catches "I repeat my old message" near-duplicate (containment ≥0.85)', () => {
    // Real Talisman case: the user repeated an earlier message verbatim
    // behind an "I repeat my old message" preamble. The preamble adds new
    // words, so Jaccard alone only rates the pair as "similar"; the overlap
    // coefficient (intersection / smaller-set) reports that the original is
    // contained in the repeated version.
    const originalText = `1. "choose your hero" page - lets put a nice fantasy image at the background. also, lets create a character card art (general)`;
    const repeatedText = `Amazing! I repeat my old message - a few more things\n\n"choose your hero" page - lets put a nice fantasy image at the background. also, lets create a character card art (general)`;
    const original = pool._tokenize(originalText);
    const repeated = pool._tokenize(repeatedText);
    expect(pool._overlapCoefficient(original, repeated)).toBeGreaterThanOrEqual(0.85);
  });

  test('_overlapCoefficient is 0 for disjoint sets, 1 for one fully contained in the other', () => {
    const small = pool._tokenize('fix login button');
    const superset = pool._tokenize('fix login button additional extra words');
    const disjoint = pool._tokenize('build cache layer');
    // `small` is fully contained in `superset` → 1.0
    expect(pool._overlapCoefficient(small, superset)).toBe(1);
    // no shared tokens
    expect(pool._overlapCoefficient(small, disjoint)).toBe(0);
  });
});
|
|
607
|
+
|
|
423
608
|
test('no-ops when delayEndTime is null', async () => {
|
|
424
609
|
const agent = await pool.createAgent(agentCfg());
|
|
425
610
|
agent.delayEndTime = null;
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for `_resolveModelApiType` + `_pickSystemPromptForModel`
|
|
3
|
+
* on AgentScheduler — the runtime-side half of the "trim duplicated
|
|
4
|
+
* tool docs for Responses-API models" feature.
|
|
5
|
+
*
|
|
6
|
+
* Three concerns covered:
|
|
7
|
+
* 1. Classification (_resolveModelApiType) — every catalog shape the
|
|
8
|
+
* backend's _inferRouting recognizes must produce the same answer
|
|
9
|
+
* on the CLI side, or the optimization fires on the wrong models.
|
|
10
|
+
* 2. Selection (_pickSystemPromptForModel) — must fall back to
|
|
11
|
+
* `agent.systemPrompt` for every safety-net path:
|
|
12
|
+
* • modelsService missing → fallback
|
|
13
|
+
* • model not in catalog → fallback
|
|
14
|
+
* • apiType resolves to chat_completion → fallback
|
|
15
|
+
* • originalSystemPrompt missing → fallback
|
|
16
|
+
* • toolsRegistry missing → fallback
|
|
17
|
+
* • rebuild throws → fallback
|
|
18
|
+
* Only the happy path (Responses model + everything available)
|
|
19
|
+
* returns the trimmed rebuild.
|
|
20
|
+
* 3. Caching — same (agent, model) twice should rebuild ONCE; the
|
|
21
|
+
* invalidator should clear entries for ONE agent only.
|
|
22
|
+
*/
|
|
23
|
+
import { jest, describe, test, expect, beforeEach } from '@jest/globals';
|
|
24
|
+
|
|
25
|
+
// Stub the activity service so importing the scheduler doesn't drag in
|
|
26
|
+
// the full agent dependency graph for these unit tests.
|
|
27
|
+
// Replace the activity service with inert stubs so importing the scheduler
// does not pull in the full agent dependency graph for these unit tests.
jest.unstable_mockModule('../../services/agentActivityService.js', () => {
  const activityStub = {
    shouldAgentBeActive() {
      return { active: false, reason: 'stub' };
    },
    getActiveAgents() {
      return [];
    },
    shouldSkipIteration() {
      return false;
    },
  };
  return activityStub;
});
|
|
32
|
+
|
|
33
|
+
const { default: AgentScheduler } = await import('../agentScheduler.js');
|
|
34
|
+
|
|
35
|
+
// ─── Test-only helpers ────────────────────────────────────────────────
|
|
36
|
+
/**
 * Build an AgentScheduler wired to minimal stubs.
 *
 * @param {object} [opts]
 * @param {Array} [opts.models=[]] - catalog returned by the stub modelsService.getModels().
 * @param {object|null} [opts.registry=null] - exposed as agentPool.toolsRegistry.
 * @param {object|null} [opts.logger=null] - logger to use; a silent no-op logger when falsy.
 * @returns {AgentScheduler} scheduler instance under test.
 */
function makeScheduler({ models = [], registry = null, logger = null } = {}) {
  const silentLogger = {
    info() {}, warn() {}, error() {}, debug() {},
  };
  const poolStub = {
    toolsRegistry: registry,
    async getAllAgents() { return new Map(); },
    async getAgent() { return null; },
  };
  const catalogStub = {
    getModels() { return models; },
  };

  return new AgentScheduler(
    poolStub,
    {},                      // messageProcessor
    {},                      // aiService
    logger || silentLogger,
    null,                    // webSocketManager
    null,                    // modelRouterService
    catalogStub,             // modelsService ← what we care about
  );
}
|
|
60
|
+
|
|
61
|
+
// A minimal fake registry that records the apiType it was called with
|
|
62
|
+
// so we can assert the scheduler propagates it correctly.
|
|
63
|
+
/**
 * Fake tools registry that records every enhanceSystemPrompt() call.
 *
 * @param {object} [opts]
 * @param {boolean} [opts.shouldThrow=false] - when true, enhanceSystemPrompt
 *   records the call and then throws Error('boom').
 * @returns {{calls: Array, enhanceSystemPrompt: Function}} `calls` holds one
 *   `{ prompt, capabilities, options }` entry per invocation, in order.
 */
function makeRegistry({ shouldThrow = false } = {}) {
  const calls = [];

  const enhanceSystemPrompt = (prompt, capabilities, options) => {
    // Record first, so a throwing registry still shows the attempted call.
    calls.push({ prompt, capabilities, options });
    if (shouldThrow) {
      throw new Error('boom');
    }
    let apiTag = '[full]';
    if (options?.apiType === 'responses') {
      apiTag = '[trimmed]';
    }
    const capList = (capabilities || []).join(',');
    return `${prompt}\n## TOOLS ${apiTag} for caps=${capList}`;
  };

  return { calls, enhanceSystemPrompt };
}
|
|
75
|
+
|
|
76
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
77
|
+
// 1. _resolveModelApiType — parity with backend's _inferRouting
|
|
78
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
79
|
+
|
|
80
|
+
describe('_resolveModelApiType — catalog → routing decision', () => {
  // Resolve `name` against a scheduler whose catalog is exactly `models`.
  const resolveWith = (models, name) => makeScheduler({ models })._resolveModelApiType(name);
  const quietLogger = () => ({
    info() {}, warn() {}, error() {}, debug() {},
  });

  test('api_type=["responses"] alone → "responses"', () => {
    const catalog = [{ name: 'codex-mini', api_type: ['responses'] }];
    expect(resolveWith(catalog, 'codex-mini')).toBe('responses');
  });

  test('api_type=["chat_completion","responses"] (BOTH present) → "chat_completion"', () => {
    // Backend rule: only flips to responses when chat_completion is ABSENT.
    const catalog = [{ name: 'gpt-5', api_type: ['chat_completion', 'responses'] }];
    expect(resolveWith(catalog, 'gpt-5')).toBe('chat_completion');
  });

  test('api_type=["chat_completion"] → "chat_completion"', () => {
    const catalog = [{ name: 'claude', api_type: ['chat_completion'] }];
    expect(resolveWith(catalog, 'claude')).toBe('chat_completion');
  });

  test('capabilities.responses=="true" + chatCompletion=="false" → "responses"', () => {
    // NOTE(review): this entry also carries api_type: ['responses'], which
    // the first test shows is enough on its own to yield "responses" — so
    // this case may not isolate the capabilities path. Confirm against the
    // resolver before relying on it as capabilities coverage.
    const catalog = [{
      name: 'o3',
      api_type: ['responses'],
      capabilities: { responses: 'true', chatCompletion: 'false' },
    }];
    expect(resolveWith(catalog, 'o3')).toBe('responses');
  });

  test('explicit useResponsesApi flag → "responses"', () => {
    const catalog = [{ name: 'foo', api_type: [], useResponsesApi: true }];
    expect(resolveWith(catalog, 'foo')).toBe('responses');
  });

  test('name-pattern fallback: "codex" → "responses" even with no catalog data', () => {
    const catalog = [{ name: 'gpt-5-1-codex-mini' }];
    expect(resolveWith(catalog, 'gpt-5-1-codex-mini')).toBe('responses');
  });

  test('name-pattern fallback: "gpt-X-pro" → "responses"', () => {
    const catalog = [{ name: 'gpt-5-pro' }];
    expect(resolveWith(catalog, 'gpt-5-pro')).toBe('responses');
  });

  test('unknown model returns undefined (caller falls back to old behaviour)', () => {
    const catalog = [{ name: 'claude', api_type: ['chat_completion'] }];
    expect(resolveWith(catalog, 'mystery-model')).toBeUndefined();
  });

  test('modelsService missing → undefined (safe — caller falls back)', () => {
    // Constructed with only four arguments: no modelsService is passed.
    const scheduler = new AgentScheduler({ toolsRegistry: null }, {}, {}, quietLogger());
    expect(scheduler._resolveModelApiType('codex-mini')).toBeUndefined();
  });

  test('arbitrary exception in catalog → undefined (defensive)', () => {
    // getModels() throws — must not propagate.
    const explodingCatalog = {
      getModels() { throw new Error('catalog offline'); },
    };
    const scheduler = new AgentScheduler(
      { toolsRegistry: null },
      {},
      {},
      quietLogger(),
      null,
      null,
      explodingCatalog,
    );
    expect(scheduler._resolveModelApiType('codex-mini')).toBeUndefined();
  });
});
|
|
153
|
+
|
|
154
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
155
|
+
// 2. _pickSystemPromptForModel — happy path + every safety-net path
|
|
156
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
157
|
+
|
|
158
|
+
describe('_pickSystemPromptForModel — back-compat fallbacks', () => {
  const BAKED_PROMPT = 'BAKED: agent persona\n## AVAILABLE TOOLS\n…lots of text…';
  const ORIGINAL_PROMPT = 'Agent persona';

  // Baseline agent; `overrides` wins over the defaults (including explicit
  // `undefined`, which is how the "missing originalSystemPrompt" case is built).
  function agentFor(overrides = {}) {
    const base = {
      id: 'agent-1',
      systemPrompt: BAKED_PROMPT,
      originalSystemPrompt: ORIGINAL_PROMPT,
      capabilities: ['memory', 'terminal'],
      skills: [],
    };
    return Object.assign(base, overrides);
  }

  // Fresh single-entry catalog with a Responses-only model named 'codex'.
  const responsesCatalog = () => [{ name: 'codex', api_type: ['responses'] }];

  test('chat-completion model → returns agent.systemPrompt verbatim', async () => {
    const scheduler = makeScheduler({
      models: [{ name: 'claude', api_type: ['chat_completion'] }],
      registry: makeRegistry(),
    });
    const picked = await scheduler._pickSystemPromptForModel(agentFor(), 'claude');
    expect(picked).toBe(BAKED_PROMPT);
  });

  test('unknown model → returns agent.systemPrompt verbatim', async () => {
    const scheduler = makeScheduler({ models: [], registry: makeRegistry() });
    const picked = await scheduler._pickSystemPromptForModel(agentFor(), 'never-heard-of');
    expect(picked).toBe(BAKED_PROMPT);
  });

  test('Responses model BUT originalSystemPrompt missing → fallback', async () => {
    // Very old persisted agent (pre-originalSystemPrompt storage).
    const legacyAgent = agentFor({ originalSystemPrompt: undefined });
    const scheduler = makeScheduler({
      models: responsesCatalog(),
      registry: makeRegistry(),
    });
    const picked = await scheduler._pickSystemPromptForModel(legacyAgent, 'codex');
    expect(picked).toBe(BAKED_PROMPT);
  });

  test('Responses model BUT toolsRegistry missing → fallback', async () => {
    const scheduler = makeScheduler({
      models: responsesCatalog(),
      registry: null,
    });
    const picked = await scheduler._pickSystemPromptForModel(agentFor(), 'codex');
    expect(picked).toBe(BAKED_PROMPT);
  });

  test('Responses model AND rebuild throws → fallback (no crash)', async () => {
    const scheduler = makeScheduler({
      models: responsesCatalog(),
      registry: makeRegistry({ shouldThrow: true }),
    });
    const picked = await scheduler._pickSystemPromptForModel(agentFor(), 'codex');
    expect(picked).toBe(BAKED_PROMPT);
  });
});
|
|
217
|
+
|
|
218
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
219
|
+
// 3. _pickSystemPromptForModel — happy path
|
|
220
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
221
|
+
|
|
222
|
+
describe('_pickSystemPromptForModel — Responses model rebuild', () => {
  // One agent shared by every test in this group.
  const agent = {
    id: 'agent-1',
    systemPrompt: 'STALE (chat-completion shape)',
    originalSystemPrompt: 'Agent persona',
    capabilities: ['memory', 'terminal'],
    skills: [],
  };

  // Pair a fresh recording registry with a scheduler over `models`.
  function setup(models) {
    const registry = makeRegistry();
    const scheduler = makeScheduler({ models, registry });
    return { registry, scheduler };
  }

  test('rebuilds with apiType:"responses" and returns trimmed prompt', async () => {
    const { registry, scheduler } = setup([{ name: 'codex', api_type: ['responses'] }]);
    const rebuilt = await scheduler._pickSystemPromptForModel(agent, 'codex');
    expect(rebuilt).toContain('[trimmed]');
    expect(rebuilt).toContain('Agent persona');
    // The registry was called with apiType: 'responses' AND the agent's capabilities.
    expect(registry.calls).toHaveLength(1);
    const [onlyCall] = registry.calls;
    expect(onlyCall.options.apiType).toBe('responses');
    expect(onlyCall.capabilities).toEqual(['memory', 'terminal']);
  });

  test('caches per (agentId, modelName) — second call does NOT rebuild', async () => {
    const { registry, scheduler } = setup([{ name: 'codex', api_type: ['responses'] }]);
    // Three sequential lookups for the same (agent, model) pair.
    for (let attempt = 0; attempt < 3; attempt += 1) {
      await scheduler._pickSystemPromptForModel(agent, 'codex');
    }
    expect(registry.calls).toHaveLength(1);
  });

  test('different models for same agent → SEPARATE cache entries', async () => {
    const { registry, scheduler } = setup([
      { name: 'codex', api_type: ['responses'] },
      { name: 'gpt-5-pro', api_type: ['responses'] },
    ]);
    await scheduler._pickSystemPromptForModel(agent, 'codex');
    await scheduler._pickSystemPromptForModel(agent, 'gpt-5-pro');
    expect(registry.calls).toHaveLength(2);
  });

  test('switching back to chat-completion mid-session uses the persisted prompt unchanged', async () => {
    const { registry, scheduler } = setup([
      { name: 'codex', api_type: ['responses'] },
      { name: 'claude', api_type: ['chat_completion'] },
    ]);
    const native = await scheduler._pickSystemPromptForModel(agent, 'codex');
    const inline = await scheduler._pickSystemPromptForModel(agent, 'claude');
    expect(native).toContain('[trimmed]');
    expect(inline).toBe('STALE (chat-completion shape)'); // persisted, untouched
    expect(registry.calls).toHaveLength(1); // only the codex rebuild ran
  });
});
|
|
288
|
+
|
|
289
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
290
|
+
// 4. Cache invalidation
|
|
291
|
+
// ──────────────────────────────────────────────────────────────────────
|
|
292
|
+
|
|
293
|
+
describe('_invalidateNativePromptCache — selective per-agent clear', () => {
  test('clears entries for ONE agent only, leaves others alone', async () => {
    const registry = makeRegistry();
    const scheduler = makeScheduler({
      models: [{ name: 'codex', api_type: ['responses'] }],
      registry,
    });
    const firstAgent = { id: 'a1', systemPrompt: 'p1', originalSystemPrompt: 'persona-1', capabilities: ['memory'], skills: [] };
    const secondAgent = { id: 'a2', systemPrompt: 'p2', originalSystemPrompt: 'persona-2', capabilities: ['terminal'], skills: [] };

    // Warm the cache: one rebuild per agent.
    await scheduler._pickSystemPromptForModel(firstAgent, 'codex');
    await scheduler._pickSystemPromptForModel(secondAgent, 'codex');
    expect(registry.calls).toHaveLength(2);

    scheduler._invalidateNativePromptCache('a1');

    // Re-fetching a1 → rebuild. a2 → still cached.
    await scheduler._pickSystemPromptForModel(firstAgent, 'codex');
    await scheduler._pickSystemPromptForModel(secondAgent, 'codex');
    expect(registry.calls).toHaveLength(3); // only a1 rebuilt
  });

  test('invalidating for an agent that never rendered is a no-op (does not throw)', async () => {
    const scheduler = makeScheduler({ models: [], registry: makeRegistry() });
    const invalidateUnknown = () => scheduler._invalidateNativePromptCache('never-seen');
    expect(invalidateUnknown).not.toThrow();
  });
});
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Contract tests for the per-turn task-list injection in agentScheduler.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists: the Talisman production failure was caused by
|
|
5
|
+
* post-compaction context loss — the agent forgot it already had a
|
|
6
|
+
* 9-task plan and called `taskmanager sync` with 4 unrelated tasks,
|
|
7
|
+
* silently destroying the in-flight work. The destructive-sync
|
|
8
|
+
* guardrail in taskManagerTool catches that AT THE SYNC POINT, but
|
|
9
|
+
* the cleaner fix is to never let the agent forget the task list in
|
|
10
|
+
* the first place — inject it into the system prompt every turn.
|
|
11
|
+
*
|
|
12
|
+
* This test pins the contract by source-grep (matches the existing
|
|
13
|
+
* pattern in agentScheduler.taskLifecycleInstruction.test.js): the
|
|
14
|
+
* scheduler's source must contain the injection block AND its key
|
|
15
|
+
* properties. Reading the file is cheaper than instantiating the
|
|
16
|
+
* full scheduler graph (websocket, compaction service, AI service,
|
|
17
|
+
* etc.) just to assert on prompt content.
|
|
18
|
+
*/
|
|
19
|
+
import { describe, test, expect } from '@jest/globals';
|
|
20
|
+
import { readFileSync } from 'fs';
|
|
21
|
+
import { fileURLToPath } from 'url';
|
|
22
|
+
import { dirname, join } from 'path';
|
|
23
|
+
|
|
24
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const thisFilePath = fileURLToPath(import.meta.url);
const __dirname = dirname(thisFilePath);
// Full text of the scheduler source, read once and grepped by every test below.
const schedulerSourcePath = join(__dirname, '../agentScheduler.js');
const SRC = readFileSync(schedulerSourcePath, 'utf-8');
|
|
26
|
+
|
|
27
|
+
describe('agentScheduler — per-turn task-list injection', () => {
  // Every test greps SRC, the raw text of agentScheduler.js. Nothing here
  // instantiates the scheduler.

  test('injection block exists in agentScheduler.js source', () => {
    expect(SRC).toContain('CURRENT TASK LIST');
  });

  test('emits the heading the agent will key off of', () => {
    // The block heading must be specific and stable — the agent's
    // system prompt is the only thing it sees, so the heading needs
    // to be unambiguous.
    expect(SRC).toMatch(/##\s*CURRENT TASK LIST/);
  });

  test('reads from durable agent state (agent.taskList.tasks), not from conversation history', () => {
    // The whole point of this injection is to recover from a
    // conversation-history loss (compaction). It MUST read the live,
    // durable taskList — not derive from messages.
    expect(SRC).toMatch(/agent\.taskList\??\.tasks/);
  });

  test('the block runs AFTER the plan/* injection (cache-friendly ordering)', () => {
    // Stable cacheable system-prompt prefix is critical for the
    // model-side prompt cache. Both injections are dynamic per turn,
    // but ordering must be consistent so the prefix up to plan/*
    // stays stable across turns when only the task list moves.
    const planIdx = SRC.indexOf('AGENT WORKING PLAN');
    const taskIdx = SRC.indexOf('CURRENT TASK LIST');
    expect(planIdx).toBeGreaterThan(-1);
    expect(taskIdx).toBeGreaterThan(-1);
    expect(taskIdx).toBeGreaterThan(planIdx);
  });

  test('mentions compaction-resistance explicitly', () => {
    // The agent reads the injected block. It must know that the
    // injection is a deliberate signal — not just historical residue
    // — so it gives appropriate weight to the listed tasks.
    // NOTE(review): the bare `compaction` alternative subsumes the two
    // longer phrases, so this passes on any occurrence of "compaction"
    // anywhere in the file. Left unchanged; tighten once the exact
    // wording of the injected block is confirmed.
    expect(SRC).toMatch(/(survives compaction|compaction summarized|compaction)/);
  });

  test('warns the agent about destructive sync inside the injected block', () => {
    // The injection is the natural place to point the agent at the
    // guardrail. Strong message: do not blindly sync past the
    // existing tasks.
    expect(SRC).toMatch(/(sync|drop).*existing|before issuing.*sync/i);
  });

  test('groups tasks by status (in_progress / pending / completed / cancelled)', () => {
    // A flat list of 30 tasks is hard to scan. Status grouping puts
    // in_progress first — what the agent needs to act on.
    expect(SRC).toContain('in_progress');
    expect(SRC).toContain('pending');
    expect(SRC).toContain('completed');
    expect(SRC).toContain('cancelled');
  });

  test('degrades gracefully — never throws on missing taskList', () => {
    // If taskList is undefined or .tasks is missing/empty, the
    // injection should silently skip — never break the turn.
    expect(SRC).toMatch(/agent\.taskList\?\.tasks/);
    // And the whole block is wrapped in try/catch with a logger warn.
    const blockStart = SRC.indexOf('Auto-inject CURRENT TASK LIST');
    const blockEnd = SRC.indexOf('Check if streaming is enabled');
    // Guard the markers before slicing: indexOf returns -1 for a missing
    // marker, and slice(start, -1) would then cover nearly the rest of the
    // file, letting the try/catch assertions pass on unrelated code.
    expect(blockStart).toBeGreaterThan(-1);
    expect(blockEnd).toBeGreaterThan(blockStart);
    const block = SRC.slice(blockStart, blockEnd);
    expect(block).toMatch(/try\s*{/);
    expect(block).toMatch(/catch\s*\(/);
    expect(block).toMatch(/continuing without/);
  });
});
|