@vellumai/assistant 0.10.1-dev.202606240206.7c2bca6 → 0.10.1-dev.202606240317.ea25efe

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vellumai/assistant",
3
- "version": "0.10.1-dev.202606240206.7c2bca6",
3
+ "version": "0.10.1-dev.202606240317.ea25efe",
4
4
  "license": "MIT",
5
5
  "type": "module",
6
6
  "exports": {
@@ -38,6 +38,7 @@ mock.module("../runtime/assistant-event-hub.js", () => ({
38
38
 
39
39
  const {
40
40
  DISK_PRESSURE_CLEAR_THRESHOLD_PERCENT,
41
+ DISK_PRESSURE_MIN_FREE_FLOOR_MB,
41
42
  DISK_PRESSURE_OVERRIDE_CONFIRMATION,
42
43
  DISK_PRESSURE_THRESHOLD_PERCENT,
43
44
  DISK_PRESSURE_WARNING_CLEAR_THRESHOLD_PERCENT,
@@ -342,4 +343,44 @@ describe("disk pressure guard", () => {
342
343
  setDiskUsage(DISK_PRESSURE_WARNING_CLEAR_THRESHOLD_PERCENT - 1);
343
344
  expect(evaluateDiskPressureNow().state).toBe("ok");
344
345
  });
346
+
347
+ test("stays ok at a critical usage percentage while ample free space remains", () => {
348
+ // 99% used of a large volume still leaves gigabytes free — above the floor.
349
+ const totalMb = 1_000_000;
350
+ const usedMb = Math.round(totalMb * 0.99); // freeMb ~= 10_000 MiB
351
+ setDiskUsage(usedMb, totalMb);
352
+ expect(diskSample!.freeMb).toBeGreaterThanOrEqual(
353
+ DISK_PRESSURE_MIN_FREE_FLOOR_MB,
354
+ );
355
+
356
+ const status = evaluateDiskPressureNow();
357
+
358
+ expect(status.state).toBe("ok");
359
+ expect(status.locked).toBe(false);
360
+ expect(status.effectivelyLocked).toBe(false);
361
+ });
362
+
363
+ test("stays ok at a warning usage percentage while ample free space remains", () => {
364
+ const totalMb = 1_000_000;
365
+ const usedMb = Math.round(totalMb * 0.85); // 85% used, freeMb ~= 150_000 MiB
366
+ setDiskUsage(usedMb, totalMb);
367
+
368
+ const status = evaluateDiskPressureNow();
369
+
370
+ expect(status.state).toBe("ok");
371
+ });
372
+
373
+ test("locks at a critical usage percentage once free space drops below the floor", () => {
374
+ // High percentage AND little absolute headroom: floor does not apply.
375
+ const totalMb = 100_000;
376
+ const freeMb = DISK_PRESSURE_MIN_FREE_FLOOR_MB - 1;
377
+ setDiskUsage(totalMb - freeMb, totalMb);
378
+ expect(diskSample!.freeMb).toBeLessThan(DISK_PRESSURE_MIN_FREE_FLOOR_MB);
379
+
380
+ const status = evaluateDiskPressureNow();
381
+
382
+ expect(status.state).toBe("critical");
383
+ expect(status.locked).toBe(true);
384
+ expect(status.effectivelyLocked).toBe(true);
385
+ });
345
386
  });
@@ -200,3 +200,42 @@ describe("UsageTrackingProvider", () => {
200
200
  });
201
201
  });
202
202
  });
203
+
204
+ describe("native web-search capability survives the wrapper chain", () => {
205
+ function leaf(supports: boolean | undefined): Provider {
206
+ return {
207
+ name: "anthropic",
208
+ ...(supports === undefined ? {} : { supportsNativeWebSearch: supports }),
209
+ async sendMessage(): Promise<ProviderResponse> {
210
+ return {
211
+ content: [{ type: "text", text: "" }],
212
+ model: "m",
213
+ usage: { inputTokens: 0, outputTokens: 0 },
214
+ stopReason: "end_turn",
215
+ };
216
+ },
217
+ };
218
+ }
219
+
220
+ test("UsageTrackingProvider forwards supportsNativeWebSearch", () => {
221
+ expect(new UsageTrackingProvider(leaf(true)).supportsNativeWebSearch).toBe(
222
+ true,
223
+ );
224
+ expect(new UsageTrackingProvider(leaf(false)).supportsNativeWebSearch).toBe(
225
+ false,
226
+ );
227
+ expect(
228
+ new UsageTrackingProvider(leaf(undefined)).supportsNativeWebSearch,
229
+ ).toBeUndefined();
230
+ });
231
+
232
+ test("CallSiteConfiguredProvider forwards it through a nested wrapper", () => {
233
+ // The exact chain getConfiguredProvider returns: CallSiteConfigured →
234
+ // UsageTracking → leaf. The advisor consult reads the flag off the top.
235
+ const wrapped = new CallSiteConfiguredProvider(
236
+ new UsageTrackingProvider(leaf(true)),
237
+ "advisor",
238
+ );
239
+ expect(wrapped.supportsNativeWebSearch).toBe(true);
240
+ });
241
+ });
@@ -36,6 +36,9 @@ function makeFakeTool(name: string): Tool {
36
36
  category: "test",
37
37
  defaultRiskLevel: RiskLevel.Low,
38
38
  executionTarget: "sandbox",
39
+ // Match the finalized shape the registry stores, so identity comparisons
40
+ // (`getTool(name)` toEqual coreTool) hold after registration fills defaults.
41
+ exclusive: false,
39
42
  input_schema: { type: "object", properties: {}, required: [] },
40
43
  async execute(
41
44
  _input: Record<string, unknown>,
@@ -94,6 +94,9 @@ function makeFakeCoreTool(name: string): Tool {
94
94
  category: "test",
95
95
  defaultRiskLevel: RiskLevel.Low,
96
96
  executionTarget: "sandbox",
97
+ // Match the finalized shape the registry stores (defaults filled), so
98
+ // `getCoreToolOverride(name)` toEqual comparisons hold after registration.
99
+ exclusive: false,
97
100
  input_schema: { type: "object", properties: {}, required: [] },
98
101
  async execute(
99
102
  _input: Record<string, unknown>,
@@ -0,0 +1,150 @@
1
+ /**
2
+ * Verifies the agent loop's exclusive-tool dispatch: when a tool the loop is
3
+ * told is exclusive (e.g. the advisor) appears in a multi-call turn, only that
4
+ * tool runs and the siblings are deferred un-run with a benign result — so the
5
+ * model incorporates the exclusive tool's output before acting on anything
6
+ * else. Drives the REAL loop, mocking only the provider boundary.
7
+ */
8
+ import { describe, expect, test } from "bun:test";
9
+
10
+ import { createMockProvider } from "../__tests__/helpers/mock-provider.js";
11
+ import type { ContentBlock, ProviderResponse } from "../providers/types.js";
12
+ import { AgentLoop } from "./loop.js";
13
+
14
+ const endTurn = (text: string): ProviderResponse => ({
15
+ content: [{ type: "text", text }],
16
+ model: "mock-model",
17
+ usage: { inputTokens: 1, outputTokens: 1 },
18
+ stopReason: "end_turn",
19
+ });
20
+
21
+ const toolUseTurn = (
22
+ blocks: Array<{ id: string; name: string }>,
23
+ ): ProviderResponse => ({
24
+ content: [
25
+ { type: "text", text: "working" },
26
+ ...blocks.map((b) => ({
27
+ type: "tool_use" as const,
28
+ id: b.id,
29
+ name: b.name,
30
+ input: {},
31
+ })),
32
+ ],
33
+ model: "mock-model",
34
+ usage: { inputTokens: 1, outputTokens: 1 },
35
+ stopReason: "tool_use",
36
+ });
37
+
38
+ function toolResults(history: { content: ContentBlock[] }[]) {
39
+ return history
40
+ .flatMap((m) => m.content)
41
+ .filter(
42
+ (b): b is Extract<ContentBlock, { type: "tool_result" }> =>
43
+ b.type === "tool_result",
44
+ );
45
+ }
46
+
47
+ const baseRun = {
48
+ requestId: "req-excl",
49
+ onEvent: () => {},
50
+ callSite: "mainAgent" as const,
51
+ trust: { sourceChannel: "vellum" as const, trustClass: "unknown" as const },
52
+ };
53
+
54
+ describe("AgentLoop — exclusive tool deferral", () => {
55
+ test("runs the exclusive tool alone and defers sibling calls un-run", async () => {
56
+ const { provider } = createMockProvider([
57
+ toolUseTurn([
58
+ { id: "call-advisor", name: "advisor" },
59
+ { id: "call-edit", name: "write_file" },
60
+ ]),
61
+ endTurn("done"),
62
+ ]);
63
+
64
+ const executed: string[] = [];
65
+ const loop = new AgentLoop({
66
+ provider,
67
+ systemPrompt: "sys",
68
+ conversationId: "excl-1",
69
+ tools: [
70
+ { name: "advisor", description: "", input_schema: { type: "object" } },
71
+ {
72
+ name: "write_file",
73
+ description: "",
74
+ input_schema: { type: "object" },
75
+ },
76
+ ],
77
+ toolExecutor: async (name) => {
78
+ executed.push(name);
79
+ return { content: `ran ${name}`, isError: false };
80
+ },
81
+ isExclusiveTool: (name) => name === "advisor",
82
+ });
83
+
84
+ const { history } = await loop.run({
85
+ ...baseRun,
86
+ messages: [{ role: "user", content: [{ type: "text", text: "do it" }] }],
87
+ });
88
+
89
+ // Only the exclusive tool actually executed.
90
+ expect(executed).toEqual(["advisor"]);
91
+
92
+ const results = toolResults(history);
93
+ const advisorResult = results.find(
94
+ (b) => b.tool_use_id === "call-advisor",
95
+ )!;
96
+ const editResult = results.find((b) => b.tool_use_id === "call-edit")!;
97
+
98
+ // The advisor ran; the sibling came back un-run (not an error) so the model
99
+ // can re-issue it after reading the guidance.
100
+ expect(advisorResult.content).toBe("ran advisor");
101
+ expect(editResult.content).toContain("not run");
102
+ expect(editResult.content).toContain("advisor");
103
+ expect(editResult.is_error).toBe(false);
104
+ });
105
+
106
+ test("runs sibling tools normally when no exclusive tool is present", async () => {
107
+ const { provider } = createMockProvider([
108
+ toolUseTurn([
109
+ { id: "call-read", name: "read_file" },
110
+ { id: "call-write", name: "write_file" },
111
+ ]),
112
+ endTurn("done"),
113
+ ]);
114
+
115
+ const executed: string[] = [];
116
+ const loop = new AgentLoop({
117
+ provider,
118
+ systemPrompt: "sys",
119
+ conversationId: "excl-2",
120
+ tools: [
121
+ {
122
+ name: "read_file",
123
+ description: "",
124
+ input_schema: { type: "object" },
125
+ },
126
+ {
127
+ name: "write_file",
128
+ description: "",
129
+ input_schema: { type: "object" },
130
+ },
131
+ ],
132
+ toolExecutor: async (name) => {
133
+ executed.push(name);
134
+ return { content: `ran ${name}`, isError: false };
135
+ },
136
+ isExclusiveTool: (name) => name === "advisor",
137
+ });
138
+
139
+ const { history } = await loop.run({
140
+ ...baseRun,
141
+ messages: [{ role: "user", content: [{ type: "text", text: "do it" }] }],
142
+ });
143
+
144
+ // Both non-exclusive tools ran; nothing was deferred.
145
+ expect(executed.sort()).toEqual(["read_file", "write_file"]);
146
+ for (const result of toolResults(history)) {
147
+ expect(result.content).not.toContain("not run");
148
+ }
149
+ });
150
+ });
package/src/agent/loop.ts CHANGED
@@ -625,6 +625,20 @@ export type LoopToolExecutor = (
625
625
  activityMetadata?: ToolActivityMetadata;
626
626
  }>;
627
627
 
628
+ /**
629
+ * The benign result returned for a sibling tool call that was deferred because
630
+ * an exclusive tool ran in the same turn. Phrased so the model treats it as a
631
+ * "not run yet" signal — read the exclusive tool's output, then re-issue this
632
+ * call if it is still the right next step.
633
+ */
634
+ function deferredForExclusiveMessage(exclusiveToolName: string): string {
635
+ return (
636
+ `(not run: \`${exclusiveToolName}\` was called this turn and runs first, on its own, ` +
637
+ `so the rest of your tool calls were held back. Read its output, then call this tool ` +
638
+ `again if it is still the right next step.)`
639
+ );
640
+ }
641
+
628
642
  export interface AgentLoopConstructorOptions {
629
643
  /** LLM provider the loop issues every call through. */
630
644
  provider: Provider;
@@ -634,6 +648,14 @@ export interface AgentLoopConstructorOptions {
634
648
  tools?: ToolDefinition[];
635
649
  toolExecutor?: LoopToolExecutor;
636
650
  resolveTools?: (history: Message[]) => ToolDefinition[];
651
+ /**
652
+ * Decide whether a tool runs exclusively in its turn (see
653
+ * {@link ToolDefinition.exclusive}). When it returns true for a tool present
654
+ * in a multi-call turn, the loop runs only that tool and defers the siblings
655
+ * un-run. Injected by the conversation wiring, which can read the tool
656
+ * registry; lightweight loops that omit it never defer.
657
+ */
658
+ isExclusiveTool?: (toolName: string) => boolean;
637
659
  /**
638
660
  * Conversation this loop drives. Scopes the loop-held compaction circuit
639
661
  * breaker and is the source of truth the loop's pipeline contexts and
@@ -659,6 +681,7 @@ export class AgentLoop {
659
681
  private tools: ToolDefinition[];
660
682
  private resolveTools: ((history: Message[]) => ToolDefinition[]) | null;
661
683
  private toolExecutor: LoopToolExecutor | null;
684
+ private isExclusiveTool: ((toolName: string) => boolean) | null;
662
685
 
663
686
  /**
664
687
  * Conversation this loop drives. Source of truth for the `conversationId`
@@ -688,6 +711,7 @@ export class AgentLoop {
688
711
  tools,
689
712
  toolExecutor,
690
713
  resolveTools,
714
+ isExclusiveTool,
691
715
  conversationId,
692
716
  resolveConversationDir,
693
717
  } = options;
@@ -697,6 +721,7 @@ export class AgentLoop {
697
721
  this.tools = tools ?? [];
698
722
  this.resolveTools = resolveTools ?? null;
699
723
  this.toolExecutor = toolExecutor ?? null;
724
+ this.isExclusiveTool = isExclusiveTool ?? null;
700
725
  this.conversationId = conversationId;
701
726
  this.resolveConversationDir = resolveConversationDir ?? null;
702
727
  this.compactionCircuit = new CompactionCircuit(this.conversationId);
@@ -1883,8 +1908,39 @@ export class AgentLoop {
1883
1908
  "Tool execution start",
1884
1909
  );
1885
1910
 
1911
+ // When an exclusive tool (e.g. the advisor) is among this turn's calls,
1912
+ // it must run alone: the model should incorporate its output before
1913
+ // acting on anything else. Run only the first exclusive call and defer
1914
+ // the siblings with a benign, un-run result so the model re-issues them
1915
+ // next turn if still needed. Every tool_use still gets a matching
1916
+ // tool_result, so history stays well-formed.
1917
+ const exclusiveBlock = this.isExclusiveTool
1918
+ ? toolUseBlocks.find((block) => this.isExclusiveTool!(block.name))
1919
+ : undefined;
1920
+ const deferSiblings =
1921
+ exclusiveBlock !== undefined && toolUseBlocks.length > 1;
1922
+ if (deferSiblings) {
1923
+ rlog.info(
1924
+ {
1925
+ turn: toolUseTurns,
1926
+ exclusiveTool: exclusiveBlock!.name,
1927
+ deferred: toolUseBlocks
1928
+ .filter((block) => block !== exclusiveBlock)
1929
+ .map((block) => block.name),
1930
+ },
1931
+ "Exclusive tool present — running it alone and deferring sibling tool calls this turn",
1932
+ );
1933
+ }
1934
+
1886
1935
  const toolExecutionPromise = Promise.all(
1887
1936
  toolUseBlocks.map(async (toolUse) => {
1937
+ if (deferSiblings && toolUse !== exclusiveBlock) {
1938
+ const result: Awaited<ReturnType<LoopToolExecutor>> = {
1939
+ content: deferredForExclusiveMessage(exclusiveBlock!.name),
1940
+ isError: false,
1941
+ };
1942
+ return { toolUse, result };
1943
+ }
1888
1944
  const result = await this.toolExecutor!(
1889
1945
  toolUse.name,
1890
1946
  toolUse.input,
@@ -91,7 +91,7 @@ import {
91
91
  isActivationMomentParam,
92
92
  } from "../telemetry/activation-funnel.js";
93
93
  import { ToolExecutor } from "../tools/executor.js";
94
- import { getAllToolDefinitions } from "../tools/registry.js";
94
+ import { getAllToolDefinitions, getTool } from "../tools/registry.js";
95
95
  import type { ToolLifecycleEvent } from "../tools/types.js";
96
96
  import type { OnboardingContext } from "../types/onboarding-context.js";
97
97
  import type { AbortReason } from "../util/abort-reasons.js";
@@ -702,6 +702,9 @@ export class Conversation {
702
702
  tools: toolDefs.length > 0 ? toolDefs : undefined,
703
703
  toolExecutor: toolDefs.length > 0 ? toolExecutor : undefined,
704
704
  resolveTools,
705
+ // A tool the registry marks exclusive (e.g. `advisor`) runs alone in its
706
+ // turn; the loop defers any sibling calls until the next turn.
707
+ isExclusiveTool: (name) => getTool(name)?.exclusive === true,
705
708
  resolveConversationDir: () => {
706
709
  const conv = getConversation(this.conversationId);
707
710
  if (!conv) return null;
@@ -24,6 +24,12 @@ export const DISK_PRESSURE_CLEAR_THRESHOLD_PERCENT = 90;
24
24
  // clears the warning state, which discards the banner's (state-scoped) dismissal
25
25
  // so it re-appears the moment usage ticks back up.
26
26
  export const DISK_PRESSURE_WARNING_CLEAR_THRESHOLD_PERCENT = 77;
27
+ // Absolute free-space floor (MiB). Regardless of usage percentage, never enter
28
+ // the warning or critical state while at least this much space remains free. A
29
+ // high usage percentage on a large disk can still leave many gigabytes
30
+ // available, where locking is pointless. Small volumes (where a high percentage
31
+ // genuinely means near-full) drop below the floor and remain protected.
32
+ export const DISK_PRESSURE_MIN_FREE_FLOOR_MB = 2048;
27
33
  export const DISK_PRESSURE_CHECK_INTERVAL_MS = 60_000;
28
34
  export const DISK_PRESSURE_OVERRIDE_CONFIRMATION = "I understand the risks";
29
35
  export const DISK_PRESSURE_BLOCKED_CAPABILITIES = [
@@ -219,7 +225,10 @@ export function evaluateDiskPressureNow(): DiskPressureStatus {
219
225
  const criticalThreshold = state.status.locked
220
226
  ? DISK_PRESSURE_CLEAR_THRESHOLD_PERCENT
221
227
  : DISK_PRESSURE_THRESHOLD_PERCENT;
222
- const isCritical = usagePercent >= criticalThreshold;
228
+ // Absolute free-space floor overrides the percentage thresholds: while ample
229
+ // space remains free, report "ok" no matter how full the volume is by percent.
230
+ const hasAmpleFreeSpace = usageInfo.freeMb >= DISK_PRESSURE_MIN_FREE_FLOOR_MB;
231
+ const isCritical = !hasAmpleFreeSpace && usagePercent >= criticalThreshold;
223
232
  // Mirror the critical deadband for the warning band: once in an active
224
233
  // pressure state (warning or critical), hold warning until usage clears the
225
234
  // lower warning-clear threshold. Treating "critical" as active here matters
@@ -235,7 +244,8 @@ export function evaluateDiskPressureNow(): DiskPressureStatus {
235
244
  const warningThreshold = inActivePressureState
236
245
  ? DISK_PRESSURE_WARNING_CLEAR_THRESHOLD_PERCENT
237
246
  : DISK_PRESSURE_WARNING_THRESHOLD_PERCENT;
238
- const isWarning = !isCritical && usagePercent >= warningThreshold;
247
+ const isWarning =
248
+ !hasAmpleFreeSpace && !isCritical && usagePercent >= warningThreshold;
239
249
  const lastCheckedAt = new Date().toISOString();
240
250
 
241
251
  if (!isCritical && !isWarning) {
@@ -8,10 +8,14 @@ let sendMessageArgs: Record<string, unknown> | null = null;
8
8
  let responseText = "Use a channel-based worker pool; drain on shutdown.";
9
9
  let sendMessageError: Error | null = null;
10
10
  let providerResolves = true;
11
+ let providerSupportsWeb = false;
11
12
  let streamDeltas: string[] = [];
12
13
 
13
14
  const fakeProvider = {
14
15
  name: "mock-advisor-provider",
16
+ get supportsNativeWebSearch() {
17
+ return providerSupportsWeb;
18
+ },
15
19
  async sendMessage(messages: unknown, options: unknown) {
16
20
  sendMessageArgs = { messages, options } as Record<string, unknown>;
17
21
  if (sendMessageError) throw sendMessageError;
@@ -36,6 +40,14 @@ mock.module("../../../../providers/provider-send-message.js", () => ({
36
40
  getConfiguredProvider: async () => (providerResolves ? fakeProvider : null),
37
41
  }));
38
42
 
43
+ // Keep the tool tests focused on the consult wiring: stub the context pack so
44
+ // they don't reach into the registry / workspace / memory sources (those have
45
+ // their own coverage). The consult itself never imports this module.
46
+ mock.module("../context-pack.js", () => ({
47
+ buildAdvisorContext: async () => null,
48
+ deriveRecallQuery: () => null,
49
+ }));
50
+
39
51
  const { consultAdvisor } = await import("../consult.js");
40
52
  const advisorTool = (await import("../tools/advisor.js")).default;
41
53
  const { recordSystemPrompt, recordMessages, resetAdvisorStateForTests } =
@@ -56,6 +68,7 @@ beforeEach(() => {
56
68
  responseText = "Use a channel-based worker pool; drain on shutdown.";
57
69
  sendMessageError = null;
58
70
  providerResolves = true;
71
+ providerSupportsWeb = false;
59
72
  streamDeltas = [];
60
73
  resetAdvisorStateForTests();
61
74
  });
@@ -108,6 +121,37 @@ describe("consultAdvisor", () => {
108
121
  expect(options.systemPrompt).toContain("You are a coding agent.");
109
122
  });
110
123
 
124
+ test("stays tool-less when the provider has no native web search", async () => {
125
+ providerSupportsWeb = false;
126
+ await consultAdvisor({ systemPrompt: null, messages: [userMsg("hi")] });
127
+ const options = sendMessageArgs?.options as { tools?: unknown };
128
+ expect(options.tools).toBeUndefined();
129
+ expect(optionConfig().tool_choice).toEqual({ type: "none" });
130
+ });
131
+
132
+ test("enables native web search when the provider supports it", async () => {
133
+ providerSupportsWeb = true;
134
+ await consultAdvisor({ systemPrompt: null, messages: [userMsg("hi")] });
135
+
136
+ const options = sendMessageArgs?.options as {
137
+ tools?: Array<{ name: string }>;
138
+ };
139
+ expect(options.tools?.map((t) => t.name)).toEqual(["web_search"]);
140
+ // tool_choice must not be `none`, or the provider suppresses its server tool.
141
+ expect(optionConfig().tool_choice).toEqual({ type: "auto" });
142
+ });
143
+
144
+ test("embeds the runtime context in the advisor system prompt", async () => {
145
+ await consultAdvisor({
146
+ systemPrompt: "You are a coding agent.",
147
+ messages: [userMsg("hi")],
148
+ runtimeContext: "## Available tools\n- bash — run commands",
149
+ });
150
+ const options = sendMessageArgs?.options as { systemPrompt: string };
151
+ expect(options.systemPrompt).toContain("<agent_runtime_context>");
152
+ expect(options.systemPrompt).toContain("- bash — run commands");
153
+ });
154
+
111
155
  test("soft-fails when no provider is configured", async () => {
112
156
  providerResolves = false;
113
157
  const advice = await consultAdvisor({
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Personal-memory gating for the advisor context pack: NOW.md and PKB must only
3
+ * reach the advisor when the turn's trust admits personal memory (and, for
4
+ * NOW.md, when the scratchpad-injection toggle is on) — the same policy the
5
+ * runtime memory injectors apply. Without it, a low-risk advisor consult on a
6
+ * remote/trusted-contact turn could forward private content the main agent
7
+ * would never receive.
8
+ *
9
+ * Mocks are isolated to this file (the test runner runs each file in its own
10
+ * process), so the broad module stubs here don't leak into other suites.
11
+ */
12
+ import { beforeEach, describe, expect, mock, test } from "bun:test";
13
+
14
+ let personalAllowed = false;
15
+ let scratchpadEnabled = true;
16
+ let gateArg: unknown = null;
17
+
18
+ mock.module("../../../../daemon/trust-context.js", () => ({
19
+ isPersonalMemoryAllowed: (trust: unknown) => {
20
+ gateArg = trust;
21
+ return personalAllowed;
22
+ },
23
+ }));
24
+ mock.module("../../../../daemon/now-scratchpad.js", () => ({
25
+ readNowScratchpad: () => "NOW-CONTENT",
26
+ }));
27
+ mock.module("../../../../memory/pkb/context.js", () => ({
28
+ readPkbContext: () => "PKB-CONTENT",
29
+ }));
30
+ mock.module("../../../../config/loader.js", () => ({
31
+ getConfig: () => ({
32
+ memory: {
33
+ retrieval: { scratchpadInjection: { enabled: scratchpadEnabled } },
34
+ },
35
+ llm: {},
36
+ }),
37
+ }));
38
+ // Keep every other section empty so the assertions isolate NOW.md / PKB.
39
+ mock.module("../../../../daemon/conversation-workspace.js", () => ({
40
+ resolveWorkspaceTopLevelContext: () => null,
41
+ }));
42
+ mock.module("../../../../daemon/conversation-runtime-assembly.js", () => ({
43
+ buildActiveDocuments: () => null,
44
+ }));
45
+ mock.module("../../../../runtime/capabilities.js", () => ({
46
+ resolveCapabilities: () => ({ canAccessMemory: false }),
47
+ }));
48
+ mock.module("../../../../config/skills.js", () => ({
49
+ loadSkillCatalog: () => [],
50
+ }));
51
+
52
+ const { buildAdvisorContext } = await import("../context-pack.js");
53
+
54
+ const sources = {
55
+ conversationId: "c1",
56
+ workingDir: "/tmp",
57
+ // A remote, non-guardian per-turn snapshot — the case the live-state read
58
+ // could have wrongly elevated.
59
+ trustClass: "unknown" as const,
60
+ sourceChannel: "telegram",
61
+ transcript: [],
62
+ allowedToolNames: new Set<string>(),
63
+ };
64
+
65
+ beforeEach(() => {
66
+ personalAllowed = false;
67
+ scratchpadEnabled = true;
68
+ gateArg = null;
69
+ });
70
+
71
+ describe("advisor context pack — personal-memory gating", () => {
72
+ test("withholds NOW.md and PKB when personal memory is disallowed", async () => {
73
+ personalAllowed = false;
74
+ const ctx = (await buildAdvisorContext(sources)) ?? "";
75
+ expect(ctx).not.toContain("NOW-CONTENT");
76
+ expect(ctx).not.toContain("PKB-CONTENT");
77
+ });
78
+
79
+ test("includes NOW.md and PKB when allowed and the scratchpad toggle is on", async () => {
80
+ personalAllowed = true;
81
+ scratchpadEnabled = true;
82
+ const ctx = await buildAdvisorContext(sources);
83
+ expect(ctx).toContain("NOW-CONTENT");
84
+ expect(ctx).toContain("PKB-CONTENT");
85
+ });
86
+
87
+ test("withholds NOW.md when the scratchpad toggle is off, PKB still allowed", async () => {
88
+ personalAllowed = true;
89
+ scratchpadEnabled = false;
90
+ const ctx = (await buildAdvisorContext(sources)) ?? "";
91
+ expect(ctx).not.toContain("NOW-CONTENT");
92
+ expect(ctx).toContain("PKB-CONTENT");
93
+ });
94
+
95
+ test("feeds the gate the per-turn trust snapshot, not live conversation state", async () => {
96
+ personalAllowed = true;
97
+ await buildAdvisorContext(sources);
98
+ // The gate must see exactly the snapshot threaded from ToolContext —
99
+ // trustClass + executionChannel — so a concurrent live-trust change can't
100
+ // elevate this invocation.
101
+ expect(gateArg).toEqual({
102
+ sourceChannel: "telegram",
103
+ trustClass: "unknown",
104
+ });
105
+ });
106
+ });
@@ -0,0 +1,60 @@
1
+ import { describe, expect, test } from "bun:test";
2
+
3
+ import type { Message } from "../../../../providers/types.js";
4
+ import { buildAdvisorContext, deriveRecallQuery } from "../context-pack.js";
5
+
6
+ const userMsg = (t: string): Message => ({
7
+ role: "user",
8
+ content: [{ type: "text", text: t }],
9
+ });
10
+
11
+ describe("deriveRecallQuery", () => {
12
+ test("returns the most recent user message text", () => {
13
+ const query = deriveRecallQuery([
14
+ userMsg("the original task"),
15
+ { role: "assistant", content: [{ type: "text", text: "ok" }] },
16
+ userMsg("the latest question"),
17
+ ]);
18
+ expect(query).toBe("the latest question");
19
+ });
20
+
21
+ test("returns null when there is no user text", () => {
22
+ expect(
23
+ deriveRecallQuery([
24
+ { role: "assistant", content: [{ type: "text", text: "hi" }] },
25
+ ]),
26
+ ).toBeNull();
27
+ expect(deriveRecallQuery([])).toBeNull();
28
+ });
29
+ });
30
+
31
+ describe("buildAdvisorContext", () => {
32
+ test("lists the agent's available tools, skipping the advisor itself", async () => {
33
+ const context = await buildAdvisorContext({
34
+ conversationId: "ctx-1",
35
+ workingDir: "/tmp/does-not-exist",
36
+ allowedToolNames: new Set(["bash", "advisor", "read_file"]),
37
+ trustClass: "unknown",
38
+ transcript: [userMsg("hi")],
39
+ });
40
+
41
+ expect(context).toContain("## Available tools");
42
+ expect(context).toContain("- bash");
43
+ expect(context).toContain("- read_file");
44
+ // The advisor advises; it never tells the agent to consult itself.
45
+ expect(context).not.toContain("- advisor");
46
+ });
47
+
48
+ test("omits the tools section when no tools are available", async () => {
49
+ const context = await buildAdvisorContext({
50
+ conversationId: "ctx-2",
51
+ workingDir: "/tmp/does-not-exist",
52
+ allowedToolNames: new Set(),
53
+ trustClass: "unknown",
54
+ transcript: [],
55
+ });
56
+ // Other sources (e.g. the skills catalog) may still contribute, but with no
57
+ // allowed tools the tools section must not appear.
58
+ if (context !== null) expect(context).not.toContain("## Available tools");
59
+ });
60
+ });