@vellumai/assistant 0.3.19 → 0.3.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. package/ARCHITECTURE.md +151 -15
  2. package/Dockerfile +1 -0
  3. package/README.md +40 -4
  4. package/bun.lock +139 -2
  5. package/docs/architecture/integrations.md +7 -11
  6. package/package.json +2 -1
  7. package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +54 -0
  8. package/src/__tests__/approval-primitive.test.ts +540 -0
  9. package/src/__tests__/assistant-feature-flag-guard.test.ts +206 -0
  10. package/src/__tests__/assistant-feature-flag-guardrails.test.ts +198 -0
  11. package/src/__tests__/assistant-feature-flags-integration.test.ts +272 -0
  12. package/src/__tests__/call-controller.test.ts +439 -108
  13. package/src/__tests__/channel-invite-transport.test.ts +264 -0
  14. package/src/__tests__/cli.test.ts +42 -1
  15. package/src/__tests__/config-schema.test.ts +11 -127
  16. package/src/__tests__/config-watcher.test.ts +0 -8
  17. package/src/__tests__/daemon-lifecycle.test.ts +1 -0
  18. package/src/__tests__/daemon-server-session-init.test.ts +8 -2
  19. package/src/__tests__/diff.test.ts +22 -0
  20. package/src/__tests__/guardian-action-copy-generator.test.ts +5 -0
  21. package/src/__tests__/guardian-action-grant-mint-consume.test.ts +300 -32
  22. package/src/__tests__/guardian-action-late-reply.test.ts +546 -1
  23. package/src/__tests__/guardian-actions-endpoint.test.ts +774 -0
  24. package/src/__tests__/guardian-control-plane-policy.test.ts +36 -3
  25. package/src/__tests__/guardian-dispatch.test.ts +124 -0
  26. package/src/__tests__/guardian-grant-minting.test.ts +6 -17
  27. package/src/__tests__/inbound-invite-redemption.test.ts +367 -0
  28. package/src/__tests__/invite-redemption-service.test.ts +306 -0
  29. package/src/__tests__/ipc-snapshot.test.ts +57 -0
  30. package/src/__tests__/notification-decision-fallback.test.ts +88 -0
  31. package/src/__tests__/sandbox-diagnostics.test.ts +6 -249
  32. package/src/__tests__/sandbox-host-parity.test.ts +6 -13
  33. package/src/__tests__/scoped-approval-grants.test.ts +6 -6
  34. package/src/__tests__/scoped-grant-security-matrix.test.ts +5 -4
  35. package/src/__tests__/script-proxy-session-manager.test.ts +1 -19
  36. package/src/__tests__/session-load-history-repair.test.ts +169 -2
  37. package/src/__tests__/session-runtime-assembly.test.ts +33 -5
  38. package/src/__tests__/skill-feature-flags-integration.test.ts +171 -0
  39. package/src/__tests__/skill-feature-flags.test.ts +188 -0
  40. package/src/__tests__/skill-load-feature-flag.test.ts +141 -0
  41. package/src/__tests__/skill-mirror-parity.test.ts +1 -0
  42. package/src/__tests__/skill-projection-feature-flag.test.ts +363 -0
  43. package/src/__tests__/system-prompt.test.ts +1 -1
  44. package/src/__tests__/terminal-sandbox.test.ts +142 -9
  45. package/src/__tests__/terminal-tools.test.ts +2 -93
  46. package/src/__tests__/thread-seed-composer.test.ts +18 -0
  47. package/src/__tests__/tool-approval-handler.test.ts +350 -0
  48. package/src/__tests__/trusted-contact-lifecycle-notifications.test.ts +8 -10
  49. package/src/__tests__/voice-scoped-grant-consumer.test.ts +46 -84
  50. package/src/agent/loop.ts +36 -1
  51. package/src/approvals/approval-primitive.ts +381 -0
  52. package/src/approvals/guardian-decision-primitive.ts +191 -0
  53. package/src/calls/call-controller.ts +252 -209
  54. package/src/calls/call-domain.ts +44 -6
  55. package/src/calls/guardian-dispatch.ts +48 -0
  56. package/src/calls/types.ts +1 -1
  57. package/src/calls/voice-session-bridge.ts +46 -30
  58. package/src/cli/core-commands.ts +0 -4
  59. package/src/cli/mcp.ts +58 -0
  60. package/src/cli.ts +76 -34
  61. package/src/config/__tests__/feature-flag-registry-guard.test.ts +179 -0
  62. package/src/config/assistant-feature-flags.ts +162 -0
  63. package/src/config/bundled-skills/api-mapping/icon.svg +18 -0
  64. package/src/config/bundled-skills/messaging/TOOLS.json +30 -0
  65. package/src/config/bundled-skills/messaging/tools/slack-delete-message.ts +24 -0
  66. package/src/config/bundled-skills/notifications/SKILL.md +1 -1
  67. package/src/config/bundled-skills/reminder/SKILL.md +49 -2
  68. package/src/config/bundled-skills/time-based-actions/SKILL.md +49 -2
  69. package/src/config/bundled-skills/voice-setup/SKILL.md +122 -0
  70. package/src/config/core-schema.ts +1 -1
  71. package/src/config/env-registry.ts +10 -0
  72. package/src/config/feature-flag-registry.json +61 -0
  73. package/src/config/loader.ts +22 -1
  74. package/src/config/mcp-schema.ts +46 -0
  75. package/src/config/sandbox-schema.ts +0 -39
  76. package/src/config/schema.ts +18 -2
  77. package/src/config/skill-state.ts +34 -0
  78. package/src/config/skills-schema.ts +0 -1
  79. package/src/config/skills.ts +9 -0
  80. package/src/config/system-prompt.ts +110 -46
  81. package/src/config/templates/SOUL.md +1 -1
  82. package/src/config/types.ts +19 -1
  83. package/src/config/vellum-skills/catalog.json +1 -1
  84. package/src/config/vellum-skills/guardian-verify-setup/SKILL.md +1 -0
  85. package/src/config/vellum-skills/sms-setup/SKILL.md +1 -1
  86. package/src/config/vellum-skills/telegram-setup/SKILL.md +6 -5
  87. package/src/config/vellum-skills/trusted-contacts/SKILL.md +105 -3
  88. package/src/config/vellum-skills/twilio-setup/SKILL.md +1 -1
  89. package/src/daemon/config-watcher.ts +0 -1
  90. package/src/daemon/daemon-control.ts +1 -1
  91. package/src/daemon/guardian-invite-intent.ts +124 -0
  92. package/src/daemon/handlers/avatar.ts +68 -0
  93. package/src/daemon/handlers/browser.ts +2 -2
  94. package/src/daemon/handlers/guardian-actions.ts +120 -0
  95. package/src/daemon/handlers/index.ts +4 -0
  96. package/src/daemon/handlers/sessions.ts +19 -0
  97. package/src/daemon/handlers/shared.ts +3 -1
  98. package/src/daemon/install-cli-launchers.ts +58 -13
  99. package/src/daemon/ipc-contract/guardian-actions.ts +53 -0
  100. package/src/daemon/ipc-contract/sessions.ts +8 -2
  101. package/src/daemon/ipc-contract/settings.ts +25 -2
  102. package/src/daemon/ipc-contract-inventory.json +10 -0
  103. package/src/daemon/ipc-contract.ts +4 -0
  104. package/src/daemon/lifecycle.ts +14 -2
  105. package/src/daemon/main.ts +1 -0
  106. package/src/daemon/providers-setup.ts +26 -1
  107. package/src/daemon/server.ts +1 -0
  108. package/src/daemon/session-lifecycle.ts +52 -7
  109. package/src/daemon/session-memory.ts +45 -0
  110. package/src/daemon/session-process.ts +258 -432
  111. package/src/daemon/session-runtime-assembly.ts +12 -0
  112. package/src/daemon/session-skill-tools.ts +14 -1
  113. package/src/daemon/session-tool-setup.ts +5 -0
  114. package/src/daemon/session.ts +11 -0
  115. package/src/daemon/shutdown-handlers.ts +11 -0
  116. package/src/daemon/tool-side-effects.ts +35 -9
  117. package/src/index.ts +2 -2
  118. package/src/mcp/client.ts +152 -0
  119. package/src/mcp/manager.ts +139 -0
  120. package/src/memory/conversation-display-order-migration.ts +44 -0
  121. package/src/memory/conversation-queries.ts +2 -0
  122. package/src/memory/conversation-store.ts +91 -0
  123. package/src/memory/db-init.ts +5 -1
  124. package/src/memory/embedding-local.ts +13 -8
  125. package/src/memory/guardian-action-store.ts +125 -2
  126. package/src/memory/ingress-invite-store.ts +95 -1
  127. package/src/memory/migrations/035-guardian-action-supersession.ts +23 -0
  128. package/src/memory/migrations/index.ts +2 -1
  129. package/src/memory/schema.ts +5 -1
  130. package/src/memory/scoped-approval-grants.ts +14 -5
  131. package/src/messaging/providers/slack/client.ts +12 -0
  132. package/src/messaging/providers/slack/types.ts +5 -0
  133. package/src/notifications/decision-engine.ts +49 -12
  134. package/src/notifications/emit-signal.ts +7 -0
  135. package/src/notifications/signal.ts +7 -0
  136. package/src/notifications/thread-seed-composer.ts +2 -1
  137. package/src/runtime/channel-approval-types.ts +16 -6
  138. package/src/runtime/channel-approvals.ts +19 -15
  139. package/src/runtime/channel-invite-transport.ts +85 -0
  140. package/src/runtime/channel-invite-transports/telegram.ts +105 -0
  141. package/src/runtime/guardian-action-grant-minter.ts +92 -35
  142. package/src/runtime/guardian-action-message-composer.ts +30 -0
  143. package/src/runtime/guardian-decision-types.ts +91 -0
  144. package/src/runtime/http-server.ts +23 -1
  145. package/src/runtime/ingress-service.ts +22 -0
  146. package/src/runtime/invite-redemption-service.ts +181 -0
  147. package/src/runtime/invite-redemption-templates.ts +39 -0
  148. package/src/runtime/routes/call-routes.ts +2 -1
  149. package/src/runtime/routes/guardian-action-routes.ts +206 -0
  150. package/src/runtime/routes/guardian-approval-interception.ts +66 -190
  151. package/src/runtime/routes/identity-routes.ts +73 -0
  152. package/src/runtime/routes/inbound-message-handler.ts +486 -394
  153. package/src/runtime/routes/pairing-routes.ts +4 -0
  154. package/src/security/encrypted-store.ts +31 -17
  155. package/src/security/keychain.ts +176 -2
  156. package/src/security/secure-keys.ts +97 -0
  157. package/src/security/tool-approval-digest.ts +1 -1
  158. package/src/tools/browser/browser-execution.ts +2 -2
  159. package/src/tools/browser/browser-manager.ts +46 -32
  160. package/src/tools/browser/browser-screencast.ts +2 -2
  161. package/src/tools/calls/call-start.ts +1 -1
  162. package/src/tools/executor.ts +22 -17
  163. package/src/tools/mcp/mcp-tool-factory.ts +100 -0
  164. package/src/tools/network/script-proxy/session-manager.ts +1 -5
  165. package/src/tools/registry.ts +64 -1
  166. package/src/tools/skills/load.ts +22 -8
  167. package/src/tools/system/avatar-generator.ts +119 -0
  168. package/src/tools/system/navigate-settings.ts +65 -0
  169. package/src/tools/system/open-system-settings.ts +75 -0
  170. package/src/tools/system/voice-config.ts +121 -32
  171. package/src/tools/terminal/backends/native.ts +40 -19
  172. package/src/tools/terminal/backends/types.ts +3 -3
  173. package/src/tools/terminal/parser.ts +1 -1
  174. package/src/tools/terminal/sandbox-diagnostics.ts +6 -87
  175. package/src/tools/terminal/sandbox.ts +1 -12
  176. package/src/tools/terminal/shell.ts +3 -31
  177. package/src/tools/tool-approval-handler.ts +141 -3
  178. package/src/tools/tool-manifest.ts +6 -0
  179. package/src/tools/types.ts +10 -2
  180. package/src/util/diff.ts +36 -13
  181. package/Dockerfile.sandbox +0 -5
  182. package/src/__tests__/doordash-client.test.ts +0 -187
  183. package/src/__tests__/doordash-session.test.ts +0 -154
  184. package/src/__tests__/signup-e2e.test.ts +0 -354
  185. package/src/__tests__/terminal-sandbox-docker.test.ts +0 -1065
  186. package/src/__tests__/terminal-sandbox.integration.test.ts +0 -180
  187. package/src/cli/doordash.ts +0 -1057
  188. package/src/config/bundled-skills/doordash/SKILL.md +0 -163
  189. package/src/config/templates/LOOKS.md +0 -25
  190. package/src/doordash/cart-queries.ts +0 -787
  191. package/src/doordash/client.ts +0 -1016
  192. package/src/doordash/order-queries.ts +0 -85
  193. package/src/doordash/queries.ts +0 -13
  194. package/src/doordash/query-extractor.ts +0 -94
  195. package/src/doordash/search-queries.ts +0 -203
  196. package/src/doordash/session.ts +0 -84
  197. package/src/doordash/store-queries.ts +0 -246
  198. package/src/doordash/types.ts +0 -367
  199. package/src/tools/terminal/backends/docker.ts +0 -379
@@ -0,0 +1,363 @@
1
+ /**
2
+ * Tests that projectSkillTools drops flag-OFF active skills from projected
3
+ * tools, even when conversation history contains old markers for those skills.
4
+ */
5
+ import * as realFs from 'node:fs';
6
+
7
+ import { beforeEach, describe, expect, mock, test } from 'bun:test';
8
+
9
+ import type { SkillSummary, SkillToolManifest } from '../config/skills.js';
10
+ import { RiskLevel } from '../permissions/types.js';
11
+ import type { Message } from '../providers/types.js';
12
+ import type { Tool } from '../tools/types.js';
13
+
14
+ // ---------------------------------------------------------------------------
15
+ // Mock state
16
+ // ---------------------------------------------------------------------------
17
+
18
+ let mockCatalog: SkillSummary[] = [];
19
+ let mockManifests: Record<string, SkillToolManifest | null> = {};
20
+ let mockRegisteredTools: Map<string, Tool[]> = new Map();
21
+ let mockUnregisteredSkillIds: string[] = [];
22
+ let mockSkillRefCount: Map<string, number> = new Map();
23
+
24
+ let currentConfig: Record<string, unknown> = { featureFlags: {} };
25
+ const DECLARED_SKILL_ID = 'hatch-new-assistant';
26
+ const DECLARED_LEGACY_KEY = 'skills.hatch-new-assistant.enabled';
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // Mocks
30
+ // ---------------------------------------------------------------------------
31
+
32
+ mock.module('../config/skills.js', () => ({
33
+ loadSkillCatalog: () => mockCatalog,
34
+ }));
35
+
36
+ mock.module('../config/loader.js', () => ({
37
+ getConfig: () => currentConfig,
38
+ }));
39
+
40
+ mock.module('../skills/active-skill-tools.js', () => {
41
+ const parseMarkers = (messages: Message[]) => {
42
+ const skillLoadUseIds = new Set<string>();
43
+ for (const msg of messages) {
44
+ for (const block of msg.content) {
45
+ if (block.type === 'tool_use' && block.name === 'skill_load') {
46
+ skillLoadUseIds.add(block.id);
47
+ }
48
+ }
49
+ }
50
+ const re = /<loaded_skill\s+id="([^"]+)"(?:\s+version="([^"]+)")?\s*\/>/g;
51
+ const seen = new Set<string>();
52
+ const entries: Array<{ id: string; version?: string }> = [];
53
+ for (const msg of messages) {
54
+ for (const block of msg.content) {
55
+ if (block.type !== 'tool_result') continue;
56
+ if (!skillLoadUseIds.has(block.tool_use_id)) continue;
57
+ const text = block.content;
58
+ if (!text) continue;
59
+ for (const m of text.matchAll(re)) {
60
+ if (!seen.has(m[1])) {
61
+ seen.add(m[1]);
62
+ const entry: { id: string; version?: string } = { id: m[1] };
63
+ if (m[2]) entry.version = m[2];
64
+ entries.push(entry);
65
+ }
66
+ }
67
+ }
68
+ }
69
+ return entries;
70
+ };
71
+
72
+ return {
73
+ deriveActiveSkills: (messages: Message[]) => parseMarkers(messages),
74
+ deriveActiveSkillIds: (messages: Message[]) => parseMarkers(messages).map((e) => e.id),
75
+ };
76
+ });
77
+
78
+ mock.module('../skills/tool-manifest.js', () => ({
79
+ parseToolManifestFile: (filePath: string) => {
80
+ const parts = filePath.split('/');
81
+ const skillId = parts[parts.length - 2];
82
+ const manifest = mockManifests[skillId];
83
+ if (!manifest) throw new Error(`Mock: no manifest for skill "${skillId}"`);
84
+ return manifest;
85
+ },
86
+ }));
87
+
88
+ mock.module('../tools/skills/skill-tool-factory.js', () => ({
89
+ createSkillToolsFromManifest: (
90
+ entries: SkillToolManifest['tools'],
91
+ skillId: string,
92
+ _skillDir: string,
93
+ versionHash: string,
94
+ bundled?: boolean,
95
+ ): Tool[] => {
96
+ return entries.map((entry) => ({
97
+ name: entry.name,
98
+ description: entry.description,
99
+ category: entry.category,
100
+ defaultRiskLevel: RiskLevel.Medium,
101
+ origin: 'skill' as const,
102
+ ownerSkillId: skillId,
103
+ ownerSkillVersionHash: versionHash,
104
+ ownerSkillBundled: bundled ?? undefined,
105
+ getDefinition: () => ({
106
+ name: entry.name,
107
+ description: entry.description,
108
+ input_schema: entry.input_schema as object,
109
+ }),
110
+ execute: async () => ({ content: '', isError: false }),
111
+ }));
112
+ },
113
+ }));
114
+
115
+ mock.module('../tools/registry.js', () => ({
116
+ registerSkillTools: (tools: Tool[]) => {
117
+ const skillIds = new Set<string>();
118
+ for (const tool of tools) {
119
+ const skillId = tool.ownerSkillId!;
120
+ skillIds.add(skillId);
121
+ const existing = mockRegisteredTools.get(skillId) ?? [];
122
+ existing.push(tool);
123
+ mockRegisteredTools.set(skillId, existing);
124
+ }
125
+ for (const id of skillIds) {
126
+ mockSkillRefCount.set(id, (mockSkillRefCount.get(id) ?? 0) + 1);
127
+ }
128
+ return tools;
129
+ },
130
+ unregisterSkillTools: (skillId: string) => {
131
+ mockUnregisteredSkillIds.push(skillId);
132
+ const current = mockSkillRefCount.get(skillId) ?? 0;
133
+ if (current > 1) {
134
+ mockSkillRefCount.set(skillId, current - 1);
135
+ return;
136
+ }
137
+ mockSkillRefCount.delete(skillId);
138
+ mockRegisteredTools.delete(skillId);
139
+ },
140
+ getTool: (name: string): Tool | undefined => {
141
+ let found: Tool | undefined;
142
+ for (const tools of mockRegisteredTools.values()) {
143
+ for (const tool of tools) {
144
+ if (tool.name === name) found = tool;
145
+ }
146
+ }
147
+ return found;
148
+ },
149
+ getSkillToolNames: () => {
150
+ const names: string[] = [];
151
+ for (const tools of mockRegisteredTools.values()) {
152
+ for (const tool of tools) {
153
+ names.push(tool.name);
154
+ }
155
+ }
156
+ return names;
157
+ },
158
+ }));
159
+
160
+ mock.module('node:fs', () => ({
161
+ ...realFs,
162
+ existsSync: (p: string) => {
163
+ if (typeof p === 'string' && p.endsWith('TOOLS.json')) {
164
+ const parts = p.split('/');
165
+ const skillId = parts[parts.length - 2];
166
+ return skillId in mockManifests;
167
+ }
168
+ return realFs.existsSync(p);
169
+ },
170
+ }));
171
+
172
+ mock.module('../skills/version-hash.js', () => ({
173
+ computeSkillVersionHash: (skillDir: string) => {
174
+ const parts = skillDir.split('/');
175
+ const skillId = parts[parts.length - 1];
176
+ return `v1:default-hash-${skillId}`;
177
+ },
178
+ }));
179
+
180
+ mock.module('../util/logger.js', () => ({
181
+ getLogger: () => ({
182
+ info: () => {},
183
+ warn: () => {},
184
+ debug: () => {},
185
+ error: () => {},
186
+ }),
187
+ }));
188
+
189
+ // ---------------------------------------------------------------------------
190
+ // Import module under test (after mocks)
191
+ // ---------------------------------------------------------------------------
192
+
193
+ const { projectSkillTools, resetSkillToolProjection } = await import(
194
+ '../daemon/session-skill-tools.js'
195
+ );
196
+
197
+ // ---------------------------------------------------------------------------
198
+ // Helpers
199
+ // ---------------------------------------------------------------------------
200
+
201
+ function makeSkill(id: string): SkillSummary {
202
+ return {
203
+ id,
204
+ name: id,
205
+ description: `Skill ${id}`,
206
+ directoryPath: `/skills/${id}`,
207
+ skillFilePath: `/skills/${id}/SKILL.md`,
208
+ userInvocable: true,
209
+ disableModelInvocation: false,
210
+ source: 'managed',
211
+ };
212
+ }
213
+
214
+ function makeManifest(toolNames: string[]): SkillToolManifest {
215
+ return {
216
+ version: 1,
217
+ tools: toolNames.map((name) => ({
218
+ name,
219
+ description: `Tool ${name}`,
220
+ category: 'test',
221
+ risk: 'medium' as const,
222
+ input_schema: { type: 'object', properties: {} },
223
+ executor: 'run.ts',
224
+ execution_target: 'host' as const,
225
+ })),
226
+ };
227
+ }
228
+
229
+ /** Build conversation history with a loaded_skill marker. */
230
+ function buildHistoryWithMarker(skillId: string): Message[] {
231
+ return [
232
+ {
233
+ role: 'assistant',
234
+ content: [{ type: 'tool_use', id: 'tu-1', name: 'skill_load', input: { skill: skillId } }],
235
+ },
236
+ {
237
+ role: 'user',
238
+ content: [{
239
+ type: 'tool_result',
240
+ tool_use_id: 'tu-1',
241
+ content: `Loaded.\n\n<loaded_skill id="${skillId}" version="v1:default-hash-${skillId}" />`,
242
+ }],
243
+ },
244
+ ];
245
+ }
246
+
247
+ // ---------------------------------------------------------------------------
248
+ // Tests
249
+ // ---------------------------------------------------------------------------
250
+
251
+ describe('projectSkillTools feature flag enforcement', () => {
252
+ beforeEach(() => {
253
+ mockCatalog = [];
254
+ mockManifests = {};
255
+ mockRegisteredTools = new Map();
256
+ mockUnregisteredSkillIds = [];
257
+ mockSkillRefCount = new Map();
258
+ currentConfig = { featureFlags: {} };
259
+ resetSkillToolProjection();
260
+ });
261
+
262
+ test('no skill tools projected for flag OFF skill even with old markers', () => {
263
+ mockCatalog = [makeSkill(DECLARED_SKILL_ID)];
264
+ mockManifests = { [DECLARED_SKILL_ID]: makeManifest(['browser_navigate', 'browser_click']) };
265
+
266
+ // History contains a marker from before the flag was turned off
267
+ const history = buildHistoryWithMarker(DECLARED_SKILL_ID);
268
+ const prevActive = new Map<string, string>();
269
+
270
+ // Feature flag is OFF
271
+ currentConfig = { featureFlags: { [DECLARED_LEGACY_KEY]: false } };
272
+
273
+ const result = projectSkillTools(history, { previouslyActiveSkillIds: prevActive });
274
+
275
+ // No tools should be projected
276
+ expect(result.toolDefinitions).toHaveLength(0);
277
+ expect(result.allowedToolNames.size).toBe(0);
278
+ });
279
+
280
+ test('skill tools projected normally when flag is ON', () => {
281
+ mockCatalog = [makeSkill(DECLARED_SKILL_ID)];
282
+ mockManifests = { [DECLARED_SKILL_ID]: makeManifest(['browser_navigate', 'browser_click']) };
283
+
284
+ const history = buildHistoryWithMarker(DECLARED_SKILL_ID);
285
+ const prevActive = new Map<string, string>();
286
+
287
+ // Feature flag is ON
288
+ currentConfig = { featureFlags: { [DECLARED_LEGACY_KEY]: true } };
289
+
290
+ const result = projectSkillTools(history, { previouslyActiveSkillIds: prevActive });
291
+
292
+ expect(result.toolDefinitions).toHaveLength(2);
293
+ expect(result.allowedToolNames.has('browser_navigate')).toBe(true);
294
+ expect(result.allowedToolNames.has('browser_click')).toBe(true);
295
+ });
296
+
297
+ test('skill tools projected normally when flag key is absent (defaults to enabled)', () => {
298
+ mockCatalog = [makeSkill(DECLARED_SKILL_ID)];
299
+ mockManifests = { [DECLARED_SKILL_ID]: makeManifest(['browser_navigate']) };
300
+
301
+ const history = buildHistoryWithMarker(DECLARED_SKILL_ID);
302
+ const prevActive = new Map<string, string>();
303
+
304
+ // featureFlags is empty — should default to enabled
305
+ currentConfig = { featureFlags: {} };
306
+
307
+ const result = projectSkillTools(history, { previouslyActiveSkillIds: prevActive });
308
+
309
+ expect(result.toolDefinitions).toHaveLength(1);
310
+ expect(result.allowedToolNames.has('browser_navigate')).toBe(true);
311
+ });
312
+
313
+ test('mixed flag-on and flag-off skills — only flag-on tools projected', () => {
314
+ mockCatalog = [makeSkill(DECLARED_SKILL_ID), makeSkill('twitter')];
315
+ mockManifests = {
316
+ [DECLARED_SKILL_ID]: makeManifest(['browser_navigate']),
317
+ twitter: makeManifest(['twitter_post']),
318
+ };
319
+
320
+ const history: Message[] = [
321
+ {
322
+ role: 'assistant',
323
+ content: [
324
+ { type: 'tool_use', id: 'tu-1', name: 'skill_load', input: { skill: DECLARED_SKILL_ID } },
325
+ ],
326
+ },
327
+ {
328
+ role: 'user',
329
+ content: [{
330
+ type: 'tool_result',
331
+ tool_use_id: 'tu-1',
332
+ content: `<loaded_skill id="${DECLARED_SKILL_ID}" version="v1:default-hash-${DECLARED_SKILL_ID}" />`,
333
+ }],
334
+ },
335
+ {
336
+ role: 'assistant',
337
+ content: [
338
+ { type: 'tool_use', id: 'tu-2', name: 'skill_load', input: { skill: 'twitter' } },
339
+ ],
340
+ },
341
+ {
342
+ role: 'user',
343
+ content: [{
344
+ type: 'tool_result',
345
+ tool_use_id: 'tu-2',
346
+ content: '<loaded_skill id="twitter" version="v1:default-hash-twitter" />',
347
+ }],
348
+ },
349
+ ];
350
+ const prevActive = new Map<string, string>();
351
+
352
+ // Declared skill is OFF, twitter is undeclared with no persisted override so remains ON.
353
+ currentConfig = {
354
+ featureFlags: { [DECLARED_LEGACY_KEY]: false },
355
+ };
356
+
357
+ const result = projectSkillTools(history, { previouslyActiveSkillIds: prevActive });
358
+
359
+ const toolNames = result.toolDefinitions.map((t) => t.name);
360
+ expect(toolNames).toContain('twitter_post');
361
+ expect(toolNames).not.toContain('browser_navigate');
362
+ });
363
+ });
@@ -49,7 +49,7 @@ mock.module('../util/logger.js', () => ({
49
49
 
50
50
  mock.module('../config/loader.js', () => ({
51
51
  getConfig: () => ({
52
- sandbox: { enabled: true, backend: 'docker' },
52
+ sandbox: { enabled: true },
53
53
  }),
54
54
  }));
55
55
 
@@ -45,14 +45,12 @@ mock.module('node:fs', () => ({
45
45
  const { wrapCommand } = await import('../tools/terminal/sandbox.js');
46
46
  const { ToolError } = await import('../util/errors.js');
47
47
 
48
- const defaultDocker = { image: 'vellum-sandbox:latest', shell: 'bash', cpus: 1, memoryMb: 512, pidsLimit: 256, network: 'none' as const };
49
-
50
48
  function disabledConfig(): SandboxConfig {
51
- return { enabled: false, backend: 'native', docker: defaultDocker };
49
+ return { enabled: false };
52
50
  }
53
51
 
54
52
  function nativeConfig(): SandboxConfig {
55
- return { enabled: true, backend: 'native', docker: defaultDocker };
53
+ return { enabled: true };
56
54
  }
57
55
 
58
56
  describe('terminal sandbox — disabled behavior', () => {
@@ -101,14 +99,20 @@ describe('terminal sandbox — enabled fail-closed behavior', () => {
101
99
  });
102
100
 
103
101
  test('returns bwrap wrapper when bwrap is available on linux', () => {
102
+ // GIVEN bwrap is available on a linux platform
104
103
  execSyncMock.mockImplementation(() => undefined);
104
+
105
+ // WHEN wrapping a command with the native sandbox config
105
106
  const result = wrapCommand('echo hello', '/home/user/project', nativeConfig());
107
+
108
+ // THEN the result uses bwrap with network isolation
106
109
  expect(result.command).toBe('bwrap');
107
110
  expect(result.sandboxed).toBe(true);
108
111
  expect(result.args).toContain('--ro-bind');
109
112
  expect(result.args).toContain('--unshare-net');
110
113
  expect(result.args).toContain('--unshare-pid');
111
- // The user command runs via bash inside the sandbox
114
+
115
+ // AND the user command runs via bash inside the sandbox
112
116
  const bashIdx = result.args.indexOf('bash');
113
117
  expect(bashIdx).toBeGreaterThan(0);
114
118
  expect(result.args.slice(bashIdx)).toEqual(['bash', '-c', '--', 'echo hello']);
@@ -155,13 +159,21 @@ describe('terminal sandbox — macOS sandbox-exec behavior', () => {
155
159
  });
156
160
 
157
161
  test('returns sandbox-exec wrapper on macOS when enabled', () => {
162
+ // GIVEN the platform is macOS
163
+ // (set in beforeEach)
164
+
165
+ // WHEN wrapping a command with the native sandbox config
158
166
  const result = wrapCommand('echo hello', '/tmp/project', nativeConfig());
167
+
168
+ // THEN the result uses sandbox-exec
159
169
  expect(result.command).toBe('sandbox-exec');
160
170
  expect(result.sandboxed).toBe(true);
161
171
  expect(result.args[0]).toBe('-f');
162
- // Profile path is the second arg
172
+
173
+ // AND the profile path is the second arg
163
174
  expect(result.args[1]).toContain('sandbox-profile-');
164
- // bash -c -- command follows the profile
175
+
176
+ // AND bash -c -- command follows the profile
165
177
  expect(result.args.slice(2)).toEqual(['bash', '-c', '--', 'echo hello']);
166
178
  });
167
179
 
@@ -195,10 +207,131 @@ describe('terminal sandbox — backend selection', () => {
195
207
  expect(result.sandboxed).toBe(true);
196
208
  });
197
209
 
198
- test('disabled config ignores backend setting', () => {
199
- const config: SandboxConfig = { enabled: false, backend: 'docker', docker: defaultDocker };
210
+ test('disabled config returns unsandboxed wrapper', () => {
211
+ const config: SandboxConfig = { enabled: false };
200
212
  const result = wrapCommand('echo hello', '/tmp/project', config);
201
213
  expect(result.command).toBe('bash');
202
214
  expect(result.sandboxed).toBe(false);
203
215
  });
204
216
  });
217
+
218
+ describe('terminal sandbox — proxied network mode on Linux', () => {
219
+ beforeEach(() => {
220
+ platform = 'linux';
221
+ execSyncMock.mockImplementation(() => undefined);
222
+ });
223
+
224
+ test('omits --unshare-net when networkMode is proxied', () => {
225
+ /**
226
+ * Tests that bwrap args omit --unshare-net in proxied mode so the process
227
+ * can reach the local credential proxy on 127.0.0.1.
228
+ */
229
+
230
+ // GIVEN bwrap is available on linux
231
+ // (set in beforeEach)
232
+
233
+ // WHEN wrapping a command with proxied network mode
234
+ const result = wrapCommand('curl https://example.com', '/home/user/project', nativeConfig(), { networkMode: 'proxied' });
235
+
236
+ // THEN the result uses bwrap
237
+ expect(result.command).toBe('bwrap');
238
+ expect(result.sandboxed).toBe(true);
239
+
240
+ // AND --unshare-net is NOT present (network is allowed)
241
+ expect(result.args).not.toContain('--unshare-net');
242
+
243
+ // AND --unshare-pid is still present (PID isolation remains)
244
+ expect(result.args).toContain('--unshare-pid');
245
+ });
246
+
247
+ test('includes --unshare-net when networkMode is off', () => {
248
+ /**
249
+ * Tests that bwrap args include --unshare-net when network is off (default).
250
+ */
251
+
252
+ // GIVEN bwrap is available on linux
253
+ // (set in beforeEach)
254
+
255
+ // WHEN wrapping a command with network mode off
256
+ const result = wrapCommand('echo hello', '/home/user/project', nativeConfig(), { networkMode: 'off' });
257
+
258
+ // THEN --unshare-net is present (network is blocked)
259
+ expect(result.args).toContain('--unshare-net');
260
+ });
261
+
262
+ test('includes --unshare-net when no options are provided', () => {
263
+ /**
264
+ * Tests that the default behavior (no options) blocks network access.
265
+ */
266
+
267
+ // GIVEN bwrap is available on linux
268
+ // (set in beforeEach)
269
+
270
+ // WHEN wrapping a command without any options
271
+ const result = wrapCommand('echo hello', '/home/user/project', nativeConfig());
272
+
273
+ // THEN --unshare-net is present (network is blocked by default)
274
+ expect(result.args).toContain('--unshare-net');
275
+ });
276
+ });
277
+
278
+ describe('terminal sandbox — proxied network mode on macOS', () => {
279
+ beforeEach(() => {
280
+ platform = 'darwin';
281
+ writeFileSyncMock.mockClear();
282
+ existsSyncMock.mockImplementation(() => true);
283
+ });
284
+
285
+ test('writes SBPL profile with allow network when networkMode is proxied', () => {
286
+ /**
287
+ * Tests that the macOS sandbox profile allows network access in proxied mode
288
+ * so the process can reach the local credential proxy.
289
+ */
290
+
291
+ // GIVEN the platform is macOS
292
+ // (set in beforeEach)
293
+
294
+ // WHEN wrapping a command with proxied network mode
295
+ wrapCommand('curl https://example.com', '/tmp/project', nativeConfig(), { networkMode: 'proxied' });
296
+
297
+ // THEN the written profile contains (allow network*) instead of (deny network*)
298
+ const profileContent = writeFileSyncMock.mock.calls[0]?.[1] as string;
299
+ expect(profileContent).toContain('(allow network*)');
300
+ expect(profileContent).not.toContain('(deny network*)');
301
+ });
302
+
303
+ test('writes SBPL profile with deny network when networkMode is off', () => {
304
+ /**
305
+ * Tests that the macOS sandbox profile blocks network access when network
306
+ * mode is off (the default behavior).
307
+ */
308
+
309
+ // GIVEN the platform is macOS
310
+ // (set in beforeEach)
311
+
312
+ // WHEN wrapping a command with network mode off
313
+ wrapCommand('echo hello', '/tmp/project', nativeConfig(), { networkMode: 'off' });
314
+
315
+ // THEN the written profile contains (deny network*)
316
+ const profileContent = writeFileSyncMock.mock.calls[0]?.[1] as string;
317
+ expect(profileContent).toContain('(deny network*)');
318
+ expect(profileContent).not.toContain('(allow network*)');
319
+ });
320
+
321
+ test('writes SBPL profile with deny network when no options are provided', () => {
322
+ /**
323
+ * Tests that the default behavior (no options) blocks network access on macOS.
324
+ */
325
+
326
+ // GIVEN the platform is macOS
327
+ // (set in beforeEach)
328
+
329
+ // WHEN wrapping a command without any options
330
+ wrapCommand('echo hello', '/tmp/project', nativeConfig());
331
+
332
+ // THEN the written profile contains (deny network*)
333
+ const profileContent = writeFileSyncMock.mock.calls[0]?.[1] as string;
334
+ expect(profileContent).toContain('(deny network*)');
335
+ expect(profileContent).not.toContain('(allow network*)');
336
+ });
337
+ });
@@ -1,4 +1,4 @@
1
- import { mkdirSync, mkdtempSync, rmSync, symlinkSync } from 'node:fs';
1
+ import { mkdtempSync } from 'node:fs';
2
2
  import { tmpdir } from 'node:os';
3
3
  import { join } from 'node:path';
4
4
 
@@ -467,15 +467,6 @@ describe('buildSanitizedEnv', () => {
467
467
  describe('wrapCommand', () => {
468
468
  const disabledConfig: SandboxConfig = {
469
469
  enabled: false,
470
- backend: 'native',
471
- docker: {
472
- image: 'vellum-sandbox:latest',
473
- shell: 'bash',
474
- cpus: 1,
475
- memoryMb: 512,
476
- pidsLimit: 256,
477
- network: 'none',
478
- },
479
470
  };
480
471
 
481
472
  test('disabled sandbox returns plain bash invocation', () => {
@@ -546,89 +537,7 @@ describe('Native sandbox backend', () => {
546
537
  });
547
538
 
548
539
  // ═══════════════════════════════════════════════════════════════════════════
549
- // 5. Docker sandbox backend
550
- // ═══════════════════════════════════════════════════════════════════════════
551
-
552
- describe('Docker sandbox backend', () => {
553
- let DockerBackend: new (sandboxRoot: string, config?: Record<string, unknown>, uid?: number, gid?: number) => SandboxBackend;
554
- let _resetDockerChecks: () => void;
555
-
556
- const sandboxDir = join(testTmpDir, 'docker-sandbox');
557
-
558
- beforeEach(async () => {
559
- mkdirSync(sandboxDir, { recursive: true });
560
- const mod = await import('../tools/terminal/backends/docker.js');
561
- DockerBackend = mod.DockerBackend;
562
- _resetDockerChecks = mod._resetDockerChecks;
563
- _resetDockerChecks();
564
- });
565
-
566
- afterEach(() => {
567
- try { rmSync(sandboxDir, { recursive: true, force: true }); } catch {}
568
- });
569
-
570
- test('constructor resolves symlinks in sandbox root', () => {
571
- const realDir = join(testTmpDir, 'docker-real');
572
- const linkDir = join(testTmpDir, 'docker-link');
573
- mkdirSync(realDir, { recursive: true });
574
- try {
575
- symlinkSync(realDir, linkDir);
576
- // Construct backend with the symlink — it should resolve to the real path.
577
- const backend = new DockerBackend(linkDir, undefined, 1000, 1000);
578
- // We can't inspect private fields directly, but wrapping will fail at
579
- // preflight checks (Docker not available) — this tests that constructor
580
- // does not throw on a valid symlinked path.
581
- expect(backend).toBeDefined();
582
- } finally {
583
- try { rmSync(linkDir); } catch {}
584
- try { rmSync(realDir, { recursive: true, force: true }); } catch {}
585
- }
586
- });
587
-
588
- test('constructor rejects sandbox root with null bytes', () => {
589
- // realpathSync throws TypeError before validatePathSafety can run
590
- expect(() => new DockerBackend('/tmp/foo\0bar', undefined, 1000, 1000)).toThrow();
591
- });
592
-
593
- test('constructor rejects sandbox root with newlines', () => {
594
- // Create a real directory with a newline in its name so realpathSync
595
- // succeeds and the rejection comes from validatePathSafety, not ENOENT.
596
- const nlDir = join(testTmpDir, 'has\nnewline');
597
- mkdirSync(nlDir, { recursive: true });
598
- try {
599
- expect(() => new DockerBackend(nlDir, undefined, 1000, 1000)).toThrow(ToolError);
600
- } finally {
601
- try { rmSync(nlDir, { recursive: true, force: true }); } catch {}
602
- }
603
- });
604
-
605
- test('constructor rejects sandbox root with carriage returns', () => {
606
- // Create a real directory with a carriage return in its name so
607
- // realpathSync succeeds and validatePathSafety is what rejects it.
608
- const crDir = join(testTmpDir, 'has\rreturn');
609
- mkdirSync(crDir, { recursive: true });
610
- try {
611
- expect(() => new DockerBackend(crDir, undefined, 1000, 1000)).toThrow(ToolError);
612
- } finally {
613
- try { rmSync(crDir, { recursive: true, force: true }); } catch {}
614
- }
615
- });
616
-
617
- test('validates path safety after resolving symlinks', () => {
618
- // Create a directory with a comma in the name to test validatePathSafety.
619
- // On most filesystems this is allowed, so validatePathSafety should catch it.
620
- const commaDir = join(testTmpDir, 'has,comma');
621
- mkdirSync(commaDir, { recursive: true });
622
- try {
623
- expect(() => new DockerBackend(commaDir, undefined, 1000, 1000)).toThrow(ToolError);
624
- } finally {
625
- try { rmSync(commaDir, { recursive: true, force: true }); } catch {}
626
- }
627
- });
628
- });
629
-
630
- // ═══════════════════════════════════════════════════════════════════════════
631
- // 6. Shell tool — input validation
540
+ // 5. Shell tool — input validation
632
541
  // ═══════════════════════════════════════════════════════════════════════════
633
542
 
634
543
  describe('Shell tool input validation', () => {