@semalt-ai/code 1.8.5 → 1.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/.claude/settings.local.json +6 -1
  2. package/.github/workflows/ci.yml +69 -0
  3. package/CLAUDE.md +1584 -26
  4. package/README.md +147 -3
  5. package/examples/embed.js +74 -0
  6. package/index.js +251 -10
  7. package/lib/agent.js +711 -104
  8. package/lib/api.js +213 -49
  9. package/lib/args.js +74 -2
  10. package/lib/audit.js +23 -1
  11. package/lib/background.js +584 -0
  12. package/lib/checkpoints.js +757 -0
  13. package/lib/commands/auth.js +94 -0
  14. package/lib/commands/chat-session.js +306 -0
  15. package/lib/commands/chat-slash.js +399 -0
  16. package/lib/commands/chat-turn.js +446 -0
  17. package/lib/commands/chat.js +403 -0
  18. package/lib/commands/custom.js +157 -0
  19. package/lib/commands/history-utils.js +66 -0
  20. package/lib/commands/index.js +268 -0
  21. package/lib/commands/mcp.js +113 -0
  22. package/lib/commands/oneshot.js +193 -0
  23. package/lib/commands/registry.js +269 -0
  24. package/lib/commands/tasks.js +89 -0
  25. package/lib/compact.js +87 -0
  26. package/lib/config.js +333 -11
  27. package/lib/constants.js +372 -3
  28. package/lib/deny.js +199 -0
  29. package/lib/doctor.js +160 -0
  30. package/lib/headless.js +167 -0
  31. package/lib/hooks.js +286 -0
  32. package/lib/images.js +264 -0
  33. package/lib/internals.js +49 -0
  34. package/lib/mcp/boundary.js +131 -0
  35. package/lib/mcp/client.js +270 -0
  36. package/lib/mcp/oauth.js +134 -0
  37. package/lib/memory.js +209 -0
  38. package/lib/metrics.js +37 -2
  39. package/lib/payload.js +54 -0
  40. package/lib/permission-rules.js +401 -0
  41. package/lib/permissions.js +100 -10
  42. package/lib/pricing.js +67 -0
  43. package/lib/proc.js +62 -0
  44. package/lib/prompts.js +84 -5
  45. package/lib/sandbox.js +568 -0
  46. package/lib/sdk.js +328 -0
  47. package/lib/secrets.js +211 -0
  48. package/lib/skills.js +223 -0
  49. package/lib/subagents.js +516 -0
  50. package/lib/tool_registry.js +2558 -0
  51. package/lib/tool_specs.js +222 -2
  52. package/lib/tools.js +272 -1020
  53. package/lib/ui/format.js +22 -1
  54. package/lib/ui/input-field.js +16 -7
  55. package/lib/ui/status-bar.js +79 -11
  56. package/lib/ui/theme.js +1 -0
  57. package/lib/ui/web-activity.js +218 -0
  58. package/lib/verify.js +229 -0
  59. package/lib/web-extract.js +213 -0
  60. package/lib/web-summarize.js +68 -0
  61. package/package.json +19 -4
  62. package/scripts/lint.js +57 -0
  63. package/test/agent-loop.test.js +389 -0
  64. package/test/background.test.js +414 -0
  65. package/test/chat.test.js +114 -0
  66. package/test/checkpoints-agent.test.js +181 -0
  67. package/test/checkpoints.test.js +650 -0
  68. package/test/command-registry.test.js +160 -0
  69. package/test/compact.test.js +116 -0
  70. package/test/completion-lazy.test.js +52 -0
  71. package/test/config-merge.test.js +324 -0
  72. package/test/config-quarantine.test.js +128 -0
  73. package/test/config-write-guard-allow-anywhere.test.js +56 -0
  74. package/test/config-write-guard-skip.test.js +46 -0
  75. package/test/config-write-guard.test.js +153 -0
  76. package/test/context-split.test.js +215 -0
  77. package/test/cost-doctor.test.js +142 -0
  78. package/test/custom-commands-chat.test.js +106 -0
  79. package/test/custom-commands.test.js +230 -0
  80. package/test/deny-windows.test.js +120 -0
  81. package/test/deny.test.js +83 -0
  82. package/test/download-allow-anywhere.test.js +66 -0
  83. package/test/download-confine.test.js +153 -0
  84. package/test/executors.test.js +362 -0
  85. package/test/extract-tool-calls.test.js +315 -0
  86. package/test/fetch-url-validation.test.js +219 -0
  87. package/test/fixtures/tool-calls.js +57 -0
  88. package/test/fixtures/web-page.js +91 -0
  89. package/test/git-tools.test.js +384 -0
  90. package/test/grep-glob-serialize.test.js +242 -0
  91. package/test/grep-glob.test.js +268 -0
  92. package/test/harness/README.md +57 -0
  93. package/test/harness/chat-harness.js +142 -0
  94. package/test/harness/memwarn-headless-child.js +65 -0
  95. package/test/harness/mock-llm.js +120 -0
  96. package/test/harness/mock-mcp-server.js +142 -0
  97. package/test/harness/sse-server.js +69 -0
  98. package/test/headless.test.js +203 -0
  99. package/test/history-utils.test.js +88 -0
  100. package/test/hooks-agent.test.js +238 -0
  101. package/test/hooks-verify-sandbox.test.js +232 -0
  102. package/test/hooks.test.js +216 -0
  103. package/test/http-get-user-agent.test.js +142 -0
  104. package/test/images-api.test.js +208 -0
  105. package/test/images.test.js +238 -0
  106. package/test/max-iterations.test.js +216 -0
  107. package/test/mcp-boundary.test.js +57 -0
  108. package/test/mcp-client.test.js +267 -0
  109. package/test/mcp-oauth.test.js +86 -0
  110. package/test/memory-truncation-warning.test.js +222 -0
  111. package/test/memory.test.js +198 -0
  112. package/test/native-dispatch.test.js +356 -0
  113. package/test/output-chokepoint.test.js +188 -0
  114. package/test/path-guards.test.js +134 -0
  115. package/test/payload.test.js +99 -0
  116. package/test/permission-rules-agent.test.js +210 -0
  117. package/test/permission-rules.test.js +297 -0
  118. package/test/permissions.test.js +163 -0
  119. package/test/plan-mode.test.js +167 -0
  120. package/test/read-paginate.test.js +275 -0
  121. package/test/readonly-tools.test.js +177 -0
  122. package/test/result-cap.test.js +233 -0
  123. package/test/sandbox-agent.test.js +147 -0
  124. package/test/sandbox-integration.test.js +216 -0
  125. package/test/sandbox.test.js +408 -0
  126. package/test/sdk.test.js +234 -0
  127. package/test/shell-output-cap.test.js +181 -0
  128. package/test/skills-chat.test.js +110 -0
  129. package/test/skills.test.js +295 -0
  130. package/test/smoke.test.js +68 -0
  131. package/test/status-bar-pause.test.js +164 -0
  132. package/test/stream-parser.test.js +147 -0
  133. package/test/subagents-agent.test.js +178 -0
  134. package/test/subagents.test.js +222 -0
  135. package/test/tool-registry.test.js +85 -0
  136. package/test/trim-budget.test.js +101 -0
  137. package/test/verify-agent.test.js +317 -0
  138. package/test/verify.test.js +141 -0
  139. package/test/web-activity-ordering.test.js +194 -0
  140. package/test/web-activity.test.js +207 -0
  141. package/test/web-data-extraction-guidance.test.js +71 -0
  142. package/test/web-extract.test.js +185 -0
  143. package/test/web-fetch-agent.test.js +291 -0
  144. package/test/web-fetch-mode.test.js +193 -0
  145. package/test/web-search.test.js +380 -0
  146. package/lib/commands.js +0 -1438
@@ -0,0 +1,233 @@
1
+ 'use strict';
2
+
3
+ // Task W.8 — Cap MCP & subagent output entering context.
4
+ //
5
+ // THE CHANGE these tests pin: MCP tool results (lib/mcp/client.js) and subagent
6
+ // final text (lib/subagents.js) were the last two UNBOUNDED paths into context —
7
+ // both fenced as untrusted, but neither token-capped. A server (MCP) or a verbose
8
+ // child (subagent) could blow context wholesale. Both serializers now apply the
9
+ // standard capToTokens (consistent with W.5–W.7) BEFORE wrapping the text in the
10
+ // untrusted fence, with DIFFERENT budgets:
11
+ // * MCP — STRICTER (third-party, untrusted, server-controlled): the riskiest.
12
+ // * Subagent — GENEROUS (our own child's synthesized result): a safety net.
13
+ // Tests assert the MODEL-FACING (and parent-facing) result: the bound, the
14
+ // truncation notice, the fence-still-present, and that the two budgets differ.
15
+
16
+ const { test, before, after, afterEach } = require('node:test');
17
+ const assert = require('node:assert');
18
+
19
+ const ui = require('../lib/ui');
20
+ const { createApiClient } = require('../lib/api');
21
+ const { createToolExecutor, extractToolCalls } = require('../lib/tools');
22
+ const { createPermissionManager } = require('../lib/permissions');
23
+ const {
24
+ createAgentRunner, formatMcpResult, formatSubagentResult,
25
+ } = require('../lib/agent');
26
+ const toolRegistry = require('../lib/tool_registry');
27
+ const { createSubagentManager, buildSpawnAgentEntry } = require('../lib/subagents');
28
+ const {
29
+ DEFAULT_MCP_MAX_RESULT_TOKENS, DEFAULT_SUBAGENT_MAX_RESULT_TOKENS,
30
+ } = require('../lib/constants');
31
+ const { startMockLLM } = require('./harness/mock-llm');
32
+
33
// Patterns for the opening / closing delimiters of the untrusted-content fence
// that the assertions below look for in every serialized result.
const FENCE_OPEN = /<<<UNTRUSTED_EXTERNAL_CONTENT/;
const FENCE_CLOSE = /<<<END_UNTRUSTED_EXTERNAL_CONTENT>>>/;

// ---------------------------------------------------------------------------
// Part A — pure model-facing serializers (formatMcpResult / formatSubagentResult)
// ---------------------------------------------------------------------------

test('MCP: small result passes through fully, no notice, still fenced', () => {
  const payload = 'just a small payload from the server';
  const rendered = formatMcpResult({ action: 'mcp__srv__tool', content: payload, maxTokens: 10000 });

  assert.match(rendered, /MCP tool mcp__srv__tool result:/);
  assert.match(rendered, FENCE_OPEN);
  assert.match(rendered, FENCE_CLOSE);
  assert.ok(rendered.includes(payload), 'full payload present');
  assert.doesNotMatch(rendered, /capped at/);
});

test('MCP: large result is capped with a notice, INSIDE the untrusted fence', () => {
  const payload = 'x'.repeat(4000); // ~1000 tokens
  const rendered = formatMcpResult({ action: 'mcp__srv__tool', content: payload, maxTokens: 50 });
  assert.match(rendered, /capped at ~50 tokens \(was ~\d+\)/, 'truncation notice present');

  // The capped content (and its notice) must remain BETWEEN the fence delimiters.
  const fenceStart = rendered.indexOf('<<<UNTRUSTED_EXTERNAL_CONTENT');
  const fenceEnd = rendered.indexOf('<<<END_UNTRUSTED_EXTERNAL_CONTENT>>>');
  const noticePos = rendered.indexOf('capped at');
  assert.ok(fenceStart >= 0 && fenceEnd > fenceStart, 'fence present and well-ordered');
  assert.ok(noticePos > fenceStart && noticePos < fenceEnd, 'notice sits inside the fence');

  // The full payload did NOT enter context.
  assert.ok(rendered.length < payload.length, 'result is shorter than the raw payload');
});

test('MCP: isError surfaces the error note, still fenced', () => {
  const rendered = formatMcpResult({
    action: 'mcp__srv__t',
    content: 'boom',
    isError: true,
    maxTokens: 10000,
  });
  assert.match(rendered, /\(the tool reported an error\)/);
  assert.match(rendered, FENCE_OPEN);
});

test('subagent: short result passes through fully, no notice, fenced', () => {
  const findings = 'CHILD FINDINGS: the project is a CLI';
  const rendered = formatSubagentResult({ count: 1, content: findings, maxTokens: 20000 });

  assert.match(rendered, /Result from 1 subagent/);
  assert.match(rendered, FENCE_OPEN);
  assert.match(rendered, FENCE_CLOSE);
  assert.ok(rendered.includes(findings));
  assert.doesNotMatch(rendered, /capped at/);
});

test('subagent: long result is capped with a notice', () => {
  const findings = 'y'.repeat(4000);
  const rendered = formatSubagentResult({ count: 1, content: findings, maxTokens: 50 });

  assert.match(rendered, /capped at ~50 tokens \(was ~\d+\)/);
  assert.match(rendered, FENCE_OPEN);
  assert.match(rendered, FENCE_CLOSE);
  assert.ok(rendered.length < findings.length);
});

test('subagent: plural label for multiple subagents', () => {
  const rendered = formatSubagentResult({ count: 3, content: 'a', maxTokens: 20000 });
  assert.match(rendered, /Result from 3 subagents/);
});
93
+
94
+ // ---------------------------------------------------------------------------
95
+ // Part B — the two budgets are DISTINCT and MCP is STRICTER
96
+ // ---------------------------------------------------------------------------
97
+
98
test('default budgets: MCP is strictly stricter than subagent', () => {
  const mcpIsStricter = DEFAULT_MCP_MAX_RESULT_TOKENS < DEFAULT_SUBAGENT_MAX_RESULT_TOKENS;
  assert.ok(mcpIsStricter, 'MCP budget must be stricter than the subagent budget');
});

test('budgets differ: content between the two budgets is capped under MCP but passes under subagent', () => {
  // Pick a payload whose estimated size (≈ chars/4) is the midpoint of the two
  // default budgets: ABOVE the MCP default, BELOW the subagent default.
  const budgetSum = DEFAULT_MCP_MAX_RESULT_TOKENS + DEFAULT_SUBAGENT_MAX_RESULT_TOKENS;
  const midTokens = Math.floor(budgetSum / 2);
  const payload = 'z'.repeat(midTokens * 4);

  // maxTokens is deliberately omitted so each serializer uses ITS OWN default budget.
  const mcpRendered = formatMcpResult({ action: 'mcp__s__t', content: payload });
  const subRendered = formatSubagentResult({ count: 1, content: payload });

  assert.match(mcpRendered, /capped at/, 'MCP caps a payload above its stricter budget');
  assert.doesNotMatch(subRendered, /capped at/, 'subagent passes the same payload under its generous budget');
});
116
+
117
+ // ---------------------------------------------------------------------------
118
+ // Part C — through the REAL agent loop (the wiring reads config; fence intact)
119
+ // ---------------------------------------------------------------------------
120
+
121
// Value of SEMALT_API_KEY before the suite ran (undefined when it was unset).
let prevKey;

// Pin a dummy API key for the whole suite, remembering the caller's value.
before(() => {
  prevKey = process.env.SEMALT_API_KEY;
  process.env.SEMALT_API_KEY = 'test-key';
});

// Put the environment back exactly as it was: restore the old value, or remove
// the variable again if it was not set before.
after(() => {
  if (prevKey !== undefined) {
    process.env.SEMALT_API_KEY = prevKey;
    return;
  }
  delete process.env.SEMALT_API_KEY;
});

// Dynamic tools registered by one test must not be visible to the next.
afterEach(() => {
  toolRegistry.clearDynamicTools();
});
128
+
129
/**
 * Assemble a full parent stack (api client + permission manager + tool
 * executors + agent runner) pointed at `base`.
 *
 * @param {string} base - API base URL stored as `api_base` in the config.
 * @param {object} [config] - Overrides merged on top of the test defaults.
 * @param {object} [options]
 * @param {boolean} [options.withSubagent=false] - When true, also builds a
 *   subagent manager from the SAME building blocks and registers the
 *   spawn_agent dynamic tool (mirrors test/subagents-agent.test.js).
 * @returns {{ runner: object, getConfig: Function, cfg: object }}
 */
function buildStack(base, config, { withSubagent = false } = {}) {
  const defaults = {
    api_base: base,
    api_key: 'test-key',
    default_model: 'test-model',
    temperature: 0.5,
    request_timeout_ms: 5000,
    stream: true,
    models: [],
  };
  const cfg = { ...defaults, ...config };
  const getConfig = () => cfg;
  const saveConfig = (patch) => Object.assign(cfg, patch);
  const api = createApiClient({ getConfig, saveConfig, ui });

  const pm = createPermissionManager(ui, { skipPermissions: true });
  pm.setUICallbacks({
    onAddMessage: () => {},
    onShowModal: () => {},
    onCloseModal: () => {},
    onCaptureNavigation: () => () => {},
  });

  const { agentExecShell, agentExecFile, describePermission } = createToolExecutor(pm, ui, getConfig);

  // The runner and the subagent manager take the same dependency set; hand
  // each one its own fresh object.
  const makeDeps = () => ({
    chatStream: api.chatStream,
    extractToolCalls,
    agentExecShell,
    agentExecFile,
    describePermission,
    permissionManager: pm,
    ui,
    getConfig,
  });

  const runner = createAgentRunner(makeDeps());
  if (withSubagent) {
    const manager = createSubagentManager(makeDeps());
    toolRegistry.registerDynamicTool(buildSpawnAgentEntry(manager));
  }
  return { runner, getConfig, cfg };
}
156
+
157
/**
 * Register a fake MCP-style dynamic tool (`mcp__test__big`) whose execution
 * always resolves to the fixed `content`, so the MCP result path can be
 * exercised WITHOUT the real SDK / a live server.
 *
 * @param {string} content - Payload the fake tool returns on every call.
 */
function registerFakeMcpTool(content) {
  const toolName = 'mcp__test__big';
  const entry = {
    tool: toolName,
    mcp: true,
    server: 'test',
    spec: { description: 'fake', parameters: { type: 'object', properties: {} } },
    fromParams: (p) => [toolName, p || {}],
    parseXml: () => [],
    permission: () => null,
    execute: async () => ({ mcp: true, content, isError: false }),
  };
  toolRegistry.registerDynamicTool(entry);
}
171
+
172
// Return the first tool-role message whose (possibly missing) content matches
// `pattern`, or undefined when there is none.
function findToolMessage(messages, pattern) {
  return messages.find((m) => m.role === 'tool' && pattern.test(m.content || ''));
}

// NOTE: in every test below, ALL setup after startMockLLM() lives inside the
// try block, so a throw during tool registration or reply scripting can no
// longer skip mock.close() and leave the mock server open.

test('real loop: a large MCP result is capped + still fenced in the tool message', async () => {
  const mock = await startMockLLM();
  try {
    registerFakeMcpTool('Q'.repeat(4000)); // ~1000 tokens
    mock.replyWithToolCall('mcp__test__big', {});
    mock.replyWith('done');

    const { runner } = buildStack(mock.base, { mcp: { servers: {}, max_result_tokens: 20 } });
    const messages = [{ role: 'user', content: 'call the mcp tool' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: { onError: () => {} } });

    const toolMsg = findToolMessage(messages, /mcp__test__big/);
    assert.ok(toolMsg, 'MCP result fed back');
    assert.match(toolMsg.content, FENCE_OPEN, 'still fenced after capping');
    assert.match(toolMsg.content, FENCE_CLOSE);
    assert.match(toolMsg.content, /capped at ~20 tokens/, 'capped at the configured MCP budget');
    assert.ok(toolMsg.content.length < 4000, 'the full payload did not enter context');
  } finally {
    await mock.close();
  }
});

test('real loop: a small MCP result passes through fully (paired positive), still fenced', async () => {
  const mock = await startMockLLM();
  try {
    registerFakeMcpTool('tiny payload');
    mock.replyWithToolCall('mcp__test__big', {});
    mock.replyWith('done');

    const { runner } = buildStack(mock.base, { mcp: { servers: {}, max_result_tokens: 10000 } });
    const messages = [{ role: 'user', content: 'call the mcp tool' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: { onError: () => {} } });

    const toolMsg = findToolMessage(messages, /mcp__test__big/);
    assert.ok(toolMsg);
    assert.match(toolMsg.content, FENCE_OPEN);
    assert.ok(toolMsg.content.includes('tiny payload'));
    assert.doesNotMatch(toolMsg.content, /capped at/);
  } finally {
    await mock.close();
  }
});

test('real loop: a verbose subagent final text is capped + still fenced, isolation intact', async () => {
  const mock = await startMockLLM();
  try {
    const longChild = 'L'.repeat(4000); // ~1000 tokens
    // Replies are scripted in order: parent tool call, child final, parent final.
    mock.replyWithToolCall('spawn_agent', { prompt: 'go research' }); // parent
    mock.replyWith(longChild); // child final
    mock.replyWith('noted'); // parent final

    const { runner } = buildStack(
      mock.base,
      { subagents: { max_concurrency: 3, max_result_tokens: 30 } },
      { withSubagent: true },
    );
    const messages = [{ role: 'user', content: 'investigate' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: { onError: () => {} } });

    const toolMsg = findToolMessage(messages, /UNTRUSTED_EXTERNAL_CONTENT/);
    assert.ok(toolMsg, 'subagent result fed back fenced');
    assert.match(toolMsg.content, FENCE_OPEN);
    assert.match(toolMsg.content, FENCE_CLOSE);
    assert.match(toolMsg.content, /capped at ~30 tokens/, 'capped at the configured subagent budget');

    // Isolation unchanged: the parent did not absorb the child's long assistant turn.
    const absorbed = messages.some((m) => m.role === 'assistant' && m.content === longChild);
    assert.ok(!absorbed, 'the child assistant turn never lands in the parent history');
  } finally {
    await mock.close();
  }
});
@@ -0,0 +1,147 @@
1
+ 'use strict';
2
+
3
+ // Integration tests for the OS-sandbox FALLBACK rules wired into agentExecShell
4
+ // (Task 4.4). These exercise the config×detection decision at the executor
5
+ // chokepoint without needing a real bwrap/sandbox-exec: the detection cache is
6
+ // primed to "unavailable" (or the runner genuinely lacks the tool), and we
7
+ // assert the fail-safe behavior — never a silent unsandboxed run.
8
+
9
+ const { test } = require('node:test');
10
+ const assert = require('node:assert');
11
+
12
+ const fs = require('fs');
13
+ const os = require('os');
14
+ const path = require('path');
15
+
16
+ const ui = require('../lib/ui');
17
+ const { createPermissionManager } = require('../lib/permissions');
18
+ const { createToolExecutor } = require('../lib/tools');
19
+ const { detectSandbox, _resetSandboxDetection } = require('../lib/sandbox');
20
+
21
/**
 * Force the shared detection cache to "unavailable" so the unavailable-path
 * tests are deterministic on ANY runner (including macOS / a bwrap-equipped
 * Linux box where the sandbox would otherwise be available).
 *
 * Resets the cache, then runs a forced detection with injected probes that
 * describe a Linux host on which no bwrap binary can be found.
 */
function primeUnavailable() {
  const fakeLinuxWithoutBwrap = {
    platform: 'linux',
    which: () => null, // no bwrap
    readFile: () => 'Linux version 6.0',
    force: true,
  };
  _resetSandboxDetection();
  detectSandbox(fakeLinuxWithoutBwrap);
}
33
+
34
/**
 * Build a tool executor over a permission manager that does not skip
 * permissions and has the 'exec' tier allowed, with all UI callbacks stubbed.
 *
 * @param {object} [options]
 * @param {object} [options.config] - Object returned verbatim by getConfig().
 * @param {Function} [options.onUnsandboxed] - Forwarded to createToolExecutor
 *   as the `onUnsandboxed` option.
 * @returns {object} Whatever createToolExecutor returns (the tests use
 *   `agentExecShell`).
 */
function buildExec({ config, onUnsandboxed } = {}) {
  const permissionOptions = { skipPermissions: false, allowedTiers: ['exec'] };
  const pm = createPermissionManager(ui, permissionOptions);
  pm.setUICallbacks({
    onAddMessage: () => {},
    onShowModal: () => {},
    onCloseModal: () => {},
    onCaptureNavigation: () => () => {},
  });

  const getConfig = () => config;
  return createToolExecutor(pm, ui, getConfig, { onUnsandboxed });
}
40
+
41
// Every test below primes the shared detection cache to "unavailable" and
// resets it in a `finally`, so a failing assertion can no longer leave the
// forced detection result behind.

test('sandbox unavailable + auto + NO approver → REFUSED (never a silent unsandboxed run)', async () => {
  try {
    primeUnavailable();
    const { agentExecShell } = buildExec({ config: { sandbox: { mode: 'auto' } } });
    const r = await agentExecShell('echo SHOULD_NOT_RUN');
    assert.strictEqual(r.blocked, true);
    assert.strictEqual(r.sandbox, 'unavailable');
    assert.match(r.stderr, /refused to run unsandboxed/i);
    assert.doesNotMatch(r.stdout || '', /SHOULD_NOT_RUN/);
  } finally {
    _resetSandboxDetection();
  }
});

test('sandbox unavailable + failIfUnavailable → HARD ERROR (strict gate)', async () => {
  try {
    primeUnavailable();
    const { agentExecShell } = buildExec({ config: { sandbox: { mode: 'auto', failIfUnavailable: true } } });
    const r = await agentExecShell('echo SHOULD_NOT_RUN');
    assert.strictEqual(r.blocked, true);
    assert.strictEqual(r.sandbox, 'unavailable');
    assert.match(r.stderr, /failIfUnavailable/);
  } finally {
    _resetSandboxDetection();
  }
});

test('sandbox unavailable + auto + approver says NO → refused', async () => {
  try {
    primeUnavailable();
    const { agentExecShell } = buildExec({
      config: { sandbox: { mode: 'auto' } },
      onUnsandboxed: async () => false,
    });
    const r = await agentExecShell('echo SHOULD_NOT_RUN');
    assert.strictEqual(r.blocked, true);
  } finally {
    _resetSandboxDetection();
  }
});

test('sandbox unavailable + auto + human approver says YES → runs unsandboxed (status reflects it)', async () => {
  try {
    primeUnavailable();
    let asked = null;
    const { agentExecShell } = buildExec({
      config: { sandbox: { mode: 'auto' }, command_timeout_ms: 5000 },
      onUnsandboxed: async (info) => { asked = info; return true; },
    });
    const r = await agentExecShell('echo RAN_UNSANDBOXED');
    assert.ok(asked && typeof asked.reason === 'string' && asked.reason.length > 0, 'approver receives the reason');
    assert.match(asked.reason, /bwrap|bubblewrap|not found/i);
    assert.strictEqual(r.exit_code, 0);
    assert.match(r.stdout, /RAN_UNSANDBOXED/);
    assert.strictEqual(r.sandbox, 'unavailable'); // ran, but without kernel confinement
  } finally {
    _resetSandboxDetection();
  }
});

test('mode off → runs unsandboxed deterministically (human opt-out), status off', async () => {
  try {
    // mode:off short-circuits before detection, so this is deterministic on every
    // runner regardless of the cache.
    primeUnavailable();
    const { agentExecShell } = buildExec({ config: { sandbox: { mode: 'off' }, command_timeout_ms: 5000 } });
    const r = await agentExecShell('echo MODE_OFF_RAN');
    assert.strictEqual(r.exit_code, 0);
    assert.match(r.stdout, /MODE_OFF_RAN/);
    assert.strictEqual(r.sandbox, 'off');
    // With no jail the command keeps the host network — surfaced honestly (Task 4.4b).
    assert.strictEqual(r.network, 'on');
  } finally {
    _resetSandboxDetection();
  }
});

test('no MODEL-reachable path disables the sandbox: call-level options cannot flip the decision', async () => {
  try {
    primeUnavailable();
    const { agentExecShell } = buildExec({ config: { sandbox: { mode: 'auto' } } });
    // A model only ever controls the command string + framework {signal}. Even if
    // call options carried sandbox-ish keys (they never do in the real schema),
    // they must NOT disable the gate — the decision reads only human config.
    const r = await agentExecShell('echo SHOULD_NOT_RUN', {
      sandbox: 'off',
      mode: 'off',
      skipSandbox: true,
      failIfUnavailable: false,
    });
    assert.strictEqual(r.blocked, true, 'call-level options cannot disable the sandbox');
    assert.strictEqual(r.sandbox, 'unavailable');
  } finally {
    _resetSandboxDetection();
  }
});
111
+
112
// Binary network isolation surfaced through the executor (Task 4.4b). Uses the
// REAL sandbox so the result's `network` field reflects an actual kernel jail;
// skips gracefully when the primitive is absent (mirrors the integration suite).

// One-off REAL detection, taken with the cache reset before and after so the
// probe leaves no cached result behind.
const _realDet = (() => {
  _resetSandboxDetection();
  const detected = detectSandbox({ force: true });
  _resetSandboxDetection();
  return detected;
})();

// `false` when a real bwrap jail is usable; otherwise the skip reason string.
const hasRealBwrap = _realDet.available && _realDet.tool === 'bwrap';
const NET_SKIP = hasRealBwrap
  ? false
  : `OS sandbox (bwrap) unavailable on this runner (${_realDet.reason || _realDet.platform})`;

test('sandbox.network off → the shell result reports net OFF and the command has no network (real jail)', { skip: NET_SKIP }, async () => {
  _resetSandboxDetection();
  detectSandbox({ force: true }); // real detection, available

  // Probe script: exits 0 if any non-loopback interface is visible, 7 otherwise.
  const probe = path.join(os.tmpdir(), `semalt-agent-netprobe-${process.pid}.js`);
  const probeSource = "const i=require('os').networkInterfaces();const nonLo=Object.keys(i).filter(n=>n!=='lo'&&n!=='lo0');process.exit(nonLo.length>0?0:7);";
  fs.writeFileSync(probe, probeSource);

  try {
    const { agentExecShell } = buildExec({
      config: { sandbox: { mode: 'auto', network: 'off' }, command_timeout_ms: 10000 },
    });
    // Pass model-reachable-looking call options — they must NOT re-enable the network.
    const command = `${JSON.stringify(process.execPath)} ${JSON.stringify(probe)}`;
    const r = await agentExecShell(command, { network: 'on', noNetwork: false });

    assert.strictEqual(r.sandbox, 'on', 'ran inside a real jail');
    assert.strictEqual(r.network, 'off', 'the result reports kernel-level no-network');
    assert.strictEqual(r.exit_code, 7, 'the jailed command genuinely had no network (only loopback)');
  } finally {
    try { fs.unlinkSync(probe); } catch {}
    _resetSandboxDetection();
  }
});
137
+
138
test('the deny-list still fires UNDER the sandbox layer (defense in depth, agent-initiated)', async () => {
  // Cleanup sits in `finally` so a failing assertion cannot leave the forced
  // "unavailable" detection result in the shared cache.
  try {
    primeUnavailable();
    const { agentExecShell } = buildExec({ config: { sandbox: { mode: 'off' } } });
    // Even with the sandbox off, the deny-list chokepoint must still hard-block a
    // destructive agent command before it ever reaches the spawn/sandbox path.
    const r = await agentExecShell('rm -rf /');
    assert.strictEqual(r.blocked, true);
    assert.match(r.stderr, /deny-list/i);
  } finally {
    _resetSandboxDetection();
  }
});
@@ -0,0 +1,216 @@
1
+ 'use strict';
2
+
3
+ // Kernel-level enforcement tests for the OS sandbox (Task 4.4). These run REAL
4
+ // bwrap (Linux/WSL2) or sandbox-exec (macOS) jails and assert the OS — not our
5
+ // pattern-matching — blocks the writes. They SKIP gracefully when the primitive
6
+ // is absent on the runner (mirroring the ripgrep-parity pattern), so the suite
7
+ // stays green on a Windows/WSL1 box or a Linux box without bubblewrap.
8
+
9
+ const { test } = require('node:test');
10
+ const assert = require('node:assert');
11
+ const fs = require('fs');
12
+ const os = require('os');
13
+ const path = require('path');
14
+ const { spawnSync } = require('child_process');
15
+
16
+ const { detectSandbox, buildBwrapArgs, buildSeatbeltPolicy } = require('../lib/sandbox');
17
+
18
// Detect the OS sandbox primitive once for the whole file (forced, not cached).
const det = detectSandbox({ force: true });
// `false` when a sandbox tool is available (tests run); otherwise the reason
// string that node:test reports for each skipped test.
const SKIP = !det.available && `OS sandbox tool unavailable on this runner (${det.reason || det.platform})`;
20
+
21
// Every directory handed out by mkdir(), so it can be removed when the test
// process exits instead of accumulating under os.tmpdir() run after run.
const createdTempDirs = [];

// Best-effort recursive removal of all directories created by mkdir().
function removeCreatedTempDirs() {
  for (const dir of createdTempDirs) {
    try {
      fs.rmSync(dir, { recursive: true, force: true });
    } catch {
      // Cleanup must never turn a passing run into a failing one.
    }
  }
}

process.once('exit', removeCreatedTempDirs);

/**
 * Create a fresh temporary directory under os.tmpdir() and return its
 * symlink-resolved absolute path. The directory is removed at process exit.
 *
 * @param {string} prefix - Name prefix passed to fs.mkdtempSync.
 * @returns {string} Real (symlink-resolved) path of the new directory.
 */
function mkdir(prefix) {
  const dir = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), prefix)));
  createdTempDirs.push(dir);
  return dir;
}
24
+
25
/**
 * Run `cmd` via `/bin/sh -c` inside a real OS jail built from the detected
 * tool: bwrap when `det.tool === 'bwrap'`, otherwise sandbox-exec (Seatbelt).
 *
 * @param {string} cmd - Shell command line to execute inside the jail.
 * @param {object} opts
 * @param {string[]} opts.writableRoots - Paths the jail may write to.
 * @param {string[]} opts.protectedPaths - Paths forced read-only.
 * @param {string} opts.chdir - Working directory (only passed to the bwrap builder).
 * @returns {object} The spawnSync result ({ status, stdout, stderr, ... }).
 */
function runJailed(cmd, { writableRoots, protectedPaths, chdir }) {
  const spawnOpts = { encoding: 'utf8', timeout: 10000 };
  const shellArgv = ['/bin/sh', '-c', cmd];

  if (det.tool !== 'bwrap') {
    // sandbox-exec (macOS Seatbelt)
    const policy = buildSeatbeltPolicy({ writableRoots, protectedPaths, rootWritable: false });
    return spawnSync(det.binPath || 'sandbox-exec', ['-p', policy, ...shellArgv], spawnOpts);
  }

  // Existence probe handed to the arg builder; any fs error counts as "absent".
  const fsExists = (p) => { try { return fs.existsSync(p); } catch { return false; } };
  const jailArgs = buildBwrapArgs({
    writableRoots,
    protectedPaths,
    rootWritable: false,
    chdir,
    fsExists,
  });
  return spawnSync(det.binPath || 'bwrap', [...jailArgs, ...shellArgv], spawnOpts);
}
42
+
43
// Positive control: the jail's writable root really is writable.
test('a write INSIDE the working dir succeeds under the jail', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-work-');
  const jailOpts = { writableRoots: [jailDir], protectedPaths: [], chdir: jailDir };
  const { status, stderr } = runJailed(`echo ok > ${jailDir}/inside.txt`, jailOpts);
  assert.strictEqual(status, 0, `expected success, stderr: ${stderr}`);
  const wasWritten = fs.existsSync(path.join(jailDir, 'inside.txt'));
  assert.ok(wasWritten, 'file inside the working dir should be written');
});
49
+
50
// A sibling temp dir that is NOT a writable root must stay untouched: the
// command has to fail and the file must not appear on the host.
test('a write OUTSIDE the working dir is blocked by the kernel layer', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-work-');
  const foreignDir = mkdir('semalt-sbx-out-');
  const escapeTarget = path.join(foreignDir, 'escaped.txt');
  const jailOpts = { writableRoots: [jailDir], protectedPaths: [], chdir: jailDir };
  const { status } = runJailed(`echo pwned > ${escapeTarget}`, jailOpts);
  assert.notStrictEqual(status, 0, 'write outside the working dir must fail');
  const leaked = fs.existsSync(escapeTarget);
  assert.ok(!leaked, 'the out-of-jail file must NOT exist');
});
58
+
59
test('writes to a protected dir are denied — including a NOT-YET-EXISTING config (CVE-2026-25725)', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-work-');
  // Stand-in for ~/.semalt-ai: the directory is present, config.json is absent.
  // The directory is passed as a protected path, so the jailed process must not
  // be able to CREATE the missing file (and thereby inject host-privileged hooks).
  const guardedDir = mkdir('semalt-sbx-prot-');
  const absentConfig = path.join(guardedDir, 'config.json');
  assert.ok(!fs.existsSync(absentConfig), 'precondition: config.json does not exist yet');

  const jailOpts = { writableRoots: [jailDir], protectedPaths: [guardedDir], chdir: jailDir };
  const { status } = runJailed(`echo '{"hooks":1}' > ${absentConfig}`, jailOpts);
  assert.notStrictEqual(status, 0, 'creating a missing config in a protected dir must fail');
  const created = fs.existsSync(absentConfig);
  assert.ok(!created, 'the not-yet-existing config must NOT have been created');
});
71
+
72
test('a protected dir nested inside a writable root still wins (read-only)', { skip: SKIP }, () => {
  // Models the cwd == $HOME edge case: the protected dir sits INSIDE the
  // writable root, and the protection must take precedence over writability.
  const homeLike = mkdir('semalt-sbx-home-');
  const guardedDir = path.join(homeLike, '.semalt-ai');
  fs.mkdirSync(guardedDir);
  const configPath = path.join(guardedDir, 'config.json');

  const jailOpts = { writableRoots: [homeLike], protectedPaths: [guardedDir], chdir: homeLike };
  const { status } = runJailed(`echo x > ${configPath}`, jailOpts);
  assert.notStrictEqual(status, 0, 'protected dir nested in a writable root must stay read-only');
  const created = fs.existsSync(configPath);
  assert.ok(!created, 'config.json under the nested protected dir must NOT be created');
});
83
+
84
test('a project .semalt/config.json write is blocked by the kernel layer, incl. not-yet-existing (Pre-Task 5.0b)', { skip: SKIP }, () => {
  // The project's .semalt dir is located INSIDE the writable working dir, but it
  // is passed as a protected path and that must win — a sandboxed shell must not
  // create or modify .semalt/config.json (or agents/hooks) to drive
  // host-privileged execution. Same shape as the CVE-2026-25725
  // not-yet-existing-file case: the directory exists, the config file does not.
  const projectDir = mkdir('semalt-sbx-proj-');
  const semaltDir = path.join(projectDir, '.semalt');
  fs.mkdirSync(semaltDir);
  const absentConfig = path.join(semaltDir, 'config.json');
  assert.ok(!fs.existsSync(absentConfig), 'precondition: .semalt/config.json does not exist yet');

  const jailOpts = { writableRoots: [projectDir], protectedPaths: [semaltDir], chdir: projectDir };
  const { status } = runJailed(`echo '{"hooks":1}' > ${absentConfig}`, jailOpts);
  assert.notStrictEqual(status, 0, 'creating .semalt/config.json inside the jail must fail');
  const created = fs.existsSync(absentConfig);
  assert.ok(!created, 'the not-yet-existing .semalt/config.json must NOT have been created');
});
99
+
100
test('a /proc/self/root rewrite is confined on the RESOLVED path (the Ona bypass)', { skip: SKIP }, () => {
  // Under bwrap a fresh /proc is mounted, so /proc/self/root resolves to the jail
  // root and the bind is enforced on the resolved path. (Seatbelt likewise
  // enforces on the resolved vnode.) In both cases the write has to be refused.
  const jailDir = mkdir('semalt-sbx-work-');
  const foreignDir = mkdir('semalt-sbx-out-');
  const realTarget = path.join(foreignDir, 'rewrite.txt');
  const viaProc = `/proc/self/root${realTarget}`;

  const jailOpts = { writableRoots: [jailDir], protectedPaths: [], chdir: jailDir };
  const { status } = runJailed(`echo pwned > ${viaProc}`, jailOpts);
  assert.notStrictEqual(status, 0, 'a /proc/self/root rewrite must be confined on the resolved path');
  const leaked = fs.existsSync(realTarget);
  assert.ok(!leaked, 'the rewritten target must NOT be written');
});
111
+
112
test('child processes inherit the jail (a spawned subprocess is confined)', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-work-');
  const foreignDir = mkdir('semalt-sbx-out-');
  const escapeTarget = path.join(foreignDir, 'child.txt');
  // The escape is attempted by an inner sh launched from the outer sh: the
  // boundary has to hold for the entire subprocess tree, not only the first process.
  const nestedCmd = `sh -c "echo pwned > ${escapeTarget}"`;
  const jailOpts = { writableRoots: [jailDir], protectedPaths: [], chdir: jailDir };
  const { status } = runJailed(nestedCmd, jailOpts);
  assert.notStrictEqual(status, 0, 'a child process must not be able to escape the jail');
  const leaked = fs.existsSync(escapeTarget);
  assert.ok(!leaked, 'the child-written out-of-jail file must NOT exist');
});
122
+
123
// Reads stay permitted even from a protected (read-only) path: copy a file out
// of /etc into the writable working dir.
test('reads are allowed broadly even when writes are confined', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-work-');
  // Prefer /etc/hostname (Linux); fall back to /etc/hosts (e.g. macOS).
  let readable = '/etc/hosts';
  if (fs.existsSync('/etc/hostname')) {
    readable = '/etc/hostname';
  }
  const jailOpts = { writableRoots: [jailDir], protectedPaths: ['/etc'], chdir: jailDir };
  const { status, stderr } = runJailed(`cat ${readable} > ${jailDir}/copy.txt`, jailOpts);
  assert.strictEqual(status, 0, `reading ${readable} should succeed, stderr: ${stderr}`);
});
130
+
131
+ // ---------------------------------------------------------------------------
132
+ // Binary network isolation (Task 4.4b) — REAL kernel enforcement.
133
+ // ---------------------------------------------------------------------------
134
+ //
135
+ // Discriminator (no external connectivity required, so it is reliable in CI):
136
+ // * bwrap (Linux/WSL2): --unshare-net gives the jail a fresh network namespace
137
+ // with NO real interfaces — only loopback. We count non-loopback interfaces.
138
+ // * sandbox-exec (macOS): (deny network*) blocks socket operations — we test
139
+ // whether a TCP bind is permitted. (Skips on this Linux runner.)
140
+ // Exit 0 ⇒ the probe HAS network; exit 7 ⇒ it does NOT.
141
+
142
// Absolute path of the running Node binary, used to launch the network probe.
const { execPath: NODE } = process;
143
+
144
/**
 * Write the network probe script as `netprobe.js` inside `dir`.
 *
 * The probe exits 0 when it sees network and 7 when it does not. Under bwrap it
 * counts non-loopback interfaces; for any other tool it tries to listen on a
 * TCP port bound to 0.0.0.0.
 *
 * @param {string} dir - Directory to write the probe into.
 * @returns {string} Path of the written probe script.
 */
function writeNetProbe(dir) {
  let source;
  if (det.tool === 'bwrap') {
    source = "const i=require('os').networkInterfaces();const nonLo=Object.keys(i).filter(n=>n!=='lo'&&n!=='lo0');process.exit(nonLo.length>0?0:7);";
  } else {
    source = "const s=require('net').createServer();s.on('error',()=>process.exit(7));s.listen(0,'0.0.0.0',()=>{s.close();process.exit(0);});";
  }
  const probePath = path.join(dir, 'netprobe.js');
  fs.writeFileSync(probePath, source);
  return probePath;
}
152
+
153
/**
 * Run `cmd` via `/bin/sh -c` inside a jail with an explicit network mode.
 * Callers typically pass the network probe as `cmd`, but any command works,
 * which lets tests combine the network and filesystem layers.
 *
 * @param {object} opts
 * @param {string} opts.network - Network mode forwarded to the jail builder.
 * @param {string[]} opts.writableRoots - Paths the jail may write to.
 * @param {string[]} [opts.protectedPaths=[]] - Paths forced read-only.
 * @param {string} opts.chdir - Working directory (only passed to the bwrap builder).
 * @param {string} opts.cmd - Shell command line to execute.
 * @returns {object} The spawnSync result ({ status, stdout, stderr, ... }).
 */
function runJailedNet({ network, writableRoots, protectedPaths = [], chdir, cmd }) {
  const spawnOpts = { encoding: 'utf8', timeout: 10000 };
  const shellArgv = ['/bin/sh', '-c', cmd];

  if (det.tool !== 'bwrap') {
    const policy = buildSeatbeltPolicy({ writableRoots, protectedPaths, rootWritable: false, network });
    return spawnSync(det.binPath || 'sandbox-exec', ['-p', policy, ...shellArgv], spawnOpts);
  }

  // Existence probe handed to the arg builder; any fs error counts as "absent".
  const fsExists = (p) => { try { return fs.existsSync(p); } catch { return false; } };
  const jailArgs = buildBwrapArgs({
    writableRoots,
    protectedPaths,
    rootWritable: false,
    chdir,
    network,
    fsExists,
  });
  return spawnSync(det.binPath || 'bwrap', [...jailArgs, ...shellArgv], spawnOpts);
}
166
+
167
/**
 * Report whether the un-jailed host passes the network probe (probe exit 0).
 * Guards the "network on" positive test: on a runner with only loopback that
 * test would have nothing meaningful to compare against.
 *
 * @returns {boolean} true when the probe, run directly with Node, exits 0.
 */
function hostHasNetwork() {
  const probePath = writeNetProbe(mkdir('semalt-sbx-host-'));
  const { status } = spawnSync(NODE, [probePath], { encoding: 'utf8', timeout: 10000 });
  return status === 0;
}
175
+
176
// Negative side: in a network-off jail the probe must exit 7 (no network).
test('network OFF: a sandboxed command CANNOT reach the network (kernel-enforced)', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-net-off-');
  const probePath = writeNetProbe(jailDir);
  const { status, stderr } = runJailedNet({
    network: 'off',
    writableRoots: [jailDir],
    chdir: jailDir,
    cmd: `${NODE} ${probePath}`,
  });
  assert.strictEqual(status, 7, `no-network jail must have no network, got status ${status} stderr: ${stderr}`);
});
182
+
183
// Paired positive/negative check: the SAME probe in the SAME jail layout must
// see network with `network: 'on'` (exit 0) and none with `network: 'off'`
// (exit 7), so the difference is attributable to the network switch alone.
test('PAIRED positive — network ON: the same sandboxed command CAN reach the network', { skip: SKIP }, (t) => {
  if (!hostHasNetwork()) {
    // Degenerate runner with only loopback — nothing to pair against. Report the
    // test as SKIPPED rather than returning silently, which would be counted as
    // a pass without a single assertion having run.
    t.skip('host has only loopback — nothing to pair against');
    return;
  }
  const work = mkdir('semalt-sbx-net-on-');
  const probe = writeNetProbe(work);
  const on = runJailedNet({ network: 'on', writableRoots: [work], chdir: work, cmd: `${NODE} ${probe}` });
  assert.strictEqual(on.status, 0, `network-on jail must keep the host network, stderr: ${on.stderr}`);
  // And the negative side of the pair, for an unambiguous on≠off difference.
  const off = runJailedNet({ network: 'off', writableRoots: [work], chdir: work, cmd: `${NODE} ${probe}` });
  assert.strictEqual(off.status, 7, 'network-off must differ from network-on');
});
193
+
194
test('network isolation COMPOSES with filesystem confinement (both apply)', { skip: SKIP }, () => {
  // A single no-network jail layout must enforce the fs boundary AND the net boundary.
  const jailDir = mkdir('semalt-sbx-compose-');
  const foreignDir = mkdir('semalt-sbx-compose-out-');
  const probePath = writeNetProbe(jailDir);
  // Shared jail settings for both halves; only the command differs.
  const jailed = (cmd) => runJailedNet({ network: 'off', writableRoots: [jailDir], chdir: jailDir, cmd });

  // (a) filesystem: writing outside the working dir is still refused with network off.
  const escapeTarget = path.join(foreignDir, 'escaped.txt');
  const writeResult = jailed(`echo pwned > ${escapeTarget}`);
  assert.notStrictEqual(writeResult.status, 0, 'fs confinement still applies under network-off');
  assert.ok(!fs.existsSync(escapeTarget), 'the out-of-jail write must not land');

  // (b) network: the very same jail settings leave the probe without network.
  const netResult = jailed(`${NODE} ${probePath}`);
  assert.strictEqual(netResult.status, 7, 'network confinement applies in the same jail');
});
208
+
209
test('child processes inherit the no-network jail', { skip: SKIP }, () => {
  const jailDir = mkdir('semalt-sbx-net-child-');
  const probePath = writeNetProbe(jailDir);
  // The probe is launched by an INNER sh started from the outer sh: the
  // no-network boundary has to hold for the entire subprocess tree.
  const nestedCmd = `sh -c "${NODE} ${probePath}"`;
  const { status } = runJailedNet({
    network: 'off',
    writableRoots: [jailDir],
    chdir: jailDir,
    cmd: nestedCmd,
  });
  assert.strictEqual(status, 7, 'a child process must also have no network');
});