@semalt-ai/code 1.8.5 → 1.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/.claude/settings.local.json +6 -1
  2. package/.github/workflows/ci.yml +69 -0
  3. package/CLAUDE.md +1584 -26
  4. package/README.md +147 -3
  5. package/examples/embed.js +74 -0
  6. package/index.js +251 -10
  7. package/lib/agent.js +711 -104
  8. package/lib/api.js +213 -49
  9. package/lib/args.js +74 -2
  10. package/lib/audit.js +23 -1
  11. package/lib/background.js +584 -0
  12. package/lib/checkpoints.js +757 -0
  13. package/lib/commands/auth.js +94 -0
  14. package/lib/commands/chat-session.js +306 -0
  15. package/lib/commands/chat-slash.js +399 -0
  16. package/lib/commands/chat-turn.js +446 -0
  17. package/lib/commands/chat.js +403 -0
  18. package/lib/commands/custom.js +157 -0
  19. package/lib/commands/history-utils.js +66 -0
  20. package/lib/commands/index.js +268 -0
  21. package/lib/commands/mcp.js +113 -0
  22. package/lib/commands/oneshot.js +193 -0
  23. package/lib/commands/registry.js +269 -0
  24. package/lib/commands/tasks.js +89 -0
  25. package/lib/compact.js +87 -0
  26. package/lib/config.js +333 -11
  27. package/lib/constants.js +372 -3
  28. package/lib/deny.js +199 -0
  29. package/lib/doctor.js +160 -0
  30. package/lib/headless.js +167 -0
  31. package/lib/hooks.js +286 -0
  32. package/lib/images.js +264 -0
  33. package/lib/internals.js +49 -0
  34. package/lib/mcp/boundary.js +131 -0
  35. package/lib/mcp/client.js +270 -0
  36. package/lib/mcp/oauth.js +134 -0
  37. package/lib/memory.js +209 -0
  38. package/lib/metrics.js +37 -2
  39. package/lib/payload.js +54 -0
  40. package/lib/permission-rules.js +401 -0
  41. package/lib/permissions.js +100 -10
  42. package/lib/pricing.js +67 -0
  43. package/lib/proc.js +62 -0
  44. package/lib/prompts.js +84 -5
  45. package/lib/sandbox.js +568 -0
  46. package/lib/sdk.js +328 -0
  47. package/lib/secrets.js +211 -0
  48. package/lib/skills.js +223 -0
  49. package/lib/subagents.js +516 -0
  50. package/lib/tool_registry.js +2558 -0
  51. package/lib/tool_specs.js +222 -2
  52. package/lib/tools.js +272 -1020
  53. package/lib/ui/format.js +22 -1
  54. package/lib/ui/input-field.js +16 -7
  55. package/lib/ui/status-bar.js +79 -11
  56. package/lib/ui/theme.js +1 -0
  57. package/lib/ui/web-activity.js +218 -0
  58. package/lib/verify.js +229 -0
  59. package/lib/web-extract.js +213 -0
  60. package/lib/web-summarize.js +68 -0
  61. package/package.json +19 -4
  62. package/scripts/lint.js +57 -0
  63. package/test/agent-loop.test.js +389 -0
  64. package/test/background.test.js +414 -0
  65. package/test/chat.test.js +114 -0
  66. package/test/checkpoints-agent.test.js +181 -0
  67. package/test/checkpoints.test.js +650 -0
  68. package/test/command-registry.test.js +160 -0
  69. package/test/compact.test.js +116 -0
  70. package/test/completion-lazy.test.js +52 -0
  71. package/test/config-merge.test.js +324 -0
  72. package/test/config-quarantine.test.js +128 -0
  73. package/test/config-write-guard-allow-anywhere.test.js +56 -0
  74. package/test/config-write-guard-skip.test.js +46 -0
  75. package/test/config-write-guard.test.js +153 -0
  76. package/test/context-split.test.js +215 -0
  77. package/test/cost-doctor.test.js +142 -0
  78. package/test/custom-commands-chat.test.js +106 -0
  79. package/test/custom-commands.test.js +230 -0
  80. package/test/deny-windows.test.js +120 -0
  81. package/test/deny.test.js +83 -0
  82. package/test/download-allow-anywhere.test.js +66 -0
  83. package/test/download-confine.test.js +153 -0
  84. package/test/executors.test.js +362 -0
  85. package/test/extract-tool-calls.test.js +315 -0
  86. package/test/fetch-url-validation.test.js +219 -0
  87. package/test/fixtures/tool-calls.js +57 -0
  88. package/test/fixtures/web-page.js +91 -0
  89. package/test/git-tools.test.js +384 -0
  90. package/test/grep-glob-serialize.test.js +242 -0
  91. package/test/grep-glob.test.js +268 -0
  92. package/test/harness/README.md +57 -0
  93. package/test/harness/chat-harness.js +142 -0
  94. package/test/harness/memwarn-headless-child.js +65 -0
  95. package/test/harness/mock-llm.js +120 -0
  96. package/test/harness/mock-mcp-server.js +142 -0
  97. package/test/harness/sse-server.js +69 -0
  98. package/test/headless.test.js +203 -0
  99. package/test/history-utils.test.js +88 -0
  100. package/test/hooks-agent.test.js +238 -0
  101. package/test/hooks-verify-sandbox.test.js +232 -0
  102. package/test/hooks.test.js +216 -0
  103. package/test/http-get-user-agent.test.js +142 -0
  104. package/test/images-api.test.js +208 -0
  105. package/test/images.test.js +238 -0
  106. package/test/max-iterations.test.js +216 -0
  107. package/test/mcp-boundary.test.js +57 -0
  108. package/test/mcp-client.test.js +267 -0
  109. package/test/mcp-oauth.test.js +86 -0
  110. package/test/memory-truncation-warning.test.js +222 -0
  111. package/test/memory.test.js +198 -0
  112. package/test/native-dispatch.test.js +356 -0
  113. package/test/output-chokepoint.test.js +188 -0
  114. package/test/path-guards.test.js +134 -0
  115. package/test/payload.test.js +99 -0
  116. package/test/permission-rules-agent.test.js +210 -0
  117. package/test/permission-rules.test.js +297 -0
  118. package/test/permissions.test.js +163 -0
  119. package/test/plan-mode.test.js +167 -0
  120. package/test/read-paginate.test.js +275 -0
  121. package/test/readonly-tools.test.js +177 -0
  122. package/test/result-cap.test.js +233 -0
  123. package/test/sandbox-agent.test.js +147 -0
  124. package/test/sandbox-integration.test.js +216 -0
  125. package/test/sandbox.test.js +408 -0
  126. package/test/sdk.test.js +234 -0
  127. package/test/shell-output-cap.test.js +181 -0
  128. package/test/skills-chat.test.js +110 -0
  129. package/test/skills.test.js +295 -0
  130. package/test/smoke.test.js +68 -0
  131. package/test/status-bar-pause.test.js +164 -0
  132. package/test/stream-parser.test.js +147 -0
  133. package/test/subagents-agent.test.js +178 -0
  134. package/test/subagents.test.js +222 -0
  135. package/test/tool-registry.test.js +85 -0
  136. package/test/trim-budget.test.js +101 -0
  137. package/test/verify-agent.test.js +317 -0
  138. package/test/verify.test.js +141 -0
  139. package/test/web-activity-ordering.test.js +194 -0
  140. package/test/web-activity.test.js +207 -0
  141. package/test/web-data-extraction-guidance.test.js +71 -0
  142. package/test/web-extract.test.js +185 -0
  143. package/test/web-fetch-agent.test.js +291 -0
  144. package/test/web-fetch-mode.test.js +193 -0
  145. package/test/web-search.test.js +380 -0
  146. package/lib/commands.js +0 -1438
@@ -0,0 +1,203 @@
1
+ 'use strict';
2
+
3
+ // Headless output tests (Task 2.4). Drives the REAL runAgentLoop against the
4
+ // mock LLM through runHeadless in each output mode, capturing stdout. Proves:
5
+ // * json → a single JSON object { result, toolCalls, usage, cost }
6
+ // * stream-json → newline-delimited JSON events (assistant / tool / result)
7
+ // * machine modes are byte-pure: no ANSI escapes leak, even though a tool runs
8
+ // (write_file's green ✓ and the permission diff would otherwise print).
9
+
10
+ const { test, before, after } = require('node:test');
11
+ const assert = require('node:assert');
12
+ const os = require('node:os');
13
+ const fs = require('node:fs');
14
+ const path = require('node:path');
15
+
16
+ const ui = require('../lib/ui');
17
+ const { createApiClient } = require('../lib/api');
18
+ const { createToolExecutor, extractToolCalls } = require('../lib/tools');
19
+ const { createPermissionManager } = require('../lib/permissions');
20
+ const { createAgentRunner } = require('../lib/agent');
21
+ const { startMockLLM } = require('./harness/mock-llm');
22
+ const { runHeadless, usageFromMetrics, finalResult, isMachineMode } = require('../lib/headless');
23
+
24
// Process-global state captured in `before` and restored in `after`.
let prevKey;
let CWD;
let PREV_CWD;

// Pin a dummy API key and move the process into a fresh scratch directory so
// the relative paths the tests write (e.g. "hl.txt") land in a throwaway dir.
before(() => {
  prevKey = process.env.SEMALT_API_KEY;
  process.env.SEMALT_API_KEY = 'test-key';
  PREV_CWD = process.cwd();
  // realpathSync resolves symlinks, so CWD holds the canonical path.
  CWD = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-headless-')));
  process.chdir(CWD);
});

// Undo everything `before` changed, then delete the scratch directory so the
// suite does not leave a temp dir behind on every run.
after(() => {
  // Step out of the scratch directory first: a directory that is still the
  // process cwd cannot be removed on some platforms.
  process.chdir(PREV_CWD);
  if (prevKey === undefined) delete process.env.SEMALT_API_KEY;
  else process.env.SEMALT_API_KEY = prevKey;
  if (CWD) fs.rmSync(CWD, { recursive: true, force: true });
});
37
+
38
/**
 * Assemble an agent runner from the real api / tools / permissions / agent
 * modules, pointed at the LLM endpoint `base`.
 *
 * The permission manager is created with `skipPermissions: true` and given
 * no-op UI callbacks.
 *
 * @param {string} base - value used as `api_base` in the in-memory config.
 * @returns {object} whatever `createAgentRunner` returns.
 */
function buildRunner(base) {
  const settings = {
    api_base: base,
    api_key: 'test-key',
    default_model: 'test-model',
    temperature: 0.5,
    request_timeout_ms: 5000,
    stream: true,
    models: [],
  };
  const getConfig = () => settings;
  const saveConfig = () => {};

  const { chatStream } = createApiClient({ getConfig, saveConfig, ui });

  const permissionManager = createPermissionManager(ui, { skipPermissions: true });
  permissionManager.setUICallbacks({
    onAddMessage: () => {},
    onShowModal: () => {},
    onCloseModal: () => {},
    onCaptureNavigation: () => () => {},
  });

  const executor = createToolExecutor(permissionManager, ui, getConfig);
  const { agentExecShell, agentExecFile, describePermission } = executor;

  return createAgentRunner({
    chatStream,
    extractToolCalls,
    agentExecShell,
    agentExecFile,
    describePermission,
    permissionManager,
    ui,
    getConfig,
  });
}
54
+
55
+ // Capture the headless JSON via the injectable `write` sink — no global stdout
56
+ // swap (which would collide with the node:test TAP reporter). Chrome is
57
+ // suppressed inside runHeadless via setUIActive, so the sink receives ONLY the
58
+ // structured output.
59
/**
 * Run `runHeadless` in the given output mode and collect everything it hands
 * to the injected `write` sink.
 *
 * @param {string} mode - output mode forwarded to runHeadless.
 * @param {object} runner - object exposing `runAgentLoop`.
 * @param {Array<object>} messages - conversation passed to the loop.
 * @returns {Promise<{out: string, res: *}>} concatenated sink output plus the
 *   value runHeadless resolved with.
 */
async function runCapture(mode, runner, messages) {
  const captured = [];
  // Non-string chunks are decoded as UTF-8; returning true mirrors a
  // successful stream write.
  const sink = (piece) => {
    const text = typeof piece === 'string' ? piece : piece.toString('utf8');
    captured.push(text);
    return true;
  };
  const res = await runHeadless({
    runAgentLoop: runner.runAgentLoop,
    messages,
    model: 'test-model',
    mode,
    maxIterations: 10,
    agentOpts: { systemPromptMode: 'system_role' },
    write: sink,
  });
  return { out: captured.join(''), res };
}
69
+
70
/**
 * Report whether `s` contains a terminal escape sequence.
 *
 * Every ANSI/ECMA-48 sequence written with 7-bit codes starts with ESC
 * (0x1B). Matching the bare ESC byte catches CSI (`ESC [`, colours and cursor
 * moves) as well as OSC (`ESC ]`, titles and hyperlinks) and the other
 * ESC-introduced forms, which a CSI-only pattern would let through.
 *
 * @param {string} s - captured output to inspect.
 * @returns {boolean} true when an ESC character is present.
 */
const HAS_ANSI = (s) => /\x1b/.test(s);
71
+
72
+ // ---------------------------------------------------------------------------
73
+ // Pure helpers
74
+ // ---------------------------------------------------------------------------
75
+
76
// usageFromMetrics: two recorded turns in, one aggregated usage object out.
test('usageFromMetrics aggregates turns', () => {
  const turns = [
    { promptTokens: 10, completionTokens: 4 },
    { promptTokens: 20, completionTokens: 6 },
  ];
  const expected = {
    prompt_tokens: 30,
    completion_tokens: 10,
    total_tokens: 40,
    context_tokens: 20,
    context_base_est: 0,
    context_working_est: 0,
    turns: 2,
  };
  const actual = usageFromMetrics({ turns });
  assert.deepStrictEqual(actual, expected);
});
86
+
87
// finalResult: with two assistant messages in the history, the later one wins.
test('finalResult returns the last assistant message', () => {
  const say = (role, content) => ({ role, content });
  const history = [
    say('user', 'hi'),
    say('assistant', 'first'),
    say('user', 'tool results'),
    say('assistant', 'final answer'),
  ];
  const picked = finalResult(history, []);
  assert.strictEqual(picked, 'final answer');
});
96
+
97
// isMachineMode: the two JSON formats are machine modes, plain text is not.
test('isMachineMode classifies the formats', () => {
  const cases = [
    ['json', true],
    ['stream-json', true],
    ['text', false],
  ];
  for (const [format, expected] of cases) {
    assert.strictEqual(isMachineMode(format), expected);
  }
});
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // json mode
105
+ // ---------------------------------------------------------------------------
106
+
107
// json mode end to end: one tool turn (write_file) followed by a final answer
// must yield exactly one JSON line carrying the documented keys.
test('json mode prints exactly one JSON object with the documented shape', async () => {
  const mock = await startMockLLM();
  mock.replyWith('<write_file path="hl.txt">hi</write_file>', { usage: { prompt_tokens: 30, completion_tokens: 10 } });
  mock.replyWith('Wrote it.', { usage: { prompt_tokens: 12, completion_tokens: 7 } });
  try {
    const prompt = [{ role: 'user', content: 'write a file' }];
    const { out } = await runCapture('json', buildRunner(mock.base), prompt);

    assert.ok(!HAS_ANSI(out), 'no ANSI escapes leak into json stdout');
    const nonBlank = out.split('\n').filter((line) => line.trim());
    assert.strictEqual(nonBlank.length, 1, 'exactly one line of output');

    const obj = JSON.parse(nonBlank[0]);
    const keys = Object.keys(obj).sort();
    assert.deepStrictEqual(keys, ['cost', 'result', 'stopReason', 'toolCalls', 'usage', 'verifyStatus']);
    assert.strictEqual(obj.result, 'Wrote it.');
    assert.strictEqual(obj.stopReason, 'end_turn', 'natural completion reports end_turn');
    assert.strictEqual(obj.verifyStatus, 'skipped', 'no verify command configured → skipped');
    assert.strictEqual(obj.cost, null);
    // 30+10 (tool turn) + 12+7 (final)
    assert.strictEqual(obj.usage.total_tokens, 59);
    assert.strictEqual(obj.usage.turns, 2);

    assert.strictEqual(obj.toolCalls.length, 1, 'the write_file tool call is recorded');
    const recorded = obj.toolCalls[0];
    assert.strictEqual(recorded.tool, 'write');
    assert.deepStrictEqual(recorded.args, ['hl.txt', 'hi']);
    assert.strictEqual(recorded.ok, true);

    const written = fs.readFileSync(path.join(CWD, 'hl.txt'), 'utf8');
    assert.strictEqual(written, 'hi', 'tool actually executed');
  } finally {
    await mock.close();
  }
});
135
+
136
// json mode with a price override: one million tokens each way at 2 / 8 per
// Mtok must be reported as a cost of exactly 10.
test('json mode computes cost from the price table when the model is priced', async () => {
  const mock = await startMockLLM();
  mock.replyWith('Answer.', { usage: { prompt_tokens: 1_000_000, completion_tokens: 1_000_000 } });
  try {
    const { runAgentLoop } = buildRunner(mock.base);
    const pieces = [];
    const sink = (s) => {
      pieces.push(s);
      return true;
    };
    await runHeadless({
      runAgentLoop,
      messages: [{ role: 'user', content: 'q' }],
      model: 'test-model',
      mode: 'json',
      maxIterations: 10,
      agentOpts: { systemPromptMode: 'system_role' },
      // per Mtok
      priceOverrides: { 'test-model': { input: 2, output: 8 } },
      write: sink,
    });
    const printed = pieces.join('').trim();
    const obj = JSON.parse(printed);
    assert.strictEqual(obj.cost, 10, '1M in × $2 + 1M out × $8 = $10');
  } finally {
    await mock.close();
  }
});
155
+
156
+ // ---------------------------------------------------------------------------
157
+ // stream-json mode
158
+ // ---------------------------------------------------------------------------
159
+
160
// stream-json end to end: every non-blank output line must parse as JSON, a
// tool event must appear, and the result event must come last.
test('stream-json mode emits valid NDJSON events (assistant / tool / result)', async () => {
  const mock = await startMockLLM();
  mock.replyWith('<write_file path="s.txt">x</write_file>', { usage: { prompt_tokens: 3, completion_tokens: 1 } });
  mock.replyWith('Done streaming.', { usage: { prompt_tokens: 5, completion_tokens: 2 } });
  try {
    const prompt = [{ role: 'user', content: 'go' }];
    const { out } = await runCapture('stream-json', buildRunner(mock.base), prompt);

    assert.ok(!HAS_ANSI(out), 'no ANSI escapes leak into stream-json stdout');

    // Parse line by line; JSON.parse throws on the first malformed line.
    const events = [];
    for (const line of out.split('\n')) {
      if (line.trim()) events.push(JSON.parse(line));
    }
    const types = events.map(({ type }) => type);
    assert.ok(types.includes('tool'), 'a tool event was emitted');
    assert.ok(types.includes('result'), 'a terminal result event was emitted');
    assert.strictEqual(types[types.length - 1], 'result', 'result is last');

    const toolEvent = events.find(({ type }) => type === 'tool');
    assert.strictEqual(toolEvent.tool, 'write');
    assert.deepStrictEqual(toolEvent.args, ['s.txt', 'x']);

    const resultEvent = events[events.length - 1];
    assert.strictEqual(resultEvent.result, 'Done streaming.');
    assert.strictEqual(resultEvent.cost, null);
    // 3+1 + 5+2
    assert.strictEqual(resultEvent.usage.total_tokens, 11);
  } finally {
    await mock.close();
  }
});
187
+
188
+ // ---------------------------------------------------------------------------
189
+ // text mode does not emit machine output
190
+ // ---------------------------------------------------------------------------
191
+
192
// text mode: the loop result is still returned, but nothing resembling the
// JSON result envelope reaches the sink.
test('text mode returns the loop result and emits no JSON envelope', async () => {
  const mock = await startMockLLM();
  mock.replyWith('Just a plain answer.');
  try {
    const prompt = [{ role: 'user', content: 'hello' }];
    const { out, res } = await runCapture('text', buildRunner(mock.base), prompt);

    const hasMessages = res && Array.isArray(res.messages);
    assert.ok(hasMessages, 'runHeadless returns the loop result');

    const looksLikeEnvelope = /"result"\s*:/.test(out);
    assert.ok(!looksLikeEnvelope, 'text mode does not print a json result envelope');
  } finally {
    await mock.close();
  }
});
@@ -0,0 +1,88 @@
1
+ 'use strict';
2
+
3
+ // Unit tests for the pure history helpers extracted in Task 1.5.
4
+
5
+ const { test } = require('node:test');
6
+ const assert = require('node:assert');
7
+
8
+ const { cleanOrphanedToolMessages, reconstructLoadedMessage } = require('../lib/commands/history-utils');
9
+
10
// A call with its matching result is already consistent: nothing is dropped
// and the message list comes back deep-equal to the input.
test('cleanOrphanedToolMessages keeps fully-paired tool calls/results', () => {
  const pairedCall = { id: 'a', type: 'function', function: { name: 'read', arguments: '{}' } };
  const history = [
    { role: 'user', content: 'hi' },
    { role: 'assistant', content: '', tool_calls: [pairedCall] },
    { role: 'tool', tool_call_id: 'a', content: 'result' },
  ];
  const cleaned = cleanOrphanedToolMessages(history);
  assert.deepStrictEqual(cleaned.messages, history);
  assert.strictEqual(cleaned.droppedTool, 0);
  assert.strictEqual(cleaned.droppedAssistantCalls, 0);
  assert.strictEqual(cleaned.droppedAssistantMsgs, 0);
});
22
+
23
// A tool result whose id no assistant tool_call announced is removed.
test('drops a tool result with no matching tool_call', () => {
  const userMsg = { role: 'user', content: 'hi' };
  const orphanResult = { role: 'tool', tool_call_id: 'orphan', content: 'x' };
  const cleaned = cleanOrphanedToolMessages([userMsg, orphanResult]);
  assert.deepStrictEqual(cleaned.messages, [{ role: 'user', content: 'hi' }]);
  assert.strictEqual(cleaned.droppedTool, 1);
});
32
+
33
// A tool result carrying an empty tool_call_id is removed as well.
test('drops a tool result with empty/missing tool_call_id', () => {
  const history = [{ role: 'tool', tool_call_id: '', content: 'x' }];
  const { messages, droppedTool } = cleanOrphanedToolMessages(history);
  assert.strictEqual(messages.length, 0);
  assert.strictEqual(droppedTool, 1);
});
38
+
39
// An assistant message with text survives; only its unanswered tool_calls
// property is stripped.
test('strips an unpaired tool_call but keeps the assistant message when it has content', () => {
  const unpairedCall = { id: 'unpaired', type: 'function', function: { name: 'x', arguments: '{}' } };
  const history = [{ role: 'assistant', content: 'here you go', tool_calls: [unpairedCall] }];
  const cleaned = cleanOrphanedToolMessages(history);
  assert.strictEqual(cleaned.messages.length, 1);
  const survivor = cleaned.messages[0];
  assert.strictEqual(survivor.content, 'here you go');
  assert.ok(!('tool_calls' in survivor), 'unpaired tool_calls removed');
  assert.strictEqual(cleaned.droppedAssistantCalls, 1);
  assert.strictEqual(cleaned.droppedAssistantMsgs, 0);
});
50
+
51
// With no text and no answered call, the assistant message itself is dropped
// and both counters register it.
test('drops an assistant message that has only unpaired tool_calls and no content', () => {
  const unpairedCall = { id: 'unpaired', type: 'function', function: { name: 'x', arguments: '{}' } };
  const history = [{ role: 'assistant', content: '', tool_calls: [unpairedCall] }];
  const { messages, droppedAssistantCalls, droppedAssistantMsgs } = cleanOrphanedToolMessages(history);
  assert.strictEqual(messages.length, 0);
  assert.strictEqual(droppedAssistantCalls, 1);
  assert.strictEqual(droppedAssistantMsgs, 1);
});
60
+
61
// Two calls announced, only 'a' answered: 'a' is kept and 'b' is counted as
// dropped.
test('partially-paired tool_calls keep only the paired ids', () => {
  const makeCall = (id, name) => ({ id, type: 'function', function: { name, arguments: '{}' } });
  const history = [
    { role: 'assistant', content: '', tool_calls: [makeCall('a', 'x'), makeCall('b', 'y')] },
    { role: 'tool', tool_call_id: 'a', content: 'ra' },
  ];
  const cleaned = cleanOrphanedToolMessages(history);
  const keptCalls = cleaned.messages[0].tool_calls;
  assert.strictEqual(keptCalls.length, 1);
  assert.strictEqual(keptCalls[0].id, 'a');
  assert.strictEqual(cleaned.droppedAssistantCalls, 1);
});
74
+
75
// reconstructLoadedMessage: role/content always come through; tool_call_id
// and tool_calls only when they are non-empty.
test('reconstructLoadedMessage carries role/content and only present optional fields', () => {
  const plainUser = reconstructLoadedMessage({ role: 'user', content: 'hi' });
  assert.deepStrictEqual(plainUser, { role: 'user', content: 'hi' });

  const toolWithId = reconstructLoadedMessage({ role: 'tool', content: 'r', tool_call_id: 'a' });
  assert.deepStrictEqual(toolWithId, { role: 'tool', content: 'r', tool_call_id: 'a' });

  // Empty tool_call_id and empty tool_calls are omitted.
  const toolWithEmpties = reconstructLoadedMessage({ role: 'tool', content: 'r', tool_call_id: '', tool_calls: [] });
  assert.deepStrictEqual(toolWithEmpties, { role: 'tool', content: 'r' });

  const withCalls = reconstructLoadedMessage({ role: 'assistant', content: '', tool_calls: [{ id: 'a' }] });
  assert.deepStrictEqual(withCalls.tool_calls, [{ id: 'a' }]);
});
@@ -0,0 +1,238 @@
1
+ 'use strict';
2
+
3
+ // Integration tests for lifecycle hooks (Task 3.4) driving the REAL runAgentLoop
4
+ // against the mock-LLM harness, with the REAL createHookRunner reading
5
+ // config.hooks (so spawnSync actually runs the hook commands). Hook commands use
6
+ // `node -e …` so they are portable across the CI matrix (Linux/macOS/Windows).
7
+ // Sentinel paths are passed via env vars (merged into the hook's environment) to
8
+ // avoid embedding OS-specific path separators in a `node -e` string literal.
9
+
10
+ const { test, before, after } = require('node:test');
11
+ const assert = require('node:assert');
12
+ const fs = require('fs');
13
+ const os = require('os');
14
+ const path = require('path');
15
+
16
+ const ui = require('../lib/ui');
17
+ const { createApiClient } = require('../lib/api');
18
+ const { createToolExecutor, extractToolCalls } = require('../lib/tools');
19
+ const { createPermissionManager } = require('../lib/permissions');
20
+ const { createAgentRunner } = require('../lib/agent');
21
+ const { startMockLLM } = require('./harness/mock-llm');
22
+
23
// SEMALT_API_KEY as it was before the suite ran (undefined when unset).
let prevKey;

// Pin a dummy API key for the whole suite.
before(() => {
  prevKey = process.env.SEMALT_API_KEY;
  process.env.SEMALT_API_KEY = 'test-key';
});

// Put the variable back exactly as found: restore the value, or remove it.
after(() => {
  if (prevKey !== undefined) {
    process.env.SEMALT_API_KEY = prevKey;
  } else {
    delete process.env.SEMALT_API_KEY;
  }
});

// Path of the running Node binary; JSON.stringify wraps it in double quotes
// before it is interpolated into the hook command strings below.
const NODE = JSON.stringify(process.execPath);
31
+
32
+ // buildRunner mirrors agent-loop.test.js, but threads `hooks` into the config so
33
+ // the real hook runner (built inside createAgentRunner from getConfig) sees them.
34
/**
 * Assemble an agent runner from the real modules with `hooks` placed in the
 * config that `getConfig` hands out.
 *
 * @param {string} base - value used as `api_base`.
 * @param {object} [hooks] - hook table stored under `config.hooks`; a falsy
 *   value becomes `{}`.
 * @returns {{runner: object, config: object}} the runner and the live config
 *   object it reads.
 */
function buildRunner(base, hooks) {
  const config = {
    api_base: base,
    api_key: 'test-key',
    default_model: 'test-model',
    temperature: 0.5,
    request_timeout_ms: 5000,
    stream: true,
    models: [],
    hooks: hooks || {},
    // This suite tests hook ORCHESTRATION, not the OS sandbox (covered by
    // hooks-verify-sandbox.test.js). Disable the sandbox so the command hooks
    // run deterministically across the CI matrix regardless of bwrap/Seatbelt.
    sandbox: { mode: 'off' },
  };
  const getConfig = () => config;
  // Saved settings are merged into the same in-memory object.
  const saveConfig = (patch) => Object.assign(config, patch);

  const { chatStream } = createApiClient({ getConfig, saveConfig, ui });

  const permissionManager = createPermissionManager(ui, { skipPermissions: true });
  permissionManager.setUICallbacks({
    onAddMessage: () => {},
    onShowModal: () => {},
    onCloseModal: () => {},
    onCaptureNavigation: () => () => {},
  });

  const executor = createToolExecutor(permissionManager, ui, getConfig);
  const { agentExecShell, agentExecFile, describePermission } = executor;

  const runner = createAgentRunner({
    chatStream,
    extractToolCalls,
    agentExecShell,
    agentExecFile,
    describePermission,
    permissionManager,
    ui,
    getConfig,
  });
  return { runner, config };
}
56
+
57
/**
 * Create a fresh event log together with the callbacks that fill it.
 *
 * @returns {{ev: {tools: Array, errors: Array, assistants: Array}, cb: object}}
 *   `cb` exposes onToolEnd / onError / onAssistantMessage; each appends its
 *   arguments to the matching `ev` list and returns the list's new length.
 */
function collector() {
  const tools = [];
  const errors = [];
  const assistants = [];
  const ev = { tools, errors, assistants };
  const cb = {
    onToolEnd: (tag, result) => tools.push({ tag, result }),
    onError: (e) => errors.push(e),
    onAssistantMessage: (m) => assistants.push(m),
  };
  return { ev, cb };
}
66
+
67
/**
 * Create a new uniquely named directory under the OS temp dir.
 *
 * @returns {string} path of the created directory (prefix `semalt-hooks-`).
 */
function tmpdir() {
  const prefix = path.join(os.tmpdir(), 'semalt-hooks-');
  return fs.mkdtempSync(prefix);
}
68
+
69
+ // ---------------------------------------------------------------------------
70
+ // 1. PreToolUse hook blocks a tool (non-zero exit) — the tool never runs
71
+ // ---------------------------------------------------------------------------
72
+
73
// A PreToolUse hook exiting 1 must stop the write: no file appears on disk,
// onToolEnd never fires, and the hook's stdout reaches the model as the reason.
test('PreToolUse hook with a non-zero exit BLOCKS the tool; it does not run and the agent gets the reason', async () => {
  const dir = tmpdir();
  const sentinel = path.join(dir, 'written.txt');
  const blockingHook = {
    type: 'command',
    matcher: '*',
    command: `${NODE} -e "process.stdout.write('blocked: writes are frozen'); process.exit(1)"`,
  };
  const mock = await startMockLLM();
  // The model tries to write a file; the hook must stop it.
  mock.replyWith(`<write_file path="${sentinel}">DATA</write_file>`);
  mock.replyWith('Understood, I will not write.');
  try {
    const { runner } = buildRunner(mock.base, { PreToolUse: [blockingHook] });
    const { ev, cb } = collector();
    const messages = [{ role: 'user', content: 'write the file' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });

    assert.ok(!fs.existsSync(sentinel), 'the write tool was blocked — no file on disk');
    assert.strictEqual(ev.tools.length, 0, 'a blocked tool never reaches onToolEnd');

    const isToolFeedback = (m) => m.role === 'user' && /Tool execution results/.test(m.content);
    const fed = messages.find(isToolFeedback);
    assert.ok(fed, 'the blocked-tool result is fed back to the model');
    assert.match(fed.content, /BLOCKED by a PreToolUse hook/);
    assert.match(fed.content, /writes are frozen/, 'hook stdout is the block reason');

    const isFinalReply = (m) => m.role === 'assistant' && m.content === 'Understood, I will not write.';
    assert.ok(messages.some(isFinalReply));
  } finally {
    await mock.close();
    fs.rmSync(dir, { recursive: true, force: true });
  }
});
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // 2. PostToolUse hook observes a result and injects feedback
105
+ // ---------------------------------------------------------------------------
106
+
107
// A PostToolUse hook runs after a real read_file call; its stdout must be
// appended to the tool feedback inside the untrusted-content fence.
test('PostToolUse hook observes the tool result and its stdout is appended (untrusted-fenced)', async () => {
  const dir = tmpdir();
  const file = path.join(dir, 'fixture.txt');
  fs.writeFileSync(file, 'FILE_BODY_7', 'utf8');
  const observingHook = {
    type: 'command',
    // Echo back the tool name the hook saw via env — proof it observed the call.
    command: `${NODE} -e "process.stdout.write('POSTHOOK_SAW=' + process.env.SEMALT_TOOL_NAME)"`,
  };
  const mock = await startMockLLM();
  mock.replyWith(`<read_file>${file}</read_file>`);
  mock.replyWith('Read it.');
  try {
    const { runner } = buildRunner(mock.base, { PostToolUse: [observingHook] });
    const { ev, cb } = collector();
    const messages = [{ role: 'user', content: 'read the file' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });

    assert.strictEqual(ev.tools.length, 1);
    const [firstTool] = ev.tools;
    assert.strictEqual(firstTool.tag, 'read');

    const isToolFeedback = (m) => m.role === 'user' && /Tool execution results/.test(m.content);
    const fed = messages.find(isToolFeedback);
    assert.ok(fed);
    assert.match(fed.content, /FILE_BODY_7/, 'the real tool result is present');
    assert.match(fed.content, /POSTHOOK_SAW=read/, 'PostToolUse stdout was appended');
    assert.match(fed.content, /UNTRUSTED_EXTERNAL_CONTENT/, 'hook feedback is fenced as untrusted');
  } finally {
    await mock.close();
    fs.rmSync(dir, { recursive: true, force: true });
  }
});
137
+
138
+ // ---------------------------------------------------------------------------
139
+ // 3. UserPromptSubmit hook injects context before the loop
140
+ // ---------------------------------------------------------------------------
141
+
142
// A UserPromptSubmit hook's stdout must appear in the conversation as a user
// message wrapped in the untrusted-content fence.
test('UserPromptSubmit hook stdout is injected into the conversation as context', async () => {
  const injectingHook = {
    type: 'command',
    command: `${NODE} -e "process.stdout.write('INJECTED_CONTEXT_42')"`,
  };
  const mock = await startMockLLM();
  mock.replyWith('Acknowledged.');
  try {
    const { runner } = buildRunner(mock.base, { UserPromptSubmit: [injectingHook] });
    const { cb } = collector();
    const messages = [{ role: 'user', content: 'hello' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });

    const carriesInjection = (m) => m.role === 'user' && /INJECTED_CONTEXT_42/.test(m.content);
    const injected = messages.find(carriesInjection);
    assert.ok(injected, 'hook output was injected as a user message');
    assert.match(injected.content, /UNTRUSTED_EXTERNAL_CONTENT/, 'injected context is fenced as untrusted');
  } finally {
    await mock.close();
  }
});
162
+
163
+ // ---------------------------------------------------------------------------
164
+ // 4. A failing hook does not crash the loop
165
+ // ---------------------------------------------------------------------------
166
+
167
// A PostToolUse hook that exits 3 without output must not break the loop:
// the tool still runs and both turns are recorded.
test('a failing (non-zero, no-output) PostToolUse hook is contained — the loop completes normally', async () => {
  const failingHook = { type: 'command', command: `${NODE} -e "process.exit(3)"` };
  const mock = await startMockLLM();
  mock.replyWith('<exec>echo hi</exec>');
  mock.replyWith('All good.');
  try {
    const { runner } = buildRunner(mock.base, { PostToolUse: [failingHook] });
    const { ev, cb } = collector();
    const messages = [{ role: 'user', content: 'run it' }];
    const outcome = await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
    const { metrics } = outcome;

    assert.strictEqual(ev.tools.length, 1, 'the tool still executed');
    assert.strictEqual(metrics.turns.length, 2, 'tool turn + final turn — the loop did not crash');

    const isFinalReply = (m) => m.role === 'assistant' && m.content === 'All good.';
    assert.ok(messages.some(isFinalReply));
  } finally {
    await mock.close();
  }
});
185
+
186
+ // ---------------------------------------------------------------------------
187
+ // 5. A deny-listed hook command is never executed
188
+ // ---------------------------------------------------------------------------
189
+
190
// A PreToolUse hook whose command is on the deny list must be skipped: it
// never runs, and because it never ran it cannot block the tool call.
test('a deny-listed PreToolUse hook command is skipped (never run) and does not block the tool', async () => {
  // The command is deliberately destructive-looking; the behavior under test
  // is that the hook runner refuses to execute it.
  const hooks = { PreToolUse: [{ type: 'command', matcher: '*', command: 'rm -rf /' }] };
  const mock = await startMockLLM();
  mock.replyWith('<exec>echo ALLOWED</exec>');
  mock.replyWith('Done.');
  try {
    const { runner } = buildRunner(mock.base, hooks);
    const { ev, cb } = collector();
    const messages = [{ role: 'user', content: 'go' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });

    // The deny-listed hook is skipped, so the tool is NOT blocked and runs.
    assert.strictEqual(ev.tools.length, 1, 'tool ran — a denied hook does not block');
    const fed = messages.find((m) => m.role === 'user' && /Tool execution results/.test(m.content));
    // Assert the feedback message exists before reading it, so its absence is
    // reported as an assertion failure instead of a TypeError on `fed.content`.
    assert.ok(fed, 'the tool result is fed back to the model');
    assert.match(fed.content, /ALLOWED/);
    assert.ok(!fed.content.includes('BLOCKED by a PreToolUse hook'), 'no spurious block');
  } finally {
    await mock.close();
  }
});
210
+
211
+ // ---------------------------------------------------------------------------
212
+ // 6. Stop hook fires once when the turn ends
213
+ // ---------------------------------------------------------------------------
214
+
215
// A Stop hook must run once the loop finishes a turn. The hook writes a
// sentinel file whose path is passed through an env var, which keeps
// OS-specific path separators out of the `node -e` string literal.
test('Stop hook fires when the agent loop finishes the turn', async () => {
  const dir = tmpdir();
  const sentinel = path.join(dir, 'stopped.txt');
  const prev = process.env.SEMALT_TEST_STOP_FILE;
  const hooks = { Stop: [{
    type: 'command',
    command: `${NODE} -e "require('fs').writeFileSync(process.env.SEMALT_TEST_STOP_FILE,'x')"`,
  }] };
  let mock;
  try {
    // The env mutation and the mock startup happen inside the try so that a
    // failure in either still restores the variable and removes the temp dir.
    process.env.SEMALT_TEST_STOP_FILE = sentinel;
    mock = await startMockLLM();
    mock.replyWith('Final answer, no tools.');

    const { runner } = buildRunner(mock.base, hooks);
    const { cb } = collector();
    const messages = [{ role: 'user', content: 'just answer' }];
    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
    assert.ok(fs.existsSync(sentinel), 'the Stop hook ran at end of turn');
  } finally {
    // Restore process-global state first: it is synchronous and must happen
    // even if closing the mock server rejects.
    if (prev === undefined) delete process.env.SEMALT_TEST_STOP_FILE;
    else process.env.SEMALT_TEST_STOP_FILE = prev;
    try {
      if (mock) await mock.close();
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  }
});