@semalt-ai/code 1.8.5 → 1.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +6 -1
- package/.github/workflows/ci.yml +69 -0
- package/CLAUDE.md +1584 -26
- package/README.md +147 -3
- package/examples/embed.js +74 -0
- package/index.js +251 -10
- package/lib/agent.js +711 -104
- package/lib/api.js +213 -49
- package/lib/args.js +74 -2
- package/lib/audit.js +23 -1
- package/lib/background.js +584 -0
- package/lib/checkpoints.js +757 -0
- package/lib/commands/auth.js +94 -0
- package/lib/commands/chat-session.js +306 -0
- package/lib/commands/chat-slash.js +399 -0
- package/lib/commands/chat-turn.js +446 -0
- package/lib/commands/chat.js +403 -0
- package/lib/commands/custom.js +157 -0
- package/lib/commands/history-utils.js +66 -0
- package/lib/commands/index.js +268 -0
- package/lib/commands/mcp.js +113 -0
- package/lib/commands/oneshot.js +193 -0
- package/lib/commands/registry.js +269 -0
- package/lib/commands/tasks.js +89 -0
- package/lib/compact.js +87 -0
- package/lib/config.js +333 -11
- package/lib/constants.js +372 -3
- package/lib/deny.js +199 -0
- package/lib/doctor.js +160 -0
- package/lib/headless.js +167 -0
- package/lib/hooks.js +286 -0
- package/lib/images.js +264 -0
- package/lib/internals.js +49 -0
- package/lib/mcp/boundary.js +131 -0
- package/lib/mcp/client.js +270 -0
- package/lib/mcp/oauth.js +134 -0
- package/lib/memory.js +209 -0
- package/lib/metrics.js +37 -2
- package/lib/payload.js +54 -0
- package/lib/permission-rules.js +401 -0
- package/lib/permissions.js +100 -10
- package/lib/pricing.js +67 -0
- package/lib/proc.js +62 -0
- package/lib/prompts.js +84 -5
- package/lib/sandbox.js +568 -0
- package/lib/sdk.js +328 -0
- package/lib/secrets.js +211 -0
- package/lib/skills.js +223 -0
- package/lib/subagents.js +516 -0
- package/lib/tool_registry.js +2558 -0
- package/lib/tool_specs.js +222 -2
- package/lib/tools.js +272 -1020
- package/lib/ui/format.js +22 -1
- package/lib/ui/input-field.js +16 -7
- package/lib/ui/status-bar.js +79 -11
- package/lib/ui/theme.js +1 -0
- package/lib/ui/web-activity.js +218 -0
- package/lib/verify.js +229 -0
- package/lib/web-extract.js +213 -0
- package/lib/web-summarize.js +68 -0
- package/package.json +19 -4
- package/scripts/lint.js +57 -0
- package/test/agent-loop.test.js +389 -0
- package/test/background.test.js +414 -0
- package/test/chat.test.js +114 -0
- package/test/checkpoints-agent.test.js +181 -0
- package/test/checkpoints.test.js +650 -0
- package/test/command-registry.test.js +160 -0
- package/test/compact.test.js +116 -0
- package/test/completion-lazy.test.js +52 -0
- package/test/config-merge.test.js +324 -0
- package/test/config-quarantine.test.js +128 -0
- package/test/config-write-guard-allow-anywhere.test.js +56 -0
- package/test/config-write-guard-skip.test.js +46 -0
- package/test/config-write-guard.test.js +153 -0
- package/test/context-split.test.js +215 -0
- package/test/cost-doctor.test.js +142 -0
- package/test/custom-commands-chat.test.js +106 -0
- package/test/custom-commands.test.js +230 -0
- package/test/deny-windows.test.js +120 -0
- package/test/deny.test.js +83 -0
- package/test/download-allow-anywhere.test.js +66 -0
- package/test/download-confine.test.js +153 -0
- package/test/executors.test.js +362 -0
- package/test/extract-tool-calls.test.js +315 -0
- package/test/fetch-url-validation.test.js +219 -0
- package/test/fixtures/tool-calls.js +57 -0
- package/test/fixtures/web-page.js +91 -0
- package/test/git-tools.test.js +384 -0
- package/test/grep-glob-serialize.test.js +242 -0
- package/test/grep-glob.test.js +268 -0
- package/test/harness/README.md +57 -0
- package/test/harness/chat-harness.js +142 -0
- package/test/harness/memwarn-headless-child.js +65 -0
- package/test/harness/mock-llm.js +120 -0
- package/test/harness/mock-mcp-server.js +142 -0
- package/test/harness/sse-server.js +69 -0
- package/test/headless.test.js +203 -0
- package/test/history-utils.test.js +88 -0
- package/test/hooks-agent.test.js +238 -0
- package/test/hooks-verify-sandbox.test.js +232 -0
- package/test/hooks.test.js +216 -0
- package/test/http-get-user-agent.test.js +142 -0
- package/test/images-api.test.js +208 -0
- package/test/images.test.js +238 -0
- package/test/max-iterations.test.js +216 -0
- package/test/mcp-boundary.test.js +57 -0
- package/test/mcp-client.test.js +267 -0
- package/test/mcp-oauth.test.js +86 -0
- package/test/memory-truncation-warning.test.js +222 -0
- package/test/memory.test.js +198 -0
- package/test/native-dispatch.test.js +356 -0
- package/test/output-chokepoint.test.js +188 -0
- package/test/path-guards.test.js +134 -0
- package/test/payload.test.js +99 -0
- package/test/permission-rules-agent.test.js +210 -0
- package/test/permission-rules.test.js +297 -0
- package/test/permissions.test.js +163 -0
- package/test/plan-mode.test.js +167 -0
- package/test/read-paginate.test.js +275 -0
- package/test/readonly-tools.test.js +177 -0
- package/test/result-cap.test.js +233 -0
- package/test/sandbox-agent.test.js +147 -0
- package/test/sandbox-integration.test.js +216 -0
- package/test/sandbox.test.js +408 -0
- package/test/sdk.test.js +234 -0
- package/test/shell-output-cap.test.js +181 -0
- package/test/skills-chat.test.js +110 -0
- package/test/skills.test.js +295 -0
- package/test/smoke.test.js +68 -0
- package/test/status-bar-pause.test.js +164 -0
- package/test/stream-parser.test.js +147 -0
- package/test/subagents-agent.test.js +178 -0
- package/test/subagents.test.js +222 -0
- package/test/tool-registry.test.js +85 -0
- package/test/trim-budget.test.js +101 -0
- package/test/verify-agent.test.js +317 -0
- package/test/verify.test.js +141 -0
- package/test/web-activity-ordering.test.js +194 -0
- package/test/web-activity.test.js +207 -0
- package/test/web-data-extraction-guidance.test.js +71 -0
- package/test/web-extract.test.js +185 -0
- package/test/web-fetch-agent.test.js +291 -0
- package/test/web-fetch-mode.test.js +193 -0
- package/test/web-search.test.js +380 -0
- package/lib/commands.js +0 -1438
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Characterization tests for project memory (Task 2.3): the AGENTS.md/CLAUDE.md
|
|
4
|
+
// hierarchy loader and its injection into the system prompt. Filesystem work is
|
|
5
|
+
// isolated under temp dirs; HOME is redirected before requiring lib modules so
|
|
6
|
+
// the global-memory level resolves under the temp home.
|
|
7
|
+
|
|
8
|
+
const os = require('node:os');
|
|
9
|
+
const fs = require('node:fs');
|
|
10
|
+
const path = require('node:path');
|
|
11
|
+
|
|
12
|
+
const TMP_HOME = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-memhome-')));
|
|
13
|
+
const PREV_HOME = process.env.HOME;
|
|
14
|
+
const PREV_USERPROFILE = process.env.USERPROFILE;
|
|
15
|
+
process.env.HOME = TMP_HOME;
|
|
16
|
+
process.env.USERPROFILE = TMP_HOME;
|
|
17
|
+
|
|
18
|
+
const { test, after } = require('node:test');
|
|
19
|
+
const assert = require('node:assert');
|
|
20
|
+
|
|
21
|
+
const {
|
|
22
|
+
loadProjectMemory,
|
|
23
|
+
discoverMemoryFiles,
|
|
24
|
+
findRepoRoot,
|
|
25
|
+
memoryStatusLines,
|
|
26
|
+
} = require('../lib/memory');
|
|
27
|
+
const { getSystemPrompt } = require('../lib/prompts');
|
|
28
|
+
|
|
29
|
+
const PREV_CWD = process.cwd();
|
|
30
|
+
// Undo every process-wide change this file made: the working directory, the
// HOME/USERPROFILE redirection, and the temporary home directory itself.
after(() => {
  // Leave any temp directory before trying to delete it.
  process.chdir(PREV_CWD);
  if (PREV_HOME === undefined) delete process.env.HOME; else process.env.HOME = PREV_HOME;
  if (PREV_USERPROFILE === undefined) delete process.env.USERPROFILE; else process.env.USERPROFILE = PREV_USERPROFILE;
  // Best-effort removal of the redirected home so repeated runs do not pile up
  // temp directories; a cleanup failure must not fail the suite.
  try { fs.rmSync(TMP_HOME, { recursive: true, force: true }); } catch {}
});
|
|
35
|
+
|
|
36
|
+
/**
 * Create a fresh directory under the OS temp dir that contains an empty
 * `.git` directory.
 *
 * @param {string} prefix - Prefix handed to `fs.mkdtempSync` for the dir name.
 * @returns {string} The real (symlink-resolved) path of the created directory.
 */
function mkRepo(prefix) {
  const created = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
  const root = fs.realpathSync(created);
  const gitDir = path.join(root, '.git');
  fs.mkdirSync(gitDir, { recursive: true });
  return root;
}
|
|
41
|
+
/**
 * Write `data` to the file at `p`, creating any missing parent directories.
 *
 * @param {string} p - Destination file path.
 * @param {string|Buffer} data - Content passed straight to `fs.writeFileSync`.
 */
function write(p, data) {
  const parent = path.dirname(p);
  fs.mkdirSync(parent, { recursive: true });
  fs.writeFileSync(p, data);
}
|
|
42
|
+
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Absent files → no memory, base prompt unchanged
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
test('no memory files present → empty block and no files', () => {
  // Both the home directory and the repo are freshly created, so neither
  // holds a memory file.
  const home = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-emptyhome-')));
  const cwd = mkRepo('semalt-mem-none-');

  const loaded = loadProjectMemory({ cwd, home });

  assert.strictEqual(loaded.block, '');
  assert.deepStrictEqual(loaded.files, []);
  assert.strictEqual(loaded.truncated, false);
});
|
|
55
|
+
|
|
56
|
+
test('getSystemPrompt is byte-for-byte the base prompt when memory is empty', () => {
  const plainBase = getSystemPrompt(false, '');
  assert.ok(!plainBase.includes('PROJECT_MEMORY'), 'no memory section in the base prompt');

  // A non-empty memory argument must land verbatim after the base text, which
  // shows the memory is append-only and that an empty argument yields the base.
  const plainWithMemory = getSystemPrompt(false, '\n\nMEMBLOCK');
  assert.strictEqual(plainWithMemory, `${plainBase}\n\nMEMBLOCK`);

  // The native template must behave the same way.
  const nativeBase = getSystemPrompt(true, '');
  const nativeWithMemory = getSystemPrompt(true, '\n\nX');
  assert.strictEqual(nativeWithMemory, `${nativeBase}\n\nX`);
});
|
|
65
|
+
|
|
66
|
+
test('getSystemPrompt with no project memory in CWD/home equals the explicit-empty base', () => {
  const emptyHome = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-eh2-')));
  const dir = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-nomem-cwd-')));

  // Save each variable on its own: HOME and USERPROFILE need not hold the same
  // value, and either may be unset.
  const prevHome = process.env.HOME;
  const prevUserProfile = process.env.USERPROFILE;
  // Assigning `undefined` to process.env stores the string "undefined", so an
  // originally-unset variable has to be deleted instead.
  const restoreEnv = (key, value) => {
    if (value === undefined) delete process.env[key]; else process.env[key] = value;
  };

  try {
    // The mutations live inside the try so a failing chdir still restores the env.
    process.env.HOME = emptyHome;
    process.env.USERPROFILE = emptyHome;
    process.chdir(dir);
    // With no memory argument the prompt is built from an empty home and an
    // empty CWD, so it must equal the prompt built with explicitly empty memory.
    assert.strictEqual(getSystemPrompt(false), getSystemPrompt(false, ''));
  } finally {
    restoreEnv('HOME', prevHome);
    restoreEnv('USERPROFILE', prevUserProfile);
    process.chdir(PREV_CWD);
  }
});
|
|
79
|
+
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
// Hierarchy: global → project root → nested CWD, in order
|
|
82
|
+
// ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
test('hierarchy loads global, repo root, and nested CWD in order', () => {
  const globalDir = path.join(TMP_HOME, '.semalt-ai');
  try {
    // One memory file per level: global home, repo root, nested working dir.
    write(path.join(globalDir, 'AGENTS.md'), 'GLOBAL_MEM');
    const root = mkRepo('semalt-mem-hier-');
    write(path.join(root, 'AGENTS.md'), 'ROOT_MEM');
    const nested = path.join(root, 'pkg', 'sub');
    write(path.join(nested, 'AGENTS.md'), 'CWD_MEM');

    const r = loadProjectMemory({ cwd: nested, home: TMP_HOME });
    assert.deepStrictEqual(r.files.map((f) => f.source), ['global', 'project-root', 'cwd']);

    const gi = r.block.indexOf('GLOBAL_MEM');
    const ri = r.block.indexOf('ROOT_MEM');
    const ci = r.block.indexOf('CWD_MEM');
    assert.ok(gi !== -1 && ri !== -1 && ci !== -1, 'all three present');
    assert.ok(gi < ri && ri < ci, 'concatenated in hierarchy order');
    assert.ok(r.block.includes('<<<PROJECT_MEMORY>>>') && r.block.includes('<<<END_PROJECT_MEMORY>>>'));
  } finally {
    // Remove the global file even when an assertion above fails, so it cannot
    // leak into later tests that share TMP_HOME.
    fs.rmSync(globalDir, { recursive: true, force: true });
  }
});
|
|
103
|
+
|
|
104
|
+
test('CWD at the repo root is not double-loaded', () => {
  const home = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-eh3-')));
  const repo = mkRepo('semalt-mem-rootcwd-');
  write(path.join(repo, 'AGENTS.md'), 'ONLY_ROOT');

  // The working directory and the repo root are the same directory here, so
  // its memory file must be reported exactly once.
  const loaded = loadProjectMemory({ cwd: repo, home });
  const sources = loaded.files.map(({ source }) => source);
  assert.deepStrictEqual(sources, ['project-root']);
});
|
|
111
|
+
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
// AGENTS.md vs CLAUDE.md alias
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
test('AGENTS.md is preferred over CLAUDE.md and the choice is reported', () => {
  const home = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-eh4-')));
  const repo = mkRepo('semalt-mem-alias-');
  // Both candidate names exist side by side in the same directory.
  write(path.join(repo, 'AGENTS.md'), 'A_WINS');
  write(path.join(repo, 'CLAUDE.md'), 'C_IGNORED');

  const loaded = loadProjectMemory({ cwd: repo, home });

  assert.strictEqual(loaded.files.length, 1);
  const [chosen] = loaded.files;
  assert.strictEqual(chosen.name, 'AGENTS.md');
  assert.strictEqual(chosen.alsoPresent, true);
  assert.ok(loaded.block.includes('A_WINS'));
  assert.ok(!loaded.block.includes('C_IGNORED'), 'CLAUDE.md content is not loaded when AGENTS.md exists');
});
|
|
128
|
+
|
|
129
|
+
test('CLAUDE.md is used as the alias when AGENTS.md is absent', () => {
  const home = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-eh5-')));
  const repo = mkRepo('semalt-mem-claude-');
  // Only the alias file exists in the repo.
  write(path.join(repo, 'CLAUDE.md'), 'CLAUDE_ONLY');

  const loaded = loadProjectMemory({ cwd: repo, home });
  const [first] = loaded.files;

  assert.strictEqual(first.name, 'CLAUDE.md');
  assert.strictEqual(first.alsoPresent, false);
  assert.ok(loaded.block.includes('CLAUDE_ONLY'));
});
|
|
138
|
+
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
// Truncation
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
|
|
143
|
+
test('oversized memory is truncated with a visible notice', () => {
  const home = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-eh6-')));
  const repo = mkRepo('semalt-mem-big-');
  // 5000 characters of content against a 200-byte budget.
  const oversized = 'X'.repeat(5000);
  write(path.join(repo, 'AGENTS.md'), oversized);

  const loaded = loadProjectMemory({ cwd: repo, home, maxBytes: 200 });

  assert.strictEqual(loaded.truncated, true);
  assert.ok(/truncated/i.test(loaded.block), 'block carries a truncation notice');
});
|
|
151
|
+
|
|
152
|
+
// ---------------------------------------------------------------------------
|
|
153
|
+
// findRepoRoot + status lines
|
|
154
|
+
// ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
test('findRepoRoot locates the nearest .git ancestor and returns null without one', () => {
  // Two levels below a directory that holds `.git`.
  const repo = mkRepo('semalt-mem-root-');
  const deep = path.join(repo, 'a', 'b');
  fs.mkdirSync(deep, { recursive: true });
  assert.strictEqual(findRepoRoot(deep), repo);

  // a temp dir under the OS tmpdir is not inside a git repo
  const plainDir = fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-bare-'));
  const bare = fs.realpathSync(plainDir);
  assert.strictEqual(findRepoRoot(bare), null);
});
|
|
165
|
+
|
|
166
|
+
test('memoryStatusLines reports loaded files, alias note, and edit target', () => {
  // Hand-built result: one global file, plus a project-root file that has the
  // alias present alongside it.
  const globalFile = { path: '/home/u/.semalt-ai/AGENTS.md', source: 'global', name: 'AGENTS.md', alsoPresent: false };
  const rootFile = { path: '/repo/AGENTS.md', source: 'project-root', name: 'AGENTS.md', alsoPresent: true };
  const result = { files: [globalFile, rootFile], truncated: false };

  const report = memoryStatusLines(result).join('\n');

  assert.ok(report.includes('/home/u/.semalt-ai/AGENTS.md'));
  assert.ok(['[global]', '[project-root]'].every((label) => report.includes(label)));
  assert.ok(/CLAUDE\.md also present/.test(report), 'alias note shown');
  assert.ok(report.includes('Edit project memory: /repo/AGENTS.md'), 'points at the nearest project file');
});
|
|
180
|
+
|
|
181
|
+
test('memoryStatusLines handles the no-memory case', () => {
  const emptyResult = { files: [], truncated: false };
  const report = memoryStatusLines(emptyResult).join('\n');
  // An empty result must still produce an explicit "nothing found" line.
  assert.ok(/No project memory files found/.test(report));
});
|
|
185
|
+
|
|
186
|
+
// ---------------------------------------------------------------------------
|
|
187
|
+
// discoverMemoryFiles direct
|
|
188
|
+
// ---------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
test('discoverMemoryFiles returns ordered entries with resolved paths', () => {
  const home = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-eh7-')));
  const repo = mkRepo('semalt-disc-');
  write(path.join(repo, 'CLAUDE.md'), 'x');

  // Called with positional (cwd, home) arguments.
  const entries = discoverMemoryFiles(repo, home);

  assert.strictEqual(entries.length, 1);
  const [entry] = entries;
  assert.strictEqual(entry.source, 'project-root');
  assert.strictEqual(path.isAbsolute(entry.path), true);
});
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Native-path dispatch tests (Pre-Task 4.0c). Closes the coverage-shape blind
|
|
4
|
+
// spot the re-audit found: end-to-end dispatch through the REAL runAgentLoop was
|
|
5
|
+
// proven for read-only tools on BOTH rails (Task 3.3b), but EFFECTFUL tools were
|
|
6
|
+
// only ever exercised through the XML path. The native function-calling path has
|
|
7
|
+
// distinct glue — mapInvokeToCall → descriptor gate → role:'tool' result rooting
|
|
8
|
+
// (lib/agent.js ~1378) — and Phase 4's per-pattern permissions + checkpoints both
|
|
9
|
+
// hook the MUTATING dispatch path. Layering that onto unverified native glue would
|
|
10
|
+
// repeat the 3.3b mistake, so these tests lock the native path end-to-end for:
|
|
11
|
+
// * file-mutating tools (write, edit, delete, move),
|
|
12
|
+
// * shell/exec,
|
|
13
|
+
// * plan-mode withhold + approve,
|
|
14
|
+
// each asserting the mutation actually happens, the permission gate fires (with
|
|
15
|
+
// the right descriptor), and the result is rooted as a role:'tool' message on the
|
|
16
|
+
// originating tool_call_id. Where useful each native case is paired with its XML
|
|
17
|
+
// equivalent IN THE SAME TEST so the two rails are proven equivalent at the loop
|
|
18
|
+
// level for effectful tools (extending the 3.3b read-only equivalence proof).
|
|
19
|
+
//
|
|
20
|
+
// Driven via mock.replyWithToolCall(name, args) — a native tool_calls response
|
|
21
|
+
// with EMPTY text content — against a temp $cwd so isPathSafe (CWD-confined)
|
|
22
|
+
// permits the writes. skipPermissions auto-approves so the loop runs unattended;
|
|
23
|
+
// pm.askPermission is wrapped to RECORD each gate consultation so we can assert
|
|
24
|
+
// the descriptor fired for mutating tools and did NOT for read-only ones.
|
|
25
|
+
|
|
26
|
+
const { test, before, after } = require('node:test');
|
|
27
|
+
const assert = require('node:assert');
|
|
28
|
+
const os = require('node:os');
|
|
29
|
+
const fs = require('node:fs');
|
|
30
|
+
const path = require('node:path');
|
|
31
|
+
|
|
32
|
+
const ui = require('../lib/ui');
|
|
33
|
+
const { createApiClient } = require('../lib/api');
|
|
34
|
+
const { createToolExecutor, extractToolCalls } = require('../lib/tools');
|
|
35
|
+
const { createPermissionManager } = require('../lib/permissions');
|
|
36
|
+
const { createAgentRunner } = require('../lib/agent');
|
|
37
|
+
const { startMockLLM } = require('./harness/mock-llm');
|
|
38
|
+
|
|
39
|
+
let prevKey;
|
|
40
|
+
let CWD;
|
|
41
|
+
let PREV_CWD;
|
|
42
|
+
// Suite setup: pin a dummy API key and move into a private temp directory
// that serves as the working directory for every test in this file.
before(() => {
  prevKey = process.env.SEMALT_API_KEY;
  process.env.SEMALT_API_KEY = 'test-key';

  PREV_CWD = process.cwd();
  const scratch = fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-native-'));
  CWD = fs.realpathSync(scratch);
  process.chdir(CWD);
});
|
|
48
|
+
// Suite teardown: return to the original working directory, drop the temp
// directory, and put SEMALT_API_KEY back the way it was.
after(() => {
  process.chdir(PREV_CWD);
  try {
    fs.rmSync(CWD, { recursive: true, force: true });
  } catch {
    // Removal is best-effort; a leftover temp dir must not fail the suite.
  }
  if (prevKey !== undefined) {
    process.env.SEMALT_API_KEY = prevKey;
  } else {
    delete process.env.SEMALT_API_KEY;
  }
});
|
|
53
|
+
|
|
54
|
+
// A real runner whose chatStream points at `base`. skipPermissions auto-approves
|
|
55
|
+
// so the loop runs unattended; `asks` records every permission-gate consultation
|
|
56
|
+
// (actionType + tag) so a test can prove the descriptor fired (mutating) or did
|
|
57
|
+
// not (read-only). Returns `getSaved` for the rare persistence assertion.
|
|
58
|
+
/**
 * Assemble a real agent runner wired to the API endpoint at `base`.
 *
 * @param {string} base - Base URL used as `api_base` in the runner's config.
 * @returns {{ runner: object, asks: Array<{actionType: *, tag: *}>, getSaved: function(): (object|null) }}
 *   The runner, the list of recorded permission-gate consultations, and an
 *   accessor for the last config object passed to `saveConfig` (null if never
 *   called).
 */
function buildRunner(base) {
  const config = {
    api_base: base, api_key: 'test-key', default_model: 'test-model',
    temperature: 0.5, request_timeout_ms: 5000, stream: true, models: [],
    // Dispatch test, not a sandbox test — run real `echo` unsandboxed regardless
    // of the runner's bwrap/sandbox-exec availability (Task 4.4).
    sandbox: { mode: 'off' },
  };
  let saved = null;
  const getConfig = () => config;
  const saveConfig = (c) => { saved = { ...c }; Object.assign(config, c); };

  const api = createApiClient({ getConfig, saveConfig, ui });
  const pm = createPermissionManager(ui, { skipPermissions: true });
  pm.setUICallbacks({ onAddMessage: () => {}, onShowModal: () => {}, onCloseModal: () => {}, onCaptureNavigation: () => () => {} });

  // Record every gate consultation, then delegate to the real implementation.
  // All arguments are forwarded and `pm` is kept as the receiver so the spy
  // cannot change what the real askPermission sees.
  const asks = [];
  const realAsk = pm.askPermission;
  pm.askPermission = async (...args) => {
    const [actionType, , tag] = args;
    asks.push({ actionType, tag });
    return realAsk.apply(pm, args);
  };

  // The executor is created after the wrapper is installed, so it is built
  // against a permission manager that already records.
  const { agentExecShell, agentExecFile, describePermission } = createToolExecutor(pm, ui, getConfig);
  const runner = createAgentRunner({
    chatStream: api.chatStream, extractToolCalls, agentExecShell, agentExecFile,
    describePermission, permissionManager: pm, ui, getConfig,
  });
  return { runner, asks, getSaved: () => saved };
}
|
|
88
|
+
|
|
89
|
+
/**
 * Build a set of agent-loop callbacks that record what they receive.
 *
 * @param {object} [extra] - Additional or overriding callbacks; entries here
 *   win over the recording defaults of the same name.
 * @returns {{ ev: object, cb: object }} `ev` holds one array per event kind;
 *   `cb` is the callback object to hand to the loop.
 */
function collector(extra = {}) {
  const ev = {};
  for (const kind of ['tokens', 'tools', 'errors', 'retries', 'assistants', 'withheld']) {
    ev[kind] = [];
  }

  const recorders = {
    onToken: (t) => ev.tokens.push(t),
    // Tool starts are deliberately not recorded.
    onToolStart: () => {},
    onToolEnd: (tag, result) => ev.tools.push({ tag, result }),
    onError: (e) => ev.errors.push(e),
    onRetry: (next, max) => ev.retries.push({ next, max }),
    onAssistantMessage: (m) => ev.assistants.push(m),
    onPlanWithhold: (tag, arg, desc) => ev.withheld.push({ tag, arg, desc }),
  };

  const cb = { ...recorders, ...extra };
  return { ev, cb };
}
|
|
103
|
+
|
|
104
|
+
// The assistant turn that carried structured tool_calls (native shape), plus its
|
|
105
|
+
// rooted role:'tool' result. Shared assertion: a native tool turn records an
|
|
106
|
+
// assistant message with EMPTY text + tool_calls, and the result comes back on a
|
|
107
|
+
// role:'tool' message keyed to the originating tool_call id (lib/agent.js ~1378).
|
|
108
|
+
/**
 * Shared assertions for a native (structured tool_calls) turn.
 *
 * Checks that the history holds an assistant message with empty text and a
 * `tool_calls` array whose first entry names `fnName`, that a role:'tool'
 * message carries that call's id, and that no role:'user' message matches the
 * XML-style "Tool execution results" text.
 *
 * @param {Array<object>} messages - Conversation history after the loop ran.
 * @param {string} fnName - Expected function name of the first tool call.
 * @returns {object} The first role:'tool' message found.
 */
function assertNativeRooting(messages, fnName) {
  const carriesToolCalls = (m) => m.role === 'assistant' && Array.isArray(m.tool_calls);
  const assistantWithCall = messages.find(carriesToolCalls);
  assert.ok(assistantWithCall, 'assistant message recorded the native tool_calls');
  assert.strictEqual(assistantWithCall.content, '', 'native tool-call turn has empty text content');

  const [firstCall] = assistantWithCall.tool_calls;
  assert.strictEqual(firstCall.function.name, fnName, `tool_calls names ${fnName}`);

  const toolMsg = messages.find((m) => m.role === 'tool');
  assert.ok(toolMsg, 'native path appends a role:"tool" result message');
  assert.strictEqual(toolMsg.tool_call_id, firstCall.id, 'result rooted to its tool_call id');

  // Every message is either not a user message or does not carry the XML
  // result marker.
  const noXmlResult = messages.every((m) => m.role !== 'user' || !/Tool execution results/.test(m.content));
  assert.ok(noXmlResult, 'native path does NOT use the XML "Tool execution results" user message');
  return toolMsg;
}
|
|
120
|
+
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
// 1. Native file-mutating: write — mutation + gate + role:'tool' rooting,
|
|
123
|
+
// paired with the XML equivalent for loop-level equivalence.
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
test('native write_file: mutates, gate fires (file/write_file), result rooted as role:"tool"', async () => {
  const llm = await startMockLLM();
  // Scripted replies: a native tool call first, then the closing assistant text.
  llm.replyWithToolCall('write_file', { path: 'native-write.txt', content: 'NATIVE_WRITE_CONTENT' });
  llm.replyWith('Wrote it.');
  try {
    const { runner, asks: gateLog } = buildRunner(llm.base);
    const { ev: events, cb: callbacks } = collector();
    const history = [{ role: 'user', content: 'write the file' }];
    const { metrics } = await runner.runAgentLoop(history, 'test-model', 5, null, { callbacks });

    // The file exists on disk with exactly the requested content.
    const written = fs.readFileSync(path.join(CWD, 'native-write.txt'), 'utf8');
    assert.strictEqual(written, 'NATIVE_WRITE_CONTENT');
    // The gate was consulted once, with the mutating file/write_file descriptor.
    assert.deepStrictEqual(gateLog, [{ actionType: 'file', tag: 'write_file' }], 'write gate consulted once');
    // One tool ran, and its result is tied to the originating tool_call id.
    assert.strictEqual(events.tools.length, 1);
    assert.strictEqual(events.tools[0].tag, 'write');
    const rooted = assertNativeRooting(history, 'write_file');
    assert.match(rooted.content, /Wrote \d+ bytes to native-write\.txt/);

    assert.strictEqual(metrics.turns.length, 2, 'tool turn + final turn');
    assert.ok(history.some((m) => m.role === 'assistant' && m.content === 'Wrote it.'));
    assert.strictEqual(llm.pending(), 0);
  } finally {
    await llm.close();
  }
});
|
|
154
|
+
|
|
155
|
+
test('XML write_file equivalent: same tool, same mutation, XML "Tool execution results" shape', async () => {
  const llm = await startMockLLM();
  // The tool call arrives as XML inside ordinary assistant text.
  llm.replyWith('<write_file path="xml-write.txt">XML_WRITE_CONTENT</write_file>');
  llm.replyWith('Wrote it.');
  try {
    const { runner, asks: gateLog } = buildRunner(llm.base);
    const { ev: events, cb: callbacks } = collector();
    const history = [{ role: 'user', content: 'write the file' }];
    await runner.runAgentLoop(history, 'test-model', 5, null, { callbacks });

    const written = fs.readFileSync(path.join(CWD, 'xml-write.txt'), 'utf8');
    assert.strictEqual(written, 'XML_WRITE_CONTENT');
    assert.deepStrictEqual(gateLog, [{ actionType: 'file', tag: 'write_file' }], 'same gate fires on the XML rail');
    assert.strictEqual(events.tools[0].tag, 'write');

    // On this rail the result is delivered in a role:'user' message and no
    // role:'tool' message may appear.
    const isXmlResult = (m) => m.role === 'user' && /Tool execution results/.test(m.content);
    const toolResult = history.find(isXmlResult);
    assert.ok(toolResult && /Wrote \d+ bytes to xml-write\.txt/.test(toolResult.content));
    assert.ok(!history.some((m) => m.role === 'tool'), 'XML path does not use role:"tool" messages');
  } finally {
    await llm.close();
  }
});
|
|
176
|
+
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
// 2. Native file-mutating: edit_file (line replacement) — multi-arg native call.
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
test('native edit_file: replaces the target line, gate fires, result rooted', async () => {
  // Seed a three-line file; the scripted call rewrites line 2.
  const target = path.join(CWD, 'native-edit.txt');
  fs.writeFileSync(target, 'line1\nline2\nline3\n');
  const llm = await startMockLLM();
  // fromParams: { path, line, content } → ['edit_file', path, parseInt(line), content]
  llm.replyWithToolCall('edit_file', { path: 'native-edit.txt', line: 2, content: 'EDITED_LINE_2' });
  llm.replyWith('Edited.');
  try {
    const { runner, asks: gateLog } = buildRunner(llm.base);
    const { ev: events, cb: callbacks } = collector();
    const history = [{ role: 'user', content: 'edit line 2' }];
    await runner.runAgentLoop(history, 'test-model', 5, null, { callbacks });

    // Only the second line changed; the neighbours and trailing newline stay.
    const after = fs.readFileSync(target, 'utf8');
    assert.strictEqual(after, 'line1\nEDITED_LINE_2\nline3\n');
    assert.deepStrictEqual(gateLog, [{ actionType: 'file', tag: 'edit_file' }]);
    assert.strictEqual(events.tools[0].tag, 'edit_file');
    assertNativeRooting(history, 'edit_file');
  } finally {
    await llm.close();
  }
});
|
|
201
|
+
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
// 3. Native file-mutating: delete_file.
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
test('native delete_file: removes the file, gate fires, result rooted', async () => {
  // Create the file the scripted call is going to delete.
  const victim = path.join(CWD, 'native-delete.txt');
  fs.writeFileSync(victim, 'doomed');
  const llm = await startMockLLM();
  llm.replyWithToolCall('delete_file', { path: 'native-delete.txt' });
  llm.replyWith('Deleted.');
  try {
    const { runner, asks: gateLog } = buildRunner(llm.base);
    const { ev: events, cb: callbacks } = collector();
    const history = [{ role: 'user', content: 'delete it' }];
    await runner.runAgentLoop(history, 'test-model', 5, null, { callbacks });

    const stillThere = fs.existsSync(victim);
    assert.ok(!stillThere, 'the file was deleted');
    assert.deepStrictEqual(gateLog, [{ actionType: 'file', tag: 'delete_file' }]);
    assert.strictEqual(events.tools[0].tag, 'delete_file');
    const rooted = assertNativeRooting(history, 'delete_file');
    assert.match(rooted.content, /Deleted native-delete\.txt/);
  } finally {
    await llm.close();
  }
});
|
|
227
|
+
|
|
228
|
+
// ---------------------------------------------------------------------------
|
|
229
|
+
// 4. Native file-mutating: move_file (multi-arg src/dst).
|
|
230
|
+
// ---------------------------------------------------------------------------
|
|
231
|
+
|
|
232
|
+
test('native move_file: renames src→dst, gate fires, result rooted', async () => {
  const srcPath = path.join(CWD, 'native-src.txt');
  const dstPath = path.join(CWD, 'native-dst.txt');
  fs.writeFileSync(srcPath, 'movable');
  const llm = await startMockLLM();
  // Two-argument native call: source and destination, both relative to CWD.
  llm.replyWithToolCall('move_file', { src: 'native-src.txt', dst: 'native-dst.txt' });
  llm.replyWith('Moved.');
  try {
    const { runner, asks: gateLog } = buildRunner(llm.base);
    const { ev: events, cb: callbacks } = collector();
    const history = [{ role: 'user', content: 'move it' }];
    await runner.runAgentLoop(history, 'test-model', 5, null, { callbacks });

    assert.ok(!fs.existsSync(srcPath), 'source gone');
    const moved = fs.readFileSync(dstPath, 'utf8');
    assert.strictEqual(moved, 'movable', 'dst has the content');
    assert.deepStrictEqual(gateLog, [{ actionType: 'file', tag: 'move_file' }]);
    assert.strictEqual(events.tools[0].tag, 'move_file');
    assertNativeRooting(history, 'move_file');
  } finally {
    await llm.close();
  }
});
|
|
252
|
+
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
// 5. Native shell/exec — paired with the XML <exec> equivalent.
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
|
|
257
|
+
test('native shell: dispatches, gate fires (shell/exec), result rooted as role:"tool"', async () => {
  // Script the mock: a native shell tool call, then a closing plain reply.
  const llm = await startMockLLM();
  llm.replyWithToolCall('shell', { command: 'echo NATIVE_SHELL_OUT' });
  llm.replyWith('Ran it.');

  try {
    const { runner: agent, asks: gateCalls } = buildRunner(llm.base);
    const { ev: seen, cb: callbacks } = collector();
    const conversation = [{ role: 'user', content: 'run echo' }];

    const { metrics: loopMetrics } = await agent.runAgentLoop(
      conversation,
      'test-model',
      5,
      null,
      { callbacks },
    );

    // The shell gate is consulted under the exec tag, while the single tool
    // event is reported under the shell tag.
    assert.deepStrictEqual(gateCalls, [{ actionType: 'shell', tag: 'exec' }], 'shell gate consulted with exec tag');
    assert.strictEqual(seen.tools.length, 1);
    assert.strictEqual(seen.tools[0].tag, 'shell');

    // The result message carries the command's stdout and exit code.
    const rooted = assertNativeRooting(conversation, 'shell');
    assert.match(rooted.content, /NATIVE_SHELL_OUT/, 'command stdout flowed back');
    assert.match(rooted.content, /Exit code: 0/);

    // Two turns recorded: the tool call and the final assistant reply.
    assert.strictEqual(loopMetrics.turns.length, 2);
    assert.ok(conversation.some((msg) => msg.role === 'assistant' && msg.content === 'Ran it.'));
  } finally {
    // Always shut the mock server down, even when an assertion fails.
    await llm.close();
  }
});
|
|
281
|
+
|
|
282
|
+
test('XML shell equivalent: same dispatch, XML "Tool execution results" shape', async () => {
  // Script the mock: the command arrives as an inline <exec> tag in plain
  // text instead of a native tool call, followed by a closing reply.
  const llm = await startMockLLM();
  llm.replyWith('<exec>echo XML_SHELL_OUT</exec>');
  llm.replyWith('Ran it.');

  try {
    const { runner: agent, asks: gateCalls } = buildRunner(llm.base);
    const { ev: seen, cb: callbacks } = collector();
    const conversation = [{ role: 'user', content: 'run echo' }];

    await agent.runAgentLoop(conversation, 'test-model', 5, null, { callbacks });

    // Same gate and same tool tag as the native shell test.
    assert.deepStrictEqual(gateCalls, [{ actionType: 'shell', tag: 'exec' }], 'same gate fires on the XML rail');
    assert.strictEqual(seen.tools[0].tag, 'shell');

    // On the XML rail the output comes back inside a user-role message
    // headed "Tool execution results".
    const xmlResult = conversation.find(
      (msg) => msg.role === 'user' && /Tool execution results/.test(msg.content),
    );
    const carriesOutput = xmlResult
      && /XML_SHELL_OUT/.test(xmlResult.content)
      && /Exit code: 0/.test(xmlResult.content);
    assert.ok(carriesOutput);

    // No role:"tool" message may appear anywhere in the conversation.
    assert.ok(conversation.every((msg) => msg.role !== 'tool'), 'XML path does not use role:"tool" messages');
  } finally {
    // Always shut the mock server down, even when an assertion fails.
    await llm.close();
  }
});
|
|
301
|
+
|
|
302
|
+
// ---------------------------------------------------------------------------
|
|
303
|
+
// 6. Native plan-mode: a mutating tool arriving via the native path is WITHHELD,
|
|
304
|
+
// and approval (plan mode off) lets it proceed — mirrors the XML plan test,
|
|
305
|
+
// additionally proving the withheld result is rooted as role:'tool' (native).
|
|
306
|
+
// ---------------------------------------------------------------------------
|
|
307
|
+
|
|
308
|
+
test('native plan mode: withholds the native mutating tool (no mutation), result rooted as role:"tool"', async () => {
  // Path the write would create if plan mode failed to withhold it.
  const plannedPath = path.join(CWD, 'native-planned.txt');

  // Script the mock: a native write_file tool call, then the plan text.
  const llm = await startMockLLM();
  llm.replyWithToolCall('write_file', { path: 'native-planned.txt', content: 'SHOULD_NOT_WRITE' });
  llm.replyWith('Here is my plan.');

  try {
    const { runner: agent, asks: gateCalls } = buildRunner(llm.base);
    const { ev: seen, cb: callbacks } = collector();
    const conversation = [{ role: 'user', content: 'change the file' }];

    const outcome = await agent.runAgentLoop(conversation, 'test-model', 10, null, {
      callbacks,
      planMode: true,
    });

    // Nothing touched disk; exactly one action was withheld and reported.
    assert.ok(!fs.existsSync(plannedPath), 'the file was NOT written in plan mode');
    assert.strictEqual(outcome.withheldActions.length, 1, 'one action withheld');
    assert.strictEqual(outcome.withheldActions[0].tag, 'write');
    assert.deepStrictEqual(
      seen.withheld.map((entry) => entry.tag),
      ['write'],
      'onPlanWithhold fired for the native call',
    );

    // Withholding in plan mode happens ahead of the permission gate, so the
    // gate must never have been asked.
    assert.deepStrictEqual(gateCalls, [], 'no permission prompt for a withheld tool');

    // Even the withheld notice is attached to the native tool_call id as a
    // role:'tool' message (the loop pushes role:'tool' for native calls in
    // lib/agent.js), which keeps the assistant tool_calls ↔ tool-result
    // pairing consistent for the next turn.
    const rooted = assertNativeRooting(conversation, 'write_file');
    assert.match(rooted.content, /\[plan mode\] Withheld pending approval/);
    assert.ok(
      conversation.some((msg) => msg.role === 'assistant' && msg.content === 'Here is my plan.'),
      'plan recorded',
    );
  } finally {
    // Always shut the mock server down, even when an assertion fails.
    await llm.close();
  }
});
|
|
336
|
+
|
|
337
|
+
test('native plan mode OFF (approval): the same native mutating tool executes', async () => {
  // Path the write is expected to create once plan mode is off.
  const approvedPath = path.join(CWD, 'native-approved.txt');

  // Script the mock: a native write_file tool call, then a closing reply.
  const llm = await startMockLLM();
  llm.replyWithToolCall('write_file', { path: 'native-approved.txt', content: 'APPROVED' });
  llm.replyWith('Done.');

  try {
    const { runner: agent, asks: gateCalls } = buildRunner(llm.base);
    const { ev: seen, cb: callbacks } = collector();
    const conversation = [{ role: 'user', content: 'write it' }];

    const outcome = await agent.runAgentLoop(conversation, 'test-model', 10, null, {
      callbacks,
      planMode: false,
    });

    // With plan mode off the write lands on disk and nothing is withheld.
    assert.strictEqual(fs.readFileSync(approvedPath, 'utf8'), 'APPROVED', 'the file was written after approval');
    assert.strictEqual(outcome.withheldActions.length, 0, 'nothing withheld with plan mode off');

    // The file gate is consulted on the executing path and the tool event
    // is reported under the write tag.
    assert.deepStrictEqual(gateCalls, [{ actionType: 'file', tag: 'write_file' }], 'gate fired on the executing path');
    assert.strictEqual(seen.tools[0].tag, 'write');
    assertNativeRooting(conversation, 'write_file');
  } finally {
    // Always shut the mock server down, even when an assertion fails.
    await llm.close();
  }
});
|