@stigmer/runner 3.0.2 → 3.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.build-fingerprint +1 -1
- package/dist/activities/execute-cursor/approval-policy.d.ts +55 -16
- package/dist/activities/execute-cursor/approval-policy.js +93 -31
- package/dist/activities/execute-cursor/approval-policy.js.map +1 -1
- package/dist/activities/execute-cursor/approval-state.d.ts +54 -26
- package/dist/activities/execute-cursor/approval-state.js +41 -26
- package/dist/activities/execute-cursor/approval-state.js.map +1 -1
- package/dist/activities/execute-cursor/hook-script.d.ts +41 -14
- package/dist/activities/execute-cursor/hook-script.js +155 -63
- package/dist/activities/execute-cursor/hook-script.js.map +1 -1
- package/dist/activities/execute-cursor/message-translator.d.ts +23 -0
- package/dist/activities/execute-cursor/message-translator.js +100 -54
- package/dist/activities/execute-cursor/message-translator.js.map +1 -1
- package/dist/activities/execute-cursor/session-lifecycle.d.ts +9 -0
- package/dist/activities/execute-cursor/session-lifecycle.js +11 -3
- package/dist/activities/execute-cursor/session-lifecycle.js.map +1 -1
- package/package.json +2 -2
- package/src/activities/execute-cursor/__tests__/approval-gate.test.ts +93 -37
- package/src/activities/execute-cursor/__tests__/hitl-ledger.test.ts +33 -18
- package/src/activities/execute-cursor/__tests__/hook-script.test.ts +204 -0
- package/src/activities/execute-cursor/__tests__/message-translator.test.ts +93 -0
- package/src/activities/execute-cursor/__tests__/session-lifecycle.test.ts +73 -2
- package/src/activities/execute-cursor/approval-policy.ts +113 -31
- package/src/activities/execute-cursor/approval-state.ts +74 -32
- package/src/activities/execute-cursor/hook-script.ts +157 -63
- package/src/activities/execute-cursor/message-translator.ts +114 -57
- package/src/activities/execute-cursor/session-lifecycle.ts +21 -3
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Behavior tests for the generated preToolUse bash hook.
|
|
3
|
+
*
|
|
4
|
+
* These run the ACTUAL bash script the runner writes into the workspace, feeding
|
|
5
|
+
* it the REAL hook-input shape captured from @cursor/sdk (PascalCase
|
|
6
|
+
* `tool_name`; `file_path`/`command` in `tool_input`). They are the strongest
|
|
7
|
+
* guard against the regression this work fixes: a gated built-in must be denied,
|
|
8
|
+
* its denial must be recorded with a token byte-identical to the runner's
|
|
9
|
+
* grantToken, and an exact-resource grant must allow only that resource.
|
|
10
|
+
*
|
|
11
|
+
* Skipped automatically where bash is unavailable.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { describe, it, expect, beforeAll, afterEach } from "vitest";
|
|
15
|
+
import { execFileSync, execSync } from "node:child_process";
|
|
16
|
+
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, readFileSync, existsSync } from "node:fs";
|
|
17
|
+
import { tmpdir } from "node:os";
|
|
18
|
+
import { join } from "node:path";
|
|
19
|
+
|
|
20
|
+
import { generateHookScript } from "../hook-script.js";
|
|
21
|
+
import { buildApprovalState, grantToken, toolIdentity, type ApprovalGrant } from "../approval-state.js";
|
|
22
|
+
import type { McpToolPolicyEntry } from "../approval-state.js";
|
|
23
|
+
|
|
24
|
+
/**
 * Probes for a usable `bash` by running a no-op command.
 *
 * @returns true when the probe command ran and exited 0; false when
 *   `execSync` threw (which it does on spawn failure or a non-zero exit).
 */
function detectBash(): boolean {
  try {
    execSync("bash -c 'exit 0'", { stdio: "ignore" });
    return true;
  } catch {
    return false;
  }
}

const hasBash = detectBash();

// Register the suite normally where bash exists; otherwise register it as skipped.
const d = hasBash ? describe : describe.skip;
|
|
33
|
+
|
|
34
|
+
// Workspaces created during the current test. Drained (not just read) after
// each test so a directory is never scheduled for removal twice.
const tempDirs: string[] = [];
afterEach(() => {
  const created = tempDirs.splice(0);
  created.forEach((dir) => {
    rmSync(dir, { recursive: true, force: true });
  });
});
|
|
38
|
+
|
|
39
|
+
/** Handle returned by `setup` for driving one generated hook script. */
interface Harness {
  /**
   * Runs the hook with `input` serialized as JSON on stdin.
   *
   * @returns the decision found in the output ("deny", "allow", or "?" when
   *   neither marker is present) together with the raw stdout.
   */
  decide(input: object): { permission: string; raw: string };

  /** Parsed entries of the denial ledger; empty when the ledger file does not exist. */
  ledger(): Array<{ toolName: string; token: string }>;

  /** Truncates the denial ledger so the next decision starts from an empty file. */
  resetLedger(): void;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Builds an isolated workspace containing the generated hook script and
 * (unless suppressed) an approval-state file, and returns a harness for
 * driving that script.
 *
 * The workspace is registered in `tempDirs` for removal after the test.
 *
 * @param opts.autoApproveAll forwarded to `buildApprovalState` (defaults to false).
 * @param opts.grants forwarded to `buildApprovalState` unchanged.
 * @param opts.mcpPolicies per-tool policies; each is registered under the key
 *   `srv/<name>` with server slug `srv`.
 * @param opts.noStateFile when true, no state file is written at all.
 */
function setup(opts: {
  autoApproveAll?: boolean;
  grants?: ApprovalGrant[];
  mcpPolicies?: Record<string, McpToolPolicyEntry>;
  noStateFile?: boolean;
}): Harness {
  const workspace = mkdtempSync(join(tmpdir(), "hook-script-"));
  tempDirs.push(workspace);

  const hooksDir = join(workspace, ".cursor", "hooks");
  mkdirSync(hooksDir, { recursive: true });

  const statePath = join(hooksDir, "state.json");
  const ledgerPath = join(hooksDir, "denials.jsonl");
  const scriptPath = join(hooksDir, "hook.sh");
  writeFileSync(scriptPath, generateHookScript(statePath, ledgerPath), "utf-8");

  const writeState = !opts.noStateFile;
  if (writeState) {
    const policies = new Map(
      Object.entries(opts.mcpPolicies ?? {}).map(([name, p]) => [
        `srv/${name}`,
        { toolName: name, mcpServerSlug: "srv", requiresApproval: p.requiresApproval, approvalMessage: p.message ?? "" },
      ]),
    );
    const state = buildApprovalState(policies, opts.autoApproveAll ?? false, opts.grants);
    writeFileSync(statePath, JSON.stringify(state), "utf-8");
  }

  return {
    decide(input: object) {
      const raw = execFileSync("bash", [scriptPath], { input: JSON.stringify(input) }).toString();
      // "deny" is checked first, so it wins if both markers ever appear.
      let permission = "?";
      if (raw.includes('"permission":"deny"')) {
        permission = "deny";
      } else if (raw.includes('"permission":"allow"')) {
        permission = "allow";
      }
      return { permission, raw };
    },
    ledger() {
      if (!existsSync(ledgerPath)) return [];
      const lines = readFileSync(ledgerPath, "utf-8").split("\n");
      // One JSON document per non-empty line.
      return lines.filter((line) => line.length > 0).map((line) => JSON.parse(line));
    },
    resetLedger() {
      writeFileSync(ledgerPath, "", "utf-8");
    },
  };
}
|
|
86
|
+
|
|
87
|
+
// Real hook-input shapes (PascalCase name, file_path/command in tool_input).
|
|
88
|
+
/** Hook input for a file write: tool name `Write`, target in `tool_input.file_path`. */
const hookWrite = (filePath: string) => {
  const tool_input = { file_path: filePath, content: "x" };
  return { tool_name: "Write", tool_input };
};
|
|
89
|
+
/** Hook input for a shell invocation: tool name `Shell`, command in `tool_input.command`. */
const hookShell = (command: string) => {
  const tool_input = { command, cwd: "/x", timeout: 30000 };
  return { tool_name: "Shell", tool_input };
};
|
|
90
|
+
/** Hook input for a file deletion: tool name `Delete`, target in `tool_input.file_path`. */
const hookDelete = (filePath: string) => {
  const tool_input = { file_path: filePath };
  return { tool_name: "Delete", tool_input };
};
|
|
91
|
+
/** Hook input for a file read: tool name `Read`, target in `tool_input.file_path`. */
const hookRead = (filePath: string) => {
  const tool_input = { file_path: filePath };
  return { tool_name: "Read", tool_input };
};
|
|
92
|
+
|
|
93
|
+
d("generated preToolUse hook", () => {
  it("denies gated built-ins (Write/Shell/Delete) and records a category+salient token", () => {
    const h = setup({});

    for (const [input, category, salient] of [
      [hookWrite("/x/a.txt"), "write", "/x/a.txt"],
      [hookShell("rm -rf build"), "shell", "rm -rf build"],
      [hookDelete("/x/b.txt"), "delete", "/x/b.txt"],
    ] as const) {
      // Start each case from an empty ledger so the length check is per-call.
      h.resetLedger();
      expect(h.decide(input).permission).toBe("deny");
      const ledger = h.ledger();
      expect(ledger).toHaveLength(1);
      // Byte-identical to the runner's grantToken(category, salient).
      expect(ledger[0].token).toBe(grantToken(category, salient));
    }
  });

  it("allows read-only built-ins", () => {
    const h = setup({});
    expect(h.decide(hookRead("/x/a.txt")).permission).toBe("allow");
    expect(h.ledger()).toEqual([]);
  });

  it("auto-approve-all allows even gated built-ins", () => {
    const h = setup({ autoApproveAll: true });
    expect(h.decide(hookWrite("/x/a.txt")).permission).toBe("allow");
  });

  it("allows the EXACT granted resource and re-gates any other (no name-only over-grant)", () => {
    const id = toolIdentity("edit", "", { path: "/x/a.txt" });
    const h = setup({ grants: [{ toolName: "edit", mcpServerSlug: "", key: id.key, salient: id.salient }] });

    // Same resource the user approved -> allowed on the resumed turn.
    expect(h.decide(hookWrite("/x/a.txt")).permission).toBe("allow");
    // A different file is NOT covered by the grant -> still gated.
    expect(h.decide(hookWrite("/x/OTHER.txt")).permission).toBe("deny");
  });

  it("denies require-approval MCP tools and allows them once granted (name-only)", () => {
    const mcpPolicies = { apply_x: { requiresApproval: true, message: "Apply X" } };
    const denyH = setup({ mcpPolicies });
    expect(denyH.decide({ tool_name: "apply_x", tool_input: {} }).permission).toBe("deny");
    // Assert the entry exists before indexing it, so a missing ledger record
    // fails as a readable assertion instead of a TypeError on `undefined`.
    const denied = denyH.ledger();
    expect(denied).toHaveLength(1);
    expect(denied[0].token).toBe(grantToken("apply_x", ""));

    const grantH = setup({
      mcpPolicies,
      grants: [{ toolName: "apply_x", mcpServerSlug: "srv", key: "apply_x", salient: "" }],
    });
    expect(grantH.decide({ tool_name: "apply_x", tool_input: {} }).permission).toBe("allow");
  });

  it("fails closed (deny) when the state file is missing", () => {
    const h = setup({ noStateFile: true });
    expect(h.decide(hookWrite("/x/a.txt")).permission).toBe("deny");
  });

  // Regression: the original grep-based extraction truncated string values at
  // the first JSON-escaped character, so a shell command containing double
  // quotes (e.g. `printf '%s' 'x' > "file"`) produced a ledger token that never
  // matched the runner's grantToken — the denied call stayed COMPLETED in the
  // persisted messages and a grant for it was re-denied on reinvocation.
  // (Observed live in TestCursorHarness_HITL_ResumedTurn_StillGated.)
  it("records a byte-identical token for commands with quotes, escapes, and newlines", () => {
    const commands = [
      'printf \'%s\' \'hello\' > "/tmp/a dir/resumed-gate.txt"',
      'echo "double \\"nested\\" quotes" && echo done',
      "line1\nline2\twith\ttabs",
      'unicode: caf\u00e9 \u2014 emoji \u{1F600}',
    ];
    for (const command of commands) {
      const h = setup({});
      expect(h.decide(hookShell(command)).permission).toBe("deny");
      const ledger = h.ledger();
      expect(ledger).toHaveLength(1);
      expect(ledger[0].token).toBe(grantToken("shell", command));
    }
  });

  it("allows the exact granted shell command even when it contains quotes", () => {
    const command = 'printf \'%s\' \'hello-resume\' > "/x/resumed-gate.txt"';
    const id = toolIdentity("shell", "", { command });
    const h = setup({ grants: [{ toolName: "shell", mcpServerSlug: "", key: id.key, salient: id.salient }] });

    expect(h.decide(hookShell(command)).permission).toBe("allow");
    // A different command is NOT covered by the grant -> still gated.
    expect(h.decide(hookShell('rm -rf "/x"')).permission).toBe("deny");
  });

  it("still denies gated tools via the bash fallback when the Node binary is unavailable", () => {
    const ws = mkdtempSync(join(tmpdir(), "hook-script-fallback-"));
    tempDirs.push(ws);
    const dir = join(ws, ".cursor", "hooks");
    mkdirSync(dir, { recursive: true });
    const statePath = join(dir, "state.json");
    const ledgerPath = join(dir, "denials.jsonl");
    const scriptPath = join(dir, "hook.sh");

    // Break the baked Node path to force the grep/cut fallback.
    const generated = generateHookScript(statePath, ledgerPath);
    const bakedNodeLine = `NODE_BIN="${process.execPath}"`;
    // String.replace leaves the script untouched when the pattern is absent,
    // which would let this test pass through the Node path without ever
    // exercising the fallback. Require the baked line to be present first.
    expect(generated).toContain(bakedNodeLine);
    const script = generated.replace(bakedNodeLine, 'NODE_BIN="/nonexistent/node"');
    writeFileSync(scriptPath, script, "utf-8");
    writeFileSync(statePath, JSON.stringify(buildApprovalState(new Map(), false)), "utf-8");

    const raw = execFileSync("bash", [scriptPath], {
      input: JSON.stringify(hookWrite("/x/a.txt")),
    }).toString();
    expect(raw).toContain('"permission":"deny"');
    const ledger = readFileSync(ledgerPath, "utf-8").split("\n").filter(Boolean).map((l) => JSON.parse(l));
    expect(ledger).toHaveLength(1);
    expect(ledger[0].token).toBe(grantToken("write", "/x/a.txt"));
  });
});
|
|
@@ -617,6 +617,99 @@ describe("MessageAccumulator tool call status transitions", () => {
|
|
|
617
617
|
});
|
|
618
618
|
});
|
|
619
619
|
|
|
620
|
+
// The Cursor SDK can emit the lifecycle for one call_id more than once.
|
|
621
|
+
// Observed in production: two "running" events ~0.5s apart for a task/edit
|
|
622
|
+
// tool produced two ToolCall entries with the SAME id (a "thin" copy with no
|
|
623
|
+
// result and a "full" copy), rendering the same call two or three times in
|
|
624
|
+
// the UI. The accumulator must upsert by call_id so a call maps to exactly
|
|
625
|
+
// one ToolCall.
|
|
626
|
+
describe("tool call idempotency (one ToolCall per call_id)", () => {
  // Every case starts the same way: a fresh accumulator over an empty message
  // list, primed with one assistant message on request "r1".
  const startWith = (assistantText: string) => {
    const messages: AgentMessage[] = [];
    const acc = new MessageAccumulator(messages);
    acc.processEvent(assistantEvent("r1", assistantText));
    return { messages, acc };
  };

  it("duplicate running events for one call_id create a single ToolCall", () => {
    const { messages, acc } = startWith("Editing a file.");

    for (let emit = 0; emit < 2; emit++) {
      acc.processEvent(toolCallEvent("tc-dup", "edit", "running", "r1", { args: { path: "a.ts" } }));
    }

    expect(countToolCallsWithId(messages, "tc-dup")).toBe(1);
    expect(findToolCallById(messages, "tc-dup")!.status).toBe(ToolCallStatus.TOOL_CALL_RUNNING);
  });

  it("running -> completed -> running re-emit keeps a single COMPLETED ToolCall", () => {
    const { messages, acc } = startWith("Running a tool.");

    acc.processEvent(toolCallEvent("tc-1", "Shell", "running", "r1"));
    acc.processEvent(toolCallEvent("tc-1", "Shell", "completed", "r1", { result: "OK" }));
    // A late "running" re-emit must not regress the terminal status.
    acc.processEvent(toolCallEvent("tc-1", "Shell", "running", "r1"));

    expect(countToolCallsWithId(messages, "tc-1")).toBe(1);
    const call = findToolCallById(messages, "tc-1")!;
    expect(call.status).toBe(ToolCallStatus.TOOL_CALL_COMPLETED);
    expect(call.result).toBe("OK");
    expect(call.completedAt).toBeTruthy();
  });

  it("thin-then-full: a result-bearing completion populates the single ToolCall created by an empty running", () => {
    // Reproduces the production pattern: two running events, then one
    // completion that carries the full result.
    const { messages, acc } = startWith("Delegating work.");

    for (let emit = 0; emit < 2; emit++) {
      acc.processEvent(toolCallEvent("tc-task", "task", "running", "r1", { result: "" }));
    }
    acc.processEvent(toolCallEvent("tc-task", "task", "completed", "r1", { result: "full result blob" }));

    expect(countToolCallsWithId(messages, "tc-task")).toBe(1);
    const call = findToolCallById(messages, "tc-task")!;
    expect(call.status).toBe(ToolCallStatus.TOOL_CALL_COMPLETED);
    expect(call.result).toBe("full result blob");
  });

  it("a result-less re-emit after completion does not wipe the captured result", () => {
    const { messages, acc } = startWith("Running a tool.");

    acc.processEvent(toolCallEvent("tc-1", "read", "running", "r1"));
    acc.processEvent(toolCallEvent("tc-1", "read", "completed", "r1", { result: "file contents" }));
    acc.processEvent(toolCallEvent("tc-1", "read", "completed", "r1", { result: "" }));

    expect(countToolCallsWithId(messages, "tc-1")).toBe(1);
    expect(findToolCallById(messages, "tc-1")!.result).toBe("file contents");
  });

  it("duplicate task running events yield one task ToolCall and one sub-agent (production repro)", () => {
    // Mirror the ExecuteCursor stream loop: every task tool_call event is fed
    // to both processEvent() (tool call) and trackSubAgentExecution().
    const { messages, acc } = startWith("I'll explore the repo.");
    const args = { subagentType: { kind: "explore" }, description: "Explore repo structure and docs", prompt: "Go" };

    const run1 = toolCallEvent("tc-explore", "task", "running", "r1", { args, result: "" });
    acc.processEvent(run1);
    acc.trackSubAgentExecution(run1);

    const run2 = toolCallEvent("tc-explore", "task", "running", "r1", { args, result: "" });
    acc.processEvent(run2);
    acc.trackSubAgentExecution(run2);

    const done = toolCallEvent("tc-explore", "task", "completed", "r1", { result: "explored" });
    acc.processEvent(done);
    acc.trackSubAgentExecution(done);

    expect(countToolCallsWithId(messages, "tc-explore")).toBe(1);
    expect(acc.subAgentExecutions).toHaveLength(1);
    expect(acc.subAgentExecutions[0].id).toBe("tc-explore");
  });
});
|
|
712
|
+
|
|
620
713
|
describe("cancelInProgressSubAgentProtos standalone", () => {
|
|
621
714
|
it("cancels IN_PROGRESS/PENDING protos in place and reports whether anything changed", () => {
|
|
622
715
|
const running = create(SubAgentExecutionSchema, {
|
|
@@ -7,12 +7,20 @@
|
|
|
7
7
|
* collide. These invariants are correctness-critical, hence the explicit tests.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { describe, it, expect, afterEach } from "vitest";
|
|
10
|
+
import { describe, it, expect, afterEach, vi } from "vitest";
|
|
11
11
|
import { mkdtempSync, rmSync, existsSync } from "node:fs";
|
|
12
12
|
import { tmpdir } from "node:os";
|
|
13
13
|
import { join } from "node:path";
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
// Replace the SDK's Agent with spies so the tests can inspect the options
// passed to create/resume without starting a real agent.
vi.mock("@cursor/sdk", () => {
  const create = vi.fn(async () => ({ agentId: "agent-created" }));
  const resume = vi.fn(async () => ({ agentId: "agent-resumed" }));
  return { Agent: { create, resume } };
});
|
|
21
|
+
|
|
22
|
+
import { Agent } from "@cursor/sdk";
|
|
23
|
+
import { resolvePlatformOptions, createAgent, resumeAgent } from "../session-lifecycle.js";
|
|
16
24
|
|
|
17
25
|
const tempRoots: string[] = [];
|
|
18
26
|
|
|
@@ -63,3 +71,66 @@ describe("resolvePlatformOptions", () => {
|
|
|
63
71
|
expect(() => resolvePlatformOptions("ses-123", "")).toThrow(/workspaceRootDir is required/);
|
|
64
72
|
});
|
|
65
73
|
});
|
|
74
|
+
|
|
75
|
+
// ---------------------------------------------------------------------------
|
|
76
|
+
// Workspace binding across create/resume
|
|
77
|
+
//
|
|
78
|
+
// Regression: Agent.resume() does not persist local.cwd. When resumeAgent()
|
|
79
|
+
// omitted it, the SDK fell back to process.cwd() — re-rooting the resumed
|
|
80
|
+
// agent in the runner's own working directory and loading the "project"
|
|
81
|
+
// setting source (the .cursor/hooks.json carrying the HITL approval hook)
|
|
82
|
+
// from that wrong directory. Result: on every resumed turn, file edits and
|
|
83
|
+
// shell commands ran unguarded with no approval card (observed in production
|
|
84
|
+
// execution aex_01ktr5na07f5xtmn0dz3mfjtdp).
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
describe("workspace binding on create/resume", () => {
  const baseOptions = {
    apiKey: "key",
    sessionId: "ses-cwd-test",
    model: "gpt-test",
  };

  // Most recent argument list recorded by each mocked SDK entry point.
  const lastCreateArgs = () => vi.mocked(Agent.create).mock.calls.at(-1)!;
  const lastResumeArgs = () => vi.mocked(Agent.resume).mock.calls.at(-1)!;

  it("createAgent passes the single workspace dir as local.cwd", async () => {
    const workspaceRootDir = freshWorkspaceRoot();
    await createAgent({
      ...baseOptions,
      workspaceDirs: ["/work/repo-a"],
      workspaceRootDir,
    });

    const callOptions = lastCreateArgs()[0] as any;
    expect(callOptions.local.cwd).toBe("/work/repo-a");
    expect(callOptions.local.settingSources).toContain("project");
  });

  it("resumeAgent re-supplies local.cwd (not persisted by Agent.resume)", async () => {
    const workspaceRootDir = freshWorkspaceRoot();
    await resumeAgent({
      ...baseOptions,
      agentId: "agent-123",
      workspaceDirs: ["/work/repo-a"],
      workspaceRootDir,
    });

    const [agentId, callOptions] = lastResumeArgs() as [string, any];
    expect(agentId).toBe("agent-123");
    // The load-bearing assertion: without cwd the SDK re-roots the agent at
    // process.cwd() and the project HITL hook never loads on resumed turns.
    expect(callOptions.local.cwd).toBe("/work/repo-a");
    expect(callOptions.local.settingSources).toContain("project");
  });

  it("resumeAgent passes multiple workspace dirs as an array cwd", async () => {
    const workspaceRootDir = freshWorkspaceRoot();
    await resumeAgent({
      ...baseOptions,
      agentId: "agent-456",
      workspaceDirs: ["/work/repo-a", "/work/repo-b"],
      workspaceRootDir,
    });

    const callOptions = lastResumeArgs()[1] as any;
    expect(callOptions.local.cwd).toEqual(["/work/repo-a", "/work/repo-b"]);
  });
});
|
|
@@ -18,7 +18,9 @@
|
|
|
18
18
|
|
|
19
19
|
import type { ToolApprovalPolicy } from "@stigmer/protos/ai/stigmer/agentic/mcpserver/v1/spec_pb";
|
|
20
20
|
import type { ToolApprovalOverride } from "@stigmer/protos/ai/stigmer/agentic/agent/v1/spec_pb";
|
|
21
|
+
import { ToolKind } from "@stigmer/protos/ai/stigmer/agentic/agentexecution/v1/enum_pb";
|
|
21
22
|
import type { ResolvedMcpServer } from "./mcp-resolver.js";
|
|
23
|
+
import { classifyTool } from "../../shared/tool-kind.js";
|
|
22
24
|
|
|
23
25
|
/**
|
|
24
26
|
* A single tool's merged approval decision after evaluating all policy layers.
|
|
@@ -31,61 +33,141 @@ export interface MergedToolPolicy {
|
|
|
31
33
|
}
|
|
32
34
|
|
|
33
35
|
/**
|
|
34
|
-
* Built-in
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
36
|
+
* Built-in Cursor tools the preToolUse hook gates, named as the hook receives
|
|
37
|
+
* them.
|
|
38
|
+
*
|
|
39
|
+
* Critical: the Cursor preToolUse hook and the SDK event stream use DIFFERENT
|
|
40
|
+
* tool taxonomies for the same operation. The hook's `tool_name` is PascalCase
|
|
41
|
+
* (`Write` for any file create/edit, `Shell`, `Delete`); the stream's
|
|
42
|
+
* `event.name` is lowercase (`edit`, `shell`, `delete`). This set is the HOOK
|
|
43
|
+
* taxonomy because it is consulted only to build the hook's gated set and its
|
|
44
|
+
* name->category mapping. Cross-layer correlation never compares these raw
|
|
45
|
+
* names — it uses {@link approvalCategory} (see below).
|
|
40
46
|
*/
|
|
41
|
-
const BUILT_IN_GATED = new
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
+
const BUILT_IN_GATED: ReadonlySet<string> = new Set([
|
|
48
|
+
"Write",
|
|
49
|
+
"StrReplace",
|
|
50
|
+
"EditNotebook",
|
|
51
|
+
"Shell",
|
|
52
|
+
"Delete",
|
|
47
53
|
]);
|
|
48
54
|
|
|
55
|
+
/**
 * Canonical approval category shared by the hook taxonomy
 * (`Write`/`Shell`/`Delete`) and the stream taxonomy (`edit`/`shell`/`delete`).
 */
export type ApprovalCategory = "write" | "delete" | "shell";

/**
 * Maps a tool name from EITHER taxonomy onto its canonical approval category
 * using the shared {@link classifyTool}.
 *
 * The hook and the stream name the same operation differently, so neither raw
 * name is a stable cross-layer identity. Collapsing both onto one category is
 * what lets the denial ledger (recorded by the hook) correlate to the streamed
 * tool call (read by the runner), and lets an approval grant match the agent's
 * re-attempt on reinvocation regardless of which taxonomy named it.
 *
 * `FILE_WRITE` and `FILE_EDIT` both yield `write` because the Cursor hook
 * reports every file mutation — create or edit — as `Write`.
 *
 * @param toolName tool name as reported by the hook or by the event stream.
 * @returns the category, or undefined for non-gated tools (read-only
 *   built-ins, MCP tools, and anything `classifyTool` does not place in a
 *   mutating kind).
 */
export function approvalCategory(toolName: string): ApprovalCategory | undefined {
  const kind = classifyTool(toolName);

  if (kind === ToolKind.FILE_WRITE || kind === ToolKind.FILE_EDIT) return "write";
  if (kind === ToolKind.FILE_DELETE) return "delete";
  if (kind === ToolKind.SHELL) return "shell";

  return undefined;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Approval-message template for each canonical category.
 *
 * Templates are keyed by category rather than raw tool name, so a denial
 * surfaced under either taxonomy renders the same text. The `{{args.path}}`
 * and `{{args.command}}` placeholders use the stream-side field names, since
 * the runner builds the approval surface from the streamed tool call; they are
 * filled in from the tool args by {@link resolveApprovalMessage}.
 */
const CATEGORY_APPROVAL_MESSAGE: Record<ApprovalCategory, string> = {
  // File create/edit — both are reported by the hook as `Write`.
  write: "Write file: {{args.path}}",
  delete: "Delete: {{args.path}}",
  shell: "Run command: {{args.command}}",
};
|
|
100
|
+
|
|
49
101
|
/**
|
|
50
102
|
* Top-level tool-argument fields, in priority order, that identify the specific
|
|
51
|
-
* resource a built-in tool acts on.
|
|
52
|
-
*
|
|
53
|
-
*
|
|
54
|
-
*
|
|
103
|
+
* resource a built-in tool acts on. The list deliberately spans BOTH taxonomies'
|
|
104
|
+
* arg shapes: the hook input names a file `file_path` and the stream names it
|
|
105
|
+
* `path`; both name a shell command `command`. Extracting the same resource
|
|
106
|
+
* VALUE on both sides (the absolute path / the command string) is what lets the
|
|
107
|
+
* hook-recorded denial token equal the stream-computed token. Authored here once
|
|
108
|
+
* and injected into the generated preToolUse hook script so the runner and the
|
|
109
|
+
* hook never disagree on which field to match.
|
|
55
110
|
*/
|
|
56
|
-
export const SALIENT_ARG_FIELDS = ["
|
|
111
|
+
export const SALIENT_ARG_FIELDS = [
  "file_path", // file target as named in the hook input
  "path", // file target as named in the stream args
  "target_notebook", // NOTE(review): presumably the notebook-edit target — confirm
  "command", // shell command (same name in hook input and stream args)
] as const;
|
|
57
112
|
|
|
58
113
|
/**
|
|
59
114
|
* Check whether a built-in (non-MCP) Cursor tool requires user approval.
|
|
60
115
|
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
*
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
*
|
|
116
|
+
* Resolved via {@link approvalCategory} so it answers correctly for BOTH
|
|
117
|
+
* taxonomies — the hook's `Write`/`Shell`/`Delete` and the stream's
|
|
118
|
+
* `edit`/`shell`/`delete` all return true. Only mutating/destructive tools are
|
|
119
|
+
* gated; everything else (read-only built-ins, and — at the hook layer —
|
|
120
|
+
* auto-approved MCP tools) is allowed. This "gate the dangerous set, allow the
|
|
121
|
+
* rest" model mirrors the native harness's resolveToolApproval. It is
|
|
122
|
+
* deliberately fail-OPEN for unknown tools: the merged MCP policy map carries
|
|
123
|
+
* only the tools that REQUIRE approval, so a fail-closed default would wrongly
|
|
124
|
+
* deny every auto-approved MCP tool, which the hook cannot distinguish from an
|
|
125
|
+
* unknown built-in by name.
|
|
69
126
|
*/
|
|
70
127
|
export function builtInRequiresApproval(toolName: string): boolean {
|
|
71
|
-
return
|
|
128
|
+
return approvalCategory(toolName) !== undefined;
|
|
72
129
|
}
|
|
73
130
|
|
|
74
131
|
/**
|
|
75
132
|
* Returns the built-in tool names that require approval (the gated set the
|
|
76
133
|
* preToolUse hook denies unless auto-approved or granted on reinvocation).
|
|
134
|
+
*
|
|
135
|
+
* These are HOOK-taxonomy names (PascalCase), because the hook matches its own
|
|
136
|
+
* `tool_name`. See {@link approvalCategory} for the cross-layer identity.
|
|
77
137
|
*/
|
|
78
138
|
export function getBuiltInGatedList(): string[] {
|
|
79
|
-
return [...BUILT_IN_GATED
|
|
139
|
+
return [...BUILT_IN_GATED];
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
 * Lists the gated built-in tools as `[hookToolName, category]` pairs, in the
 * iteration order of the gated set.
 *
 * The pairs are injected into the generated preToolUse hook so the bash script
 * can translate its incoming `tool_name` into the canonical category used for
 * the denial/grant token — the same category the runner derives on the stream
 * side through {@link approvalCategory}. Keeping the mapping here makes it
 * single-sourced.
 *
 * A gated name that resolves to no category would be a programming error; such
 * a name is dropped from the result (so it simply is not gated) rather than
 * crashing the hook.
 *
 * @returns a fresh array of pairs; the caller may mutate it.
 */
export function getBuiltInGatedCategories(): Array<[string, ApprovalCategory]> {
  return [...BUILT_IN_GATED].flatMap((hookName): Array<[string, ApprovalCategory]> => {
    const category = approvalCategory(hookName);
    // Zero-or-one element per name: uncategorised names contribute nothing.
    return category ? [[hookName, category]] : [];
  });
}
|
|
81
160
|
|
|
82
161
|
/**
|
|
83
|
-
* Approval-message template for a gated built-in tool, or
|
|
84
|
-
* tool is not
|
|
85
|
-
*
|
|
162
|
+
* Approval-message template for a gated built-in tool (either taxonomy), or
|
|
163
|
+
* undefined when the tool is not gated. Resolved via {@link approvalCategory}
|
|
164
|
+
* so stream-side names (`edit`/`shell`/`delete`) and hook-side names
|
|
165
|
+
* (`Write`/`Shell`/`Delete`) both map to the same template. Callers resolve the
|
|
166
|
+
* placeholders against the tool args via resolveApprovalMessage.
|
|
86
167
|
*/
|
|
87
168
|
export function getBuiltInApprovalMessage(toolName: string): string | undefined {
|
|
88
|
-
|
|
169
|
+
const category = approvalCategory(toolName);
|
|
170
|
+
return category ? CATEGORY_APPROVAL_MESSAGE[category] : undefined;
|
|
89
171
|
}
|
|
90
172
|
|
|
91
173
|
/**
|