@bridge_gpt/mcp-server 0.1.17 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/README.md +333 -197
  2. package/build/agent-capabilities/cli.js +152 -0
  3. package/build/agent-capabilities/default-deps.js +45 -0
  4. package/build/agent-capabilities/probe-context.js +111 -0
  5. package/build/agent-capabilities/probes.js +278 -0
  6. package/build/agent-capabilities/reporter.js +50 -0
  7. package/build/agent-capabilities/runner.js +56 -0
  8. package/build/agent-capabilities/types.js +10 -0
  9. package/build/agent-launchers/claude.js +4 -4
  10. package/build/agents.generated.js +1 -1
  11. package/build/brainstorm-files.js +89 -0
  12. package/build/bridge-config.js +404 -0
  13. package/build/chain-orchestrator.js +247 -33
  14. package/build/commands.generated.js +5 -5
  15. package/build/credential-materialization.js +128 -0
  16. package/build/credential-store.js +232 -0
  17. package/build/decision-page-schema.js +39 -6
  18. package/build/decision-page-template.js +54 -18
  19. package/build/doctor.js +18 -2
  20. package/build/git-ignore-utils.js +63 -0
  21. package/build/index.js +1510 -560
  22. package/build/mcp-invoke.js +417 -0
  23. package/build/mcp-provisioning.js +249 -0
  24. package/build/mcp-registration-doctor.js +96 -0
  25. package/build/pipeline-orchestrator.js +9 -1
  26. package/build/pipeline-utils.js +33 -0
  27. package/build/pipelines.generated.js +36 -5
  28. package/build/schedule-run.js +6 -6
  29. package/build/start-tickets-prereqs.js +90 -1
  30. package/build/start-tickets.js +106 -14
  31. package/build/third-party-mcp-targets.js +75 -0
  32. package/build/version.generated.js +1 -1
  33. package/package.json +3 -3
  34. package/pipelines/full-automation.json +3 -1
  35. package/pipelines/implement-ticket.json +28 -2
  36. package/smoke-test/SMOKE-TEST.md +4 -2
@@ -0,0 +1,152 @@
1
+ /**
2
+ * CLI entry for the `agent-capabilities` subcommand:
3
+ *
4
+ * npx -y @bridge_gpt/mcp-server agent-capabilities [--agent <name|all>] [--only <ids>] [--json]
5
+ *
6
+ * Mirrors `doctor`'s shape: strict arg parsing → non-throwing collection →
7
+ * format → exit code. Exit 0 when every applicable/non-filtered probe is
8
+ * pass/skip/not-applicable; nonzero on any FAIL or HANG.
9
+ */
10
+ import { DEFAULT_AGENT_NAME, formatValidAgentNames, isAgentName, listAgentNames, } from "../agent-registry.js";
11
+ import { createDefaultAgentCapabilitiesDeps } from "./default-deps.js";
12
+ import { formatCapabilityJson, formatCapabilityReport } from "./reporter.js";
13
+ import { collectCapabilityResults, hasFailureOrHang } from "./runner.js";
14
+ import { listProbeIds } from "./probes.js";
15
/**
 * Build the help text for the `agent-capabilities` subcommand.
 *
 * The list of valid agent names, the default agent, and the valid probe ids are
 * interpolated at call time.
 *
 * @returns {string} The usage text, lines joined with "\n" (no trailing newline).
 */
export function getAgentCapabilitiesUsage() {
    const agentNames = formatValidAgentNames();
    const probeIds = listProbeIds().join(", ");

    const synopsis = [
        "Usage:",
        " npx -y @bridge_gpt/mcp-server agent-capabilities [--agent <name|all>] [--only <ids>] [--json]",
        "",
        "Empirically validates what an agent CLI can do (binary resolution, headless",
        "print mode + exit, .claude/commands resolution, preamble drift-check, output",
        "formats). Runs disposable probes in temp projects; never writes to your repo.",
        "",
    ];
    const flags = [
        "Flags:",
        ` --agent <name|all> Agent(s) to probe: ${agentNames}, or 'all' (default: ${DEFAULT_AGENT_NAME})`,
        ` --only <ids> Comma-separated probe ids to run (default: all). Valid ids:`,
        ` ${probeIds}`,
        " --json Emit JSON instead of the human report",
        " -h, --help Show this help",
        "",
    ];
    const notes = [
        "Auth: cursor-agent probes that spawn the agent require CURSOR_API_KEY (read from",
        "the environment, never printed); they SKIP when it is unset. claude uses",
        "keychain/OAuth, which cannot be verified from the environment.",
        "",
        "Exit code: 0 when every probe is pass/skip/n-a; nonzero when any probe FAILs or HANGs.",
    ];
    return [...synopsis, ...flags, ...notes].join("\n");
}
38
/**
 * Split a comma-separated flag value into its non-empty, trimmed entries.
 *
 * @param {string} raw - Raw flag value, e.g. "a, b,,c".
 * @returns {string[]} Entries in input order; blank entries are dropped.
 */
function parseListFlag(raw) {
    const entries = [];
    for (const piece of raw.split(",")) {
        const trimmed = piece.trim();
        if (trimmed.length > 0) {
            entries.push(trimmed);
        }
    }
    return entries;
}
44
/**
 * Read the value of a value-taking flag at `argv[index]`, accepting both the
 * `--flag value` and `--flag=value` spellings.
 *
 * @param {string[]} argv - Full argument vector.
 * @param {number} index - Position of the argument being inspected.
 * @param {string} flag - Flag name including its leading dashes (e.g. "--agent").
 * @returns {{ matched: false }
 *   | { matched: true, missing: true }
 *   | { matched: true, missing: false, value: string, lastIndex: number }}
 *   `matched` is false when `argv[index]` is not this flag. `missing` is true when
 *   the space-separated form has no following argument. `lastIndex` is the index of
 *   the last argument consumed (equal to `index` for the `=` form).
 */
function readFlagValue(argv, index, flag) {
    const arg = argv[index];
    const inlinePrefix = `${flag}=`;
    if (arg.startsWith(inlinePrefix)) {
        return { matched: true, missing: false, value: arg.slice(inlinePrefix.length), lastIndex: index };
    }
    if (arg !== flag) {
        return { matched: false };
    }
    if (index + 1 >= argv.length) {
        return { matched: true, missing: true };
    }
    return { matched: true, missing: false, value: argv[index + 1], lastIndex: index + 1 };
}

/**
 * Strictly parse the argv of the `agent-capabilities` subcommand.
 *
 * Never throws for bad user input; problems are reported through the returned
 * object so the caller decides how to print them.
 *
 * @param {string[]} argv - Arguments after the subcommand name.
 * @returns {{ status: "help", usage: string }
 *   | { status: "error", message: string }
 *   | { status: "ok", options: { agentSelector: string, only: (string[]|undefined), json: boolean, timeoutMs: (number|undefined) } }}
 *   `help` when `-h`/`--help` appears anywhere; `error` for an unknown flag, a
 *   positional argument, a missing value, or an invalid value; `ok` otherwise.
 */
export function parseAgentCapabilitiesArgs(argv) {
    if (argv.includes("-h") || argv.includes("--help")) {
        return { status: "help", usage: getAgentCapabilitiesUsage() };
    }
    let agentSelector = DEFAULT_AGENT_NAME;
    let only;
    let json = false;
    let timeoutMs;
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (arg === "--json") {
            json = true;
            continue;
        }

        const agentFlag = readFlagValue(argv, i, "--agent");
        if (agentFlag.matched) {
            if (agentFlag.missing) {
                return { status: "error", message: "--agent requires a value (an agent name or 'all')." };
            }
            const { value } = agentFlag;
            if (value !== "all" && !isAgentName(value)) {
                return {
                    status: "error",
                    message: `Invalid --agent value: '${value}' (allowed: ${formatValidAgentNames()}, all).`,
                };
            }
            agentSelector = value;
            i = agentFlag.lastIndex;
            continue;
        }

        const onlyFlag = readFlagValue(argv, i, "--only");
        if (onlyFlag.matched) {
            if (onlyFlag.missing) {
                return { status: "error", message: "--only requires a value (comma-separated probe ids)." };
            }
            const ids = parseListFlag(onlyFlag.value);
            const validIds = listProbeIds();
            // An explicit but empty filter (`--only ""`, `--only ,`) is almost certainly a
            // mistake; reject it instead of passing an empty id list to the runner.
            if (ids.length === 0) {
                return {
                    status: "error",
                    message: `--only requires at least one probe id (valid: ${validIds.join(", ")}).`,
                };
            }
            const valid = new Set(validIds);
            const unknown = ids.filter((id) => !valid.has(id));
            if (unknown.length > 0) {
                return {
                    status: "error",
                    message: `Unknown probe id(s): ${unknown.join(", ")} (valid: ${validIds.join(", ")}).`,
                };
            }
            only = ids;
            i = onlyFlag.lastIndex;
            continue;
        }

        const timeoutFlag = readFlagValue(argv, i, "--timeout-ms");
        if (timeoutFlag.matched) {
            if (timeoutFlag.missing) {
                return { status: "error", message: "--timeout-ms requires a numeric value." };
            }
            const { value } = timeoutFlag;
            const parsed = Number(value);
            // Rejects NaN, ±Infinity, zero and negatives (an empty string coerces to 0).
            if (!Number.isFinite(parsed) || parsed <= 0) {
                return { status: "error", message: `Invalid --timeout-ms value: '${value}'.` };
            }
            timeoutMs = parsed;
            i = timeoutFlag.lastIndex;
            continue;
        }

        if (arg.startsWith("-")) {
            return { status: "error", message: `Unknown flag: ${arg}` };
        }
        return {
            status: "error",
            message: `Unexpected positional argument: '${arg}'. agent-capabilities takes only flags.`,
        };
    }
    return { status: "ok", options: { agentSelector, only, json, timeoutMs } };
}
129
/**
 * Run the `agent-capabilities` subcommand end to end: parse argv, collect probe
 * results, print the report, and compute the process exit code.
 *
 * @param {string[]} argv - Arguments after the subcommand name.
 * @param {{ log?: Function, errorLog?: Function, deps?: object }} [overrides] -
 *   Optional replacements for stdout logging, stderr logging, and the probe deps.
 * @returns {Promise<number>} 0 after printing help; 1 on an argument error; otherwise
 *   1 when `hasFailureOrHang` reports a failure or hang and 0 when it does not.
 */
export async function runAgentCapabilitiesCli(argv, overrides = {}) {
    const log = overrides.log ?? ((m) => console.log(m));
    const errorLog = overrides.errorLog ?? ((m) => console.error(m));

    const parsed = parseAgentCapabilitiesArgs(argv);
    switch (parsed.status) {
        case "help":
            log(parsed.usage);
            return 0;
        case "error":
            errorLog(`Error: ${parsed.message}`);
            errorLog("");
            errorLog(getAgentCapabilitiesUsage());
            return 1;
        default:
            break;
    }

    const { options } = parsed;
    const deps = overrides.deps ?? createDefaultAgentCapabilitiesDeps();
    const agents = options.agentSelector === "all" ? listAgentNames() : [options.agentSelector];
    const collection = await collectCapabilityResults(deps, {
        agents,
        only: options.only,
        timeoutMs: options.timeoutMs,
    });

    const rendered = options.json ? formatCapabilityJson(collection) : formatCapabilityReport(collection);
    log(rendered);
    if (hasFailureOrHang(collection)) {
        return 1;
    }
    return 0;
}
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Production deps for the `agent-capabilities` toolkit: real subprocesses, temp
3
+ * dirs, and clock. Subprocess execution uses `execFile` (list-based, never
4
+ * `shell: true`) and forwards an `AbortSignal` with `killSignal: "SIGKILL"` so the
5
+ * timeout guard in `probe-context.ts` can actually kill a hung agent (the
6
+ * version-sensitive `cursor-agent -p` non-exit bug) instead of leaking it.
7
+ *
8
+ * Mirrors `createDefaultScheduleRunDeps()` in `schedule-run.ts`, adding the temp-dir
9
+ * filesystem ops and a unique-suffix source the probes need.
10
+ */
11
+ import { execFile } from "node:child_process";
12
+ import { mkdtemp, rm, writeFile, mkdir } from "node:fs/promises";
13
+ import os from "node:os";
14
+ import { randomBytes } from "node:crypto";
15
/**
 * Create the production dependency bundle for the capability probes: a
 * subprocess runner, temp-dir filesystem helpers, a clock, and a random suffix
 * source.
 *
 * `runCommand` never rejects on a child failure: it always resolves with
 * `{ stdout, stderr, exitCode }`. `exitCode` is the child's numeric exit code when
 * `execFile` reports one, 1 for any other error, and 0 on success.
 *
 * @returns {object} The deps object (`platform`, `env`, `runCommand`, `tmpRoot`,
 *   `mkdtemp`, `rm`, `writeFile`, `mkdir`, `now`, `uniqueSuffix`).
 */
export function createDefaultAgentCapabilitiesDeps() {
    // Map execFile's error argument onto a plain numeric exit code.
    const toExitCode = (error) => {
        if (!error) {
            return 0;
        }
        return typeof error.code === "number" ? error.code : 1;
    };

    function runCommand(file, args, options) {
        return new Promise((resolve) => {
            const execOptions = {
                cwd: options?.cwd,
                env: options?.env ?? process.env,
                signal: options?.signal,
                // SIGKILL so an aborted (timed-out) child is actually terminated.
                killSignal: "SIGKILL",
                maxBuffer: 64 * 1024 * 1024,
                encoding: "utf-8",
            };
            execFile(file, args, execOptions, (error, stdout, stderr) => {
                resolve({
                    stdout: stdout ?? "",
                    stderr: stderr ?? "",
                    exitCode: toExitCode(error),
                });
            });
        });
    }

    return {
        platform: process.platform,
        env: process.env,
        runCommand,
        tmpRoot: os.tmpdir(),
        mkdtemp: (prefix) => mkdtemp(prefix),
        rm: (target, opts) => rm(target, opts),
        writeFile: (target, data) => writeFile(target, data, "utf-8"),
        mkdir: (target, opts) => mkdir(target, opts).then(() => undefined),
        now: () => Date.now(),
        uniqueSuffix: () => randomBytes(3).toString("hex").toUpperCase(),
    };
}
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Per-agent probe context: resolves the agent binary once, mints marker tokens,
3
+ * creates auto-cleaned temp projects, and runs the agent headless behind a hard
4
+ * timeout guard.
5
+ *
6
+ * The timeout guard is the whole point of HANG handling: it starts a timer and an
7
+ * AbortController; whichever resolves first wins. If the timer fires we abort the
8
+ * child (SIGKILL via the deps runner) and report `{ kind: "hang" }` distinctly —
9
+ * so the version-sensitive `cursor-agent -p` non-exit bug surfaces as HANG rather
10
+ * than wedging the toolkit forever. This replaces the bash spike's `perl -e 'alarm'`.
11
+ */
12
+ import { join } from "node:path";
13
+ import { resolveCommandOnPath } from "../agent-launchers/claude.js";
14
+ import { DEFAULT_PROBE_TIMEOUT_MS, } from "./types.js";
15
/**
 * Build the argv array (never a shell string) for a headless `-p` invocation.
 *
 * - `cursor-agent`: `-p --output-format <fmt> --trust --workspace <cwd> <prompt>`.
 *   `--trust` is mandatory headless or the workspace-trust prompt hangs.
 * - any other agent (claude): `-p [--output-format json] <prompt>`. No working-dir
 *   argument is emitted; the caller sets cwd through the spawn options. The
 *   `--output-format` flag is added only for "json".
 *
 * @param {string} agentName - Agent identifier.
 * @param {{ prompt: string, cwd: string, outputFormat?: string }} opts - Invocation
 *   options; `outputFormat` defaults to "text".
 * @returns {string[]} Arguments to pass to the agent binary.
 */
export function buildHeadlessArgs(agentName, opts) {
    const format = opts.outputFormat ?? "text";
    if (agentName !== "cursor-agent") {
        // claude (and any positional-prompt agent without a cwd flag)
        return format === "json"
            ? ["-p", "--output-format", "json", opts.prompt]
            : ["-p", opts.prompt];
    }
    return ["-p", "--output-format", format, "--trust", "--workspace", opts.cwd, opts.prompt];
}
37
/**
 * Create the probe context for one agent, plus a `cleanup()` that removes every
 * temp directory the context created.
 *
 * The agent binary is resolved once up front. `ctx.runHeadless` runs the agent
 * behind a timer-driven AbortController: when the timer fires first the run is
 * reported as `{ kind: "hang" }`; otherwise it is `{ kind: "exited" }`, or
 * `{ kind: "spawn-error" }` when the runner rejects.
 *
 * @param {object} deps - Injected dependencies (runner, filesystem helpers, clock).
 * @param {object} agent - Agent descriptor; `command` and `name` are read here.
 * @param {number} [defaultTimeoutMs] - Timeout used when a run passes no `timeoutMs`.
 * @returns {Promise<{ ctx: object, cleanup: () => Promise<void> }>}
 */
export async function createProbeContext(deps, agent, defaultTimeoutMs = DEFAULT_PROBE_TIMEOUT_MS) {
    const pathLookupDeps = {
        platform: deps.platform,
        env: deps.env,
        runCommand: (file, args, options) => deps.runCommand(file, args, options),
    };
    const resolvedBinary = await resolveCommandOnPath(agent.command, deps.env.PATH ?? "", pathLookupDeps);

    const tempDirs = [];
    let markerSeq = 0;

    const ctx = {
        agent,
        deps,
        resolvedBinary,
        /** Mint a token of the form `<name>_<suffix>_<n>`; `n` increments per call. */
        marker(name) {
            const suffix = deps.uniqueSuffix();
            const seq = markerSeq++;
            return `${name}_${suffix}_${seq}`;
        },
        /** Create a tracked temp dir, optionally letting `seed(dir)` populate it. */
        async makeTempProject(seed) {
            const projectDir = await deps.mkdtemp(join(deps.tmpRoot, "agent-cap-"));
            tempDirs.push(projectDir);
            if (seed) {
                await seed(projectDir);
            }
            return projectDir;
        },
        /** Run the agent headless under the timeout guard and classify the outcome. */
        async runHeadless(opts) {
            const executable = resolvedBinary ?? agent.command;
            const argv = buildHeadlessArgs(agent.name, opts);
            const budgetMs = opts.timeoutMs ?? defaultTimeoutMs;

            const controller = new AbortController();
            let timedOut = false;
            const onTimeout = () => {
                timedOut = true;
                controller.abort();
            };
            const timer = setTimeout(onTimeout, budgetMs);
            const startedAt = deps.now();
            try {
                const result = await deps.runCommand(executable, argv, {
                    cwd: opts.cwd,
                    env: deps.env,
                    signal: controller.signal,
                });
                const elapsedMs = deps.now() - startedAt;
                if (timedOut) {
                    return { kind: "hang", elapsedMs, partialStdout: result.stdout ?? "" };
                }
                return {
                    kind: "exited",
                    exitCode: result.exitCode,
                    stdout: result.stdout ?? "",
                    stderr: result.stderr ?? "",
                    elapsedMs,
                };
            }
            catch (err) {
                const elapsedMs = deps.now() - startedAt;
                if (timedOut) {
                    return { kind: "hang", elapsedMs, partialStdout: "" };
                }
                const message = err instanceof Error ? err.message : String(err);
                return { kind: "spawn-error", message };
            }
            finally {
                clearTimeout(timer);
            }
        },
    };

    const cleanup = async () => {
        // Drain the list first so a second cleanup() call has nothing left to remove.
        const pending = tempDirs.splice(0);
        for (const dir of pending) {
            try {
                await deps.rm(dir, { recursive: true, force: true });
            }
            catch {
                // best-effort cleanup; a leaked temp dir must not fail the run
            }
        }
    };

    return { ctx, cleanup };
}
@@ -0,0 +1,278 @@
1
+ /**
2
+ * The structural/cheap capability probe set. Each probe is small, isolated, and
3
+ * marker-based: it seeds a disposable temp project, runs the agent headless, and
4
+ * asserts a token that only the seeded command/prompt could have produced — so a
5
+ * pass means the capability genuinely worked, not that the model guessed.
6
+ *
7
+ * Reuses the validated approach from the cursor-agent spike (baseline, the
8
+ * `.claude/commands` resolution check, and the preamble-then-command drift-check
9
+ * pattern that the schedulable-automations design depends on).
10
+ */
11
+ import { join } from "node:path";
12
+ import { buildHeadlessArgs } from "./probe-context.js";
13
/** Maximum number of UTF-16 code units of evidence kept before the ellipsis. */
const EVIDENCE_MAX = 200;

/**
 * Flatten whitespace and cap a piece of agent output for use as report evidence.
 *
 * Runs of whitespace collapse to a single space and the ends are trimmed. Text
 * longer than `EVIDENCE_MAX` code units is cut and suffixed with "…". The cut
 * never lands between the two halves of a surrogate pair, so the result does not
 * end in a lone high surrogate.
 *
 * @param {string} text - Raw output.
 * @returns {string} Flattened, possibly truncated text.
 */
function truncate(text) {
    const flat = text.replace(/\s+/g, " ").trim();
    if (flat.length <= EVIDENCE_MAX) {
        return flat;
    }
    let end = EVIDENCE_MAX;
    const lastKept = flat.charCodeAt(end - 1);
    // 0xD800–0xDBFF is the high-surrogate range; keeping it would orphan it from
    // the low surrogate that follows, so drop it as well.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
        end -= 1;
    }
    return `${flat.slice(0, end)}…`;
}
18
/**
 * Produce the markdown body of the disposable `.claude/commands/echo-test.md`
 * probe command.
 *
 * @param {string} marker - Token the command instructs the agent to emit.
 * @returns {string} Front matter plus instructions, ending with a newline.
 */
function echoCommandBody(marker) {
    const frontMatter =
        "---\n" +
        "description: Disposable no-op probe command that emits a marker token.\n" +
        "---\n";
    const instructions =
        "This is an agent-capability probe. Do not use any tools.\n" +
        `As part of your output, include the token ${marker} on its own line.\n` +
        "You may also include other content from earlier steps.\n";
    return frontMatter + instructions;
}
30
/**
 * Write `<dir>/.claude/commands/echo-test.md` containing a marker-emitting command,
 * creating the directory first.
 *
 * @param {object} ctx - Probe context; its `deps.mkdir` and `deps.writeFile` are used.
 * @param {string} dir - Project directory to seed.
 * @param {string} marker - Token embedded in the command body.
 * @returns {Promise<void>}
 */
async function seedEchoCommand(ctx, dir, marker) {
    const commandsDir = join(dir, ".claude", "commands");
    const commandFile = join(commandsDir, "echo-test.md");
    await ctx.deps.mkdir(commandsDir, { recursive: true });
    await ctx.deps.writeFile(commandFile, echoCommandBody(marker));
}
36
/**
 * Translate a guarded run that did not exit normally into a probe result.
 *
 * @param {object} run - Outcome of `runHeadless`, discriminated by `kind`.
 * @returns {object|null} A `hang` result for `kind: "hang"`, a `fail` result for
 *   `kind: "spawn-error"`, or `null` for any other kind.
 */
function nonExitedResult(run) {
    switch (run.kind) {
        case "hang": {
            const { elapsedMs, partialStdout } = run;
            return {
                status: "hang",
                detail: `agent did not exit within the timeout (${elapsedMs}ms) — likely the version-sensitive -p hang`,
                elapsedMs,
                evidence: partialStdout ? truncate(partialStdout) : undefined,
            };
        }
        case "spawn-error":
            return { status: "fail", detail: `could not spawn agent: ${run.message}` };
        default:
            return null;
    }
}
51
/**
 * Turn a guarded run into a probe result by requiring exit code 0 and the
 * presence of every expected marker in stdout.
 *
 * @param {object} run - Outcome of `runHeadless`.
 * @param {string[]} markers - Tokens that must all appear in stdout.
 * @param {string} failHint - Explanation included in failure details.
 * @returns {object} The hang/spawn-error result when the run did not exit; `fail`
 *   (with truncated stdout as evidence) on a non-zero exit or a missing marker;
 *   `pass` otherwise.
 */
function assertMarkers(run, markers, failHint) {
    const abnormal = nonExitedResult(run);
    if (abnormal !== null) {
        return abnormal;
    }
    // From here on run.kind === "exited".
    const fail = (detail) => ({
        status: "fail",
        detail,
        elapsedMs: run.elapsedMs,
        evidence: truncate(run.stdout),
    });
    if (run.exitCode !== 0) {
        return fail(`agent exited ${run.exitCode} (${failHint})`);
    }
    const missing = [];
    for (const marker of markers) {
        if (!run.stdout.includes(marker)) {
            missing.push(marker);
        }
    }
    if (missing.length > 0) {
        return fail(`${failHint}; missing marker(s): ${missing.join(", ")}`);
    }
    return { status: "pass", detail: `markers present, clean exit`, elapsedMs: run.elapsedMs };
}
77
/**
 * Probe: the agent command was resolved to a path when the context was created.
 * Does not spawn the agent. On failure the detail carries the agent's install hint
 * for darwin/linux/win32, or a generic pointer on any other platform.
 */
const binaryResolves = {
    id: "binary-resolves",
    title: "Binary resolves on PATH",
    description: "The agent command resolves to an absolute path on PATH.",
    tier: "structural",
    appliesTo: ["claude", "cursor-agent"],
    spawnsAgent: false,
    async run(ctx) {
        const located = ctx.resolvedBinary;
        if (located) {
            return { status: "pass", detail: `resolved at ${located}` };
        }
        const { platform } = ctx.deps;
        const hasPlatformHint = ["darwin", "linux", "win32"].includes(platform);
        let hint = "see the agent's install docs";
        if (hasPlatformHint) {
            hint = ctx.agent.installHint[platform];
        }
        return { status: "fail", detail: `'${ctx.agent.command}' not found on PATH. Install: ${hint}` };
    },
};
94
/**
 * Probe: `CURSOR_API_KEY` is present and non-empty in the injected environment.
 * Only presence is checked; the value is never placed in the result. An unset or
 * empty variable yields `skip`, not `fail`.
 */
const authEnvPresent = {
    id: "auth-env-present",
    title: "Headless auth credential present",
    description: "The agent's headless auth env var (CURSOR_API_KEY) is set — presence only, value never read into output.",
    tier: "structural",
    appliesTo: ["cursor-agent"],
    spawnsAgent: false,
    async run(ctx) {
        const apiKey = ctx.deps.env.CURSOR_API_KEY;
        const isSet = Boolean(apiKey) && apiKey.length > 0;
        if (!isSet) {
            return {
                status: "skip",
                detail: "CURSOR_API_KEY not set — export it to enable headless cursor-agent probes",
            };
        }
        return { status: "pass", detail: "CURSOR_API_KEY is set" };
    },
};
112
/**
 * Probe: a plain headless `-p` run in an empty temp project emits a freshly
 * minted marker and exits with code 0.
 */
const headlessPrintMode = {
    id: "headless-print-mode",
    title: "Headless print mode runs and exits",
    description: "`-p` print mode produces output and exits cleanly (a hang is reported distinctly).",
    tier: "structural",
    appliesTo: ["claude", "cursor-agent"],
    spawnsAgent: true,
    async run(ctx) {
        const token = ctx.marker("BASE_OK");
        const projectDir = await ctx.makeTempProject();
        const prompt = `Do not use any tools. Output exactly the token ${token} on its own line.`;
        const outcome = await ctx.runHeadless({ prompt, cwd: projectDir });
        return assertMarkers(outcome, [token], "headless print mode did not emit the marker");
    },
};
129
/**
 * Probe: a `/echo-test` command seeded under `.claude/commands/` in a temp project
 * is expanded, which is detected by the seeded marker appearing in stdout.
 */
const claudeCommandsResolve = {
    id: "claude-commands-resolve",
    title: "Resolves .claude/commands slash commands",
    description: "A `/echo-test` command seeded in `.claude/commands/` is expanded and its instructions run.",
    tier: "structural",
    appliesTo: ["claude", "cursor-agent"],
    spawnsAgent: true,
    async run(ctx) {
        const token = ctx.marker("ECHO_CMD_RAN");
        const seedProject = (projectDir) => seedEchoCommand(ctx, projectDir, token);
        const cwd = await ctx.makeTempProject(seedProject);
        const outcome = await ctx.runHeadless({ prompt: "/echo-test", cwd });
        return assertMarkers(outcome, [token], "agent did not expand the .claude/commands/ slash command");
    },
};
143
/** Number of samples the (inherently probabilistic) preamble probe takes. */
const PREAMBLE_ATTEMPTS = 3;

/**
 * Probe: a prompt that asks for a preamble token and then a mid-prompt
 * `/echo-test` yields BOTH tokens. The same prompt is run `PREAMBLE_ATTEMPTS`
 * times in one seeded temp project; the probe passes when at least half of the
 * attempts (`honored * 2 >= PREAMBLE_ATTEMPTS`) exit 0 with both tokens in stdout.
 * A hang ends sampling immediately. If no attempt was honored and some attempt
 * failed to spawn, that spawn failure is returned instead of the ratio.
 */
const preambleMidPrompt = {
    id: "preamble-mid-prompt",
    title: "Runs a preamble then a mid-prompt command",
    description: "A preamble step followed by a mid-prompt `/echo-test` runs BOTH (the drift-check gate pattern). " +
        "This behavior is probabilistic — slash-command expansion can suppress the preceding text — so it is " +
        "sampled several times and reported as a ratio (majority passes).",
    tier: "structural",
    appliesTo: ["claude", "cursor-agent"],
    spawnsAgent: true,
    async run(ctx) {
        const preambleToken = ctx.marker("PREAMBLE_OK");
        const echoToken = ctx.marker("ECHO_CMD_RAN");
        const projectDir = await ctx.makeTempProject((d) => seedEchoCommand(ctx, d, echoToken));
        const prompt = `You must produce two things in your final output. First, output the token ${preambleToken} on its own line. ` +
            `Then run the /echo-test command. Your output must contain BOTH ${preambleToken} and the echo-test command's token.`;

        let honored = 0;
        let totalMs = 0;
        let lastNonExited = null;
        let remaining = PREAMBLE_ATTEMPTS;
        while (remaining > 0) {
            remaining -= 1;
            const outcome = await ctx.runHeadless({ prompt, cwd: projectDir });
            const abnormal = nonExitedResult(outcome);
            if (outcome.kind === "hang") {
                return abnormal; // a hang is decisive — stop sampling
            }
            if (abnormal) {
                lastNonExited = abnormal;
                continue;
            }
            totalMs += outcome.elapsedMs;
            const sawBoth = outcome.stdout.includes(preambleToken) && outcome.stdout.includes(echoToken);
            if (outcome.exitCode === 0 && sawBoth) {
                honored += 1;
            }
        }

        if (honored === 0 && lastNonExited) {
            return lastNonExited;
        }
        const pass = honored * 2 >= PREAMBLE_ATTEMPTS; // majority
        const ratio = `${honored}/${PREAMBLE_ATTEMPTS} runs honored preamble + mid-prompt command`;
        return {
            status: pass ? "pass" : "fail",
            detail: pass ? ratio : `${ratio} — preamble is unreliable when a slash command follows it`,
            elapsedMs: totalMs,
        };
    },
};
191
/**
 * Probe: the agent honors both output formats. The text run must pass the marker
 * check; the json run must exit 0 and print stdout that `JSON.parse` accepts. The
 * json run starts only after the text run passes, and the reported `elapsedMs` on
 * a json-stage result is that of the json run.
 */
const outputFormat = {
    id: "output-format",
    title: "Honors --output-format text and json",
    description: "Text mode emits the plain marker; json mode emits a parseable JSON envelope.",
    tier: "structural",
    appliesTo: ["claude", "cursor-agent"],
    spawnsAgent: true,
    async run(ctx) {
        const token = ctx.marker("FMT_OK");
        const cwd = await ctx.makeTempProject();

        // Stage 1: text mode must emit the marker.
        const textOutcome = await ctx.runHeadless({
            prompt: `Do not use any tools. Output exactly the token ${token} on its own line.`,
            cwd,
            outputFormat: "text",
        });
        const textVerdict = assertMarkers(textOutcome, [token], "text output-format did not emit the marker");
        if (textVerdict.status !== "pass") {
            return textVerdict;
        }

        // Stage 2: json mode must exit cleanly with parseable JSON.
        const jsonOutcome = await ctx.runHeadless({
            prompt: `Do not use any tools. Output exactly the token ${token}.`,
            cwd,
            outputFormat: "json",
        });
        const abnormal = nonExitedResult(jsonOutcome);
        if (abnormal) {
            return abnormal;
        }
        const { exitCode, stdout, elapsedMs } = jsonOutcome;
        const failWith = (detail) => ({ status: "fail", detail, elapsedMs, evidence: truncate(stdout) });
        if (exitCode !== 0) {
            return failWith(`json output-format exited ${exitCode}`);
        }
        let parseable = true;
        try {
            JSON.parse(stdout.trim());
        }
        catch {
            parseable = false;
        }
        if (!parseable) {
            return failWith("json output-format did not emit parseable JSON");
        }
        return { status: "pass", detail: "text marker present and json parseable", elapsedMs };
    },
};
230
/**
 * Probe (cursor-agent only): a headless run in a temp project emits the marker
 * and exits 0. The probe passes no flag itself; it relies on `buildHeadlessArgs`
 * adding `--workspace <dir>` for cursor-agent.
 */
const workspaceFlag = {
    id: "workspace-flag",
    title: "Accepts the --workspace working-dir flag",
    description: "cursor-agent accepts `--workspace <dir>` headless and exits cleanly.",
    tier: "structural",
    appliesTo: ["cursor-agent"],
    spawnsAgent: true,
    async run(ctx) {
        const token = ctx.marker("WS_OK");
        const workspaceDir = await ctx.makeTempProject();
        // buildHeadlessArgs always adds --workspace <dir> for cursor-agent.
        const prompt = `Do not use any tools. Output exactly the token ${token} on its own line.`;
        const outcome = await ctx.runHeadless({ prompt, cwd: workspaceDir });
        return assertMarkers(outcome, [token], "--workspace run did not complete cleanly");
    },
};
248
/**
 * Probe (claude only): the argv built for a headless run contains none of
 * `--workspace`, `--cwd`, or `-C`. This inspects `buildHeadlessArgs` output only;
 * the agent is not spawned.
 */
const noCwdFlag = {
    id: "no-cwd-flag",
    title: "Has no working-dir flag (cwd via spawn)",
    description: "Claude takes no working-dir flag; cwd is set via the spawn options, not an argument.",
    tier: "structural",
    appliesTo: ["claude"],
    spawnsAgent: false,
    async run(ctx) {
        const cwdFlags = new Set(["--workspace", "--cwd", "-C"]);
        const argv = buildHeadlessArgs(ctx.agent.name, { prompt: "noop", cwd: "/tmp/x" });
        const offenders = argv.filter((arg) => cwdFlags.has(arg));
        if (offenders.length > 0) {
            return { status: "fail", detail: `unexpected cwd flag(s) in argv: ${offenders.join(", ")}` };
        }
        return { status: "pass", detail: "no working-dir flag emitted; cwd set via spawn options" };
    },
};
264
/**
 * Every capability probe, in the order they are listed here.
 * `listProbeIds()` derives the valid `--only` ids from this array.
 */
export const ALL_PROBES = [
    // Probes declared with spawnsAgent: false come first.
    binaryResolves, authEnvPresent,
    // Headless behavior probes.
    headlessPrintMode, claudeCommandsResolve, preambleMidPrompt, outputFormat,
    // Working-directory handling (one per agent).
    workspaceFlag, noCwdFlag,
];
275
/**
 * List the ids of all probes, in `ALL_PROBES` order.
 *
 * @returns {string[]} A new array of probe ids on every call.
 */
export function listProbeIds() {
    const ids = [];
    for (const probe of ALL_PROBES) {
        ids.push(probe.id);
    }
    return ids;
}
@@ -0,0 +1,50 @@
1
/** Fixed-width (4 character) display label for each probe status. */
const LABEL = {
    pass: "PASS",
    fail: "FAIL",
    skip: "SKIP",
    hang: "HANG",
    "not-applicable": "N/A ",
};

/**
 * Render collected probe results as the human-readable report.
 *
 * Records are grouped by agent in order of first appearance, keeping their
 * original order within each group. The final line summarizes counts per status;
 * the "n/a" count is shown only when it is non-zero.
 *
 * @param {{ records: Array<{ agent: string, probeId: string, result: object }> }} collection
 * @returns {string} Report text, lines joined with "\n".
 */
export function formatCapabilityReport(collection) {
    // Map preserves insertion order, so agents come out in first-seen order.
    const recordsByAgent = new Map();
    for (const record of collection.records) {
        const bucket = recordsByAgent.get(record.agent);
        if (bucket) {
            bucket.push(record);
        }
        else {
            recordsByAgent.set(record.agent, [record]);
        }
    }

    const totals = { pass: 0, fail: 0, skip: 0, hang: 0, "not-applicable": 0 };
    const lines = ["agent-capabilities report", ""];
    for (const [agent, records] of recordsByAgent) {
        lines.push(`Agent: ${agent}`);
        for (const { probeId, result } of records) {
            totals[result.status] += 1;
            const timing = result.elapsedMs === undefined ? "" : ` (${result.elapsedMs}ms)`;
            lines.push(` [${LABEL[result.status]}] ${probeId.padEnd(24)} ${result.detail}${timing}`);
            if (result.evidence) {
                lines.push(` evidence: ${result.evidence}`);
            }
        }
        lines.push("");
    }

    const naCount = totals["not-applicable"];
    const naSuffix = naCount ? `, ${naCount} n/a` : "";
    lines.push(`Summary: ${totals.pass} pass, ${totals.fail} fail, ${totals.skip} skip, ${totals.hang} hang${naSuffix}`);
    return lines.join("\n");
}
38
/**
 * Render collected probe results as pretty-printed (2-space) JSON.
 *
 * Each record is flattened to `{ agent, probe, title, status, detail, elapsedMs,
 * evidence }`; a missing `elapsedMs` or `evidence` is emitted as `null`.
 *
 * @param {{ records: Array<{ agent: string, probeId: string, title: string, result: object }> }} collection
 * @returns {string} JSON text with a single top-level `records` array.
 */
export function formatCapabilityJson(collection) {
    const records = [];
    for (const { agent, probeId, title, result } of collection.records) {
        records.push({
            agent,
            probe: probeId,
            title,
            status: result.status,
            detail: result.detail,
            elapsedMs: result.elapsedMs ?? null,
            evidence: result.evidence ?? null,
        });
    }
    return JSON.stringify({ records }, null, 2);
}