@sean.holung/minicode 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/README.md +48 -43
  2. package/dist/scripts/run-benchmarks.js +147 -0
  3. package/dist/src/agent/config.js +149 -40
  4. package/dist/src/agent/editable-config.js +314 -0
  5. package/dist/src/analysis/structural-analysis.js +379 -0
  6. package/dist/src/benchmark/evaluator.js +79 -0
  7. package/dist/src/benchmark/index.js +4 -0
  8. package/dist/src/benchmark/reporter.js +177 -0
  9. package/dist/src/benchmark/runner.js +100 -0
  10. package/dist/src/benchmark/task-loader.js +78 -0
  11. package/dist/src/benchmark/types.js +5 -0
  12. package/dist/src/cli/args.js +10 -0
  13. package/dist/src/cli/config-slash-command.js +135 -0
  14. package/dist/src/cli/plugin-install.js +69 -0
  15. package/dist/src/index.js +76 -6
  16. package/dist/src/indexer/cache.js +6 -4
  17. package/dist/src/indexer/code-map.js +41 -13
  18. package/dist/src/indexer/plugins/typescript.js +70 -23
  19. package/dist/src/indexer/project-index.js +175 -36
  20. package/dist/src/indexer/symbol-names.js +92 -0
  21. package/dist/src/model-utils.js +18 -0
  22. package/dist/src/serve/agent-bridge.js +203 -24
  23. package/dist/src/serve/mcp-server.js +405 -0
  24. package/dist/src/serve/server.js +165 -10
  25. package/dist/src/serve/websocket.js +8 -0
  26. package/dist/src/shared/graph-styles.js +119 -0
  27. package/dist/src/tools/find-path.js +75 -0
  28. package/dist/src/tools/find-references.js +7 -2
  29. package/dist/src/tools/get-dependencies.js +3 -2
  30. package/dist/src/tools/read-symbol.js +12 -5
  31. package/dist/src/tools/registry.js +3 -1
  32. package/dist/src/tools/search-code-map.js +4 -2
  33. package/dist/src/ui/app.js +1 -1
  34. package/dist/src/ui/cli-ink.js +79 -4
  35. package/dist/src/ui/components/header-bar.js +6 -2
  36. package/dist/src/ui/state/ui-store.js +5 -0
  37. package/dist/src/web/app.js +1124 -176
  38. package/dist/src/web/index.html +113 -3
  39. package/dist/src/web/style.css +973 -55
  40. package/dist/tests/agent.test.js +31 -0
  41. package/dist/tests/analysis-helpers.test.js +89 -0
  42. package/dist/tests/analysis-ui.test.js +29 -0
  43. package/dist/tests/benchmark-harness.test.js +527 -0
  44. package/dist/tests/config-api.test.js +143 -0
  45. package/dist/tests/config-integration.test.js +751 -0
  46. package/dist/tests/config-slash-command.test.js +106 -0
  47. package/dist/tests/config.test.js +42 -1
  48. package/dist/tests/context-indicator.test.js +220 -0
  49. package/dist/tests/editable-config.test.js +109 -0
  50. package/dist/tests/find-path.test.js +183 -0
  51. package/dist/tests/focus-tracker.test.js +62 -0
  52. package/dist/tests/graph-onboarding.test.js +55 -0
  53. package/dist/tests/graph-styles.test.js +65 -0
  54. package/dist/tests/indexer.test.js +137 -0
  55. package/dist/tests/mcp-and-plugin.test.js +186 -0
  56. package/dist/tests/model-client-openai.test.js +29 -0
  57. package/dist/tests/model-selection.test.js +136 -0
  58. package/dist/tests/model-utils.test.js +22 -0
  59. package/dist/tests/reasoning-effort.test.js +264 -0
  60. package/dist/tests/run-benchmarks.test.js +161 -0
  61. package/dist/tests/search-code-map.test.js +18 -0
  62. package/dist/tests/serve.integration.test.js +218 -2
  63. package/dist/tests/session-ui.test.js +21 -0
  64. package/dist/tests/session.test.js +50 -0
  65. package/dist/tests/settings-ui.test.js +30 -0
  66. package/dist/tests/structural-analysis.test.js +218 -0
  67. package/node_modules/@minicode/agent-sdk/README.md +80 -51
  68. package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.d.ts +16 -5
  69. package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.d.ts.map +1 -1
  70. package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.js +51 -33
  71. package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.js.map +1 -1
  72. package/node_modules/@minicode/agent-sdk/dist/src/agent/types.d.ts +14 -0
  73. package/node_modules/@minicode/agent-sdk/dist/src/agent/types.d.ts.map +1 -1
  74. package/node_modules/@minicode/agent-sdk/dist/src/index.d.ts +3 -2
  75. package/node_modules/@minicode/agent-sdk/dist/src/index.d.ts.map +1 -1
  76. package/node_modules/@minicode/agent-sdk/dist/src/index.js +2 -0
  77. package/node_modules/@minicode/agent-sdk/dist/src/index.js.map +1 -1
  78. package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.d.ts +35 -0
  79. package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.d.ts.map +1 -0
  80. package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.js +64 -0
  81. package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.js.map +1 -0
  82. package/node_modules/@minicode/agent-sdk/dist/src/indexer/types.d.ts +7 -0
  83. package/node_modules/@minicode/agent-sdk/dist/src/indexer/types.d.ts.map +1 -1
  84. package/node_modules/@minicode/agent-sdk/dist/src/model/client.d.ts +5 -1
  85. package/node_modules/@minicode/agent-sdk/dist/src/model/client.d.ts.map +1 -1
  86. package/node_modules/@minicode/agent-sdk/dist/src/model/client.js +83 -11
  87. package/node_modules/@minicode/agent-sdk/dist/src/model/client.js.map +1 -1
  88. package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.d.ts +1 -0
  89. package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.d.ts.map +1 -1
  90. package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.js +8 -1
  91. package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.js.map +1 -1
  92. package/node_modules/@minicode/agent-sdk/dist/src/session/session.d.ts.map +1 -1
  93. package/node_modules/@minicode/agent-sdk/dist/src/session/session.js +4 -1
  94. package/node_modules/@minicode/agent-sdk/dist/src/session/session.js.map +1 -1
  95. package/node_modules/@minicode/agent-sdk/dist/tests/agent.test.js +3 -1
  96. package/node_modules/@minicode/agent-sdk/dist/tests/agent.test.js.map +1 -1
  97. package/node_modules/@minicode/agent-sdk/dist/tests/guardrails.test.js +8 -2
  98. package/node_modules/@minicode/agent-sdk/dist/tests/guardrails.test.js.map +1 -1
  99. package/node_modules/@minicode/agent-sdk/dist/tsconfig.tsbuildinfo +1 -1
  100. package/package.json +9 -5
  101. package/plugin/.claude-plugin/plugin.json +12 -0
  102. package/plugin/.mcp.json +8 -0
  103. package/plugin/CLAUDE.md +26 -0
  104. package/plugin/skills/analyze/SKILL.md +12 -0
  105. package/plugin/skills/focus/SKILL.md +20 -0
  106. package/plugin/skills/graph/SKILL.md +13 -0
  107. package/plugin/skills/symbols/SKILL.md +13 -0
@@ -29,6 +29,22 @@ class RepeatingModelClient {
29
29
  };
30
30
  }
31
31
  }
32
+ class InfiniteToolModelClient {
33
+ callCount = 0;
34
+ getCalls() {
35
+ return this.callCount;
36
+ }
37
+ async chat(params) {
38
+ void params;
39
+ this.callCount += 1;
40
+ return {
41
+ text: `step ${this.callCount}`,
42
+ toolCalls: [{ id: `tool-${this.callCount}`, name: "echo_tool", input: { value: String(this.callCount) } }],
43
+ stopReason: "tool_use",
44
+ usage: { inputTokens: 1, outputTokens: 1 },
45
+ };
46
+ }
47
+ }
32
48
  function createEchoTool() {
33
49
  return {
34
50
  name: "echo_tool",
@@ -81,6 +97,21 @@ test("agent stops on repeated identical tool calls", async () => {
81
97
  const { text } = await agent.runTurn("Do something");
82
98
  assert.match(text, /repeated identical tool calls/);
83
99
  });
100
+ test("agent tells the user how to continue when the turn call limit is reached", async () => {
101
+ const config = createTestAgentConfig("/tmp");
102
+ config.maxSteps = 2;
103
+ const modelClient = new InfiniteToolModelClient();
104
+ const agent = new CodingAgent({
105
+ config,
106
+ modelClient,
107
+ toolRegistry: new ToolRegistry([createEchoTool()]),
108
+ });
109
+ const { text } = await agent.runTurn("Keep working");
110
+ assert.match(text, /turn call limit/);
111
+ assert.match(text, /Type "continue"/);
112
+ assert.match(text, /Settings/);
113
+ assert.equal(modelClient.getCalls(), 2);
114
+ });
84
115
  test("agent omits code map when projectIndex is not provided", async () => {
85
116
  let capturedSystem = "";
86
117
  const spyClient = {
@@ -0,0 +1,89 @@
1
+ import assert from "node:assert/strict";
2
+ import { test } from "node:test";
3
+ import { buildFindingGraphContext, buildFindingMetricChips, countFindingsByType, filterFindings, findingTypeLabel, } from "../src/web/analysis-helpers.js";
4
+ test("buildFindingGraphContext limits cycle highlights to internal cycle edges", () => {
5
+ const finding = {
6
+ id: "cycle:alpha->beta",
7
+ type: "cycle",
8
+ severity: "warning",
9
+ title: "Cycle across 2 symbols",
10
+ summary: "alpha and beta are mutually dependent.",
11
+ symbols: ["alpha", "beta"],
12
+ files: ["src/a.ts", "src/b.ts"],
13
+ metrics: { cycleSize: 2, edgeCount: 2, fileCount: 2 },
14
+ rationale: ["Strongly connected component detected."],
15
+ };
16
+ const context = buildFindingGraphContext(finding, [
17
+ { source: "alpha", target: "beta", kind: "calls" },
18
+ { source: "beta", target: "alpha", kind: "calls" },
19
+ { source: "alpha", target: "gamma", kind: "calls" },
20
+ ]);
21
+ assert.deepEqual(context.nodes, ["alpha", "beta"]);
22
+ assert.deepEqual(context.edgeIds, [
23
+ "alpha->beta:calls",
24
+ "beta->alpha:calls",
25
+ ]);
26
+ });
27
+ test("buildFindingGraphContext includes incident edges for hotspot-style findings", () => {
28
+ const finding = {
29
+ id: "hotspot:service",
30
+ type: "hotspot",
31
+ severity: "info",
32
+ title: "service is a structural hotspot",
33
+ summary: "service has total degree 4.",
34
+ symbols: ["service"],
35
+ files: ["src/service.ts"],
36
+ metrics: { totalDegree: 4, fanIn: 1, fanOut: 3, threshold: 4 },
37
+ rationale: ["Total degree exceeds hotspot threshold."],
38
+ };
39
+ const context = buildFindingGraphContext(finding, [
40
+ { source: "entry", target: "service", kind: "calls" },
41
+ { source: "service", target: "repo", kind: "calls" },
42
+ { source: "service", target: "util", kind: "calls" },
43
+ { source: "other", target: "elsewhere", kind: "calls" },
44
+ ]);
45
+ assert.deepEqual(context.nodes, ["service"]);
46
+ assert.deepEqual(context.edgeIds, [
47
+ "entry->service:calls",
48
+ "service->repo:calls",
49
+ "service->util:calls",
50
+ ]);
51
+ });
52
+ test("analysis helpers summarize types and metrics for rendering", () => {
53
+ const findings = [
54
+ {
55
+ id: "fanin:util",
56
+ type: "fanInOutlier",
57
+ severity: "info",
58
+ title: "util has high fan-in",
59
+ summary: "util is widely referenced.",
60
+ symbols: ["util"],
61
+ files: ["src/util.ts"],
62
+ metrics: { fanIn: 5, threshold: 3 },
63
+ rationale: [],
64
+ },
65
+ {
66
+ id: "file:service",
67
+ type: "fileCoupling",
68
+ severity: "warning",
69
+ title: "service.ts is highly coupled",
70
+ summary: "service.ts has high afferent/efferent coupling.",
71
+ symbols: ["service"],
72
+ files: ["src/service.ts"],
73
+ metrics: { totalCoupling: 4, instability: 0.75 },
74
+ rationale: [],
75
+ },
76
+ ];
77
+ assert.deepEqual(countFindingsByType(findings), {
78
+ cycle: 0,
79
+ fanInOutlier: 1,
80
+ fanOutOutlier: 0,
81
+ hotspot: 0,
82
+ fileCoupling: 1,
83
+ });
84
+ assert.deepEqual(buildFindingMetricChips(findings[0]), ["fan-in 5", "threshold 3"]);
85
+ assert.deepEqual(buildFindingMetricChips(findings[1]), ["coupling 4", "instability 0.75"]);
86
+ assert.equal(findingTypeLabel("fileCoupling"), "File coupling");
87
+ assert.deepEqual(filterFindings(findings, "all").map((finding) => finding.id), ["fanin:util", "file:service"]);
88
+ assert.deepEqual(filterFindings(findings, "fileCoupling").map((finding) => finding.id), ["file:service"]);
89
+ });
@@ -0,0 +1,29 @@
1
+ import assert from "node:assert/strict";
2
+ import { readFileSync } from "node:fs";
3
+ import { join } from "node:path";
4
+ import { test } from "node:test";
5
+ const distWeb = join(import.meta.dirname, "..", "dist", "src", "web");
6
+ test("built HTML contains analysis entry point and drawer shell", () => {
7
+ const html = readFileSync(join(distWeb, "index.html"), "utf8");
8
+ assert.ok(html.includes('id="graph-analyze"'), "HTML should contain the Analyze toolbar button");
9
+ assert.ok(html.includes('id="analysis-panel"'), "HTML should contain the analysis panel");
10
+ assert.ok(html.includes("These signals come from the dependency graph itself, not from agent judgment."), "HTML should explain the deterministic scope of the analysis");
11
+ });
12
+ test("built CSS contains analysis drawer and finding card styles", () => {
13
+ const css = readFileSync(join(distWeb, "style.css"), "utf8");
14
+ assert.ok(css.includes("#analysis-panel"), "CSS should contain analysis panel styles");
15
+ assert.ok(css.includes(".analysis-finding"), "CSS should contain finding card styles");
16
+ assert.ok(css.includes(".analysis-summary-card"), "CSS should contain summary card styles");
17
+ assert.ok(css.includes(".analysis-summary-card.active"), "CSS should style the active analysis filter");
18
+ assert.ok(css.includes(".analysis-explanation"), "CSS should contain AI explanation styles");
19
+ });
20
+ test("built JS contains structural analysis loading and highlighting logic", () => {
21
+ const js = readFileSync(join(distWeb, "app.js"), "utf8");
22
+ assert.ok(js.includes("/api/analysis"), "JS should fetch the analysis API");
23
+ assert.ok(js.includes("/api/analysis/explain"), "JS should call the analysis explanation API");
24
+ assert.ok(js.includes("analysis-selected"), "JS should apply selected analysis highlight classes");
25
+ assert.ok(js.includes("graph-derived structural signals"), "JS should surface deterministic analysis messaging");
26
+ assert.ok(js.includes("AI interpretation"), "JS should label advisory AI interpretation distinctly");
27
+ assert.ok(js.includes("activeAnalysisFilter"), "JS should track the active analysis filter");
28
+ assert.ok(js.includes("data-filter"), "JS should wire summary cards into finding filters");
29
+ });
@@ -0,0 +1,527 @@
1
+ import assert from "node:assert/strict";
2
+ import { mkdtemp, mkdir, writeFile, rm } from "node:fs/promises";
3
+ import path from "node:path";
4
+ import { tmpdir } from "node:os";
5
+ import { test } from "node:test";
6
+ import { loadBenchmarkTasks, loadBenchmarkTask } from "../src/benchmark/task-loader.js";
7
+ import { evaluate } from "../src/benchmark/evaluator.js";
8
+ import { runBenchmarkTask } from "../src/benchmark/runner.js";
9
+ import { buildReport, formatReport, compareReports, } from "../src/benchmark/reporter.js";
10
+ import { createTestAgentConfig } from "./test-utils.js";
11
+ // ─── Helpers ───────────────────────────────────────────────────
12
+ async function createTempTaskDir() {
13
+ const tmpDir = await mkdtemp(path.join(tmpdir(), "bench-test-"));
14
+ await mkdir(path.join(tmpDir, "navigation", "find-foo"), { recursive: true });
15
+ await mkdir(path.join(tmpDir, "editing", "fix-bar"), { recursive: true });
16
+ await writeFile(path.join(tmpDir, "navigation", "find-foo", "task.json"), JSON.stringify({
17
+ title: "Find foo",
18
+ prompt: "Find where foo is defined",
19
+ rubric: {
20
+ expectedOutputPatterns: ["foo"],
21
+ maxToolCalls: 5,
22
+ },
23
+ }));
24
+ await writeFile(path.join(tmpDir, "editing", "fix-bar", "task.json"), JSON.stringify({
25
+ title: "Fix bar",
26
+ prompt: "Fix the bar function",
27
+ rubric: {
28
+ expectedOutputPatterns: ["bar"],
29
+ expectedFilesRead: ["bar.ts"],
30
+ forbiddenPatterns: ["error"],
31
+ maxToolCalls: 10,
32
+ maxTotalTokens: 5000,
33
+ },
34
+ }));
35
+ return tmpDir;
36
+ }
37
+ function makeTrace(overrides = {}) {
38
+ return {
39
+ taskId: "navigation/find-foo",
40
+ model: "test-model",
41
+ variant: "baseline",
42
+ commitSha: "abc123",
43
+ response: "Found foo in src/foo.ts at line 10.",
44
+ toolCalls: [
45
+ { name: "search", input: { query: "foo" }, output: "src/foo.ts:10", durationMs: 50 },
46
+ { name: "read_file", input: { path: "src/foo.ts" }, output: "function foo() {}", durationMs: 30 },
47
+ ],
48
+ filesRead: ["src/foo.ts"],
49
+ symbolsQueried: [],
50
+ usage: { inputTokens: 500, outputTokens: 100, totalTokens: 600 },
51
+ durationMs: 200,
52
+ startedAt: new Date().toISOString(),
53
+ ...overrides,
54
+ };
55
+ }
56
+ class MockModelClient {
57
+ response;
58
+ constructor(response) {
59
+ this.response = response;
60
+ }
61
+ async chat(params) {
62
+ void params;
63
+ return {
64
+ text: this.response,
65
+ toolCalls: [],
66
+ stopReason: "end_turn",
67
+ usage: { inputTokens: 100, outputTokens: 50 },
68
+ };
69
+ }
70
+ }
71
+ class ToolCallingMockClient {
72
+ callCount = 0;
73
+ async chat(params) {
74
+ void params;
75
+ this.callCount += 1;
76
+ if (this.callCount === 1) {
77
+ return {
78
+ text: "Let me search for that.",
79
+ toolCalls: [{ id: "t1", name: "echo_tool", input: { value: "hello" } }],
80
+ stopReason: "tool_use",
81
+ usage: { inputTokens: 100, outputTokens: 50 },
82
+ };
83
+ }
84
+ return {
85
+ text: "Found: echo:hello",
86
+ toolCalls: [],
87
+ stopReason: "end_turn",
88
+ usage: { inputTokens: 150, outputTokens: 60 },
89
+ };
90
+ }
91
+ }
92
+ // ─── Task Loader Tests ─────────────────────────────────────────
93
+ test("loadBenchmarkTasks loads tasks from directory structure", async () => {
94
+ const tmpDir = await createTempTaskDir();
95
+ try {
96
+ const tasks = await loadBenchmarkTasks(tmpDir);
97
+ assert.equal(tasks.length, 2);
98
+ const first = tasks[0];
99
+ const second = tasks[1];
100
+ assert.equal(first.id, "editing/fix-bar");
101
+ assert.equal(second.id, "navigation/find-foo");
102
+ assert.equal(first.category, "editing");
103
+ assert.equal(second.category, "navigation");
104
+ assert.equal(second.prompt, "Find where foo is defined");
105
+ }
106
+ finally {
107
+ await rm(tmpDir, { recursive: true });
108
+ }
109
+ });
110
+ test("loadBenchmarkTasks returns empty array for empty directory", async () => {
111
+ const tmpDir = await mkdtemp(path.join(tmpdir(), "bench-empty-"));
112
+ try {
113
+ const tasks = await loadBenchmarkTasks(tmpDir);
114
+ assert.equal(tasks.length, 0);
115
+ }
116
+ finally {
117
+ await rm(tmpDir, { recursive: true });
118
+ }
119
+ });
120
+ test("loadBenchmarkTasks ignores invalid category directories", async () => {
121
+ const tmpDir = await mkdtemp(path.join(tmpdir(), "bench-invalid-"));
122
+ await mkdir(path.join(tmpDir, "invalid-category", "task1"), { recursive: true });
123
+ await writeFile(path.join(tmpDir, "invalid-category", "task1", "task.json"), JSON.stringify({ title: "X", prompt: "X", rubric: {} }));
124
+ try {
125
+ const tasks = await loadBenchmarkTasks(tmpDir);
126
+ assert.equal(tasks.length, 0);
127
+ }
128
+ finally {
129
+ await rm(tmpDir, { recursive: true });
130
+ }
131
+ });
132
+ test("loadBenchmarkTask loads a single task by id", async () => {
133
+ const tmpDir = await createTempTaskDir();
134
+ try {
135
+ const task = await loadBenchmarkTask(tmpDir, "navigation/find-foo");
136
+ assert.ok(task);
137
+ assert.equal(task.title, "Find foo");
138
+ assert.equal(task.category, "navigation");
139
+ }
140
+ finally {
141
+ await rm(tmpDir, { recursive: true });
142
+ }
143
+ });
144
+ test("loadBenchmarkTask returns undefined for missing task", async () => {
145
+ const tmpDir = await createTempTaskDir();
146
+ try {
147
+ const task = await loadBenchmarkTask(tmpDir, "navigation/nonexistent");
148
+ assert.equal(task, undefined);
149
+ }
150
+ finally {
151
+ await rm(tmpDir, { recursive: true });
152
+ }
153
+ });
154
+ // ─── Evaluator Tests ───────────────────────────────────────────
155
+ test("evaluate passes when all expected patterns match", () => {
156
+ const rubric = {
157
+ expectedOutputPatterns: ["foo", "line \\d+"],
158
+ };
159
+ const trace = makeTrace({ response: "Found foo at line 10" });
160
+ const result = evaluate("test/task", rubric, trace);
161
+ assert.equal(result.passed, true);
162
+ assert.equal(result.checks.length, 2);
163
+ assert.ok(result.checks.every((c) => c.passed));
164
+ });
165
+ test("evaluate fails when expected pattern is missing", () => {
166
+ const rubric = {
167
+ expectedOutputPatterns: ["bar"],
168
+ };
169
+ const trace = makeTrace({ response: "Found foo at line 10" });
170
+ const result = evaluate("test/task", rubric, trace);
171
+ assert.equal(result.passed, false);
172
+ assert.equal(result.checks[0].passed, false);
173
+ assert.ok(result.checks[0].detail?.includes("Pattern not found"));
174
+ });
175
+ test("evaluate checks expected files read", () => {
176
+ const rubric = {
177
+ expectedFilesRead: ["src/foo.ts", "src/bar.ts"],
178
+ };
179
+ const trace = makeTrace({ filesRead: ["src/foo.ts"] });
180
+ const result = evaluate("test/task", rubric, trace);
181
+ assert.equal(result.passed, false);
182
+ assert.equal(result.checks[0].passed, true);
183
+ assert.equal(result.checks[1].passed, false);
184
+ });
185
+ test("evaluate checks expected symbols queried", () => {
186
+ const rubric = {
187
+ expectedSymbols: ["buildProjectIndex"],
188
+ };
189
+ const trace = makeTrace({ symbolsQueried: ["buildProjectIndex"] });
190
+ const result = evaluate("test/task", rubric, trace);
191
+ assert.equal(result.passed, true);
192
+ });
193
+ test("evaluate checks forbidden patterns", () => {
194
+ const rubric = {
195
+ forbiddenPatterns: ["error"],
196
+ };
197
+ const traceOk = makeTrace({ response: "All good" });
198
+ const traceBad = makeTrace({ response: "There was an error" });
199
+ assert.equal(evaluate("t", rubric, traceOk).passed, true);
200
+ assert.equal(evaluate("t", rubric, traceBad).passed, false);
201
+ });
202
+ test("evaluate computes efficiency metrics", () => {
203
+ const rubric = {
204
+ maxToolCalls: 3,
205
+ maxTotalTokens: 1000,
206
+ };
207
+ const trace = makeTrace({
208
+ toolCalls: [
209
+ { name: "a", input: {}, output: "", durationMs: 10 },
210
+ { name: "b", input: {}, output: "", durationMs: 10 },
211
+ ],
212
+ usage: { inputTokens: 400, outputTokens: 200, totalTokens: 600 },
213
+ });
214
+ const result = evaluate("t", rubric, trace);
215
+ assert.equal(result.efficiency.toolCallCount, 2);
216
+ assert.equal(result.efficiency.totalTokens, 600);
217
+ assert.equal(result.efficiency.withinBudget, true);
218
+ });
219
+ test("evaluate marks over-budget when tool calls exceed limit", () => {
220
+ const rubric = { maxToolCalls: 1 };
221
+ const trace = makeTrace({
222
+ toolCalls: [
223
+ { name: "a", input: {}, output: "", durationMs: 10 },
224
+ { name: "b", input: {}, output: "", durationMs: 10 },
225
+ ],
226
+ });
227
+ const result = evaluate("t", rubric, trace);
228
+ assert.equal(result.efficiency.withinBudget, false);
229
+ });
230
+ test("evaluate marks over-budget when tokens exceed limit", () => {
231
+ const rubric = { maxTotalTokens: 100 };
232
+ const trace = makeTrace({
233
+ usage: { inputTokens: 500, outputTokens: 100, totalTokens: 600 },
234
+ });
235
+ const result = evaluate("t", rubric, trace);
236
+ assert.equal(result.efficiency.withinBudget, false);
237
+ });
238
+ test("evaluate passes with empty rubric", () => {
239
+ const rubric = {};
240
+ const trace = makeTrace();
241
+ const result = evaluate("t", rubric, trace);
242
+ assert.equal(result.passed, true);
243
+ assert.equal(result.checks.length, 0);
244
+ assert.equal(result.efficiency.withinBudget, true);
245
+ });
246
+ // ─── Runner Tests ──────────────────────────────────────────────
247
+ test("runBenchmarkTask captures trace with tool calls", async () => {
248
+ const task = {
249
+ id: "navigation/test-task",
250
+ title: "Test task",
251
+ category: "navigation",
252
+ prompt: "Find the thing",
253
+ rubric: {},
254
+ };
255
+ const echoTool = {
256
+ name: "echo_tool",
257
+ description: "Echoes a value",
258
+ inputSchema: {
259
+ type: "object",
260
+ properties: { value: { type: "string" } },
261
+ required: ["value"],
262
+ },
263
+ execute: async (input) => `echo:${String(input.value)}`,
264
+ };
265
+ const config = createTestAgentConfig(process.cwd());
266
+ const trace = await runBenchmarkTask(task, {
267
+ modelClient: new ToolCallingMockClient(),
268
+ config,
269
+ tools: [echoTool],
270
+ variant: "test",
271
+ });
272
+ assert.equal(trace.taskId, "navigation/test-task");
273
+ assert.equal(trace.variant, "test");
274
+ assert.equal(trace.toolCalls.length, 1);
275
+ assert.equal(trace.toolCalls[0].name, "echo_tool");
276
+ assert.equal(trace.toolCalls[0].output, "echo:hello");
277
+ assert.ok(trace.durationMs >= 0);
278
+ assert.ok(trace.usage.totalTokens > 0);
279
+ });
280
+ test("runBenchmarkTask captures trace without tool calls", async () => {
281
+ const task = {
282
+ id: "planning/simple",
283
+ title: "Simple task",
284
+ category: "planning",
285
+ prompt: "Explain something",
286
+ rubric: {},
287
+ };
288
+ const config = createTestAgentConfig(process.cwd());
289
+ const trace = await runBenchmarkTask(task, {
290
+ modelClient: new MockModelClient("Here is my explanation about foo."),
291
+ config,
292
+ tools: [],
293
+ variant: "baseline",
294
+ });
295
+ assert.equal(trace.response, "Here is my explanation about foo.");
296
+ assert.equal(trace.toolCalls.length, 0);
297
+ assert.equal(trace.variant, "baseline");
298
+ });
299
+ test("runBenchmarkTask calls onTaskComplete callback", async () => {
300
+ const task = {
301
+ id: "nav/cb-test",
302
+ title: "Callback test",
303
+ category: "navigation",
304
+ prompt: "Do something",
305
+ rubric: {},
306
+ };
307
+ let callbackTaskId;
308
+ const config = createTestAgentConfig(process.cwd());
309
+ await runBenchmarkTask(task, {
310
+ modelClient: new MockModelClient("done"),
311
+ config,
312
+ tools: [],
313
+ variant: "v1",
314
+ onTaskComplete: (taskId) => {
315
+ callbackTaskId = taskId;
316
+ },
317
+ });
318
+ assert.equal(callbackTaskId, "nav/cb-test");
319
+ });
320
+ test("runBenchmarkTask tracks files read from read_file tool", async () => {
321
+ const task = {
322
+ id: "nav/file-track",
323
+ title: "File tracking test",
324
+ category: "navigation",
325
+ prompt: "Read a file",
326
+ rubric: {},
327
+ };
328
+ let callCount = 0;
329
+ const mockClient = {
330
+ async chat(params) {
331
+ void params;
332
+ callCount += 1;
333
+ if (callCount === 1) {
334
+ return {
335
+ text: "Reading file",
336
+ toolCalls: [{ id: "t1", name: "read_file", input: { path: "src/main.ts" } }],
337
+ stopReason: "tool_use",
338
+ usage: { inputTokens: 100, outputTokens: 50 },
339
+ };
340
+ }
341
+ return {
342
+ text: "Found it",
343
+ toolCalls: [],
344
+ stopReason: "end_turn",
345
+ usage: { inputTokens: 100, outputTokens: 50 },
346
+ };
347
+ },
348
+ };
349
+ const readFileTool = {
350
+ name: "read_file",
351
+ description: "Read a file",
352
+ inputSchema: {
353
+ type: "object",
354
+ properties: { path: { type: "string" } },
355
+ required: ["path"],
356
+ },
357
+ execute: async () => "file contents here",
358
+ };
359
+ const config = createTestAgentConfig(process.cwd());
360
+ const trace = await runBenchmarkTask(task, {
361
+ modelClient: mockClient,
362
+ config,
363
+ tools: [readFileTool],
364
+ variant: "v1",
365
+ });
366
+ assert.ok(trace.filesRead.includes("src/main.ts"));
367
+ });
368
+ test("runBenchmarkTask tracks symbols from structural tools", async () => {
369
+ const task = {
370
+ id: "nav/sym-track",
371
+ title: "Symbol tracking test",
372
+ category: "navigation",
373
+ prompt: "Find references",
374
+ rubric: {},
375
+ };
376
+ let callCount = 0;
377
+ const mockClient = {
378
+ async chat(params) {
379
+ void params;
380
+ callCount += 1;
381
+ if (callCount === 1) {
382
+ return {
383
+ text: "Looking up symbol",
384
+ toolCalls: [{ id: "t1", name: "read_symbol", input: { symbol: "CodingAgent" } }],
385
+ stopReason: "tool_use",
386
+ usage: { inputTokens: 100, outputTokens: 50 },
387
+ };
388
+ }
389
+ return {
390
+ text: "Found CodingAgent",
391
+ toolCalls: [],
392
+ stopReason: "end_turn",
393
+ usage: { inputTokens: 100, outputTokens: 50 },
394
+ };
395
+ },
396
+ };
397
+ const readSymbolTool = {
398
+ name: "read_symbol",
399
+ description: "Read a symbol",
400
+ inputSchema: {
401
+ type: "object",
402
+ properties: { symbol: { type: "string" } },
403
+ required: ["symbol"],
404
+ },
405
+ execute: async () => "class CodingAgent { ... }",
406
+ };
407
+ const config = createTestAgentConfig(process.cwd());
408
+ const trace = await runBenchmarkTask(task, {
409
+ modelClient: mockClient,
410
+ config,
411
+ tools: [readSymbolTool],
412
+ variant: "v1",
413
+ });
414
+ assert.ok(trace.symbolsQueried.includes("CodingAgent"));
415
+ });
416
+ // ─── Reporter Tests ────────────────────────────────────────────
417
+ test("buildReport generates correct summary", () => {
418
+ const tasks = [
419
+ {
420
+ id: "navigation/find-foo",
421
+ title: "Find foo",
422
+ category: "navigation",
423
+ prompt: "Find foo",
424
+ rubric: { expectedOutputPatterns: ["foo"] },
425
+ },
426
+ {
427
+ id: "editing/fix-bar",
428
+ title: "Fix bar",
429
+ category: "editing",
430
+ prompt: "Fix bar",
431
+ rubric: { expectedOutputPatterns: ["bar"] },
432
+ },
433
+ ];
434
+ const traces = [
435
+ makeTrace({ taskId: "navigation/find-foo", response: "Found foo" }),
436
+ makeTrace({ taskId: "editing/fix-bar", response: "No match here" }),
437
+ ];
438
+ const report = buildReport(tasks, traces, "baseline", "test-model");
439
+ assert.equal(report.summary.totalTasks, 2);
440
+ assert.equal(report.summary.passed, 1);
441
+ assert.equal(report.summary.failed, 1);
442
+ assert.equal(report.summary.passRate, 0.5);
443
+ const navStats = report.summary.byCategory["navigation"];
444
+ assert.ok(navStats);
445
+ assert.equal(navStats.passed, 1);
446
+ const editStats = report.summary.byCategory["editing"];
447
+ assert.ok(editStats);
448
+ assert.equal(editStats.passed, 0);
449
+ });
450
+ test("buildReport throws for unknown task id in trace", () => {
451
+ assert.throws(() => buildReport([], [makeTrace({ taskId: "unknown/task" })], "v1", "m"), /No task definition found/);
452
+ });
453
+ test("formatReport produces readable output", () => {
454
+ const tasks = [
455
+ {
456
+ id: "navigation/find-foo",
457
+ title: "Find foo",
458
+ category: "navigation",
459
+ prompt: "Find foo",
460
+ rubric: { expectedOutputPatterns: ["foo"] },
461
+ },
462
+ ];
463
+ const traces = [makeTrace({ taskId: "navigation/find-foo", response: "Found foo" })];
464
+ const report = buildReport(tasks, traces, "baseline", "test-model");
465
+ const output = formatReport(report);
466
+ assert.ok(output.includes("Benchmark Report: baseline"));
467
+ assert.ok(output.includes("Model: test-model"));
468
+ assert.ok(output.includes("1/1 passed"));
469
+ assert.ok(output.includes("[PASS] navigation/find-foo"));
470
+ });
471
+ test("compareReports shows improvements and regressions", () => {
472
+ const tasks = [
473
+ {
474
+ id: "navigation/find-foo",
475
+ title: "Find foo",
476
+ category: "navigation",
477
+ prompt: "Find foo",
478
+ rubric: { expectedOutputPatterns: ["foo"] },
479
+ },
480
+ ];
481
+ const baseTraces = [makeTrace({ taskId: "navigation/find-foo", response: "no match" })];
482
+ const candTraces = [makeTrace({ taskId: "navigation/find-foo", response: "Found foo" })];
483
+ const baseline = buildReport(tasks, baseTraces, "baseline", "test-model");
484
+ const candidate = buildReport(tasks, candTraces, "v2", "test-model");
485
+ const comparison = compareReports(baseline, candidate);
486
+ assert.ok(comparison.includes("baseline"));
487
+ assert.ok(comparison.includes("v2"));
488
+ assert.ok(comparison.includes("FIXED"));
489
+ });
490
+ test("compareReports detects regressions", () => {
491
+ const tasks = [
492
+ {
493
+ id: "navigation/find-foo",
494
+ title: "Find foo",
495
+ category: "navigation",
496
+ prompt: "Find foo",
497
+ rubric: { expectedOutputPatterns: ["foo"] },
498
+ },
499
+ ];
500
+ const baseTraces = [makeTrace({ taskId: "navigation/find-foo", response: "Found foo here" })];
501
+ const candTraces = [makeTrace({ taskId: "navigation/find-foo", response: "no match" })];
502
+ const baseline = buildReport(tasks, baseTraces, "baseline", "test-model");
503
+ const candidate = buildReport(tasks, candTraces, "v2", "test-model");
504
+ const comparison = compareReports(baseline, candidate);
505
+ assert.ok(comparison.includes("REGRESSED"));
506
+ });
507
+ // ─── Integration: Load real tasks from benchmarks/ ─────────────
508
+ test("loads real benchmark tasks from benchmarks/tasks/", async () => {
509
+ const tasksDir = path.resolve(import.meta.dirname, "..", "benchmarks", "tasks");
510
+ const tasks = await loadBenchmarkTasks(tasksDir);
511
+ assert.ok(tasks.length >= 20, `Expected at least 20 tasks, got ${tasks.length}`);
512
+ // Verify each task has required fields
513
+ for (const task of tasks) {
514
+ assert.ok(task.id, "task must have an id");
515
+ assert.ok(task.title, "task must have a title");
516
+ assert.ok(task.prompt, "task must have a prompt");
517
+ assert.ok(task.category, "task must have a category");
518
+ assert.ok(task.rubric, "task must have a rubric");
519
+ }
520
+ // Check categories are covered
521
+ const categories = new Set(tasks.map((t) => t.category));
522
+ assert.ok(categories.has("navigation"));
523
+ assert.ok(categories.has("editing"));
524
+ assert.ok(categories.has("refactors"));
525
+ assert.ok(categories.has("debugging"));
526
+ assert.ok(categories.has("planning"));
527
+ });