@sean.holung/minicode 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -42
- package/dist/scripts/run-benchmarks.js +147 -0
- package/dist/src/agent/config.js +149 -40
- package/dist/src/agent/editable-config.js +314 -0
- package/dist/src/analysis/structural-analysis.js +379 -0
- package/dist/src/benchmark/evaluator.js +79 -0
- package/dist/src/benchmark/index.js +4 -0
- package/dist/src/benchmark/reporter.js +177 -0
- package/dist/src/benchmark/runner.js +100 -0
- package/dist/src/benchmark/task-loader.js +78 -0
- package/dist/src/benchmark/types.js +5 -0
- package/dist/src/cli/args.js +10 -0
- package/dist/src/cli/config-slash-command.js +135 -0
- package/dist/src/cli/plugin-install.js +69 -0
- package/dist/src/index.js +76 -6
- package/dist/src/indexer/cache.js +6 -4
- package/dist/src/indexer/code-map.js +41 -13
- package/dist/src/indexer/plugins/typescript.js +70 -23
- package/dist/src/indexer/project-index.js +175 -36
- package/dist/src/indexer/symbol-names.js +92 -0
- package/dist/src/model-utils.js +18 -0
- package/dist/src/serve/agent-bridge.js +203 -24
- package/dist/src/serve/mcp-server.js +405 -0
- package/dist/src/serve/server.js +165 -10
- package/dist/src/serve/websocket.js +8 -0
- package/dist/src/shared/graph-styles.js +119 -0
- package/dist/src/tools/find-path.js +75 -0
- package/dist/src/tools/find-references.js +7 -2
- package/dist/src/tools/get-dependencies.js +3 -2
- package/dist/src/tools/read-symbol.js +12 -5
- package/dist/src/tools/registry.js +3 -1
- package/dist/src/tools/search-code-map.js +4 -2
- package/dist/src/ui/app.js +1 -1
- package/dist/src/ui/cli-ink.js +79 -4
- package/dist/src/ui/components/header-bar.js +6 -2
- package/dist/src/ui/state/ui-store.js +5 -0
- package/dist/src/web/app.js +1124 -176
- package/dist/src/web/index.html +113 -3
- package/dist/src/web/style.css +973 -55
- package/dist/tests/agent.test.js +31 -0
- package/dist/tests/analysis-helpers.test.js +89 -0
- package/dist/tests/analysis-ui.test.js +29 -0
- package/dist/tests/benchmark-harness.test.js +527 -0
- package/dist/tests/config-api.test.js +143 -0
- package/dist/tests/config-integration.test.js +751 -0
- package/dist/tests/config-slash-command.test.js +106 -0
- package/dist/tests/config.test.js +42 -1
- package/dist/tests/context-indicator.test.js +220 -0
- package/dist/tests/editable-config.test.js +109 -0
- package/dist/tests/find-path.test.js +183 -0
- package/dist/tests/focus-tracker.test.js +62 -0
- package/dist/tests/graph-onboarding.test.js +55 -0
- package/dist/tests/graph-styles.test.js +65 -0
- package/dist/tests/indexer.test.js +137 -0
- package/dist/tests/mcp-and-plugin.test.js +186 -0
- package/dist/tests/model-client-openai.test.js +29 -0
- package/dist/tests/model-selection.test.js +136 -0
- package/dist/tests/model-utils.test.js +22 -0
- package/dist/tests/reasoning-effort.test.js +264 -0
- package/dist/tests/run-benchmarks.test.js +161 -0
- package/dist/tests/search-code-map.test.js +18 -0
- package/dist/tests/serve.integration.test.js +218 -2
- package/dist/tests/session-ui.test.js +21 -0
- package/dist/tests/session.test.js +50 -0
- package/dist/tests/settings-ui.test.js +30 -0
- package/dist/tests/structural-analysis.test.js +218 -0
- package/node_modules/@minicode/agent-sdk/README.md +80 -51
- package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.d.ts +16 -5
- package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.js +51 -33
- package/node_modules/@minicode/agent-sdk/dist/src/agent/agent.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/agent/types.d.ts +14 -0
- package/node_modules/@minicode/agent-sdk/dist/src/agent/types.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/index.d.ts +3 -2
- package/node_modules/@minicode/agent-sdk/dist/src/index.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/index.js +2 -0
- package/node_modules/@minicode/agent-sdk/dist/src/index.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.d.ts +35 -0
- package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.d.ts.map +1 -0
- package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.js +64 -0
- package/node_modules/@minicode/agent-sdk/dist/src/indexer/focus-tracker.js.map +1 -0
- package/node_modules/@minicode/agent-sdk/dist/src/indexer/types.d.ts +7 -0
- package/node_modules/@minicode/agent-sdk/dist/src/indexer/types.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/model/client.d.ts +5 -1
- package/node_modules/@minicode/agent-sdk/dist/src/model/client.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/model/client.js +83 -11
- package/node_modules/@minicode/agent-sdk/dist/src/model/client.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.d.ts +1 -0
- package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.js +8 -1
- package/node_modules/@minicode/agent-sdk/dist/src/safety/guardrails.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/session/session.d.ts.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/src/session/session.js +4 -1
- package/node_modules/@minicode/agent-sdk/dist/src/session/session.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/tests/agent.test.js +3 -1
- package/node_modules/@minicode/agent-sdk/dist/tests/agent.test.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/tests/guardrails.test.js +8 -2
- package/node_modules/@minicode/agent-sdk/dist/tests/guardrails.test.js.map +1 -1
- package/node_modules/@minicode/agent-sdk/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +9 -5
- package/plugin/.claude-plugin/plugin.json +12 -0
- package/plugin/.mcp.json +8 -0
- package/plugin/CLAUDE.md +26 -0
- package/plugin/skills/analyze/SKILL.md +12 -0
- package/plugin/skills/focus/SKILL.md +20 -0
- package/plugin/skills/graph/SKILL.md +13 -0
- package/plugin/skills/symbols/SKILL.md +13 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluates a benchmark trace against a task's rubric.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Evaluate a benchmark trace against the task rubric.
 *
 * Every rubric section that is present contributes one check per entry:
 * expected output patterns, expected files read, expected symbols queried,
 * and forbidden output patterns. The task passes only when every check
 * passes; efficiency figures are reported alongside the checks but do not
 * influence `passed`.
 *
 * @param {string} taskId - Identifier copied into the result.
 * @param {object} rubric - May contain `expectedOutputPatterns`,
 *   `expectedFilesRead`, `expectedSymbols`, `forbiddenPatterns` (arrays of
 *   strings) and the budgets `maxToolCalls` / `maxTotalTokens`.
 * @param {object} trace - Captured run; reads `response`, `filesRead`,
 *   `symbolsQueried`, `toolCalls`, `usage.totalTokens` and `durationMs`.
 * @returns {{taskId: string, passed: boolean, checks: object[], efficiency: object}}
 * @throws {SyntaxError} If a rubric pattern is not a valid regular expression.
 */
export function evaluate(taskId, rubric, trace) {
    // A missing response is treated as empty text; otherwise RegExp#test
    // would coerce it to the literal string "undefined".
    const response = trace.response ?? "";
    const checks = [];
    // 1. Check expected output patterns (case-insensitive, evaluated once each)
    for (const pattern of rubric.expectedOutputPatterns || []) {
        const matched = new RegExp(pattern, "i").test(response);
        checks.push({
            name: `output matches /${pattern}/i`,
            passed: matched,
            detail: matched ? undefined : `Pattern not found in response`,
        });
    }
    // 2. Check expected files read
    for (const expectedFile of rubric.expectedFilesRead || []) {
        const found = trace.filesRead.some((f) => matchesExpectedFile(f, expectedFile));
        checks.push({
            name: `read file ${expectedFile}`,
            passed: found,
            detail: found ? undefined : `File was not read during the run`,
        });
    }
    // 3. Check expected symbols queried (substring match on the recorded query)
    for (const sym of rubric.expectedSymbols || []) {
        const found = trace.symbolsQueried.some((s) => s === sym || s.includes(sym));
        checks.push({
            name: `queried symbol ${sym}`,
            passed: found,
            detail: found ? undefined : `Symbol was not queried`,
        });
    }
    // 4. Check forbidden patterns
    for (const pattern of rubric.forbiddenPatterns || []) {
        const absent = !new RegExp(pattern, "i").test(response);
        checks.push({
            name: `output does NOT match /${pattern}/i`,
            passed: absent,
            detail: absent ? undefined : `Forbidden pattern found in response`,
        });
    }
    // 5. Efficiency metrics
    const efficiency = computeEfficiency(rubric, trace);
    return {
        taskId,
        passed: checks.every((c) => c.passed),
        checks,
        efficiency,
    };
}
/**
 * Decide whether a recorded file path satisfies an expected path.
 *
 * Matches on an exact string, or when the expected path is a trailing run of
 * whole path segments of the recorded one, so "index.ts" is satisfied by
 * "src/index.ts" but not by "src/reindex.ts". Backslashes are treated as
 * separators and a leading "./" on the expectation is ignored.
 */
function matchesExpectedFile(actualPath, expectedPath) {
    if (actualPath === expectedPath) {
        return true;
    }
    const actual = actualPath.replace(/\\/g, "/");
    const expected = expectedPath.replace(/\\/g, "/").replace(/^\.\//, "");
    if (actual === expected) {
        return true;
    }
    return actual.endsWith(expected.startsWith("/") ? expected : `/${expected}`);
}
/**
 * Summarise the cost of a run and whether it stayed inside the rubric budgets.
 * A budget that is null or undefined counts as "no limit".
 */
function computeEfficiency(rubric, trace) {
    const toolCallCount = trace.toolCalls.length;
    const totalTokens = trace.usage.totalTokens;
    const withinToolBudget = rubric.maxToolCalls == null || toolCallCount <= rubric.maxToolCalls;
    const withinTokenBudget = rubric.maxTotalTokens == null || totalTokens <= rubric.maxTotalTokens;
    return {
        toolCallCount,
        totalTokens,
        durationMs: trace.durationMs,
        filesReadCount: trace.filesRead.length,
        symbolsQueriedCount: trace.symbolsQueried.length,
        withinBudget: withinToolBudget && withinTokenBudget,
    };
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { loadBenchmarkTask, loadBenchmarkTasks } from "./task-loader.js";
|
|
2
|
+
export { evaluate } from "./evaluator.js";
|
|
3
|
+
export { runBenchmarkTask, runBenchmarkSuite, } from "./runner.js";
|
|
4
|
+
export { buildReport, buildReportFromEvaluations, formatReport, compareReports, } from "./reporter.js";
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generates benchmark reports from evaluation results and traces.
|
|
3
|
+
*/
|
|
4
|
+
import { evaluate } from "./evaluator.js";
|
|
5
|
+
/**
 * Produce a complete benchmark report by scoring every trace against the
 * rubric of the task it belongs to.
 *
 * @param {object[]} tasks - Task definitions, looked up by `id`.
 * @param {object[]} traces - Captured runs; each `taskId` must name a task.
 * @param {string} variant - Label stored on the report.
 * @param {string} model - Model name stored on the report.
 * @returns {object} Report with `variant`, `model`, `generatedAt`, `results` and `summary`.
 * @throws {Error} When a trace refers to a task that was not supplied.
 */
export function buildReport(tasks, traces, variant, model) {
    const tasksById = new Map();
    for (const definition of tasks) {
        tasksById.set(definition.id, definition);
    }
    const results = [];
    for (const trace of traces) {
        const task = tasksById.get(trace.taskId);
        if (!task) {
            throw new Error(`No task definition found for trace: ${trace.taskId}`);
        }
        results.push({
            taskId: task.id,
            category: task.category,
            evaluation: evaluate(task.id, task.rubric, trace),
            trace,
        });
    }
    return {
        variant,
        model,
        generatedAt: new Date().toISOString(),
        results,
        summary: computeSummary(results),
    };
}
|
|
31
|
+
/**
 * Assemble a report when the evaluations have already been computed.
 *
 * Each trace is paired with the task and the evaluation that share its
 * `taskId`; nothing is re-evaluated here.
 *
 * @param {object[]} evaluations - Evaluation results, looked up by `taskId`.
 * @param {object[]} traces - Captured runs, one result is emitted per trace.
 * @param {object[]} tasks - Task definitions, looked up by `id`.
 * @param {string} variant - Label stored on the report.
 * @param {string} model - Model name stored on the report.
 * @returns {object} Report with `variant`, `model`, `generatedAt`, `results` and `summary`.
 * @throws {Error} When a trace has no matching task or no matching evaluation.
 */
export function buildReportFromEvaluations(evaluations, traces, tasks, variant, model) {
    const tasksById = new Map();
    for (const definition of tasks) {
        tasksById.set(definition.id, definition);
    }
    const evaluationsById = new Map();
    for (const item of evaluations) {
        evaluationsById.set(item.taskId, item);
    }
    const results = [];
    for (const trace of traces) {
        const task = tasksById.get(trace.taskId);
        const evaluation = evaluationsById.get(trace.taskId);
        if (!(task && evaluation)) {
            throw new Error(`Missing task or evaluation for: ${trace.taskId}`);
        }
        results.push({
            taskId: task.id,
            category: task.category,
            evaluation,
            trace,
        });
    }
    return {
        variant,
        model,
        generatedAt: new Date().toISOString(),
        results,
        summary: computeSummary(results),
    };
}
|
|
58
|
+
/**
 * Aggregate per-task results into overall and per-category pass statistics
 * plus average tool calls, tokens and duration.
 *
 * All rates and averages are 0 when there are no results.
 *
 * @param {object[]} results - Entries with `category` and `evaluation`
 *   (`passed` and `efficiency.{toolCallCount,totalTokens,durationMs}`).
 * @returns {object} Summary totals, `byCategory` map and averages.
 */
function computeSummary(results) {
    const totalTasks = results.length;
    const byCategory = {};
    let passedCount = 0;
    let toolCallSum = 0;
    let tokenSum = 0;
    let durationSum = 0;
    // Single pass: tally pass counts per category and accumulate the sums.
    for (const { category, evaluation } of results) {
        if (!byCategory[category]) {
            byCategory[category] = { total: 0, passed: 0, passRate: 0 };
        }
        const bucket = byCategory[category];
        bucket.total += 1;
        if (evaluation.passed) {
            bucket.passed += 1;
            passedCount += 1;
        }
        toolCallSum += evaluation.efficiency.toolCallCount;
        tokenSum += evaluation.efficiency.totalTokens;
        durationSum += evaluation.efficiency.durationMs;
    }
    for (const bucket of Object.values(byCategory)) {
        bucket.passRate = bucket.total > 0 ? bucket.passed / bucket.total : 0;
    }
    // Guard the divisions so an empty result list yields zeros, not NaN.
    const average = (sum) => (totalTasks > 0 ? sum / totalTasks : 0);
    return {
        totalTasks,
        passed: passedCount,
        failed: totalTasks - passedCount,
        passRate: average(passedCount),
        byCategory,
        avgToolCalls: average(toolCallSum),
        avgTotalTokens: average(tokenSum),
        avgDurationMs: average(durationSum),
    };
}
|
|
96
|
+
/**
 * Render a report as plain text for the terminal.
 *
 * Sections, in order: header, overall result, per-category pass rates,
 * average efficiency figures, then one entry per task listing its checks.
 *
 * @param {object} report - Report with `variant`, `model`, `generatedAt`,
 *   `summary` and `results`.
 * @returns {string} Newline-joined report text.
 */
export function formatReport(report) {
    const { summary } = report;
    const asPercent = (fraction) => (fraction * 100).toFixed(1);
    const categoryLines = Object.entries(summary.byCategory).map(([name, stats]) => ` ${name}: ${stats.passed}/${stats.total} (${asPercent(stats.passRate)}%)`);
    const taskLines = report.results.flatMap((result) => {
        const { evaluation } = result;
        const checkLines = evaluation.checks.map((check) => {
            const suffix = check.detail ? ` (${check.detail})` : "";
            return ` [${check.passed ? "+" : "-"}] ${check.name}${suffix}`;
        });
        const eff = evaluation.efficiency;
        return [
            ` [${evaluation.passed ? "PASS" : "FAIL"}] ${result.taskId}`,
            ...checkLines,
            ` tools: ${eff.toolCallCount}, tokens: ${eff.totalTokens}, files: ${eff.filesReadCount}, symbols: ${eff.symbolsQueriedCount}`,
        ];
    });
    return [
        `Benchmark Report: ${report.variant}`,
        `Model: ${report.model}`,
        `Generated: ${report.generatedAt}`,
        "",
        `Results: ${summary.passed}/${summary.totalTasks} passed (${asPercent(summary.passRate)}%)`,
        "",
        "By category:",
        ...categoryLines,
        "",
        "Efficiency (averages):",
        ` Tool calls: ${summary.avgToolCalls.toFixed(1)}`,
        ` Total tokens: ${summary.avgTotalTokens.toFixed(0)}`,
        ` Duration: ${summary.avgDurationMs.toFixed(0)}ms`,
        "",
        "Task details:",
        ...taskLines,
    ].join("\n");
}
|
|
136
|
+
/**
 * Compare two reports side by side.
 *
 * Prints the overall pass rate and average efficiency of both reports with
 * their deltas, followed by the per-task status changes: tasks that are new
 * in the candidate, tasks whose pass/fail status flipped, and tasks that
 * exist only in the baseline.
 *
 * @param {object} baseline - Reference report (`variant`, `model`, `summary`, `results`).
 * @param {object} candidate - Report being compared against the baseline.
 * @returns {string} Newline-joined comparison text.
 */
export function compareReports(baseline, candidate) {
    const lines = [];
    lines.push(`Comparison: "${baseline.variant}" vs "${candidate.variant}"`);
    lines.push(`Models: ${baseline.model} vs ${candidate.model}`);
    lines.push("");
    const bs = baseline.summary;
    const cs = candidate.summary;
    lines.push("Overall:");
    // passRate is a 0..1 fraction; scale the delta to percentage points so it
    // is on the same scale as the two percentages printed next to it.
    lines.push(` Pass rate: ${(bs.passRate * 100).toFixed(1)}% -> ${(cs.passRate * 100).toFixed(1)}% (${formatDelta((cs.passRate - bs.passRate) * 100, true, "pp")})`);
    lines.push(` Avg tool calls: ${bs.avgToolCalls.toFixed(1)} -> ${cs.avgToolCalls.toFixed(1)} (${formatDelta(cs.avgToolCalls - bs.avgToolCalls, false)})`);
    lines.push(` Avg tokens: ${bs.avgTotalTokens.toFixed(0)} -> ${cs.avgTotalTokens.toFixed(0)} (${formatDelta(cs.avgTotalTokens - bs.avgTotalTokens, false)})`);
    lines.push(` Avg duration: ${bs.avgDurationMs.toFixed(0)}ms -> ${cs.avgDurationMs.toFixed(0)}ms (${formatDelta(cs.avgDurationMs - bs.avgDurationMs, false)})`);
    lines.push("");
    // Per-task comparison
    const baseResults = new Map(baseline.results.map((r) => [r.taskId, r]));
    const candidateIds = new Set();
    lines.push("Per-task changes:");
    for (const cr of candidate.results) {
        candidateIds.add(cr.taskId);
        const br = baseResults.get(cr.taskId);
        if (!br) {
            lines.push(` [NEW] ${cr.taskId}: ${cr.evaluation.passed ? "PASS" : "FAIL"}`);
            continue;
        }
        if (br.evaluation.passed !== cr.evaluation.passed) {
            const change = cr.evaluation.passed ? "FIXED" : "REGRESSED";
            lines.push(` [${change}] ${cr.taskId}`);
        }
    }
    // Tasks that disappeared from the candidate would otherwise go unnoticed.
    for (const br of baseline.results) {
        if (!candidateIds.has(br.taskId)) {
            lines.push(` [REMOVED] ${br.taskId}`);
        }
    }
    return lines.join("\n");
}
/**
 * Describe a numeric change as "+1.2 better", "-0.4 worse" or "no change".
 *
 * @param {number} delta - Candidate value minus baseline value.
 * @param {boolean} higherIsBetter - Whether an increase is an improvement.
 * @param {string} [unit=""] - Suffix appended directly after the number.
 * @returns {string} Human-readable delta.
 */
function formatDelta(delta, higherIsBetter, unit = "") {
    const rounded = delta.toFixed(1);
    // Decide "no change" on the displayed precision so the output never
    // reads "+0.0 worse" or "-0.0 better".
    if (Number(rounded) === 0) {
        return "no change";
    }
    const sign = delta >= 0 ? "+" : "";
    const improved = higherIsBetter ? delta > 0 : delta < 0;
    return `${sign}${rounded}${unit} ${improved ? "better" : "worse"}`;
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark runner — executes tasks via the agent and captures traces.
|
|
3
|
+
*
|
|
4
|
+
* Uses CodingAgent directly (not the CLI subprocess) for full control
|
|
5
|
+
* over tool-call instrumentation and trace capture.
|
|
6
|
+
*/
|
|
7
|
+
import { execSync } from "node:child_process";
|
|
8
|
+
import { CodingAgent, Session, ToolRegistry, } from "@minicode/agent-sdk";
|
|
9
|
+
/**
 * Return the commit SHA of HEAD for the current working directory, or
 * "unknown" when it cannot be determined (git missing, not a repository,
 * no commits yet).
 *
 * @returns {string} Trimmed output of `git rev-parse HEAD`, or "unknown".
 */
function getGitCommitSha() {
    try {
        const sha = execSync("git rev-parse HEAD", {
            encoding: "utf8",
            // Without an explicit stdio, the child's stderr is forwarded to
            // this process, so git's "fatal: not a git repository" message
            // would be printed even though the failure is handled here.
            stdio: ["ignore", "pipe", "ignore"],
        }).trim();
        return sha || "unknown";
    }
    catch {
        return "unknown";
    }
}
|
|
17
|
+
// Tool names that the benchmark runner checks when recording symbol queries.
const STRUCTURAL_TOOL_NAMES = ["read_symbol", "find_references", "get_dependencies", "search_code_map"];
const STRUCTURAL_TOOLS = new Set(STRUCTURAL_TOOL_NAMES);
|
|
23
|
+
/**
 * Execute one benchmark task through a fresh agent and return its trace.
 *
 * Every tool in `options.tools` is wrapped so that each completed call is
 * recorded (name, input, output capped at 2000 characters, duration), and so
 * that file paths and symbol queries seen in tool inputs are collected.
 *
 * @param {object} task - Task with `id` and `prompt`.
 * @param {object} options - `tools`, `config`, `modelClient`, `variant` and
 *   an optional `onTaskComplete(taskId, trace)` callback.
 * @returns {Promise<object>} The captured trace.
 */
export async function runBenchmarkTask(task, options) {
    const toolCalls = [];
    const touchedFiles = new Set();
    const queriedSymbols = new Set();
    // Wrap each tool to capture calls
    const instrumentedTools = options.tools.map((tool) => {
        const execute = async (input) => {
            const callStart = performance.now();
            const output = await tool.execute(input);
            const elapsedMs = performance.now() - callStart;
            let storedOutput = output;
            if (output.length > 2000) {
                storedOutput = output.slice(0, 2000) + "…[truncated]";
            }
            toolCalls.push({
                name: tool.name,
                input,
                output: storedOutput,
                durationMs: elapsedMs,
            });
            // Track files read
            const readsFile = tool.name === "read_file" || tool.name === "read_symbol";
            if (readsFile) {
                const filePath = input.path ?? input.file_path ?? input.filePath;
                if (typeof filePath === "string") {
                    touchedFiles.add(filePath);
                }
            }
            // Track symbol queries
            if (STRUCTURAL_TOOLS.has(tool.name)) {
                const symbol = input.symbol ?? input.symbolName ?? input.name ?? input.query;
                if (typeof symbol === "string") {
                    queriedSymbols.add(symbol);
                }
            }
            return output;
        };
        return { ...tool, execute };
    });
    const agent = new CodingAgent({
        config: options.config,
        modelClient: options.modelClient,
        toolRegistry: new ToolRegistry(instrumentedTools),
        session: new Session(),
    });
    const startedAt = new Date().toISOString();
    const turnStart = performance.now();
    const { text, usage } = await agent.runTurn(task.prompt);
    const durationMs = performance.now() - turnStart;
    // Missing usage figures are reported as zero.
    const inputTokens = usage?.inputTokens ?? 0;
    const outputTokens = usage?.outputTokens ?? 0;
    const trace = {
        taskId: task.id,
        model: options.config.model,
        variant: options.variant,
        commitSha: getGitCommitSha(),
        response: text,
        toolCalls,
        filesRead: [...touchedFiles],
        symbolsQueried: [...queriedSymbols],
        usage: {
            inputTokens,
            outputTokens,
            totalTokens: inputTokens + outputTokens,
        },
        durationMs,
        startedAt,
    };
    options.onTaskComplete?.(task.id, trace);
    return trace;
}
|
|
90
|
+
/**
 * Run the given tasks one after another and collect their traces.
 *
 * Each task is awaited before the next one starts; a rejection from any task
 * aborts the suite and propagates to the caller.
 *
 * @param {Iterable<object>} tasks - Tasks to run, in order.
 * @param {object} options - Passed unchanged to `runBenchmarkTask`.
 * @returns {Promise<object[]>} Traces in the same order as `tasks`.
 */
export async function runBenchmarkSuite(tasks, options) {
    const collected = [];
    for (const current of tasks) {
        collected.push(await runBenchmarkTask(current, options));
    }
    return collected;
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Loads benchmark tasks from the benchmarks/tasks/ directory.
|
|
3
|
+
*
|
|
4
|
+
* Each task lives in a category subdirectory and contains:
|
|
5
|
+
* - task.json — task metadata, prompt, and rubric
|
|
6
|
+
*/
|
|
7
|
+
import { readFile, readdir, stat } from "node:fs/promises";
|
|
8
|
+
import path from "node:path";
|
|
9
|
+
// Category directory names that are recognised as benchmark categories.
const VALID_CATEGORIES = new Set(["navigation", "editing", "refactors", "debugging", "planning"]);
/**
 * Report whether `name` is one of the recognised benchmark categories.
 */
function isValidCategory(name) {
    const recognised = VALID_CATEGORIES.has(name);
    return recognised;
}
|
|
19
|
+
/**
 * Discover and load every benchmark task under `baseDir`.
 *
 * Layout expected: `<baseDir>/<category>/<task-name>/task.json`. Entries that
 * are not directories, directories that are not a recognised category, and
 * task directories without a `task.json` are skipped.
 *
 * @param {string} baseDir - Root directory containing the category folders.
 * @returns {Promise<object[]>} Tasks sorted by id (`<category>/<task-name>`).
 */
export async function loadBenchmarkTasks(baseDir) {
    const loaded = [];
    const categoryEntries = await readdir(baseDir, { withFileTypes: true });
    for (const categoryEntry of categoryEntries) {
        if (!(categoryEntry.isDirectory() && isValidCategory(categoryEntry.name))) {
            continue;
        }
        const category = categoryEntry.name;
        const categoryDir = path.join(baseDir, category);
        const taskEntries = await readdir(categoryDir, { withFileTypes: true });
        for (const taskEntry of taskEntries) {
            if (!taskEntry.isDirectory()) {
                continue;
            }
            const taskJsonPath = path.join(categoryDir, taskEntry.name, "task.json");
            // Any stat failure is treated as "no task.json here".
            let present = true;
            try {
                await stat(taskJsonPath);
            }
            catch {
                present = false;
            }
            if (!present) {
                continue;
            }
            const spec = JSON.parse(await readFile(taskJsonPath, "utf8"));
            loaded.push({
                id: `${category}/${taskEntry.name}`,
                title: spec.title,
                category,
                prompt: spec.prompt,
                workspaceRoot: spec.workspaceRoot,
                rubric: spec.rubric,
            });
        }
    }
    return loaded.sort((left, right) => left.id.localeCompare(right.id));
}
|
|
55
|
+
/**
 * Load a single benchmark task by its id (e.g. "navigation/find-symbol-definition").
 *
 * The id is validated before any file is touched: its first segment must be
 * a recognised category, and it may not contain ".." segments, which could
 * otherwise point outside `baseDir` or at a task in a different category
 * than the one the id claims.
 *
 * @param {string} baseDir - Root directory containing the category folders.
 * @param {string} taskId - "<category>/<task-name>".
 * @returns {Promise<object|undefined>} The task, or undefined when the id is
 *   invalid or no `task.json` exists for it.
 * @throws {SyntaxError} If the task's `task.json` is not valid JSON.
 */
export async function loadBenchmarkTask(baseDir, taskId) {
    const segments = taskId.split("/");
    // Reject traversal up front so the id always maps to its own directory.
    if (segments.includes("..")) {
        return undefined;
    }
    const [category] = segments;
    if (!category || !isValidCategory(category)) {
        return undefined;
    }
    const taskJsonPath = path.join(baseDir, taskId, "task.json");
    const exists = await stat(taskJsonPath)
        .then(() => true)
        .catch(() => false);
    if (!exists) {
        return undefined;
    }
    const raw = await readFile(taskJsonPath, "utf8");
    const parsed = JSON.parse(raw);
    return {
        id: taskId,
        title: parsed.title,
        category,
        prompt: parsed.prompt,
        workspaceRoot: parsed.workspaceRoot,
        rubric: parsed.rubric,
    };
}
|
package/dist/src/cli/args.js
CHANGED
|
@@ -11,6 +11,7 @@ export function parseCliArgs(argv) {
|
|
|
11
11
|
let json = false;
|
|
12
12
|
let outFile;
|
|
13
13
|
let serve = false;
|
|
14
|
+
let pluginInstall = false;
|
|
14
15
|
let port = 4567;
|
|
15
16
|
const taskParts = [];
|
|
16
17
|
for (let i = 0; i < args.length; i += 1) {
|
|
@@ -22,6 +23,11 @@ export function parseCliArgs(argv) {
|
|
|
22
23
|
serve = true;
|
|
23
24
|
continue;
|
|
24
25
|
}
|
|
26
|
+
if (arg === "plugin" && args[i + 1] === "install") {
|
|
27
|
+
pluginInstall = true;
|
|
28
|
+
i += 1;
|
|
29
|
+
continue;
|
|
30
|
+
}
|
|
25
31
|
if (arg === "--port") {
|
|
26
32
|
const value = args[i + 1];
|
|
27
33
|
if (!value || value.startsWith("-")) {
|
|
@@ -81,6 +87,7 @@ export function parseCliArgs(argv) {
|
|
|
81
87
|
serve,
|
|
82
88
|
port,
|
|
83
89
|
task: taskParts.join(" ").trim(),
|
|
90
|
+
pluginInstall,
|
|
84
91
|
};
|
|
85
92
|
}
|
|
86
93
|
export function validateCliArgs(args) {
|
|
@@ -93,4 +100,7 @@ export function validateCliArgs(args) {
|
|
|
93
100
|
if (args.serve && (args.oneshot || args.json || args.outFile)) {
|
|
94
101
|
throw new CliUsageError("serve mode is mutually exclusive with --oneshot, --json, and --out.");
|
|
95
102
|
}
|
|
103
|
+
if (args.pluginInstall && (args.serve || args.oneshot)) {
|
|
104
|
+
throw new CliUsageError("plugin install is mutually exclusive with serve and --oneshot.");
|
|
105
|
+
}
|
|
96
106
|
}
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import { formatConfigForDisplay, MINICODE_HOME, resolveConfigEnv } from "../agent/config.js";
|
|
2
|
+
import { formatPersistedConfigValue, getEditableConfigDefinition, getEffectiveEditableConfigValue, isEditableConfigKey, listEditableConfigDefinitions, loadPersistedConfig, setPersistedConfigValue, unsetPersistedConfigValue, } from "../agent/editable-config.js";
|
|
3
|
+
/**
 * Build the usage text listing every supported form of the /config command.
 */
function renderUsage() {
    const forms = ["", " keys", " get <key>", " set <key> <value>", " unset <key>"];
    const usageLines = forms.map((form) => ` /config${form}`);
    return ['Usage:', ...usageLines].join("\n");
}
|
|
13
|
+
/**
 * Build the help text that lists every editable config key with its value
 * hint, description and the environment variable that overrides it.
 */
function renderEditableKeys() {
    const describe = (definition) => {
        let valueHint;
        if (definition.type === "enum") {
            valueHint = `<${definition.values?.join("|")}>`;
        }
        else {
            valueHint = `<${definition.type}>`;
        }
        return ` ${definition.key} ${valueHint} — ${definition.description} (env: ${definition.envVar})`;
    };
    const output = [
        "Editable config keys (persisted in ~/.minicode/agent.config.json; environment variables take precedence):",
    ];
    for (const definition of listEditableConfigDefinitions()) {
        output.push(describe(definition));
    }
    output.push("", 'Use "/config set <key> <value>" to update your global config.', "Secrets like API keys stay env-only for now.");
    return output.join("\n");
}
|
|
28
|
+
/**
 * Describe one editable config key: its effective value, the value stored in
 * the config file, and the value of its overriding environment variable.
 *
 * @param {string} key - Config key requested by the user.
 * @param {object} context - Provides `config` and optionally `minicodeHome`.
 * @returns {Promise<string>} Display text, or an "unknown key" message
 *   followed by the list of editable keys.
 */
async function renderConfigValue(key, context) {
    const known = isEditableConfigKey(key);
    if (!known) {
        return `Unknown editable config key "${key}".\n\n${renderEditableKeys()}`;
    }
    const minicodeHome = context.minicodeHome ?? MINICODE_HOME;
    const definition = getEditableConfigDefinition(key);
    const persisted = await loadPersistedConfig(minicodeHome);
    const env = await resolveConfigEnv({ minicodeHome });
    const effective = getEffectiveEditableConfigValue(context.config, key);
    const fromFile = formatPersistedConfigValue(persisted[definition.fileKey]);
    const fromEnv = formatPersistedConfigValue(env.values[definition.envVar]);
    const report = [];
    report.push(`${definition.key}`);
    report.push(` effective: ${effective}`);
    report.push(` config file: ${fromFile}`);
    report.push(` env override (${definition.envVar}): ${fromEnv}`);
    return report.join("\n");
}
|
|
44
|
+
/**
 * Store a value for an editable config key in the persisted config file.
 *
 * @param {string} key - Config key to set.
 * @param {string} rawValue - Value exactly as typed by the user.
 * @param {object} context - Optionally provides `minicodeHome`.
 * @returns {Promise<string>} Confirmation text, an "unknown key" message, or
 *   a "Failed to save config" message when saving throws.
 */
async function persistConfigValue(key, rawValue, context) {
    if (!isEditableConfigKey(key)) {
        return `Unknown editable config key "${key}".\n\n${renderEditableKeys()}`;
    }
    const minicodeHome = context.minicodeHome ?? MINICODE_HOME;
    const { envVar } = getEditableConfigDefinition(key);
    const env = await resolveConfigEnv({ minicodeHome });
    let saved;
    try {
        saved = await setPersistedConfigValue({ key, rawValue, minicodeHome });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : "Unknown error";
        return `Failed to save config: ${reason}`;
    }
    const output = [
        `Saved config: ${key} = ${formatPersistedConfigValue(saved.storedValue)}`,
        `File: ${saved.path}`,
        "Restart minicode to pick up persisted config changes in a new session.",
    ];
    // Warn when an environment variable will mask the value just saved.
    const overriddenByEnv = env.values[envVar] !== undefined;
    if (overriddenByEnv) {
        output.push(`Note: ${envVar} is currently set and will override this persisted value until it is unset.`);
    }
    return output.join("\n");
}
|
|
72
|
+
/**
 * Remove the persisted value of an editable config key.
 *
 * A failure while removing the value is reported as a message instead of
 * being thrown, matching how `persistConfigValue` reports save failures.
 *
 * @param {string} key - Config key to unset.
 * @param {object} context - Optionally provides `minicodeHome`.
 * @returns {Promise<string>} Confirmation text, an "unknown key" message, or
 *   a "Failed to remove config" message when the removal throws.
 */
async function removeConfigValue(key, context) {
    if (!isEditableConfigKey(key)) {
        return `Unknown editable config key "${key}".\n\n${renderEditableKeys()}`;
    }
    const minicodeHome = context.minicodeHome ?? MINICODE_HOME;
    const definition = getEditableConfigDefinition(key);
    const env = await resolveConfigEnv({ minicodeHome });
    try {
        await unsetPersistedConfigValue({
            key,
            minicodeHome,
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : "Unknown error";
        return `Failed to remove config: ${message}`;
    }
    const lines = [
        `Removed persisted value for "${key}".`,
        `File: ${minicodeHome}/agent.config.json`,
        "Restart minicode to ensure the updated config is applied in a new session.",
    ];
    // The environment variable still wins over the (now absent) file value.
    if (env.values[definition.envVar] !== undefined) {
        lines.push(`Note: ${definition.envVar} is still set in the environment, so the effective value may remain unchanged.`);
    }
    return lines.join("\n");
}
|
|
93
|
+
/**
 * Dispatch a "/config" slash command.
 *
 * Input that is neither exactly "/config" nor starts with "/config " is left
 * for other handlers. A bare "/config" shows the current configuration;
 * "keys", "get", "set" and "unset" are routed to their handlers, and anything
 * else (including a wrong argument count) yields the usage text.
 *
 * @param {string} trimmed - The user's input, already trimmed.
 * @param {object} context - Provides `config` and optionally `minicodeHome`.
 * @returns {Promise<{handled: boolean, message?: string}>}
 */
export async function handleConfigSlashCommand(trimmed, context) {
    const isConfigCommand = trimmed === "/config" || trimmed.startsWith("/config ");
    if (!isConfigCommand) {
        return { handled: false };
    }
    const reply = (message) => ({ handled: true, message });
    const rest = trimmed.slice("/config".length).trim();
    if (rest.length === 0) {
        return reply(formatConfigForDisplay(context.config));
    }
    const [subcommand, ...subArgs] = rest.split(/\s+/);
    switch (subcommand) {
        case "keys":
            return reply(renderEditableKeys());
        case "get":
            if (subArgs.length !== 1) {
                return reply(renderUsage());
            }
            return reply(await renderConfigValue(subArgs[0], context));
        case "set": {
            if (subArgs.length < 2) {
                return reply(renderUsage());
            }
            // Everything after the key is the value, re-joined with single spaces.
            const [key, ...valueParts] = subArgs;
            return reply(await persistConfigValue(key, valueParts.join(" "), context));
        }
        case "unset":
            if (subArgs.length !== 1) {
                return reply(renderUsage());
            }
            return reply(await removeConfigValue(subArgs[0], context));
        default:
            return reply(renderUsage());
    }
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Install the minicode Claude Code plugin globally.
|
|
3
|
+
*
|
|
4
|
+
* Creates a symlink from ~/.claude/plugins/minicode → the plugin directory
|
|
5
|
+
* shipped alongside the minicode package.
|
|
6
|
+
*/
|
|
7
|
+
import { mkdir, symlink, readlink, unlink, stat } from "node:fs/promises";
|
|
8
|
+
import path from "node:path";
|
|
9
|
+
import os from "node:os";
|
|
10
|
+
import { fileURLToPath } from "node:url";
|
|
11
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
/**
 * Locate the `plugin/` directory at the project root.
 *
 * When this module's directory path contains a `dist` segment the project
 * root is three levels up (dist/src/cli); otherwise it is two levels up
 * (src/cli).
 */
function getPluginSourceDir() {
    const distSegment = `${path.sep}dist${path.sep}`;
    const pathToPlugin = __dirname.includes(distSegment) ? "../../../plugin" : "../../plugin";
    return path.resolve(__dirname, pathToPlugin);
}
|
|
23
|
+
/**
 * Install the minicode plugin for Claude Code by symlinking
 * ~/.claude/plugins/minicode to the plugin directory of this installation.
 *
 * - Exits with status 1 when the plugin source cannot be found.
 * - Returns early when the symlink already points at this installation.
 * - Replaces a symlink that points somewhere else.
 * - Exits with status 1 when the target exists but is not a symlink (file or
 *   directory), so nothing the user placed there is overwritten.
 *
 * @returns {Promise<void>}
 * @throws Propagates filesystem errors from mkdir, unlink and symlink.
 */
export async function installPlugin() {
    const pluginsDir = path.join(os.homedir(), ".claude", "plugins");
    const targetDir = path.join(pluginsDir, "minicode");
    const sourceDir = getPluginSourceDir();
    // Verify the plugin source exists
    try {
        await stat(path.join(sourceDir, ".claude-plugin", "plugin.json"));
    }
    catch {
        console.error(`Error: plugin source not found at ${sourceDir}`);
        console.error("Make sure you are running from a minicode installation.");
        process.exit(1);
    }
    // Create ~/.claude/plugins/ if it doesn't exist
    await mkdir(pluginsDir, { recursive: true });
    // Check if target already exists. readlink only succeeds for symlinks.
    let existing;
    try {
        existing = await readlink(targetDir);
    }
    catch (error) {
        if (error?.code !== "ENOENT") {
            // Something other than a symlink may be in the way; refuse to
            // touch it whether it is a directory or a regular file.
            const occupied = await stat(targetDir).then(() => true, () => false);
            if (occupied) {
                console.error(`Error: ${targetDir} exists and is not a symlink.`);
                console.error("Remove it manually if you want to reinstall.");
                process.exit(1);
            }
        }
        // Doesn't exist — good
    }
    if (existing !== undefined) {
        // A relative link target is resolved against the link's directory.
        const alreadyLinked = existing === sourceDir || path.resolve(pluginsDir, existing) === sourceDir;
        if (alreadyLinked) {
            console.log(`Plugin already installed at ${targetDir}`);
            console.log(` → ${sourceDir}`);
            return;
        }
        // Different target — remove and re-link. A failure here is surfaced
        // rather than swallowed, since symlink() below could not succeed.
        await unlink(targetDir);
    }
    await symlink(sourceDir, targetDir, "dir");
    console.log("minicode plugin installed for Claude Code");
    console.log(` ${targetDir} → ${sourceDir}`);
    console.log("\nThe plugin will load automatically in Claude Code sessions.");
    console.log("Make sure minicode serve is running for the MCP tools to work:");
    console.log(" minicode serve");
}
|