nodebench-mcp 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +253 -20
- package/STYLE_GUIDE.md +477 -0
- package/dist/__tests__/evalDatasetBench.test.d.ts +1 -0
- package/dist/__tests__/evalDatasetBench.test.js +738 -0
- package/dist/__tests__/evalDatasetBench.test.js.map +1 -0
- package/dist/__tests__/evalHarness.test.d.ts +1 -0
- package/dist/__tests__/evalHarness.test.js +830 -0
- package/dist/__tests__/evalHarness.test.js.map +1 -0
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +264 -0
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +10 -0
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +135 -0
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +1 -0
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +14 -0
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +189 -0
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +1 -0
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +16 -0
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +154 -0
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +1 -0
- package/dist/__tests__/fixtures/swebench_verified.sample.json +162 -0
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +109 -0
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +7 -0
- package/dist/__tests__/openDatasetParallelEval.test.js +209 -0
- package/dist/__tests__/openDatasetParallelEval.test.js.map +1 -0
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +7 -0
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +220 -0
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +1 -0
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +7 -0
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +218 -0
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +1 -0
- package/dist/__tests__/tools.test.js +252 -3
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/db.js +20 -0
- package/dist/db.js.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/tools/agentBootstrapTools.d.ts +5 -1
- package/dist/tools/agentBootstrapTools.js +566 -1
- package/dist/tools/agentBootstrapTools.js.map +1 -1
- package/dist/tools/documentationTools.js +102 -8
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/learningTools.js +6 -2
- package/dist/tools/learningTools.js.map +1 -1
- package/dist/tools/metaTools.js +112 -1
- package/dist/tools/metaTools.js.map +1 -1
- package/dist/tools/selfEvalTools.d.ts +12 -0
- package/dist/tools/selfEvalTools.js +568 -0
- package/dist/tools/selfEvalTools.js.map +1 -0
- package/package.json +11 -3
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Open-source dataset benchmark for long-running tasks (ToolBench lane).
|
|
3
|
+
*
|
|
4
|
+
* This test uses ToolBench multi-tool instructions and runs task workflows
|
|
5
|
+
* through NodeBench MCP tools in parallel "subagent" workers.
|
|
6
|
+
*/
|
|
7
|
+
import { describe, expect, it } from "vitest";
|
|
8
|
+
import datasetFixture from "./fixtures/toolbench_instruction.sample.json";
|
|
9
|
+
import { verificationTools } from "../tools/verificationTools.js";
|
|
10
|
+
import { reconTools } from "../tools/reconTools.js";
|
|
11
|
+
import { evalTools } from "../tools/evalTools.js";
|
|
12
|
+
import { qualityGateTools } from "../tools/qualityGateTools.js";
|
|
13
|
+
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
14
|
+
import { learningTools } from "../tools/learningTools.js";
|
|
15
|
+
import { documentationTools } from "../tools/documentationTools.js";
|
|
16
|
+
import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
|
|
17
|
+
import { createMetaTools } from "../tools/metaTools.js";
|
|
18
|
+
// The ToolBench fixture JSON imported at the top of this file, under a local name.
const fixture = datasetFixture;
// Tool lists from each imported tool module, concatenated in this order.
const domainTools = [
    ...verificationTools,
    ...evalTools,
    ...qualityGateTools,
    ...learningTools,
    ...flywheelTools,
    ...reconTools,
    ...documentationTools,
    ...agentBootstrapTools,
];
// Domain tools followed by whatever createMetaTools builds from them.
const allTools = [...domainTools, ...createMetaTools(domainTools)];
// Module-level log of tool invocations; callTool appends one entry per call
// (successful or failed) and the scenario test reads it afterwards.
const openDatasetToolCallLog = [];
|
|
31
|
+
/**
 * Look up a registered tool by its exact name.
 *
 * @param {string} name - Tool name to search for in `allTools`.
 * @returns {object} The first entry of `allTools` whose `name` matches.
 * @throws {Error} When no entry carries that name.
 */
function findTool(name) {
    for (const entry of allTools) {
        if (entry.name === name) {
            return entry;
        }
    }
    throw new Error(`Tool not found: ${name}`);
}
|
|
37
|
+
/**
 * Invoke a tool's handler and record the outcome in `openDatasetToolCallLog`.
 *
 * @param {string} name - Name of the tool to invoke (resolved via `findTool`).
 * @param {object} args - Arguments passed straight to the tool handler.
 * @param {string} taskId - Identifier of the dataset task making the call.
 * @param {string} stage - Label for the workflow stage, stored in the log entry.
 * @returns {Promise<*>} Whatever the tool handler resolves to.
 * @throws Re-throws any error from the handler after logging the failure;
 *   a lookup failure from `findTool` propagates without a log entry.
 */
async function callTool(name, args, taskId, stage) {
    const target = findTool(name);
    // One log entry per invocation, differing only in the success flag.
    const record = (success) => {
        openDatasetToolCallLog.push({ taskId, tool: name, stage, success });
    };
    let output;
    try {
        output = await target.handler(args);
    }
    catch (failure) {
        record(false);
        throw failure;
    }
    record(true);
    return output;
}
|
|
49
|
+
/**
 * Drive one ToolBench task through the benchmark workflow, in this order:
 * recon, tool discovery, methodology lookup, eval bookkeeping, closed loop,
 * mandatory flywheel, and a final knowledge search. Every tool call goes
 * through `callTool`, tagged with the task id and a stage label.
 *
 * @param {object} task - Dataset task; this function reads `id`, `group`,
 *   `prompt`, `apiCount`, `complexityScore` and the telemetry count fields.
 * @param {number} workerIndex - Index of the worker running the task.
 * @returns {Promise<object>} Summary with `taskId`, `workerIndex`, `ok: true`,
 *   `elapsedMs`, `discoveredTools` and `knowledgeHits`.
 * @throws Propagates any tool-call error or failed `expect` assertion.
 */
async function runDatasetTask(task, workerIndex) {
    const startedAt = Date.now();
    const { id: taskId, apiCount, group } = task;
    // Every call in this workflow is tagged with the same task id.
    const invoke = (toolName, payload, stage) => callTool(toolName, payload, taskId, stage);
    const { sourceUrls } = fixture;
    // Fall back to the G1 URL, then to an empty string, when the group has none.
    const sourceUrl = sourceUrls[group] ?? sourceUrls.G1 ?? "";

    // Recon: open a session, then log the dataset finding against it.
    const reconSession = await invoke("run_recon", {
        target: `ToolBench multi-tool task ${taskId}`,
        description: `Open-source long-running benchmark (${apiCount} APIs in task context).`,
        projectContext: {
            techStack: "TypeScript, MCP, SQLite",
            architecture: "MCP orchestration benchmark with parallel subagent workers",
        },
    }, "recon_start");
    await invoke("log_recon_finding", {
        sessionId: reconSession.sessionId,
        category: "dataset",
        summary: `Ingested ToolBench task ${taskId} with ${apiCount} APIs and complexity ${task.complexityScore}.`,
        sourceUrl,
        relevance: "Long-running multi-tool benchmark for MCP orchestration quality.",
        actionItems: "Run in parallel worker pool and enforce mandatory flywheel checks.",
    }, "recon_log");

    // Discovery: query with the (truncated) prompt; retry with a generic query
    // when the first attempt yields no tool list.
    let discovery = await invoke("findTools", {
        query: task.prompt.slice(0, 600),
        category: "verification",
    }, "find_tools");
    const foundTools = Array.isArray(discovery?.tools) && discovery.tools.length !== 0;
    if (!foundTools) {
        discovery = await invoke("findTools", {
            query: "multi-step tool orchestration verification",
            category: "bootstrap",
        }, "find_tools_fallback");
    }
    expect(Array.isArray(discovery.tools)).toBe(true);
    expect(discovery.tools.length).toBeGreaterThan(0);

    // Methodology: tasks with nine or more APIs use the flywheel topic.
    let topic = "closed_loop";
    if (apiCount >= 9) {
        topic = "mandatory_flywheel";
    }
    const methodology = await invoke("getMethodology", { topic }, "get_methodology");
    expect(methodology.title).toBeTruthy();
    expect(Array.isArray(methodology.steps)).toBe(true);
    expect(methodology.steps.length).toBeGreaterThan(0);

    // Eval bookkeeping: start a single-case run, record a pass, complete it.
    const evalCase = {
        input: task.prompt,
        intent: `Coordinate long-running ToolBench workflow for ${taskId}`,
        expected: "Discover strategy, run eval bookkeeping, and complete closed-loop plus mandatory flywheel checks.",
    };
    const evalRun = await invoke("start_eval_run", {
        name: `open-dataset-toolbench-${taskId}-${Date.now()}`,
        description: `ToolBench multi-tool scenario (${apiCount} APIs, worker ${workerIndex})`,
        cases: [evalCase],
    }, "start_eval_run");
    const telemetry = {
        dataset: fixture.dataset,
        split: fixture.split,
        taskId,
        workerIndex,
        group,
        apiCount,
        relevantApiCount: task.relevantApiCount,
        requiredParameterCount: task.requiredParameterCount,
        optionalParameterCount: task.optionalParameterCount,
        apiCategories: task.apiCategories,
    };
    await invoke("record_eval_result", {
        caseId: evalRun.caseIds[0],
        verdict: "pass",
        score: 1,
        actual: `Discovered ${discovery.tools.length} tools and completed ToolBench workflow.`,
        telemetry,
    }, "record_eval_result");
    const evalSummary = await invoke("complete_eval_run", { runId: evalRun.runId }, "complete_eval_run");
    expect(evalSummary.status).toBe("completed");
    expect(evalSummary.summary.passed).toBe(1);

    // Closed loop: three passing steps whose output names the task.
    const closedLoopSteps = [
        ["compile", "Compile checks"],
        ["lint", "Lint checks"],
        ["test", "Parallel benchmark checks"],
    ].map(([step, label]) => ({ step, passed: true, output: `${label} for ${taskId}` }));
    const closedLoop = await invoke("run_closed_loop", { steps: closedLoopSteps }, "run_closed_loop");
    expect(closedLoop.allPassed).toBe(true);

    // Mandatory flywheel: six passing steps with fixed outputs.
    const flywheelSteps = [
        ["static_analysis", "Types and schemas validated."],
        ["happy_path_test", "Dataset task completed end-to-end."],
        ["failure_path_test", "Fallback discovery query validated."],
        ["gap_analysis", "No blocking gaps for this task."],
        ["fix_and_reverify", "No rework required after checks."],
        ["deploy_and_document", "Benchmark result documented."],
    ].map(([stepName, output]) => ({ stepName, passed: true, output }));
    const flywheel = await invoke("run_mandatory_flywheel", {
        target: `Open-source ToolBench task ${taskId}`,
        steps: flywheelSteps,
    }, "run_mandatory_flywheel");
    expect(flywheel.passed).toBe(true);

    // Knowledge search keyed on the task id; at least one hit is required.
    const knowledge = await invoke("search_all_knowledge", { query: taskId, limit: 10 }, "search_all_knowledge");
    expect(typeof knowledge.totalResults).toBe("number");
    expect(knowledge.totalResults).toBeGreaterThan(0);

    return {
        taskId,
        workerIndex,
        ok: true,
        elapsedMs: Date.now() - startedAt,
        discoveredTools: discovery.tools.length,
        knowledgeHits: knowledge.totalResults,
    };
}
|
|
147
|
+
/**
 * Run every task through `runDatasetTask` using a bounded pool of workers.
 *
 * Workers pull the next unclaimed index from a shared counter, so results are
 * stored at the same index as their task regardless of completion order.
 * A task that throws is recorded as a failed result rather than aborting the
 * pool.
 *
 * @param {Array<object>} tasks - Tasks to execute; each needs an `id`.
 * @param {number} concurrency - Requested worker count. Clamped to
 *   `[1, tasks.length]`; a value that is not a number falls back to 1.
 * @returns {Promise<Array<object>>} One result per task, in task order.
 *   Failed entries carry `ok: false`, the error message, and the time spent
 *   on the task before it failed.
 */
async function runWorkerPool(tasks, concurrency) {
    // Math.min/Math.max propagate NaN, and Array.from({ length: NaN }) creates
    // zero workers, which would silently skip every task. Fall back to 1.
    const requested = Number(concurrency);
    const safeConcurrency = Number.isNaN(requested) ? 1 : requested;
    const boundedConcurrency = Math.max(1, Math.min(safeConcurrency, tasks.length));
    const results = new Array(tasks.length);
    let nextIndex = 0;
    const runWorker = async (workerIndex) => {
        while (true) {
            // Claiming an index is synchronous, so no two workers share a task.
            const taskIndex = nextIndex++;
            if (taskIndex >= tasks.length) {
                return;
            }
            const task = tasks[taskIndex];
            const taskStarted = Date.now();
            try {
                results[taskIndex] = await runDatasetTask(task, workerIndex);
            }
            catch (error) {
                results[taskIndex] = {
                    taskId: task.id,
                    workerIndex,
                    ok: false,
                    // Report the real time spent so slow failures stay visible.
                    elapsedMs: Date.now() - taskStarted,
                    discoveredTools: 0,
                    knowledgeHits: 0,
                    error: error instanceof Error ? error.message : String(error),
                };
            }
        }
    };
    const workers = Array.from({ length: boundedConcurrency }, (_, workerIndex) => runWorker(workerIndex));
    await Promise.all(workers);
    return results;
}
|
|
176
|
+
// Scenario: run a slice of the ToolBench fixture through the worker pool and
// verify that every task passed and every required tool was exercised.
describe("Scenario: Open-Source Long-Running Dataset (ToolBench Parallel Subagents)", () => {
    it("should execute ToolBench tasks with parallel MCP subagent workflows", async () => {
        expect(Array.isArray(fixture.tasks)).toBe(true);
        expect(fixture.tasks.length).toBeGreaterThan(0);

        // Parse an integer override; use the fallback when the variable is
        // unset or does not parse to a finite number.
        const readIntEnv = (rawValue, fallback) => {
            const parsed = Number.parseInt(rawValue ?? String(fallback), 10);
            return Number.isFinite(parsed) ? parsed : fallback;
        };
        // Task limit is clamped to [1, fixture size]; concurrency to [1, taskLimit].
        const taskLimit = Math.max(1, Math.min(fixture.tasks.length, readIntEnv(process.env.NODEBENCH_TOOLBENCH_TASK_LIMIT, 8)));
        const concurrency = Math.max(1, Math.min(taskLimit, readIntEnv(process.env.NODEBENCH_TOOLBENCH_CONCURRENCY, 4)));

        const selectedTasks = fixture.tasks.slice(0, taskLimit);
        const startedAt = Date.now();
        const results = await runWorkerPool(selectedTasks, concurrency);
        const elapsedMs = Date.now() - startedAt;

        const passed = results.filter(({ ok }) => ok);
        const failed = results.filter(({ ok }) => !ok);

        // Names of every tool that appears in the call log.
        const calledTools = new Set();
        for (const { tool } of openDatasetToolCallLog) {
            calledTools.add(tool);
        }
        const requiredTools = [
            "run_recon",
            "log_recon_finding",
            "findTools",
            "getMethodology",
            "start_eval_run",
            "record_eval_result",
            "complete_eval_run",
            "run_closed_loop",
            "run_mandatory_flywheel",
            "search_all_knowledge",
        ];

        console.log(`[open-dataset-toolbench] dataset=${fixture.dataset} split=${fixture.split} tasks=${taskLimit} concurrency=${concurrency} pass=${passed.length}/${results.length} elapsedMs=${elapsedMs} toolCalls=${openDatasetToolCallLog.length}`);
        if (failed.length > 0) {
            // Print the failures before the assertions below abort the test.
            const failureSummaries = failed.map(({ taskId, workerIndex, error }) => ({ taskId, workerIndex, error }));
            console.error("[open-dataset-toolbench] failures:", failureSummaries);
        }

        expect(failed.length).toBe(0);
        expect(passed.length).toBe(taskLimit);
        requiredTools.forEach((toolName) => {
            expect(calledTools.has(toolName)).toBe(true);
        });
    });
});
|
|
218
|
+
//# sourceMappingURL=openDatasetParallelEvalToolbench.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"openDatasetParallelEvalToolbench.test.js","sourceRoot":"","sources":["../../src/__tests__/openDatasetParallelEvalToolbench.test.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,cAAc,MAAM,8CAA8C,CAAC;AAC1E,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAyCxD,MAAM,OAAO,GAAG,cAAgC,CAAC;AAEjD,MAAM,WAAW,GAAc;IAC7B,GAAG,iBAAiB;IACpB,GAAG,SAAS;IACZ,GAAG,gBAAgB;IACnB,GAAG,aAAa;IAChB,GAAG,aAAa;IAChB,GAAG,UAAU;IACb,GAAG,kBAAkB;IACrB,GAAG,mBAAmB;CACvB,CAAC;AACF,MAAM,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,eAAe,CAAC,WAAW,CAAC,CAAC,CAAC;AAEnE,MAAM,sBAAsB,GAKvB,EAAE,CAAC;AAER,SAAS,QAAQ,CAAC,IAAY;IAC5B,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;IACnE,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC;IACtD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,QAAQ,CACrB,IAAY,EACZ,IAA6B,EAC7B,MAAc,EACd,KAAa;IAEb,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC5B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACxC,sBAAsB,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1E,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,sBAAsB,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3E,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,IAAiB,EAAE,WAAmB;IAClE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC3B,MAAM,aAAa,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,EAAE,IAAI,EAAE,CAAC;IAEpF,MAAM,KAAK,GAAG,CAAC,MAAM,QAAQ,CAC3B,WAAW,EACX;QACE,MAAM,EAAE,6BAA6B,IAAI,CAAC,EAAE,EAAE;QAC9C,WA
AW,EAAE,uCAAuC,IAAI,CAAC,QAAQ,yBAAyB;QAC1F,cAAc,EAAE;YACd,SAAS,EAAE,yBAAyB;YACpC,YAAY,EAAE,4DAA4D;SAC3E;KACF,EACD,IAAI,CAAC,EAAE,EACP,aAAa,CACd,CAAQ,CAAC;IAEV,MAAM,QAAQ,CACZ,mBAAmB,EACnB;QACE,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,QAAQ,EAAE,SAAS;QACnB,OAAO,EAAE,2BAA2B,IAAI,CAAC,EAAE,SAAS,IAAI,CAAC,QAAQ,wBAAwB,IAAI,CAAC,eAAe,GAAG;QAChH,SAAS,EAAE,aAAa;QACxB,SAAS,EAAE,kEAAkE;QAC7E,WAAW,EAAE,oEAAoE;KAClF,EACD,IAAI,CAAC,EAAE,EACP,WAAW,CACZ,CAAC;IAEF,IAAI,UAAU,GAAG,CAAC,MAAM,QAAQ,CAC9B,WAAW,EACX;QACE,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;QAChC,QAAQ,EAAE,cAAc;KACzB,EACD,IAAI,CAAC,EAAE,EACP,YAAY,CACb,CAAQ,CAAC;IAEV,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAAE,KAAK,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvE,UAAU,GAAG,CAAC,MAAM,QAAQ,CAC1B,WAAW,EACX,EAAE,KAAK,EAAE,4CAA4C,EAAE,QAAQ,EAAE,WAAW,EAAE,EAC9E,IAAI,CAAC,EAAE,EACP,qBAAqB,CACtB,CAAQ,CAAC;IACZ,CAAC;IACD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACnD,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAEnD,MAAM,gBAAgB,GAAG,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,aAAa,CAAC;IACnF,MAAM,WAAW,GAAG,CAAC,MAAM,QAAQ,CACjC,gBAAgB,EAChB,EAAE,KAAK,EAAE,gBAAgB,EAAE,EAC3B,IAAI,CAAC,EAAE,EACP,iBAAiB,CAClB,CAAQ,CAAC;IACV,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;IACvC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpD,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,CAAC,MAAM,QAAQ,CAC7B,gBAAgB,EAChB;QACE,IAAI,EAAE,0BAA0B,IAAI,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE;QACvD,WAAW,EAAE,kCAAkC,IAAI,CAAC,QAAQ,iBAAiB,WAAW,GAAG;QAC3F,KAAK,EAAE;YACL;gBACE,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,MAAM,EAAE,kDAAkD,IAAI,CAAC,EAAE,EAAE;gBACnE,QAAQ,EACN,mGAAmG;aACtG;SACF;KACF,EACD,IAAI,CAAC,EAAE,EACP,gBAAgB,CACjB,CAAQ,CAAC;IAEV,MAAM,QAAQ,CACZ,oBAAoB,EACpB;QACE,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;QAC1B,OAAO,EAAE,MAAM;QACf,KAAK,EAAE,CAAC;QACR,MAAM,EAAE,cAAc,UAAU
,CAAC,KAAK,CAAC,MAAM,0CAA0C;QACvF,SAAS,EAAE;YACT,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,WAAW;YACX,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;YACvC,sBAAsB,EAAE,IAAI,CAAC,sBAAsB;YACnD,sBAAsB,EAAE,IAAI,CAAC,sBAAsB;YACnD,aAAa,EAAE,IAAI,CAAC,aAAa;SAClC;KACF,EACD,IAAI,CAAC,EAAE,EACP,oBAAoB,CACrB,CAAC;IAEF,MAAM,WAAW,GAAG,CAAC,MAAM,QAAQ,CACjC,mBAAmB,EACnB,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,EACxB,IAAI,CAAC,EAAE,EACP,mBAAmB,CACpB,CAAQ,CAAC;IACV,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC7C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAE3C,MAAM,UAAU,GAAG,CAAC,MAAM,QAAQ,CAChC,iBAAiB,EACjB;QACE,KAAK,EAAE;YACL,EAAE,IAAI,EAAE,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,sBAAsB,IAAI,CAAC,EAAE,EAAE,EAAE;YAC1E,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,mBAAmB,IAAI,CAAC,EAAE,EAAE,EAAE;YACpE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,iCAAiC,IAAI,CAAC,EAAE,EAAE,EAAE;SACnF;KACF,EACD,IAAI,CAAC,EAAE,EACP,iBAAiB,CAClB,CAAQ,CAAC;IACV,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAExC,MAAM,QAAQ,GAAG,CAAC,MAAM,QAAQ,CAC9B,wBAAwB,EACxB;QACE,MAAM,EAAE,8BAA8B,IAAI,CAAC,EAAE,EAAE;QAC/C,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,8BAA8B,EAAE;YACrF,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,oCAAoC,EAAE;YAC3F,EAAE,QAAQ,EAAE,mBAAmB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,qCAAqC,EAAE;YAC9F,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,iCAAiC,EAAE;YACrF,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,kCAAkC,EAAE;YAC1F,EAAE,QAAQ,EAAE,qBAAqB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,8BAA8B,EAAE;SAC1F;KACF,EACD,IAAI,CAAC,EAAE,EACP,wBAAwB,CACzB,CAAQ,CAAC;IACV,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEnC,MAAM,cAAc,GAAG,IAAI,CAAC,EAAE,CAAC;IAC/B,MAAM,SAAS,GAAG,CAAC,MAAM,QAAQ,CAC/B,sBAAsB,EACtB,EAAE,KAAK,EAAE,cAAc,EAAE,KAAK,EAAE,EAAE,EAAE,EACpC,IAAI,CAAC,EAAE,EACP,s
BAAsB,CACvB,CAAQ,CAAC;IACV,MAAM,CAAC,OAAO,SAAS,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrD,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAElD,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,WAAW;QACX,EAAE,EAAE,IAAI;QACR,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;QAC/B,eAAe,EAAE,UAAU,CAAC,KAAK,CAAC,MAAM;QACxC,aAAa,EAAE,SAAS,CAAC,YAAY;KACtC,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,KAAoB,EAAE,WAAmB;IACpE,MAAM,kBAAkB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5E,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IACxD,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,EAAE,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE,CAC5E,CAAC,KAAK,IAAI,EAAE;QACV,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,SAAS,GAAG,SAAS,EAAE,CAAC;YAC9B,IAAI,SAAS,IAAI,KAAK,CAAC,MAAM;gBAAE,OAAO;YAEtC,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC;YAC9B,IAAI,CAAC;gBACH,OAAO,CAAC,SAAS,CAAC,GAAG,MAAM,cAAc,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;YAC/D,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,SAAS,CAAC,GAAG;oBACnB,MAAM,EAAE,IAAI,CAAC,EAAE;oBACf,WAAW;oBACX,EAAE,EAAE,KAAK;oBACT,SAAS,EAAE,CAAC;oBACZ,eAAe,EAAE,CAAC;oBAClB,aAAa,EAAE,CAAC;oBAChB,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;iBAC9D,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC,CAAC,EAAE,CACL,CAAC;IAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,QAAQ,CAAC,2EAA2E,EAAE,GAAG,EAAE;IACzF,EAAE,CAAC,qEAAqE,EAAE,KAAK,IAAI,EAAE;QACnF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,kBAAkB,GAAG,MAAM,CAAC,QAAQ,CACxC,OAAO,CAAC,GAAG,CAAC,8BAA8B,IAAI,GAAG,EACjD,EAAE,CACH,CAAC;QACF,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,CAAC,CAC7F,CAAC;QAEF,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAC1C,OA
AO,CAAC,GAAG,CAAC,+BAA+B,IAAI,GAAG,EAClD,EAAE,CACH,CAAC;QACF,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACtF,CAAC;QAEF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;QACxD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO,CAAC;QAEvC,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACtD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAErD,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;QAC/E,MAAM,aAAa,GAAG;YACpB,WAAW;YACX,mBAAmB;YACnB,WAAW;YACX,gBAAgB;YAChB,gBAAgB;YAChB,oBAAoB;YACpB,mBAAmB;YACnB,iBAAiB;YACjB,wBAAwB;YACxB,sBAAsB;SACvB,CAAC;QAEF,OAAO,CAAC,GAAG,CACT,oCAAoC,OAAO,CAAC,OAAO,UAAU,OAAO,CAAC,KAAK,UAAU,SAAS,gBAAgB,WAAW,SAAS,MAAM,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,cAAc,SAAS,cAAc,sBAAsB,CAAC,MAAM,EAAE,CACrO,CAAC;QAEF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,KAAK,CACX,oCAAoC,EACpC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;gBACtB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,WAAW,EAAE,MAAM,CAAC,WAAW;gBAC/B,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC,CAAC,CACJ,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEtC,KAAK,MAAM,YAAY,IAAI,aAAa,EAAE,CAAC;YACzC,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnD,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -17,6 +17,7 @@ import { webTools } from "../tools/webTools.js";
|
|
|
17
17
|
import { githubTools } from "../tools/githubTools.js";
|
|
18
18
|
import { documentationTools } from "../tools/documentationTools.js";
|
|
19
19
|
import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
|
|
20
|
+
import { selfEvalTools } from "../tools/selfEvalTools.js";
|
|
20
21
|
// Assemble all tools like index.ts does
|
|
21
22
|
const domainTools = [
|
|
22
23
|
...verificationTools,
|
|
@@ -31,14 +32,15 @@ const domainTools = [
|
|
|
31
32
|
...githubTools,
|
|
32
33
|
...documentationTools,
|
|
33
34
|
...agentBootstrapTools,
|
|
35
|
+
...selfEvalTools,
|
|
34
36
|
];
|
|
35
37
|
const allTools = [...domainTools, ...createMetaTools(domainTools)];
|
|
36
38
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
37
39
|
// STATIC LAYER — structure validation
|
|
38
40
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
39
41
|
describe("Static: tool structure", () => {
|
|
40
|
-
it("should have
|
|
41
|
-
expect(allTools.length).toBe(
|
|
42
|
+
it("should have 60 tools total", () => {
|
|
43
|
+
expect(allTools.length).toBe(60);
|
|
42
44
|
});
|
|
43
45
|
it("every tool has name, description, inputSchema, handler", () => {
|
|
44
46
|
for (const tool of allTools) {
|
|
@@ -248,7 +250,8 @@ describe("Static: new methodology topics", () => {
|
|
|
248
250
|
expect(topics).toContain("telemetry_setup");
|
|
249
251
|
expect(topics).toContain("agents_md_maintenance");
|
|
250
252
|
expect(topics).toContain("agent_bootstrap");
|
|
251
|
-
expect(topics
|
|
253
|
+
expect(topics).toContain("autonomous_maintenance");
|
|
254
|
+
expect(topics.length).toBe(17); // All topics listed in overview
|
|
252
255
|
});
|
|
253
256
|
});
|
|
254
257
|
describe("Unit: setup_local_env", () => {
|
|
@@ -276,6 +279,160 @@ describe("Unit: research_job_market", () => {
|
|
|
276
279
|
expect(result.commonRequirements.length).toBeGreaterThan(0);
|
|
277
280
|
});
|
|
278
281
|
});
|
|
282
|
+
// Static checks: the autonomous-maintenance tools are registered and their
// input schemas list the expected required parameters.
describe("Static: autonomous maintenance tools", () => {
    // Required-parameter list from the named tool's input schema.
    const requiredParamsOf = (toolName) => allTools.find((t) => t.name === toolName).inputSchema.required;

    it("should include all autonomous tools", () => {
        const registeredNames = allTools.map((t) => t.name);
        const expectedNames = [
            "assess_risk",
            "decide_re_update",
            "run_self_maintenance",
            "scaffold_directory",
            "run_autonomous_loop",
        ];
        for (const expectedName of expectedNames) {
            expect(registeredNames).toContain(expectedName);
        }
    });
    it("assess_risk requires action parameter", () => {
        expect(requiredParamsOf("assess_risk")).toContain("action");
    });
    it("decide_re_update requires targetContent and contentType", () => {
        const required = requiredParamsOf("decide_re_update");
        expect(required).toContain("targetContent");
        expect(required).toContain("contentType");
    });
    it("scaffold_directory requires component", () => {
        expect(requiredParamsOf("scaffold_directory")).toContain("component");
    });
    it("run_autonomous_loop requires goal", () => {
        expect(requiredParamsOf("run_autonomous_loop")).toContain("goal");
    });
});
|
|
309
|
+
// Unit checks for assess_risk: two named actions with expected tiers, plus an
// unlisted action whose reasoning is expected to mention "Heuristic".
describe("Unit: assess_risk", () => {
    // Invoke the assess_risk handler for a single action string.
    const assess = async (action) => {
        const riskTool = allTools.find((t) => t.name === "assess_risk");
        return riskTool.handler({ action });
    };

    it("should classify known high-risk actions", async () => {
        const { assessment } = await assess("push_to_remote");
        expect(assessment.tier).toBe("high");
        expect(assessment.recommendation).toBe("require_confirmation");
    });
    it("should classify known low-risk actions", async () => {
        const { assessment } = await assess("read_file");
        expect(assessment.tier).toBe("low");
        expect(assessment.recommendation).toBe("auto_approve");
    });
    it("should use heuristics for unknown actions", async () => {
        const outcome = await assess("delete everything");
        expect(outcome.assessment.tier).toBe("high");
        expect(outcome.reasoning).toContain("Heuristic");
    });
});
|
|
329
|
+
// Unit checks for decide_re_update: update an existing instructions file when
// one is listed, create a new file when nothing is listed.
describe("Unit: decide_re_update", () => {
    // Invoke the decide_re_update handler with the given request.
    const decide = async (request) => {
        const decisionTool = allTools.find((t) => t.name === "decide_re_update");
        return decisionTool.handler(request);
    };

    it("should recommend update_existing for instruction files", async () => {
        const decision = await decide({
            targetContent: "New agent instructions",
            contentType: "instructions",
            existingFiles: ["AGENTS.md", "README.md"],
        });
        expect(decision.action).toBe("update_existing");
        expect(decision.existingFile).toBe("AGENTS.md");
    });
    it("should recommend create_new when no matching files exist", async () => {
        const decision = await decide({
            targetContent: "Some random config",
            contentType: "config",
            existingFiles: [],
        });
        expect(decision.action).toBe("create_new");
    });
});
|
|
350
|
+
// Unit check for run_self_maintenance: a "quick" scope run returns a report
// carrying all expected sections and at least one performed check.
describe("Unit: run_self_maintenance", () => {
    it("should return maintenance report with quick scope", async () => {
        const maintenanceTool = allTools.find((t) => t.name === "run_self_maintenance");
        const report = await maintenanceTool.handler({ scope: "quick" });
        const expectedSections = [
            "checksPerformed",
            "issuesFound",
            "actionsExecuted",
            "updatesRecommended",
            "nextScheduledCheck",
        ];
        for (const section of expectedSections) {
            expect(report).toHaveProperty(section);
        }
        expect(report.checksPerformed.length).toBeGreaterThan(0);
    });
});
|
|
362
|
+
// Unit checks for scaffold_directory: "agent_loop" yields a non-empty
// scaffold, and an unrecognised component rejects.
describe("Unit: scaffold_directory", () => {
    // Look up the tool per test, so a missing registration fails in the test body.
    const scaffoldTool = () => allTools.find((t) => t.name === "scaffold_directory");

    it("should return scaffold structure for agent_loop", async () => {
        const scaffold = await scaffoldTool().handler({ component: "agent_loop" });
        expect(scaffold.component).toBe("agent_loop");
        expect(scaffold.structure.files.length).toBeGreaterThan(0);
        expect(scaffold.createCommands.length).toBeGreaterThan(0);
        expect(scaffold.nextSteps.length).toBeGreaterThan(0);
    });
    it("should throw for unknown component", async () => {
        const pending = scaffoldTool().handler({ component: "unknown_component" });
        await expect(pending).rejects.toThrow("Unknown component");
    });
});
|
|
376
|
+
// Unit check for run_autonomous_loop: the loop echoes its goal, runs between
// one and maxIterations iterations, and ends in a recognised status.
describe("Unit: run_autonomous_loop", () => {
    it("should complete loop with goal", async () => {
        const loopTool = allTools.find((t) => t.name === "run_autonomous_loop");
        const request = {
            goal: "Test autonomous verification",
            maxIterations: 3,
            maxDurationMs: 5000,
        };
        const outcome = await loopTool.handler(request);
        const allowedStatuses = ["completed", "stopped", "timeout", "failed"];
        expect(outcome.goal).toBe("Test autonomous verification");
        expect(outcome.iterations).toBeGreaterThan(0);
        expect(outcome.iterations).toBeLessThanOrEqual(3);
        expect(allowedStatuses).toContain(outcome.status);
        expect(outcome.results.length).toBeGreaterThan(0);
    });
});
|
|
391
|
+
describe("Static: autonomous_maintenance methodology", () => {
|
|
392
|
+
it("should return autonomous_maintenance methodology with 5 steps", async () => {
|
|
393
|
+
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
394
|
+
const result = (await tool.handler({ topic: "autonomous_maintenance" }));
|
|
395
|
+
expect(result.title).toContain("Autonomous Self-Maintenance");
|
|
396
|
+
expect(result.steps.length).toBe(5);
|
|
397
|
+
expect(result.steps[0].name).toBe("Assess Risk Before Action");
|
|
398
|
+
expect(result.steps[1].name).toBe("Re-Update Before Create");
|
|
399
|
+
expect(result).toHaveProperty("riskTiers");
|
|
400
|
+
expect(result).toHaveProperty("patterns");
|
|
401
|
+
});
|
|
402
|
+
});
|
|
403
|
+
describe("Static: self-eval tools", () => {
|
|
404
|
+
it("should include all 4 self-eval tools", () => {
|
|
405
|
+
const names = allTools.map((t) => t.name);
|
|
406
|
+
expect(names).toContain("log_tool_call");
|
|
407
|
+
expect(names).toContain("get_trajectory_analysis");
|
|
408
|
+
expect(names).toContain("get_self_eval_report");
|
|
409
|
+
expect(names).toContain("get_improvement_recommendations");
|
|
410
|
+
});
|
|
411
|
+
it("log_tool_call requires sessionId and toolName", () => {
|
|
412
|
+
const tool = allTools.find((t) => t.name === "log_tool_call");
|
|
413
|
+
expect(tool.inputSchema.required).toContain("sessionId");
|
|
414
|
+
expect(tool.inputSchema.required).toContain("toolName");
|
|
415
|
+
});
|
|
416
|
+
it("get_improvement_recommendations has focus enum", () => {
|
|
417
|
+
const tool = allTools.find((t) => t.name === "get_improvement_recommendations");
|
|
418
|
+
const focusProp = tool.inputSchema.properties.focus;
|
|
419
|
+
expect(focusProp.enum).toContain("tools");
|
|
420
|
+
expect(focusProp.enum).toContain("process");
|
|
421
|
+
expect(focusProp.enum).toContain("quality");
|
|
422
|
+
expect(focusProp.enum).toContain("knowledge");
|
|
423
|
+
expect(focusProp.enum).toContain("all");
|
|
424
|
+
});
|
|
425
|
+
});
|
|
426
|
+
describe("Static: self_reinforced_learning methodology", () => {
|
|
427
|
+
it("should return self_reinforced_learning methodology with 5 steps", async () => {
|
|
428
|
+
const tool = allTools.find((t) => t.name === "getMethodology");
|
|
429
|
+
const result = (await tool.handler({ topic: "self_reinforced_learning" }));
|
|
430
|
+
expect(result.title).toContain("Self-Reinforced Learning");
|
|
431
|
+
expect(result.steps.length).toBe(5);
|
|
432
|
+
expect(result.steps[0].name).toBe("Instrument");
|
|
433
|
+
expect(result.steps[4].name).toBe("Apply & Re-Analyze");
|
|
434
|
+
});
|
|
435
|
+
});
|
|
279
436
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
280
437
|
// UNIT LAYER — individual tool behavior
|
|
281
438
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
@@ -373,6 +530,98 @@ describe("Integration: full verification cycle chain", () => {
|
|
|
373
530
|
expect(abandoned.abandoned).toBe(true);
|
|
374
531
|
});
|
|
375
532
|
});
|
|
533
|
+
describe("Unit: log_tool_call", () => {
|
|
534
|
+
it("should log a tool call and return confirmation", async () => {
|
|
535
|
+
const tool = findTool("log_tool_call");
|
|
536
|
+
const result = (await tool.handler({
|
|
537
|
+
sessionId: "test-session-001",
|
|
538
|
+
toolName: "run_recon",
|
|
539
|
+
durationMs: 42,
|
|
540
|
+
resultStatus: "success",
|
|
541
|
+
phase: "recon",
|
|
542
|
+
}));
|
|
543
|
+
expect(result.logged).toBe(true);
|
|
544
|
+
expect(result.sessionId).toBe("test-session-001");
|
|
545
|
+
expect(result.toolName).toBe("run_recon");
|
|
546
|
+
expect(result.resultStatus).toBe("success");
|
|
547
|
+
});
|
|
548
|
+
it("should log error tool calls", async () => {
|
|
549
|
+
const tool = findTool("log_tool_call");
|
|
550
|
+
const result = (await tool.handler({
|
|
551
|
+
sessionId: "test-session-001",
|
|
552
|
+
toolName: "web_search",
|
|
553
|
+
durationMs: 1500,
|
|
554
|
+
resultStatus: "error",
|
|
555
|
+
error: "API key not configured",
|
|
556
|
+
phase: "recon",
|
|
557
|
+
}));
|
|
558
|
+
expect(result.logged).toBe(true);
|
|
559
|
+
expect(result.resultStatus).toBe("error");
|
|
560
|
+
});
|
|
561
|
+
});
|
|
562
|
+
describe("Unit: get_trajectory_analysis", () => {
|
|
563
|
+
it("should return trajectory analysis with logged data", async () => {
|
|
564
|
+
// Log a few calls first
|
|
565
|
+
const logTool = findTool("log_tool_call");
|
|
566
|
+
await logTool.handler({ sessionId: "traj-test", toolName: "findTools", durationMs: 10, phase: "meta" });
|
|
567
|
+
await logTool.handler({ sessionId: "traj-test", toolName: "run_recon", durationMs: 20, phase: "recon" });
|
|
568
|
+
await logTool.handler({ sessionId: "traj-test", toolName: "log_recon_finding", durationMs: 15, phase: "recon" });
|
|
569
|
+
const tool = findTool("get_trajectory_analysis");
|
|
570
|
+
const result = (await tool.handler({ sessionId: "traj-test" }));
|
|
571
|
+
expect(result.totalCalls).toBeGreaterThanOrEqual(3);
|
|
572
|
+
expect(result.uniqueTools).toBeGreaterThanOrEqual(3);
|
|
573
|
+
expect(result.topTools.length).toBeGreaterThan(0);
|
|
574
|
+
});
|
|
575
|
+
it("should return empty message when no data exists for session", async () => {
|
|
576
|
+
const tool = findTool("get_trajectory_analysis");
|
|
577
|
+
const result = (await tool.handler({ sessionId: "nonexistent-session-xyz" }));
|
|
578
|
+
expect(result.totalCalls).toBe(0);
|
|
579
|
+
expect(result.message).toBeTruthy();
|
|
580
|
+
});
|
|
581
|
+
});
|
|
582
|
+
describe("Unit: get_self_eval_report", () => {
|
|
583
|
+
it("should return health report with all sections", async () => {
|
|
584
|
+
const tool = findTool("get_self_eval_report");
|
|
585
|
+
const result = (await tool.handler({ sinceDaysAgo: 30 }));
|
|
586
|
+
expect(result).toHaveProperty("healthScore");
|
|
587
|
+
expect(result).toHaveProperty("healthGrade");
|
|
588
|
+
expect(result).toHaveProperty("verification");
|
|
589
|
+
expect(result).toHaveProperty("gaps");
|
|
590
|
+
expect(result).toHaveProperty("evalRuns");
|
|
591
|
+
expect(result).toHaveProperty("qualityGates");
|
|
592
|
+
expect(result).toHaveProperty("knowledge");
|
|
593
|
+
expect(result).toHaveProperty("toolTrajectory");
|
|
594
|
+
expect(typeof result.healthScore).toBe("number");
|
|
595
|
+
expect(["A", "B", "C", "D", "F"]).toContain(result.healthGrade);
|
|
596
|
+
});
|
|
597
|
+
it("should include details when requested", async () => {
|
|
598
|
+
const tool = findTool("get_self_eval_report");
|
|
599
|
+
const result = (await tool.handler({ sinceDaysAgo: 30, includeDetails: true }));
|
|
600
|
+
expect(result).toHaveProperty("cycleDetails");
|
|
601
|
+
expect(result).toHaveProperty("openGapDetails");
|
|
602
|
+
});
|
|
603
|
+
});
|
|
604
|
+
describe("Unit: get_improvement_recommendations", () => {
|
|
605
|
+
it("should return structured recommendations", async () => {
|
|
606
|
+
const tool = findTool("get_improvement_recommendations");
|
|
607
|
+
const result = (await tool.handler({ sinceDaysAgo: 30 }));
|
|
608
|
+
expect(typeof result.totalRecommendations).toBe("number");
|
|
609
|
+
expect(typeof result.highPriority).toBe("number");
|
|
610
|
+
expect(typeof result.mediumPriority).toBe("number");
|
|
611
|
+
expect(typeof result.lowPriority).toBe("number");
|
|
612
|
+
expect(Array.isArray(result.recommendations)).toBe(true);
|
|
613
|
+
expect(result).toHaveProperty("_selfReinforcement");
|
|
614
|
+
expect(result._selfReinforcement.nextSteps.length).toBe(4);
|
|
615
|
+
});
|
|
616
|
+
it("should filter by focus area", async () => {
|
|
617
|
+
const tool = findTool("get_improvement_recommendations");
|
|
618
|
+
const result = (await tool.handler({ sinceDaysAgo: 30, focus: "quality" }));
|
|
619
|
+
expect(result.focus).toBe("quality");
|
|
620
|
+
for (const rec of result.recommendations) {
|
|
621
|
+
expect(rec.category).toBe("quality");
|
|
622
|
+
}
|
|
623
|
+
});
|
|
624
|
+
});
|
|
376
625
|
describe("Integration: search finds logged gaps", () => {
|
|
377
626
|
it("should find gaps via search_all_knowledge after logging", async () => {
|
|
378
627
|
const uniqueMarker = `vitest-marker-${Date.now()}`;
|