nodebench-mcp 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/NODEBENCH_AGENTS.md +74 -67
  2. package/README.md +36 -34
  3. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  4. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  5. package/dist/dashboard/operatingServer.js +3 -2
  6. package/dist/dashboard/operatingServer.js.map +1 -1
  7. package/dist/db.js +51 -3
  8. package/dist/db.js.map +1 -1
  9. package/dist/index.js +19 -18
  10. package/dist/index.js.map +1 -1
  11. package/dist/packageInfo.d.ts +3 -0
  12. package/dist/packageInfo.js +32 -0
  13. package/dist/packageInfo.js.map +1 -0
  14. package/dist/sandboxApi.js +2 -1
  15. package/dist/sandboxApi.js.map +1 -1
  16. package/dist/tools/boilerplateTools.js +10 -9
  17. package/dist/tools/boilerplateTools.js.map +1 -1
  18. package/dist/tools/documentationTools.js +2 -1
  19. package/dist/tools/documentationTools.js.map +1 -1
  20. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  21. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  22. package/dist/tools/toolRegistry.js +11 -0
  23. package/dist/tools/toolRegistry.js.map +1 -1
  24. package/dist/toolsetRegistry.js +74 -1
  25. package/dist/toolsetRegistry.js.map +1 -1
  26. package/package.json +7 -6
  27. package/scripts/install.sh +14 -14
  28. package/dist/__tests__/analytics.test.d.ts +0 -11
  29. package/dist/__tests__/analytics.test.js +0 -546
  30. package/dist/__tests__/analytics.test.js.map +0 -1
  31. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  32. package/dist/__tests__/architectComplex.test.js +0 -373
  33. package/dist/__tests__/architectComplex.test.js.map +0 -1
  34. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  35. package/dist/__tests__/architectSmoke.test.js +0 -92
  36. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  37. package/dist/__tests__/audit-registry.d.ts +0 -1
  38. package/dist/__tests__/audit-registry.js +0 -60
  39. package/dist/__tests__/audit-registry.js.map +0 -1
  40. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  41. package/dist/__tests__/batchAutopilot.test.js +0 -218
  42. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  43. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  44. package/dist/__tests__/cliSubcommands.test.js +0 -138
  45. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  46. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  47. package/dist/__tests__/comparativeBench.test.js +0 -722
  48. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  49. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  50. package/dist/__tests__/critterCalibrationEval.js +0 -370
  51. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  52. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  53. package/dist/__tests__/dynamicLoading.test.js +0 -280
  54. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  55. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  56. package/dist/__tests__/embeddingProvider.test.js +0 -86
  57. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  58. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  59. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  60. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  61. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  62. package/dist/__tests__/evalHarness.test.js +0 -1107
  63. package/dist/__tests__/evalHarness.test.js.map +0 -1
  64. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  65. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  66. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  67. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  69. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  70. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  72. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  73. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  74. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  75. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  76. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  78. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  79. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  80. package/dist/__tests__/forecastingScoring.test.js +0 -202
  81. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  83. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  84. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  86. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  87. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  90. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  91. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  92. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  93. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  94. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  95. package/dist/__tests__/helpers/answerMatch.js +0 -267
  96. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  97. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  98. package/dist/__tests__/helpers/textLlm.js +0 -214
  99. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  100. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  101. package/dist/__tests__/localDashboard.test.js +0 -226
  102. package/dist/__tests__/localDashboard.test.js.map +0 -1
  103. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  104. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  105. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  108. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  111. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  114. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  116. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  117. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  118. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  119. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  120. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  121. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  122. package/dist/__tests__/openclawDogfood.test.js +0 -535
  123. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  124. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  125. package/dist/__tests__/openclawMessaging.test.js +0 -232
  126. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  127. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  128. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  129. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  130. package/dist/__tests__/tools.test.d.ts +0 -1
  131. package/dist/__tests__/tools.test.js +0 -3201
  132. package/dist/__tests__/tools.test.js.map +0 -1
  133. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  134. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  135. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  136. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  137. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  138. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  139. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  140. package/dist/__tests__/webmcpTools.test.js +0 -195
  141. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  142. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  143. package/dist/benchmarks/testProviderBus.js +0 -272
  144. package/dist/benchmarks/testProviderBus.js.map +0 -1
  145. package/dist/hooks/postCompaction.d.ts +0 -14
  146. package/dist/hooks/postCompaction.js +0 -51
  147. package/dist/hooks/postCompaction.js.map +0 -1
  148. package/dist/security/__tests__/security.test.d.ts +0 -8
  149. package/dist/security/__tests__/security.test.js +0 -295
  150. package/dist/security/__tests__/security.test.js.map +0 -1
  151. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  152. package/dist/sync/hyperloopEval.test.js +0 -60
  153. package/dist/sync/hyperloopEval.test.js.map +0 -1
  154. package/dist/sync/store.test.d.ts +0 -4
  155. package/dist/sync/store.test.js +0 -43
  156. package/dist/sync/store.test.js.map +0 -1
  157. package/dist/tools/documentTools.d.ts +0 -5
  158. package/dist/tools/documentTools.js +0 -524
  159. package/dist/tools/documentTools.js.map +0 -1
  160. package/dist/tools/financialTools.d.ts +0 -10
  161. package/dist/tools/financialTools.js +0 -403
  162. package/dist/tools/financialTools.js.map +0 -1
  163. package/dist/tools/memoryTools.d.ts +0 -5
  164. package/dist/tools/memoryTools.js +0 -137
  165. package/dist/tools/memoryTools.js.map +0 -1
  166. package/dist/tools/planningTools.d.ts +0 -5
  167. package/dist/tools/planningTools.js +0 -147
  168. package/dist/tools/planningTools.js.map +0 -1
  169. package/dist/tools/searchTools.d.ts +0 -5
  170. package/dist/tools/searchTools.js +0 -145
  171. package/dist/tools/searchTools.js.map +0 -1
@@ -1,209 +0,0 @@
1
- /**
2
- * Open-source dataset benchmark for long-running tasks.
3
- *
4
- * This test uses BFCL v3 long-context scenarios and runs task workflows
5
- * through NodeBench MCP tools in parallel "subagent" workers.
6
- */
7
- import { describe, expect, it } from "vitest";
8
- import datasetFixture from "./fixtures/bfcl_v3_long_context.sample.json";
9
- import { verificationTools } from "../tools/verificationTools.js";
10
- import { reconTools } from "../tools/reconTools.js";
11
- import { evalTools } from "../tools/evalTools.js";
12
- import { qualityGateTools } from "../tools/qualityGateTools.js";
13
- import { flywheelTools } from "../tools/flywheelTools.js";
14
- import { learningTools } from "../tools/learningTools.js";
15
- import { documentationTools } from "../tools/documentationTools.js";
16
- import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
17
- import { createMetaTools } from "../tools/metaTools.js";
18
- const fixture = datasetFixture;
19
- const domainTools = [
20
- ...verificationTools,
21
- ...evalTools,
22
- ...qualityGateTools,
23
- ...learningTools,
24
- ...flywheelTools,
25
- ...reconTools,
26
- ...documentationTools,
27
- ...agentBootstrapTools,
28
- ];
29
- const allTools = [...domainTools, ...createMetaTools(domainTools)];
30
- const openDatasetToolCallLog = [];
31
- function findTool(name) {
32
- const tool = allTools.find((candidate) => candidate.name === name);
33
- if (!tool)
34
- throw new Error(`Tool not found: ${name}`);
35
- return tool;
36
- }
37
- async function callTool(name, args, taskId, stage) {
38
- const tool = findTool(name);
39
- try {
40
- const result = await tool.handler(args);
41
- openDatasetToolCallLog.push({ taskId, tool: name, stage, success: true });
42
- return result;
43
- }
44
- catch (error) {
45
- openDatasetToolCallLog.push({ taskId, tool: name, stage, success: false });
46
- throw error;
47
- }
48
- }
49
- async function runDatasetTask(task, workerIndex) {
50
- const started = Date.now();
51
- const recon = (await callTool("run_recon", {
52
- target: `BFCL long-context task ${task.id}`,
53
- description: `Open-source long-running benchmark task (${task.turnCount} turns).`,
54
- projectContext: {
55
- techStack: "TypeScript, MCP, SQLite",
56
- architecture: "MCP tool orchestration benchmark",
57
- },
58
- }, task.id, "recon_start"));
59
- await callTool("log_recon_finding", {
60
- sessionId: recon.sessionId,
61
- category: "dataset",
62
- summary: `Ingested BFCL task ${task.id} with ${task.turnCount} turns and ${task.expectedPathLength} expected calls.`,
63
- sourceUrl: `${fixture.sourceUrl}#${task.id}`,
64
- relevance: "Long-running multi-turn benchmark for MCP tool orchestration.",
65
- actionItems: "Run in parallel subagents and track flywheel compliance.",
66
- }, task.id, "recon_log");
67
- let discovered = (await callTool("findTools", {
68
- query: task.prompt.slice(0, 600),
69
- category: "bootstrap",
70
- }, task.id, "find_tools"));
71
- if (!Array.isArray(discovered?.tools) || discovered.tools.length === 0) {
72
- discovered = (await callTool("findTools", { query: "verification flywheel methodology", category: "verification" }, task.id, "find_tools_fallback"));
73
- }
74
- expect(Array.isArray(discovered.tools)).toBe(true);
75
- expect(discovered.tools.length).toBeGreaterThan(0);
76
- const methodology = (await callTool("getMethodology", { topic: "mandatory_flywheel" }, task.id, "get_methodology"));
77
- expect(methodology.title).toBeTruthy();
78
- expect(Array.isArray(methodology.steps)).toBe(true);
79
- expect(methodology.steps.length).toBeGreaterThan(0);
80
- const evalRun = (await callTool("start_eval_run", {
81
- name: `open-dataset-${task.id}-${Date.now()}`,
82
- description: `BFCL long-context scenario (${task.turnCount} turns, worker ${workerIndex})`,
83
- cases: [
84
- {
85
- input: task.prompt,
86
- intent: `Coordinate long-running workflow for ${task.id}`,
87
- expected: "Discover tool strategy, run eval bookkeeping, and enforce mandatory flywheel checks.",
88
- },
89
- ],
90
- }, task.id, "start_eval_run"));
91
- await callTool("record_eval_result", {
92
- caseId: evalRun.caseIds[0],
93
- verdict: "pass",
94
- score: 1,
95
- actual: `Discovered ${discovered.tools.length} candidate tools and completed the workflow.`,
96
- telemetry: {
97
- dataset: fixture.dataset,
98
- split: fixture.split,
99
- taskId: task.id,
100
- workerIndex,
101
- turnCount: task.turnCount,
102
- expectedPathLength: task.expectedPathLength,
103
- expectedPath: task.expectedPath,
104
- involvedClasses: task.involvedClasses,
105
- },
106
- }, task.id, "record_eval_result");
107
- const evalSummary = (await callTool("complete_eval_run", { runId: evalRun.runId }, task.id, "complete_eval_run"));
108
- expect(evalSummary.status).toBe("completed");
109
- expect(evalSummary.summary.passed).toBe(1);
110
- const closedLoop = (await callTool("run_closed_loop", {
111
- steps: [
112
- { step: "compile", passed: true, output: `Compile checks for ${task.id}` },
113
- { step: "lint", passed: true, output: `Lint checks for ${task.id}` },
114
- { step: "test", passed: true, output: `Benchmark assertions for ${task.id}` },
115
- ],
116
- }, task.id, "run_closed_loop"));
117
- expect(closedLoop.allPassed).toBe(true);
118
- const flywheel = (await callTool("run_mandatory_flywheel", {
119
- target: `Open-source BFCL long-context task ${task.id}`,
120
- steps: [
121
- { stepName: "static_analysis", passed: true, output: "Types and schemas validated." },
122
- { stepName: "happy_path_test", passed: true, output: "Dataset task completed end-to-end." },
123
- { stepName: "failure_path_test", passed: true, output: "Fallback discovery query validated." },
124
- { stepName: "gap_analysis", passed: true, output: "No blocking gaps for this task." },
125
- { stepName: "fix_and_reverify", passed: true, output: "No rework required after checks." },
126
- { stepName: "deploy_and_document", passed: true, output: "Benchmark result documented." },
127
- ],
128
- }, task.id, "run_mandatory_flywheel"));
129
- expect(flywheel.passed).toBe(true);
130
- const knowledge = (await callTool("search_all_knowledge", { query: task.id, limit: 10 }, task.id, "search_all_knowledge"));
131
- expect(typeof knowledge.totalResults).toBe("number");
132
- expect(knowledge.totalResults).toBeGreaterThan(0);
133
- return {
134
- taskId: task.id,
135
- workerIndex,
136
- ok: true,
137
- elapsedMs: Date.now() - started,
138
- discoveredTools: discovered.tools.length,
139
- knowledgeHits: knowledge.totalResults,
140
- };
141
- }
142
- async function runWorkerPool(tasks, concurrency) {
143
- const boundedConcurrency = Math.max(1, Math.min(concurrency, tasks.length));
144
- const results = new Array(tasks.length);
145
- let nextIndex = 0;
146
- const workers = Array.from({ length: boundedConcurrency }, (_, workerIndex) => (async () => {
147
- while (true) {
148
- const taskIndex = nextIndex++;
149
- if (taskIndex >= tasks.length)
150
- return;
151
- const task = tasks[taskIndex];
152
- try {
153
- results[taskIndex] = await runDatasetTask(task, workerIndex);
154
- }
155
- catch (error) {
156
- results[taskIndex] = {
157
- taskId: task.id,
158
- workerIndex,
159
- ok: false,
160
- elapsedMs: 0,
161
- discoveredTools: 0,
162
- knowledgeHits: 0,
163
- error: error instanceof Error ? error.message : String(error),
164
- };
165
- }
166
- }
167
- })());
168
- await Promise.all(workers);
169
- return results;
170
- }
171
- describe("Scenario: Open-Source Long-Running Dataset (Parallel Subagents)", () => {
172
- it("should execute BFCL long-context tasks with parallel MCP subagent workflows", async () => {
173
- expect(Array.isArray(fixture.tasks)).toBe(true);
174
- expect(fixture.tasks.length).toBeGreaterThan(0);
175
- const requestedTaskLimit = Number.parseInt(process.env.NODEBENCH_OPEN_DATASET_TASK_LIMIT ?? "8", 10);
176
- const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedTaskLimit) ? requestedTaskLimit : 8));
177
- const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_OPEN_DATASET_CONCURRENCY ?? "4", 10);
178
- const concurrency = Math.max(1, Math.min(taskLimit, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 4));
179
- const tasks = fixture.tasks.slice(0, taskLimit);
180
- const started = Date.now();
181
- const results = await runWorkerPool(tasks, concurrency);
182
- const elapsedMs = Date.now() - started;
183
- const failed = results.filter((result) => !result.ok);
184
- const passed = results.filter((result) => result.ok);
185
- const calledTools = new Set(openDatasetToolCallLog.map((entry) => entry.tool));
186
- const requiredTools = [
187
- "run_recon",
188
- "log_recon_finding",
189
- "findTools",
190
- "getMethodology",
191
- "start_eval_run",
192
- "record_eval_result",
193
- "complete_eval_run",
194
- "run_closed_loop",
195
- "run_mandatory_flywheel",
196
- "search_all_knowledge",
197
- ];
198
- console.log(`[open-dataset] dataset=${fixture.dataset} split=${fixture.split} tasks=${taskLimit} concurrency=${concurrency} pass=${passed.length}/${results.length} elapsedMs=${elapsedMs} toolCalls=${openDatasetToolCallLog.length}`);
199
- if (failed.length > 0) {
200
- console.error("[open-dataset] failures:", failed.map((result) => ({ taskId: result.taskId, workerIndex: result.workerIndex, error: result.error })));
201
- }
202
- expect(failed.length).toBe(0);
203
- expect(passed.length).toBe(taskLimit);
204
- for (const requiredTool of requiredTools) {
205
- expect(calledTools.has(requiredTool)).toBe(true);
206
- }
207
- });
208
- });
209
- //# sourceMappingURL=openDatasetParallelEval.test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"openDatasetParallelEval.test.js","sourceRoot":"","sources":["../../src/__tests__/openDatasetParallelEval.test.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,cAAc,MAAM,6CAA6C,CAAC;AACzE,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAuCxD,MAAM,OAAO,GAAG,cAAgC,CAAC;AAEjD,MAAM,WAAW,GAAc;IAC7B,GAAG,iBAAiB;IACpB,GAAG,SAAS;IACZ,GAAG,gBAAgB;IACnB,GAAG,aAAa;IAChB,GAAG,aAAa;IAChB,GAAG,UAAU;IACb,GAAG,kBAAkB;IACrB,GAAG,mBAAmB;CACvB,CAAC;AACF,MAAM,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,eAAe,CAAC,WAAW,CAAC,CAAC,CAAC;AAEnE,MAAM,sBAAsB,GAKvB,EAAE,CAAC;AAER,SAAS,QAAQ,CAAC,IAAY;IAC5B,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;IACnE,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC;IACtD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,QAAQ,CACrB,IAAY,EACZ,IAA6B,EAC7B,MAAc,EACd,KAAa;IAEb,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC5B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACxC,sBAAsB,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1E,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,sBAAsB,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3E,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,IAAiB,EAAE,WAAmB;IAClE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE3B,MAAM,KAAK,GAAG,CAAC,MAAM,QAAQ,CAC3B,WAAW,EACX;QACE,MAAM,EAAE,0BAA0B,IAAI,CAAC,EAAE,EAAE;QAC3C,WAAW,EAAE,4CAA4C,IAAI,CAAC,SAAS,UAAU;QACjF,cAAc,EAAE;YACd,SAAS,EAAE,yBAAyB;YACpC,YAAY,EAAE,kCAAkC;SACjD;KACF,EACD,IAAI,CAAC,EAAE,EACP,aAAa,CACd,CAAQ,CAAC;IAEV,MAAM,QAAQ,CACZ,mBAAmB,EACnB;QACE,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,QAAQ,EAAE,SAAS;QACnB,OAAO,EAAE,sBAAsB,IAAI,CAAC,EAAE,SAAS,IAAI,CAAC,SAAS,cAAc,IAAI,CAAC,kBAAkB,kBAAkB;QACpH,SAAS,EAAE,GAAG,OAAO,CAAC,SAAS,IAAI,IAAI,CAAC,EAAE,EAAE;QAC5C,SAAS,EAAE,+DAA+D;QAC1E,WAAW,EAAE,0DAA0D;KACxE,EACD,IAAI,CAAC,EAAE,EACP,WAAW,CACZ,CAAC;IAEF,IAAI,UAAU,GAAG,CAAC,MAAM,QAAQ,CAC9B,WAAW,EACX;QACE,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;QAChC,QAAQ,EAAE,WAAW;KACtB,EACD,IAAI,CAAC,EAAE,EACP,YAAY,CACb,CAAQ,CAAC;IAEV,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAAE,KAAK,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvE,UAAU,GAAG,CAAC,MAAM,QAAQ,CAC1B,WAAW,EACX,EAAE,KAAK,EAAE,mCAAmC,EAAE,QAAQ,EAAE,cAAc,EAAE,EACxE,IAAI,CAAC,EAAE,EACP,qBAAqB,CACtB,CAAQ,CAAC;IACZ,CAAC;IACD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACnD,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAEnD,MAAM,WAAW,GAAG,CAAC,MAAM,QAAQ,CACjC,gBAAgB,EAChB,EAAE,KAAK,EAAE,oBAAoB,EAAE,EAC/B,IAAI,CAAC,EAAE,EACP,iBAAiB,CAClB,CAAQ,CAAC;IACV,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;IACvC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpD,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,CAAC,MAAM,QAAQ,CAC7B,gBAAgB,EAChB;QACE,IAAI,EAAE,gBAAgB,IAAI,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE;QAC7C,WAAW,EAAE,+BAA+B,IAAI,CAAC,SAAS,kBAAkB,WAAW,GAAG;QAC1F,KAAK,EAAE;YACL;gBACE,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,MAAM,EAAE,wCAAwC,IAAI,CAAC,EAAE,EAAE;gBACzD,QAAQ,EACN,sFAAsF;aACzF;SACF;KACF,EACD,IAAI,CAAC,EAAE,EACP,gBAAgB,CACjB,CAAQ,CAAC;IAEV,MAAM,QAAQ,CACZ,oBAAoB,EACpB;QACE,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;QAC1B,OAAO,EAAE,MAAM;QACf,KAAK,EAAE,CAAC;QACR,MAAM,EAAE,cAAc,UAAU,CAAC,KAAK,CAAC,MAAM,8CAA8C;QAC3F,SAAS,EAAE;YACT,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,WAAW;YACX,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,kBAAkB,EAAE,IAAI,CAAC,kBAAkB;YAC3C,YAAY,EAAE,IAAI,CAAC,YAAY;YAC/B,eAAe,EAAE,IAAI,CAAC,eAAe;SACtC;KACF,EACD,IAAI,CAAC,EAAE,EACP,oBAAoB,CACrB,CAAC;IAEF,MAAM,WAAW,GAAG,CAAC,MAAM,QAAQ,CACjC,mBAAmB,EACnB,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,EACxB,IAAI,CAAC,EAAE,EACP,mBAAmB,CACpB,CAAQ,CAAC;IACV,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC7C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAE3C,MAAM,UAAU,GAAG,CAAC,MAAM,QAAQ,CAChC,iBAAiB,EACjB;QACE,KAAK,EAAE;YACL,EAAE,IAAI,EAAE,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,sBAAsB,IAAI,CAAC,EAAE,EAAE,EAAE;YAC1E,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,mBAAmB,IAAI,CAAC,EAAE,EAAE,EAAE;YACpE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,4BAA4B,IAAI,CAAC,EAAE,EAAE,EAAE;SAC9E;KACF,EACD,IAAI,CAAC,EAAE,EACP,iBAAiB,CAClB,CAAQ,CAAC;IACV,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAExC,MAAM,QAAQ,GAAG,CAAC,MAAM,QAAQ,CAC9B,wBAAwB,EACxB;QACE,MAAM,EAAE,sCAAsC,IAAI,CAAC,EAAE,EAAE;QACvD,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,8BAA8B,EAAE;YACrF,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,oCAAoC,EAAE;YAC3F,EAAE,QAAQ,EAAE,mBAAmB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,qCAAqC,EAAE;YAC9F,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,iCAAiC,EAAE;YACrF,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,kCAAkC,EAAE;YAC1F,EAAE,QAAQ,EAAE,qBAAqB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,8BAA8B,EAAE;SAC1F;KACF,EACD,IAAI,CAAC,EAAE,EACP,wBAAwB,CACzB,CAAQ,CAAC;IACV,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEnC,MAAM,SAAS,GAAG,CAAC,MAAM,QAAQ,CAC/B,sBAAsB,EACtB,EAAE,KAAK,EAAE,IAAI,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,EAC7B,IAAI,CAAC,EAAE,EACP,sBAAsB,CACvB,CAAQ,CAAC;IACV,MAAM,CAAC,OAAO,SAAS,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrD,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAElD,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,WAAW;QACX,EAAE,EAAE,IAAI;QACR,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;QAC/B,eAAe,EAAE,UAAU,CAAC,KAAK,CAAC,MAAM;QACxC,aAAa,EAAE,SAAS,CAAC,YAAY;KACtC,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,KAAoB,EAAE,WAAmB;IACpE,MAAM,kBAAkB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5E,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IACxD,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,EAAE,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE,CAC5E,CAAC,KAAK,IAAI,EAAE;QACV,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,SAAS,GAAG,SAAS,EAAE,CAAC;YAC9B,IAAI,SAAS,IAAI,KAAK,CAAC,MAAM;gBAAE,OAAO;YAEtC,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC;YAC9B,IAAI,CAAC;gBACH,OAAO,CAAC,SAAS,CAAC,GAAG,MAAM,cAAc,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;YAC/D,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,SAAS,CAAC,GAAG;oBACnB,MAAM,EAAE,IAAI,CAAC,EAAE;oBACf,WAAW;oBACX,EAAE,EAAE,KAAK;oBACT,SAAS,EAAE,CAAC;oBACZ,eAAe,EAAE,CAAC;oBAClB,aAAa,EAAE,CAAC;oBAChB,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;iBAC9D,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC,CAAC,EAAE,CACL,CAAC;IAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,QAAQ,CAAC,iEAAiE,EAAE,GAAG,EAAE;IAC/E,EAAE,CAAC,6EAA6E,EAAE,KAAK,IAAI,EAAE;QAC3F,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,kBAAkB,GAAG,MAAM,CAAC,QAAQ,CACxC,OAAO,CAAC,GAAG,CAAC,iCAAiC,IAAI,GAAG,EACpD,EAAE,CACH,CAAC;QACF,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,CAAC,CAC7F,CAAC;QAEF,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAC1C,OAAO,CAAC,GAAG,CAAC,kCAAkC,IAAI,GAAG,EACrD,EAAE,CACH,CAAC;QACF,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACtF,CAAC;QAEF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;QACxD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO,CAAC;QAEvC,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACtD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAErD,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;QAC/E,MAAM,aAAa,GAAG;YACpB,WAAW;YACX,mBAAmB;YACnB,WAAW;YACX,gBAAgB;YAChB,gBAAgB;YAChB,oBAAoB;YACpB,mBAAmB;YACnB,iBAAiB;YACjB,wBAAwB;YACxB,sBAAsB;SACvB,CAAC;QAEF,OAAO,CAAC,GAAG,CACT,0BAA0B,OAAO,CAAC,OAAO,UAAU,OAAO,CAAC,KAAK,UAAU,SAAS,gBAAgB,WAAW,SAAS,MAAM,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,cAAc,SAAS,cAAc,sBAAsB,CAAC,MAAM,EAAE,CAC3N,CAAC;QAEF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,KAAK,CACX,0BAA0B,EAC1B,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAC1G,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEtC,KAAK,MAAM,YAAY,IAAI,aAAa,EAAE,CAAC;YACzC,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnD,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -1,7 +0,0 @@
1
- /**
2
- * Gated dataset benchmark for long-running tool-augmented tasks (GAIA lane).
3
- *
4
- * GAIA is a gated dataset. Fixtures are generated into `.cache/gaia` (gitignored)
5
- * and this test intentionally avoids logging the raw question text to stdout.
6
- */
7
- export {};
@@ -1,279 +0,0 @@
1
- /**
2
- * Gated dataset benchmark for long-running tool-augmented tasks (GAIA lane).
3
- *
4
- * GAIA is a gated dataset. Fixtures are generated into `.cache/gaia` (gitignored)
5
- * and this test intentionally avoids logging the raw question text to stdout.
6
- */
7
- import { describe, expect, it } from "vitest";
8
- import { existsSync } from "node:fs";
9
- import { readFile } from "node:fs/promises";
10
- import path from "node:path";
11
- import { fileURLToPath } from "node:url";
12
- import { verificationTools } from "../tools/verificationTools.js";
13
- import { reconTools } from "../tools/reconTools.js";
14
- import { evalTools } from "../tools/evalTools.js";
15
- import { qualityGateTools } from "../tools/qualityGateTools.js";
16
- import { flywheelTools } from "../tools/flywheelTools.js";
17
- import { learningTools } from "../tools/learningTools.js";
18
- import { documentationTools } from "../tools/documentationTools.js";
19
- import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
20
- import { createMetaTools } from "../tools/metaTools.js";
21
- const domainTools = [
22
- ...verificationTools,
23
- ...evalTools,
24
- ...qualityGateTools,
25
- ...learningTools,
26
- ...flywheelTools,
27
- ...reconTools,
28
- ...documentationTools,
29
- ...agentBootstrapTools,
30
- ];
31
- const allTools = [...domainTools, ...createMetaTools(domainTools)];
32
- const openDatasetToolCallLog = [];
33
- function findTool(name) {
34
- const tool = allTools.find((candidate) => candidate.name === name);
35
- if (!tool)
36
- throw new Error(`Tool not found: ${name}`);
37
- return tool;
38
- }
39
- async function callTool(name, args, taskId, stage) {
40
- const tool = findTool(name);
41
- try {
42
- const result = await tool.handler(args);
43
- openDatasetToolCallLog.push({ taskId, tool: name, stage, success: true });
44
- return result;
45
- }
46
- catch (error) {
47
- openDatasetToolCallLog.push({ taskId, tool: name, stage, success: false });
48
- throw error;
49
- }
50
- }
51
- function buildDiscoveryQuery(task) {
52
- const promptLower = task.prompt.toLowerCase();
53
- const tags = [];
54
- // These tags are intentionally high-level to avoid accidentally printing or persisting
55
- // the full gated question text while still exercising tool discovery paths.
56
- if (task.hasFile)
57
- tags.push(`${task.fileExt || "file"} attachment`);
58
- if (promptLower.includes("wikipedia"))
59
- tags.push("wikipedia lookup");
60
- if (promptLower.includes("github"))
61
- tags.push("github lookup");
62
- if (promptLower.includes("youtube") || promptLower.includes("video"))
63
- tags.push("video analysis");
64
- if (promptLower.includes("spreadsheet") || task.fileExt === "xlsx")
65
- tags.push("spreadsheet analysis");
66
- if (promptLower.includes("pdf") || task.fileExt === "pdf")
67
- tags.push("pdf parsing");
68
- if (promptLower.includes("image") || ["png", "jpg", "jpeg", "webp"].includes(task.fileExt))
69
- tags.push("image ocr");
70
- if (promptLower.includes("zip code") || promptLower.includes("zipcode"))
71
- tags.push("zip code lookup");
72
- if (promptLower.includes("calculate") || promptLower.includes("round"))
73
- tags.push("calculation");
74
- const tagText = tags.length > 0 ? tags.join(", ") : "general tool-augmented reasoning";
75
- return `GAIA task ${task.id} (level ${task.level || "?"}): ${tagText}`;
76
- }
77
- function redactedEvalInput(task) {
78
- const bits = [
79
- `GAIA task ${task.id} (prompt redacted)`,
80
- `level=${task.level || "?"}`,
81
- `questionLength=${task.questionLength}`,
82
- `hasFile=${task.hasFile}`,
83
- `fileExt=${task.fileExt || ""}`,
84
- `steps=${task.annotator?.numberOfSteps ?? 0}`,
85
- `tools=${task.annotator?.numberOfTools ?? 0}`,
86
- ];
87
- return bits.join(" | ");
88
- }
89
- async function loadGaiaFixture(fixturePath) {
90
- const raw = await readFile(fixturePath, "utf8");
91
- const parsed = JSON.parse(raw);
92
- if (!parsed || !Array.isArray(parsed.tasks)) {
93
- throw new Error("Invalid GAIA fixture payload");
94
- }
95
- return parsed;
96
- }
97
- async function runDatasetTask(fixture, task, workerIndex) {
98
- const started = Date.now();
99
- const recon = (await callTool("run_recon", {
100
- target: `GAIA task ${task.id}`,
101
- description: `Gated long-running benchmark task (${fixture.config}/${fixture.split}).`,
102
- projectContext: {
103
- techStack: "TypeScript, MCP, SQLite",
104
- architecture: "MCP orchestration benchmark with parallel subagent workers",
105
- },
106
- }, task.id, "recon_start"));
107
- await callTool("log_recon_finding", {
108
- sessionId: recon.sessionId,
109
- category: "dataset",
110
- summary: `Ingested GAIA task ${task.id} (level=${task.level}, questionLength=${task.questionLength}, hasFile=${task.hasFile}, ext=${task.fileExt}).`,
111
- sourceUrl: fixture.sourceUrl,
112
- relevance: "Tool-augmented multi-step benchmark lane (GAIA).",
113
- actionItems: "Run in parallel worker pool and enforce mandatory flywheel checks.",
114
- }, task.id, "recon_log");
115
- let discovered = (await callTool("findTools", {
116
- query: buildDiscoveryQuery(task),
117
- category: "bootstrap",
118
- }, task.id, "find_tools"));
119
- if (!Array.isArray(discovered?.tools) || discovered.tools.length === 0) {
120
- discovered = (await callTool("findTools", { query: "tool-augmented reasoning with files web and computation", category: "verification" }, task.id, "find_tools_fallback"));
121
- }
122
- expect(Array.isArray(discovered.tools)).toBe(true);
123
- expect(discovered.tools.length).toBeGreaterThan(0);
124
- const methodology = (await callTool("getMethodology", { topic: "mandatory_flywheel" }, task.id, "get_methodology"));
125
- expect(methodology.title).toBeTruthy();
126
- expect(Array.isArray(methodology.steps)).toBe(true);
127
- expect(methodology.steps.length).toBeGreaterThan(0);
128
- const evalRun = (await callTool("start_eval_run", {
129
- name: `open-dataset-gaia-${task.id}-${Date.now()}`,
130
- description: `GAIA scenario (level=${task.level}, worker ${workerIndex})`,
131
- cases: [
132
- {
133
- input: redactedEvalInput(task),
134
- intent: `Coordinate long-running GAIA workflow for ${task.id}`,
135
- expected: "Discover tool strategy, run eval bookkeeping, and enforce mandatory flywheel checks.",
136
- },
137
- ],
138
- }, task.id, "start_eval_run"));
139
- await callTool("record_eval_result", {
140
- caseId: evalRun.caseIds[0],
141
- verdict: "pass",
142
- score: 1,
143
- actual: `Discovered ${discovered.tools.length} tools and completed GAIA workflow bookkeeping.`,
144
- telemetry: {
145
- dataset: fixture.dataset,
146
- config: fixture.config,
147
- split: fixture.split,
148
- taskId: task.id,
149
- level: task.level,
150
- workerIndex,
151
- questionLength: task.questionLength,
152
- hasFile: task.hasFile,
153
- fileExt: task.fileExt,
154
- numberOfSteps: task.annotator?.numberOfSteps ?? 0,
155
- numberOfTools: task.annotator?.numberOfTools ?? 0,
156
- complexityScore: task.complexityScore,
157
- },
158
- }, task.id, "record_eval_result");
159
- const evalSummary = (await callTool("complete_eval_run", { runId: evalRun.runId }, task.id, "complete_eval_run"));
160
- expect(evalSummary.status).toBe("completed");
161
- expect(evalSummary.summary.passed).toBe(1);
162
- const closedLoop = (await callTool("run_closed_loop", {
163
- steps: [
164
- { step: "compile", passed: true, output: `Compile checks for ${task.id}` },
165
- { step: "lint", passed: true, output: `Lint checks for ${task.id}` },
166
- { step: "test", passed: true, output: `Parallel benchmark checks for ${task.id}` },
167
- ],
168
- }, task.id, "run_closed_loop"));
169
- expect(closedLoop.allPassed).toBe(true);
170
- const flywheel = (await callTool("run_mandatory_flywheel", {
171
- target: `Gated GAIA task ${task.id}`,
172
- steps: [
173
- { stepName: "static_analysis", passed: true, output: "Types and schemas validated." },
174
- { stepName: "happy_path_test", passed: true, output: "Benchmark workflow completed." },
175
- { stepName: "failure_path_test", passed: true, output: "Fallback discovery query validated." },
176
- { stepName: "gap_analysis", passed: true, output: "No blocking gaps for this task." },
177
- { stepName: "fix_and_reverify", passed: true, output: "No rework required after checks." },
178
- { stepName: "deploy_and_document", passed: true, output: "Benchmark result documented." },
179
- ],
180
- }, task.id, "run_mandatory_flywheel"));
181
- expect(flywheel.passed).toBe(true);
182
- const knowledge = (await callTool("search_all_knowledge", { query: task.id, limit: 10 }, task.id, "search_all_knowledge"));
183
- expect(typeof knowledge.totalResults).toBe("number");
184
- expect(knowledge.totalResults).toBeGreaterThan(0);
185
- return {
186
- taskId: task.id,
187
- workerIndex,
188
- ok: true,
189
- elapsedMs: Date.now() - started,
190
- discoveredTools: discovered.tools.length,
191
- knowledgeHits: knowledge.totalResults,
192
- };
193
- }
194
- async function runWorkerPool(fixture, tasks, concurrency) {
195
- const boundedConcurrency = Math.max(1, Math.min(concurrency, tasks.length));
196
- const results = new Array(tasks.length);
197
- let nextIndex = 0;
198
- const workers = Array.from({ length: boundedConcurrency }, (_, workerIndex) => (async () => {
199
- while (true) {
200
- const taskIndex = nextIndex++;
201
- if (taskIndex >= tasks.length)
202
- return;
203
- const task = tasks[taskIndex];
204
- try {
205
- results[taskIndex] = await runDatasetTask(fixture, task, workerIndex);
206
- }
207
- catch (error) {
208
- results[taskIndex] = {
209
- taskId: task.id,
210
- workerIndex,
211
- ok: false,
212
- elapsedMs: 0,
213
- discoveredTools: 0,
214
- knowledgeHits: 0,
215
- error: error instanceof Error ? error.message : String(error),
216
- };
217
- }
218
- }
219
- })());
220
- await Promise.all(workers);
221
- return results;
222
- }
223
- function resolveGaiaFixturePath() {
224
- const fixtureOverride = process.env.NODEBENCH_GAIA_FIXTURE_PATH;
225
- if (fixtureOverride)
226
- return fixtureOverride;
227
- const config = process.env.NODEBENCH_GAIA_CONFIG ?? "2023_level3";
228
- const split = process.env.NODEBENCH_GAIA_SPLIT ?? "validation";
229
- const testDir = path.dirname(fileURLToPath(import.meta.url));
230
- const repoRoot = path.resolve(testDir, "../../../..");
231
- return path.join(repoRoot, ".cache", "gaia", `gaia_${config}_${split}.sample.json`);
232
- }
233
- const gaiaFixturePath = resolveGaiaFixturePath();
234
- const hasGaiaFixture = existsSync(gaiaFixturePath);
235
- describe("Scenario: GAIA (Gated) Long-Running Dataset (Parallel Subagents)", () => {
236
- const testFn = hasGaiaFixture ? it : it.skip;
237
- testFn("should execute GAIA tasks with parallel MCP subagent workflows", async () => {
238
- const fixture = await loadGaiaFixture(gaiaFixturePath);
239
- expect(Array.isArray(fixture.tasks)).toBe(true);
240
- expect(fixture.tasks.length).toBeGreaterThan(0);
241
- const requestedTaskLimit = Number.parseInt(process.env.NODEBENCH_GAIA_TASK_LIMIT ?? "8", 10);
242
- const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedTaskLimit) ? requestedTaskLimit : 8));
243
- const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CONCURRENCY ?? "4", 10);
244
- const concurrency = Math.max(1, Math.min(taskLimit, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 4));
245
- const tasks = fixture.tasks.slice(0, taskLimit);
246
- const started = Date.now();
247
- const results = await runWorkerPool(fixture, tasks, concurrency);
248
- const elapsedMs = Date.now() - started;
249
- const failed = results.filter((result) => !result.ok);
250
- const passed = results.filter((result) => result.ok);
251
- const calledTools = new Set(openDatasetToolCallLog.map((entry) => entry.tool));
252
- const requiredTools = [
253
- "run_recon",
254
- "log_recon_finding",
255
- "findTools",
256
- "getMethodology",
257
- "start_eval_run",
258
- "record_eval_result",
259
- "complete_eval_run",
260
- "run_closed_loop",
261
- "run_mandatory_flywheel",
262
- "search_all_knowledge",
263
- ];
264
- console.log(`[open-dataset] dataset=${fixture.dataset} config=${fixture.config} split=${fixture.split} tasks=${taskLimit} concurrency=${concurrency} pass=${passed.length}/${results.length} elapsedMs=${elapsedMs} toolCalls=${openDatasetToolCallLog.length}`);
265
- if (failed.length > 0) {
266
- console.error("[open-dataset] failures:", failed.map((result) => ({
267
- taskId: result.taskId,
268
- workerIndex: result.workerIndex,
269
- error: result.error,
270
- })));
271
- }
272
- expect(failed.length).toBe(0);
273
- expect(passed.length).toBe(taskLimit);
274
- for (const requiredTool of requiredTools) {
275
- expect(calledTools.has(requiredTool)).toBe(true);
276
- }
277
- });
278
- });
279
- //# sourceMappingURL=openDatasetParallelEvalGaia.test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"openDatasetParallelEvalGaia.test.js","sourceRoot":"","sources":["../../src/__tests__/openDatasetParallelEvalGaia.test.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAClE,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAChE,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iCAAiC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AA+CxD,MAAM,WAAW,GAAc;IAC7B,GAAG,iBAAiB;IACpB,GAAG,SAAS;IACZ,GAAG,gBAAgB;IACnB,GAAG,aAAa;IAChB,GAAG,aAAa;IAChB,GAAG,UAAU;IACb,GAAG,kBAAkB;IACrB,GAAG,mBAAmB;CACvB,CAAC;AACF,MAAM,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,eAAe,CAAC,WAAW,CAAC,CAAC,CAAC;AAEnE,MAAM,sBAAsB,GAKvB,EAAE,CAAC;AAER,SAAS,QAAQ,CAAC,IAAY;IAC5B,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;IACnE,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC;IACtD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,QAAQ,CACrB,IAAY,EACZ,IAA6B,EAC7B,MAAc,EACd,KAAa;IAEb,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC5B,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACxC,sBAAsB,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1E,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,sBAAsB,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3E,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAiB;IAC5C,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;IAC9C,MAAM,IAAI,GAAa,EAAE,CAAC;IAE1B,uFAAuF;IACvF,4EAA4E;IAC5E,IAAI,IAAI,CAAC,OAAO;QAAE,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,OAAO,IAAI,MAAM,aAAa,CAAC,CAAC;IACpE,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;QAAE,IAAI,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,IAAI,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC;QAAE,IAAI,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC/D,IAAI,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC;QAAE,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAClG,IAAI,WAAW,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,IAAI,CAAC,OAAO,KAAK,MAAM;QAAE,IAAI,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;IACtG,IAAI,WAAW,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,OAAO,KAAK,KAAK;QAAE,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IACpF,IAAI,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC;QACxF,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACzB,IAAI,WAAW,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC;QAAE,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;IACtG,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,WAAW,CAAC,QAAQ,CAAC,OAAO,CAAC;QAAE,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAEjG,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,kCAAkC,CAAC;IACvF,OAAO,aAAa,IAAI,CAAC,EAAE,WAAW,IAAI,CAAC,KAAK,IAAI,GAAG,MAAM,OAAO,EAAE,CAAC;AACzE,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAiB;IAC1C,MAAM,IAAI,GAAG;QACX,aAAa,IAAI,CAAC,EAAE,oBAAoB;QACxC,SAAS,IAAI,CAAC,KAAK,IAAI,GAAG,EAAE;QAC5B,kBAAkB,IAAI,CAAC,cAAc,EAAE;QACvC,WAAW,IAAI,CAAC,OAAO,EAAE;QACzB,WAAW,IAAI,CAAC,OAAO,IAAI,EAAE,EAAE;QAC/B,SAAS,IAAI,CAAC,SAAS,EAAE,aAAa,IAAI,CAAC,EAAE;QAC7C,SAAS,IAAI,CAAC,SAAS,EAAE,aAAa,IAAI,CAAC,EAAE;KAC9C,CAAC;IACF,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;AAC1B,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,WAAmB;IAChD,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;IAChD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAmB,CAAC;IACjD,IAAI,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAE,MAAc,CAAC,KAAK,CAAC,EAAE,CAAC;QACrD,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAClD,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,KAAK,UAAU,cAAc,CAC3B,OAAuB,EACvB,IAAiB,EACjB,WAAmB;IAEnB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE3B,MAAM,KAAK,GAAG,CAAC,MAAM,QAAQ,CAC3B,WAAW,EACX;QACE,MAAM,EAAE,aAAa,IAAI,CAAC,EAAE,EAAE;QAC9B,WAAW,EAAE,sCAAsC,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,KAAK,IAAI;QACtF,cAAc,EAAE;YACd,SAAS,EAAE,yBAAyB;YACpC,YAAY,EAAE,4DAA4D;SAC3E;KACF,EACD,IAAI,CAAC,EAAE,EACP,aAAa,CACd,CAAQ,CAAC;IAEV,MAAM,QAAQ,CACZ,mBAAmB,EACnB;QACE,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,QAAQ,EAAE,SAAS;QACnB,OAAO,EAAE,sBAAsB,IAAI,CAAC,EAAE,WAAW,IAAI,CAAC,KAAK,oBAAoB,IAAI,CAAC,cAAc,aAAa,IAAI,CAAC,OAAO,SAAS,IAAI,CAAC,OAAO,IAAI;QACpJ,SAAS,EAAE,OAAO,CAAC,SAAS;QAC5B,SAAS,EAAE,kDAAkD;QAC7D,WAAW,EAAE,oEAAoE;KAClF,EACD,IAAI,CAAC,EAAE,EACP,WAAW,CACZ,CAAC;IAEF,IAAI,UAAU,GAAG,CAAC,MAAM,QAAQ,CAC9B,WAAW,EACX;QACE,KAAK,EAAE,mBAAmB,CAAC,IAAI,CAAC;QAChC,QAAQ,EAAE,WAAW;KACtB,EACD,IAAI,CAAC,EAAE,EACP,YAAY,CACb,CAAQ,CAAC;IAEV,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAAE,KAAK,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvE,UAAU,GAAG,CAAC,MAAM,QAAQ,CAC1B,WAAW,EACX,EAAE,KAAK,EAAE,yDAAyD,EAAE,QAAQ,EAAE,cAAc,EAAE,EAC9F,IAAI,CAAC,EAAE,EACP,qBAAqB,CACtB,CAAQ,CAAC;IACZ,CAAC;IACD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACnD,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAEnD,MAAM,WAAW,GAAG,CAAC,MAAM,QAAQ,CACjC,gBAAgB,EAChB,EAAE,KAAK,EAAE,oBAAoB,EAAE,EAC/B,IAAI,CAAC,EAAE,EACP,iBAAiB,CAClB,CAAQ,CAAC;IACV,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,UAAU,EAAE,CAAC;IACvC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACpD,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAEpD,MAAM,OAAO,GAAG,CAAC,MAAM,QAAQ,CAC7B,gBAAgB,EAChB;QACE,IAAI,EAAE,qBAAqB,IAAI,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE;QAClD,WAAW,EAAE,wBAAwB,IAAI,CAAC,KAAK,YAAY,WAAW,GAAG;QACzE,KAAK,EAAE;YACL;gBACE,KAAK,EAAE,iBAAiB,CAAC,IAAI,CAAC;gBAC9B,MAAM,EAAE,6CAA6C,IAAI,CAAC,EAAE,EAAE;gBAC9D,QAAQ,EACN,sFAAsF;aACzF;SACF;KACF,EACD,IAAI,CAAC,EAAE,EACP,gBAAgB,CACjB,CAAQ,CAAC;IAEV,MAAM,QAAQ,CACZ,oBAAoB,EACpB;QACE,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;QAC1B,OAAO,EAAE,MAAM;QACf,KAAK,EAAE,CAAC;QACR,MAAM,EAAE,cAAc,UAAU,CAAC,KAAK,CAAC,MAAM,iDAAiD;QAC9F,SAAS,EAAE;YACT,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,WAAW;YACX,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,aAAa,EAAE,IAAI,CAAC,SAAS,EAAE,aAAa,IAAI,CAAC;YACjD,aAAa,EAAE,IAAI,CAAC,SAAS,EAAE,aAAa,IAAI,CAAC;YACjD,eAAe,EAAE,IAAI,CAAC,eAAe;SACtC;KACF,EACD,IAAI,CAAC,EAAE,EACP,oBAAoB,CACrB,CAAC;IAEF,MAAM,WAAW,GAAG,CAAC,MAAM,QAAQ,CACjC,mBAAmB,EACnB,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,EACxB,IAAI,CAAC,EAAE,EACP,mBAAmB,CACpB,CAAQ,CAAC;IACV,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC7C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAE3C,MAAM,UAAU,GAAG,CAAC,MAAM,QAAQ,CAChC,iBAAiB,EACjB;QACE,KAAK,EAAE;YACL,EAAE,IAAI,EAAE,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,sBAAsB,IAAI,CAAC,EAAE,EAAE,EAAE;YAC1E,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,mBAAmB,IAAI,CAAC,EAAE,EAAE,EAAE;YACpE,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,iCAAiC,IAAI,CAAC,EAAE,EAAE,EAAE;SACnF;KACF,EACD,IAAI,CAAC,EAAE,EACP,iBAAiB,CAClB,CAAQ,CAAC;IACV,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAExC,MAAM,QAAQ,GAAG,CAAC,MAAM,QAAQ,CAC9B,wBAAwB,EACxB;QACE,MAAM,EAAE,mBAAmB,IAAI,CAAC,EAAE,EAAE;QACpC,KAAK,EAAE;YACL,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,8BAA8B,EAAE;YACrF,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,+BAA+B,EAAE;YACtF,EAAE,QAAQ,EAAE,mBAAmB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,qCAAqC,EAAE;YAC9F,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,iCAAiC,EAAE;YACrF,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,kCAAkC,EAAE;YAC1F,EAAE,QAAQ,EAAE,qBAAqB,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,8BAA8B,EAAE;SAC1F;KACF,EACD,IAAI,CAAC,EAAE,EACP,wBAAwB,CACzB,CAAQ,CAAC;IACV,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEnC,MAAM,SAAS,GAAG,CAAC,MAAM,QAAQ,CAC/B,sBAAsB,EACtB,EAAE,KAAK,EAAE,IAAI,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,EAC7B,IAAI,CAAC,EAAE,EACP,sBAAsB,CACvB,CAAQ,CAAC;IACV,MAAM,CAAC,OAAO,SAAS,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrD,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;IAElD,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,WAAW;QACX,EAAE,EAAE,IAAI;QACR,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO;QAC/B,eAAe,EAAE,UAAU,CAAC,KAAK,CAAC,MAAM;QACxC,aAAa,EAAE,SAAS,CAAC,YAAY;KACtC,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,aAAa,CAC1B,OAAuB,EACvB,KAAoB,EACpB,WAAmB;IAEnB,MAAM,kBAAkB,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IAC5E,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IACxD,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,EAAE,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE,CAC5E,CAAC,KAAK,IAAI,EAAE;QACV,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,SAAS,GAAG,SAAS,EAAE,CAAC;YAC9B,IAAI,SAAS,IAAI,KAAK,CAAC,MAAM;gBAAE,OAAO;YAEtC,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC;YAC9B,IAAI,CAAC;gBACH,OAAO,CAAC,SAAS,CAAC,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,IAAI,EAAE,WAAW,CAAC,CAAC;YACxE,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,SAAS,CAAC,GAAG;oBACnB,MAAM,EAAE,IAAI,CAAC,EAAE;oBACf,WAAW;oBACX,EAAE,EAAE,KAAK;oBACT,SAAS,EAAE,CAAC;oBACZ,eAAe,EAAE,CAAC;oBAClB,aAAa,EAAE,CAAC;oBAChB,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;iBAC9D,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC,CAAC,EAAE,CACL,CAAC;IAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,sBAAsB;IAC7B,MAAM,eAAe,GAAG,OAAO,CAAC,GAAG,CAAC,2BAA2B,CAAC;IAChE,IAAI,eAAe;QAAE,OAAO,eAAe,CAAC;IAE5C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,qBAAqB,IAAI,aAAa,CAAC;IAClE,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,YAAY,CAAC;IAC/D,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IACtD,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,MAAM,IAAI,KAAK,cAAc,CAAC,CAAC;AACtF,CAAC;AAED,MAAM,eAAe,GAAG,sBAAsB,EAAE,CAAC;AACjD,MAAM,cAAc,GAAG,UAAU,CAAC,eAAe,CAAC,CAAC;AAEnD,QAAQ,CAAC,kEAAkE,EAAE,GAAG,EAAE;IAChF,MAAM,MAAM,GAAG,cAAc,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC;IAE7C,MAAM,CAAC,gEAAgE,EAAE,KAAK,IAAI,EAAE;QAClF,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,eAAe,CAAC,CAAC;QACvD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,kBAAkB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,yBAAyB,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC7F,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CACN,OAAO,CAAC,KAAK,CAAC,MAAM,EACpB,MAAM,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,CAC7D,CACF,CAAC;QAEF,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAChG,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACtF,CAAC;QAEF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,OAAO,EAAE,KAAK,EAAE,WAAW,CAAC,CAAC;QACjE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO,CAAC;QAEvC,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACtD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAErD,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;QAC/E,MAAM,aAAa,GAAG;YACpB,WAAW;YACX,mBAAmB;YACnB,WAAW;YACX,gBAAgB;YAChB,gBAAgB;YAChB,oBAAoB;YACpB,mBAAmB;YACnB,iBAAiB;YACjB,wBAAwB;YACxB,sBAAsB;SACvB,CAAC;QAEF,OAAO,CAAC,GAAG,CACT,0BAA0B,OAAO,CAAC,OAAO,WAAW,OAAO,CAAC,MAAM,UAAU,OAAO,CAAC,KAAK,UAAU,SAAS,gBAAgB,WAAW,SAAS,MAAM,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,cAAc,SAAS,cAAc,sBAAsB,CAAC,MAAM,EAAE,CACpP,CAAC;QAEF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,KAAK,CACX,0BAA0B,EAC1B,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;gBACtB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,WAAW,EAAE,MAAM,CAAC,WAAW;gBAC/B,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC,CAAC,CACJ,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEtC,KAAK,MAAM,YAAY,IAAI,aAAa,EAAE,CAAC;YACzC,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnD,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -1,7 +0,0 @@
1
- /**
2
- * Open-source dataset benchmark for long-running tasks (SWE-bench lane).
3
- *
4
- * This test uses SWE-bench Verified issue tasks and runs task workflows
5
- * through NodeBench MCP tools in parallel "subagent" workers.
6
- */
7
- export {};