nodebench-mcp 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dashboard/operatingDashboardHtml.js +2 -1
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +3 -2
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/db.js +51 -3
- package/dist/db.js.map +1 -1
- package/dist/index.js +13 -16
- package/dist/index.js.map +1 -1
- package/dist/packageInfo.d.ts +3 -0
- package/dist/packageInfo.js +32 -0
- package/dist/packageInfo.js.map +1 -0
- package/dist/sandboxApi.js +2 -1
- package/dist/sandboxApi.js.map +1 -1
- package/dist/tools/boilerplateTools.js +10 -9
- package/dist/tools/boilerplateTools.js.map +1 -1
- package/dist/tools/documentationTools.js +2 -1
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/progressiveDiscoveryTools.js +2 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/toolRegistry.js +11 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/toolsetRegistry.js +74 -1
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +4 -3
- package/dist/__tests__/analytics.test.d.ts +0 -11
- package/dist/__tests__/analytics.test.js +0 -546
- package/dist/__tests__/analytics.test.js.map +0 -1
- package/dist/__tests__/architectComplex.test.d.ts +0 -1
- package/dist/__tests__/architectComplex.test.js +0 -373
- package/dist/__tests__/architectComplex.test.js.map +0 -1
- package/dist/__tests__/architectSmoke.test.d.ts +0 -1
- package/dist/__tests__/architectSmoke.test.js +0 -92
- package/dist/__tests__/architectSmoke.test.js.map +0 -1
- package/dist/__tests__/audit-registry.d.ts +0 -1
- package/dist/__tests__/audit-registry.js +0 -60
- package/dist/__tests__/audit-registry.js.map +0 -1
- package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
- package/dist/__tests__/batchAutopilot.test.js +0 -218
- package/dist/__tests__/batchAutopilot.test.js.map +0 -1
- package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
- package/dist/__tests__/cliSubcommands.test.js +0 -138
- package/dist/__tests__/cliSubcommands.test.js.map +0 -1
- package/dist/__tests__/comparativeBench.test.d.ts +0 -1
- package/dist/__tests__/comparativeBench.test.js +0 -722
- package/dist/__tests__/comparativeBench.test.js.map +0 -1
- package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
- package/dist/__tests__/critterCalibrationEval.js +0 -370
- package/dist/__tests__/critterCalibrationEval.js.map +0 -1
- package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
- package/dist/__tests__/dynamicLoading.test.js +0 -280
- package/dist/__tests__/dynamicLoading.test.js.map +0 -1
- package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
- package/dist/__tests__/embeddingProvider.test.js +0 -86
- package/dist/__tests__/embeddingProvider.test.js.map +0 -1
- package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
- package/dist/__tests__/evalDatasetBench.test.js +0 -738
- package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
- package/dist/__tests__/evalHarness.test.d.ts +0 -1
- package/dist/__tests__/evalHarness.test.js +0 -1107
- package/dist/__tests__/evalHarness.test.js.map +0 -1
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
- package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
- package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
- package/dist/__tests__/forecastingDogfood.test.js +0 -284
- package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
- package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
- package/dist/__tests__/forecastingScoring.test.js +0 -202
- package/dist/__tests__/forecastingScoring.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
- package/dist/__tests__/helpers/answerMatch.js +0 -267
- package/dist/__tests__/helpers/answerMatch.js.map +0 -1
- package/dist/__tests__/helpers/textLlm.d.ts +0 -25
- package/dist/__tests__/helpers/textLlm.js +0 -214
- package/dist/__tests__/helpers/textLlm.js.map +0 -1
- package/dist/__tests__/localDashboard.test.d.ts +0 -1
- package/dist/__tests__/localDashboard.test.js +0 -226
- package/dist/__tests__/localDashboard.test.js.map +0 -1
- package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
- package/dist/__tests__/multiHopDogfood.test.js +0 -303
- package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
- package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
- package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
- package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
- package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
- package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
- package/dist/__tests__/openclawDogfood.test.js +0 -535
- package/dist/__tests__/openclawDogfood.test.js.map +0 -1
- package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
- package/dist/__tests__/openclawMessaging.test.js +0 -232
- package/dist/__tests__/openclawMessaging.test.js.map +0 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
- package/dist/__tests__/presetRealWorldBench.test.js +0 -859
- package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
- package/dist/__tests__/tools.test.d.ts +0 -1
- package/dist/__tests__/tools.test.js +0 -3201
- package/dist/__tests__/tools.test.js.map +0 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
- package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
- package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
- package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
- package/dist/__tests__/traceabilityDogfood.test.js +0 -241
- package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
- package/dist/__tests__/webmcpTools.test.d.ts +0 -7
- package/dist/__tests__/webmcpTools.test.js +0 -195
- package/dist/__tests__/webmcpTools.test.js.map +0 -1
- package/dist/benchmarks/testProviderBus.d.ts +0 -7
- package/dist/benchmarks/testProviderBus.js +0 -272
- package/dist/benchmarks/testProviderBus.js.map +0 -1
- package/dist/hooks/postCompaction.d.ts +0 -14
- package/dist/hooks/postCompaction.js +0 -51
- package/dist/hooks/postCompaction.js.map +0 -1
- package/dist/security/__tests__/security.test.d.ts +0 -8
- package/dist/security/__tests__/security.test.js +0 -295
- package/dist/security/__tests__/security.test.js.map +0 -1
- package/dist/sync/hyperloopEval.test.d.ts +0 -4
- package/dist/sync/hyperloopEval.test.js +0 -60
- package/dist/sync/hyperloopEval.test.js.map +0 -1
- package/dist/sync/store.test.d.ts +0 -4
- package/dist/sync/store.test.js +0 -43
- package/dist/sync/store.test.js.map +0 -1
- package/dist/tools/documentTools.d.ts +0 -5
- package/dist/tools/documentTools.js +0 -524
- package/dist/tools/documentTools.js.map +0 -1
- package/dist/tools/financialTools.d.ts +0 -10
- package/dist/tools/financialTools.js +0 -403
- package/dist/tools/financialTools.js.map +0 -1
- package/dist/tools/memoryTools.d.ts +0 -5
- package/dist/tools/memoryTools.js +0 -137
- package/dist/tools/memoryTools.js.map +0 -1
- package/dist/tools/planningTools.d.ts +0 -5
- package/dist/tools/planningTools.js +0 -147
- package/dist/tools/planningTools.js.map +0 -1
- package/dist/tools/searchTools.d.ts +0 -5
- package/dist/tools/searchTools.js +0 -145
- package/dist/tools/searchTools.js.map +0 -1
|
@@ -1,738 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Dataset-Driven Eval Bench for NodeBench MCP Tools
|
|
3
|
-
*
|
|
4
|
-
* Tests MCP tool orchestration against REAL open-source task descriptions
|
|
5
|
-
* from SWE-bench Verified (500 human-validated GitHub issues).
|
|
6
|
-
*
|
|
7
|
-
* Each task runs through the FULL agent pipeline:
|
|
8
|
-
* Recon → Verification → Eval → Quality Gate → Learning → Flywheel
|
|
9
|
-
*
|
|
10
|
-
* This proves the tools can orchestrate real-world development workflows
|
|
11
|
-
* end-to-end, not just pass unit tests in isolation.
|
|
12
|
-
*
|
|
13
|
-
* Dataset: SWE-bench Verified (princeton-nlp/SWE-bench_Verified)
|
|
14
|
-
* Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
|
|
15
|
-
*/
|
|
16
|
-
import { describe, it, expect, afterAll } from "vitest";
|
|
17
|
-
import { verificationTools } from "../tools/verificationTools.js";
|
|
18
|
-
import { reconTools } from "../tools/reconTools.js";
|
|
19
|
-
import { evalTools } from "../tools/evalTools.js";
|
|
20
|
-
import { qualityGateTools } from "../tools/qualityGateTools.js";
|
|
21
|
-
import { flywheelTools } from "../tools/flywheelTools.js";
|
|
22
|
-
import { learningTools } from "../tools/learningTools.js";
|
|
23
|
-
import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
|
|
24
|
-
import { createMetaTools } from "../tools/metaTools.js";
|
|
25
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
26
|
-
// TOOL SETUP
|
|
27
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
28
|
-
const domainTools = [
|
|
29
|
-
...verificationTools,
|
|
30
|
-
...evalTools,
|
|
31
|
-
...qualityGateTools,
|
|
32
|
-
...learningTools,
|
|
33
|
-
...flywheelTools,
|
|
34
|
-
...reconTools,
|
|
35
|
-
...agentBootstrapTools,
|
|
36
|
-
];
|
|
37
|
-
const allTools = [...domainTools, ...createMetaTools(domainTools)];
|
|
38
|
-
const findTool = (name) => {
|
|
39
|
-
const tool = allTools.find((t) => t.name === name);
|
|
40
|
-
if (!tool)
|
|
41
|
-
throw new Error(`Tool not found: ${name}`);
|
|
42
|
-
return tool;
|
|
43
|
-
};
|
|
44
|
-
// Telemetry
|
|
45
|
-
const pipelineLog = [];
|
|
46
|
-
async function callTool(name, args, taskId, phase) {
|
|
47
|
-
const tool = findTool(name);
|
|
48
|
-
const start = Date.now();
|
|
49
|
-
try {
|
|
50
|
-
const result = await tool.handler(args);
|
|
51
|
-
pipelineLog.push({
|
|
52
|
-
taskId,
|
|
53
|
-
tool: name,
|
|
54
|
-
phase,
|
|
55
|
-
success: true,
|
|
56
|
-
durationMs: Date.now() - start,
|
|
57
|
-
});
|
|
58
|
-
return result;
|
|
59
|
-
}
|
|
60
|
-
catch (error) {
|
|
61
|
-
pipelineLog.push({
|
|
62
|
-
taskId,
|
|
63
|
-
tool: name,
|
|
64
|
-
phase,
|
|
65
|
-
success: false,
|
|
66
|
-
durationMs: Date.now() - start,
|
|
67
|
-
});
|
|
68
|
-
throw error;
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
const SWE_BENCH_TASKS = [
|
|
72
|
-
{
|
|
73
|
-
instance_id: "django__django-11133",
|
|
74
|
-
repo: "django/django",
|
|
75
|
-
problem_statement: "HttpResponse doesn't handle memoryview objects. When a memoryview is passed to HttpResponse, it displays as a memory address string representation rather than the actual content bytes.",
|
|
76
|
-
category: "bug_fix",
|
|
77
|
-
complexity: "low",
|
|
78
|
-
},
|
|
79
|
-
{
|
|
80
|
-
instance_id: "scikit-learn__scikit-learn-14053",
|
|
81
|
-
repo: "scikit-learn/scikit-learn",
|
|
82
|
-
problem_statement: "IndexError: list index out of range in export_text when the decision tree only has one feature. The export_text function crashes with an IndexError when the trained DecisionTreeClassifier uses a single feature.",
|
|
83
|
-
category: "bug_fix",
|
|
84
|
-
complexity: "low",
|
|
85
|
-
},
|
|
86
|
-
{
|
|
87
|
-
instance_id: "sympy__sympy-13372",
|
|
88
|
-
repo: "sympy/sympy",
|
|
89
|
-
problem_statement: "UnboundLocalError in evalf. Calling Mul(Max(0, y), x, evaluate=False).evalf() raises an UnboundLocalError where local variable 'reprec' is referenced before assignment.",
|
|
90
|
-
category: "bug_fix",
|
|
91
|
-
complexity: "medium",
|
|
92
|
-
},
|
|
93
|
-
{
|
|
94
|
-
instance_id: "django__django-11099",
|
|
95
|
-
repo: "django/django",
|
|
96
|
-
problem_statement: "UsernameValidator allows trailing newline in usernames. ASCIIUsernameValidator and UnicodeUsernameValidator use a regex pattern with $ which matches a trailing newline in Python, allowing invalid usernames to pass validation.",
|
|
97
|
-
category: "bug_fix",
|
|
98
|
-
complexity: "low",
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
instance_id: "astropy__astropy-12907",
|
|
102
|
-
repo: "astropy/astropy",
|
|
103
|
-
problem_statement: "Modeling's separability_matrix does not compute separability correctly for nested CompoundModels. When nesting compound models using the & operator, the separability matrix incorrectly indicates coupled outputs.",
|
|
104
|
-
category: "bug_fix",
|
|
105
|
-
complexity: "high",
|
|
106
|
-
},
|
|
107
|
-
{
|
|
108
|
-
instance_id: "django__django-11095",
|
|
109
|
-
repo: "django/django",
|
|
110
|
-
problem_statement: "Add ModelAdmin.get_inlines() hook to allow setting inlines based on the request or model instance. Currently, users must override get_inline_instances to achieve dynamic inlines.",
|
|
111
|
-
category: "feature",
|
|
112
|
-
complexity: "medium",
|
|
113
|
-
},
|
|
114
|
-
{
|
|
115
|
-
instance_id: "scikit-learn__scikit-learn-13496",
|
|
116
|
-
repo: "scikit-learn/scikit-learn",
|
|
117
|
-
problem_statement: "Expose warm_start in Isolation Forest. sklearn.ensemble.IsolationForest supports incremental addition of new trees with warm_start but the parameter is not exposed in __init__().",
|
|
118
|
-
category: "feature",
|
|
119
|
-
complexity: "low",
|
|
120
|
-
},
|
|
121
|
-
{
|
|
122
|
-
instance_id: "matplotlib__matplotlib-24627",
|
|
123
|
-
repo: "matplotlib/matplotlib",
|
|
124
|
-
problem_statement: "cla(), clf() should unset the .axes and .figure attributes of deparented artists. Clearing the axes via cla() or the figure via clf() does not unset references, leaving stale references.",
|
|
125
|
-
category: "refactor",
|
|
126
|
-
complexity: "medium",
|
|
127
|
-
},
|
|
128
|
-
{
|
|
129
|
-
instance_id: "sphinx-doc__sphinx-8265",
|
|
130
|
-
repo: "sphinx-doc/sphinx",
|
|
131
|
-
problem_statement: "Python method signatures with tuple default arguments are rendered incorrectly in docstrings. For example, color=(1, 1, 1) appears as color=1, 1, 1 in the generated documentation.",
|
|
132
|
-
category: "bug_fix",
|
|
133
|
-
complexity: "medium",
|
|
134
|
-
},
|
|
135
|
-
{
|
|
136
|
-
instance_id: "pydata__xarray-3305",
|
|
137
|
-
repo: "pydata/xarray",
|
|
138
|
-
problem_statement: "DataArray.quantile does not honor keep_attrs. When calling quantile with keep_attrs=True on a DataArray with attributes, the returned object loses those attributes.",
|
|
139
|
-
category: "bug_fix",
|
|
140
|
-
complexity: "low",
|
|
141
|
-
},
|
|
142
|
-
{
|
|
143
|
-
instance_id: "pylint-dev__pylint-4661",
|
|
144
|
-
repo: "pylint-dev/pylint",
|
|
145
|
-
problem_statement: "Make pylint XDG Base Directory Specification compliant. The .pylint.d directory clutters the user's home folder; data should be stored in $HOME/.cache/pylint following XDG.",
|
|
146
|
-
category: "feature",
|
|
147
|
-
complexity: "medium",
|
|
148
|
-
},
|
|
149
|
-
{
|
|
150
|
-
instance_id: "django__django-14017",
|
|
151
|
-
repo: "django/django",
|
|
152
|
-
problem_statement: "Q(...) & Exists(...) raises a TypeError. The bitwise AND operator between Q and Exists objects is not commutative: Exists(...) & Q(...) works, but Q(...) & Exists(...) raises TypeError.",
|
|
153
|
-
category: "bug_fix",
|
|
154
|
-
complexity: "medium",
|
|
155
|
-
},
|
|
156
|
-
{
|
|
157
|
-
instance_id: "sympy__sympy-13647",
|
|
158
|
-
repo: "sympy/sympy",
|
|
159
|
-
problem_statement: "Matrix.col_insert() no longer seems to work correctly. When inserting columns into an identity matrix, the 3x3 identity portion shifts incorrectly, producing a wrong result.",
|
|
160
|
-
category: "bug_fix",
|
|
161
|
-
complexity: "medium",
|
|
162
|
-
},
|
|
163
|
-
{
|
|
164
|
-
instance_id: "scikit-learn__scikit-learn-14141",
|
|
165
|
-
repo: "scikit-learn/scikit-learn",
|
|
166
|
-
problem_statement: "Add joblib in show_versions. joblib is a key dependency of scikit-learn but is missing from the output of sklearn.show_versions(), making it harder to debug environment-related issues.",
|
|
167
|
-
category: "documentation",
|
|
168
|
-
complexity: "low",
|
|
169
|
-
},
|
|
170
|
-
{
|
|
171
|
-
instance_id: "django__django-11039",
|
|
172
|
-
repo: "django/django",
|
|
173
|
-
problem_statement: "sqlmigrate wraps its output in BEGIN/COMMIT even if the database doesn't support transactional DDL. Should only show transaction markers when the backend supports rolling back DDL.",
|
|
174
|
-
category: "bug_fix",
|
|
175
|
-
complexity: "low",
|
|
176
|
-
},
|
|
177
|
-
{
|
|
178
|
-
instance_id: "sphinx-doc__sphinx-9258",
|
|
179
|
-
repo: "sphinx-doc/sphinx",
|
|
180
|
-
problem_statement: "The Python domain does not recognize the pipe character | as a union type separator in type annotations (PEP 604). int | str syntax is not supported in Sphinx's type annotation parsing.",
|
|
181
|
-
category: "feature",
|
|
182
|
-
complexity: "high",
|
|
183
|
-
},
|
|
184
|
-
{
|
|
185
|
-
instance_id: "astropy__astropy-13398",
|
|
186
|
-
repo: "astropy/astropy",
|
|
187
|
-
problem_statement: "A direct approach to ITRS to Observed transformations that stays within the ITRS. Current implementations route through intermediate coordinate frames unnecessarily for satellite observations.",
|
|
188
|
-
category: "feature",
|
|
189
|
-
complexity: "high",
|
|
190
|
-
},
|
|
191
|
-
{
|
|
192
|
-
instance_id: "django__django-11964",
|
|
193
|
-
repo: "django/django",
|
|
194
|
-
problem_statement: "TextChoices and IntegerChoices instances lack proper string representation. The type and display behavior differs between newly created model instances using choices and instances retrieved from the database.",
|
|
195
|
-
category: "api_change",
|
|
196
|
-
complexity: "medium",
|
|
197
|
-
},
|
|
198
|
-
{
|
|
199
|
-
instance_id: "pydata__xarray-3993",
|
|
200
|
-
repo: "pydata/xarray",
|
|
201
|
-
problem_statement: "DataArray.integrate has a 'dim' arg, but Dataset.integrate has a 'coord' arg. The API syntax is inconsistent between the two methods across DataArray and Dataset.",
|
|
202
|
-
category: "api_change",
|
|
203
|
-
complexity: "medium",
|
|
204
|
-
},
|
|
205
|
-
{
|
|
206
|
-
instance_id: "scikit-learn__scikit-learn-14710",
|
|
207
|
-
repo: "scikit-learn/scikit-learn",
|
|
208
|
-
problem_statement: "HistGradientBoostingClassifier does not work with string target when early stopping is turned on. The scorer receives y_true as encoded integers while y_pred contains original string class labels.",
|
|
209
|
-
category: "bug_fix",
|
|
210
|
-
complexity: "high",
|
|
211
|
-
},
|
|
212
|
-
];
|
|
213
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
214
|
-
// FULL AGENT PIPELINE — runs each SWE-bench task through all tool domains
|
|
215
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
216
|
-
/**
|
|
217
|
-
* Runs a single SWE-bench task through the complete MCP tool pipeline.
|
|
218
|
-
* Returns cleanup IDs for resource teardown.
|
|
219
|
-
*/
|
|
220
|
-
async function runFullPipeline(task) {
|
|
221
|
-
const taskId = task.instance_id;
|
|
222
|
-
const cleanupIds = {
|
|
223
|
-
cycleIds: [],
|
|
224
|
-
learningKeys: [],
|
|
225
|
-
};
|
|
226
|
-
// ─── Phase 1: META — Discover the right tools for this task ───
|
|
227
|
-
const toolSearch = (await callTool("findTools", { query: task.category === "bug_fix" ? "verification gap" : "feature implementation" }, taskId, "meta"));
|
|
228
|
-
expect(toolSearch.tools.length).toBeGreaterThan(0);
|
|
229
|
-
const methodology = (await callTool("getMethodology", { topic: "verification" }, taskId, "meta"));
|
|
230
|
-
expect(methodology.steps.length).toBeGreaterThan(0);
|
|
231
|
-
// ─── Phase 2: RECON — Research the problem ───
|
|
232
|
-
const reconSession = (await callTool("run_recon", {
|
|
233
|
-
target: `${task.repo}: ${task.problem_statement.slice(0, 80)}`,
|
|
234
|
-
description: `Research for ${taskId}`,
|
|
235
|
-
}, taskId, "recon"));
|
|
236
|
-
expect(reconSession.sessionId).toBeTruthy();
|
|
237
|
-
await callTool("log_recon_finding", {
|
|
238
|
-
sessionId: reconSession.sessionId,
|
|
239
|
-
category: "codebase_pattern",
|
|
240
|
-
summary: `Root cause analysis: ${task.problem_statement.slice(0, 120)}`,
|
|
241
|
-
sourceUrl: `https://github.com/${task.repo}`,
|
|
242
|
-
relevance: `Directly affects ${task.category} implementation`,
|
|
243
|
-
}, taskId, "recon");
|
|
244
|
-
const reconSummary = (await callTool("get_recon_summary", { sessionId: reconSession.sessionId }, taskId, "recon"));
|
|
245
|
-
expect(reconSummary.totalFindings).toBeGreaterThan(0);
|
|
246
|
-
// ─── Phase 3: RISK ASSESSMENT — Evaluate before acting ───
|
|
247
|
-
const risk = (await callTool("assess_risk", {
|
|
248
|
-
action: task.category === "api_change" ? "modify_public_api" : "fix_implementation",
|
|
249
|
-
context: `${task.repo} — ${task.complexity} complexity ${task.category}`,
|
|
250
|
-
}, taskId, "risk"));
|
|
251
|
-
expect(risk.assessment).toBeDefined();
|
|
252
|
-
expect(risk.assessment.tier).toBeTruthy();
|
|
253
|
-
// ─── Phase 4: VERIFICATION CYCLE — Track implementation ───
|
|
254
|
-
const cycle = (await callTool("start_verification_cycle", {
|
|
255
|
-
title: `swebench-${taskId}`,
|
|
256
|
-
description: task.problem_statement.slice(0, 200),
|
|
257
|
-
}, taskId, "verification"));
|
|
258
|
-
expect(cycle.cycleId).toBeTruthy();
|
|
259
|
-
cleanupIds.cycleIds.push(cycle.cycleId);
|
|
260
|
-
// Phase 1 findings (context gathering)
|
|
261
|
-
await callTool("log_phase_findings", {
|
|
262
|
-
cycleId: cycle.cycleId,
|
|
263
|
-
phaseNumber: 1,
|
|
264
|
-
status: "passed",
|
|
265
|
-
findings: {
|
|
266
|
-
repo: task.repo,
|
|
267
|
-
category: task.category,
|
|
268
|
-
complexity: task.complexity,
|
|
269
|
-
reconFindings: reconSummary.totalFindings,
|
|
270
|
-
},
|
|
271
|
-
}, taskId, "verification");
|
|
272
|
-
// Log a gap (every real task has at least one)
|
|
273
|
-
const severityMap = { low: "LOW", medium: "MEDIUM", high: "HIGH" };
|
|
274
|
-
const gap = (await callTool("log_gap", {
|
|
275
|
-
cycleId: cycle.cycleId,
|
|
276
|
-
severity: severityMap[task.complexity],
|
|
277
|
-
title: task.problem_statement.split(".")[0],
|
|
278
|
-
description: task.problem_statement,
|
|
279
|
-
rootCause: `Identified via recon session ${reconSession.sessionId}`,
|
|
280
|
-
fixStrategy: `Apply ${task.category} patch following ${task.repo} conventions`,
|
|
281
|
-
}, taskId, "verification"));
|
|
282
|
-
expect(gap.gapId).toBeTruthy();
|
|
283
|
-
// Resolve the gap
|
|
284
|
-
const resolved = (await callTool("resolve_gap", { gapId: gap.gapId }, taskId, "verification"));
|
|
285
|
-
expect(resolved.status).toBe("resolved");
|
|
286
|
-
// Log test results across layers
|
|
287
|
-
const testLayers = ["static", "unit", "integration"];
|
|
288
|
-
for (const layer of testLayers) {
|
|
289
|
-
await callTool("log_test_result", {
|
|
290
|
-
cycleId: cycle.cycleId,
|
|
291
|
-
layer,
|
|
292
|
-
label: `${taskId}-${layer}`,
|
|
293
|
-
passed: true,
|
|
294
|
-
output: `${layer} tests passing for ${task.repo}`,
|
|
295
|
-
}, taskId, "verification");
|
|
296
|
-
}
|
|
297
|
-
// Check verification status
|
|
298
|
-
const status = (await callTool("get_verification_status", { cycleId: cycle.cycleId }, taskId, "verification"));
|
|
299
|
-
expect(status.status).toBeTruthy();
|
|
300
|
-
// ─── Phase 5: EVAL RUN — Score the implementation ───
|
|
301
|
-
const evalRun = (await callTool("start_eval_run", {
|
|
302
|
-
name: `swebench-eval-${taskId}`,
|
|
303
|
-
description: `Eval for ${task.repo} ${task.category}`,
|
|
304
|
-
cases: [
|
|
305
|
-
{
|
|
306
|
-
input: task.problem_statement.slice(0, 100),
|
|
307
|
-
intent: `Fix ${task.category} in ${task.repo}`,
|
|
308
|
-
},
|
|
309
|
-
{
|
|
310
|
-
input: `Regression test for ${taskId}`,
|
|
311
|
-
intent: "Ensure no regression",
|
|
312
|
-
},
|
|
313
|
-
],
|
|
314
|
-
}, taskId, "eval"));
|
|
315
|
-
expect(evalRun.runId).toBeTruthy();
|
|
316
|
-
// Record results
|
|
317
|
-
for (let i = 0; i < evalRun.caseIds.length; i++) {
|
|
318
|
-
await callTool("record_eval_result", {
|
|
319
|
-
caseId: evalRun.caseIds[i],
|
|
320
|
-
actual: `Verified ${task.category} fix applied correctly`,
|
|
321
|
-
verdict: "pass",
|
|
322
|
-
score: task.complexity === "high" ? 0.85 : task.complexity === "medium" ? 0.9 : 0.95,
|
|
323
|
-
}, taskId, "eval");
|
|
324
|
-
}
|
|
325
|
-
// Complete eval
|
|
326
|
-
const evalComplete = (await callTool("complete_eval_run", { runId: evalRun.runId }, taskId, "eval"));
|
|
327
|
-
expect(evalComplete.status).toBe("completed");
|
|
328
|
-
expect(evalComplete.summary).toBeDefined();
|
|
329
|
-
// ─── Phase 6: QUALITY GATE — Gate deployment ───
|
|
330
|
-
const gate = (await callTool("run_quality_gate", {
|
|
331
|
-
gateName: "deploy_readiness",
|
|
332
|
-
target: taskId,
|
|
333
|
-
rules: [
|
|
334
|
-
{ name: "tests_pass", passed: true },
|
|
335
|
-
{ name: "no_type_errors", passed: true },
|
|
336
|
-
{ name: "no_lint_errors", passed: true },
|
|
337
|
-
{ name: "coverage_threshold", passed: true },
|
|
338
|
-
],
|
|
339
|
-
}, taskId, "quality-gate"));
|
|
340
|
-
expect(gate.passed).toBe(true);
|
|
341
|
-
// Closed loop
|
|
342
|
-
await callTool("run_closed_loop", {
|
|
343
|
-
steps: [
|
|
344
|
-
{ step: "compile", passed: true },
|
|
345
|
-
{ step: "lint", passed: true },
|
|
346
|
-
{ step: "test", passed: true },
|
|
347
|
-
],
|
|
348
|
-
}, taskId, "quality-gate");
|
|
349
|
-
// ─── Phase 7: KNOWLEDGE — Record learning ───
|
|
350
|
-
const learningKey = `swebench-${taskId}-${Date.now()}`;
|
|
351
|
-
cleanupIds.learningKeys.push(learningKey);
|
|
352
|
-
await callTool("record_learning", {
|
|
353
|
-
key: learningKey,
|
|
354
|
-
category: "pattern",
|
|
355
|
-
content: `${task.repo} ${task.category}: ${task.problem_statement.slice(0, 150)}`,
|
|
356
|
-
tags: [task.category, task.complexity, task.repo.split("/")[0]],
|
|
357
|
-
}, taskId, "knowledge");
|
|
358
|
-
// Unified search
|
|
359
|
-
const knowledge = (await callTool("search_all_knowledge", { query: task.repo.split("/")[1] }, taskId, "knowledge"));
|
|
360
|
-
expect(knowledge).toHaveProperty("learnings");
|
|
361
|
-
expect(knowledge).toHaveProperty("reconFindings");
|
|
362
|
-
expect(knowledge).toHaveProperty("gaps");
|
|
363
|
-
// ─── Phase 8: FLYWHEEL — Connect verification to eval ───
|
|
364
|
-
const flywheel = (await callTool("run_mandatory_flywheel", {
|
|
365
|
-
target: `SWE-bench ${taskId}`,
|
|
366
|
-
steps: [
|
|
367
|
-
{ stepName: "static_analysis", passed: true },
|
|
368
|
-
{ stepName: "happy_path_test", passed: true },
|
|
369
|
-
{ stepName: "failure_path_test", passed: true },
|
|
370
|
-
{ stepName: "gap_analysis", passed: true },
|
|
371
|
-
{ stepName: "fix_and_reverify", passed: true },
|
|
372
|
-
{ stepName: "deploy_and_document", passed: true },
|
|
373
|
-
],
|
|
374
|
-
}, taskId, "flywheel"));
|
|
375
|
-
expect(flywheel.passed).toBe(true);
|
|
376
|
-
return cleanupIds;
|
|
377
|
-
}
|
|
378
|
-
/**
|
|
379
|
-
* Cleanup function to abandon cycles and delete learnings after tests
|
|
380
|
-
*/
|
|
381
|
-
async function cleanup(ids) {
|
|
382
|
-
for (const cycleId of ids.cycleIds) {
|
|
383
|
-
try {
|
|
384
|
-
await findTool("abandon_cycle").handler({
|
|
385
|
-
cycleId,
|
|
386
|
-
reason: "dataset bench cleanup",
|
|
387
|
-
});
|
|
388
|
-
}
|
|
389
|
-
catch {
|
|
390
|
-
// already abandoned or completed
|
|
391
|
-
}
|
|
392
|
-
}
|
|
393
|
-
for (const key of ids.learningKeys) {
|
|
394
|
-
try {
|
|
395
|
-
await findTool("delete_learning").handler({ key });
|
|
396
|
-
}
|
|
397
|
-
catch {
|
|
398
|
-
// already deleted
|
|
399
|
-
}
|
|
400
|
-
}
|
|
401
|
-
}
|
|
402
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
403
|
-
// TEST BATCHES — 4 parallel batches of 5 tasks each
|
|
404
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
405
|
-
const BATCH_SIZE = 5;
|
|
406
|
-
const batches = [];
|
|
407
|
-
for (let i = 0; i < SWE_BENCH_TASKS.length; i += BATCH_SIZE) {
|
|
408
|
-
batches.push(SWE_BENCH_TASKS.slice(i, i + BATCH_SIZE));
|
|
409
|
-
}
|
|
410
|
-
describe("SWE-bench Dataset Bench: Batch 1 (django + scikit-learn bugs)", () => {
|
|
411
|
-
const batch = batches[0];
|
|
412
|
-
const allCleanup = [];
|
|
413
|
-
afterAll(async () => {
|
|
414
|
-
for (const ids of allCleanup)
|
|
415
|
-
await cleanup(ids);
|
|
416
|
-
});
|
|
417
|
-
for (const task of batch) {
|
|
418
|
-
it(`Full pipeline: ${task.instance_id} (${task.category}/${task.complexity})`, async () => {
|
|
419
|
-
const ids = await runFullPipeline(task);
|
|
420
|
-
allCleanup.push(ids);
|
|
421
|
-
}, 30_000);
|
|
422
|
-
}
|
|
423
|
-
});
|
|
424
|
-
describe("SWE-bench Dataset Bench: Batch 2 (features + refactors)", () => {
|
|
425
|
-
const batch = batches[1];
|
|
426
|
-
const allCleanup = [];
|
|
427
|
-
afterAll(async () => {
|
|
428
|
-
for (const ids of allCleanup)
|
|
429
|
-
await cleanup(ids);
|
|
430
|
-
});
|
|
431
|
-
for (const task of batch) {
|
|
432
|
-
it(`Full pipeline: ${task.instance_id} (${task.category}/${task.complexity})`, async () => {
|
|
433
|
-
const ids = await runFullPipeline(task);
|
|
434
|
-
allCleanup.push(ids);
|
|
435
|
-
}, 30_000);
|
|
436
|
-
}
|
|
437
|
-
});
|
|
438
|
-
// Registers one full-pipeline test per task in the third batch.
describe("SWE-bench Dataset Bench: Batch 3 (cross-repo medium complexity)", () => {
    // Values returned by runFullPipeline for every test that completed; they
    // are handed to cleanup() once the whole suite has run.
    const createdIds = [];
    afterAll(async () => {
        // Cleanup calls are awaited one at a time, in the order they were recorded.
        for (const entry of createdIds) {
            await cleanup(entry);
        }
    });
    batches[2].forEach((task) => {
        const title = `Full pipeline: ${task.instance_id} (${task.category}/${task.complexity})`;
        it(title, async () => {
            // Only recorded when the pipeline resolves; a rejected run records nothing.
            createdIds.push(await runFullPipeline(task));
        }, 30_000);
    });
});
|
|
452
|
-
// Registers one full-pipeline test per task in the fourth batch.
describe("SWE-bench Dataset Bench: Batch 4 (high complexity + API changes)", () => {
    // Values returned by runFullPipeline for every test that completed; they
    // are handed to cleanup() once the whole suite has run.
    const createdIds = [];
    afterAll(async () => {
        // Cleanup calls are awaited one at a time, in the order they were recorded.
        for (const entry of createdIds) {
            await cleanup(entry);
        }
    });
    batches[3].forEach((task) => {
        const title = `Full pipeline: ${task.instance_id} (${task.category}/${task.complexity})`;
        it(title, async () => {
            // Only recorded when the pipeline resolves; a rejected run records nothing.
            createdIds.push(await runFullPipeline(task));
        }, 30_000);
    });
});
|
|
466
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
467
|
-
// CROSS-TASK INTEGRATION — Tests that span multiple tasks
|
|
468
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
469
|
-
// Builds two single-case eval runs and checks that compare_eval_runs returns
// one of the three known recommendations.
describe("Cross-Task: Eval Comparison (baseline vs candidate)", () => {
    it("compares eval runs from two different SWE-bench tasks", async () => {
        // Third and fourth callTool arguments, shared by every call in this test.
        // NOTE(review): presumably the task id and phase that callTool records — confirm against its definition.
        const TASK_LABEL = "cross-task";
        const PHASE_LABEL = "eval";
        // Starts a one-case eval run, records a result for that case, completes
        // the run, and returns the start_eval_run response.
        const runSingleCaseEval = async (name, evalCase, outcome) => {
            const started = await callTool("start_eval_run", { name, cases: [evalCase] }, TASK_LABEL, PHASE_LABEL);
            await callTool("record_eval_result", { caseId: started.caseIds[0], ...outcome }, TASK_LABEL, PHASE_LABEL);
            await callTool("complete_eval_run", { runId: started.runId }, TASK_LABEL, PHASE_LABEL);
            return started;
        };
        // Baseline run, modelled on a low-complexity task.
        const baseline = await runSingleCaseEval(
            "swebench-baseline-cross",
            { input: "django memoryview fix", intent: "Simple bug fix" },
            { actual: "Fixed memoryview handling", verdict: "pass", score: 0.95 },
        );
        // Candidate run, modelled on a high-complexity task.
        const candidate = await runSingleCaseEval(
            "swebench-candidate-cross",
            { input: "astropy separability matrix fix", intent: "Complex nested model fix" },
            { actual: "Fixed separability computation", verdict: "pass", score: 0.85 },
        );
        const comparison = await callTool("compare_eval_runs", {
            baselineRunId: baseline.runId,
            candidateRunId: candidate.runId,
        }, TASK_LABEL, PHASE_LABEL);
        const allowedRecommendations = ["DEPLOY", "REVERT", "INVESTIGATE"];
        expect(comparison).toHaveProperty("recommendation");
        expect(allowedRecommendations).toContain(comparison.recommendation);
    }, 30_000);
});
|
|
511
|
-
// Starts a verification cycle, promotes two cases from it into an eval run,
// and checks the promote_to_eval response.
describe("Cross-Task: Promote Verification to Eval", () => {
    it("promotes a verification cycle's findings into eval cases", async () => {
        // Create a verification cycle
        const cycle = (await callTool("start_verification_cycle", {
            title: "swebench-promote-test",
            description: "Testing promote_to_eval with SWE-bench data",
        }, "cross-task-promote", "flywheel"));
        try {
            // Promote to eval
            const promoted = (await callTool("promote_to_eval", {
                cycleId: cycle.cycleId,
                evalRunName: "swebench-promoted-eval",
                cases: [
                    {
                        input: "django__django-11133: memoryview bug",
                        intent: "Verify HttpResponse handles memoryview",
                    },
                    {
                        input: "sympy__sympy-13372: evalf crash",
                        intent: "Verify Max().evalf() doesn't crash",
                    },
                ],
            }, "cross-task-promote", "flywheel"));
            expect(promoted.evalRunId).toBeTruthy();
            expect(promoted.caseCount).toBe(2);
        }
        finally {
            // Cleanup runs even when promote_to_eval rejects or an assertion
            // fails, so the cycle created above is not left behind. The handler
            // is called directly (not through callTool), as the original
            // cleanup did.
            await findTool("abandon_cycle").handler({
                cycleId: cycle.cycleId,
                reason: "bench cleanup",
            });
        }
    }, 30_000);
});
|
|
542
|
-
// Records a failing single-case eval run, triggers an investigation from it,
// and checks the fields of the trigger_investigation response.
describe("Cross-Task: Trigger Investigation from Regression", () => {
    it("triggers investigation when eval run shows failures", async () => {
        // Create a failing eval run
        const evalRun = (await callTool("start_eval_run", {
            name: "swebench-regression-detect",
            cases: [
                {
                    input: "HistGradientBoosting string target",
                    intent: "Verify early stopping with strings",
                },
            ],
        }, "cross-task-investigate", "flywheel"));
        await callTool("record_eval_result", {
            caseId: evalRun.caseIds[0],
            actual: "TypeError: comparison mismatch",
            verdict: "fail",
            score: 0.0,
        }, "cross-task-investigate", "flywheel");
        await callTool("complete_eval_run", { runId: evalRun.runId }, "cross-task-investigate", "flywheel");
        // Trigger investigation
        const investigation = (await callTool("trigger_investigation", {
            evalRunId: evalRun.runId,
            regressionDescription: "HistGradientBoosting fails with string targets when early stopping enabled",
        }, "cross-task-investigate", "flywheel"));
        try {
            expect(investigation.cycleId).toBeTruthy();
            expect(investigation.title).toBeTruthy();
            expect(investigation.linkedEvalRun).toBeTruthy();
        }
        finally {
            // Cleanup runs even when an assertion above fails, so the cycle
            // opened by trigger_investigation is not left behind. Skipped when
            // no cycle id came back, since there is nothing to abandon then.
            if (investigation?.cycleId) {
                await findTool("abandon_cycle").handler({
                    cycleId: investigation.cycleId,
                    reason: "bench cleanup",
                });
            }
        }
    }, 30_000);
});
|
|
576
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
577
|
-
// DATASET BENCH REPORT
|
|
578
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
579
|
-
// Aggregates pipelineLog into per-phase, per-task, per-category and per-tool
// statistics, prints them as boxed console tables, and asserts on the totals.
describe("Dataset Bench Report", () => {
    it("generates comprehensive SWE-bench evaluation report", () => {
        // Pads a dynamic row to 78 columns and appends the right-hand box border.
        const boxRow = (text) => text.padEnd(78) + "│";
        const totalCalls = pipelineLog.length;
        const successCalls = pipelineLog.filter((l) => l.success).length;
        const failCalls = totalCalls - successCalls;
        // An empty log would otherwise print "NaN%" in the summary.
        const successPct = totalCalls === 0 ? 0 : Math.round((successCalls / totalCalls) * 100);
        // One pass over the log fills every aggregate. Map insertion order
        // follows first appearance in the log, which fixes the row order below.
        const uniqueTools = new Set();
        const toolCounts = new Map();
        const byPhase = new Map();
        const byTask = new Map();
        for (const entry of pipelineLog) {
            uniqueTools.add(entry.tool);
            toolCounts.set(entry.tool, (toolCounts.get(entry.tool) || 0) + 1);
            // By phase (totalMs accumulates durationMs for the per-phase average)
            if (!byPhase.has(entry.phase)) {
                byPhase.set(entry.phase, {
                    calls: 0,
                    success: 0,
                    fail: 0,
                    totalMs: 0,
                    tools: new Set(),
                });
            }
            const phaseStats = byPhase.get(entry.phase);
            phaseStats.calls++;
            if (entry.success) {
                phaseStats.success++;
            }
            else {
                phaseStats.fail++;
            }
            phaseStats.totalMs += entry.durationMs;
            phaseStats.tools.add(entry.tool);
            // By task
            if (!byTask.has(entry.taskId)) {
                byTask.set(entry.taskId, {
                    calls: 0,
                    success: 0,
                    fail: 0,
                    phases: new Set(),
                });
            }
            const taskStats = byTask.get(entry.taskId);
            taskStats.calls++;
            if (entry.success) {
                taskStats.success++;
            }
            else {
                taskStats.fail++;
            }
            taskStats.phases.add(entry.phase);
        }
        // By category: reuse the per-task counters instead of re-filtering the
        // whole log once per task. Tasks with no log entries are skipped.
        const byCategory = new Map();
        for (const task of SWE_BENCH_TASKS) {
            const taskStats = byTask.get(task.instance_id);
            if (!taskStats) {
                continue;
            }
            if (!byCategory.has(task.category)) {
                byCategory.set(task.category, { count: 0, success: 0, fail: 0 });
            }
            const categoryStats = byCategory.get(task.category);
            categoryStats.count++;
            categoryStats.success += taskStats.success;
            categoryStats.fail += taskStats.fail;
        }
        // Print report
        console.log("\n");
        console.log("╔═══════════════════════════════════════════════════════════════════════════╗");
        console.log("║ SWE-BENCH DATASET BENCH — PROOF OF WORK REPORT ║");
        console.log("╚═══════════════════════════════════════════════════════════════════════════╝");
        console.log("");
        console.log(" Dataset: SWE-bench Verified (princeton-nlp/SWE-bench_Verified)");
        console.log(` Tasks: ${SWE_BENCH_TASKS.length} real GitHub issues from ${new Set(SWE_BENCH_TASKS.map((t) => t.repo)).size} repositories`);
        console.log("");
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ PIPELINE SUMMARY │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        console.log(`│ Total Tool Calls: ${String(totalCalls).padStart(4)} │`);
        console.log(`│ Unique Tools Exercised: ${String(uniqueTools.size).padStart(4)} │`);
        console.log(`│ Success Rate: ${successCalls}/${totalCalls} (${successPct}%) │`);
        console.log(`│ Tasks Completed: ${String(byTask.size).padStart(4)} │`);
        console.log(`│ Pipeline Phases: ${String(byPhase.size).padStart(4)} │`);
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Phase breakdown
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ PHASE BREAKDOWN │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        byPhase.forEach((data, phase) => {
            // Every phase entry has at least one call, so the division is safe.
            const avgMs = Math.round(data.totalMs / data.calls);
            const status = data.fail === 0 ? "OK" : "FAIL";
            console.log(boxRow(`│ ${status.padEnd(4)} ${phase.padEnd(16)} ${String(data.calls).padStart(3)} calls ${String(data.tools.size).padStart(2)} tools avg ${String(avgMs).padStart(4)}ms`));
        });
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Category breakdown
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ TASK CATEGORY BREAKDOWN │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        byCategory.forEach((data, category) => {
            // A category only exists when one of its tasks logged a call, so the
            // denominator is never zero.
            const pct = Math.round((data.success / (data.success + data.fail)) * 100);
            console.log(boxRow(`│ ${category.padEnd(16)} ${String(data.count).padStart(2)} tasks ${String(data.success).padStart(3)} ok ${String(data.fail).padStart(2)} fail (${String(pct).padStart(3)}%)`));
        });
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Per-task results
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ PER-TASK RESULTS │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        byTask.forEach((data, taskId) => {
            const status = data.fail === 0 ? "PASS" : "FAIL";
            // The label is the task id cut to 35 characters, whether or not the
            // id belongs to SWE_BENCH_TASKS, so no lookup is needed.
            const label = taskId.slice(0, 35);
            console.log(boxRow(`│ ${status.padEnd(4)} ${label.padEnd(37)} ${String(data.calls).padStart(2)} calls ${String(data.phases.size).padStart(1)} phases`));
        });
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Tools used, most frequently called first
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ TOOLS EXERCISED IN DATASET BENCH │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        const sortedTools = [...toolCounts.entries()].sort((a, b) => b[1] - a[1]);
        for (const [tool, count] of sortedTools) {
            console.log(boxRow(`│ ${tool.padEnd(30)} ${String(count).padStart(4)}x`));
        }
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Verdict
        console.log("┌─────────────────────────────────────────────────────────────────────────────┐");
        console.log("│ VERDICT │");
        console.log("├─────────────────────────────────────────────────────────────────────────────┤");
        if (failCalls === 0) {
            console.log("│ PASS: All 20 SWE-bench tasks completed full pipeline successfully │");
            console.log(`│ ${totalCalls} tool calls across ${byPhase.size} phases, 0 failures │`);
            console.log("│ MCP tools can orchestrate real-world development workflows end-to-end │");
        }
        else {
            // Count only the tasks that actually had a failing call, and let
            // boxRow add the single closing border (the template previously
            // carried its own "│" as well, printing two).
            const failedTaskCount = [...byTask.values()].filter((stats) => stats.fail > 0).length;
            console.log(boxRow(`│ PARTIAL: ${failCalls} tool calls failed across ${failedTaskCount} tasks`));
        }
        console.log("└─────────────────────────────────────────────────────────────────────────────┘");
        console.log("");
        // Assertions
        expect(failCalls).toBe(0);
        expect(uniqueTools.size).toBeGreaterThanOrEqual(15);
        expect(byTask.size).toBeGreaterThanOrEqual(20); // All 20 tasks + cross-task tests
    });
});
|
|
738
|
-
//# sourceMappingURL=evalDatasetBench.test.js.map
|