nodebench-mcp 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/NODEBENCH_AGENTS.md +74 -67
  2. package/README.md +36 -34
  3. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  4. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  5. package/dist/dashboard/operatingServer.js +3 -2
  6. package/dist/dashboard/operatingServer.js.map +1 -1
  7. package/dist/db.js +51 -3
  8. package/dist/db.js.map +1 -1
  9. package/dist/index.js +19 -18
  10. package/dist/index.js.map +1 -1
  11. package/dist/packageInfo.d.ts +3 -0
  12. package/dist/packageInfo.js +32 -0
  13. package/dist/packageInfo.js.map +1 -0
  14. package/dist/sandboxApi.js +2 -1
  15. package/dist/sandboxApi.js.map +1 -1
  16. package/dist/tools/boilerplateTools.js +10 -9
  17. package/dist/tools/boilerplateTools.js.map +1 -1
  18. package/dist/tools/documentationTools.js +2 -1
  19. package/dist/tools/documentationTools.js.map +1 -1
  20. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  21. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  22. package/dist/tools/toolRegistry.js +11 -0
  23. package/dist/tools/toolRegistry.js.map +1 -1
  24. package/dist/toolsetRegistry.js +74 -1
  25. package/dist/toolsetRegistry.js.map +1 -1
  26. package/package.json +7 -6
  27. package/scripts/install.sh +14 -14
  28. package/dist/__tests__/analytics.test.d.ts +0 -11
  29. package/dist/__tests__/analytics.test.js +0 -546
  30. package/dist/__tests__/analytics.test.js.map +0 -1
  31. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  32. package/dist/__tests__/architectComplex.test.js +0 -373
  33. package/dist/__tests__/architectComplex.test.js.map +0 -1
  34. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  35. package/dist/__tests__/architectSmoke.test.js +0 -92
  36. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  37. package/dist/__tests__/audit-registry.d.ts +0 -1
  38. package/dist/__tests__/audit-registry.js +0 -60
  39. package/dist/__tests__/audit-registry.js.map +0 -1
  40. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  41. package/dist/__tests__/batchAutopilot.test.js +0 -218
  42. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  43. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  44. package/dist/__tests__/cliSubcommands.test.js +0 -138
  45. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  46. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  47. package/dist/__tests__/comparativeBench.test.js +0 -722
  48. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  49. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  50. package/dist/__tests__/critterCalibrationEval.js +0 -370
  51. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  52. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  53. package/dist/__tests__/dynamicLoading.test.js +0 -280
  54. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  55. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  56. package/dist/__tests__/embeddingProvider.test.js +0 -86
  57. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  58. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  59. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  60. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  61. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  62. package/dist/__tests__/evalHarness.test.js +0 -1107
  63. package/dist/__tests__/evalHarness.test.js.map +0 -1
  64. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  65. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  66. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  67. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  69. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  70. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  72. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  73. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  74. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  75. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  76. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  78. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  79. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  80. package/dist/__tests__/forecastingScoring.test.js +0 -202
  81. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  83. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  84. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  86. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  87. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  90. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  91. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  92. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  93. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  94. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  95. package/dist/__tests__/helpers/answerMatch.js +0 -267
  96. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  97. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  98. package/dist/__tests__/helpers/textLlm.js +0 -214
  99. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  100. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  101. package/dist/__tests__/localDashboard.test.js +0 -226
  102. package/dist/__tests__/localDashboard.test.js.map +0 -1
  103. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  104. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  105. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  108. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  111. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  114. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  116. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  117. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  118. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  119. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  120. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  121. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  122. package/dist/__tests__/openclawDogfood.test.js +0 -535
  123. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  124. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  125. package/dist/__tests__/openclawMessaging.test.js +0 -232
  126. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  127. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  128. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  129. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  130. package/dist/__tests__/tools.test.d.ts +0 -1
  131. package/dist/__tests__/tools.test.js +0 -3201
  132. package/dist/__tests__/tools.test.js.map +0 -1
  133. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  134. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  135. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  136. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  137. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  138. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  139. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  140. package/dist/__tests__/webmcpTools.test.js +0 -195
  141. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  142. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  143. package/dist/benchmarks/testProviderBus.js +0 -272
  144. package/dist/benchmarks/testProviderBus.js.map +0 -1
  145. package/dist/hooks/postCompaction.d.ts +0 -14
  146. package/dist/hooks/postCompaction.js +0 -51
  147. package/dist/hooks/postCompaction.js.map +0 -1
  148. package/dist/security/__tests__/security.test.d.ts +0 -8
  149. package/dist/security/__tests__/security.test.js +0 -295
  150. package/dist/security/__tests__/security.test.js.map +0 -1
  151. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  152. package/dist/sync/hyperloopEval.test.js +0 -60
  153. package/dist/sync/hyperloopEval.test.js.map +0 -1
  154. package/dist/sync/store.test.d.ts +0 -4
  155. package/dist/sync/store.test.js +0 -43
  156. package/dist/sync/store.test.js.map +0 -1
  157. package/dist/tools/documentTools.d.ts +0 -5
  158. package/dist/tools/documentTools.js +0 -524
  159. package/dist/tools/documentTools.js.map +0 -1
  160. package/dist/tools/financialTools.d.ts +0 -10
  161. package/dist/tools/financialTools.js +0 -403
  162. package/dist/tools/financialTools.js.map +0 -1
  163. package/dist/tools/memoryTools.d.ts +0 -5
  164. package/dist/tools/memoryTools.js +0 -137
  165. package/dist/tools/memoryTools.js.map +0 -1
  166. package/dist/tools/planningTools.d.ts +0 -5
  167. package/dist/tools/planningTools.js +0 -147
  168. package/dist/tools/planningTools.js.map +0 -1
  169. package/dist/tools/searchTools.d.ts +0 -5
  170. package/dist/tools/searchTools.js +0 -145
  171. package/dist/tools/searchTools.js.map +0 -1
@@ -1,738 +0,0 @@
1
- /**
2
- * Dataset-Driven Eval Bench for NodeBench MCP Tools
3
- *
4
- * Tests MCP tool orchestration against REAL open-source task descriptions
5
- * from SWE-bench Verified (500 human-validated GitHub issues).
6
- *
7
- * Each task runs through the FULL agent pipeline:
8
- * Recon → Verification → Eval → Quality Gate → Learning → Flywheel
9
- *
10
- * This proves the tools can orchestrate real-world development workflows
11
- * end-to-end, not just pass unit tests in isolation.
12
- *
13
- * Dataset: SWE-bench Verified (princeton-nlp/SWE-bench_Verified)
14
- * Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
15
- */
16
- import { describe, it, expect, afterAll } from "vitest";
17
- import { verificationTools } from "../tools/verificationTools.js";
18
- import { reconTools } from "../tools/reconTools.js";
19
- import { evalTools } from "../tools/evalTools.js";
20
- import { qualityGateTools } from "../tools/qualityGateTools.js";
21
- import { flywheelTools } from "../tools/flywheelTools.js";
22
- import { learningTools } from "../tools/learningTools.js";
23
- import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
24
- import { createMetaTools } from "../tools/metaTools.js";
25
- // ═══════════════════════════════════════════════════════════════════════════
26
- // TOOL SETUP
27
- // ═══════════════════════════════════════════════════════════════════════════
28
// Every domain tool group exercised by this bench, flattened into one list.
const domainTools = [
    ...verificationTools,
    ...evalTools,
    ...qualityGateTools,
    ...learningTools,
    ...flywheelTools,
    ...reconTools,
    ...agentBootstrapTools,
];
// Meta tools are built from the domain list and appended after it, so a name
// lookup over `allTools` sees domain tools first.
const allTools = [...domainTools, ...createMetaTools(domainTools)];
38
/**
 * Looks up a tool in `allTools` by exact name.
 *
 * @param {string} name - Tool name to match against each tool's `name`.
 * @returns {object} The first tool whose `name` equals `name`.
 * @throws {Error} "Tool not found: <name>" when nothing matches.
 */
const findTool = (name) => {
    const tool = allTools.find((t) => t.name === name);
    if (!tool) {
        throw new Error(`Tool not found: ${name}`);
    }
    return tool;
};
44
// Telemetry: one entry per tool invocation, appended in completion order.
const pipelineLog = [];
/**
 * Invokes a tool by name and records the outcome in `pipelineLog`.
 *
 * @param {string} name - Tool name, resolved through `findTool`.
 * @param {object} args - Arguments passed straight to the tool's `handler`.
 * @param {string} taskId - Identifier stored with the telemetry entry.
 * @param {string} phase - Pipeline phase label stored with the telemetry entry.
 * @returns {Promise<*>} Whatever the tool handler resolves to.
 * @throws Rethrows any error raised by the handler, after logging the failure.
 *   An unknown tool name throws from `findTool` before anything is logged.
 */
async function callTool(name, args, taskId, phase) {
    const tool = findTool(name);
    const start = Date.now();
    let success = false;
    try {
        const result = await tool.handler(args);
        success = true;
        return result;
    }
    finally {
        // Single logging site for both outcomes; a handler error keeps propagating.
        pipelineLog.push({
            taskId,
            tool: name,
            phase,
            success,
            durationMs: Date.now() - start,
        });
    }
}
71
// Task fixtures driving the bench. Each entry carries the instance id, the
// GitHub repo, the problem statement, and the category / complexity labels
// that the pipeline uses to pick tool arguments, gap severity and eval scores.
const SWE_BENCH_TASKS = [
    {
        instance_id: "django__django-11133",
        repo: "django/django",
        problem_statement: "HttpResponse doesn't handle memoryview objects. When a memoryview is passed to HttpResponse, it displays as a memory address string representation rather than the actual content bytes.",
        category: "bug_fix",
        complexity: "low",
    },
    {
        instance_id: "scikit-learn__scikit-learn-14053",
        repo: "scikit-learn/scikit-learn",
        problem_statement: "IndexError: list index out of range in export_text when the decision tree only has one feature. The export_text function crashes with an IndexError when the trained DecisionTreeClassifier uses a single feature.",
        category: "bug_fix",
        complexity: "low",
    },
    {
        instance_id: "sympy__sympy-13372",
        repo: "sympy/sympy",
        problem_statement: "UnboundLocalError in evalf. Calling Mul(Max(0, y), x, evaluate=False).evalf() raises an UnboundLocalError where local variable 'reprec' is referenced before assignment.",
        category: "bug_fix",
        complexity: "medium",
    },
    {
        instance_id: "django__django-11099",
        repo: "django/django",
        problem_statement: "UsernameValidator allows trailing newline in usernames. ASCIIUsernameValidator and UnicodeUsernameValidator use a regex pattern with $ which matches a trailing newline in Python, allowing invalid usernames to pass validation.",
        category: "bug_fix",
        complexity: "low",
    },
    {
        instance_id: "astropy__astropy-12907",
        repo: "astropy/astropy",
        problem_statement: "Modeling's separability_matrix does not compute separability correctly for nested CompoundModels. When nesting compound models using the & operator, the separability matrix incorrectly indicates coupled outputs.",
        category: "bug_fix",
        complexity: "high",
    },
    {
        instance_id: "django__django-11095",
        repo: "django/django",
        problem_statement: "Add ModelAdmin.get_inlines() hook to allow setting inlines based on the request or model instance. Currently, users must override get_inline_instances to achieve dynamic inlines.",
        category: "feature",
        complexity: "medium",
    },
    {
        instance_id: "scikit-learn__scikit-learn-13496",
        repo: "scikit-learn/scikit-learn",
        problem_statement: "Expose warm_start in Isolation Forest. sklearn.ensemble.IsolationForest supports incremental addition of new trees with warm_start but the parameter is not exposed in __init__().",
        category: "feature",
        complexity: "low",
    },
    {
        instance_id: "matplotlib__matplotlib-24627",
        repo: "matplotlib/matplotlib",
        problem_statement: "cla(), clf() should unset the .axes and .figure attributes of deparented artists. Clearing the axes via cla() or the figure via clf() does not unset references, leaving stale references.",
        category: "refactor",
        complexity: "medium",
    },
    {
        instance_id: "sphinx-doc__sphinx-8265",
        repo: "sphinx-doc/sphinx",
        problem_statement: "Python method signatures with tuple default arguments are rendered incorrectly in docstrings. For example, color=(1, 1, 1) appears as color=1, 1, 1 in the generated documentation.",
        category: "bug_fix",
        complexity: "medium",
    },
    {
        instance_id: "pydata__xarray-3305",
        repo: "pydata/xarray",
        problem_statement: "DataArray.quantile does not honor keep_attrs. When calling quantile with keep_attrs=True on a DataArray with attributes, the returned object loses those attributes.",
        category: "bug_fix",
        complexity: "low",
    },
    {
        instance_id: "pylint-dev__pylint-4661",
        repo: "pylint-dev/pylint",
        problem_statement: "Make pylint XDG Base Directory Specification compliant. The .pylint.d directory clutters the user's home folder; data should be stored in $HOME/.cache/pylint following XDG.",
        category: "feature",
        complexity: "medium",
    },
    {
        instance_id: "django__django-14017",
        repo: "django/django",
        problem_statement: "Q(...) & Exists(...) raises a TypeError. The bitwise AND operator between Q and Exists objects is not commutative: Exists(...) & Q(...) works, but Q(...) & Exists(...) raises TypeError.",
        category: "bug_fix",
        complexity: "medium",
    },
    {
        instance_id: "sympy__sympy-13647",
        repo: "sympy/sympy",
        problem_statement: "Matrix.col_insert() no longer seems to work correctly. When inserting columns into an identity matrix, the 3x3 identity portion shifts incorrectly, producing a wrong result.",
        category: "bug_fix",
        complexity: "medium",
    },
    {
        instance_id: "scikit-learn__scikit-learn-14141",
        repo: "scikit-learn/scikit-learn",
        problem_statement: "Add joblib in show_versions. joblib is a key dependency of scikit-learn but is missing from the output of sklearn.show_versions(), making it harder to debug environment-related issues.",
        category: "documentation",
        complexity: "low",
    },
    {
        instance_id: "django__django-11039",
        repo: "django/django",
        problem_statement: "sqlmigrate wraps its output in BEGIN/COMMIT even if the database doesn't support transactional DDL. Should only show transaction markers when the backend supports rolling back DDL.",
        category: "bug_fix",
        complexity: "low",
    },
    {
        instance_id: "sphinx-doc__sphinx-9258",
        repo: "sphinx-doc/sphinx",
        problem_statement: "The Python domain does not recognize the pipe character | as a union type separator in type annotations (PEP 604). int | str syntax is not supported in Sphinx's type annotation parsing.",
        category: "feature",
        complexity: "high",
    },
    {
        instance_id: "astropy__astropy-13398",
        repo: "astropy/astropy",
        problem_statement: "A direct approach to ITRS to Observed transformations that stays within the ITRS. Current implementations route through intermediate coordinate frames unnecessarily for satellite observations.",
        category: "feature",
        complexity: "high",
    },
    {
        instance_id: "django__django-11964",
        repo: "django/django",
        problem_statement: "TextChoices and IntegerChoices instances lack proper string representation. The type and display behavior differs between newly created model instances using choices and instances retrieved from the database.",
        category: "api_change",
        complexity: "medium",
    },
    {
        instance_id: "pydata__xarray-3993",
        repo: "pydata/xarray",
        problem_statement: "DataArray.integrate has a 'dim' arg, but Dataset.integrate has a 'coord' arg. The API syntax is inconsistent between the two methods across DataArray and Dataset.",
        category: "api_change",
        complexity: "medium",
    },
    {
        instance_id: "scikit-learn__scikit-learn-14710",
        repo: "scikit-learn/scikit-learn",
        problem_statement: "HistGradientBoostingClassifier does not work with string target when early stopping is turned on. The scorer receives y_true as encoded integers while y_pred contains original string class labels.",
        category: "bug_fix",
        complexity: "high",
    },
];
213
- // ═══════════════════════════════════════════════════════════════════════════
214
- // FULL AGENT PIPELINE — runs each SWE-bench task through all tool domains
215
- // ═══════════════════════════════════════════════════════════════════════════
216
/**
 * Runs a single SWE-bench task through the complete MCP tool pipeline:
 * meta discovery, recon, risk assessment, verification cycle, eval run,
 * quality gate, knowledge recording and the mandatory flywheel.
 *
 * Every tool call goes through `callTool`, so each step is recorded in the
 * pipeline telemetry under the task's `instance_id`.
 *
 * @param {object} task - Task fixture with `instance_id`, `repo`,
 *   `problem_statement`, `category` and `complexity`.
 * @returns {Promise<{cycleIds: string[], learningKeys: string[]}>} Ids of the
 *   resources created by this run, for later teardown.
 * @throws Propagates any tool error or failed `expect` assertion; in that
 *   case the cleanup ids are not returned to the caller.
 */
async function runFullPipeline(task) {
    const taskId = task.instance_id;
    const cleanupIds = {
        cycleIds: [],
        learningKeys: [],
    };
    // First sentence of a statement: cut at a period that is followed by
    // whitespace or end of text, so dotted identifiers ("ModelAdmin.get_inlines")
    // and ellipses ("Q(...)") do not truncate the title.
    const firstSentence = (text) => {
        const match = /^(.*?)\.(?=\s|$)/s.exec(text);
        return match ? match[1] : text;
    };
    // ─── Phase 1: META — Discover the right tools for this task ───
    const toolSearch = await callTool("findTools", { query: task.category === "bug_fix" ? "verification gap" : "feature implementation" }, taskId, "meta");
    expect(toolSearch.tools.length).toBeGreaterThan(0);
    const methodology = await callTool("getMethodology", { topic: "verification" }, taskId, "meta");
    expect(methodology.steps.length).toBeGreaterThan(0);
    // ─── Phase 2: RECON — Research the problem ───
    const reconSession = await callTool("run_recon", {
        target: `${task.repo}: ${task.problem_statement.slice(0, 80)}`,
        description: `Research for ${taskId}`,
    }, taskId, "recon");
    expect(reconSession.sessionId).toBeTruthy();
    await callTool("log_recon_finding", {
        sessionId: reconSession.sessionId,
        category: "codebase_pattern",
        summary: `Root cause analysis: ${task.problem_statement.slice(0, 120)}`,
        sourceUrl: `https://github.com/${task.repo}`,
        relevance: `Directly affects ${task.category} implementation`,
    }, taskId, "recon");
    const reconSummary = await callTool("get_recon_summary", { sessionId: reconSession.sessionId }, taskId, "recon");
    expect(reconSummary.totalFindings).toBeGreaterThan(0);
    // ─── Phase 3: RISK ASSESSMENT — Evaluate before acting ───
    const risk = await callTool("assess_risk", {
        action: task.category === "api_change" ? "modify_public_api" : "fix_implementation",
        context: `${task.repo} — ${task.complexity} complexity ${task.category}`,
    }, taskId, "risk");
    expect(risk.assessment).toBeDefined();
    expect(risk.assessment.tier).toBeTruthy();
    // ─── Phase 4: VERIFICATION CYCLE — Track implementation ───
    const cycle = await callTool("start_verification_cycle", {
        title: `swebench-${taskId}`,
        description: task.problem_statement.slice(0, 200),
    }, taskId, "verification");
    expect(cycle.cycleId).toBeTruthy();
    // Registered right after creation so the caller can abandon the cycle later.
    cleanupIds.cycleIds.push(cycle.cycleId);
    // Phase 1 findings (context gathering)
    await callTool("log_phase_findings", {
        cycleId: cycle.cycleId,
        phaseNumber: 1,
        status: "passed",
        findings: {
            repo: task.repo,
            category: task.category,
            complexity: task.complexity,
            reconFindings: reconSummary.totalFindings,
        },
    }, taskId, "verification");
    // Log a gap (every real task has at least one)
    const severityMap = { low: "LOW", medium: "MEDIUM", high: "HIGH" };
    const gap = await callTool("log_gap", {
        cycleId: cycle.cycleId,
        severity: severityMap[task.complexity],
        title: firstSentence(task.problem_statement),
        description: task.problem_statement,
        rootCause: `Identified via recon session ${reconSession.sessionId}`,
        fixStrategy: `Apply ${task.category} patch following ${task.repo} conventions`,
    }, taskId, "verification");
    expect(gap.gapId).toBeTruthy();
    // Resolve the gap
    const resolved = await callTool("resolve_gap", { gapId: gap.gapId }, taskId, "verification");
    expect(resolved.status).toBe("resolved");
    // Log test results across layers
    const testLayers = ["static", "unit", "integration"];
    for (const layer of testLayers) {
        await callTool("log_test_result", {
            cycleId: cycle.cycleId,
            layer,
            label: `${taskId}-${layer}`,
            passed: true,
            output: `${layer} tests passing for ${task.repo}`,
        }, taskId, "verification");
    }
    // Check verification status
    const status = await callTool("get_verification_status", { cycleId: cycle.cycleId }, taskId, "verification");
    expect(status.status).toBeTruthy();
    // ─── Phase 5: EVAL RUN — Score the implementation ───
    const evalRun = await callTool("start_eval_run", {
        name: `swebench-eval-${taskId}`,
        description: `Eval for ${task.repo} ${task.category}`,
        cases: [
            {
                input: task.problem_statement.slice(0, 100),
                intent: `Fix ${task.category} in ${task.repo}`,
            },
            {
                input: `Regression test for ${taskId}`,
                intent: "Ensure no regression",
            },
        ],
    }, taskId, "eval");
    expect(evalRun.runId).toBeTruthy();
    // Record results; the score is a fixed value chosen by task complexity.
    for (let i = 0; i < evalRun.caseIds.length; i++) {
        await callTool("record_eval_result", {
            caseId: evalRun.caseIds[i],
            actual: `Verified ${task.category} fix applied correctly`,
            verdict: "pass",
            score: task.complexity === "high" ? 0.85 : task.complexity === "medium" ? 0.9 : 0.95,
        }, taskId, "eval");
    }
    // Complete eval
    const evalComplete = await callTool("complete_eval_run", { runId: evalRun.runId }, taskId, "eval");
    expect(evalComplete.status).toBe("completed");
    expect(evalComplete.summary).toBeDefined();
    // ─── Phase 6: QUALITY GATE — Gate deployment ───
    const gate = await callTool("run_quality_gate", {
        gateName: "deploy_readiness",
        target: taskId,
        rules: [
            { name: "tests_pass", passed: true },
            { name: "no_type_errors", passed: true },
            { name: "no_lint_errors", passed: true },
            { name: "coverage_threshold", passed: true },
        ],
    }, taskId, "quality-gate");
    expect(gate.passed).toBe(true);
    // Closed loop
    await callTool("run_closed_loop", {
        steps: [
            { step: "compile", passed: true },
            { step: "lint", passed: true },
            { step: "test", passed: true },
        ],
    }, taskId, "quality-gate");
    // ─── Phase 7: KNOWLEDGE — Record learning ───
    // Timestamp suffix keeps the key distinct across repeated runs of a task.
    const learningKey = `swebench-${taskId}-${Date.now()}`;
    cleanupIds.learningKeys.push(learningKey);
    await callTool("record_learning", {
        key: learningKey,
        category: "pattern",
        content: `${task.repo} ${task.category}: ${task.problem_statement.slice(0, 150)}`,
        tags: [task.category, task.complexity, task.repo.split("/")[0]],
    }, taskId, "knowledge");
    // Unified search
    const knowledge = await callTool("search_all_knowledge", { query: task.repo.split("/")[1] }, taskId, "knowledge");
    expect(knowledge).toHaveProperty("learnings");
    expect(knowledge).toHaveProperty("reconFindings");
    expect(knowledge).toHaveProperty("gaps");
    // ─── Phase 8: FLYWHEEL — Connect verification to eval ───
    const flywheel = await callTool("run_mandatory_flywheel", {
        target: `SWE-bench ${taskId}`,
        steps: [
            { stepName: "static_analysis", passed: true },
            { stepName: "happy_path_test", passed: true },
            { stepName: "failure_path_test", passed: true },
            { stepName: "gap_analysis", passed: true },
            { stepName: "fix_and_reverify", passed: true },
            { stepName: "deploy_and_document", passed: true },
        ],
    }, taskId, "flywheel");
    expect(flywheel.passed).toBe(true);
    return cleanupIds;
}
378
/**
 * Best-effort teardown of the resources created by one pipeline run:
 * abandons each verification cycle, then deletes each recorded learning.
 *
 * Failures of individual tool calls are ignored on purpose so that one
 * resource that is already gone does not stop the rest from being cleaned up.
 *
 * @param {{cycleIds: string[], learningKeys: string[]}} ids - Ids returned by
 *   `runFullPipeline`.
 * @returns {Promise<void>}
 */
async function cleanup(ids) {
    for (const cycleId of ids.cycleIds) {
        try {
            await findTool("abandon_cycle").handler({
                cycleId,
                reason: "dataset bench cleanup",
            });
        }
        catch {
            // already abandoned or completed
        }
    }
    for (const key of ids.learningKeys) {
        try {
            await findTool("delete_learning").handler({ key });
        }
        catch {
            // already deleted
        }
    }
}
402
- // ═══════════════════════════════════════════════════════════════════════════
403
- // TEST BATCHES — 4 parallel batches of 5 tasks each
404
- // ═══════════════════════════════════════════════════════════════════════════
405
const BATCH_SIZE = 5;
// Consecutive slices of SWE_BENCH_TASKS, BATCH_SIZE tasks each.
const batches = [];
for (let i = 0; i < SWE_BENCH_TASKS.length; i += BATCH_SIZE) {
    batches.push(SWE_BENCH_TASKS.slice(i, i + BATCH_SIZE));
}
// Suite title for each batch, in batch order. One suite is registered per
// title; all suites share the same body, so it is written once below.
const BATCH_SUITE_TITLES = [
    "SWE-bench Dataset Bench: Batch 1 (django + scikit-learn bugs)",
    "SWE-bench Dataset Bench: Batch 2 (features + refactors)",
    "SWE-bench Dataset Bench: Batch 3 (cross-repo medium complexity)",
    "SWE-bench Dataset Bench: Batch 4 (high complexity + API changes)",
];
for (const [batchIndex, suiteTitle] of BATCH_SUITE_TITLES.entries()) {
    describe(suiteTitle, () => {
        const batch = batches[batchIndex];
        // Cleanup ids of every task in this suite that ran to completion.
        const allCleanup = [];
        afterAll(async () => {
            for (const ids of allCleanup) {
                await cleanup(ids);
            }
        });
        for (const task of batch) {
            it(`Full pipeline: ${task.instance_id} (${task.category}/${task.complexity})`, async () => {
                const ids = await runFullPipeline(task);
                allCleanup.push(ids);
            }, 30_000);
        }
    });
}
466
- // ═══════════════════════════════════════════════════════════════════════════
467
- // CROSS-TASK INTEGRATION — Tests that span multiple tasks
468
- // ═══════════════════════════════════════════════════════════════════════════
469
// Builds two single-case eval runs (a baseline and a candidate), completes
// both, and checks that comparing them yields one of the known recommendations.
describe("Cross-Task: Eval Comparison (baseline vs candidate)", () => {
    it("compares eval runs from two different SWE-bench tasks", async () => {
        // Create baseline from a low-complexity task
        const baseline = await callTool("start_eval_run", {
            name: "swebench-baseline-cross",
            cases: [
                { input: "django memoryview fix", intent: "Simple bug fix" },
            ],
        }, "cross-task", "eval");
        await callTool("record_eval_result", {
            caseId: baseline.caseIds[0],
            actual: "Fixed memoryview handling",
            verdict: "pass",
            score: 0.95,
        }, "cross-task", "eval");
        await callTool("complete_eval_run", { runId: baseline.runId }, "cross-task", "eval");
        // Create candidate from a high-complexity task
        const candidate = await callTool("start_eval_run", {
            name: "swebench-candidate-cross",
            cases: [
                {
                    input: "astropy separability matrix fix",
                    intent: "Complex nested model fix",
                },
            ],
        }, "cross-task", "eval");
        await callTool("record_eval_result", {
            caseId: candidate.caseIds[0],
            actual: "Fixed separability computation",
            verdict: "pass",
            score: 0.85,
        }, "cross-task", "eval");
        await callTool("complete_eval_run", { runId: candidate.runId }, "cross-task", "eval");
        // Compare
        const comparison = await callTool("compare_eval_runs", {
            baselineRunId: baseline.runId,
            candidateRunId: candidate.runId,
        }, "cross-task", "eval");
        expect(comparison).toHaveProperty("recommendation");
        expect(["DEPLOY", "REVERT", "INVESTIGATE"]).toContain(comparison.recommendation);
    }, 30_000);
});
511
// Starts a verification cycle, promotes two cases from it into an eval run,
// and checks the ids/counts reported by promote_to_eval.
describe("Cross-Task: Promote Verification to Eval", () => {
    it("promotes a verification cycle's findings into eval cases", async () => {
        // Create a verification cycle
        const cycle = await callTool("start_verification_cycle", {
            title: "swebench-promote-test",
            description: "Testing promote_to_eval with SWE-bench data",
        }, "cross-task-promote", "flywheel");
        try {
            // Promote to eval
            const promoted = await callTool("promote_to_eval", {
                cycleId: cycle.cycleId,
                evalRunName: "swebench-promoted-eval",
                cases: [
                    {
                        input: "django__django-11133: memoryview bug",
                        intent: "Verify HttpResponse handles memoryview",
                    },
                    {
                        input: "sympy__sympy-13372: evalf crash",
                        intent: "Verify Max().evalf() doesn't crash",
                    },
                ],
            }, "cross-task-promote", "flywheel");
            expect(promoted.evalRunId).toBeTruthy();
            expect(promoted.caseCount).toBe(2);
        }
        finally {
            // Cleanup runs in `finally` so a rejected promote_to_eval call or a
            // failed assertion no longer skips the abandon_cycle step.
            // The handler is invoked directly (not through callTool), as before.
            await findTool("abandon_cycle").handler({
                cycleId: cycle.cycleId,
                reason: "bench cleanup",
            });
        }
    }, 30_000);
});
542
// Records a failing single-case eval run, asks trigger_investigation to open
// an investigation for it, and checks the fields of the returned object.
describe("Cross-Task: Trigger Investigation from Regression", () => {
    it("triggers investigation when eval run shows failures", async () => {
        // Create a failing eval run
        const evalRun = await callTool("start_eval_run", {
            name: "swebench-regression-detect",
            cases: [
                {
                    input: "HistGradientBoosting string target",
                    intent: "Verify early stopping with strings",
                },
            ],
        }, "cross-task-investigate", "flywheel");
        await callTool("record_eval_result", {
            caseId: evalRun.caseIds[0],
            actual: "TypeError: comparison mismatch",
            verdict: "fail",
            score: 0.0,
        }, "cross-task-investigate", "flywheel");
        await callTool("complete_eval_run", { runId: evalRun.runId }, "cross-task-investigate", "flywheel");
        // Trigger investigation
        const investigation = await callTool("trigger_investigation", {
            evalRunId: evalRun.runId,
            regressionDescription: "HistGradientBoosting fails with string targets when early stopping enabled",
        }, "cross-task-investigate", "flywheel");
        try {
            expect(investigation.cycleId).toBeTruthy();
            expect(investigation.title).toBeTruthy();
            expect(investigation.linkedEvalRun).toBeTruthy();
        }
        finally {
            // Cleanup runs in `finally` so a failed assertion no longer skips it.
            // Without a cycleId there is nothing to abandon, and calling the
            // handler anyway could mask the assertion error with a cleanup error.
            if (investigation.cycleId) {
                await findTool("abandon_cycle").handler({
                    cycleId: investigation.cycleId,
                    reason: "bench cleanup",
                });
            }
        }
    }, 30_000);
});
576
- // ═══════════════════════════════════════════════════════════════════════════
577
- // DATASET BENCH REPORT
578
- // ═══════════════════════════════════════════════════════════════════════════
579
// Aggregates pipelineLog into per-phase / per-task / per-category / per-tool
// statistics, prints them as a boxed console report, and asserts on the totals.
describe("Dataset Bench Report", () => {
    // Borders shared by every boxed section of the report.
    const BOX_TOP = "┌─────────────────────────────────────────────────────────────────────────────┐";
    const BOX_SEP = "├─────────────────────────────────────────────────────────────────────────────┤";
    const BOX_BOTTOM = "└─────────────────────────────────────────────────────────────────────────────┘";
    // Column at which the closing border of a row is printed.
    const ROW_WIDTH = 78;
    /** Prints one box row, padding the content so the closing border is added exactly once. */
    const printRow = (content) => {
        console.log(`${content.padEnd(ROW_WIDTH)}│`);
    };
    /**
     * Prints a titled box: top border, title row, separator, the rows emitted
     * by `printBody`, bottom border, and a trailing blank line.
     */
    const printSection = (title, printBody) => {
        console.log(BOX_TOP);
        printRow(`│ ${title}`);
        console.log(BOX_SEP);
        printBody();
        console.log(BOX_BOTTOM);
        console.log("");
    };
    /** Rounded percentage of `part` in `whole`; 0 for an empty `whole` instead of NaN. */
    const percent = (part, whole) => (whole === 0 ? 0 : Math.round((part / whole) * 100));
    it("generates comprehensive SWE-bench evaluation report", () => {
        const totalCalls = pipelineLog.length;
        let successCalls = 0;
        const toolCounts = new Map(); // tool -> number of calls
        const byPhase = new Map(); // phase -> { calls, success, fail, tools, totalMs }
        const byTask = new Map(); // taskId -> { calls, success, fail, phases }
        // Single pass over the log; every Map keeps first-seen insertion order,
        // which is also the order rows are printed in.
        for (const entry of pipelineLog) {
            if (entry.success) {
                successCalls++;
            }
            toolCounts.set(entry.tool, (toolCounts.get(entry.tool) || 0) + 1);
            if (!byPhase.has(entry.phase)) {
                byPhase.set(entry.phase, {
                    calls: 0,
                    success: 0,
                    fail: 0,
                    tools: new Set(),
                    totalMs: 0,
                });
            }
            const phaseStats = byPhase.get(entry.phase);
            phaseStats.calls++;
            if (entry.success) {
                phaseStats.success++;
            }
            else {
                phaseStats.fail++;
            }
            phaseStats.tools.add(entry.tool);
            phaseStats.totalMs += entry.durationMs;
            if (!byTask.has(entry.taskId)) {
                byTask.set(entry.taskId, {
                    calls: 0,
                    success: 0,
                    fail: 0,
                    phases: new Set(),
                });
            }
            const taskStats = byTask.get(entry.taskId);
            taskStats.calls++;
            if (entry.success) {
                taskStats.success++;
            }
            else {
                taskStats.fail++;
            }
            taskStats.phases.add(entry.phase);
        }
        const failCalls = totalCalls - successCalls;
        const uniqueToolCount = toolCounts.size;
        // By category: reuse the per-task tallies instead of re-filtering the
        // whole log once per task. Tasks with no log entries are skipped.
        const byCategory = new Map();
        for (const task of SWE_BENCH_TASKS) {
            const taskStats = byTask.get(task.instance_id);
            if (!taskStats) {
                continue;
            }
            if (!byCategory.has(task.category)) {
                byCategory.set(task.category, { count: 0, success: 0, fail: 0 });
            }
            const categoryStats = byCategory.get(task.category);
            categoryStats.count++;
            categoryStats.success += taskStats.success;
            categoryStats.fail += taskStats.fail;
        }
        // Print report
        console.log("\n");
        console.log("╔═══════════════════════════════════════════════════════════════════════════╗");
        console.log("║ SWE-BENCH DATASET BENCH — PROOF OF WORK REPORT ║");
        console.log("╚═══════════════════════════════════════════════════════════════════════════╝");
        console.log("");
        console.log(" Dataset: SWE-bench Verified (princeton-nlp/SWE-bench_Verified)");
        console.log(` Tasks: ${SWE_BENCH_TASKS.length} real GitHub issues from ${new Set(SWE_BENCH_TASKS.map((t) => t.repo)).size} repositories`);
        console.log("");
        printSection("PIPELINE SUMMARY", () => {
            printRow(`│ Total Tool Calls: ${String(totalCalls).padStart(4)}`);
            printRow(`│ Unique Tools Exercised: ${String(uniqueToolCount).padStart(4)}`);
            printRow(`│ Success Rate: ${successCalls}/${totalCalls} (${percent(successCalls, totalCalls)}%)`);
            printRow(`│ Tasks Completed: ${String(byTask.size).padStart(4)}`);
            printRow(`│ Pipeline Phases: ${String(byPhase.size).padStart(4)}`);
        });
        printSection("PHASE BREAKDOWN", () => {
            byPhase.forEach((data, phase) => {
                // A phase only exists here if it has at least one call, so calls > 0.
                const avgMs = Math.round(data.totalMs / data.calls);
                const status = data.fail === 0 ? "OK" : "FAIL";
                printRow(`│ ${status.padEnd(4)} ${phase.padEnd(16)} ${String(data.calls).padStart(3)} calls ${String(data.tools.size).padStart(2)} tools avg ${String(avgMs).padStart(4)}ms`);
            });
        });
        printSection("TASK CATEGORY BREAKDOWN", () => {
            byCategory.forEach((data, category) => {
                const pct = percent(data.success, data.success + data.fail);
                printRow(`│ ${category.padEnd(16)} ${String(data.count).padStart(2)} tasks ${String(data.success).padStart(3)} ok ${String(data.fail).padStart(2)} fail (${String(pct).padStart(3)}%)`);
            });
        });
        printSection("PER-TASK RESULTS", () => {
            byTask.forEach((data, taskId) => {
                const status = data.fail === 0 ? "PASS" : "FAIL";
                // Long instance ids are truncated so the row stays inside the box.
                const label = taskId.slice(0, 35);
                printRow(`│ ${status.padEnd(4)} ${label.padEnd(37)} ${String(data.calls).padStart(2)} calls ${String(data.phases.size).padStart(1)} phases`);
            });
        });
        printSection("TOOLS EXERCISED IN DATASET BENCH", () => {
            // Most-used tools first; ties keep first-seen order (stable sort).
            const sortedTools = [...toolCounts.entries()].sort((a, b) => b[1] - a[1]);
            for (const [tool, count] of sortedTools) {
                printRow(`│ ${tool.padEnd(30)} ${String(count).padStart(4)}x`);
            }
        });
        printSection("VERDICT", () => {
            if (failCalls === 0) {
                printRow(`│ PASS: All ${SWE_BENCH_TASKS.length} SWE-bench tasks completed full pipeline successfully`);
                printRow(`│ ${totalCalls} tool calls across ${byPhase.size} phases, 0 failures`);
                printRow("│ MCP tools can orchestrate real-world development workflows end-to-end");
            }
            else {
                printRow(`│ PARTIAL: ${failCalls} tool calls failed across ${byTask.size} tasks`);
            }
        });
        // Assertions
        expect(failCalls).toBe(0);
        expect(uniqueToolCount).toBeGreaterThanOrEqual(15);
        expect(byTask.size).toBeGreaterThanOrEqual(20); // All 20 tasks + cross-task tests
    });
});
738
- //# sourceMappingURL=evalDatasetBench.test.js.map