nodebench-mcp 2.11.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +809 -809
- package/README.md +443 -431
- package/STYLE_GUIDE.md +477 -477
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +153 -5
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
- package/dist/__tests__/helpers/textLlm.d.ts +1 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +1 -0
- package/dist/__tests__/presetRealWorldBench.test.js +839 -0
- package/dist/__tests__/presetRealWorldBench.test.js.map +1 -0
- package/dist/__tests__/tools.test.js +8 -5
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/__tests__/toolsetGatingEval.test.js +11 -11
- package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
- package/dist/index.js +397 -327
- package/dist/index.js.map +1 -1
- package/dist/tools/agentBootstrapTools.js +258 -258
- package/dist/tools/boilerplateTools.js +144 -144
- package/dist/tools/cCompilerBenchmarkTools.js +33 -33
- package/dist/tools/documentationTools.js +59 -59
- package/dist/tools/flywheelTools.js +6 -6
- package/dist/tools/learningTools.js +26 -26
- package/dist/tools/localFileTools.d.ts +3 -0
- package/dist/tools/localFileTools.js +3164 -125
- package/dist/tools/localFileTools.js.map +1 -1
- package/dist/tools/reconTools.js +31 -31
- package/dist/tools/selfEvalTools.js +44 -44
- package/dist/tools/sessionMemoryTools.d.ts +15 -0
- package/dist/tools/sessionMemoryTools.js +348 -0
- package/dist/tools/sessionMemoryTools.js.map +1 -0
- package/dist/tools/toolRegistry.d.ts +4 -0
- package/dist/tools/toolRegistry.js +229 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/verificationTools.js +41 -41
- package/dist/tools/visionTools.js +17 -17
- package/dist/tools/webTools.js +18 -18
- package/package.json +101 -101
|
@@ -12,52 +12,52 @@ const PHASE_NAMES = [
|
|
|
12
12
|
"document",
|
|
13
13
|
];
|
|
14
14
|
const PHASE_INSTRUCTIONS = {
|
|
15
|
-
context_gathering: `Phase 1: Context Gathering (Parallel Research)
|
|
16
|
-
Launch parallel research into:
|
|
17
|
-
- SDK/Protocol specs: Latest versions, blogs, announcements, GitHub repos, official SDKs
|
|
18
|
-
- Implementation audit: Current codebase patterns, inconsistencies, unused code
|
|
19
|
-
- Dispatcher/backend audit: Function signatures, allowlists, argument shapes
|
|
20
|
-
- External API research: Check if third-party APIs still work, find known breaking changes
|
|
21
|
-
|
|
22
|
-
Goal: Build a comprehensive picture of "what production looks like" vs "what we have."
|
|
15
|
+
context_gathering: `Phase 1: Context Gathering (Parallel Research)
|
|
16
|
+
Launch parallel research into:
|
|
17
|
+
- SDK/Protocol specs: Latest versions, blogs, announcements, GitHub repos, official SDKs
|
|
18
|
+
- Implementation audit: Current codebase patterns, inconsistencies, unused code
|
|
19
|
+
- Dispatcher/backend audit: Function signatures, allowlists, argument shapes
|
|
20
|
+
- External API research: Check if third-party APIs still work, find known breaking changes
|
|
21
|
+
|
|
22
|
+
Goal: Build a comprehensive picture of "what production looks like" vs "what we have."
|
|
23
23
|
TIP: Call search_learnings first to check for known issues related to this work.`,
|
|
24
|
-
gap_analysis: `Phase 2: Gap Analysis
|
|
25
|
-
Compare Phase 1 findings against current implementation. For each gap found, call log_gap with:
|
|
26
|
-
- severity: CRITICAL (protocol violations, security), HIGH (API incompatibilities, silent failures), MEDIUM (outdated versions, missing features), LOW (edge case handling)
|
|
27
|
-
- root_cause: Why the gap exists
|
|
28
|
-
- fix_strategy: How to fix it
|
|
29
|
-
|
|
24
|
+
gap_analysis: `Phase 2: Gap Analysis
|
|
25
|
+
Compare Phase 1 findings against current implementation. For each gap found, call log_gap with:
|
|
26
|
+
- severity: CRITICAL (protocol violations, security), HIGH (API incompatibilities, silent failures), MEDIUM (outdated versions, missing features), LOW (edge case handling)
|
|
27
|
+
- root_cause: Why the gap exists
|
|
28
|
+
- fix_strategy: How to fix it
|
|
29
|
+
|
|
30
30
|
Output: A numbered gap list. Fix CRITICAL and HIGH first.`,
|
|
31
|
-
implementation: `Phase 3: Implementation
|
|
32
|
-
Apply fixes following production patterns exactly. Rules:
|
|
33
|
-
- Fix CRITICAL and HIGH gaps first
|
|
34
|
-
- Each fix is a discrete, testable change
|
|
35
|
-
- Follow the reference pattern found in Phase 1 — don't invent new patterns
|
|
36
|
-
- Document why each change was made (comments in code where non-obvious)
|
|
31
|
+
implementation: `Phase 3: Implementation
|
|
32
|
+
Apply fixes following production patterns exactly. Rules:
|
|
33
|
+
- Fix CRITICAL and HIGH gaps first
|
|
34
|
+
- Each fix is a discrete, testable change
|
|
35
|
+
- Follow the reference pattern found in Phase 1 — don't invent new patterns
|
|
36
|
+
- Document why each change was made (comments in code where non-obvious)
|
|
37
37
|
- Call resolve_gap as you fix each gap`,
|
|
38
|
-
testing: `Phase 4: Testing & Validation (Multi-Layer — CRITICAL)
|
|
39
|
-
Run tests at all 5 layers. Call log_test_result for each:
|
|
40
|
-
- Layer 1: static — TypeScript tsc --noEmit, type checking
|
|
41
|
-
- Layer 2: unit — Run existing test suites, add targeted tests for fixes
|
|
42
|
-
- Layer 3: integration — End-to-end flow through handler chain
|
|
43
|
-
- Layer 4: manual — Spot-check critical paths with curl or direct invocation
|
|
44
|
-
- Layer 5: live_e2e — Deploy to staging, hit real endpoints, verify real responses
|
|
45
|
-
|
|
38
|
+
testing: `Phase 4: Testing & Validation (Multi-Layer — CRITICAL)
|
|
39
|
+
Run tests at all 5 layers. Call log_test_result for each:
|
|
40
|
+
- Layer 1: static — TypeScript tsc --noEmit, type checking
|
|
41
|
+
- Layer 2: unit — Run existing test suites, add targeted tests for fixes
|
|
42
|
+
- Layer 3: integration — End-to-end flow through handler chain
|
|
43
|
+
- Layer 4: manual — Spot-check critical paths with curl or direct invocation
|
|
44
|
+
- Layer 5: live_e2e — Deploy to staging, hit real endpoints, verify real responses
|
|
45
|
+
|
|
46
46
|
ALL layers must pass before proceeding to Phase 5.`,
|
|
47
|
-
self_verify: `Phase 5: Self-Closed-Loop Verification (Parallel Checks)
|
|
48
|
-
Launch parallel verification checks, each targeting a different dimension:
|
|
49
|
-
- Spec compliance: Does every response match the protocol spec exactly?
|
|
50
|
-
- Functional correctness: Do tools return correct data for known inputs?
|
|
51
|
-
- Argument compatibility: Do all handler-backend function pairs have matching shapes?
|
|
52
|
-
|
|
47
|
+
self_verify: `Phase 5: Self-Closed-Loop Verification (Parallel Checks)
|
|
48
|
+
Launch parallel verification checks, each targeting a different dimension:
|
|
49
|
+
- Spec compliance: Does every response match the protocol spec exactly?
|
|
50
|
+
- Functional correctness: Do tools return correct data for known inputs?
|
|
51
|
+
- Argument compatibility: Do all handler-backend function pairs have matching shapes?
|
|
52
|
+
|
|
53
53
|
Each check produces PASS/FAIL. Any FAIL loops back to Phase 3 (Implementation).`,
|
|
54
|
-
document: `Phase 6: Document Learnings
|
|
55
|
-
Record what you discovered. For each edge case, gotcha, or pattern, call record_learning with:
|
|
56
|
-
- key: Short identifier (e.g. 'convex-use-node-export-restriction')
|
|
57
|
-
- content: What happened, why, and how to avoid it
|
|
58
|
-
- category: edge_case | gotcha | pattern | regression | convention
|
|
59
|
-
- sourceCycle: This verification cycle's ID
|
|
60
|
-
|
|
54
|
+
document: `Phase 6: Document Learnings
|
|
55
|
+
Record what you discovered. For each edge case, gotcha, or pattern, call record_learning with:
|
|
56
|
+
- key: Short identifier (e.g. 'convex-use-node-export-restriction')
|
|
57
|
+
- content: What happened, why, and how to avoid it
|
|
58
|
+
- category: edge_case | gotcha | pattern | regression | convention
|
|
59
|
+
- sourceCycle: This verification cycle's ID
|
|
60
|
+
|
|
61
61
|
This prevents future regressions and expands the knowledge base.`,
|
|
62
62
|
};
|
|
63
63
|
export const verificationTools = [
|
package/dist/tools/visionTools.js
CHANGED
|
@@ -46,23 +46,23 @@ function escapeXml(s) {
|
|
|
46
46
|
.replace(/'/g, "&apos;");
|
|
47
47
|
}
|
|
48
48
|
// ─── Default analysis prompt ──────────────────────────────────────────────────
|
|
49
|
-
const DEFAULT_ANALYSIS_PROMPT = `Analyze this UI screenshot for quality issues. Evaluate:
|
|
50
|
-
|
|
51
|
-
1. LAYOUT: Is the layout balanced? Any overlapping elements, broken grids, or misaligned components?
|
|
52
|
-
2. SPACING: Is whitespace consistent? Any cramped or overly sparse areas?
|
|
53
|
-
3. TYPOGRAPHY: Are font sizes readable? Is there clear visual hierarchy (headings, body, captions)?
|
|
54
|
-
4. COLOR & CONTRAST: Are text/background combinations readable? Does it follow WCAG 2.1 AA contrast ratios?
|
|
55
|
-
5. RESPONSIVENESS: Does the layout look appropriate for its viewport width?
|
|
56
|
-
6. COMPONENT STATES: Are there visible loading spinners, error states, or empty states that look broken?
|
|
57
|
-
7. VISUAL CONSISTENCY: Do colors, borders, shadows, and rounding match a consistent design system?
|
|
58
|
-
8. ACCESSIBILITY: Are interactive elements visually distinct? Are focus indicators visible?
|
|
59
|
-
|
|
60
|
-
For each issue found, describe:
|
|
61
|
-
- What the issue is
|
|
62
|
-
- Where it is (describe the location in the screenshot)
|
|
63
|
-
- Severity: CRITICAL (broken/unusable), HIGH (visually wrong), MEDIUM (suboptimal), LOW (nitpick)
|
|
64
|
-
- Suggested fix
|
|
65
|
-
|
|
49
|
+
const DEFAULT_ANALYSIS_PROMPT = `Analyze this UI screenshot for quality issues. Evaluate:
|
|
50
|
+
|
|
51
|
+
1. LAYOUT: Is the layout balanced? Any overlapping elements, broken grids, or misaligned components?
|
|
52
|
+
2. SPACING: Is whitespace consistent? Any cramped or overly sparse areas?
|
|
53
|
+
3. TYPOGRAPHY: Are font sizes readable? Is there clear visual hierarchy (headings, body, captions)?
|
|
54
|
+
4. COLOR & CONTRAST: Are text/background combinations readable? Does it follow WCAG 2.1 AA contrast ratios?
|
|
55
|
+
5. RESPONSIVENESS: Does the layout look appropriate for its viewport width?
|
|
56
|
+
6. COMPONENT STATES: Are there visible loading spinners, error states, or empty states that look broken?
|
|
57
|
+
7. VISUAL CONSISTENCY: Do colors, borders, shadows, and rounding match a consistent design system?
|
|
58
|
+
8. ACCESSIBILITY: Are interactive elements visually distinct? Are focus indicators visible?
|
|
59
|
+
|
|
60
|
+
For each issue found, describe:
|
|
61
|
+
- What the issue is
|
|
62
|
+
- Where it is (describe the location in the screenshot)
|
|
63
|
+
- Severity: CRITICAL (broken/unusable), HIGH (visually wrong), MEDIUM (suboptimal), LOW (nitpick)
|
|
64
|
+
- Suggested fix
|
|
65
|
+
|
|
66
66
|
End with a summary: total issues by severity, overall quality score (1-10), and top 3 action items.`;
|
|
67
67
|
// ─── Provider implementations ─────────────────────────────────────────────────
|
|
68
68
|
async function analyzeWithGemini(imageBase64, prompt) {
|
package/dist/tools/webTools.js
CHANGED
|
@@ -117,26 +117,26 @@ async function searchWithGemini(query, maxResults) {
|
|
|
117
117
|
return null;
|
|
118
118
|
};
|
|
119
119
|
const attemptPrompts = [
|
|
120
|
-
`Search the web for: "${query}"
|
|
121
|
-
|
|
122
|
-
Return the top ${maxResults} most relevant results. For each result, provide:
|
|
123
|
-
1. Title
|
|
124
|
-
2. URL
|
|
125
|
-
3. A 1-2 sentence snippet summarizing the content
|
|
126
|
-
|
|
127
|
-
Format your response as JSON array:
|
|
128
|
-
[{"title": "...", "url": "...", "snippet": "..."}]
|
|
129
|
-
|
|
120
|
+
`Search the web for: "${query}"
|
|
121
|
+
|
|
122
|
+
Return the top ${maxResults} most relevant results. For each result, provide:
|
|
123
|
+
1. Title
|
|
124
|
+
2. URL
|
|
125
|
+
3. A 1-2 sentence snippet summarizing the content
|
|
126
|
+
|
|
127
|
+
Format your response as JSON array:
|
|
128
|
+
[{"title": "...", "url": "...", "snippet": "..."}]
|
|
129
|
+
|
|
130
130
|
Only return the JSON array, no other text.`,
|
|
131
131
|
// Retry prompt: explicitly require absolute URLs.
|
|
132
|
-
`Use Google Search to find sources for: "${query}"
|
|
133
|
-
|
|
134
|
-
Return a JSON array with up to ${maxResults} entries in this exact shape:
|
|
135
|
-
[{"title":"...","url":"https://...","snippet":"..."}]
|
|
136
|
-
|
|
137
|
-
Requirements:
|
|
138
|
-
- url MUST be an absolute URL starting with https://
|
|
139
|
-
- Do NOT return markdown, do NOT wrap in code fences.
|
|
132
|
+
`Use Google Search to find sources for: "${query}"
|
|
133
|
+
|
|
134
|
+
Return a JSON array with up to ${maxResults} entries in this exact shape:
|
|
135
|
+
[{"title":"...","url":"https://...","snippet":"..."}]
|
|
136
|
+
|
|
137
|
+
Requirements:
|
|
138
|
+
- url MUST be an absolute URL starting with https://
|
|
139
|
+
- Do NOT return markdown, do NOT wrap in code fences.
|
|
140
140
|
- If a source is Wikipedia, include the en.wikipedia.org URL directly.`,
|
|
141
141
|
];
|
|
142
142
|
for (const prompt of attemptPrompts) {
|
package/package.json
CHANGED
|
@@ -1,101 +1,101 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "nodebench-mcp",
|
|
3
|
-
"version": "2.11.0",
|
|
4
|
-
"description": "Make AI agents catch the bugs they normally ship.
|
|
5
|
-
"type": "module",
|
|
6
|
-
"bin": {
|
|
7
|
-
"nodebench-mcp": "./dist/index.js"
|
|
8
|
-
},
|
|
9
|
-
"main": "./dist/index.js",
|
|
10
|
-
"files": [
|
|
11
|
-
"dist",
|
|
12
|
-
"README.md",
|
|
13
|
-
"NODEBENCH_AGENTS.md",
|
|
14
|
-
"STYLE_GUIDE.md"
|
|
15
|
-
],
|
|
16
|
-
"scripts": {
|
|
17
|
-
"build": "tsc",
|
|
18
|
-
"dev": "tsx src/index.ts",
|
|
19
|
-
"test": "vitest run",
|
|
20
|
-
"test:open-dataset": "vitest run src/__tests__/openDatasetParallelEval.test.ts",
|
|
21
|
-
"test:open-dataset:toolbench": "vitest run src/__tests__/openDatasetParallelEvalToolbench.test.ts",
|
|
22
|
-
"test:open-dataset:swebench": "vitest run src/__tests__/openDatasetParallelEvalSwebench.test.ts",
|
|
23
|
-
"test:open-dataset:gaia": "vitest run src/__tests__/openDatasetParallelEvalGaia.test.ts",
|
|
24
|
-
"test:open-dataset:all": "vitest run src/__tests__/openDatasetParallelEval.test.ts src/__tests__/openDatasetParallelEvalToolbench.test.ts src/__tests__/openDatasetParallelEvalSwebench.test.ts",
|
|
25
|
-
"test:open-dataset:full": "vitest run src/__tests__/openDatasetParallelEval.test.ts src/__tests__/openDatasetParallelEvalToolbench.test.ts src/__tests__/openDatasetParallelEvalSwebench.test.ts src/__tests__/openDatasetParallelEvalGaia.test.ts",
|
|
26
|
-
"bench:perf:compare": "cross-env NODEBENCH_RUN_PERF_COMPARE=1 vitest run src/__tests__/openDatasetPerfComparison.test.ts --reporter=verbose",
|
|
27
|
-
"test:watch": "vitest",
|
|
28
|
-
"dataset:bfcl:refresh": "tsx src/__tests__/fixtures/generateBfclLongContextFixture.ts",
|
|
29
|
-
"dataset:toolbench:refresh": "tsx src/__tests__/fixtures/generateToolbenchInstructionFixture.ts",
|
|
30
|
-
"dataset:swebench:refresh": "tsx src/__tests__/fixtures/generateSwebenchVerifiedFixture.ts",
|
|
31
|
-
"dataset:gaia:refresh": "python src/__tests__/fixtures/generateGaiaLevel3Fixture.py",
|
|
32
|
-
"dataset:gaia:capability:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityFixture.py",
|
|
33
|
-
"dataset:gaia:capability:files:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityFilesFixture.py",
|
|
34
|
-
"dataset:gaia:capability:media:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityMediaFixture.py",
|
|
35
|
-
"dataset:gaia:capability:audio:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityAudioFixture.py",
|
|
36
|
-
"verify": "node test-setup.mjs",
|
|
37
|
-
"test:gaia:capability": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityEval.test.ts --reporter=verbose",
|
|
38
|
-
"test:gaia:capability:files": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityFilesEval.test.ts --reporter=verbose",
|
|
39
|
-
"test:gaia:capability:media": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityMediaEval.test.ts --reporter=verbose",
|
|
40
|
-
"test:gaia:capability:audio": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityAudioEval.test.ts --reporter=verbose",
|
|
41
|
-
"prepublishOnly": "npm run build && npm run test"
|
|
42
|
-
},
|
|
43
|
-
"keywords": [
|
|
44
|
-
"mcp",
|
|
45
|
-
"model-context-protocol",
|
|
46
|
-
"claude",
|
|
47
|
-
"ai-agents",
|
|
48
|
-
"web-search",
|
|
49
|
-
"github",
|
|
50
|
-
"vision",
|
|
51
|
-
"verification",
|
|
52
|
-
"sqlite",
|
|
53
|
-
"quality-gates",
|
|
54
|
-
"parallel-agents",
|
|
55
|
-
"toolset-gating",
|
|
56
|
-
"eval",
|
|
57
|
-
"qa-automation",
|
|
58
|
-
"agentic",
|
|
59
|
-
"academic-writing",
|
|
60
|
-
"research-paper"
|
|
61
|
-
],
|
|
62
|
-
"repository": {
|
|
63
|
-
"type": "git",
|
|
64
|
-
"url": "https://github.com/HomenShum/nodebench-ai.git",
|
|
65
|
-
"directory": "packages/mcp-local"
|
|
66
|
-
},
|
|
67
|
-
"homepage": "https://github.com/HomenShum/nodebench-ai/tree/main/packages/mcp-local#readme",
|
|
68
|
-
"bugs": {
|
|
69
|
-
"url": "https://github.com/HomenShum/nodebench-ai/issues"
|
|
70
|
-
},
|
|
71
|
-
"license": "MIT",
|
|
72
|
-
"author": "HomenShum",
|
|
73
|
-
"dependencies": {
|
|
74
|
-
"@modelcontextprotocol/sdk": "^1.0.4",
|
|
75
|
-
"better-sqlite3": "^11.0.0",
|
|
76
|
-
"nodebench-ai": "file:../.."
|
|
77
|
-
},
|
|
78
|
-
"optionalDependencies": {
|
|
79
|
-
"@anthropic-ai/sdk": "^0.71.2",
|
|
80
|
-
"@google/genai": "^1.10.0",
|
|
81
|
-
"cheerio": "^1.0.0",
|
|
82
|
-
"openai": "^5.8.2",
|
|
83
|
-
"papaparse": "^5.5.3",
|
|
84
|
-
"pdf-parse": "^2.4.5",
|
|
85
|
-
"playwright": "^1.57.0",
|
|
86
|
-
"sharp": "^0.34.5",
|
|
87
|
-
"tesseract.js": "^7.0.0",
|
|
88
|
-
"xlsx": "^0.18.5",
|
|
89
|
-
"yauzl": "^2.10.0"
|
|
90
|
-
},
|
|
91
|
-
"devDependencies": {
|
|
92
|
-
"@types/better-sqlite3": "^7.6.0",
|
|
93
|
-
"@types/node": "^20.11.0",
|
|
94
|
-
"tsx": "^4.7.0",
|
|
95
|
-
"typescript": "^5.3.3",
|
|
96
|
-
"vitest": "^3.2.4"
|
|
97
|
-
},
|
|
98
|
-
"engines": {
|
|
99
|
-
"node": ">=18.0.0"
|
|
100
|
-
}
|
|
101
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"name": "nodebench-mcp",
|
|
3
|
+
"version": "2.13.0",
|
|
4
|
+
"description": "Make AI agents catch the bugs they normally ship. 143 MCP tools across 25 domains: progressive discovery with 7-mode hybrid search, model-tier complexity routing (getToolComplexity), agent contract (front-door + anti-rationalization + 3-strike error), compaction-resilient session memory (filesystem notes + attention refresh), lightweight hooks (auto-save + refresh reminders), 6 GAIA media image solvers (separate gaia_solvers domain), project boilerplate scaffolding, autonomous capability benchmarks (C-compiler pattern), structured research, 3-layer testing, quality gates, persistent knowledge, LLM calling, security analysis, platform bridge, model benchmarking, visual regression, report generation, academic paper writing, deterministic local file parsing (19 tools), Android flicker detection, Figma flow analysis, and contract compliance scoring. --preset meta (5), lite (43), core (93), or full (143). Benchmarked: 13 issues caught, 26 blind spots prevented.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"nodebench-mcp": "./dist/index.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "./dist/index.js",
|
|
10
|
+
"files": [
|
|
11
|
+
"dist",
|
|
12
|
+
"README.md",
|
|
13
|
+
"NODEBENCH_AGENTS.md",
|
|
14
|
+
"STYLE_GUIDE.md"
|
|
15
|
+
],
|
|
16
|
+
"scripts": {
|
|
17
|
+
"build": "tsc",
|
|
18
|
+
"dev": "tsx src/index.ts",
|
|
19
|
+
"test": "vitest run",
|
|
20
|
+
"test:open-dataset": "vitest run src/__tests__/openDatasetParallelEval.test.ts",
|
|
21
|
+
"test:open-dataset:toolbench": "vitest run src/__tests__/openDatasetParallelEvalToolbench.test.ts",
|
|
22
|
+
"test:open-dataset:swebench": "vitest run src/__tests__/openDatasetParallelEvalSwebench.test.ts",
|
|
23
|
+
"test:open-dataset:gaia": "vitest run src/__tests__/openDatasetParallelEvalGaia.test.ts",
|
|
24
|
+
"test:open-dataset:all": "vitest run src/__tests__/openDatasetParallelEval.test.ts src/__tests__/openDatasetParallelEvalToolbench.test.ts src/__tests__/openDatasetParallelEvalSwebench.test.ts",
|
|
25
|
+
"test:open-dataset:full": "vitest run src/__tests__/openDatasetParallelEval.test.ts src/__tests__/openDatasetParallelEvalToolbench.test.ts src/__tests__/openDatasetParallelEvalSwebench.test.ts src/__tests__/openDatasetParallelEvalGaia.test.ts",
|
|
26
|
+
"bench:perf:compare": "cross-env NODEBENCH_RUN_PERF_COMPARE=1 vitest run src/__tests__/openDatasetPerfComparison.test.ts --reporter=verbose",
|
|
27
|
+
"test:watch": "vitest",
|
|
28
|
+
"dataset:bfcl:refresh": "tsx src/__tests__/fixtures/generateBfclLongContextFixture.ts",
|
|
29
|
+
"dataset:toolbench:refresh": "tsx src/__tests__/fixtures/generateToolbenchInstructionFixture.ts",
|
|
30
|
+
"dataset:swebench:refresh": "tsx src/__tests__/fixtures/generateSwebenchVerifiedFixture.ts",
|
|
31
|
+
"dataset:gaia:refresh": "python src/__tests__/fixtures/generateGaiaLevel3Fixture.py",
|
|
32
|
+
"dataset:gaia:capability:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityFixture.py",
|
|
33
|
+
"dataset:gaia:capability:files:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityFilesFixture.py",
|
|
34
|
+
"dataset:gaia:capability:media:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityMediaFixture.py",
|
|
35
|
+
"dataset:gaia:capability:audio:refresh": "python src/__tests__/fixtures/generateGaiaCapabilityAudioFixture.py",
|
|
36
|
+
"verify": "node test-setup.mjs",
|
|
37
|
+
"test:gaia:capability": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityEval.test.ts --reporter=verbose",
|
|
38
|
+
"test:gaia:capability:files": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityFilesEval.test.ts --reporter=verbose",
|
|
39
|
+
"test:gaia:capability:media": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityMediaEval.test.ts --reporter=verbose",
|
|
40
|
+
"test:gaia:capability:audio": "cross-env NODEBENCH_RUN_GAIA_CAPABILITY=1 vitest run src/__tests__/gaiaCapabilityAudioEval.test.ts --reporter=verbose",
|
|
41
|
+
"prepublishOnly": "npm run build && npm run test"
|
|
42
|
+
},
|
|
43
|
+
"keywords": [
|
|
44
|
+
"mcp",
|
|
45
|
+
"model-context-protocol",
|
|
46
|
+
"claude",
|
|
47
|
+
"ai-agents",
|
|
48
|
+
"web-search",
|
|
49
|
+
"github",
|
|
50
|
+
"vision",
|
|
51
|
+
"verification",
|
|
52
|
+
"sqlite",
|
|
53
|
+
"quality-gates",
|
|
54
|
+
"parallel-agents",
|
|
55
|
+
"toolset-gating",
|
|
56
|
+
"eval",
|
|
57
|
+
"qa-automation",
|
|
58
|
+
"agentic",
|
|
59
|
+
"academic-writing",
|
|
60
|
+
"research-paper"
|
|
61
|
+
],
|
|
62
|
+
"repository": {
|
|
63
|
+
"type": "git",
|
|
64
|
+
"url": "https://github.com/HomenShum/nodebench-ai.git",
|
|
65
|
+
"directory": "packages/mcp-local"
|
|
66
|
+
},
|
|
67
|
+
"homepage": "https://github.com/HomenShum/nodebench-ai/tree/main/packages/mcp-local#readme",
|
|
68
|
+
"bugs": {
|
|
69
|
+
"url": "https://github.com/HomenShum/nodebench-ai/issues"
|
|
70
|
+
},
|
|
71
|
+
"license": "MIT",
|
|
72
|
+
"author": "HomenShum",
|
|
73
|
+
"dependencies": {
|
|
74
|
+
"@modelcontextprotocol/sdk": "^1.0.4",
|
|
75
|
+
"better-sqlite3": "^11.0.0",
|
|
76
|
+
"nodebench-ai": "file:../.."
|
|
77
|
+
},
|
|
78
|
+
"optionalDependencies": {
|
|
79
|
+
"@anthropic-ai/sdk": "^0.71.2",
|
|
80
|
+
"@google/genai": "^1.10.0",
|
|
81
|
+
"cheerio": "^1.0.0",
|
|
82
|
+
"openai": "^5.8.2",
|
|
83
|
+
"papaparse": "^5.5.3",
|
|
84
|
+
"pdf-parse": "^2.4.5",
|
|
85
|
+
"playwright": "^1.57.0",
|
|
86
|
+
"sharp": "^0.34.5",
|
|
87
|
+
"tesseract.js": "^7.0.0",
|
|
88
|
+
"xlsx": "^0.18.5",
|
|
89
|
+
"yauzl": "^2.10.0"
|
|
90
|
+
},
|
|
91
|
+
"devDependencies": {
|
|
92
|
+
"@types/better-sqlite3": "^7.6.0",
|
|
93
|
+
"@types/node": "^20.11.0",
|
|
94
|
+
"tsx": "^4.7.0",
|
|
95
|
+
"typescript": "^5.3.3",
|
|
96
|
+
"vitest": "^3.2.4"
|
|
97
|
+
},
|
|
98
|
+
"engines": {
|
|
99
|
+
"node": ">=18.0.0"
|
|
100
|
+
}
|
|
101
|
+
}
|