@upstash/context7-mcp 1.0.34-canary.2 → 1.0.34-canary.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +37 -13
- package/dist/lib/api.js +3 -1
- package/package.json +1 -1
- package/dist/benchmark/benchmark.js +0 -347
- package/dist/benchmark/compare-benchmark.js +0 -289
- package/dist/benchmark/run-benchmark.js +0 -459
- package/dist/benchmark/simulate.js +0 -319
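Because both versions are public, an equivalent diff can be regenerated locally with npm's built-in diff command (npm 7+): npm diff --diff=@upstash/context7-mcp@1.0.34-canary.2 --diff=@upstash/context7-mcp@1.0.34-canary.4. In summary, the two MCP tool descriptions in dist/index.js were rewritten, dist/lib/api.js now returns a fallback message when the documentation response body is empty, and the four benchmark scripts were removed from the published dist output.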
package/dist/index.js
CHANGED
@@ -77,9 +77,22 @@ server.registerTool("resolve-library-id", {
 
 You MUST call this function before 'query-docs' to obtain a valid Context7-compatible library ID UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query.
 
-
+Selection Process:
+1. Analyze the query to understand what library/package the user is looking for
+2. Return the most relevant match based on:
+- Name similarity to the query (exact matches prioritized)
+- Description relevance to the query's intent
+- Documentation coverage (prioritize libraries with higher Code Snippet counts)
+- Source reputation (consider libraries with High or Medium reputation more authoritative)
+- Benchmark Score: Quality indicator (100 is the highest score)
 
-
+Response Format:
+- Return the selected library ID in a clearly marked section
+- Provide a brief explanation for why this library was chosen
+- If multiple good matches exist, acknowledge this but proceed with the most relevant one
+- If no good matches exist, clearly state this and suggest query refinements
+
+For ambiguous queries, request clarification before proceeding with a best-guess match.`,
     inputSchema: {
         query: z
             .string()
@@ -104,11 +117,28 @@ Select the best match based on: name similarity, description relevance, snippet
             ],
         };
     }
+    const resultsText = formatSearchResults(searchResponse);
+    const responseText = `Available Libraries:
+
+Each result includes:
+- Library ID: Context7-compatible identifier (format: /org/project)
+- Name: Library or package name
+- Description: Short summary
+- Code Snippets: Number of available code examples
+- Source Reputation: Authority indicator (High, Medium, Low, or Unknown)
+- Benchmark Score: Quality indicator (100 is the highest score)
+- Versions: List of versions if available. Use one of those versions if the user provides a version in their query. The format of the version is /org/project/version.
+
+For best results, select libraries based on name match, source reputation, snippet coverage, benchmark score, and relevance to your use case.
+
+----------
+
+${resultsText}`;
     return {
         content: [
             {
                 type: "text",
-                text:
+                text: responseText,
            },
        ],
    };
@@ -117,20 +147,14 @@ server.registerTool("query-docs", {
     title: "Query Documentation",
     description: `Retrieves and queries up-to-date documentation and code examples from Context7 for any programming library or framework.
 
-You must call 'resolve-library-id' first to obtain the exact Context7-compatible library ID required to use this tool, UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query
-
-USE THIS TOOL TO:
-- Get current, accurate documentation for libraries (e.g., React, Next.js, Express, LangChain)
-- Find working code examples and implementation patterns
-- Answer "how do I..." questions about specific libraries
-- Look up API references, configuration options, and best practices`,
+You must call 'resolve-library-id' first to obtain the exact Context7-compatible library ID required to use this tool, UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query.`,
     inputSchema: {
+        libraryId: z
+            .string()
+            .describe("Exact Context7-compatible library ID (e.g., '/mongodb/docs', '/vercel/next.js', '/supabase/supabase', '/vercel/next.js/v14.3.0-canary.87') retrieved from 'resolve-library-id' or directly from user query in the format '/org/project' or '/org/project/version'."),
         query: z
             .string()
             .describe("The question or task you need help with. Be specific and include relevant details. Good: 'How to set up authentication with JWT in Express.js' or 'React useEffect cleanup function examples'. Bad: 'auth' or 'hooks'. IMPORTANT: Do not include any sensitive or confidential information such as API keys, passwords, credentials, or personal data in your query."),
-        libraryId: z
-            .string()
-            .describe("Context7-compatible library ID (e.g., '/mongodb/docs' or '/vercel/next.js'). Retrieved from 'resolve-library-id' or directly from user query in the format '/org/project' or '/org/project/version'."),
     },
 }, async ({ query, libraryId }) => {
     const ctx = requestContext.getStore();
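Taken together, these hunks reorder the 'query-docs' inputs so libraryId comes first and route search results through a documented response template. A minimal sketch of calling both tools in the intended order, assuming the standard @modelcontextprotocol/sdk client API (the client name, transport setup, and example arguments below are illustrative, not part of this package):

import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

// Spawn the Context7 MCP server over stdio and connect a client to it.
const transport = new StdioClientTransport({
    command: "npx",
    args: ["-y", "@upstash/context7-mcp"],
});
const client = new Client({ name: "example-client", version: "1.0.0" });
await client.connect(transport);

// Step 1: resolve a free-form query to a Context7-compatible library ID,
// unless the user already supplied one in '/org/project' form.
const resolved = await client.callTool({
    name: "resolve-library-id",
    arguments: { query: "next.js" },
});

// Step 2: query the docs with the resolved ID; libraryId now leads the schema.
const docs = await client.callTool({
    name: "query-docs",
    arguments: {
        libraryId: "/vercel/next.js",
        query: "How do I configure middleware?",
    },
});
await client.close();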
package/dist/lib/api.js
CHANGED
@@ -105,7 +105,9 @@ export async function fetchLibraryContext(request, clientIp, apiKey) {
     }
     const text = await response.text();
     if (!text) {
-        return {
+        return {
+            data: "Documentation not found or not finalized for this library. This might have happened because you used an invalid Context7-compatible library ID. To get a valid Context7-compatible library ID, use the 'resolve-library-id' with the package name you wish to retrieve documentation for.",
+        };
     }
     return { data: text };
 }
package/package.json
CHANGED
-  "version": "1.0.34-canary.2",
+  "version": "1.0.34-canary.4",
package/dist/benchmark/benchmark.js
REMOVED
@@ -1,347 +0,0 @@
-import "dotenv/config";
-import { readFileSync, mkdirSync, renameSync, existsSync, readdirSync, writeFileSync } from "fs";
-import { join, dirname } from "path";
-import { fileURLToPath } from "url";
-import { execSync } from "child_process";
-import { simulate } from "./simulate.js";
-import { generateText } from "ai";
-import { anthropic } from "@ai-sdk/anthropic";
-import { openai } from "@ai-sdk/openai";
-import { google } from "@ai-sdk/google";
-// Check for required environment variables
-if (!process.env.CONTEXT7_API_KEY) {
-    console.error("Error: CONTEXT7_API_KEY environment variable is required");
-    console.error("Set it in your .env file or export it in your shell");
-    process.exit(1);
-}
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = dirname(__filename);
-// Package root is two levels up from dist/benchmark/
-const packageRoot = join(__dirname, "..", "..");
-/**
- * Get the current git branch name
- * @returns The branch name or "unknown" if not in a git repo
- */
-function getCurrentBranch() {
-    try {
-        const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim();
-        return branch;
-    }
-    catch (error) {
-        console.error("Error getting current branch:", error);
-        return "unknown";
-    }
-}
-/**
- * Runs benchmarks by simulating questions from questions.txt
- *
- * Usage:
- * - pnpm run benchmark openai
- * - pnpm run benchmark claude
- * - pnpm run benchmark gemini
- * - pnpm run benchmark openai --test (run only first question)
- * - pnpm run benchmark claude 1 output-folder (questionset 1, custom output folder)
- * - pnpm run benchmark claude aa.txt output-folder (use aa.txt, custom output folder)
- */
-async function runBenchmark() {
-    // Parse arguments
-    const args = process.argv.slice(2);
-    const nonFlagArgs = args.filter((a) => !a.startsWith("--"));
-    const modelArg = nonFlagArgs[0]?.toLowerCase() || "claude";
-    const questionFileArg = nonFlagArgs[1] || null;
-    const outputFolderName = nonFlagArgs[2] || null;
-    const isTestMode = args.includes("--test");
-    let scoringModel;
-    let modelName;
-    if (modelArg === "openai") {
-        scoringModel = openai("gpt-5");
-        modelName = "GPT-5";
-    }
-    else if (modelArg === "gemini") {
-        scoringModel = google("gemini-2.5-pro");
-        modelName = "GEMINI-2.5-PRO";
-    }
-    else {
-        // Default to claude
-        scoringModel = anthropic("claude-sonnet-4-5");
-        modelName = "CLAUDE-SONNET-4.5";
-    }
-    // Determine the questions file to use
-    let questionsFileName;
-    if (!questionFileArg) {
-        questionsFileName = "questions.txt";
-    }
-    else if (questionFileArg.endsWith(".txt")) {
-        // Filename provided directly
-        questionsFileName = questionFileArg;
-    }
-    else {
-        // Number provided, construct filename
-        const questionSetNum = parseInt(questionFileArg, 10);
-        if (!isNaN(questionSetNum)) {
-            questionsFileName = `questions${questionSetNum}.txt`;
-        }
-        else {
-            questionsFileName = "questions.txt";
-        }
-    }
-    console.log("=".repeat(80));
-    console.log("Context7 MCP Benchmark");
-    console.log("=".repeat(80));
-    console.log(`Scoring Model: ${modelName}`);
-    console.log(`Question File: ${questionsFileName}`);
-    if (isTestMode) {
-        console.log(`Mode: TEST (first question only)`);
-    }
-    console.log();
-    // Read questions from questions.txt or questionsN.txt (in src/benchmark/questions directory)
-    const questionsPath = join(packageRoot, "src", "benchmark", "questions", questionsFileName);
-    console.log(`Reading questions from: ${questionsPath}`);
-    if (!existsSync(questionsPath)) {
-        console.error(`Error: questions.txt not found at ${questionsPath}`);
-        process.exit(1);
-    }
-    const questionsContent = readFileSync(questionsPath, "utf-8");
-    let questions = questionsContent
-        .split("\n")
-        .map((line) => line.trim())
-        .filter((line) => line.length > 0 && !line.startsWith("#")); // Filter empty lines and comments
-    // Limit to first question if in test mode
-    if (isTestMode) {
-        questions = questions.slice(0, 1);
-        console.log(`Test mode: Running only first question`);
-    }
-    else {
-        console.log(`Found ${questions.length} questions to benchmark`);
-    }
-    console.log();
-    // Get current git branch name
-    const branchName = getCurrentBranch();
-    // Create benchmark run directory with custom name or default naming
-    let benchmarkRunDir;
-    if (outputFolderName) {
-        benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", outputFolderName);
-    }
-    else {
-        const timestamp = new Date().toISOString().replace(/[:.]/g, "-").split("Z")[0];
-        benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", `${branchName}-run-${timestamp}_${modelName.replace(/[.\s]/g, "-")}`);
-    }
-    mkdirSync(benchmarkRunDir, { recursive: true });
-    console.log(`Benchmark results will be saved to: ${benchmarkRunDir}`);
-    console.log();
-    const results = [];
-    // Run simulation for questions in batches (parallel processing)
-    // BATCH_SIZE can be set via environment variable (e.g., BATCH_SIZE=1 for sequential)
-    const startTime = Date.now();
-    const BATCH_SIZE = process.env.BATCH_SIZE ? parseInt(process.env.BATCH_SIZE, 10) : 7;
-    console.log(`Execution Mode: ${BATCH_SIZE === 1 ? "Sequential (1 question at a time)" : "Parallel (batch size: " + BATCH_SIZE + ")"}`);
-    console.log();
-    for (let batchStart = 0; batchStart < questions.length; batchStart += BATCH_SIZE) {
-        const batchEnd = Math.min(batchStart + BATCH_SIZE, questions.length);
-        const batch = questions.slice(batchStart, batchEnd);
-        console.log("═".repeat(80));
-        console.log(`Processing Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
-        console.log("═".repeat(80));
-        console.log();
-        // Process batch in parallel
-        const batchPromises = batch.map(async (question, batchIndex) => {
-            const questionNum = batchStart + batchIndex + 1;
-            console.log(`[Q${questionNum}] Starting: ${question.substring(0, 60)}...`);
-            try {
-                // Run simulation with unique ID to prevent filename collisions
-                const uniqueId = `q${questionNum}`;
-                await simulate(question, uniqueId);
-                // Wait a bit to ensure file system operations complete
-                await new Promise((resolve) => setTimeout(resolve, 100));
-                // Find the report files created for this question by unique ID
-                const reportsDir = join(packageRoot, "src", "benchmark", "reports");
-                const files = readdirSync(reportsDir);
-                // Look for files containing the unique ID
-                const mdFile = files.find((f) => f.includes(`_${uniqueId}.md`) && !f.endsWith("_raw.md"));
-                const rawMdFile = files.find((f) => f.includes(`_${uniqueId}_raw.md`));
-                if (mdFile && rawMdFile) {
-                    // Move files to benchmark directory with new names
-                    const sourceMd = join(reportsDir, mdFile);
-                    const sourceRawMd = join(reportsDir, rawMdFile);
-                    const destMd = join(benchmarkRunDir, `q${questionNum}.md`);
-                    const destRawMd = join(benchmarkRunDir, `q${questionNum}_raw.md`);
-                    renameSync(sourceMd, destMd);
-                    renameSync(sourceRawMd, destRawMd);
-                    console.log(`[Q${questionNum}] ✅ Completed and saved`);
-                    return {
-                        questionNum,
-                        question,
-                        toolCount: 0, // Will be calculated during scoring
-                        tokenCount: 0, // Will be calculated during scoring
-                        totalTokens: 0, // Will be extracted from report
-                        score: 0, // Will be calculated during scoring
-                    };
-                }
-                else {
-                    console.error(`[Q${questionNum}] ⚠️ No report files found (expected: *_${uniqueId}.md)`);
-                    return null;
-                }
-            }
-            catch (error) {
-                console.error(`[Q${questionNum}] ❌ Error:`, error);
-                return null;
-            }
-        });
-        // Wait for all questions in this batch to complete
-        const batchResults = await Promise.all(batchPromises);
-        // Add successful results to the results array
-        batchResults.forEach((result) => {
-            if (result) {
-                results.push(result);
-            }
-        });
-        console.log();
-        console.log(`Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchResults.filter((r) => r).length}/${batch.length} successful`);
-        console.log();
-    }
-    const duration = Date.now() - startTime;
-    // Scoring phase - also in batches of 5 for parallel processing
-    console.log();
-    console.log("=".repeat(80));
-    console.log("Scoring Phase");
-    console.log("=".repeat(80));
-    console.log(`Using ${modelName} to score context quality...`);
-    console.log();
-    for (let batchStart = 0; batchStart < results.length; batchStart += BATCH_SIZE) {
-        const batchEnd = Math.min(batchStart + BATCH_SIZE, results.length);
-        const batchResults = results.slice(batchStart, batchEnd);
-        console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
-        // Process scoring in parallel
-        const scoringPromises = batchResults.map(async (result) => {
-            const rawMdPath = join(benchmarkRunDir, `q${result.questionNum}_raw.md`);
-            const structuredMdPath = join(benchmarkRunDir, `q${result.questionNum}.md`);
-            try {
-                // Read raw markdown file
-                const rawContent = readFileSync(rawMdPath, "utf-8");
-                // Count tokens (approximate: split by whitespace and punctuation)
-                const tokenCount = rawContent.split(/[\s\n]+/).length;
-                result.tokenCount = tokenCount;
-                // Count tool calls from structured report and extract total tokens
-                const structuredContent = readFileSync(structuredMdPath, "utf-8");
-                const toolCallMatches = structuredContent.match(/### Tool Call \d+:/g);
-                result.toolCount = toolCallMatches ? toolCallMatches.length : 0;
-                // Extract total tokens from structured report
-                const totalTokensMatch = structuredContent.match(/\*\*Total Tokens\*\*: (\d+)/);
-                result.totalTokens = totalTokensMatch ? parseInt(totalTokensMatch[1], 10) : 0;
-                // Extract question and context from raw file
-                const lines = rawContent.split("\n");
-                const questionLine = lines.find((line) => line.startsWith("QUESTION:"));
-                const question = questionLine
-                    ? questionLine.replace("QUESTION:", "").trim()
-                    : result.question;
-                // Get context (everything after "CONTEXT:")
-                const contextStart = rawContent.indexOf("CONTEXT:");
-                const context = contextStart !== -1 ? rawContent.substring(contextStart + 8).trim() : rawContent;
-                console.log(`[Q${result.questionNum}] Scoring...`);
-                // Ask the scoring model to evaluate the context
-                const scoringResult = await generateText({
-                    model: scoringModel,
-                    messages: [
-                        {
-                            role: "user",
-                            content: `You are evaluating the quality and usefulness of documentation context for a given question.
-
-Question: ${question}
-
-Context provided:
-${context}
-
-Rate how helpful and relevant this context is for answering the question on a scale of 1-10, where:
-- 1-3: Poor - Missing critical information, irrelevant, or unhelpful
-- 4-6: Adequate - Has some useful information but gaps exist
-- 7-8: Good - Covers most needs with relevant examples
-- 9-10: Excellent - Comprehensive, relevant, with clear examples
-
-Respond with ONLY a JSON object in this format:
-{"score": <number>, "reasoning": "<brief explanation>"}`,
-                        },
-                    ],
-                });
-                // Parse the score
-                try {
-                    const jsonMatch = scoringResult.text.match(/\{[\s\S]*\}/);
-                    if (jsonMatch) {
-                        const scoreData = JSON.parse(jsonMatch[0]);
-                        result.score = scoreData.score;
-                        console.log(`[Q${result.questionNum}] Score: ${scoreData.score}/10 - ${scoreData.reasoning.substring(0, 60)}...`);
-                    }
-                    else {
-                        console.log(`[Q${result.questionNum}] ⚠️ Could not parse score, defaulting to 0`);
-                        result.score = 0;
-                    }
-                }
-                catch (parseError) {
-                    console.log(`[Q${result.questionNum}] ⚠️ Error parsing score: ${parseError}`);
-                    result.score = 0;
-                }
-            }
-            catch (error) {
-                console.error(`[Q${result.questionNum}] ❌ Error scoring:`, error);
-            }
-        });
-        // Wait for all scoring in this batch to complete
-        await Promise.all(scoringPromises);
-        console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchEnd - batchStart} questions`);
-        console.log();
-    }
-    // Calculate averages
-    const avgToolCount = results.reduce((sum, r) => sum + r.toolCount, 0) / results.length;
-    const avgTokenCount = results.reduce((sum, r) => sum + r.tokenCount, 0) / results.length;
-    const avgTotalTokens = results.reduce((sum, r) => sum + r.totalTokens, 0) / results.length;
-    const avgScore = results.reduce((sum, r) => sum + r.score, 0) / results.length;
-    // Generate result.md
-    console.log("Generating result.md...");
-    let resultMd = `# Benchmark Results\n\n`;
-    resultMd += `**Scoring Model**: ${modelName}\n`;
-    resultMd += `**Date**: ${new Date().toISOString()}\n`;
-    resultMd += `**Total Questions**: ${results.length}\n`;
-    resultMd += `**Total Duration**: ${(duration / 1000).toFixed(2)}s\n\n`;
-    resultMd += `## Averages\n\n`;
-    resultMd += `| Metric | Value |\n`;
-    resultMd += `|--------|-------|\n`;
-    resultMd += `| Average Tool Calls | ${avgToolCount.toFixed(2)} |\n`;
-    resultMd += `| Average Token Count | ${avgTokenCount.toFixed(0)} |\n`;
-    resultMd += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} |\n`;
-    resultMd += `| Average Score | ${avgScore.toFixed(2)}/10 |\n\n`;
-    resultMd += `## Results by Question\n\n`;
-    results.forEach((result) => {
-        resultMd += `### Q${result.questionNum}: ${result.question}\n\n`;
-        resultMd += `| Metric | Value |\n`;
-        resultMd += `|--------|-------|\n`;
-        resultMd += `| Tool Calls | ${result.toolCount} |\n`;
-        resultMd += `| Token Count | ${result.tokenCount} |\n`;
-        resultMd += `| Total Tokens (API) | ${result.totalTokens} |\n`;
-        resultMd += `| LLM Score | ${result.score}/10 |\n\n`;
-    });
-    const resultPath = join(benchmarkRunDir, "result.md");
-    writeFileSync(resultPath, resultMd);
-    console.log(`✅ Results saved to: ${resultPath}`);
-    console.log();
-    // Summary
-    console.log("=".repeat(80));
-    console.log("Benchmark Complete");
-    console.log("=".repeat(80));
-    console.log(`Scoring Model: ${modelName}`);
-    console.log(`Total questions: ${questions.length}`);
-    console.log(`Total time: ${(duration / 1000).toFixed(2)}s`);
-    console.log(`Average time per question: ${(duration / questions.length / 1000).toFixed(2)}s`);
-    console.log();
-    console.log(`📊 Scoring Results:`);
-    console.log(` - Average Tool Calls: ${avgToolCount.toFixed(2)}`);
-    console.log(` - Average Token Count: ${avgTokenCount.toFixed(0)}`);
-    console.log(` - Average Total Tokens (API): ${avgTotalTokens.toFixed(0)}`);
-    console.log(` - Average Score: ${avgScore.toFixed(2)}/10`);
-    console.log();
-    console.log(`Results saved to: ${benchmarkRunDir}`);
-    console.log("=".repeat(80));
-}
-// Run benchmark
-runBenchmark().catch((error) => {
-    console.error("Fatal error:", error);
-    process.exit(1);
-});
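The deleted runner's scoring step relied on pulling the first JSON object out of a free-form model reply. A self-contained sketch of that parsing logic, with an illustrative helper name (parseScore is not an export of this package):

// Extract {"score": <number>, "reasoning": "<brief explanation>"} from a model
// reply, defaulting to score 0 when no parseable JSON object is present,
// mirroring the behavior of the deleted scoring loop.
function parseScore(replyText) {
    const jsonMatch = replyText.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
        return { score: 0, reasoning: "" };
    }
    try {
        const { score, reasoning } = JSON.parse(jsonMatch[0]);
        return { score, reasoning };
    }
    catch {
        return { score: 0, reasoning: "" };
    }
}

console.log(parseScore('Sure: {"score": 8, "reasoning": "covers most needs"}'));
// -> { score: 8, reasoning: 'covers most needs' }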