@upstash/context7-mcp 1.0.32 → 1.0.34-canary.0
This diff shows the contents of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/dist/benchmark/benchmark.js +347 -0
- package/dist/benchmark/compare-benchmark.js +289 -0
- package/dist/benchmark/run-benchmark.js +459 -0
- package/dist/benchmark/simulate.js +319 -0
- package/dist/index.js +33 -130
- package/dist/lib/api.js +19 -80
- package/dist/lib/types.js +1 -7
- package/package.json +10 -2
package/dist/benchmark/benchmark.js
@@ -0,0 +1,347 @@
+import "dotenv/config";
+import { readFileSync, mkdirSync, renameSync, existsSync, readdirSync, writeFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+import { execSync } from "child_process";
+import { simulate } from "./simulate.js";
+import { generateText } from "ai";
+import { anthropic } from "@ai-sdk/anthropic";
+import { openai } from "@ai-sdk/openai";
+import { google } from "@ai-sdk/google";
+// Check for required environment variables
+if (!process.env.CONTEXT7_API_KEY) {
+    console.error("Error: CONTEXT7_API_KEY environment variable is required");
+    console.error("Set it in your .env file or export it in your shell");
+    process.exit(1);
+}
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+// Package root is two levels up from dist/benchmark/
+const packageRoot = join(__dirname, "..", "..");
+/**
+ * Get the current git branch name
+ * @returns The branch name or "unknown" if not in a git repo
+ */
+function getCurrentBranch() {
+    try {
+        const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim();
+        return branch;
+    }
+    catch (error) {
+        console.error("Error getting current branch:", error);
+        return "unknown";
+    }
+}
+/**
+ * Runs benchmarks by simulating questions from questions.txt
+ *
+ * Usage:
+ * - pnpm run benchmark openai
+ * - pnpm run benchmark claude
+ * - pnpm run benchmark gemini
+ * - pnpm run benchmark openai --test (run only first question)
+ * - pnpm run benchmark claude 1 output-folder (questionset 1, custom output folder)
+ * - pnpm run benchmark claude aa.txt output-folder (use aa.txt, custom output folder)
+ */
+async function runBenchmark() {
+    // Parse arguments
+    const args = process.argv.slice(2);
+    const nonFlagArgs = args.filter((a) => !a.startsWith("--"));
+    const modelArg = nonFlagArgs[0]?.toLowerCase() || "claude";
+    const questionFileArg = nonFlagArgs[1] || null;
+    const outputFolderName = nonFlagArgs[2] || null;
+    const isTestMode = args.includes("--test");
+    let scoringModel;
+    let modelName;
+    if (modelArg === "openai") {
+        scoringModel = openai("gpt-5");
+        modelName = "GPT-5";
+    }
+    else if (modelArg === "gemini") {
+        scoringModel = google("gemini-2.5-pro");
+        modelName = "GEMINI-2.5-PRO";
+    }
+    else {
+        // Default to claude
+        scoringModel = anthropic("claude-sonnet-4-5");
+        modelName = "CLAUDE-SONNET-4.5";
+    }
+    // Determine the questions file to use
+    let questionsFileName;
+    if (!questionFileArg) {
+        questionsFileName = "questions.txt";
+    }
+    else if (questionFileArg.endsWith(".txt")) {
+        // Filename provided directly
+        questionsFileName = questionFileArg;
+    }
+    else {
+        // Number provided, construct filename
+        const questionSetNum = parseInt(questionFileArg, 10);
+        if (!isNaN(questionSetNum)) {
+            questionsFileName = `questions${questionSetNum}.txt`;
+        }
+        else {
+            questionsFileName = "questions.txt";
+        }
+    }
+    console.log("=".repeat(80));
+    console.log("Context7 MCP Benchmark");
+    console.log("=".repeat(80));
+    console.log(`Scoring Model: ${modelName}`);
+    console.log(`Question File: ${questionsFileName}`);
+    if (isTestMode) {
+        console.log(`Mode: TEST (first question only)`);
+    }
+    console.log();
+    // Read questions from questions.txt or questionsN.txt (in src/benchmark/questions directory)
+    const questionsPath = join(packageRoot, "src", "benchmark", "questions", questionsFileName);
+    console.log(`Reading questions from: ${questionsPath}`);
+    if (!existsSync(questionsPath)) {
+        console.error(`Error: ${questionsFileName} not found at ${questionsPath}`);
+        process.exit(1);
+    }
+    const questionsContent = readFileSync(questionsPath, "utf-8");
+    let questions = questionsContent
+        .split("\n")
+        .map((line) => line.trim())
+        .filter((line) => line.length > 0 && !line.startsWith("#")); // Filter empty lines and comments
+    // Limit to first question if in test mode
+    if (isTestMode) {
+        questions = questions.slice(0, 1);
+        console.log(`Test mode: Running only first question`);
+    }
+    else {
+        console.log(`Found ${questions.length} questions to benchmark`);
+    }
+    console.log();
+    // Get current git branch name
+    const branchName = getCurrentBranch();
+    // Create benchmark run directory with custom name or default naming
+    let benchmarkRunDir;
+    if (outputFolderName) {
+        benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", outputFolderName);
+    }
+    else {
+        const timestamp = new Date().toISOString().replace(/[:.]/g, "-").split("Z")[0];
+        benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", `${branchName}-run-${timestamp}_${modelName.replace(/[.\s]/g, "-")}`);
+    }
+    mkdirSync(benchmarkRunDir, { recursive: true });
+    console.log(`Benchmark results will be saved to: ${benchmarkRunDir}`);
+    console.log();
+    const results = [];
+    // Run simulation for questions in batches (parallel processing)
+    // BATCH_SIZE can be set via environment variable (e.g., BATCH_SIZE=1 for sequential)
+    const startTime = Date.now();
+    const BATCH_SIZE = process.env.BATCH_SIZE ? parseInt(process.env.BATCH_SIZE, 10) : 7;
+    console.log(`Execution Mode: ${BATCH_SIZE === 1 ? "Sequential (1 question at a time)" : "Parallel (batch size: " + BATCH_SIZE + ")"}`);
+    console.log();
+    for (let batchStart = 0; batchStart < questions.length; batchStart += BATCH_SIZE) {
+        const batchEnd = Math.min(batchStart + BATCH_SIZE, questions.length);
+        const batch = questions.slice(batchStart, batchEnd);
+        console.log("═".repeat(80));
+        console.log(`Processing Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
+        console.log("═".repeat(80));
+        console.log();
+        // Process batch in parallel
+        const batchPromises = batch.map(async (question, batchIndex) => {
+            const questionNum = batchStart + batchIndex + 1;
+            console.log(`[Q${questionNum}] Starting: ${question.substring(0, 60)}...`);
+            try {
+                // Run simulation with unique ID to prevent filename collisions
+                const uniqueId = `q${questionNum}`;
+                await simulate(question, uniqueId);
+                // Wait a bit to ensure file system operations complete
+                await new Promise((resolve) => setTimeout(resolve, 100));
+                // Find the report files created for this question by unique ID
+                const reportsDir = join(packageRoot, "src", "benchmark", "reports");
+                const files = readdirSync(reportsDir);
+                // Look for files containing the unique ID
+                const mdFile = files.find((f) => f.includes(`_${uniqueId}.md`) && !f.endsWith("_raw.md"));
+                const rawMdFile = files.find((f) => f.includes(`_${uniqueId}_raw.md`));
+                if (mdFile && rawMdFile) {
+                    // Move files to benchmark directory with new names
+                    const sourceMd = join(reportsDir, mdFile);
+                    const sourceRawMd = join(reportsDir, rawMdFile);
+                    const destMd = join(benchmarkRunDir, `q${questionNum}.md`);
+                    const destRawMd = join(benchmarkRunDir, `q${questionNum}_raw.md`);
+                    renameSync(sourceMd, destMd);
+                    renameSync(sourceRawMd, destRawMd);
+                    console.log(`[Q${questionNum}] ✅ Completed and saved`);
+                    return {
+                        questionNum,
+                        question,
+                        toolCount: 0, // Will be calculated during scoring
+                        tokenCount: 0, // Will be calculated during scoring
+                        totalTokens: 0, // Will be extracted from report
+                        score: 0, // Will be calculated during scoring
+                    };
+                }
+                else {
+                    console.error(`[Q${questionNum}] ⚠️ No report files found (expected: *_${uniqueId}.md)`);
+                    return null;
+                }
+            }
+            catch (error) {
+                console.error(`[Q${questionNum}] ❌ Error:`, error);
+                return null;
+            }
+        });
+        // Wait for all questions in this batch to complete
+        const batchResults = await Promise.all(batchPromises);
+        // Add successful results to the results array
+        batchResults.forEach((result) => {
+            if (result) {
+                results.push(result);
+            }
+        });
+        console.log();
+        console.log(`Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchResults.filter((r) => r).length}/${batch.length} successful`);
+        console.log();
+    }
+    const duration = Date.now() - startTime;
+    // Scoring phase - also processed in parallel batches of BATCH_SIZE
+    console.log();
+    console.log("=".repeat(80));
+    console.log("Scoring Phase");
+    console.log("=".repeat(80));
+    console.log(`Using ${modelName} to score context quality...`);
+    console.log();
+    for (let batchStart = 0; batchStart < results.length; batchStart += BATCH_SIZE) {
+        const batchEnd = Math.min(batchStart + BATCH_SIZE, results.length);
+        const batchResults = results.slice(batchStart, batchEnd);
+        console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
+        // Process scoring in parallel
+        const scoringPromises = batchResults.map(async (result) => {
+            const rawMdPath = join(benchmarkRunDir, `q${result.questionNum}_raw.md`);
+            const structuredMdPath = join(benchmarkRunDir, `q${result.questionNum}.md`);
+            try {
+                // Read raw markdown file
+                const rawContent = readFileSync(rawMdPath, "utf-8");
+                // Count tokens (approximate: split on whitespace)
+                const tokenCount = rawContent.split(/[\s\n]+/).length;
+                result.tokenCount = tokenCount;
+                // Count tool calls from structured report and extract total tokens
+                const structuredContent = readFileSync(structuredMdPath, "utf-8");
+                const toolCallMatches = structuredContent.match(/### Tool Call \d+:/g);
+                result.toolCount = toolCallMatches ? toolCallMatches.length : 0;
+                // Extract total tokens from structured report
+                const totalTokensMatch = structuredContent.match(/\*\*Total Tokens\*\*: (\d+)/);
+                result.totalTokens = totalTokensMatch ? parseInt(totalTokensMatch[1], 10) : 0;
+                // Extract question and context from raw file
+                const lines = rawContent.split("\n");
+                const questionLine = lines.find((line) => line.startsWith("QUESTION:"));
+                const question = questionLine
+                    ? questionLine.replace("QUESTION:", "").trim()
+                    : result.question;
+                // Get context (everything after "CONTEXT:")
+                const contextStart = rawContent.indexOf("CONTEXT:");
+                const context = contextStart !== -1 ? rawContent.substring(contextStart + 8).trim() : rawContent;
+                console.log(`[Q${result.questionNum}] Scoring...`);
+                // Ask the scoring model to evaluate the context
+                const scoringResult = await generateText({
+                    model: scoringModel,
+                    messages: [
+                        {
+                            role: "user",
+                            content: `You are evaluating the quality and usefulness of documentation context for a given question.
+
+Question: ${question}
+
+Context provided:
+${context}
+
+Rate how helpful and relevant this context is for answering the question on a scale of 1-10, where:
+- 1-3: Poor - Missing critical information, irrelevant, or unhelpful
+- 4-6: Adequate - Has some useful information but gaps exist
+- 7-8: Good - Covers most needs with relevant examples
+- 9-10: Excellent - Comprehensive, relevant, with clear examples
+
+Respond with ONLY a JSON object in this format:
+{"score": <number>, "reasoning": "<brief explanation>"}`,
+                        },
+                    ],
+                });
+                // Parse the score
+                try {
+                    const jsonMatch = scoringResult.text.match(/\{[\s\S]*\}/);
+                    if (jsonMatch) {
+                        const scoreData = JSON.parse(jsonMatch[0]);
+                        result.score = scoreData.score;
+                        console.log(`[Q${result.questionNum}] Score: ${scoreData.score}/10 - ${scoreData.reasoning.substring(0, 60)}...`);
+                    }
+                    else {
+                        console.log(`[Q${result.questionNum}] ⚠️ Could not parse score, defaulting to 0`);
+                        result.score = 0;
+                    }
+                }
+                catch (parseError) {
+                    console.log(`[Q${result.questionNum}] ⚠️ Error parsing score: ${parseError}`);
+                    result.score = 0;
+                }
+            }
+            catch (error) {
+                console.error(`[Q${result.questionNum}] ❌ Error scoring:`, error);
+            }
+        });
+        // Wait for all scoring in this batch to complete
+        await Promise.all(scoringPromises);
+        console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchEnd - batchStart} questions`);
+        console.log();
+    }
+    // Calculate averages
+    const avgToolCount = results.reduce((sum, r) => sum + r.toolCount, 0) / results.length;
+    const avgTokenCount = results.reduce((sum, r) => sum + r.tokenCount, 0) / results.length;
+    const avgTotalTokens = results.reduce((sum, r) => sum + r.totalTokens, 0) / results.length;
+    const avgScore = results.reduce((sum, r) => sum + r.score, 0) / results.length;
+    // Generate result.md
+    console.log("Generating result.md...");
+    let resultMd = `# Benchmark Results\n\n`;
+    resultMd += `**Scoring Model**: ${modelName}\n`;
+    resultMd += `**Date**: ${new Date().toISOString()}\n`;
+    resultMd += `**Total Questions**: ${results.length}\n`;
+    resultMd += `**Total Duration**: ${(duration / 1000).toFixed(2)}s\n\n`;
+    resultMd += `## Averages\n\n`;
+    resultMd += `| Metric | Value |\n`;
+    resultMd += `|--------|-------|\n`;
+    resultMd += `| Average Tool Calls | ${avgToolCount.toFixed(2)} |\n`;
+    resultMd += `| Average Token Count | ${avgTokenCount.toFixed(0)} |\n`;
+    resultMd += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} |\n`;
+    resultMd += `| Average Score | ${avgScore.toFixed(2)}/10 |\n\n`;
+    resultMd += `## Results by Question\n\n`;
+    results.forEach((result) => {
+        resultMd += `### Q${result.questionNum}: ${result.question}\n\n`;
+        resultMd += `| Metric | Value |\n`;
+        resultMd += `|--------|-------|\n`;
+        resultMd += `| Tool Calls | ${result.toolCount} |\n`;
+        resultMd += `| Token Count | ${result.tokenCount} |\n`;
+        resultMd += `| Total Tokens (API) | ${result.totalTokens} |\n`;
+        resultMd += `| LLM Score | ${result.score}/10 |\n\n`;
+    });
+    const resultPath = join(benchmarkRunDir, "result.md");
+    writeFileSync(resultPath, resultMd);
+    console.log(`✅ Results saved to: ${resultPath}`);
+    console.log();
+    // Summary
+    console.log("=".repeat(80));
+    console.log("Benchmark Complete");
+    console.log("=".repeat(80));
+    console.log(`Scoring Model: ${modelName}`);
+    console.log(`Total questions: ${questions.length}`);
+    console.log(`Total time: ${(duration / 1000).toFixed(2)}s`);
+    console.log(`Average time per question: ${(duration / questions.length / 1000).toFixed(2)}s`);
+    console.log();
+    console.log(`📊 Scoring Results:`);
+    console.log(`  - Average Tool Calls: ${avgToolCount.toFixed(2)}`);
+    console.log(`  - Average Token Count: ${avgTokenCount.toFixed(0)}`);
+    console.log(`  - Average Total Tokens (API): ${avgTotalTokens.toFixed(0)}`);
+    console.log(`  - Average Score: ${avgScore.toFixed(2)}/10`);
+    console.log();
+    console.log(`Results saved to: ${benchmarkRunDir}`);
+    console.log("=".repeat(80));
+}
+// Run benchmark
+runBenchmark().catch((error) => {
+    console.error("Fatal error:", error);
+    process.exit(1);
+});
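The scoring phase above depends on a file contract with `simulate()`: each run must leave a structured report (`*_q{N}.md`, containing `### Tool Call N:` headings and a `**Total Tokens**: N` line) and a raw report (`*_q{N}_raw.md`) in `src/benchmark/reports`. A minimal sketch of the raw-report parsing the scorer performs; the sample report body here is invented for illustration, and only the `QUESTION:`/`CONTEXT:` markers and the parsing logic come from the code above:

    // Sketch of the raw-report layout benchmark.js parses (report body is hypothetical).
    const rawContent = [
      "QUESTION: How do I configure request retries?",
      "",
      "CONTEXT:",
      "Set the retry option on the client ...",
    ].join("\n");
    const questionLine = rawContent.split("\n").find((line) => line.startsWith("QUESTION:"));
    const question = questionLine ? questionLine.replace("QUESTION:", "").trim() : "";
    // "CONTEXT:" is 8 characters long, hence the +8 offset used by the scorer
    const contextStart = rawContent.indexOf("CONTEXT:");
    const context = contextStart !== -1 ? rawContent.substring(contextStart + 8).trim() : rawContent;
    console.log(question); // -> "How do I configure request retries?"
    console.log(context); // -> "Set the retry option on the client ..."
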
package/dist/benchmark/compare-benchmark.js
@@ -0,0 +1,289 @@
+import "dotenv/config";
+import { readFileSync, readdirSync, writeFileSync, existsSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+import { generateText } from "ai";
+import { anthropic } from "@ai-sdk/anthropic";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+// Package root is two levels up from dist/benchmark/
+const packageRoot = join(__dirname, "..", "..");
+const ARCHIVE_PATH = join(packageRoot, "src", "benchmark", "reports", "benchmarks", "archive");
+// Question sets to compare (1-8)
+const QUESTION_SETS = [1, 2, 3, 4, 5, 6, 7, 8];
+/**
+ * Compare two benchmark runs across all question sets
+ *
+ * Usage:
+ *   pnpm run compare-benchmark <prefix-a> <prefix-b>
+ *
+ * Example:
+ *   pnpm run compare-benchmark CTX7-943-run-3 single-params-run-0
+ *
+ * This will compare folders:
+ *   {prefix-a}-file-questions{1-8}-model-claude vs {prefix-b}-file-questions{1-8}-model-claude
+ */
+async function compareBenchmarks() {
+    const args = process.argv.slice(2);
+    if (args.length < 2) {
+        console.error("Usage: pnpm run compare-benchmark <prefix-a> <prefix-b>");
+        console.error("Example: pnpm run compare-benchmark CTX7-943-run-3 single-params-run-0");
+        console.error("");
+        console.error("This compares folders matching pattern: {prefix}-file-questions{1-8}-model-claude");
+        process.exit(1);
+    }
+    const prefixA = args[0];
+    const prefixB = args[1];
+    console.log("=".repeat(80));
+    console.log("Context7 Benchmark Comparison (All Question Sets)");
+    console.log("=".repeat(80));
+    console.log(`Prefix A: ${prefixA}`);
+    console.log(`Prefix B: ${prefixB}`);
+    console.log(`Judge: Claude Sonnet 4.5`);
+    console.log(`Archive Path: ${ARCHIVE_PATH}`);
+    console.log();
+    const model = anthropic("claude-sonnet-4-5");
+    const allResults = [];
+    const questionSetResults = [];
+    // Aggregate counters
+    let totalWinsA = 0;
+    let totalWinsB = 0;
+    let totalTies = 0;
+    // Process each question set
+    for (const questionSet of QUESTION_SETS) {
+        const folderNameA = `${prefixA}-file-questions${questionSet}-model-claude`;
+        const folderNameB = `${prefixB}-file-questions${questionSet}-model-claude`;
+        const folderA = join(ARCHIVE_PATH, folderNameA);
+        const folderB = join(ARCHIVE_PATH, folderNameB);
+        console.log("═".repeat(80));
+        console.log(`Question Set ${questionSet}`);
+        console.log("═".repeat(80));
+        // Check if both folders exist
+        if (!existsSync(folderA)) {
+            console.log(`⚠️ Skipping: Folder A not found: ${folderNameA}`);
+            console.log();
+            continue;
+        }
+        if (!existsSync(folderB)) {
+            console.log(`⚠️ Skipping: Folder B not found: ${folderNameB}`);
+            console.log();
+            continue;
+        }
+        // Find all raw files in both folders
+        const filesA = readdirSync(folderA).filter((f) => f.match(/^q\d+_raw\.md$/));
+        const filesB = readdirSync(folderB).filter((f) => f.match(/^q\d+_raw\.md$/));
+        // Get question numbers from both folders
+        const questionsA = new Set(filesA.map((f) => parseInt(f.match(/q(\d+)_raw\.md/)?.[1] || "0")));
+        const questionsB = new Set(filesB.map((f) => parseInt(f.match(/q(\d+)_raw\.md/)?.[1] || "0")));
+        // Find common questions
+        const commonQuestions = [...questionsA].filter((q) => questionsB.has(q)).sort((a, b) => a - b);
+        if (commonQuestions.length === 0) {
+            console.log(`⚠️ Skipping: No common questions found`);
+            console.log();
+            continue;
+        }
+        console.log(`Found ${commonQuestions.length} common questions`);
+        let setWinsA = 0;
+        let setWinsB = 0;
+        let setTies = 0;
+        const setResults = [];
+        // Process questions in batches for parallel execution
+        const BATCH_SIZE = 5;
+        for (let batchStart = 0; batchStart < commonQuestions.length; batchStart += BATCH_SIZE) {
+            const batchEnd = Math.min(batchStart + BATCH_SIZE, commonQuestions.length);
+            const batch = commonQuestions.slice(batchStart, batchEnd);
+            const batchPromises = batch.map(async (questionNum) => {
+                const fileA = join(folderA, `q${questionNum}_raw.md`);
+                const fileB = join(folderB, `q${questionNum}_raw.md`);
+                const contentA = readFileSync(fileA, "utf-8");
+                const contentB = readFileSync(fileB, "utf-8");
+                // Extract question from the content
+                const questionMatch = contentA.match(/QUESTION:\s*(.+?)(?:\n|$)/);
+                const question = questionMatch ? questionMatch[1].trim() : `Question ${questionNum}`;
+                // Extract just the context part (after CONTEXT:)
+                const extractContext = (content) => {
+                    const contextStart = content.indexOf("CONTEXT:");
+                    if (contextStart === -1)
+                        return content;
+                    return content.substring(contextStart + 8).trim();
+                };
+                const contextA = extractContext(contentA);
+                const contextB = extractContext(contentB);
+                console.log(`  [Q${questionNum}] Comparing: ${question.substring(0, 45)}...`);
+                try {
+                    const result = await generateText({
+                        model,
+                        messages: [
+                            {
+                                role: "user",
+                                content: `You are a technical documentation expert evaluating which context is more helpful for answering a programming question.
+
+QUESTION: ${question}
+
+=== ANSWER A ===
+${contextA}
+
+=== ANSWER B ===
+${contextB}
+
+Compare these two answers and determine which one is better for helping a developer answer the question. Consider:
+1. Relevance - Does it directly address the question?
+2. Code examples - Are there working, relevant code snippets?
+3. Completeness - Does it cover the key aspects needed?
+4. Clarity - Is the information well-organized and easy to understand?
+5. Accuracy - Does it appear technically correct?
+
+Respond with ONLY a JSON object in this format:
+{"winner": "A" | "B" | "tie", "reasoning": "<brief 1-2 sentence explanation>"}
+
+If both are roughly equal in quality, respond with "tie".`,
+                            },
+                        ],
+                    });
+                    // Parse the result
+                    const jsonMatch = result.text.match(/\{[\s\S]*\}/);
+                    if (jsonMatch) {
+                        const parsed = JSON.parse(jsonMatch[0]);
+                        const rawWinner = String(parsed.winner).toUpperCase();
+                        const winner = rawWinner === "A" ? "A" : rawWinner === "B" ? "B" : "tie";
+                        console.log(`  [Q${questionNum}] Winner: ${winner}`);
+                        return {
+                            questionSet,
+                            questionNum,
+                            question,
+                            winner,
+                            reasoning: parsed.reasoning,
+                        };
+                    }
+                    else {
+                        console.log(`  [Q${questionNum}] ⚠️ Could not parse result, marking as tie`);
+                        return {
+                            questionSet,
+                            questionNum,
+                            question,
+                            winner: "tie",
+                            reasoning: "Failed to parse LLM response",
+                        };
+                    }
+                }
+                catch (error) {
+                    console.error(`  [Q${questionNum}] ❌ Error:`, error);
+                    return {
+                        questionSet,
+                        questionNum,
+                        question,
+                        winner: "tie",
+                        reasoning: `Error during comparison: ${error}`,
+                    };
+                }
+            });
+            const batchResults = await Promise.all(batchPromises);
+            setResults.push(...batchResults);
+            // Update set counts
+            batchResults.forEach((r) => {
+                if (r.winner === "A")
+                    setWinsA++;
+                else if (r.winner === "B")
+                    setWinsB++;
+                else
+                    setTies++;
+            });
+        }
+        // Store set results
+        questionSetResults.push({
+            questionSet,
+            winsA: setWinsA,
+            winsB: setWinsB,
+            ties: setTies,
+            results: setResults,
+        });
+        allResults.push(...setResults);
+        // Update totals
+        totalWinsA += setWinsA;
+        totalWinsB += setWinsB;
+        totalTies += setTies;
+        console.log(`  Summary: A=${setWinsA}, B=${setWinsB}, Ties=${setTies}`);
+        console.log();
+    }
+    // Generate final summary
+    console.log("=".repeat(80));
+    console.log("COMPARISON RESULTS");
+    console.log("=".repeat(80));
+    console.log();
+    console.log("📊 Results by Question Set:");
+    questionSetResults.forEach((qsr) => {
+        console.log(`  questions${qsr.questionSet}: A=${qsr.winsA}, B=${qsr.winsB}, Ties=${qsr.ties}`);
+    });
+    console.log();
+    console.log("📊 Aggregate Score:");
+    console.log(`  ${prefixA}`);
+    console.log(`    → ${totalWinsA} wins`);
+    console.log();
+    console.log(`  ${prefixB}`);
+    console.log(`    → ${totalWinsB} wins`);
+    console.log();
+    console.log(`  Ties: ${totalTies}`);
+    console.log();
+    // Determine overall winner
+    if (totalWinsA > totalWinsB) {
+        console.log(`🏆 Winner: ${prefixA}`);
+    }
+    else if (totalWinsB > totalWinsA) {
+        console.log(`🏆 Winner: ${prefixB}`);
+    }
+    else {
+        console.log(`🤝 Result: TIE`);
+    }
+    console.log("=".repeat(80));
+    // Generate markdown report
+    let report = `# Benchmark Comparison Results\n\n`;
+    report += `**Date**: ${new Date().toISOString()}\n`;
+    report += `**Judge**: Claude Sonnet 4.5\n\n`;
+    report += `## Prefixes Compared\n\n`;
+    report += `- **Prefix A**: ${prefixA}\n`;
+    report += `- **Prefix B**: ${prefixB}\n\n`;
+    report += `## Aggregate Summary\n\n`;
+    report += `| Prefix | Wins |\n`;
+    report += `|--------|------|\n`;
+    report += `| ${prefixA} | ${totalWinsA} |\n`;
+    report += `| ${prefixB} | ${totalWinsB} |\n`;
+    report += `| Ties | ${totalTies} |\n\n`;
+    if (totalWinsA > totalWinsB) {
+        report += `**Winner**: ${prefixA}\n\n`;
+    }
+    else if (totalWinsB > totalWinsA) {
+        report += `**Winner**: ${prefixB}\n\n`;
+    }
+    else {
+        report += `**Result**: TIE\n\n`;
+    }
+    report += `## Results by Question Set\n\n`;
+    report += `| Question Set | A Wins | B Wins | Ties |\n`;
+    report += `|--------------|--------|--------|------|\n`;
+    questionSetResults.forEach((qsr) => {
+        report += `| questions${qsr.questionSet} | ${qsr.winsA} | ${qsr.winsB} | ${qsr.ties} |\n`;
+    });
+    report += `| **Total** | **${totalWinsA}** | **${totalWinsB}** | **${totalTies}** |\n\n`;
+    report += `## Detailed Results\n\n`;
+    questionSetResults.forEach((qsr) => {
+        report += `### Question Set ${qsr.questionSet}\n\n`;
+        report += `| Q# | Question | Winner | Reasoning |\n`;
+        report += `|----|----------|--------|----------|\n`;
+        qsr.results.forEach((r) => {
+            const shortQuestion = r.question.length > 35 ? r.question.substring(0, 35) + "..." : r.question;
+            const winnerLabel = r.winner === "A" ? "A" : r.winner === "B" ? "B" : "Tie";
+            report += `| ${r.questionNum} | ${shortQuestion} | ${winnerLabel} | ${r.reasoning.replace(/\|/g, "\\|")} |\n`;
+        });
+        report += `\n`;
+    });
+    // Save report
+    const reportPath = join(ARCHIVE_PATH, `comparison-${prefixA}-vs-${prefixB}.md`);
+    writeFileSync(reportPath, report);
+    console.log();
+    console.log(`📄 Report saved to: ${reportPath}`);
+}
+// Run comparison
+compareBenchmarks().catch((error) => {
+    console.error("Fatal error:", error);
+    process.exit(1);
+});
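For orientation, a sketch of the archive layout the comparer assumes, reusing the prefixes from the usage example; the folder and file naming comes from the code above, while the per-set question counts are illustrative:

    // src/benchmark/reports/benchmarks/archive/
    //   CTX7-943-run-3-file-questions1-model-claude/        (prefix A, question set 1)
    //     q1_raw.md ... q15_raw.md
    //   single-params-run-0-file-questions1-model-claude/   (prefix B, question set 1)
    //     q1_raw.md ... q15_raw.md
    // Question numbers are recovered from the filenames:
    const questionNum = parseInt("q12_raw.md".match(/q(\d+)_raw\.md/)?.[1] || "0");
    console.log(questionNum); // -> 12

Only questions present in both folders of a set are judged, and a set is skipped entirely when either folder is missing.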