@upstash/context7-mcp 1.0.34-canary.2 → 1.0.34-canary.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@upstash/context7-mcp",
3
- "version": "1.0.34-canary.2",
3
+ "version": "1.0.34-canary.3",
4
4
  "mcpName": "io.github.upstash/context7",
5
5
  "description": "MCP server for Context7",
6
6
  "repository": {
@@ -1,347 +0,0 @@
1
- import "dotenv/config";
2
- import { readFileSync, mkdirSync, renameSync, existsSync, readdirSync, writeFileSync } from "fs";
3
- import { join, dirname } from "path";
4
- import { fileURLToPath } from "url";
5
- import { execSync } from "child_process";
6
- import { simulate } from "./simulate.js";
7
- import { generateText } from "ai";
8
- import { anthropic } from "@ai-sdk/anthropic";
9
- import { openai } from "@ai-sdk/openai";
10
- import { google } from "@ai-sdk/google";
11
// Startup guard: the script cannot run without a Context7 API key, so report
// the problem on stderr and terminate with exit status 1 when it is missing.
if (!process.env.CONTEXT7_API_KEY) {
    [
        "Error: CONTEXT7_API_KEY environment variable is required",
        "Set it in your .env file or export it in your shell",
    ].forEach((message) => {
        console.error(message);
    });
    process.exit(1);
}
17
// ES modules have no built-in __filename/__dirname, so derive both from import.meta.url.
const __filename = fileURLToPath(new URL(import.meta.url));
const __dirname = dirname(__filename);
// The package root is the directory two levels above this module's directory.
const packageRoot = join(__dirname, "..", "..");
21
/**
 * Look up the name of the currently checked-out git branch.
 *
 * Runs `git rev-parse --abbrev-ref HEAD` synchronously and trims its output.
 * Any failure of that command is logged to stderr and mapped to "unknown".
 *
 * @returns {string} The trimmed command output, or "unknown" when the command fails
 */
function getCurrentBranch() {
    let branchName = "unknown";
    try {
        const stdout = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" });
        branchName = stdout.trim();
    }
    catch (error) {
        console.error("Error getting current branch:", error);
    }
    return branchName;
}
35
/** Number of questions handled concurrently when BATCH_SIZE is unset or invalid. */
const DEFAULT_BATCH_SIZE = 7;

/**
 * Pick the scoring model and its display name from the CLI model argument.
 *
 * @param {string} modelArg - Lower-cased first positional CLI argument
 * @returns {{ scoringModel: unknown, modelName: string }} Model handle plus the label used in logs and reports
 */
function selectScoringModel(modelArg) {
    if (modelArg === "openai") {
        return { scoringModel: openai("gpt-5"), modelName: "GPT-5" };
    }
    if (modelArg === "gemini") {
        return { scoringModel: google("gemini-2.5-pro"), modelName: "GEMINI-2.5-PRO" };
    }
    // Any other value (including "claude") defaults to Claude
    return { scoringModel: anthropic("claude-sonnet-4-5"), modelName: "CLAUDE-SONNET-4.5" };
}

/**
 * Work out which questions file to read from the optional second positional argument.
 *
 * @param {string | null} questionFileArg - A "*.txt" filename, a question-set number, or null
 * @returns {string} "questions.txt", the given filename, or "questions<N>.txt"
 */
function resolveQuestionsFileName(questionFileArg) {
    if (!questionFileArg) {
        return "questions.txt";
    }
    if (questionFileArg.endsWith(".txt")) {
        // Filename provided directly
        return questionFileArg;
    }
    // Number provided, construct filename; anything unparsable falls back to the default file
    const questionSetNum = Number.parseInt(questionFileArg, 10);
    return Number.isNaN(questionSetNum) ? "questions.txt" : `questions${questionSetNum}.txt`;
}

/**
 * Parse the BATCH_SIZE environment value into a usable batch size.
 *
 * A value of 0 or below would keep the batch loops from ever advancing, and a
 * non-numeric value would produce NaN, so both fall back to the default.
 *
 * @param {string | undefined} rawValue - Raw value of process.env.BATCH_SIZE
 * @returns {number} A positive integer batch size
 */
function resolveBatchSize(rawValue) {
    if (!rawValue) {
        return DEFAULT_BATCH_SIZE;
    }
    const parsed = Number.parseInt(rawValue, 10);
    if (!Number.isInteger(parsed) || parsed < 1) {
        console.warn(`Warning: invalid BATCH_SIZE "${rawValue}", using ${DEFAULT_BATCH_SIZE} instead`);
        return DEFAULT_BATCH_SIZE;
    }
    return parsed;
}

/**
 * Arithmetic mean of a numeric field over a list, returning 0 for an empty list
 * instead of NaN.
 *
 * @param {object[]} items - Records to average over
 * @param {(item: object) => number} pick - Selects the number to average
 * @returns {number} The mean, or 0 when items is empty
 */
function averageOf(items, pick) {
    if (items.length === 0) {
        return 0;
    }
    return items.reduce((sum, item) => sum + pick(item), 0) / items.length;
}

/**
 * Extract the score and reasoning from the scoring model's reply.
 *
 * @param {string} responseText - Raw text returned by the scoring model
 * @returns {{ score: number, reasoning: string } | null} Parsed values, or null when
 *   the reply has no JSON object or no finite numeric score
 * @throws {SyntaxError} When the matched JSON-looking text is not valid JSON
 */
function parseScoreResponse(responseText) {
    const jsonMatch = responseText.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
        return null;
    }
    const scoreData = JSON.parse(jsonMatch[0]);
    // Accept numeric strings such as "8" but store a real number, so that
    // averaging never degrades into string concatenation.
    const score = typeof scoreData.score === "string" ? Number.parseFloat(scoreData.score) : scoreData.score;
    if (typeof score !== "number" || !Number.isFinite(score)) {
        return null;
    }
    // The reasoning is only used for logging; a missing one must not discard the score
    return { score, reasoning: String(scoreData.reasoning ?? "") };
}

/**
 * Simulate one question and move its two report files into the benchmark run directory.
 *
 * @param {string} question - Question text
 * @param {number} questionNum - 1-based question number, also used to build the unique report id
 * @param {string} benchmarkRunDir - Destination directory for q<N>.md and q<N>_raw.md
 * @returns {Promise<object | null>} A result record with zeroed metrics, or null on failure
 */
async function simulateQuestion(question, questionNum, benchmarkRunDir) {
    console.log(`[Q${questionNum}] Starting: ${question.substring(0, 60)}...`);
    try {
        // Run simulation with unique ID to prevent filename collisions
        const uniqueId = `q${questionNum}`;
        await simulate(question, uniqueId);
        // Wait a bit to ensure file system operations complete
        await new Promise((resolve) => setTimeout(resolve, 100));
        // Find the report files created for this question by unique ID
        const reportsDir = join(packageRoot, "src", "benchmark", "reports");
        const files = readdirSync(reportsDir);
        const mdFile = files.find((f) => f.includes(`_${uniqueId}.md`) && !f.endsWith("_raw.md"));
        const rawMdFile = files.find((f) => f.includes(`_${uniqueId}_raw.md`));
        if (!mdFile || !rawMdFile) {
            console.error(`[Q${questionNum}] ⚠️ No report files found (expected: *_${uniqueId}.md)`);
            return null;
        }
        // Move files to benchmark directory with new names
        renameSync(join(reportsDir, mdFile), join(benchmarkRunDir, `q${questionNum}.md`));
        renameSync(join(reportsDir, rawMdFile), join(benchmarkRunDir, `q${questionNum}_raw.md`));
        console.log(`[Q${questionNum}] ✅ Completed and saved`);
        return {
            questionNum,
            question,
            toolCount: 0, // Will be calculated during scoring
            tokenCount: 0, // Will be calculated during scoring
            totalTokens: 0, // Will be extracted from report
            score: 0, // Will be calculated during scoring
        };
    }
    catch (error) {
        console.error(`[Q${questionNum}] ❌ Error:`, error);
        return null;
    }
}

/**
 * Fill in the metrics of one result record: token count, tool-call count, API total
 * tokens, and the LLM score. The record is updated in place; on any error the
 * fields not yet written keep their previous values.
 *
 * @param {object} result - Result record produced by simulateQuestion
 * @param {string} benchmarkRunDir - Directory holding q<N>.md and q<N>_raw.md
 * @param {unknown} scoringModel - Model handle passed to generateText
 * @returns {Promise<void>}
 */
async function scoreResult(result, benchmarkRunDir, scoringModel) {
    const rawMdPath = join(benchmarkRunDir, `q${result.questionNum}_raw.md`);
    const structuredMdPath = join(benchmarkRunDir, `q${result.questionNum}.md`);
    try {
        const rawContent = readFileSync(rawMdPath, "utf-8");
        // Count tokens (approximate: number of whitespace-separated chunks)
        result.tokenCount = rawContent.split(/[\s\n]+/).length;
        // Count tool calls from structured report and extract total tokens
        const structuredContent = readFileSync(structuredMdPath, "utf-8");
        const toolCallMatches = structuredContent.match(/### Tool Call \d+:/g);
        result.toolCount = toolCallMatches ? toolCallMatches.length : 0;
        const totalTokensMatch = structuredContent.match(/\*\*Total Tokens\*\*: (\d+)/);
        result.totalTokens = totalTokensMatch ? Number.parseInt(totalTokensMatch[1], 10) : 0;
        // Extract question and context from raw file
        const questionLine = rawContent.split("\n").find((line) => line.startsWith("QUESTION:"));
        const question = questionLine
            ? questionLine.replace("QUESTION:", "").trim()
            : result.question;
        // Get context (everything after "CONTEXT:"), or the whole file when the marker is absent
        const contextStart = rawContent.indexOf("CONTEXT:");
        const context = contextStart !== -1
            ? rawContent.substring(contextStart + "CONTEXT:".length).trim()
            : rawContent;
        console.log(`[Q${result.questionNum}] Scoring...`);
        // Ask the scoring model to evaluate the context
        const scoringResult = await generateText({
            model: scoringModel,
            messages: [
                {
                    role: "user",
                    content: `You are evaluating the quality and usefulness of documentation context for a given question.

Question: ${question}

Context provided:
${context}

Rate how helpful and relevant this context is for answering the question on a scale of 1-10, where:
- 1-3: Poor - Missing critical information, irrelevant, or unhelpful
- 4-6: Adequate - Has some useful information but gaps exist
- 7-8: Good - Covers most needs with relevant examples
- 9-10: Excellent - Comprehensive, relevant, with clear examples

Respond with ONLY a JSON object in this format:
{"score": <number>, "reasoning": "<brief explanation>"}`,
                },
            ],
        });
        // Parse the score
        let parsed;
        try {
            parsed = parseScoreResponse(scoringResult.text);
        }
        catch (parseError) {
            console.log(`[Q${result.questionNum}] ⚠️ Error parsing score: ${parseError}`);
            result.score = 0;
            return;
        }
        if (parsed === null) {
            console.log(`[Q${result.questionNum}] ⚠️ Could not parse score, defaulting to 0`);
            result.score = 0;
            return;
        }
        result.score = parsed.score;
        console.log(`[Q${result.questionNum}] Score: ${parsed.score}/10 - ${parsed.reasoning.substring(0, 60)}...`);
    }
    catch (error) {
        console.error(`[Q${result.questionNum}] ❌ Error scoring:`, error);
    }
}

/**
 * Runs benchmarks by simulating questions from a questions file, scoring the
 * retrieved context with an LLM, and writing a result.md summary.
 *
 * Usage:
 * - pnpm run benchmark openai
 * - pnpm run benchmark claude
 * - pnpm run benchmark gemini
 * - pnpm run benchmark openai --test (run only first question)
 * - pnpm run benchmark claude 1 output-folder (questionset 1, custom output folder)
 * - pnpm run benchmark claude aa.txt output-folder (use aa.txt, custom output folder)
 *
 * Environment:
 * - BATCH_SIZE: number of questions processed concurrently (positive integer, default 7)
 *
 * Exits the process with status 1 when the questions file does not exist.
 *
 * @returns {Promise<void>}
 */
async function runBenchmark() {
    // Parse arguments: <model> [questions file | set number] [output folder] [--test]
    const args = process.argv.slice(2);
    const nonFlagArgs = args.filter((a) => !a.startsWith("--"));
    const modelArg = nonFlagArgs[0]?.toLowerCase() || "claude";
    const questionFileArg = nonFlagArgs[1] || null;
    const outputFolderName = nonFlagArgs[2] || null;
    const isTestMode = args.includes("--test");
    const { scoringModel, modelName } = selectScoringModel(modelArg);
    const questionsFileName = resolveQuestionsFileName(questionFileArg);
    console.log("=".repeat(80));
    console.log("Context7 MCP Benchmark");
    console.log("=".repeat(80));
    console.log(`Scoring Model: ${modelName}`);
    console.log(`Question File: ${questionsFileName}`);
    if (isTestMode) {
        console.log(`Mode: TEST (first question only)`);
    }
    console.log();
    // Read questions from the selected file (in src/benchmark/questions directory)
    const questionsPath = join(packageRoot, "src", "benchmark", "questions", questionsFileName);
    console.log(`Reading questions from: ${questionsPath}`);
    if (!existsSync(questionsPath)) {
        // Name the file that was actually requested, not always questions.txt
        console.error(`Error: ${questionsFileName} not found at ${questionsPath}`);
        process.exit(1);
    }
    let questions = readFileSync(questionsPath, "utf-8")
        .split("\n")
        .map((line) => line.trim())
        .filter((line) => line.length > 0 && !line.startsWith("#")); // Filter empty lines and comments
    // Limit to first question if in test mode
    if (isTestMode) {
        questions = questions.slice(0, 1);
        console.log(`Test mode: Running only first question`);
    }
    else {
        console.log(`Found ${questions.length} questions to benchmark`);
    }
    if (questions.length === 0) {
        console.warn("Warning: no questions to run; the report will contain no results");
    }
    console.log();
    // Create benchmark run directory with custom name or default naming
    let benchmarkRunDir;
    if (outputFolderName) {
        benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", outputFolderName);
    }
    else {
        // The git branch name is only needed for the default directory name
        const branchName = getCurrentBranch();
        const timestamp = new Date().toISOString().replace(/[:.]/g, "-").split("Z")[0];
        benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", `${branchName}-run-${timestamp}_${modelName.replace(/[.\s]/g, "-")}`);
    }
    mkdirSync(benchmarkRunDir, { recursive: true });
    console.log(`Benchmark results will be saved to: ${benchmarkRunDir}`);
    console.log();
    const results = [];
    // Run simulation for questions in batches (parallel processing)
    // BATCH_SIZE can be set via environment variable (e.g., BATCH_SIZE=1 for sequential)
    const startTime = Date.now();
    const BATCH_SIZE = resolveBatchSize(process.env.BATCH_SIZE);
    console.log(`Execution Mode: ${BATCH_SIZE === 1 ? "Sequential (1 question at a time)" : "Parallel (batch size: " + BATCH_SIZE + ")"}`);
    console.log();
    for (let batchStart = 0; batchStart < questions.length; batchStart += BATCH_SIZE) {
        const batchEnd = Math.min(batchStart + BATCH_SIZE, questions.length);
        const batch = questions.slice(batchStart, batchEnd);
        const batchNumber = Math.floor(batchStart / BATCH_SIZE) + 1;
        console.log("═".repeat(80));
        console.log(`Processing Batch ${batchNumber} (Questions ${batchStart + 1}-${batchEnd})`);
        console.log("═".repeat(80));
        console.log();
        // Process batch in parallel; failed questions resolve to null instead of rejecting
        const batchResults = await Promise.all(batch.map((question, batchIndex) => simulateQuestion(question, batchStart + batchIndex + 1, benchmarkRunDir)));
        const successful = batchResults.filter((result) => result !== null);
        results.push(...successful);
        console.log();
        console.log(`Batch ${batchNumber} completed: ${successful.length}/${batch.length} successful`);
        console.log();
    }
    const duration = Date.now() - startTime;
    // Scoring phase - processed in parallel batches of the same BATCH_SIZE
    console.log();
    console.log("=".repeat(80));
    console.log("Scoring Phase");
    console.log("=".repeat(80));
    console.log(`Using ${modelName} to score context quality...`);
    console.log();
    for (let batchStart = 0; batchStart < results.length; batchStart += BATCH_SIZE) {
        const batchEnd = Math.min(batchStart + BATCH_SIZE, results.length);
        const batchNumber = Math.floor(batchStart / BATCH_SIZE) + 1;
        console.log(`Scoring Batch ${batchNumber} (Questions ${batchStart + 1}-${batchEnd})`);
        // scoreResult handles its own errors, so one failure never rejects the batch
        await Promise.all(results.slice(batchStart, batchEnd).map((result) => scoreResult(result, benchmarkRunDir, scoringModel)));
        console.log(`Scoring Batch ${batchNumber} completed: ${batchEnd - batchStart} questions`);
        console.log();
    }
    // Calculate averages (0 rather than NaN when no question succeeded)
    const avgToolCount = averageOf(results, (r) => r.toolCount);
    const avgTokenCount = averageOf(results, (r) => r.tokenCount);
    const avgTotalTokens = averageOf(results, (r) => r.totalTokens);
    const avgScore = averageOf(results, (r) => r.score);
    // Generate result.md
    console.log("Generating result.md...");
    let resultMd = `# Benchmark Results\n\n`;
    resultMd += `**Scoring Model**: ${modelName}\n`;
    resultMd += `**Date**: ${new Date().toISOString()}\n`;
    resultMd += `**Total Questions**: ${results.length}\n`;
    resultMd += `**Total Duration**: ${(duration / 1000).toFixed(2)}s\n\n`;
    resultMd += `## Averages\n\n`;
    resultMd += `| Metric | Value |\n`;
    resultMd += `|--------|-------|\n`;
    resultMd += `| Average Tool Calls | ${avgToolCount.toFixed(2)} |\n`;
    resultMd += `| Average Token Count | ${avgTokenCount.toFixed(0)} |\n`;
    resultMd += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} |\n`;
    resultMd += `| Average Score | ${avgScore.toFixed(2)}/10 |\n\n`;
    resultMd += `## Results by Question\n\n`;
    for (const result of results) {
        resultMd += `### Q${result.questionNum}: ${result.question}\n\n`;
        resultMd += `| Metric | Value |\n`;
        resultMd += `|--------|-------|\n`;
        resultMd += `| Tool Calls | ${result.toolCount} |\n`;
        resultMd += `| Token Count | ${result.tokenCount} |\n`;
        resultMd += `| Total Tokens (API) | ${result.totalTokens} |\n`;
        resultMd += `| LLM Score | ${result.score}/10 |\n\n`;
    }
    const resultPath = join(benchmarkRunDir, "result.md");
    writeFileSync(resultPath, resultMd);
    console.log(`✅ Results saved to: ${resultPath}`);
    console.log();
    // Summary
    const secondsPerQuestion = questions.length > 0 ? duration / questions.length / 1000 : 0;
    console.log("=".repeat(80));
    console.log("Benchmark Complete");
    console.log("=".repeat(80));
    console.log(`Scoring Model: ${modelName}`);
    console.log(`Total questions: ${questions.length}`);
    console.log(`Total time: ${(duration / 1000).toFixed(2)}s`);
    console.log(`Average time per question: ${secondsPerQuestion.toFixed(2)}s`);
    console.log();
    console.log(`📊 Scoring Results:`);
    console.log(` - Average Tool Calls: ${avgToolCount.toFixed(2)}`);
    console.log(` - Average Token Count: ${avgTokenCount.toFixed(0)}`);
    console.log(` - Average Total Tokens (API): ${avgTotalTokens.toFixed(0)}`);
    console.log(` - Average Score: ${avgScore.toFixed(2)}/10`);
    console.log();
    console.log(`Results saved to: ${benchmarkRunDir}`);
    console.log("=".repeat(80));
}
343
// Script entry point: start the benchmark; a rejected run is reported on
// stderr and the process terminates with exit status 1.
runBenchmark().catch(function reportFatalError(fatalError) {
    console.error("Fatal error:", fatalError);
    process.exit(1);
});
@@ -1,289 +0,0 @@
1
- import "dotenv/config";
2
- import { readFileSync, readdirSync, writeFileSync, existsSync } from "fs";
3
- import { join, dirname } from "path";
4
- import { fileURLToPath } from "url";
5
- import { generateText } from "ai";
6
- import { anthropic } from "@ai-sdk/anthropic";
7
// ES modules have no built-in __filename/__dirname, so derive both from import.meta.url.
const __filename = fileURLToPath(new URL(import.meta.url));
const __dirname = dirname(__filename);
// The package root is the directory two levels above this module's directory.
const packageRoot = join(__dirname, "..", "..");
// Directory under the package root that holds the archived benchmark run folders.
const ARCHIVE_PATH = join(packageRoot, "src", "benchmark", "reports", "benchmarks", "archive");
// Question-set numbers to compare: 1 through 8 inclusive.
const QUESTION_SETS = Array.from({ length: 8 }, (_, index) => index + 1);
14
/** Number of question comparisons sent to the judge model concurrently. */
const COMPARISON_BATCH_SIZE = 5;

/**
 * Return the text following the first "CONTEXT:" marker, trimmed, or the whole
 * content unchanged when the marker is absent.
 *
 * @param {string} content - Contents of a q<N>_raw.md file
 * @returns {string} The context portion of the file
 */
function extractContext(content) {
    const contextStart = content.indexOf("CONTEXT:");
    if (contextStart === -1) {
        return content;
    }
    return content.substring(contextStart + "CONTEXT:".length).trim();
}

/**
 * Collect the question numbers of all q<N>_raw.md files in a folder.
 *
 * @param {string} folder - Directory to scan
 * @returns {Set<number>} Question numbers found
 */
function readQuestionNumbers(folder) {
    const questionNumbers = new Set();
    for (const fileName of readdirSync(folder)) {
        const match = fileName.match(/^q(\d+)_raw\.md$/);
        if (match) {
            questionNumbers.add(Number.parseInt(match[1], 10));
        }
    }
    return questionNumbers;
}

/**
 * Make arbitrary text safe for a single markdown table cell: line breaks would
 * end the row and unescaped pipes would add columns.
 *
 * @param {unknown} text - Value to render; null/undefined become an empty string
 * @returns {string} Single-line text with "|" escaped
 */
function escapeTableCell(text) {
    return String(text ?? "")
        .replace(/[\r\n]+/g, " ")
        .replace(/\|/g, "\\|");
}

/**
 * Ask the judge model which of two contexts better answers one question.
 *
 * Every failure (unreadable file, model error, unparsable reply) is reported
 * and recorded as a tie, so a single question never aborts the whole run.
 *
 * @param {unknown} model - Model handle passed to generateText
 * @param {number} questionSet - Question-set number the question belongs to
 * @param {number} questionNum - Question number within the set
 * @param {string} folderA - Folder containing run A's q<N>_raw.md files
 * @param {string} folderB - Folder containing run B's q<N>_raw.md files
 * @returns {Promise<{questionSet: number, questionNum: number, question: string, winner: "A" | "B" | "tie", reasoning: string}>}
 */
async function compareQuestion(model, questionSet, questionNum, folderA, folderB) {
    // Placeholder used until (and unless) the question text is read from file A
    let question = `Question ${questionNum}`;
    try {
        const contentA = readFileSync(join(folderA, `q${questionNum}_raw.md`), "utf-8");
        const contentB = readFileSync(join(folderB, `q${questionNum}_raw.md`), "utf-8");
        // Extract question from the content
        const questionMatch = contentA.match(/QUESTION:\s*(.+?)(?:\n|$)/);
        if (questionMatch) {
            question = questionMatch[1].trim();
        }
        // Extract just the context part (after CONTEXT:)
        const contextA = extractContext(contentA);
        const contextB = extractContext(contentB);
        console.log(` [Q${questionNum}] Comparing: ${question.substring(0, 45)}...`);
        const result = await generateText({
            model,
            messages: [
                {
                    role: "user",
                    content: `You are a technical documentation expert evaluating which context is more helpful for answering a programming question.

QUESTION: ${question}

=== ANSWER A ===
${contextA}

=== ANSWER B ===
${contextB}

Compare these two answers and determine which one is better for helping a developer answer the question. Consider:
1. Relevance - Does it directly address the question?
2. Code examples - Are there working, relevant code snippets?
3. Completeness - Does it cover the key aspects needed?
4. Clarity - Is the information well-organized and easy to understand?
5. Accuracy - Does it appear technically correct?

Respond with ONLY a JSON object in this format:
{"winner": "A" | "B" | "tie", "reasoning": "<brief 1-2 sentence explanation>"}

If both are roughly equal in quality, respond with "tie".`,
                },
            ],
        });
        // Parse the result
        const jsonMatch = result.text.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
            console.log(` [Q${questionNum}] ⚠️ Could not parse result, marking as tie`);
            return {
                questionSet,
                questionNum,
                question,
                winner: "tie",
                reasoning: "Failed to parse LLM response",
            };
        }
        const parsed = JSON.parse(jsonMatch[0]);
        const rawWinner = String(parsed.winner).toUpperCase();
        const winner = rawWinner === "A" ? "A" : rawWinner === "B" ? "B" : "tie";
        console.log(` [Q${questionNum}] Winner: ${winner}`);
        return {
            questionSet,
            questionNum,
            question,
            winner,
            // Always store a string so report generation cannot fail on a missing reasoning
            reasoning: String(parsed.reasoning ?? ""),
        };
    }
    catch (error) {
        console.error(` [Q${questionNum}] ❌ Error:`, error);
        return {
            questionSet,
            questionNum,
            question,
            winner: "tie",
            reasoning: `Error during comparison: ${error}`,
        };
    }
}

/**
 * Render the markdown comparison report.
 *
 * @param {string} prefixA - Folder prefix of run A
 * @param {string} prefixB - Folder prefix of run B
 * @param {{ winsA: number, winsB: number, ties: number }} totals - Aggregate counts
 * @param {Array<{questionSet: number, winsA: number, winsB: number, ties: number, results: object[]}>} questionSetResults - Per-set outcomes
 * @returns {string} Report contents
 */
function buildComparisonReport(prefixA, prefixB, totals, questionSetResults) {
    let report = `# Benchmark Comparison Results\n\n`;
    report += `**Date**: ${new Date().toISOString()}\n`;
    report += `**Judge**: Claude Sonnet 4.5\n\n`;
    report += `## Prefixes Compared\n\n`;
    report += `- **Prefix A**: ${prefixA}\n`;
    report += `- **Prefix B**: ${prefixB}\n\n`;
    report += `## Aggregate Summary\n\n`;
    report += `| Prefix | Wins |\n`;
    report += `|--------|------|\n`;
    report += `| ${prefixA} | ${totals.winsA} |\n`;
    report += `| ${prefixB} | ${totals.winsB} |\n`;
    report += `| Ties | ${totals.ties} |\n\n`;
    if (totals.winsA > totals.winsB) {
        report += `**Winner**: ${prefixA}\n\n`;
    }
    else if (totals.winsB > totals.winsA) {
        report += `**Winner**: ${prefixB}\n\n`;
    }
    else {
        report += `**Result**: TIE\n\n`;
    }
    report += `## Results by Question Set\n\n`;
    report += `| Question Set | A Wins | B Wins | Ties |\n`;
    report += `|--------------|--------|--------|------|\n`;
    for (const qsr of questionSetResults) {
        report += `| questions${qsr.questionSet} | ${qsr.winsA} | ${qsr.winsB} | ${qsr.ties} |\n`;
    }
    report += `| **Total** | **${totals.winsA}** | **${totals.winsB}** | **${totals.ties}** |\n\n`;
    report += `## Detailed Results\n\n`;
    for (const qsr of questionSetResults) {
        report += `### Question Set ${qsr.questionSet}\n\n`;
        report += `| Q# | Question | Winner | Reasoning |\n`;
        report += `|----|----------|--------|----------|\n`;
        for (const r of qsr.results) {
            const shortQuestion = r.question.length > 35 ? r.question.substring(0, 35) + "..." : r.question;
            const winnerLabel = r.winner === "A" ? "A" : r.winner === "B" ? "B" : "Tie";
            report += `| ${r.questionNum} | ${escapeTableCell(shortQuestion)} | ${winnerLabel} | ${escapeTableCell(r.reasoning)} |\n`;
        }
        report += `\n`;
    }
    return report;
}

/**
 * Compare two benchmark runs across all question sets
 *
 * Usage:
 * pnpm run compare-benchmark <prefix-a> <prefix-b>
 *
 * Example:
 * pnpm run compare-benchmark CTX7-943-run-3 single-params-run-0
 *
 * This will compare folders:
 * {prefix-a}-file-questions{1-8}-model-claude vs {prefix-b}-file-questions{1-8}-model-claude
 *
 * Question sets missing a folder on either side, or having no question in
 * common, are skipped. The markdown report is written into the archive
 * directory as comparison-<prefix-a>-vs-<prefix-b>.md. Exits the process with
 * status 1 when fewer than two prefixes are given.
 *
 * @returns {Promise<void>}
 */
async function compareBenchmarks() {
    const args = process.argv.slice(2);
    if (args.length < 2) {
        console.error("Usage: pnpm run compare-benchmark <prefix-a> <prefix-b>");
        console.error("Example: pnpm run compare-benchmark CTX7-943-run-3 single-params-run-0");
        console.error("");
        console.error("This compares folders matching pattern: {prefix}-file-questions{1-8}-model-claude");
        process.exit(1);
    }
    const [prefixA, prefixB] = args;
    console.log("=".repeat(80));
    console.log("Context7 Benchmark Comparison (All Question Sets)");
    console.log("=".repeat(80));
    console.log(`Prefix A: ${prefixA}`);
    console.log(`Prefix B: ${prefixB}`);
    console.log(`Judge: Claude Sonnet 4.5`);
    console.log(`Archive Path: ${ARCHIVE_PATH}`);
    console.log();
    const model = anthropic("claude-sonnet-4-5");
    const questionSetResults = [];
    // Aggregate counters
    const totals = { winsA: 0, winsB: 0, ties: 0 };
    // Process each question set
    for (const questionSet of QUESTION_SETS) {
        const folderNameA = `${prefixA}-file-questions${questionSet}-model-claude`;
        const folderNameB = `${prefixB}-file-questions${questionSet}-model-claude`;
        const folderA = join(ARCHIVE_PATH, folderNameA);
        const folderB = join(ARCHIVE_PATH, folderNameB);
        console.log("═".repeat(80));
        console.log(`Question Set ${questionSet}`);
        console.log("═".repeat(80));
        // Check if both folders exist
        if (!existsSync(folderA)) {
            console.log(`⚠️ Skipping: Folder A not found: ${folderNameA}`);
            console.log();
            continue;
        }
        if (!existsSync(folderB)) {
            console.log(`⚠️ Skipping: Folder B not found: ${folderNameB}`);
            console.log();
            continue;
        }
        // Only questions present in both folders can be compared
        const questionsA = readQuestionNumbers(folderA);
        const questionsB = readQuestionNumbers(folderB);
        const commonQuestions = [...questionsA].filter((q) => questionsB.has(q)).sort((a, b) => a - b);
        if (commonQuestions.length === 0) {
            console.log(`⚠️ Skipping: No common questions found`);
            console.log();
            continue;
        }
        console.log(`Found ${commonQuestions.length} common questions`);
        const setResults = [];
        // Process questions in batches for parallel execution
        for (let batchStart = 0; batchStart < commonQuestions.length; batchStart += COMPARISON_BATCH_SIZE) {
            const batch = commonQuestions.slice(batchStart, batchStart + COMPARISON_BATCH_SIZE);
            const batchResults = await Promise.all(batch.map((questionNum) => compareQuestion(model, questionSet, questionNum, folderA, folderB)));
            setResults.push(...batchResults);
        }
        // Tally the set; anything that is neither "A" nor "B" counts as a tie
        const setWinsA = setResults.filter((r) => r.winner === "A").length;
        const setWinsB = setResults.filter((r) => r.winner === "B").length;
        const setTies = setResults.length - setWinsA - setWinsB;
        // Store set results
        questionSetResults.push({
            questionSet,
            winsA: setWinsA,
            winsB: setWinsB,
            ties: setTies,
            results: setResults,
        });
        // Update totals
        totals.winsA += setWinsA;
        totals.winsB += setWinsB;
        totals.ties += setTies;
        console.log(` Summary: A=${setWinsA}, B=${setWinsB}, Ties=${setTies}`);
        console.log();
    }
    // Generate final summary
    console.log("=".repeat(80));
    console.log("COMPARISON RESULTS");
    console.log("=".repeat(80));
    console.log();
    console.log("📊 Results by Question Set:");
    for (const qsr of questionSetResults) {
        console.log(` questions${qsr.questionSet}: A=${qsr.winsA}, B=${qsr.winsB}, Ties=${qsr.ties}`);
    }
    console.log();
    console.log("📊 Aggregate Score:");
    console.log(` ${prefixA}`);
    console.log(` → ${totals.winsA} wins`);
    console.log();
    console.log(` ${prefixB}`);
    console.log(` → ${totals.winsB} wins`);
    console.log();
    console.log(` Ties: ${totals.ties}`);
    console.log();
    // Determine overall winner
    if (totals.winsA > totals.winsB) {
        console.log(`🏆 Winner: ${prefixA}`);
    }
    else if (totals.winsB > totals.winsA) {
        console.log(`🏆 Winner: ${prefixB}`);
    }
    else {
        console.log(`🤝 Result: TIE`);
    }
    console.log("=".repeat(80));
    // Generate and save the markdown report
    const report = buildComparisonReport(prefixA, prefixB, totals, questionSetResults);
    const reportPath = join(ARCHIVE_PATH, `comparison-${prefixA}-vs-${prefixB}.md`);
    writeFileSync(reportPath, report);
    console.log();
    console.log(`📄 Report saved to: ${reportPath}`);
}
285
// Script entry point: start the comparison; a rejected run is reported on
// stderr and the process terminates with exit status 1.
compareBenchmarks().catch(function reportFatalError(fatalError) {
    console.error("Fatal error:", fatalError);
    process.exit(1);
});