@upstash/context7-mcp 1.0.33 → 1.0.34-canary.0
This diff shows the changes between publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
- package/dist/benchmark/benchmark.js +347 -0
- package/dist/benchmark/compare-benchmark.js +289 -0
- package/dist/benchmark/run-benchmark.js +459 -0
- package/dist/benchmark/simulate.js +319 -0
- package/dist/index.js +33 -131
- package/dist/lib/api.js +19 -80
- package/dist/lib/types.js +1 -7
- package/package.json +10 -2
package/dist/benchmark/run-benchmark.js
@@ -0,0 +1,459 @@
+#!/usr/bin/env tsx
+import "dotenv/config";
+import { execSync } from "child_process";
+import { existsSync, mkdirSync, readdirSync, renameSync, readFileSync, writeFileSync } from "fs";
+import { join } from "path";
+import { fileURLToPath } from "url";
+import { dirname } from "path";
+// Check for required environment variables
+if (!process.env.CONTEXT7_API_KEY) {
+    console.error("Error: CONTEXT7_API_KEY environment variable is required");
+    console.error("Set it in your .env file or export it in your shell");
+    process.exit(1);
+}
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+// Package root is two levels up from dist/benchmark/
+const packageRoot = join(__dirname, "..", "..");
+/**
+ * Get the current git branch name
+ * @returns The branch name or "unknown" if not in a git repo
+ */
+function getCurrentBranch() {
+    try {
+        const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim();
+        return branch;
+    }
+    catch {
+        return "unknown";
+    }
+}
+/**
+ * Find all run reports for a given branch and model
+ * @param benchmarksDir The benchmarks directory path
+ * @param branchName The branch name
+ * @param model The model name
+ * @returns Array of run report filenames with their run IDs
+ */
+function findExistingRuns(benchmarksDir, branchName, model) {
+    if (!existsSync(benchmarksDir)) {
+        return [];
+    }
+    const files = readdirSync(benchmarksDir);
+    const pattern = new RegExp(`^${branchName}-run-(\\d+)-model-${model}\\.md$`);
+    const runs = [];
+    for (const file of files) {
+        const match = file.match(pattern);
+        if (match) {
+            runs.push({
+                filename: file,
+                runId: match[1],
+            });
+        }
+    }
+    // Sort by run ID (numeric)
+    runs.sort((a, b) => parseInt(a.runId, 10) - parseInt(b.runId, 10));
+    return runs;
+}
+/**
+ * Parse a run report to extract key metrics
+ * @param reportPath Path to the report file
+ * @returns Parsed data or null if parsing fails
+ */
+function parseRunReport(reportPath) {
+    try {
+        const content = readFileSync(reportPath, "utf-8");
+        const dateMatch = content.match(/\*\*Date\*\*: (.+)/);
+        const branchMatch = content.match(/\*\*Branch\*\*: (.+)/);
+        const runIdMatch = content.match(/\*\*Run ID\*\*: (.+)/);
+        const modelMatch = content.match(/\*\*Model\*\*: (.+)/);
+        const questionSetsMatch = content.match(/\*\*Question Sets\*\*: (\d+)/);
+        const totalQuestionsMatch = content.match(/\*\*Total Questions\*\*: (\d+)/);
+        const durationMatch = content.match(/\*\*Total Duration\*\*: ([\d.]+)s/);
+        const toolCallsMatch = content.match(/\| Average Tool Calls \| ([\d.]+) \|/);
+        const tokenCountMatch = content.match(/\| Average Token Count \| ([\d.]+) \|/);
+        const totalTokensMatch = content.match(/\| Average Total Tokens \(API\) \| ([\d.]+) \|/);
+        const scoreMatch = content.match(/\| Average Score \| \*\*([\d.]+)\/10\*\* \|/);
+        if (!dateMatch ||
+            !branchMatch ||
+            !runIdMatch ||
+            !modelMatch ||
+            !questionSetsMatch ||
+            !totalQuestionsMatch ||
+            !durationMatch ||
+            !toolCallsMatch ||
+            !tokenCountMatch ||
+            !scoreMatch) {
+            return null;
+        }
+        return {
+            date: dateMatch[1],
+            branchName: branchMatch[1],
+            runId: runIdMatch[1],
+            model: modelMatch[1],
+            questionSets: parseInt(questionSetsMatch[1], 10),
+            totalQuestions: parseInt(totalQuestionsMatch[1], 10),
+            totalDuration: parseFloat(durationMatch[1]),
+            avgToolCalls: parseFloat(toolCallsMatch[1]),
+            avgTokenCount: parseFloat(tokenCountMatch[1]),
+            avgTotalTokens: totalTokensMatch ? parseFloat(totalTokensMatch[1]) : 0,
+            avgScore: parseFloat(scoreMatch[1]),
+        };
+    }
+    catch (error) {
+        console.error(`Error parsing report ${reportPath}:`, error);
+        return null;
+    }
+}
+/**
+ * Generate an aggregating summary report from multiple runs
+ * @param benchmarksDir The benchmarks directory path
+ * @param branchName The branch name
+ * @param model The model name
+ * @param runs The list of runs to aggregate
+ * @returns The summary markdown content
+ */
+function generateAggregateSummary(benchmarksDir, branchName, model, runs) {
+    const runData = runs
+        .map((run) => parseRunReport(join(benchmarksDir, run.filename)))
+        .filter((data) => data !== null);
+    if (runData.length === 0) {
+        return "";
+    }
+    // Calculate aggregate statistics
+    const totalRuns = runData.length;
+    const avgToolCalls = runData.reduce((sum, d) => sum + d.avgToolCalls, 0) / totalRuns;
+    const avgTokenCount = runData.reduce((sum, d) => sum + d.avgTokenCount, 0) / totalRuns;
+    const avgTotalTokens = runData.reduce((sum, d) => sum + d.avgTotalTokens, 0) / totalRuns;
+    const avgScore = runData.reduce((sum, d) => sum + d.avgScore, 0) / totalRuns;
+    const totalDuration = runData.reduce((sum, d) => sum + d.totalDuration, 0);
+    const totalQuestions = runData.reduce((sum, d) => sum + d.totalQuestions, 0);
+    // Calculate standard deviations
+    const stdDevScore = Math.sqrt(runData.reduce((sum, d) => sum + Math.pow(d.avgScore - avgScore, 2), 0) / totalRuns);
+    const stdDevToolCalls = Math.sqrt(runData.reduce((sum, d) => sum + Math.pow(d.avgToolCalls - avgToolCalls, 2), 0) / totalRuns);
+    const stdDevTokenCount = Math.sqrt(runData.reduce((sum, d) => sum + Math.pow(d.avgTokenCount - avgTokenCount, 2), 0) / totalRuns);
+    const stdDevTotalTokens = Math.sqrt(runData.reduce((sum, d) => sum + Math.pow(d.avgTotalTokens - avgTotalTokens, 2), 0) / totalRuns);
+    // Get min/max scores
+    const minScore = Math.min(...runData.map((d) => d.avgScore));
+    const maxScore = Math.max(...runData.map((d) => d.avgScore));
+    // Generate markdown
+    const latestDate = runData[runData.length - 1].date;
+    let md = `# Aggregate Summary Report\n\n`;
+    md += `**Branch**: ${branchName}\n`;
+    md += `**Model**: ${model}\n`;
+    md += `**Total Runs**: ${totalRuns}\n`;
+    md += `**Last Updated**: ${latestDate}\n`;
+    md += `**Total Questions Across All Runs**: ${totalQuestions}\n`;
+    md += `**Total Duration Across All Runs**: ${totalDuration.toFixed(2)}s (${(totalDuration / 60).toFixed(1)}m)\n\n`;
+    md += `## Overall Statistics (Averaged Across ${totalRuns} Runs)\n\n`;
+    md += `| Metric | Mean | Std Dev | Min | Max |\n`;
+    md += `|--------|------|---------|-----|-----|\n`;
+    md += `| Average Score | **${avgScore.toFixed(2)}/10** | ${stdDevScore.toFixed(2)} | ${minScore.toFixed(2)} | ${maxScore.toFixed(2)} |\n`;
+    md += `| Average Tool Calls | ${avgToolCalls.toFixed(2)} | ${stdDevToolCalls.toFixed(2)} | ${Math.min(...runData.map((d) => d.avgToolCalls)).toFixed(2)} | ${Math.max(...runData.map((d) => d.avgToolCalls)).toFixed(2)} |\n`;
+    md += `| Average Token Count | ${avgTokenCount.toFixed(0)} | ${stdDevTokenCount.toFixed(0)} | ${Math.min(...runData.map((d) => d.avgTokenCount)).toFixed(0)} | ${Math.max(...runData.map((d) => d.avgTokenCount)).toFixed(0)} |\n`;
+    md += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} | ${stdDevTotalTokens.toFixed(0)} | ${Math.min(...runData.map((d) => d.avgTotalTokens)).toFixed(0)} | ${Math.max(...runData.map((d) => d.avgTotalTokens)).toFixed(0)} |\n\n`;
+    md += `## Individual Run Results\n\n`;
+    md += `| Run ID | Date | Avg Score | Avg Tool Calls | Avg Token Count | Avg Total Tokens (API) | Questions | Duration |\n`;
+    md += `|--------|------|-----------|----------------|-----------------|------------------------|-----------|----------|\n`;
+    runData.forEach((data) => {
+        md += `| ${data.runId} | ${data.date} | ${data.avgScore.toFixed(2)}/10 | ${data.avgToolCalls.toFixed(2)} | ${data.avgTokenCount.toFixed(0)} | ${data.avgTotalTokens.toFixed(0)} | ${data.totalQuestions} | ${data.totalDuration.toFixed(2)}s |\n`;
+    });
+    md += `\n## Run Reports\n\n`;
+    runs.forEach((run) => {
+        md += `- [Run ${run.runId}](${run.filename})\n`;
+    });
+    return md;
+}
+/**
+ * Finds all question files in the benchmark directory
+ * @param packageRoot The package root directory
+ * @returns Array of question file numbers
+ */
+function findAllQuestionFiles(packageRoot) {
+    const benchmarkDir = join(packageRoot, "src", "benchmark", "questions");
+    if (!existsSync(benchmarkDir)) {
+        return [];
+    }
+    const files = readdirSync(benchmarkDir);
+    const questionNumbers = [];
+    const pattern = /^questions(\d+)\.txt$/;
+    for (const file of files) {
+        const match = file.match(pattern);
+        if (match) {
+            questionNumbers.push(parseInt(match[1], 10));
+        }
+    }
+    // Sort numerically
+    questionNumbers.sort((a, b) => a - b);
+    return questionNumbers;
+}
+/**
+ * Runs multiple benchmarks in parallel with different question sets
+ *
+ * Usage:
+ * - pnpm run run-benchmarks claude
+ *   Runs all available questions files (questions1.txt, questions2.txt, etc.) with auto-detected next run ID
+ * - pnpm run run-benchmarks claude 7
+ *   Runs only questions7.txt with auto-detected next run ID
+ * - pnpm run run-benchmarks claude aa.txt
+ *   Runs only aa.txt with auto-detected next run ID
+ * - pnpm run run-benchmarks claude 7 0
+ *   Runs only questions7.txt with run ID 0
+ */
+async function runBenchmarks() {
+    const args = process.argv.slice(2);
+    const model = args[0] || "claude";
+    // Determine which question files to run
+    let questionFiles;
+    let runId = args[2];
+    const benchmarkDir = join(packageRoot, "src", "benchmark", "questions");
+    if (args.length === 1 || args[1] === undefined) {
+        // No second argument - run all question files
+        const questionSets = findAllQuestionFiles(packageRoot);
+        if (questionSets.length === 0) {
+            console.error("No question files found in src/benchmark/questions/");
+            process.exit(1);
+        }
+        questionFiles = questionSets.map((num) => `questions${num}.txt`);
+        console.log(`Found ${questionFiles.length} question files: ${questionFiles.join(", ")}`);
+    }
+    else {
+        const secondArg = args[1];
+        // Check if it's a filename (contains .txt) or a number
+        if (secondArg.endsWith(".txt")) {
+            // Filename provided
+            const filePath = join(benchmarkDir, secondArg);
+            if (!existsSync(filePath)) {
+                console.error(`Error: ${secondArg} not found at ${filePath}`);
+                process.exit(1);
+            }
+            questionFiles = [secondArg];
+            console.log(`Running with file: ${secondArg}`);
+        }
+        else {
+            // Number provided - run that specific question set
+            const questionSetNum = parseInt(secondArg, 10);
+            if (isNaN(questionSetNum) || questionSetNum < 1) {
+                console.error("Question set number must be a positive number");
+                process.exit(1);
+            }
+            questionFiles = [`questions${questionSetNum}.txt`];
+        }
+    }
+    const count = questionFiles.length;
+    // Get current git branch
+    const branchName = getCurrentBranch();
+    // Auto-detect next run ID if not provided
+    if (!runId) {
+        const benchmarksDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks");
+        // Create benchmarks directory if it doesn't exist
+        if (!existsSync(benchmarksDir)) {
+            mkdirSync(benchmarksDir, { recursive: true });
+        }
+        const existingRuns = findExistingRuns(benchmarksDir, branchName, model);
+        if (existingRuns.length === 0) {
+            runId = "0";
+            console.log(`No existing runs found. Starting with run ID: ${runId}`);
+        }
+        else {
+            const maxRunId = Math.max(...existingRuns.map((r) => parseInt(r.runId, 10)));
+            runId = (maxRunId + 1).toString();
+            console.log(`Found ${existingRuns.length} existing runs (${existingRuns.map((r) => r.runId).join(", ")})`);
+            console.log(`Next run ID: ${runId}`);
+        }
+        console.log();
+    }
+    console.log("=".repeat(80));
+    console.log("Context7 MCP Parallel Benchmark Runner");
+    console.log("=".repeat(80));
+    console.log(`Model: ${model}`);
+    console.log(`Branch: ${branchName}`);
+    console.log(`Run ID: ${runId}`);
+    console.log(`Question Files: ${count} (${questionFiles.join(", ")})`);
+    console.log();
+    // Validate that all question files exist
+    for (const questionFile of questionFiles) {
+        const questionsPath = join(benchmarkDir, questionFile);
+        if (!existsSync(questionsPath)) {
+            console.error(`Error: ${questionFile} not found at ${questionsPath}`);
+            process.exit(1);
+        }
+    }
+    // Create output folder names
+    const outputFolders = [];
+    for (const questionFile of questionFiles) {
+        // Remove .txt extension for folder name
+        const baseName = questionFile.replace(".txt", "");
+        const folderName = `${branchName}-run-${runId}-file-${baseName}-model-${model}`;
+        outputFolders.push(folderName);
+    }
+    console.log("Output folders:");
+    outputFolders.forEach((folder, i) => {
+        console.log(`  [${i + 1}] ${folder}`);
+    });
+    console.log();
+    // Run benchmarks in parallel
+    console.log("=".repeat(80));
+    console.log("Starting parallel benchmarks...");
+    console.log("=".repeat(80));
+    console.log();
+    const startTime = Date.now();
+    try {
+        // Run all benchmarks in parallel using Promise.all
+        const benchmarkPromises = [];
+        for (let i = 0; i < questionFiles.length; i++) {
+            const questionFile = questionFiles[i];
+            const outputFolder = outputFolders[i];
+            console.log(`[${questionFile}] Starting benchmark with ${questionFile}...`);
+            const promise = new Promise((resolve, reject) => {
+                try {
+                    execSync(`node dist/benchmark/benchmark.js ${model} ${questionFile} ${outputFolder}`, {
+                        stdio: "inherit",
+                        encoding: "utf-8",
+                        cwd: packageRoot,
+                    });
+                    console.log(`[${questionFile}] ✅ Completed`);
+                    resolve();
+                }
+                catch (error) {
+                    console.error(`[${questionFile}] ❌ Failed:`, error);
+                    reject(error);
+                }
+            });
+            benchmarkPromises.push(promise);
+        }
+        // Wait for all benchmarks to complete
+        await Promise.all(benchmarkPromises);
+        const duration = Date.now() - startTime;
+        console.log();
+        console.log("=".repeat(80));
+        console.log("All benchmarks completed!");
+        console.log(`Total time: ${(duration / 1000).toFixed(2)}s`);
+        console.log("=".repeat(80));
+        console.log();
+        // Prepare summary by combining results
+        console.log("Preparing summary...");
+        console.log();
+        // Use combine-summaries to create a combined summary
+        const benchmarksDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks");
+        // Collect all result.md files from the output folders
+        const summaryData = [];
+        for (let i = 0; i < questionFiles.length; i++) {
+            const questionFile = questionFiles[i];
+            const folderPath = join(benchmarksDir, outputFolders[i]);
+            const resultPath = join(folderPath, "result.md");
+            if (existsSync(resultPath)) {
+                // Parse result.md to extract averages
+                const resultContent = readFileSync(resultPath, "utf-8");
+                const toolCallsMatch = resultContent.match(/Average Tool Calls \| ([\d.]+)/);
+                const tokenCountMatch = resultContent.match(/Average Token Count \| ([\d.]+)/);
+                const totalTokensMatch = resultContent.match(/Average Total Tokens \(API\) \| ([\d.]+)/);
+                const scoreMatch = resultContent.match(/Average Score \| ([\d.]+)\/10/);
+                const questionsMatch = resultContent.match(/Total Questions\*\*: (\d+)/);
+                const durationMatch = resultContent.match(/Total Duration\*\*: ([\d.]+)s/);
+                summaryData.push({
+                    questionFile: questionFile,
+                    folder: outputFolders[i],
+                    avgToolCalls: toolCallsMatch ? parseFloat(toolCallsMatch[1]) : 0,
+                    avgTokenCount: tokenCountMatch ? parseFloat(tokenCountMatch[1]) : 0,
+                    avgTotalTokens: totalTokensMatch ? parseFloat(totalTokensMatch[1]) : 0,
+                    avgScore: scoreMatch ? parseFloat(scoreMatch[1]) : 0,
+                    totalQuestions: questionsMatch ? parseInt(questionsMatch[1], 10) : 0,
+                    duration: durationMatch ? parseFloat(durationMatch[1]) : 0,
+                });
+            }
+        }
+        // Calculate overall averages
+        const totalQuestions = summaryData.reduce((sum, d) => sum + d.totalQuestions, 0);
+        const avgToolCalls = summaryData.reduce((sum, d) => sum + d.avgToolCalls, 0) / summaryData.length;
+        const avgTokenCount = summaryData.reduce((sum, d) => sum + d.avgTokenCount, 0) / summaryData.length;
+        const avgTotalTokens = summaryData.reduce((sum, d) => sum + d.avgTotalTokens, 0) / summaryData.length;
+        const avgScore = summaryData.reduce((sum, d) => sum + d.avgScore, 0) / summaryData.length;
+        const totalDuration = summaryData.reduce((sum, d) => sum + d.duration, 0);
+        // Generate combined summary markdown
+        const date = new Date().toISOString().split("T")[0];
+        let summaryMd = `# Combined Benchmark Results\n\n`;
+        summaryMd += `**Date**: ${date}\n`;
+        summaryMd += `**Branch**: ${branchName}\n`;
+        summaryMd += `**Run ID**: ${runId}\n`;
+        summaryMd += `**Model**: ${model}\n`;
+        summaryMd += `**Question Files**: ${count}\n`;
+        summaryMd += `**Total Questions**: ${totalQuestions}\n`;
+        summaryMd += `**Total Duration**: ${totalDuration.toFixed(2)}s (${(totalDuration / 60).toFixed(1)}m)\n\n`;
+        summaryMd += `## Overall Averages\n\n`;
+        summaryMd += `| Metric | Value |\n`;
+        summaryMd += `|--------|-------|\n`;
+        summaryMd += `| Average Tool Calls | ${avgToolCalls.toFixed(2)} |\n`;
+        summaryMd += `| Average Token Count | ${avgTokenCount.toFixed(0)} |\n`;
+        summaryMd += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} |\n`;
+        summaryMd += `| Average Score | **${avgScore.toFixed(2)}/10** |\n\n`;
+        summaryMd += `## Results by Question File\n\n`;
+        summaryMd += `| Question File | Avg Tool Calls | Avg Token Count | Avg Total Tokens (API) | Avg Score | Questions | Duration |\n`;
+        summaryMd += `|---------------|----------------|-----------------|------------------------|-----------|-----------|----------|\n`;
+        summaryData.forEach((data) => {
+            summaryMd += `| ${data.questionFile} | ${data.avgToolCalls.toFixed(2)} | ${data.avgTokenCount.toFixed(0)} | ${data.avgTotalTokens.toFixed(0)} | ${data.avgScore.toFixed(2)}/10 | ${data.totalQuestions} | ${data.duration.toFixed(2)}s |\n`;
        });
+        summaryMd += `\n## Output Folders\n\n`;
+        summaryData.forEach((data) => {
+            summaryMd += `- ${data.questionFile}: \`${data.folder}\`\n`;
+        });
+        // Write summary file
+        const summaryFileName = `${branchName}-run-${runId}-model-${model}.md`;
+        const summaryPath = join(benchmarksDir, summaryFileName);
+        writeFileSync(summaryPath, summaryMd);
+        console.log(`✅ Summary written to: ${summaryPath}`);
+        console.log();
+        // Generate aggregate summary from all runs
+        console.log("Generating aggregate summary across all runs...");
+        const allRuns = findExistingRuns(benchmarksDir, branchName, model);
+        if (allRuns.length > 1) {
+            const aggregateSummary = generateAggregateSummary(benchmarksDir, branchName, model, allRuns);
+            if (aggregateSummary) {
+                const aggregateSummaryFileName = `${branchName}-run-SUMMARY-model-${model}.md`;
+                const aggregateSummaryPath = join(benchmarksDir, aggregateSummaryFileName);
+                writeFileSync(aggregateSummaryPath, aggregateSummary);
+                console.log(`✅ Aggregate summary written to: ${aggregateSummaryPath}`);
+                console.log(`   Includes ${allRuns.length} runs: ${allRuns.map((r) => r.runId).join(", ")}`);
+            }
+            else {
+                console.log(`⚠️ Could not generate aggregate summary (no valid run data found)`);
+            }
+        }
+        else {
+            console.log(`ℹ️ Only one run exists, skipping aggregate summary`);
+        }
+        console.log();
+        // Move folders to archive
+        console.log("Moving folders to archive...");
+        const archiveDir = join(benchmarksDir, "archive");
+        mkdirSync(archiveDir, { recursive: true });
+        for (const folder of outputFolders) {
+            const sourcePath = join(benchmarksDir, folder);
+            const destPath = join(archiveDir, folder);
+            if (existsSync(sourcePath)) {
+                renameSync(sourcePath, destPath);
+                console.log(`  ✅ Moved ${folder} to archive/`);
+            }
+        }
+        console.log();
+        console.log("=".repeat(80));
+        console.log("Parallel Benchmark Run Complete!");
+        console.log("=".repeat(80));
+        console.log(`Summary: ${summaryFileName}`);
+        console.log(`Overall Average Score: ${avgScore.toFixed(2)}/10`);
+        if (allRuns.length > 1) {
+            console.log(`Aggregate Summary: ${branchName}-run-SUMMARY-model-${model}.md (${allRuns.length} runs)`);
+        }
+        console.log(`Folders archived in: archive/`);
+        console.log("=".repeat(80));
+    }
+    catch (error) {
+        console.error("Error running benchmarks:", error);
+        process.exit(1);
+    }
+}
+// Run parallel benchmarks
+runBenchmarks().catch((error) => {
+    console.error("Fatal error:", error);
+    process.exit(1);
+});