@upstash/context7-mcp 1.0.34-canary.1 → 1.0.34-canary.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +78 -25
- package/dist/lib/api.js +34 -5
- package/package.json +2 -10
- package/dist/benchmark/benchmark.js +0 -347
- package/dist/benchmark/compare-benchmark.js +0 -289
- package/dist/benchmark/run-benchmark.js +0 -459
- package/dist/benchmark/simulate.js +0 -319
package/dist/index.js
CHANGED
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
3
3
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
4
4
|
import { z } from "zod";
|
|
5
|
-
import { fetchLibraryContext } from "./lib/api.js";
|
|
5
|
+
import { searchLibraries, fetchLibraryContext } from "./lib/api.js";
|
|
6
|
+
import { formatSearchResults } from "./lib/utils.js";
|
|
6
7
|
import express from "express";
|
|
7
8
|
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
8
9
|
import { Command } from "commander";
|
|
@@ -44,29 +45,79 @@ const CLI_PORT = (() => {
|
|
|
44
45
|
const requestContext = new AsyncLocalStorage();
|
|
45
46
|
// Store API key globally for stdio mode (where requestContext may not be available in tool handlers)
|
|
46
47
|
let globalApiKey;
|
|
47
|
-
const stripIpv6Prefix = (ip) => ip.replace(/^::ffff:/, "");
|
|
48
|
-
const isPrivateIp = (ip) => ip.startsWith("10.") || ip.startsWith("192.168.") || /^172\.(1[6-9]|2[0-9]|3[0-1])\./.test(ip);
|
|
49
48
|
function getClientIp(req) {
|
|
50
|
-
const forwardedFor = req.headers["x-forwarded-for"];
|
|
49
|
+
const forwardedFor = req.headers["x-forwarded-for"] || req.headers["X-Forwarded-For"];
|
|
51
50
|
if (forwardedFor) {
|
|
52
51
|
const ips = Array.isArray(forwardedFor) ? forwardedFor[0] : forwardedFor;
|
|
53
|
-
const ipList = ips.split(",").map((ip) =>
|
|
54
|
-
|
|
52
|
+
const ipList = ips.split(",").map((ip) => ip.trim());
|
|
53
|
+
for (const ip of ipList) {
|
|
54
|
+
const plainIp = ip.replace(/^::ffff:/, "");
|
|
55
|
+
if (!plainIp.startsWith("10.") &&
|
|
56
|
+
!plainIp.startsWith("192.168.") &&
|
|
57
|
+
!/^172\.(1[6-9]|2[0-9]|3[0-1])\./.test(plainIp)) {
|
|
58
|
+
return plainIp;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
return ipList[0].replace(/^::ffff:/, "");
|
|
62
|
+
}
|
|
63
|
+
if (req.socket?.remoteAddress) {
|
|
64
|
+
return req.socket.remoteAddress.replace(/^::ffff:/, "");
|
|
55
65
|
}
|
|
56
|
-
return
|
|
66
|
+
return undefined;
|
|
57
67
|
}
|
|
58
68
|
const server = new McpServer({
|
|
59
69
|
name: "Context7",
|
|
60
70
|
version: "2.0.0",
|
|
61
71
|
}, {
|
|
62
|
-
capabilities: {
|
|
63
|
-
tools: { listChanged: true },
|
|
64
|
-
},
|
|
65
72
|
instructions: "Use this server to retrieve up-to-date documentation and code examples for any library.",
|
|
66
73
|
});
|
|
67
|
-
server.registerTool("
|
|
68
|
-
title: "
|
|
69
|
-
description: `
|
|
74
|
+
server.registerTool("resolve-library-id", {
|
|
75
|
+
title: "Resolve Context7 Library ID",
|
|
76
|
+
description: `Resolves a package/product name to a Context7-compatible library ID and returns matching libraries.
|
|
77
|
+
|
|
78
|
+
You MUST call this function before 'query-docs' to obtain a valid Context7-compatible library ID UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query.
|
|
79
|
+
|
|
80
|
+
Each result includes: Library ID (format: /org/project), name, description, code snippet count, source reputation (High/Medium/Low/Unknown), benchmark score (0-100), and available versions (/org/project/version format).
|
|
81
|
+
|
|
82
|
+
Select the best match based on: name similarity, description relevance, snippet coverage, source reputation, and benchmark score. For ambiguous queries, ask for clarification.`,
|
|
83
|
+
inputSchema: {
|
|
84
|
+
query: z
|
|
85
|
+
.string()
|
|
86
|
+
.describe("The user's original question or task. This is used to rank library results by relevance to what the user is trying to accomplish. IMPORTANT: Do not include any sensitive or confidential information such as API keys, passwords, credentials, or personal data in your query."),
|
|
87
|
+
libraryName: z
|
|
88
|
+
.string()
|
|
89
|
+
.describe("Library name to search for and retrieve a Context7-compatible library ID."),
|
|
90
|
+
},
|
|
91
|
+
}, async ({ query, libraryName }) => {
|
|
92
|
+
const ctx = requestContext.getStore();
|
|
93
|
+
const apiKey = ctx?.apiKey || globalApiKey;
|
|
94
|
+
const searchResponse = await searchLibraries(query, libraryName, ctx?.clientIp, apiKey);
|
|
95
|
+
if (!searchResponse.results || searchResponse.results.length === 0) {
|
|
96
|
+
return {
|
|
97
|
+
content: [
|
|
98
|
+
{
|
|
99
|
+
type: "text",
|
|
100
|
+
text: searchResponse.error
|
|
101
|
+
? searchResponse.error
|
|
102
|
+
: "No libraries found matching the provided name.",
|
|
103
|
+
},
|
|
104
|
+
],
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
return {
|
|
108
|
+
content: [
|
|
109
|
+
{
|
|
110
|
+
type: "text",
|
|
111
|
+
text: formatSearchResults(searchResponse),
|
|
112
|
+
},
|
|
113
|
+
],
|
|
114
|
+
};
|
|
115
|
+
});
|
|
116
|
+
server.registerTool("query-docs", {
|
|
117
|
+
title: "Query Documentation",
|
|
118
|
+
description: `Retrieves and queries up-to-date documentation and code examples from Context7 for any programming library or framework.
|
|
119
|
+
|
|
120
|
+
You must call 'resolve-library-id' first to obtain the exact Context7-compatible library ID required to use this tool, UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query.
|
|
70
121
|
|
|
71
122
|
USE THIS TOOL TO:
|
|
72
123
|
- Get current, accurate documentation for libraries (e.g., React, Next.js, Express, LangChain)
|
|
@@ -77,20 +128,14 @@ USE THIS TOOL TO:
|
|
|
77
128
|
query: z
|
|
78
129
|
.string()
|
|
79
130
|
.describe("The question or task you need help with. Be specific and include relevant details. Good: 'How to set up authentication with JWT in Express.js' or 'React useEffect cleanup function examples'. Bad: 'auth' or 'hooks'. IMPORTANT: Do not include any sensitive or confidential information such as API keys, passwords, credentials, or personal data in your query."),
|
|
80
|
-
|
|
131
|
+
libraryId: z
|
|
81
132
|
.string()
|
|
82
|
-
.
|
|
83
|
-
.describe("Library or framework name (e.g., 'react', 'express') OR exact library ID if provided by the user with or without version (e.g., '/vercel/next.js', '/vercel/next.js@v14.3.0-canary.87'). Only omit if the question is generic and not relevant to any specific library or product."),
|
|
84
|
-
mode: z
|
|
85
|
-
.enum(["code", "info"])
|
|
86
|
-
.optional()
|
|
87
|
-
.default("code")
|
|
88
|
-
.describe("Type of content to prioritize. Use 'code' (default) when you need working code examples, API usage patterns, and implementation snippets. Use 'info' when you need conceptual narrative explanations, architectural overviews, or understanding how something works."),
|
|
133
|
+
.describe("Context7-compatible library ID (e.g., '/mongodb/docs' or '/vercel/next.js'). Retrieved from 'resolve-library-id' or directly from user query in the format '/org/project' or '/org/project/version'."),
|
|
89
134
|
},
|
|
90
|
-
}, async ({ query,
|
|
135
|
+
}, async ({ query, libraryId }) => {
|
|
91
136
|
const ctx = requestContext.getStore();
|
|
92
137
|
const apiKey = ctx?.apiKey || globalApiKey;
|
|
93
|
-
const response = await fetchLibraryContext({ query,
|
|
138
|
+
const response = await fetchLibraryContext({ query, libraryId }, ctx?.clientIp, apiKey);
|
|
94
139
|
return {
|
|
95
140
|
content: [
|
|
96
141
|
{
|
|
@@ -104,6 +149,7 @@ async function main() {
|
|
|
104
149
|
const transportType = TRANSPORT_TYPE;
|
|
105
150
|
if (transportType === "http") {
|
|
106
151
|
const initialPort = CLI_PORT ?? DEFAULT_PORT;
|
|
152
|
+
let actualPort = initialPort;
|
|
107
153
|
const app = express();
|
|
108
154
|
app.use(express.json());
|
|
109
155
|
app.use((req, res, next) => {
|
|
@@ -133,8 +179,14 @@ async function main() {
|
|
|
133
179
|
};
|
|
134
180
|
const extractApiKey = (req) => {
|
|
135
181
|
return (extractBearerToken(req.headers.authorization) ||
|
|
182
|
+
extractHeaderValue(req.headers["Context7-API-Key"]) ||
|
|
183
|
+
extractHeaderValue(req.headers["X-API-Key"]) ||
|
|
136
184
|
extractHeaderValue(req.headers["context7-api-key"]) ||
|
|
137
|
-
extractHeaderValue(req.headers["x-api-key"])
|
|
185
|
+
extractHeaderValue(req.headers["x-api-key"]) ||
|
|
186
|
+
extractHeaderValue(req.headers["Context7_API_Key"]) ||
|
|
187
|
+
extractHeaderValue(req.headers["X_API_Key"]) ||
|
|
188
|
+
extractHeaderValue(req.headers["context7_api_key"]) ||
|
|
189
|
+
extractHeaderValue(req.headers["x_api_key"]));
|
|
138
190
|
};
|
|
139
191
|
app.all("/mcp", async (req, res) => {
|
|
140
192
|
try {
|
|
@@ -186,7 +238,8 @@ async function main() {
|
|
|
186
238
|
}
|
|
187
239
|
});
|
|
188
240
|
httpServer.once("listening", () => {
|
|
189
|
-
|
|
241
|
+
actualPort = port;
|
|
242
|
+
console.error(`Context7 Documentation MCP Server running on HTTP at http://localhost:${actualPort}/mcp`);
|
|
190
243
|
});
|
|
191
244
|
};
|
|
192
245
|
startServer(initialPort);
|
package/dist/lib/api.js
CHANGED
|
@@ -26,7 +26,7 @@ async function parseErrorResponse(response, apiKey) {
|
|
|
26
26
|
: "Rate limited or quota exceeded. Create a free API key at https://context7.com/dashboard for higher limits.";
|
|
27
27
|
}
|
|
28
28
|
if (status === 404) {
|
|
29
|
-
return "
|
|
29
|
+
return "The library you are trying to access does not exist. Please try with a different library ID.";
|
|
30
30
|
}
|
|
31
31
|
if (status === 401) {
|
|
32
32
|
return "Invalid API key. Please check your API key. API keys should start with 'ctx7sk' prefix.";
|
|
@@ -52,6 +52,38 @@ if (PROXY_URL && !PROXY_URL.startsWith("$") && /^(http|https):\/\//i.test(PROXY_
|
|
|
52
52
|
console.error(`[Context7] Failed to configure proxy agent for provided proxy URL: ${PROXY_URL}:`, error);
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
|
+
/**
|
|
56
|
+
* Searches for libraries matching the given query
|
|
57
|
+
* @param query The user's question or task (used for LLM relevance ranking)
|
|
58
|
+
* @param libraryName The library name to search for in the database
|
|
59
|
+
* @param clientIp Optional client IP address to include in headers
|
|
60
|
+
* @param apiKey Optional API key for authentication
|
|
61
|
+
* @returns Search results; if the request fails, an object with an empty results array and an error message
|
|
62
|
+
*/
|
|
63
|
+
export async function searchLibraries(query, libraryName, clientIp, apiKey) {
|
|
64
|
+
try {
|
|
65
|
+
const url = new URL(`${CONTEXT7_API_BASE_URL}/v2/libs/search`);
|
|
66
|
+
url.searchParams.set("query", query);
|
|
67
|
+
url.searchParams.set("libraryName", libraryName);
|
|
68
|
+
const headers = generateHeaders(clientIp, apiKey);
|
|
69
|
+
const response = await fetch(url, { headers });
|
|
70
|
+
if (!response.ok) {
|
|
71
|
+
const errorMessage = await parseErrorResponse(response, apiKey);
|
|
72
|
+
console.error(errorMessage);
|
|
73
|
+
return {
|
|
74
|
+
results: [],
|
|
75
|
+
error: errorMessage,
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
const searchData = await response.json();
|
|
79
|
+
return searchData;
|
|
80
|
+
}
|
|
81
|
+
catch (error) {
|
|
82
|
+
const errorMessage = `Error searching libraries: ${error}`;
|
|
83
|
+
console.error(errorMessage);
|
|
84
|
+
return { results: [], error: errorMessage };
|
|
85
|
+
}
|
|
86
|
+
}
|
|
55
87
|
/**
|
|
56
88
|
* Fetches intelligent, reranked context for a natural language query
|
|
57
89
|
* @param request The context request parameters (query, libraryId)
|
|
@@ -63,10 +95,7 @@ export async function fetchLibraryContext(request, clientIp, apiKey) {
|
|
|
63
95
|
try {
|
|
64
96
|
const url = new URL(`${CONTEXT7_API_BASE_URL}/v2/context`);
|
|
65
97
|
url.searchParams.set("query", request.query);
|
|
66
|
-
|
|
67
|
-
url.searchParams.set("library", request.library);
|
|
68
|
-
if (request.mode)
|
|
69
|
-
url.searchParams.set("mode", request.mode);
|
|
98
|
+
url.searchParams.set("libraryId", request.libraryId);
|
|
70
99
|
const headers = generateHeaders(clientIp, apiKey, { "X-Context7-Source": "mcp-server" });
|
|
71
100
|
const response = await fetch(url, { headers });
|
|
72
101
|
if (!response.ok) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@upstash/context7-mcp",
|
|
3
|
-
"version": "1.0.34-canary.
|
|
3
|
+
"version": "1.0.34-canary.3",
|
|
4
4
|
"mcpName": "io.github.upstash/context7",
|
|
5
5
|
"description": "MCP server for Context7",
|
|
6
6
|
"repository": {
|
|
@@ -41,13 +41,7 @@
|
|
|
41
41
|
"zod": "^3.24.2"
|
|
42
42
|
},
|
|
43
43
|
"devDependencies": {
|
|
44
|
-
"@ai-sdk/anthropic": "^1.2.12",
|
|
45
|
-
"@ai-sdk/google": "^1.2.7",
|
|
46
|
-
"@ai-sdk/mcp": "^0.2.0",
|
|
47
|
-
"@ai-sdk/openai": "^1.3.22",
|
|
48
44
|
"@types/node": "^22.13.14",
|
|
49
|
-
"ai": "^4.3.16",
|
|
50
|
-
"dotenv": "^16.5.0",
|
|
51
45
|
"typescript": "^5.8.2"
|
|
52
46
|
},
|
|
53
47
|
"scripts": {
|
|
@@ -60,8 +54,6 @@
|
|
|
60
54
|
"format:check": "prettier --check .",
|
|
61
55
|
"dev": "tsc --watch",
|
|
62
56
|
"start": "node dist/index.js --transport http",
|
|
63
|
-
"pack-mcpb": "pnpm install && pnpm run build && rm -rf node_modules && pnpm install --prod && mv mcpb/.mcpbignore .mcpbignore && mv mcpb/manifest.json manifest.json && mv public/icon.png icon.png && mcpb validate manifest.json && mcpb pack . mcpb/context7.mcpb && mv manifest.json mcpb/manifest.json && mv .mcpbignore mcpb/.mcpbignore && mv icon.png public/icon.png && bun install"
|
|
64
|
-
"run-benchmark": "pnpm run build && node dist/benchmark/run-benchmark.js",
|
|
65
|
-
"compare-benchmark": "pnpm run build && node dist/benchmark/compare-benchmark.js"
|
|
57
|
+
"pack-mcpb": "pnpm install && pnpm run build && rm -rf node_modules && pnpm install --prod && mv mcpb/.mcpbignore .mcpbignore && mv mcpb/manifest.json manifest.json && mv public/icon.png icon.png && mcpb validate manifest.json && mcpb pack . mcpb/context7.mcpb && mv manifest.json mcpb/manifest.json && mv .mcpbignore mcpb/.mcpbignore && mv icon.png public/icon.png && bun install"
|
|
66
58
|
}
|
|
67
59
|
}
|
|
@@ -1,347 +0,0 @@
|
|
|
1
|
-
import "dotenv/config";
|
|
2
|
-
import { readFileSync, mkdirSync, renameSync, existsSync, readdirSync, writeFileSync } from "fs";
|
|
3
|
-
import { join, dirname } from "path";
|
|
4
|
-
import { fileURLToPath } from "url";
|
|
5
|
-
import { execSync } from "child_process";
|
|
6
|
-
import { simulate } from "./simulate.js";
|
|
7
|
-
import { generateText } from "ai";
|
|
8
|
-
import { anthropic } from "@ai-sdk/anthropic";
|
|
9
|
-
import { openai } from "@ai-sdk/openai";
|
|
10
|
-
import { google } from "@ai-sdk/google";
|
|
11
|
-
// Check for required environment variables
|
|
12
|
-
if (!process.env.CONTEXT7_API_KEY) {
|
|
13
|
-
console.error("Error: CONTEXT7_API_KEY environment variable is required");
|
|
14
|
-
console.error("Set it in your .env file or export it in your shell");
|
|
15
|
-
process.exit(1);
|
|
16
|
-
}
|
|
17
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
18
|
-
const __dirname = dirname(__filename);
|
|
19
|
-
// Package root is two levels up from dist/benchmark/
|
|
20
|
-
const packageRoot = join(__dirname, "..", "..");
|
|
21
|
-
/**
|
|
22
|
-
* Get the current git branch name
|
|
23
|
-
* @returns The branch name or "unknown" if not in a git repo
|
|
24
|
-
*/
|
|
25
|
-
function getCurrentBranch() {
|
|
26
|
-
try {
|
|
27
|
-
const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim();
|
|
28
|
-
return branch;
|
|
29
|
-
}
|
|
30
|
-
catch (error) {
|
|
31
|
-
console.error("Error getting current branch:", error);
|
|
32
|
-
return "unknown";
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Runs benchmarks by simulating questions from questions.txt
|
|
37
|
-
*
|
|
38
|
-
* Usage:
|
|
39
|
-
* - pnpm run benchmark openai
|
|
40
|
-
* - pnpm run benchmark claude
|
|
41
|
-
* - pnpm run benchmark gemini
|
|
42
|
-
* - pnpm run benchmark openai --test (run only first question)
|
|
43
|
-
* - pnpm run benchmark claude 1 output-folder (questionset 1, custom output folder)
|
|
44
|
-
* - pnpm run benchmark claude aa.txt output-folder (use aa.txt, custom output folder)
|
|
45
|
-
*/
|
|
46
|
-
async function runBenchmark() {
|
|
47
|
-
// Parse arguments
|
|
48
|
-
const args = process.argv.slice(2);
|
|
49
|
-
const nonFlagArgs = args.filter((a) => !a.startsWith("--"));
|
|
50
|
-
const modelArg = nonFlagArgs[0]?.toLowerCase() || "claude";
|
|
51
|
-
const questionFileArg = nonFlagArgs[1] || null;
|
|
52
|
-
const outputFolderName = nonFlagArgs[2] || null;
|
|
53
|
-
const isTestMode = args.includes("--test");
|
|
54
|
-
let scoringModel;
|
|
55
|
-
let modelName;
|
|
56
|
-
if (modelArg === "openai") {
|
|
57
|
-
scoringModel = openai("gpt-5");
|
|
58
|
-
modelName = "GPT-5";
|
|
59
|
-
}
|
|
60
|
-
else if (modelArg === "gemini") {
|
|
61
|
-
scoringModel = google("gemini-2.5-pro");
|
|
62
|
-
modelName = "GEMINI-2.5-PRO";
|
|
63
|
-
}
|
|
64
|
-
else {
|
|
65
|
-
// Default to claude
|
|
66
|
-
scoringModel = anthropic("claude-sonnet-4-5");
|
|
67
|
-
modelName = "CLAUDE-SONNET-4.5";
|
|
68
|
-
}
|
|
69
|
-
// Determine the questions file to use
|
|
70
|
-
let questionsFileName;
|
|
71
|
-
if (!questionFileArg) {
|
|
72
|
-
questionsFileName = "questions.txt";
|
|
73
|
-
}
|
|
74
|
-
else if (questionFileArg.endsWith(".txt")) {
|
|
75
|
-
// Filename provided directly
|
|
76
|
-
questionsFileName = questionFileArg;
|
|
77
|
-
}
|
|
78
|
-
else {
|
|
79
|
-
// Number provided, construct filename
|
|
80
|
-
const questionSetNum = parseInt(questionFileArg, 10);
|
|
81
|
-
if (!isNaN(questionSetNum)) {
|
|
82
|
-
questionsFileName = `questions${questionSetNum}.txt`;
|
|
83
|
-
}
|
|
84
|
-
else {
|
|
85
|
-
questionsFileName = "questions.txt";
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
console.log("=".repeat(80));
|
|
89
|
-
console.log("Context7 MCP Benchmark");
|
|
90
|
-
console.log("=".repeat(80));
|
|
91
|
-
console.log(`Scoring Model: ${modelName}`);
|
|
92
|
-
console.log(`Question File: ${questionsFileName}`);
|
|
93
|
-
if (isTestMode) {
|
|
94
|
-
console.log(`Mode: TEST (first question only)`);
|
|
95
|
-
}
|
|
96
|
-
console.log();
|
|
97
|
-
// Read questions from questions.txt or questionsN.txt (in src/benchmark/questions directory)
|
|
98
|
-
const questionsPath = join(packageRoot, "src", "benchmark", "questions", questionsFileName);
|
|
99
|
-
console.log(`Reading questions from: ${questionsPath}`);
|
|
100
|
-
if (!existsSync(questionsPath)) {
|
|
101
|
-
console.error(`Error: questions.txt not found at ${questionsPath}`);
|
|
102
|
-
process.exit(1);
|
|
103
|
-
}
|
|
104
|
-
const questionsContent = readFileSync(questionsPath, "utf-8");
|
|
105
|
-
let questions = questionsContent
|
|
106
|
-
.split("\n")
|
|
107
|
-
.map((line) => line.trim())
|
|
108
|
-
.filter((line) => line.length > 0 && !line.startsWith("#")); // Filter empty lines and comments
|
|
109
|
-
// Limit to first question if in test mode
|
|
110
|
-
if (isTestMode) {
|
|
111
|
-
questions = questions.slice(0, 1);
|
|
112
|
-
console.log(`Test mode: Running only first question`);
|
|
113
|
-
}
|
|
114
|
-
else {
|
|
115
|
-
console.log(`Found ${questions.length} questions to benchmark`);
|
|
116
|
-
}
|
|
117
|
-
console.log();
|
|
118
|
-
// Get current git branch name
|
|
119
|
-
const branchName = getCurrentBranch();
|
|
120
|
-
// Create benchmark run directory with custom name or default naming
|
|
121
|
-
let benchmarkRunDir;
|
|
122
|
-
if (outputFolderName) {
|
|
123
|
-
benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", outputFolderName);
|
|
124
|
-
}
|
|
125
|
-
else {
|
|
126
|
-
const timestamp = new Date().toISOString().replace(/[:.]/g, "-").split("Z")[0];
|
|
127
|
-
benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", `${branchName}-run-${timestamp}_${modelName.replace(/[.\s]/g, "-")}`);
|
|
128
|
-
}
|
|
129
|
-
mkdirSync(benchmarkRunDir, { recursive: true });
|
|
130
|
-
console.log(`Benchmark results will be saved to: ${benchmarkRunDir}`);
|
|
131
|
-
console.log();
|
|
132
|
-
const results = [];
|
|
133
|
-
// Run simulation for questions in batches (parallel processing)
|
|
134
|
-
// BATCH_SIZE can be set via environment variable (e.g., BATCH_SIZE=1 for sequential)
|
|
135
|
-
const startTime = Date.now();
|
|
136
|
-
const BATCH_SIZE = process.env.BATCH_SIZE ? parseInt(process.env.BATCH_SIZE, 10) : 7;
|
|
137
|
-
console.log(`Execution Mode: ${BATCH_SIZE === 1 ? "Sequential (1 question at a time)" : "Parallel (batch size: " + BATCH_SIZE + ")"}`);
|
|
138
|
-
console.log();
|
|
139
|
-
for (let batchStart = 0; batchStart < questions.length; batchStart += BATCH_SIZE) {
|
|
140
|
-
const batchEnd = Math.min(batchStart + BATCH_SIZE, questions.length);
|
|
141
|
-
const batch = questions.slice(batchStart, batchEnd);
|
|
142
|
-
console.log("═".repeat(80));
|
|
143
|
-
console.log(`Processing Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
|
|
144
|
-
console.log("═".repeat(80));
|
|
145
|
-
console.log();
|
|
146
|
-
// Process batch in parallel
|
|
147
|
-
const batchPromises = batch.map(async (question, batchIndex) => {
|
|
148
|
-
const questionNum = batchStart + batchIndex + 1;
|
|
149
|
-
console.log(`[Q${questionNum}] Starting: ${question.substring(0, 60)}...`);
|
|
150
|
-
try {
|
|
151
|
-
// Run simulation with unique ID to prevent filename collisions
|
|
152
|
-
const uniqueId = `q${questionNum}`;
|
|
153
|
-
await simulate(question, uniqueId);
|
|
154
|
-
// Wait a bit to ensure file system operations complete
|
|
155
|
-
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
156
|
-
// Find the report files created for this question by unique ID
|
|
157
|
-
const reportsDir = join(packageRoot, "src", "benchmark", "reports");
|
|
158
|
-
const files = readdirSync(reportsDir);
|
|
159
|
-
// Look for files containing the unique ID
|
|
160
|
-
const mdFile = files.find((f) => f.includes(`_${uniqueId}.md`) && !f.endsWith("_raw.md"));
|
|
161
|
-
const rawMdFile = files.find((f) => f.includes(`_${uniqueId}_raw.md`));
|
|
162
|
-
if (mdFile && rawMdFile) {
|
|
163
|
-
// Move files to benchmark directory with new names
|
|
164
|
-
const sourceMd = join(reportsDir, mdFile);
|
|
165
|
-
const sourceRawMd = join(reportsDir, rawMdFile);
|
|
166
|
-
const destMd = join(benchmarkRunDir, `q${questionNum}.md`);
|
|
167
|
-
const destRawMd = join(benchmarkRunDir, `q${questionNum}_raw.md`);
|
|
168
|
-
renameSync(sourceMd, destMd);
|
|
169
|
-
renameSync(sourceRawMd, destRawMd);
|
|
170
|
-
console.log(`[Q${questionNum}] ✅ Completed and saved`);
|
|
171
|
-
return {
|
|
172
|
-
questionNum,
|
|
173
|
-
question,
|
|
174
|
-
toolCount: 0, // Will be calculated during scoring
|
|
175
|
-
tokenCount: 0, // Will be calculated during scoring
|
|
176
|
-
totalTokens: 0, // Will be extracted from report
|
|
177
|
-
score: 0, // Will be calculated during scoring
|
|
178
|
-
};
|
|
179
|
-
}
|
|
180
|
-
else {
|
|
181
|
-
console.error(`[Q${questionNum}] ⚠️ No report files found (expected: *_${uniqueId}.md)`);
|
|
182
|
-
return null;
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
catch (error) {
|
|
186
|
-
console.error(`[Q${questionNum}] ❌ Error:`, error);
|
|
187
|
-
return null;
|
|
188
|
-
}
|
|
189
|
-
});
|
|
190
|
-
// Wait for all questions in this batch to complete
|
|
191
|
-
const batchResults = await Promise.all(batchPromises);
|
|
192
|
-
// Add successful results to the results array
|
|
193
|
-
batchResults.forEach((result) => {
|
|
194
|
-
if (result) {
|
|
195
|
-
results.push(result);
|
|
196
|
-
}
|
|
197
|
-
});
|
|
198
|
-
console.log();
|
|
199
|
-
console.log(`Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchResults.filter((r) => r).length}/${batch.length} successful`);
|
|
200
|
-
console.log();
|
|
201
|
-
}
|
|
202
|
-
const duration = Date.now() - startTime;
|
|
203
|
-
// Scoring phase - also processed in parallel batches of BATCH_SIZE
|
|
204
|
-
console.log();
|
|
205
|
-
console.log("=".repeat(80));
|
|
206
|
-
console.log("Scoring Phase");
|
|
207
|
-
console.log("=".repeat(80));
|
|
208
|
-
console.log(`Using ${modelName} to score context quality...`);
|
|
209
|
-
console.log();
|
|
210
|
-
for (let batchStart = 0; batchStart < results.length; batchStart += BATCH_SIZE) {
|
|
211
|
-
const batchEnd = Math.min(batchStart + BATCH_SIZE, results.length);
|
|
212
|
-
const batchResults = results.slice(batchStart, batchEnd);
|
|
213
|
-
console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
|
|
214
|
-
// Process scoring in parallel
|
|
215
|
-
const scoringPromises = batchResults.map(async (result) => {
|
|
216
|
-
const rawMdPath = join(benchmarkRunDir, `q${result.questionNum}_raw.md`);
|
|
217
|
-
const structuredMdPath = join(benchmarkRunDir, `q${result.questionNum}.md`);
|
|
218
|
-
try {
|
|
219
|
-
// Read raw markdown file
|
|
220
|
-
const rawContent = readFileSync(rawMdPath, "utf-8");
|
|
221
|
-
// Count tokens (approximate: split on runs of whitespace)
|
|
222
|
-
const tokenCount = rawContent.split(/[\s\n]+/).length;
|
|
223
|
-
result.tokenCount = tokenCount;
|
|
224
|
-
// Count tool calls from structured report and extract total tokens
|
|
225
|
-
const structuredContent = readFileSync(structuredMdPath, "utf-8");
|
|
226
|
-
const toolCallMatches = structuredContent.match(/### Tool Call \d+:/g);
|
|
227
|
-
result.toolCount = toolCallMatches ? toolCallMatches.length : 0;
|
|
228
|
-
// Extract total tokens from structured report
|
|
229
|
-
const totalTokensMatch = structuredContent.match(/\*\*Total Tokens\*\*: (\d+)/);
|
|
230
|
-
result.totalTokens = totalTokensMatch ? parseInt(totalTokensMatch[1], 10) : 0;
|
|
231
|
-
// Extract question and context from raw file
|
|
232
|
-
const lines = rawContent.split("\n");
|
|
233
|
-
const questionLine = lines.find((line) => line.startsWith("QUESTION:"));
|
|
234
|
-
const question = questionLine
|
|
235
|
-
? questionLine.replace("QUESTION:", "").trim()
|
|
236
|
-
: result.question;
|
|
237
|
-
// Get context (everything after "CONTEXT:")
|
|
238
|
-
const contextStart = rawContent.indexOf("CONTEXT:");
|
|
239
|
-
const context = contextStart !== -1 ? rawContent.substring(contextStart + 8).trim() : rawContent;
|
|
240
|
-
console.log(`[Q${result.questionNum}] Scoring...`);
|
|
241
|
-
// Ask the scoring model to evaluate the context
|
|
242
|
-
const scoringResult = await generateText({
|
|
243
|
-
model: scoringModel,
|
|
244
|
-
messages: [
|
|
245
|
-
{
|
|
246
|
-
role: "user",
|
|
247
|
-
content: `You are evaluating the quality and usefulness of documentation context for a given question.
|
|
248
|
-
|
|
249
|
-
Question: ${question}
|
|
250
|
-
|
|
251
|
-
Context provided:
|
|
252
|
-
${context}
|
|
253
|
-
|
|
254
|
-
Rate how helpful and relevant this context is for answering the question on a scale of 1-10, where:
|
|
255
|
-
- 1-3: Poor - Missing critical information, irrelevant, or unhelpful
|
|
256
|
-
- 4-6: Adequate - Has some useful information but gaps exist
|
|
257
|
-
- 7-8: Good - Covers most needs with relevant examples
|
|
258
|
-
- 9-10: Excellent - Comprehensive, relevant, with clear examples
|
|
259
|
-
|
|
260
|
-
Respond with ONLY a JSON object in this format:
|
|
261
|
-
{"score": <number>, "reasoning": "<brief explanation>"}`,
|
|
262
|
-
},
|
|
263
|
-
],
|
|
264
|
-
});
|
|
265
|
-
// Parse the score
|
|
266
|
-
try {
|
|
267
|
-
const jsonMatch = scoringResult.text.match(/\{[\s\S]*\}/);
|
|
268
|
-
if (jsonMatch) {
|
|
269
|
-
const scoreData = JSON.parse(jsonMatch[0]);
|
|
270
|
-
result.score = scoreData.score;
|
|
271
|
-
console.log(`[Q${result.questionNum}] Score: ${scoreData.score}/10 - ${scoreData.reasoning.substring(0, 60)}...`);
|
|
272
|
-
}
|
|
273
|
-
else {
|
|
274
|
-
console.log(`[Q${result.questionNum}] ⚠️ Could not parse score, defaulting to 0`);
|
|
275
|
-
result.score = 0;
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
catch (parseError) {
|
|
279
|
-
console.log(`[Q${result.questionNum}] ⚠️ Error parsing score: ${parseError}`);
|
|
280
|
-
result.score = 0;
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
catch (error) {
|
|
284
|
-
console.error(`[Q${result.questionNum}] ❌ Error scoring:`, error);
|
|
285
|
-
}
|
|
286
|
-
});
|
|
287
|
-
// Wait for all scoring in this batch to complete
|
|
288
|
-
await Promise.all(scoringPromises);
|
|
289
|
-
console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchEnd - batchStart} questions`);
|
|
290
|
-
console.log();
|
|
291
|
-
}
|
|
292
|
-
// Calculate averages
|
|
293
|
-
const avgToolCount = results.reduce((sum, r) => sum + r.toolCount, 0) / results.length;
|
|
294
|
-
const avgTokenCount = results.reduce((sum, r) => sum + r.tokenCount, 0) / results.length;
|
|
295
|
-
const avgTotalTokens = results.reduce((sum, r) => sum + r.totalTokens, 0) / results.length;
|
|
296
|
-
const avgScore = results.reduce((sum, r) => sum + r.score, 0) / results.length;
|
|
297
|
-
// Generate result.md
|
|
298
|
-
console.log("Generating result.md...");
|
|
299
|
-
let resultMd = `# Benchmark Results\n\n`;
|
|
300
|
-
resultMd += `**Scoring Model**: ${modelName}\n`;
|
|
301
|
-
resultMd += `**Date**: ${new Date().toISOString()}\n`;
|
|
302
|
-
resultMd += `**Total Questions**: ${results.length}\n`;
|
|
303
|
-
resultMd += `**Total Duration**: ${(duration / 1000).toFixed(2)}s\n\n`;
|
|
304
|
-
resultMd += `## Averages\n\n`;
|
|
305
|
-
resultMd += `| Metric | Value |\n`;
|
|
306
|
-
resultMd += `|--------|-------|\n`;
|
|
307
|
-
resultMd += `| Average Tool Calls | ${avgToolCount.toFixed(2)} |\n`;
|
|
308
|
-
resultMd += `| Average Token Count | ${avgTokenCount.toFixed(0)} |\n`;
|
|
309
|
-
resultMd += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} |\n`;
|
|
310
|
-
resultMd += `| Average Score | ${avgScore.toFixed(2)}/10 |\n\n`;
|
|
311
|
-
resultMd += `## Results by Question\n\n`;
|
|
312
|
-
results.forEach((result) => {
|
|
313
|
-
resultMd += `### Q${result.questionNum}: ${result.question}\n\n`;
|
|
314
|
-
resultMd += `| Metric | Value |\n`;
|
|
315
|
-
resultMd += `|--------|-------|\n`;
|
|
316
|
-
resultMd += `| Tool Calls | ${result.toolCount} |\n`;
|
|
317
|
-
resultMd += `| Token Count | ${result.tokenCount} |\n`;
|
|
318
|
-
resultMd += `| Total Tokens (API) | ${result.totalTokens} |\n`;
|
|
319
|
-
resultMd += `| LLM Score | ${result.score}/10 |\n\n`;
|
|
320
|
-
});
|
|
321
|
-
const resultPath = join(benchmarkRunDir, "result.md");
|
|
322
|
-
writeFileSync(resultPath, resultMd);
|
|
323
|
-
console.log(`✅ Results saved to: ${resultPath}`);
|
|
324
|
-
console.log();
|
|
325
|
-
// Summary
|
|
326
|
-
console.log("=".repeat(80));
|
|
327
|
-
console.log("Benchmark Complete");
|
|
328
|
-
console.log("=".repeat(80));
|
|
329
|
-
console.log(`Scoring Model: ${modelName}`);
|
|
330
|
-
console.log(`Total questions: ${questions.length}`);
|
|
331
|
-
console.log(`Total time: ${(duration / 1000).toFixed(2)}s`);
|
|
332
|
-
console.log(`Average time per question: ${(duration / questions.length / 1000).toFixed(2)}s`);
|
|
333
|
-
console.log();
|
|
334
|
-
console.log(`📊 Scoring Results:`);
|
|
335
|
-
console.log(` - Average Tool Calls: ${avgToolCount.toFixed(2)}`);
|
|
336
|
-
console.log(` - Average Token Count: ${avgTokenCount.toFixed(0)}`);
|
|
337
|
-
console.log(` - Average Total Tokens (API): ${avgTotalTokens.toFixed(0)}`);
|
|
338
|
-
console.log(` - Average Score: ${avgScore.toFixed(2)}/10`);
|
|
339
|
-
console.log();
|
|
340
|
-
console.log(`Results saved to: ${benchmarkRunDir}`);
|
|
341
|
-
console.log("=".repeat(80));
|
|
342
|
-
}
|
|
343
|
-
// Run benchmark
|
|
344
|
-
runBenchmark().catch((error) => {
|
|
345
|
-
console.error("Fatal error:", error);
|
|
346
|
-
process.exit(1);
|
|
347
|
-
});
|