@upstash/context7-mcp 1.0.34-canary.1 → 1.0.34-canary.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2,7 +2,8 @@
2
2
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
3
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
4
  import { z } from "zod";
5
- import { fetchLibraryContext } from "./lib/api.js";
5
+ import { searchLibraries, fetchLibraryContext } from "./lib/api.js";
6
+ import { formatSearchResults } from "./lib/utils.js";
6
7
  import express from "express";
7
8
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
8
9
  import { Command } from "commander";
@@ -44,29 +45,79 @@ const CLI_PORT = (() => {
44
45
  const requestContext = new AsyncLocalStorage();
45
46
  // Store API key globally for stdio mode (where requestContext may not be available in tool handlers)
46
47
  let globalApiKey;
47
- const stripIpv6Prefix = (ip) => ip.replace(/^::ffff:/, "");
48
- const isPrivateIp = (ip) => ip.startsWith("10.") || ip.startsWith("192.168.") || /^172\.(1[6-9]|2[0-9]|3[0-1])\./.test(ip);
49
48
  function getClientIp(req) {
50
- const forwardedFor = req.headers["x-forwarded-for"];
49
+ const forwardedFor = req.headers["x-forwarded-for"] || req.headers["X-Forwarded-For"];
51
50
  if (forwardedFor) {
52
51
  const ips = Array.isArray(forwardedFor) ? forwardedFor[0] : forwardedFor;
53
- const ipList = ips.split(",").map((ip) => stripIpv6Prefix(ip.trim()));
54
- return ipList.find((ip) => !isPrivateIp(ip)) ?? ipList[0];
52
+ const ipList = ips.split(",").map((ip) => ip.trim());
53
+ for (const ip of ipList) {
54
+ const plainIp = ip.replace(/^::ffff:/, "");
55
+ if (!plainIp.startsWith("10.") &&
56
+ !plainIp.startsWith("192.168.") &&
57
+ !/^172\.(1[6-9]|2[0-9]|3[0-1])\./.test(plainIp)) {
58
+ return plainIp;
59
+ }
60
+ }
61
+ return ipList[0].replace(/^::ffff:/, "");
62
+ }
63
+ if (req.socket?.remoteAddress) {
64
+ return req.socket.remoteAddress.replace(/^::ffff:/, "");
55
65
  }
56
- return req.socket?.remoteAddress ? stripIpv6Prefix(req.socket.remoteAddress) : undefined;
66
+ return undefined;
57
67
  }
58
68
  const server = new McpServer({
59
69
  name: "Context7",
60
70
  version: "2.0.0",
61
71
  }, {
62
- capabilities: {
63
- tools: { listChanged: true },
64
- },
65
72
  instructions: "Use this server to retrieve up-to-date documentation and code examples for any library.",
66
73
  });
67
- server.registerTool("get-docs", {
68
- title: "Get Library Documentation",
69
- description: `Retrieves up-to-date documentation and code examples from Context7 for any programming library or framework.
74
+ server.registerTool("resolve-library-id", {
75
+ title: "Resolve Context7 Library ID",
76
+ description: `Resolves a package/product name to a Context7-compatible library ID and returns matching libraries.
77
+
78
+ You MUST call this function before 'query-docs' to obtain a valid Context7-compatible library ID UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query.
79
+
80
+ Each result includes: Library ID (format: /org/project), name, description, code snippet count, source reputation (High/Medium/Low/Unknown), benchmark score (0-100), and available versions (/org/project/version format).
81
+
82
+ Select the best match based on: name similarity, description relevance, snippet coverage, source reputation, and benchmark score. For ambiguous queries, ask for clarification.`,
83
+ inputSchema: {
84
+ query: z
85
+ .string()
86
+ .describe("The user's original question or task. This is used to rank library results by relevance to what the user is trying to accomplish. IMPORTANT: Do not include any sensitive or confidential information such as API keys, passwords, credentials, or personal data in your query."),
87
+ libraryName: z
88
+ .string()
89
+ .describe("Library name to search for and retrieve a Context7-compatible library ID."),
90
+ },
91
+ }, async ({ query, libraryName }) => {
92
+ const ctx = requestContext.getStore();
93
+ const apiKey = ctx?.apiKey || globalApiKey;
94
+ const searchResponse = await searchLibraries(query, libraryName, ctx?.clientIp, apiKey);
95
+ if (!searchResponse.results || searchResponse.results.length === 0) {
96
+ return {
97
+ content: [
98
+ {
99
+ type: "text",
100
+ text: searchResponse.error
101
+ ? searchResponse.error
102
+ : "No libraries found matching the provided name.",
103
+ },
104
+ ],
105
+ };
106
+ }
107
+ return {
108
+ content: [
109
+ {
110
+ type: "text",
111
+ text: formatSearchResults(searchResponse),
112
+ },
113
+ ],
114
+ };
115
+ });
116
+ server.registerTool("query-docs", {
117
+ title: "Query Documentation",
118
+ description: `Retrieves and queries up-to-date documentation and code examples from Context7 for any programming library or framework.
119
+
120
+ You must call 'resolve-library-id' first to obtain the exact Context7-compatible library ID required to use this tool, UNLESS the user explicitly provides a library ID in the format '/org/project' or '/org/project/version' in their query.
70
121
 
71
122
  USE THIS TOOL TO:
72
123
  - Get current, accurate documentation for libraries (e.g., React, Next.js, Express, LangChain)
@@ -77,20 +128,14 @@ USE THIS TOOL TO:
77
128
  query: z
78
129
  .string()
79
130
  .describe("The question or task you need help with. Be specific and include relevant details. Good: 'How to set up authentication with JWT in Express.js' or 'React useEffect cleanup function examples'. Bad: 'auth' or 'hooks'. IMPORTANT: Do not include any sensitive or confidential information such as API keys, passwords, credentials, or personal data in your query."),
80
- library: z
131
+ libraryId: z
81
132
  .string()
82
- .optional()
83
- .describe("Library or framework name (e.g., 'react', 'express') OR exact library ID if provided by the user with or without version (e.g., '/vercel/next.js', '/vercel/next.js@v14.3.0-canary.87'). Only omit if the question is generic and not relevant to any specific library or product."),
84
- mode: z
85
- .enum(["code", "info"])
86
- .optional()
87
- .default("code")
88
- .describe("Type of content to prioritize. Use 'code' (default) when you need working code examples, API usage patterns, and implementation snippets. Use 'info' when you need conceptual narrative explanations, architectural overviews, or understanding how something works."),
133
+ .describe("Context7-compatible library ID (e.g., '/mongodb/docs' or '/vercel/next.js'). Retrieved from 'resolve-library-id' or directly from user query in the format '/org/project' or '/org/project/version'."),
89
134
  },
90
- }, async ({ query, library, mode = "code" }) => {
135
+ }, async ({ query, libraryId }) => {
91
136
  const ctx = requestContext.getStore();
92
137
  const apiKey = ctx?.apiKey || globalApiKey;
93
- const response = await fetchLibraryContext({ query, library, mode }, ctx?.clientIp, apiKey);
138
+ const response = await fetchLibraryContext({ query, libraryId }, ctx?.clientIp, apiKey);
94
139
  return {
95
140
  content: [
96
141
  {
@@ -104,6 +149,7 @@ async function main() {
104
149
  const transportType = TRANSPORT_TYPE;
105
150
  if (transportType === "http") {
106
151
  const initialPort = CLI_PORT ?? DEFAULT_PORT;
152
+ let actualPort = initialPort;
107
153
  const app = express();
108
154
  app.use(express.json());
109
155
  app.use((req, res, next) => {
@@ -133,8 +179,14 @@ async function main() {
133
179
  };
134
180
  const extractApiKey = (req) => {
135
181
  return (extractBearerToken(req.headers.authorization) ||
182
+ extractHeaderValue(req.headers["Context7-API-Key"]) ||
183
+ extractHeaderValue(req.headers["X-API-Key"]) ||
136
184
  extractHeaderValue(req.headers["context7-api-key"]) ||
137
- extractHeaderValue(req.headers["x-api-key"]));
185
+ extractHeaderValue(req.headers["x-api-key"]) ||
186
+ extractHeaderValue(req.headers["Context7_API_Key"]) ||
187
+ extractHeaderValue(req.headers["X_API_Key"]) ||
188
+ extractHeaderValue(req.headers["context7_api_key"]) ||
189
+ extractHeaderValue(req.headers["x_api_key"]));
138
190
  };
139
191
  app.all("/mcp", async (req, res) => {
140
192
  try {
@@ -186,7 +238,8 @@ async function main() {
186
238
  }
187
239
  });
188
240
  httpServer.once("listening", () => {
189
- console.error(`Context7 Documentation MCP Server running on HTTP at http://localhost:${port}/mcp`);
241
+ actualPort = port;
242
+ console.error(`Context7 Documentation MCP Server running on HTTP at http://localhost:${actualPort}/mcp`);
190
243
  });
191
244
  };
192
245
  startServer(initialPort);
package/dist/lib/api.js CHANGED
@@ -26,7 +26,7 @@ async function parseErrorResponse(response, apiKey) {
26
26
  : "Rate limited or quota exceeded. Create a free API key at https://context7.com/dashboard for higher limits.";
27
27
  }
28
28
  if (status === 404) {
29
- return "No documentation found. Try a different library name or refine your query.";
29
+ return "The library you are trying to access does not exist. Please try with a different library ID.";
30
30
  }
31
31
  if (status === 401) {
32
32
  return "Invalid API key. Please check your API key. API keys should start with 'ctx7sk' prefix.";
@@ -52,6 +52,38 @@ if (PROXY_URL && !PROXY_URL.startsWith("$") && /^(http|https):\/\//i.test(PROXY_
52
52
  console.error(`[Context7] Failed to configure proxy agent for provided proxy URL: ${PROXY_URL}:`, error);
53
53
  }
54
54
  }
55
+ /**
56
+ * Searches for libraries matching the given query
57
+ * @param query The user's question or task (used for LLM relevance ranking)
58
+ * @param libraryName The library name to search for in the database
59
+ * @param clientIp Optional client IP address to include in headers
60
+ * @param apiKey Optional API key for authentication
61
+ * @returns Search results or null if the request fails
62
+ */
63
+ export async function searchLibraries(query, libraryName, clientIp, apiKey) {
64
+ try {
65
+ const url = new URL(`${CONTEXT7_API_BASE_URL}/v2/libs/search`);
66
+ url.searchParams.set("query", query);
67
+ url.searchParams.set("libraryName", libraryName);
68
+ const headers = generateHeaders(clientIp, apiKey);
69
+ const response = await fetch(url, { headers });
70
+ if (!response.ok) {
71
+ const errorMessage = await parseErrorResponse(response, apiKey);
72
+ console.error(errorMessage);
73
+ return {
74
+ results: [],
75
+ error: errorMessage,
76
+ };
77
+ }
78
+ const searchData = await response.json();
79
+ return searchData;
80
+ }
81
+ catch (error) {
82
+ const errorMessage = `Error searching libraries: ${error}`;
83
+ console.error(errorMessage);
84
+ return { results: [], error: errorMessage };
85
+ }
86
+ }
55
87
  /**
56
88
  * Fetches intelligent, reranked context for a natural language query
57
89
  * @param request The context request parameters (query, topic, library, mode)
@@ -63,10 +95,7 @@ export async function fetchLibraryContext(request, clientIp, apiKey) {
63
95
  try {
64
96
  const url = new URL(`${CONTEXT7_API_BASE_URL}/v2/context`);
65
97
  url.searchParams.set("query", request.query);
66
- if (request.library)
67
- url.searchParams.set("library", request.library);
68
- if (request.mode)
69
- url.searchParams.set("mode", request.mode);
98
+ url.searchParams.set("libraryId", request.libraryId);
70
99
  const headers = generateHeaders(clientIp, apiKey, { "X-Context7-Source": "mcp-server" });
71
100
  const response = await fetch(url, { headers });
72
101
  if (!response.ok) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@upstash/context7-mcp",
3
- "version": "1.0.34-canary.1",
3
+ "version": "1.0.34-canary.3",
4
4
  "mcpName": "io.github.upstash/context7",
5
5
  "description": "MCP server for Context7",
6
6
  "repository": {
@@ -41,13 +41,7 @@
41
41
  "zod": "^3.24.2"
42
42
  },
43
43
  "devDependencies": {
44
- "@ai-sdk/anthropic": "^1.2.12",
45
- "@ai-sdk/google": "^1.2.7",
46
- "@ai-sdk/mcp": "^0.2.0",
47
- "@ai-sdk/openai": "^1.3.22",
48
44
  "@types/node": "^22.13.14",
49
- "ai": "^4.3.16",
50
- "dotenv": "^16.5.0",
51
45
  "typescript": "^5.8.2"
52
46
  },
53
47
  "scripts": {
@@ -60,8 +54,6 @@
60
54
  "format:check": "prettier --check .",
61
55
  "dev": "tsc --watch",
62
56
  "start": "node dist/index.js --transport http",
63
- "pack-mcpb": "pnpm install && pnpm run build && rm -rf node_modules && pnpm install --prod && mv mcpb/.mcpbignore .mcpbignore && mv mcpb/manifest.json manifest.json && mv public/icon.png icon.png && mcpb validate manifest.json && mcpb pack . mcpb/context7.mcpb && mv manifest.json mcpb/manifest.json && mv .mcpbignore mcpb/.mcpbignore && mv icon.png public/icon.png && bun install",
64
- "run-benchmark": "pnpm run build && node dist/benchmark/run-benchmark.js",
65
- "compare-benchmark": "pnpm run build && node dist/benchmark/compare-benchmark.js"
57
+ "pack-mcpb": "pnpm install && pnpm run build && rm -rf node_modules && pnpm install --prod && mv mcpb/.mcpbignore .mcpbignore && mv mcpb/manifest.json manifest.json && mv public/icon.png icon.png && mcpb validate manifest.json && mcpb pack . mcpb/context7.mcpb && mv manifest.json mcpb/manifest.json && mv .mcpbignore mcpb/.mcpbignore && mv icon.png public/icon.png && bun install"
66
58
  }
67
59
  }
@@ -1,347 +0,0 @@
1
- import "dotenv/config";
2
- import { readFileSync, mkdirSync, renameSync, existsSync, readdirSync, writeFileSync } from "fs";
3
- import { join, dirname } from "path";
4
- import { fileURLToPath } from "url";
5
- import { execSync } from "child_process";
6
- import { simulate } from "./simulate.js";
7
- import { generateText } from "ai";
8
- import { anthropic } from "@ai-sdk/anthropic";
9
- import { openai } from "@ai-sdk/openai";
10
- import { google } from "@ai-sdk/google";
11
- // Check for required environment variables
12
- if (!process.env.CONTEXT7_API_KEY) {
13
- console.error("Error: CONTEXT7_API_KEY environment variable is required");
14
- console.error("Set it in your .env file or export it in your shell");
15
- process.exit(1);
16
- }
17
- const __filename = fileURLToPath(import.meta.url);
18
- const __dirname = dirname(__filename);
19
- // Package root is two levels up from dist/benchmark/
20
- const packageRoot = join(__dirname, "..", "..");
21
- /**
22
- * Get the current git branch name
23
- * @returns The branch name or "unknown" if not in a git repo
24
- */
25
- function getCurrentBranch() {
26
- try {
27
- const branch = execSync("git rev-parse --abbrev-ref HEAD", { encoding: "utf-8" }).trim();
28
- return branch;
29
- }
30
- catch (error) {
31
- console.error("Error getting current branch:", error);
32
- return "unknown";
33
- }
34
- }
35
- /**
36
- * Runs benchmarks by simulating questions from questions.txt
37
- *
38
- * Usage:
39
- * - pnpm run benchmark openai
40
- * - pnpm run benchmark claude
41
- * - pnpm run benchmark gemini
42
- * - pnpm run benchmark openai --test (run only first question)
43
- * - pnpm run benchmark claude 1 output-folder (questionset 1, custom output folder)
44
- * - pnpm run benchmark claude aa.txt output-folder (use aa.txt, custom output folder)
45
- */
46
- async function runBenchmark() {
47
- // Parse arguments
48
- const args = process.argv.slice(2);
49
- const nonFlagArgs = args.filter((a) => !a.startsWith("--"));
50
- const modelArg = nonFlagArgs[0]?.toLowerCase() || "claude";
51
- const questionFileArg = nonFlagArgs[1] || null;
52
- const outputFolderName = nonFlagArgs[2] || null;
53
- const isTestMode = args.includes("--test");
54
- let scoringModel;
55
- let modelName;
56
- if (modelArg === "openai") {
57
- scoringModel = openai("gpt-5");
58
- modelName = "GPT-5";
59
- }
60
- else if (modelArg === "gemini") {
61
- scoringModel = google("gemini-2.5-pro");
62
- modelName = "GEMINI-2.5-PRO";
63
- }
64
- else {
65
- // Default to claude
66
- scoringModel = anthropic("claude-sonnet-4-5");
67
- modelName = "CLAUDE-SONNET-4.5";
68
- }
69
- // Determine the questions file to use
70
- let questionsFileName;
71
- if (!questionFileArg) {
72
- questionsFileName = "questions.txt";
73
- }
74
- else if (questionFileArg.endsWith(".txt")) {
75
- // Filename provided directly
76
- questionsFileName = questionFileArg;
77
- }
78
- else {
79
- // Number provided, construct filename
80
- const questionSetNum = parseInt(questionFileArg, 10);
81
- if (!isNaN(questionSetNum)) {
82
- questionsFileName = `questions${questionSetNum}.txt`;
83
- }
84
- else {
85
- questionsFileName = "questions.txt";
86
- }
87
- }
88
- console.log("=".repeat(80));
89
- console.log("Context7 MCP Benchmark");
90
- console.log("=".repeat(80));
91
- console.log(`Scoring Model: ${modelName}`);
92
- console.log(`Question File: ${questionsFileName}`);
93
- if (isTestMode) {
94
- console.log(`Mode: TEST (first question only)`);
95
- }
96
- console.log();
97
- // Read questions from questions.txt or questionsN.txt (in src/benchmark/questions directory)
98
- const questionsPath = join(packageRoot, "src", "benchmark", "questions", questionsFileName);
99
- console.log(`Reading questions from: ${questionsPath}`);
100
- if (!existsSync(questionsPath)) {
101
- console.error(`Error: questions.txt not found at ${questionsPath}`);
102
- process.exit(1);
103
- }
104
- const questionsContent = readFileSync(questionsPath, "utf-8");
105
- let questions = questionsContent
106
- .split("\n")
107
- .map((line) => line.trim())
108
- .filter((line) => line.length > 0 && !line.startsWith("#")); // Filter empty lines and comments
109
- // Limit to first question if in test mode
110
- if (isTestMode) {
111
- questions = questions.slice(0, 1);
112
- console.log(`Test mode: Running only first question`);
113
- }
114
- else {
115
- console.log(`Found ${questions.length} questions to benchmark`);
116
- }
117
- console.log();
118
- // Get current git branch name
119
- const branchName = getCurrentBranch();
120
- // Create benchmark run directory with custom name or default naming
121
- let benchmarkRunDir;
122
- if (outputFolderName) {
123
- benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", outputFolderName);
124
- }
125
- else {
126
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-").split("Z")[0];
127
- benchmarkRunDir = join(packageRoot, "src", "benchmark", "reports", "benchmarks", `${branchName}-run-${timestamp}_${modelName.replace(/[.\s]/g, "-")}`);
128
- }
129
- mkdirSync(benchmarkRunDir, { recursive: true });
130
- console.log(`Benchmark results will be saved to: ${benchmarkRunDir}`);
131
- console.log();
132
- const results = [];
133
- // Run simulation for questions in batches (parallel processing)
134
- // BATCH_SIZE can be set via environment variable (e.g., BATCH_SIZE=1 for sequential)
135
- const startTime = Date.now();
136
- const BATCH_SIZE = process.env.BATCH_SIZE ? parseInt(process.env.BATCH_SIZE, 10) : 7;
137
- console.log(`Execution Mode: ${BATCH_SIZE === 1 ? "Sequential (1 question at a time)" : "Parallel (batch size: " + BATCH_SIZE + ")"}`);
138
- console.log();
139
- for (let batchStart = 0; batchStart < questions.length; batchStart += BATCH_SIZE) {
140
- const batchEnd = Math.min(batchStart + BATCH_SIZE, questions.length);
141
- const batch = questions.slice(batchStart, batchEnd);
142
- console.log("═".repeat(80));
143
- console.log(`Processing Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
144
- console.log("═".repeat(80));
145
- console.log();
146
- // Process batch in parallel
147
- const batchPromises = batch.map(async (question, batchIndex) => {
148
- const questionNum = batchStart + batchIndex + 1;
149
- console.log(`[Q${questionNum}] Starting: ${question.substring(0, 60)}...`);
150
- try {
151
- // Run simulation with unique ID to prevent filename collisions
152
- const uniqueId = `q${questionNum}`;
153
- await simulate(question, uniqueId);
154
- // Wait a bit to ensure file system operations complete
155
- await new Promise((resolve) => setTimeout(resolve, 100));
156
- // Find the report files created for this question by unique ID
157
- const reportsDir = join(packageRoot, "src", "benchmark", "reports");
158
- const files = readdirSync(reportsDir);
159
- // Look for files containing the unique ID
160
- const mdFile = files.find((f) => f.includes(`_${uniqueId}.md`) && !f.endsWith("_raw.md"));
161
- const rawMdFile = files.find((f) => f.includes(`_${uniqueId}_raw.md`));
162
- if (mdFile && rawMdFile) {
163
- // Move files to benchmark directory with new names
164
- const sourceMd = join(reportsDir, mdFile);
165
- const sourceRawMd = join(reportsDir, rawMdFile);
166
- const destMd = join(benchmarkRunDir, `q${questionNum}.md`);
167
- const destRawMd = join(benchmarkRunDir, `q${questionNum}_raw.md`);
168
- renameSync(sourceMd, destMd);
169
- renameSync(sourceRawMd, destRawMd);
170
- console.log(`[Q${questionNum}] ✅ Completed and saved`);
171
- return {
172
- questionNum,
173
- question,
174
- toolCount: 0, // Will be calculated during scoring
175
- tokenCount: 0, // Will be calculated during scoring
176
- totalTokens: 0, // Will be extracted from report
177
- score: 0, // Will be calculated during scoring
178
- };
179
- }
180
- else {
181
- console.error(`[Q${questionNum}] ⚠️ No report files found (expected: *_${uniqueId}.md)`);
182
- return null;
183
- }
184
- }
185
- catch (error) {
186
- console.error(`[Q${questionNum}] ❌ Error:`, error);
187
- return null;
188
- }
189
- });
190
- // Wait for all questions in this batch to complete
191
- const batchResults = await Promise.all(batchPromises);
192
- // Add successful results to the results array
193
- batchResults.forEach((result) => {
194
- if (result) {
195
- results.push(result);
196
- }
197
- });
198
- console.log();
199
- console.log(`Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchResults.filter((r) => r).length}/${batch.length} successful`);
200
- console.log();
201
- }
202
- const duration = Date.now() - startTime;
203
- // Scoring phase - also in batches of 5 for parallel processing
204
- console.log();
205
- console.log("=".repeat(80));
206
- console.log("Scoring Phase");
207
- console.log("=".repeat(80));
208
- console.log(`Using ${modelName} to score context quality...`);
209
- console.log();
210
- for (let batchStart = 0; batchStart < results.length; batchStart += BATCH_SIZE) {
211
- const batchEnd = Math.min(batchStart + BATCH_SIZE, results.length);
212
- const batchResults = results.slice(batchStart, batchEnd);
213
- console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} (Questions ${batchStart + 1}-${batchEnd})`);
214
- // Process scoring in parallel
215
- const scoringPromises = batchResults.map(async (result) => {
216
- const rawMdPath = join(benchmarkRunDir, `q${result.questionNum}_raw.md`);
217
- const structuredMdPath = join(benchmarkRunDir, `q${result.questionNum}.md`);
218
- try {
219
- // Read raw markdown file
220
- const rawContent = readFileSync(rawMdPath, "utf-8");
221
- // Count tokens (approximate: split by whitespace and punctuation)
222
- const tokenCount = rawContent.split(/[\s\n]+/).length;
223
- result.tokenCount = tokenCount;
224
- // Count tool calls from structured report and extract total tokens
225
- const structuredContent = readFileSync(structuredMdPath, "utf-8");
226
- const toolCallMatches = structuredContent.match(/### Tool Call \d+:/g);
227
- result.toolCount = toolCallMatches ? toolCallMatches.length : 0;
228
- // Extract total tokens from structured report
229
- const totalTokensMatch = structuredContent.match(/\*\*Total Tokens\*\*: (\d+)/);
230
- result.totalTokens = totalTokensMatch ? parseInt(totalTokensMatch[1], 10) : 0;
231
- // Extract question and context from raw file
232
- const lines = rawContent.split("\n");
233
- const questionLine = lines.find((line) => line.startsWith("QUESTION:"));
234
- const question = questionLine
235
- ? questionLine.replace("QUESTION:", "").trim()
236
- : result.question;
237
- // Get context (everything after "CONTEXT:")
238
- const contextStart = rawContent.indexOf("CONTEXT:");
239
- const context = contextStart !== -1 ? rawContent.substring(contextStart + 8).trim() : rawContent;
240
- console.log(`[Q${result.questionNum}] Scoring...`);
241
- // Ask the scoring model to evaluate the context
242
- const scoringResult = await generateText({
243
- model: scoringModel,
244
- messages: [
245
- {
246
- role: "user",
247
- content: `You are evaluating the quality and usefulness of documentation context for a given question.
248
-
249
- Question: ${question}
250
-
251
- Context provided:
252
- ${context}
253
-
254
- Rate how helpful and relevant this context is for answering the question on a scale of 1-10, where:
255
- - 1-3: Poor - Missing critical information, irrelevant, or unhelpful
256
- - 4-6: Adequate - Has some useful information but gaps exist
257
- - 7-8: Good - Covers most needs with relevant examples
258
- - 9-10: Excellent - Comprehensive, relevant, with clear examples
259
-
260
- Respond with ONLY a JSON object in this format:
261
- {"score": <number>, "reasoning": "<brief explanation>"}`,
262
- },
263
- ],
264
- });
265
- // Parse the score
266
- try {
267
- const jsonMatch = scoringResult.text.match(/\{[\s\S]*\}/);
268
- if (jsonMatch) {
269
- const scoreData = JSON.parse(jsonMatch[0]);
270
- result.score = scoreData.score;
271
- console.log(`[Q${result.questionNum}] Score: ${scoreData.score}/10 - ${scoreData.reasoning.substring(0, 60)}...`);
272
- }
273
- else {
274
- console.log(`[Q${result.questionNum}] ⚠️ Could not parse score, defaulting to 0`);
275
- result.score = 0;
276
- }
277
- }
278
- catch (parseError) {
279
- console.log(`[Q${result.questionNum}] ⚠️ Error parsing score: ${parseError}`);
280
- result.score = 0;
281
- }
282
- }
283
- catch (error) {
284
- console.error(`[Q${result.questionNum}] ❌ Error scoring:`, error);
285
- }
286
- });
287
- // Wait for all scoring in this batch to complete
288
- await Promise.all(scoringPromises);
289
- console.log(`Scoring Batch ${Math.floor(batchStart / BATCH_SIZE) + 1} completed: ${batchEnd - batchStart} questions`);
290
- console.log();
291
- }
292
- // Calculate averages
293
- const avgToolCount = results.reduce((sum, r) => sum + r.toolCount, 0) / results.length;
294
- const avgTokenCount = results.reduce((sum, r) => sum + r.tokenCount, 0) / results.length;
295
- const avgTotalTokens = results.reduce((sum, r) => sum + r.totalTokens, 0) / results.length;
296
- const avgScore = results.reduce((sum, r) => sum + r.score, 0) / results.length;
297
- // Generate result.md
298
- console.log("Generating result.md...");
299
- let resultMd = `# Benchmark Results\n\n`;
300
- resultMd += `**Scoring Model**: ${modelName}\n`;
301
- resultMd += `**Date**: ${new Date().toISOString()}\n`;
302
- resultMd += `**Total Questions**: ${results.length}\n`;
303
- resultMd += `**Total Duration**: ${(duration / 1000).toFixed(2)}s\n\n`;
304
- resultMd += `## Averages\n\n`;
305
- resultMd += `| Metric | Value |\n`;
306
- resultMd += `|--------|-------|\n`;
307
- resultMd += `| Average Tool Calls | ${avgToolCount.toFixed(2)} |\n`;
308
- resultMd += `| Average Token Count | ${avgTokenCount.toFixed(0)} |\n`;
309
- resultMd += `| Average Total Tokens (API) | ${avgTotalTokens.toFixed(0)} |\n`;
310
- resultMd += `| Average Score | ${avgScore.toFixed(2)}/10 |\n\n`;
311
- resultMd += `## Results by Question\n\n`;
312
- results.forEach((result) => {
313
- resultMd += `### Q${result.questionNum}: ${result.question}\n\n`;
314
- resultMd += `| Metric | Value |\n`;
315
- resultMd += `|--------|-------|\n`;
316
- resultMd += `| Tool Calls | ${result.toolCount} |\n`;
317
- resultMd += `| Token Count | ${result.tokenCount} |\n`;
318
- resultMd += `| Total Tokens (API) | ${result.totalTokens} |\n`;
319
- resultMd += `| LLM Score | ${result.score}/10 |\n\n`;
320
- });
321
- const resultPath = join(benchmarkRunDir, "result.md");
322
- writeFileSync(resultPath, resultMd);
323
- console.log(`✅ Results saved to: ${resultPath}`);
324
- console.log();
325
- // Summary
326
- console.log("=".repeat(80));
327
- console.log("Benchmark Complete");
328
- console.log("=".repeat(80));
329
- console.log(`Scoring Model: ${modelName}`);
330
- console.log(`Total questions: ${questions.length}`);
331
- console.log(`Total time: ${(duration / 1000).toFixed(2)}s`);
332
- console.log(`Average time per question: ${(duration / questions.length / 1000).toFixed(2)}s`);
333
- console.log();
334
- console.log(`📊 Scoring Results:`);
335
- console.log(` - Average Tool Calls: ${avgToolCount.toFixed(2)}`);
336
- console.log(` - Average Token Count: ${avgTokenCount.toFixed(0)}`);
337
- console.log(` - Average Total Tokens (API): ${avgTotalTokens.toFixed(0)}`);
338
- console.log(` - Average Score: ${avgScore.toFixed(2)}/10`);
339
- console.log();
340
- console.log(`Results saved to: ${benchmarkRunDir}`);
341
- console.log("=".repeat(80));
342
- }
343
- // Run benchmark
344
- runBenchmark().catch((error) => {
345
- console.error("Fatal error:", error);
346
- process.exit(1);
347
- });