@arabold/docs-mcp-server 1.15.1 → 1.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,8 +1,9 @@
1
1
  #!/usr/bin/env node
2
2
  import { execSync } from "node:child_process";
3
- import fs$1, { existsSync } from "node:fs";
3
+ import fs$1, { readFileSync, existsSync } from "node:fs";
4
4
  import "dotenv/config";
5
5
  import { Command } from "commander";
6
+ import { chromium } from "playwright";
6
7
  import * as http from "node:http";
7
8
  import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
8
9
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
@@ -13,15 +14,15 @@ import semver__default from "semver";
13
14
  import * as cheerio from "cheerio";
14
15
  import "node:vm";
15
16
  import { VirtualConsole, JSDOM } from "jsdom";
16
- import { chromium } from "playwright";
17
17
  import { gfm } from "@joplin/turndown-plugin-gfm";
18
18
  import TurndownService from "turndown";
19
- import { TextDecoder } from "node:util";
19
+ import iconv from "iconv-lite";
20
20
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
21
21
  import axios from "axios";
22
22
  import { HeaderGenerator } from "header-generator";
23
23
  import fs from "node:fs/promises";
24
24
  import path from "node:path";
25
+ import * as mime from "mime-types";
25
26
  import { v4 } from "uuid";
26
27
  import psl from "psl";
27
28
  import { URL as URL$1, fileURLToPath } from "node:url";
@@ -41,7 +42,7 @@ import Fastify from "fastify";
41
42
  import { jsxs, jsx, Fragment } from "@kitajs/html/jsx-runtime";
42
43
  import DOMPurify from "dompurify";
43
44
  const name = "@arabold/docs-mcp-server";
44
- const version = "1.15.0";
45
+ const version = "1.16.0";
45
46
  const description = "MCP server for fetching and searching documentation";
46
47
  const type = "module";
47
48
  const bin = { "docs-mcp-server": "dist/index.js" };
@@ -49,8 +50,8 @@ const license = "MIT";
49
50
  const repository = { "type": "git", "url": "git+https://github.com/arabold/docs-mcp-server.git" };
50
51
  const files = ["dist", "public", "db", "README.md", "LICENSE", "package.json"];
51
52
  const scripts = { "prepare": "husky || true", "build": "vite build --config vite.config.web.ts && vite build", "start": "node --enable-source-maps dist/index.js", "cli": "node --enable-source-maps dist/index.js", "server": "node --enable-source-maps dist/index.ts", "web": "node --enable-source-maps dist/index.ts web", "dev:cli": "vite-node src/index.ts", "dev:server": "vite-node --watch src/index.ts", "dev:server:stdio": "vite-node --watch src/index.ts -- --protocol stdio", "dev:server:http": "vite-node --watch src/index.ts -- --protocol http", "dev:web": "npm-run-all --parallel dev:web:assets dev:web:bin", "dev:web:bin": "vite-node --watch src/index.ts web", "dev:web:assets": "vite build --config vite.config.web.ts --watch", "test": "vitest run", "test:watch": "vitest", "test:coverage": "vitest run --coverage", "lint": "biome check .", "format": "biome format . --write", "postinstall": "echo 'Skipping Playwright browser install. See README.md for details.'" };
52
- const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.1.1", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.10", "@langchain/community": "^0.3.43", "@langchain/google-genai": "^0.2.9", "@langchain/google-vertexai": "^0.2.9", "@langchain/openai": "^0.5.10", "@modelcontextprotocol/sdk": "^1.11.4", "alpinejs": "^3.14.9", "axios": "^1.9.0", "axios-retry": "^4.5.0", "better-sqlite3": "^11.10.0", "cheerio": "^1.0.0", "commander": "^13.1.0", "dompurify": "^3.2.5", "dotenv": "^16.5.0", "env-paths": "^3.0.0", "fastify": "^5.3.3", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.66", "htmx.org": "^1.9.12", "jsdom": "^26.1.0", "langchain": "0.3.19", "minimatch": "^10.0.1", "playwright": "^1.52.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.2", "sqlite-vec": "^0.1.7-alpha.2", "turndown": "^7.2.0", "zod": "^3.24.4" };
53
- const devDependencies = { "@biomejs/biome": "1.9.4", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.2", "@semantic-release/npm": "^12.0.1", "@tailwindcss/postcss": "^4.1.7", "@tailwindcss/vite": "^4.1.7", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/node": "^20.17.47", "@types/node-fetch": "^2.6.12", "@types/psl": "^1.1.3", "@types/semver": "^7.7.0", "@types/turndown": "^5.0.5", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^15.5.2", "memfs": "^4.17.2", "npm-run-all": "^4.1.5", "postcss": "^8.5.3", "semantic-release": "^24.2.4", "tailwindcss": "^4.1.4", "typescript": "^5.8.3", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", "vitest": "^3.1.3" };
53
+ const dependencies = { "@fastify/formbody": "^8.0.2", "@fastify/static": "^8.1.1", "@joplin/turndown-plugin-gfm": "^1.0.62", "@kitajs/html": "^4.2.9", "@kitajs/ts-html-plugin": "^4.1.1", "@langchain/aws": "^0.1.10", "@langchain/community": "^0.3.43", "@langchain/google-genai": "^0.2.9", "@langchain/google-vertexai": "^0.2.9", "@langchain/openai": "^0.5.10", "@modelcontextprotocol/sdk": "^1.11.4", "alpinejs": "^3.14.9", "axios": "^1.9.0", "axios-retry": "^4.5.0", "better-sqlite3": "^11.10.0", "cheerio": "^1.0.0", "commander": "^13.1.0", "dompurify": "^3.2.5", "dotenv": "^16.5.0", "env-paths": "^3.0.0", "fastify": "^5.3.3", "flowbite": "^3.1.2", "fuse.js": "^7.1.0", "header-generator": "^2.1.66", "htmx.org": "^1.9.12", "iconv-lite": "^0.6.3", "jsdom": "^26.1.0", "langchain": "0.3.19", "mime-types": "^3.0.1", "minimatch": "^10.0.1", "playwright": "^1.52.0", "psl": "^1.15.0", "remark": "^15.0.1", "remark-gfm": "^4.0.1", "remark-html": "^16.0.1", "semver": "^7.7.2", "sqlite-vec": "^0.1.7-alpha.2", "turndown": "^7.2.0", "zod": "^3.24.4" };
54
+ const devDependencies = { "@biomejs/biome": "1.9.4", "@commitlint/cli": "^19.8.1", "@commitlint/config-conventional": "^19.8.1", "@semantic-release/changelog": "^6.0.3", "@semantic-release/git": "^10.0.1", "@semantic-release/github": "^11.0.2", "@semantic-release/npm": "^12.0.1", "@tailwindcss/postcss": "^4.1.7", "@tailwindcss/vite": "^4.1.7", "@types/alpinejs": "^3.13.11", "@types/better-sqlite3": "^7.6.13", "@types/jsdom": "~21.1.7", "@types/lint-staged": "~13.3.0", "@types/mime-types": "^2.1.4", "@types/node": "^20.17.47", "@types/node-fetch": "^2.6.12", "@types/psl": "^1.1.3", "@types/semver": "^7.7.0", "@types/turndown": "^5.0.5", "autoprefixer": "^10.4.21", "flowbite-typography": "^1.0.5", "husky": "^9.1.7", "lint-staged": "^15.5.2", "memfs": "^4.17.2", "npm-run-all": "^4.1.5", "postcss": "^8.5.3", "semantic-release": "^24.2.4", "tailwindcss": "^4.1.4", "typescript": "^5.8.3", "vite": "^6.3.5", "vite-node": "^3.1.2", "vite-plugin-dts": "^4.5.4", "vitest": "^3.1.3" };
54
55
  const engines = { "node": ">=20.0.0" };
55
56
  const packageJson = {
56
57
  name,
@@ -372,6 +373,21 @@ class HtmlMetadataExtractorMiddleware {
372
373
  await next();
373
374
  }
374
375
  }
376
+ const DEFAULT_MAX_PAGES$1 = 1e3;
377
+ const DEFAULT_MAX_DEPTH$1 = 3;
378
+ const DEFAULT_MAX_CONCURRENCY = 3;
379
+ const DEFAULT_PROTOCOL = "stdio";
380
+ const DEFAULT_HTTP_PORT = 6280;
381
+ const DEFAULT_WEB_PORT = 6281;
382
+ const DEFAULT_PAGE_TIMEOUT = 5e3;
383
+ const FETCHER_MAX_RETRIES = 6;
384
+ const FETCHER_BASE_DELAY = 1e3;
385
+ const SPLITTER_MIN_CHUNK_SIZE = 500;
386
+ const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
387
+ const SPLITTER_MAX_CHUNK_SIZE = 5e3;
388
+ const EMBEDDING_BATCH_SIZE = 100;
389
+ const MIGRATION_MAX_RETRIES = 5;
390
+ const MIGRATION_RETRY_DELAY_MS = 300;
375
391
  var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
376
392
  ScrapeMode2["Fetch"] = "fetch";
377
393
  ScrapeMode2["Playwright"] = "playwright";
@@ -387,10 +403,15 @@ class HtmlPlaywrightMiddleware {
387
403
  async ensureBrowser() {
388
404
  if (!this.browser || !this.browser.isConnected()) {
389
405
  const launchArgs = process.env.PLAYWRIGHT_LAUNCH_ARGS?.split(" ") ?? [];
406
+ const executablePath = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH || void 0;
390
407
  logger.debug(
391
408
  `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
392
409
  );
393
- this.browser = await chromium.launch({ channel: "chromium", args: launchArgs });
410
+ this.browser = await chromium.launch({
411
+ channel: "chromium",
412
+ args: launchArgs,
413
+ executablePath
414
+ });
394
415
  this.browser.on("disconnected", () => {
395
416
  logger.debug("Playwright browser instance disconnected.");
396
417
  this.browser = null;
@@ -409,12 +430,50 @@ class HtmlPlaywrightMiddleware {
409
430
  this.browser = null;
410
431
  }
411
432
  }
433
+ /**
434
+ * Waits for common loading indicators (spinners, loaders) that are currently visible to disappear from the page.
435
+ * Only waits for selectors that are present and visible at the time of check.
436
+ *
437
+ * @param page The Playwright page instance to operate on.
438
+ */
439
+ async waitForLoadingToComplete(page) {
440
+ const commonLoadingSelectors = [
441
+ '[class*="loading"]',
442
+ '[class*="spinner"]',
443
+ '[class*="loader"]',
444
+ '[id*="loading"]',
445
+ '[class*="preload"]',
446
+ "#loading",
447
+ '[aria-label*="loading" i]',
448
+ '[aria-label*="spinner" i]'
449
+ ];
450
+ const waitPromises = [];
451
+ for (const selector of commonLoadingSelectors) {
452
+ try {
453
+ const isVisible = await page.isVisible(selector).catch(() => false);
454
+ if (isVisible) {
455
+ waitPromises.push(
456
+ page.waitForSelector(selector, {
457
+ state: "hidden",
458
+ timeout: DEFAULT_PAGE_TIMEOUT
459
+ }).catch(() => {
460
+ })
461
+ );
462
+ }
463
+ } catch {
464
+ }
465
+ }
466
+ if (waitPromises.length > 0) {
467
+ await Promise.all(waitPromises);
468
+ }
469
+ }
412
470
  /**
413
471
  * Processes the context using Playwright, rendering dynamic content and propagating credentials for all same-origin requests.
414
472
  *
415
473
  * - Parses credentials from the URL (if present).
416
474
  * - Uses browser.newContext({ httpCredentials }) for HTTP Basic Auth on the main page and subresources.
417
475
  * - Injects Authorization header for all same-origin requests if credentials are present and not already set.
476
+ * - Waits for common loading indicators to disappear before extracting HTML.
418
477
  *
419
478
  * @param context The middleware context containing the HTML and source URL.
420
479
  * @param next The next middleware function in the pipeline.
@@ -447,7 +506,7 @@ class HtmlPlaywrightMiddleware {
447
506
  );
448
507
  }
449
508
  } catch (e) {
450
- logger.warn(`⚠️ Could not parse URL for credential extraction: ${context.source}`);
509
+ logger.warn(`⚠️ Could not parse URL for credential extraction: ${context.source}`);
451
510
  }
452
511
  try {
453
512
  const browser = await this.ensureBrowser();
@@ -492,6 +551,7 @@ class HtmlPlaywrightMiddleware {
492
551
  });
493
552
  await page.goto(context.source, { waitUntil: "load" });
494
553
  await page.waitForSelector("body");
554
+ await this.waitForLoadingToComplete(page);
495
555
  renderedHtml = await page.content();
496
556
  logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
497
557
  } catch (error) {
@@ -515,7 +575,7 @@ class HtmlPlaywrightMiddleware {
515
575
  );
516
576
  } else {
517
577
  logger.warn(
518
- `⚠️ Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
578
+ `⚠️ Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
519
579
  );
520
580
  }
521
581
  await next();
@@ -611,7 +671,7 @@ class HtmlSanitizerMiddleware {
611
671
  }
612
672
  } catch (selectorError) {
613
673
  logger.warn(
614
- `⚠️ Potentially invalid selector "${selector}" during element removal: ${selectorError}`
674
+ `⚠️ Potentially invalid selector "${selector}" during element removal: ${selectorError}`
615
675
  );
616
676
  context.errors.push(
617
677
  new Error(`Invalid selector "${selector}": ${selectorError}`)
@@ -711,7 +771,7 @@ ${text.replace(/^\n+|\n+$/g, "")}
711
771
  const markdown = this.turndownService.turndown(htmlToConvert).trim();
712
772
  if (!markdown) {
713
773
  const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
714
- logger.warn(`⚠️ ${warnMsg}`);
774
+ logger.warn(`⚠️ ${warnMsg}`);
715
775
  context.content = "";
716
776
  } else {
717
777
  context.content = markdown;
@@ -768,11 +828,12 @@ class MarkdownMetadataExtractorMiddleware {
768
828
  }
769
829
  }
770
830
  function convertToString(content, charset) {
771
- if (Buffer.isBuffer(content)) {
772
- const decoder = new TextDecoder(charset || "utf-8");
773
- return decoder.decode(content);
831
+ if (typeof content === "string") return content;
832
+ try {
833
+ return iconv.decode(content, charset || "utf-8");
834
+ } catch {
835
+ return iconv.decode(content, "utf-8");
774
836
  }
775
- return content;
776
837
  }
777
838
  class BasePipeline {
778
839
  /**
@@ -985,13 +1046,13 @@ class FetchUrlTool {
985
1046
  }
986
1047
  if (!processed) {
987
1048
  logger.warn(
988
- `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
1049
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for ${url}. Returning raw content.`
989
1050
  );
990
1051
  const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
991
1052
  return contentString;
992
1053
  }
993
1054
  for (const err of processed.errors) {
994
- logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
1055
+ logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
995
1056
  }
996
1057
  if (typeof processed.textContent !== "string" || !processed.textContent.trim()) {
997
1058
  throw new ToolError(
@@ -1179,20 +1240,6 @@ class RemoveTool {
1179
1240
  }
1180
1241
  }
1181
1242
  }
1182
- const DEFAULT_MAX_PAGES$1 = 1e3;
1183
- const DEFAULT_MAX_DEPTH$1 = 3;
1184
- const DEFAULT_MAX_CONCURRENCY = 3;
1185
- const DEFAULT_PROTOCOL = "stdio";
1186
- const DEFAULT_HTTP_PORT = 6280;
1187
- const DEFAULT_WEB_PORT = 6281;
1188
- const FETCHER_MAX_RETRIES = 6;
1189
- const FETCHER_BASE_DELAY = 1e3;
1190
- const SPLITTER_MIN_CHUNK_SIZE = 500;
1191
- const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
1192
- const SPLITTER_MAX_CHUNK_SIZE = 5e3;
1193
- const EMBEDDING_BATCH_SIZE = 100;
1194
- const MIGRATION_MAX_RETRIES = 5;
1195
- const MIGRATION_RETRY_DELAY_MS = 300;
1196
1243
  class ScrapeTool {
1197
1244
  docService;
1198
1245
  manager;
@@ -1717,7 +1764,7 @@ ${formattedJob}`);
1717
1764
  if (validation.success) {
1718
1765
  statusFilter = validation.data;
1719
1766
  } else {
1720
- logger.warn(`⚠️ Invalid status parameter received: ${statusParam}`);
1767
+ logger.warn(`⚠️ Invalid status parameter received: ${statusParam}`);
1721
1768
  }
1722
1769
  }
1723
1770
  const result = await tools.listJobs.execute({ status: statusFilter });
@@ -1746,7 +1793,7 @@ ${formattedJob}`);
1746
1793
  },
1747
1794
  async (uri, { jobId }) => {
1748
1795
  if (typeof jobId !== "string" || jobId.length === 0) {
1749
- logger.warn(`⚠️ Invalid jobId received in URI: ${jobId}`);
1796
+ logger.warn(`⚠️ Invalid jobId received in URI: ${jobId}`);
1750
1797
  return { contents: [] };
1751
1798
  }
1752
1799
  const result = await tools.getJobInfo.execute({ jobId });
@@ -1959,7 +2006,7 @@ class HttpFetcher {
1959
2006
  if (attempt < maxRetries && (status === void 0 || this.retryableStatusCodes.includes(status))) {
1960
2007
  const delay = baseDelay * 2 ** attempt;
1961
2008
  logger.warn(
1962
- `⚠️ Attempt ${attempt + 1}/${maxRetries + 1} failed for ${source} (Status: ${status}, Code: ${code}). Retrying in ${delay}ms...`
2009
+ `⚠️ Attempt ${attempt + 1}/${maxRetries + 1} failed for ${source} (Status: ${status}, Code: ${code}). Retrying in ${delay}ms...`
1963
2010
  );
1964
2011
  await this.delay(delay);
1965
2012
  continue;
@@ -1981,13 +2028,17 @@ class FileFetcher {
1981
2028
  canFetch(source) {
1982
2029
  return source.startsWith("file://");
1983
2030
  }
2031
+ /**
2032
+ * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed.
2033
+ * Only HTML and Markdown files are processed.
2034
+ */
1984
2035
  async fetch(source, options) {
1985
- const filePath = source.replace(/^file:\/\//, "");
1986
- logger.info(`📄 Fetching file: ${filePath}`);
2036
+ const rawPath = source.replace("file://", "");
2037
+ const filePath = decodeURIComponent(rawPath);
1987
2038
  try {
1988
2039
  const content = await fs.readFile(filePath);
1989
2040
  const ext = path.extname(filePath).toLowerCase();
1990
- const mimeType = this.getMimeType(ext);
2041
+ const mimeType = mime.lookup(ext) || "application/octet-stream";
1991
2042
  return {
1992
2043
  content,
1993
2044
  mimeType,
@@ -2003,19 +2054,6 @@ class FileFetcher {
2003
2054
  );
2004
2055
  }
2005
2056
  }
2006
- getMimeType(ext) {
2007
- switch (ext) {
2008
- case ".html":
2009
- case ".htm":
2010
- return "text/html";
2011
- case ".md":
2012
- return "text/markdown";
2013
- case ".txt":
2014
- return "text/plain";
2015
- default:
2016
- return "application/octet-stream";
2017
- }
2018
- }
2019
2057
  }
2020
2058
  async function initializeTools(docService, pipelineManager) {
2021
2059
  const tools = {
@@ -2072,7 +2110,7 @@ async function stopServer() {
2072
2110
  }
2073
2111
  runningServer = null;
2074
2112
  if (hadError) {
2075
- logger.warn("⚠️ MCP Server instance stopped with errors.");
2113
+ logger.warn("⚠️ MCP Server instance stopped with errors.");
2076
2114
  } else {
2077
2115
  logger.info("✅ MCP Server instance stopped.");
2078
2116
  }
@@ -2184,9 +2222,19 @@ function extractPathAndQuery(url) {
2184
2222
  function shouldIncludeUrl(url, includePatterns, excludePatterns) {
2185
2223
  const path2 = extractPathAndQuery(url);
2186
2224
  const normalizedPath = path2.startsWith("/") ? path2 : `/${path2}`;
2187
- if (matchesAnyPattern(normalizedPath, excludePatterns)) return false;
2225
+ let basename;
2226
+ if (url.startsWith("file://")) {
2227
+ try {
2228
+ const u = new URL(url);
2229
+ basename = u.pathname ? u.pathname.split("/").pop() : void 0;
2230
+ } catch {
2231
+ }
2232
+ }
2233
+ const stripSlash = (patterns) => patterns?.map((p) => p.startsWith("/") ? p.slice(1) : p);
2234
+ if (matchesAnyPattern(normalizedPath, excludePatterns) || basename && matchesAnyPattern(basename, stripSlash(excludePatterns)))
2235
+ return false;
2188
2236
  if (!includePatterns || includePatterns.length === 0) return true;
2189
- return matchesAnyPattern(normalizedPath, includePatterns);
2237
+ return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
2190
2238
  }
2191
2239
  function isInScope(baseUrl, targetUrl, scope) {
2192
2240
  if (baseUrl.protocol !== targetUrl.protocol) return false;
@@ -2385,16 +2433,16 @@ class WebScraperStrategy extends BaseScraperStrategy {
2385
2433
  }
2386
2434
  if (!processed) {
2387
2435
  logger.warn(
2388
- `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
2436
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
2389
2437
  );
2390
2438
  return { document: void 0, links: [] };
2391
2439
  }
2392
2440
  for (const err of processed.errors) {
2393
- logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
2441
+ logger.warn(`⚠️ Processing error for ${url}: ${err.message}`);
2394
2442
  }
2395
2443
  if (!processed.textContent || !processed.textContent.trim()) {
2396
2444
  logger.warn(
2397
- `⚠️ No processable content found for ${url} after pipeline execution.`
2445
+ `⚠️ No processable content found for ${url} after pipeline execution.`
2398
2446
  );
2399
2447
  return { document: void 0, links: processed.links };
2400
2448
  }
@@ -2500,15 +2548,14 @@ class LocalFileStrategy extends BaseScraperStrategy {
2500
2548
  return url.startsWith("file://");
2501
2549
  }
2502
2550
  async processItem(item, options, _progressCallback, _signal) {
2503
- const filePath = item.url.replace(/^file:\/\//, "");
2551
+ const filePath = decodeURIComponent(item.url.replace(/^file:\/\//, ""));
2504
2552
  const stats = await fs.stat(filePath);
2505
2553
  if (stats.isDirectory()) {
2506
2554
  const contents = await fs.readdir(filePath);
2507
- return {
2508
- links: contents.map((name2) => `file://${path.join(filePath, name2)}`)
2509
- };
2555
+ const links = contents.map((name2) => `file://${path.join(filePath, name2)}`).filter((url) => this.shouldProcessUrl(url, options));
2556
+ return { links };
2510
2557
  }
2511
- logger.info(`📄 Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
2558
+ logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
2512
2559
  const rawContent = await this.fileFetcher.fetch(item.url);
2513
2560
  let processed;
2514
2561
  for (const pipeline of this.pipelines) {
@@ -2519,12 +2566,12 @@ class LocalFileStrategy extends BaseScraperStrategy {
2519
2566
  }
2520
2567
  if (!processed) {
2521
2568
  logger.warn(
2522
- `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
2569
+ `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
2523
2570
  );
2524
2571
  return { document: void 0, links: [] };
2525
2572
  }
2526
2573
  for (const err of processed.errors) {
2527
- logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
2574
+ logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`);
2528
2575
  }
2529
2576
  return {
2530
2577
  document: {
@@ -2682,7 +2729,7 @@ class PipelineWorker {
2682
2729
  }
2683
2730
  logger.debug(`[${jobId}] Worker finished job successfully.`);
2684
2731
  } catch (error) {
2685
- logger.warn(`⚠️ [${jobId}] Worker encountered error: ${error}`);
2732
+ logger.warn(`⚠️ [${jobId}] Worker encountered error: ${error}`);
2686
2733
  throw error;
2687
2734
  }
2688
2735
  }
@@ -2719,7 +2766,7 @@ class PipelineManager {
2719
2766
  */
2720
2767
  async start() {
2721
2768
  if (this.isRunning) {
2722
- logger.warn("⚠️ PipelineManager is already running.");
2769
+ logger.warn("⚠️ PipelineManager is already running.");
2723
2770
  return;
2724
2771
  }
2725
2772
  this.isRunning = true;
@@ -2733,7 +2780,7 @@ class PipelineManager {
2733
2780
  */
2734
2781
  async stop() {
2735
2782
  if (!this.isRunning) {
2736
- logger.warn("⚠️ PipelineManager is not running.");
2783
+ logger.warn("⚠️ PipelineManager is not running.");
2737
2784
  return;
2738
2785
  }
2739
2786
  this.isRunning = false;
@@ -2852,7 +2899,7 @@ class PipelineManager {
2852
2899
  case PipelineJobStatus.CANCELLED:
2853
2900
  case PipelineJobStatus.CANCELLING:
2854
2901
  logger.warn(
2855
- `⚠️ Job ${jobId} cannot be cancelled in its current state: ${job.status}`
2902
+ `⚠️ Job ${jobId} cannot be cancelled in its current state: ${job.status}`
2856
2903
  );
2857
2904
  break;
2858
2905
  default:
@@ -3764,7 +3811,7 @@ async function applyMigrations(db) {
3764
3811
  if (error?.code === "SQLITE_BUSY" && retries < MIGRATION_MAX_RETRIES) {
3765
3812
  retries++;
3766
3813
  logger.warn(
3767
- `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
3814
+ `⚠️ Migrations busy (SQLITE_BUSY), retrying attempt ${retries}/${MIGRATION_MAX_RETRIES} in ${MIGRATION_RETRY_DELAY_MS}ms...`
3768
3815
  );
3769
3816
  await new Promise((resolve) => setTimeout(resolve, MIGRATION_RETRY_DELAY_MS));
3770
3817
  } else {
@@ -3943,7 +3990,7 @@ class DocumentStore {
3943
3990
  */
3944
3991
  async initializeEmbeddings() {
3945
3992
  const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
3946
- const { createEmbeddingModel } = await import("./EmbeddingFactory-C6_OpOiy.js");
3993
+ const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
3947
3994
  this.embeddings = createEmbeddingModel(modelSpec);
3948
3995
  const testVector = await this.embeddings.embedQuery("test");
3949
3996
  this.modelDimension = testVector.length;
@@ -4367,7 +4414,7 @@ class DocumentManagementService {
4367
4414
  try {
4368
4415
  fs$1.mkdirSync(dbDir, { recursive: true });
4369
4416
  } catch (error) {
4370
- logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
4417
+ logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
4371
4418
  }
4372
4419
  this.store = new DocumentStore(dbPath);
4373
4420
  this.documentRetriever = new DocumentRetrieverService(this.store);
@@ -4407,7 +4454,7 @@ class DocumentManagementService {
4407
4454
  const versions = await this.listVersions(normalizedLibrary);
4408
4455
  const hasUnversioned = await this.exists(normalizedLibrary, "");
4409
4456
  if (versions.length === 0 && !hasUnversioned) {
4410
- logger.warn(`⚠️ Library '${library}' not found.`);
4457
+ logger.warn(`⚠️ Library '${library}' not found.`);
4411
4458
  const allLibraries = await this.listLibraries();
4412
4459
  const libraryNames = allLibraries.map((lib) => lib.library);
4413
4460
  let suggestions = [];
@@ -4466,7 +4513,7 @@ class DocumentManagementService {
4466
4513
  logger.info(`ℹ️ Unversioned documents exist for ${library}`);
4467
4514
  return { bestMatch: null, hasUnversioned: true };
4468
4515
  }
4469
- logger.warn(`⚠️ No valid versions found for ${library}`);
4516
+ logger.warn(`⚠️ No valid versions found for ${library}`);
4470
4517
  const allLibraryDetails = await this.store.queryLibraryVersions();
4471
4518
  const libraryDetails = allLibraryDetails.get(library) ?? [];
4472
4519
  throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
@@ -4478,7 +4525,7 @@ class DocumentManagementService {
4478
4525
  } else {
4479
4526
  const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
4480
4527
  if (!versionRegex.test(targetVersion)) {
4481
- logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
4528
+ logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
4482
4529
  } else {
4483
4530
  let range = targetVersion;
4484
4531
  if (!semver__default.validRange(targetVersion)) {
@@ -4494,7 +4541,7 @@ class DocumentManagementService {
4494
4541
  `✅ Found best match version ${bestMatch} for ${library}@${targetVersion}`
4495
4542
  );
4496
4543
  } else {
4497
- logger.warn(`⚠️ No matching semver version found for ${library}@${targetVersion}`);
4544
+ logger.warn(`⚠️ No matching semver version found for ${library}@${targetVersion}`);
4498
4545
  }
4499
4546
  if (!bestMatch && !hasUnversioned) {
4500
4547
  const allLibraryDetails = await this.store.queryLibraryVersions();
@@ -4540,7 +4587,7 @@ class DocumentManagementService {
4540
4587
  path: chunk.section.path
4541
4588
  }
4542
4589
  }));
4543
- logger.info(`📄 Split document into ${splitDocs.length} chunks`);
4590
+ logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
4544
4591
  await this.store.addDocuments(library, normalizedVersion, splitDocs);
4545
4592
  }
4546
4593
  /**
@@ -4561,17 +4608,23 @@ class DocumentManagementService {
4561
4608
  }));
4562
4609
  }
4563
4610
  }
4564
- const Layout = ({
4565
- title,
4566
- version: version2 = "1.15.0",
4567
- children
4568
- }) => /* @__PURE__ */ jsxs("html", { lang: "en", children: [
4569
- /* @__PURE__ */ jsxs("head", { children: [
4570
- /* @__PURE__ */ jsx("meta", { charset: "UTF-8" }),
4571
- /* @__PURE__ */ jsx("meta", { name: "viewport", content: "width=device-width, initial-scale=1.0" }),
4572
- /* @__PURE__ */ jsx("title", { safe: true, children: title }),
4573
- /* @__PURE__ */ jsx("link", { rel: "stylesheet", href: "/assets/main.css" }),
4574
- /* @__PURE__ */ jsx("style", { children: `
4611
+ const Layout = ({ title, version: version2, children }) => {
4612
+ let versionString = version2;
4613
+ if (!versionString) {
4614
+ try {
4615
+ const packageJson2 = JSON.parse(readFileSync("package.json", "utf-8"));
4616
+ versionString = packageJson2.version;
4617
+ } catch (error) {
4618
+ console.error("Error reading package.json:", error);
4619
+ }
4620
+ }
4621
+ return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
4622
+ /* @__PURE__ */ jsxs("head", { children: [
4623
+ /* @__PURE__ */ jsx("meta", { charset: "UTF-8" }),
4624
+ /* @__PURE__ */ jsx("meta", { name: "viewport", content: "width=device-width, initial-scale=1.0" }),
4625
+ /* @__PURE__ */ jsx("title", { safe: true, children: title }),
4626
+ /* @__PURE__ */ jsx("link", { rel: "stylesheet", href: "/assets/main.css" }),
4627
+ /* @__PURE__ */ jsx("style", { children: `
4575
4628
  .htmx-indicator {
4576
4629
  display: none;
4577
4630
  }
@@ -4594,29 +4647,30 @@ const Layout = ({
4594
4647
  form .htmx-indicator .search-text { display: none; }
4595
4648
  form .spinner { display: none; }
4596
4649
  ` })
4597
- ] }),
4598
- /* @__PURE__ */ jsxs("body", { class: "bg-gray-50 dark:bg-gray-900", children: [
4599
- /* @__PURE__ */ jsxs("div", { class: "container max-w-2xl mx-auto px-4 py-4", children: [
4600
- /* @__PURE__ */ jsx("header", { class: "mb-4", children: /* @__PURE__ */ jsxs("h1", { class: "text-3xl font-bold text-gray-900 dark:text-white", children: [
4601
- /* @__PURE__ */ jsx("a", { href: "/", children: "MCP Docs" }),
4602
- version2 ? /* @__PURE__ */ jsxs(
4603
- "span",
4604
- {
4605
- safe: true,
4606
- class: "ml-2 text-base font-normal text-gray-500 dark:text-gray-400 align-baseline",
4607
- title: `Version ${version2}`,
4608
- children: [
4609
- "v",
4610
- version2
4611
- ]
4612
- }
4613
- ) : null
4614
- ] }) }),
4615
- /* @__PURE__ */ jsx("main", { children })
4616
4650
  ] }),
4617
- /* @__PURE__ */ jsx("script", { type: "module", src: "/assets/main.js" })
4618
- ] })
4619
- ] });
4651
+ /* @__PURE__ */ jsxs("body", { class: "bg-gray-50 dark:bg-gray-900", children: [
4652
+ /* @__PURE__ */ jsxs("div", { class: "container max-w-2xl mx-auto px-4 py-4", children: [
4653
+ /* @__PURE__ */ jsx("header", { class: "mb-4", children: /* @__PURE__ */ jsxs("h1", { class: "text-3xl font-bold text-gray-900 dark:text-white", children: [
4654
+ /* @__PURE__ */ jsx("a", { href: "/", children: "MCP Docs" }),
4655
+ versionString ? /* @__PURE__ */ jsxs(
4656
+ "span",
4657
+ {
4658
+ safe: true,
4659
+ class: "ml-2 text-base font-normal text-gray-500 dark:text-gray-400 align-baseline",
4660
+ title: `Version ${versionString}`,
4661
+ children: [
4662
+ "v",
4663
+ versionString
4664
+ ]
4665
+ }
4666
+ ) : null
4667
+ ] }) }),
4668
+ /* @__PURE__ */ jsx("main", { children })
4669
+ ] }),
4670
+ /* @__PURE__ */ jsx("script", { type: "module", src: "/assets/main.js" })
4671
+ ] })
4672
+ ] });
4673
+ };
4620
4674
  function registerIndexRoute(server) {
4621
4675
  server.get("/", async (_, reply) => {
4622
4676
  reply.type("text/html");
@@ -4857,7 +4911,25 @@ const ScrapeFormContent = () => /* @__PURE__ */ jsxs("div", { class: "mt-4 p-4 b
4857
4911
  children: "URL"
4858
4912
  }
4859
4913
  ),
4860
- /* @__PURE__ */ jsx(Tooltip, { text: "Enter the URL of the documentation you want to scrape. This will be the starting point for the scraper." })
4914
+ /* @__PURE__ */ jsx(
4915
+ Tooltip,
4916
+ {
4917
+ text: /* @__PURE__ */ jsxs("div", { children: [
4918
+ /* @__PURE__ */ jsx("p", { children: "Enter the URL of the documentation you want to scrape." }),
4919
+ /* @__PURE__ */ jsxs("p", { class: "mt-2", children: [
4920
+ "For local files/folders, you must use the ",
4921
+ /* @__PURE__ */ jsx("code", { children: "file://" }),
4922
+ " ",
4923
+ "prefix and ensure the path is accessible to the server."
4924
+ ] }),
4925
+ /* @__PURE__ */ jsxs("p", { class: "mt-2", children: [
4926
+ "If running in Docker, ",
4927
+ /* @__PURE__ */ jsx("b", { children: "mount the folder" }),
4928
+ " (see README for details)."
4929
+ ] })
4930
+ ] })
4931
+ }
4932
+ )
4861
4933
  ] }),
4862
4934
  /* @__PURE__ */ jsx(
4863
4935
  "input",
@@ -4875,7 +4947,7 @@ const ScrapeFormContent = () => /* @__PURE__ */ jsxs("div", { class: "mt-4 p-4 b
4875
4947
  /* @__PURE__ */ jsx(
4876
4948
  "div",
4877
4949
  {
4878
- "x-show": "hasPath",
4950
+ "x-show": "hasPath && !(url.startsWith('file://'))",
4879
4951
  "x-cloak": true,
4880
4952
  "x-transition:enter": "transition ease-out duration-300",
4881
4953
  "x-transition:enter-start": "opacity-0 transform -translate-y-2",
@@ -5652,19 +5724,27 @@ async function stopWebServer(server) {
5652
5724
  }
5653
5725
  }
5654
5726
  function ensurePlaywrightBrowsersInstalled() {
5727
+ const chromiumEnvPath = process.env.PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH;
5728
+ if (chromiumEnvPath && existsSync(chromiumEnvPath)) {
5729
+ logger.debug(
5730
+ `PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH is set to '${chromiumEnvPath}', skipping Playwright browser install.`
5731
+ );
5732
+ return;
5733
+ }
5655
5734
  try {
5656
- const playwright = require("playwright");
5657
- const chromiumPath = playwright.chromium.executablePath();
5735
+ const chromiumPath = chromium.executablePath();
5658
5736
  if (!chromiumPath || !existsSync(chromiumPath)) {
5659
5737
  throw new Error("Playwright Chromium browser not found");
5660
5738
  }
5661
5739
  } catch (err) {
5662
- console.warn(
5740
+ logger.debug(
5663
5741
  "Playwright browsers not found. Installing Chromium browser for dynamic scraping (this may take a minute)..."
5664
5742
  );
5665
5743
  try {
5666
- execSync("npx -y playwright install --no-shell --with-deps chromium", {
5667
- stdio: "inherit",
5744
+ logger.debug("Installing Playwright Chromium browser...");
5745
+ execSync("npm exec -y playwright install --no-shell --with-deps chromium", {
5746
+ stdio: "ignore",
5747
+ // Suppress output
5668
5748
  cwd: getProjectRoot()
5669
5749
  });
5670
5750
  } catch (installErr) {
@@ -5777,7 +5857,9 @@ async function main() {
5777
5857
  await new Promise(() => {
5778
5858
  });
5779
5859
  });
5780
- program.command("scrape <library> <url>").description("Scrape and index documentation from a URL").option("-v, --version <string>", "Version of the library (optional)").option(
5860
+ program.command("scrape <library> <url>").description(
5861
+ "Scrape and index documentation from a URL or local folder.\n\nTo scrape local files or folders, use a file:// URL.\nExamples:\n scrape mylib https://react.dev/reference/react\n scrape mylib file:///Users/me/docs/index.html\n scrape mylib file:///Users/me/docs/my-library\n\nNote: For local files/folders, you must use the file:// prefix. If running in Docker, mount the folder and use the container path. See README for details."
5862
+ ).option("-v, --version <string>", "Version of the library (optional)").option(
5781
5863
  "-p, --max-pages <number>",
5782
5864
  "Maximum pages to scrape",
5783
5865
  DEFAULT_MAX_PAGES$1.toString()