@arabold/docs-mcp-server 1.8.0 → 1.9.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
package/README.md CHANGED
@@ -289,7 +289,7 @@ docs-cli scrape <library> <url> [options]
 - `-v, --version <string>`: The specific version to associate with the scraped documents.
   - Accepts full versions (`1.2.3`), pre-release versions (`1.2.3-beta.1`), or partial versions (`1`, `1.2` which are expanded to `1.0.0`, `1.2.0`).
   - If omitted, the documentation is indexed as **unversioned**.
-- `-p, --max-pages <number>`: Maximum pages to scrape (default: 100).
+- `-p, --max-pages <number>`: Maximum pages to scrape (default: 1000).
 - `-d, --max-depth <number>`: Maximum navigation depth (default: 3).
 - `-c, --max-concurrency <number>`: Maximum concurrent requests (default: 3).
 - `--ignore-errors`: Ignore errors during scraping (default: true).
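The only documented change here is the tenfold increase in the default page cap: `docs-cli scrape <library> <url>` now indexes up to 1000 pages when `-p, --max-pages` is not given, and an explicit value such as `--max-pages 100` (an illustrative value) restores the old limit. The remaining hunks below come from the package's bundled JavaScript output, the chunk whose source-map reference changes at the end of this diff.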
@@ -100,6 +100,11 @@ var require_extend = __commonJS({
   }
 });

+// src/config.ts
+var DEFAULT_MAX_PAGES = 1e3;
+var DEFAULT_MAX_DEPTH = 3;
+var DEFAULT_MAX_CONCURRENCY = 3;
+
 // src/utils/logger.ts
 var currentLogLevel = 2 /* INFO */;
 function setLogLevel(level) {
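These constants are the bundler's rendering of a new shared config module. A minimal sketch of what `src/config.ts` plausibly contains, reconstructed from the bundle output above (the actual source file is an assumption):

// Reconstructed sketch of src/config.ts, inferred from the bundled
// constants above; not the verified source file.
export const DEFAULT_MAX_PAGES = 1000; // emitted as 1e3 in the bundle
export const DEFAULT_MAX_DEPTH = 3;
export const DEFAULT_MAX_CONCURRENCY = 3;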
@@ -421,9 +426,8 @@ ${cleanedContent}
     );
   }
   const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
-  const titleMatch = htmlContent.match(/<title>([^<]+)<\/title>/i);
-  const title = titleMatch?.[1] || "Untitled";
-  const window = new JSDOM(content3.content, { url: content3.source }).window;
+  const window = new JSDOM(htmlContent, { url: content3.source }).window;
+  const title = window.document.title || "Untitled";
   const purify = createDOMPurify(window);
   const purifiedContent = purify.sanitize(htmlContent, {
     WHOLE_DOCUMENT: true,
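Two fixes in one hunk: JSDOM now receives the decoded `htmlContent` string rather than the raw `content3.content` (which may be a Buffer), and the title is read from the parsed document instead of a regex. A standalone sketch of why the DOM-based lookup is more robust (not the project's code):

import { JSDOM } from "jsdom";

const html = "<html><head><title>Docs &amp; Guides</title></head><body></body></html>";

// Old approach: the regex returns the raw, entity-encoded text.
const titleMatch = html.match(/<title>([^<]+)<\/title>/i);
console.log(titleMatch?.[1]); // "Docs &amp; Guides"

// New approach: the parser decodes entities and tolerates unusual markup.
const { window } = new JSDOM(html);
console.log(window.document.title); // "Docs & Guides"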
@@ -629,8 +633,8 @@ var CancellationError = class extends PipelineError {
 };

 // src/scraper/strategies/BaseScraperStrategy.ts
-var DEFAULT_MAX_PAGES = 100;
-var DEFAULT_MAX_DEPTH = 3;
+var DEFAULT_MAX_PAGES2 = 100;
+var DEFAULT_MAX_DEPTH2 = 3;
 var DEFAULT_CONCURRENCY = 3;
 var BaseScraperStrategy = class {
   visited = /* @__PURE__ */ new Set();
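The `2` suffix is bundler renaming, not a new identifier in the source: with `src/config.ts` now declaring `DEFAULT_MAX_PAGES` and `DEFAULT_MAX_DEPTH` at the chunk's top level, the strategy's same-named module constants are suffixed to avoid a collision (typical esbuild-style behavior; the exact bundler is an assumption). Their values, 100 and 3, are unchanged.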
@@ -651,7 +655,7 @@ var BaseScraperStrategy = class {
       if (signal?.aborted) {
         throw new CancellationError("Scraping cancelled during batch processing");
       }
-      const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
+      const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH2;
       if (item.depth > maxDepth) {
         return [];
       }
@@ -659,7 +663,7 @@ var BaseScraperStrategy = class {
       const result = await this.processItem(item, options, void 0, signal);
       if (result.document) {
         this.pageCount++;
-        const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+        const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
         logger.info(
           `\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
         );
@@ -711,7 +715,7 @@ var BaseScraperStrategy = class {
     const baseUrl = new URL2(options.url);
     const queue = [{ url: options.url, depth: 0 }];
     this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
-    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
     const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
     while (queue.length > 0 && this.pageCount < maxPages) {
       if (signal?.aborted) {
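Together, the three hunks above show the shape of the crawl: a breadth-first queue seeded with the start URL, a visited set keyed on normalized URLs, and caps on both page count and depth. A simplified, standalone sketch of that loop (the real `BaseScraperStrategy` adds batching, concurrency, and cancellation):

type QueueItem = { url: string; depth: number };

// extractLinks is a hypothetical stand-in for the strategy's processItem().
async function crawl(
  startUrl: string,
  extractLinks: (url: string) => Promise<string[]>,
  maxPages = 1000,
  maxDepth = 3,
): Promise<string[]> {
  const visited = new Set<string>([startUrl]);
  const queue: QueueItem[] = [{ url: startUrl, depth: 0 }];
  const pages: string[] = [];
  while (queue.length > 0 && pages.length < maxPages) {
    const item = queue.shift()!;
    if (item.depth > maxDepth) continue; // too deep: skip, but keep draining
    pages.push(item.url);
    for (const link of await extractLinks(item.url)) {
      if (!visited.has(link)) {
        visited.add(link); // mark before enqueueing to avoid duplicates
        queue.push({ url: link, depth: item.depth + 1 });
      }
    }
  }
  return pages;
}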
@@ -1489,9 +1493,9 @@ var ScrapeTool = class {
       version: internalVersion,
       scope: scraperOptions?.scope ?? "subpages",
       followRedirects: scraperOptions?.followRedirects ?? true,
-      maxPages: scraperOptions?.maxPages ?? 100,
-      maxDepth: scraperOptions?.maxDepth ?? 3,
-      // maxConcurrency is handled by the manager itself now
+      maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
+      maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
+      maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
       ignoreErrors: scraperOptions?.ignoreErrors ?? true
     });
     logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
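`ScrapeTool` thus stops hard-coding `100`/`3` in favor of the shared constants, and starts forwarding `maxConcurrency` instead of leaving it to the manager. With `??`, an explicitly passed option always wins over the default; an illustrative resolution (values hypothetical):

const scraperOptions: { maxPages?: number; maxDepth?: number } = { maxDepth: 5 };
const maxPages = scraperOptions?.maxPages ?? 1000; // 1000: falls back to DEFAULT_MAX_PAGES
const maxDepth = scraperOptions?.maxDepth ?? 3;    // 5: the explicit option wins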
@@ -11566,6 +11570,9 @@ var DocumentManagementService = class {
 };

 export {
+  DEFAULT_MAX_PAGES,
+  DEFAULT_MAX_DEPTH,
+  DEFAULT_MAX_CONCURRENCY,
   setLogLevel,
   logger,
   HttpFetcher,
@@ -11585,4 +11592,4 @@ export {
   SearchTool,
   DocumentManagementService
 };
-//# sourceMappingURL=chunk-ADZQJG2M.js.map
+//# sourceMappingURL=chunk-A5FW7XVC.js.map