@arabold/docs-mcp-server 1.8.0 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-ADZQJG2M.js → chunk-A5FW7XVC.js} +19 -12
- package/dist/chunk-A5FW7XVC.js.map +1 -0
- package/dist/cli.js +18 -3
- package/dist/cli.js.map +1 -1
- package/dist/server.js +5 -3
- package/dist/server.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-ADZQJG2M.js.map +0 -1
package/README.md
CHANGED
|
@@ -289,7 +289,7 @@ docs-cli scrape <library> <url> [options]
|
|
|
289
289
|
- `-v, --version <string>`: The specific version to associate with the scraped documents.
|
|
290
290
|
- Accepts full versions (`1.2.3`), pre-release versions (`1.2.3-beta.1`), or partial versions (`1`, `1.2` which are expanded to `1.0.0`, `1.2.0`).
|
|
291
291
|
- If omitted, the documentation is indexed as **unversioned**.
|
|
292
|
-
- `-p, --max-pages <number>`: Maximum pages to scrape (default:
|
|
292
|
+
- `-p, --max-pages <number>`: Maximum pages to scrape (default: 1000).
|
|
293
293
|
- `-d, --max-depth <number>`: Maximum navigation depth (default: 3).
|
|
294
294
|
- `-c, --max-concurrency <number>`: Maximum concurrent requests (default: 3).
|
|
295
295
|
- `--ignore-errors`: Ignore errors during scraping (default: true).
|
|
@@ -100,6 +100,11 @@ var require_extend = __commonJS({
|
|
|
100
100
|
}
|
|
101
101
|
});
|
|
102
102
|
|
|
103
|
+
// src/config.ts
|
|
104
|
+
var DEFAULT_MAX_PAGES = 1e3;
|
|
105
|
+
var DEFAULT_MAX_DEPTH = 3;
|
|
106
|
+
var DEFAULT_MAX_CONCURRENCY = 3;
|
|
107
|
+
|
|
103
108
|
// src/utils/logger.ts
|
|
104
109
|
var currentLogLevel = 2 /* INFO */;
|
|
105
110
|
function setLogLevel(level) {
|
|
@@ -421,9 +426,8 @@ ${cleanedContent}
|
|
|
421
426
|
);
|
|
422
427
|
}
|
|
423
428
|
const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
|
|
424
|
-
const
|
|
425
|
-
const title =
|
|
426
|
-
const window = new JSDOM(content3.content, { url: content3.source }).window;
|
|
429
|
+
const window = new JSDOM(htmlContent, { url: content3.source }).window;
|
|
430
|
+
const title = window.document.title || "Untitled";
|
|
427
431
|
const purify = createDOMPurify(window);
|
|
428
432
|
const purifiedContent = purify.sanitize(htmlContent, {
|
|
429
433
|
WHOLE_DOCUMENT: true,
|
|
@@ -629,8 +633,8 @@ var CancellationError = class extends PipelineError {
|
|
|
629
633
|
};
|
|
630
634
|
|
|
631
635
|
// src/scraper/strategies/BaseScraperStrategy.ts
|
|
632
|
-
var
|
|
633
|
-
var
|
|
636
|
+
var DEFAULT_MAX_PAGES2 = 100;
|
|
637
|
+
var DEFAULT_MAX_DEPTH2 = 3;
|
|
634
638
|
var DEFAULT_CONCURRENCY = 3;
|
|
635
639
|
var BaseScraperStrategy = class {
|
|
636
640
|
visited = /* @__PURE__ */ new Set();
|
|
@@ -651,7 +655,7 @@ var BaseScraperStrategy = class {
|
|
|
651
655
|
if (signal?.aborted) {
|
|
652
656
|
throw new CancellationError("Scraping cancelled during batch processing");
|
|
653
657
|
}
|
|
654
|
-
const maxDepth = options.maxDepth ??
|
|
658
|
+
const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH2;
|
|
655
659
|
if (item.depth > maxDepth) {
|
|
656
660
|
return [];
|
|
657
661
|
}
|
|
@@ -659,7 +663,7 @@ var BaseScraperStrategy = class {
|
|
|
659
663
|
const result = await this.processItem(item, options, void 0, signal);
|
|
660
664
|
if (result.document) {
|
|
661
665
|
this.pageCount++;
|
|
662
|
-
const maxPages = options.maxPages ??
|
|
666
|
+
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
|
|
663
667
|
logger.info(
|
|
664
668
|
`\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
|
|
665
669
|
);
|
|
@@ -711,7 +715,7 @@ var BaseScraperStrategy = class {
|
|
|
711
715
|
const baseUrl = new URL2(options.url);
|
|
712
716
|
const queue = [{ url: options.url, depth: 0 }];
|
|
713
717
|
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
714
|
-
const maxPages = options.maxPages ??
|
|
718
|
+
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
|
|
715
719
|
const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
|
|
716
720
|
while (queue.length > 0 && this.pageCount < maxPages) {
|
|
717
721
|
if (signal?.aborted) {
|
|
@@ -1489,9 +1493,9 @@ var ScrapeTool = class {
|
|
|
1489
1493
|
version: internalVersion,
|
|
1490
1494
|
scope: scraperOptions?.scope ?? "subpages",
|
|
1491
1495
|
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
1492
|
-
maxPages: scraperOptions?.maxPages ??
|
|
1493
|
-
maxDepth: scraperOptions?.maxDepth ??
|
|
1494
|
-
|
|
1496
|
+
maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
|
|
1497
|
+
maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
|
|
1498
|
+
maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
|
|
1495
1499
|
ignoreErrors: scraperOptions?.ignoreErrors ?? true
|
|
1496
1500
|
});
|
|
1497
1501
|
logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
|
|
@@ -11566,6 +11570,9 @@ var DocumentManagementService = class {
|
|
|
11566
11570
|
};
|
|
11567
11571
|
|
|
11568
11572
|
export {
|
|
11573
|
+
DEFAULT_MAX_PAGES,
|
|
11574
|
+
DEFAULT_MAX_DEPTH,
|
|
11575
|
+
DEFAULT_MAX_CONCURRENCY,
|
|
11569
11576
|
setLogLevel,
|
|
11570
11577
|
logger,
|
|
11571
11578
|
HttpFetcher,
|
|
@@ -11585,4 +11592,4 @@ export {
|
|
|
11585
11592
|
SearchTool,
|
|
11586
11593
|
DocumentManagementService
|
|
11587
11594
|
};
|
|
11588
|
-
//# sourceMappingURL=chunk-ADZQJG2M.js.map
|
|
11595
|
+
//# sourceMappingURL=chunk-A5FW7XVC.js.map
|