@arabold/docs-mcp-server 1.19.0 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +94 -54
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -41,7 +41,7 @@ import "better-sqlite3";
|
|
|
41
41
|
import "sqlite-vec";
|
|
42
42
|
import { execSync } from "node:child_process";
|
|
43
43
|
import { v4 } from "uuid";
|
|
44
|
-
import
|
|
44
|
+
import "psl";
|
|
45
45
|
import { minimatch } from "minimatch";
|
|
46
46
|
const LogLevel = {
|
|
47
47
|
ERROR: 0,
|
|
@@ -101,7 +101,7 @@ const logger = {
|
|
|
101
101
|
}
|
|
102
102
|
}
|
|
103
103
|
};
|
|
104
|
-
const version = "1.
|
|
104
|
+
const version = "1.19.0";
|
|
105
105
|
const packageJson = {
|
|
106
106
|
version
|
|
107
107
|
};
|
|
@@ -328,14 +328,43 @@ class HtmlLinkExtractorMiddleware {
|
|
|
328
328
|
return;
|
|
329
329
|
}
|
|
330
330
|
try {
|
|
331
|
+
let docBase = context.source;
|
|
332
|
+
try {
|
|
333
|
+
const baseEl = $("base[href]").first();
|
|
334
|
+
const rawBase = baseEl.attr("href");
|
|
335
|
+
if (rawBase && rawBase.trim() !== "") {
|
|
336
|
+
try {
|
|
337
|
+
const trimmed = rawBase.trim();
|
|
338
|
+
const candidate = new URL(trimmed, context.source);
|
|
339
|
+
const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(trimmed);
|
|
340
|
+
const protocolRelative = trimmed.startsWith("//");
|
|
341
|
+
const firstSlash = trimmed.indexOf("/");
|
|
342
|
+
const firstColon = trimmed.indexOf(":");
|
|
343
|
+
const colonBeforeSlash = firstColon !== -1 && (firstSlash === -1 || firstColon < firstSlash);
|
|
344
|
+
const suspiciousColon = colonBeforeSlash && !hasScheme && !protocolRelative;
|
|
345
|
+
if (suspiciousColon || trimmed.startsWith(":")) {
|
|
346
|
+
logger.debug(
|
|
347
|
+
`Ignoring suspicious <base href> value (colon misuse): ${rawBase}`
|
|
348
|
+
);
|
|
349
|
+
} else {
|
|
350
|
+
docBase = candidate.href;
|
|
351
|
+
}
|
|
352
|
+
} catch {
|
|
353
|
+
logger.debug(`Ignoring invalid <base href> value: ${rawBase}`);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
} catch {
|
|
357
|
+
}
|
|
331
358
|
const linkElements = $("a[href]");
|
|
332
|
-
logger.debug(
|
|
359
|
+
logger.debug(
|
|
360
|
+
`Found ${linkElements.length} potential links in ${context.source} (base=${docBase})`
|
|
361
|
+
);
|
|
333
362
|
const extractedLinks = [];
|
|
334
363
|
linkElements.each((_index, element) => {
|
|
335
364
|
const href = $(element).attr("href");
|
|
336
365
|
if (href && href.trim() !== "") {
|
|
337
366
|
try {
|
|
338
|
-
const urlObj = new URL(href,
|
|
367
|
+
const urlObj = new URL(href, docBase);
|
|
339
368
|
if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
|
|
340
369
|
logger.debug(`Ignoring link with invalid protocol: ${href}`);
|
|
341
370
|
return;
|
|
@@ -2094,12 +2123,18 @@ class HttpFetcher {
|
|
|
2094
2123
|
} else {
|
|
2095
2124
|
content = Buffer.from(response.data);
|
|
2096
2125
|
}
|
|
2126
|
+
const finalUrl = (
|
|
2127
|
+
// Node follow-redirects style
|
|
2128
|
+
response.request?.res?.responseUrl || // Some adapters may expose directly
|
|
2129
|
+
response.request?.responseUrl || // Fallback to axios recorded config URL
|
|
2130
|
+
response.config?.url || source
|
|
2131
|
+
);
|
|
2097
2132
|
return {
|
|
2098
2133
|
content,
|
|
2099
2134
|
mimeType,
|
|
2100
2135
|
charset,
|
|
2101
2136
|
encoding: contentEncoding,
|
|
2102
|
-
source
|
|
2137
|
+
source: finalUrl
|
|
2103
2138
|
};
|
|
2104
2139
|
} catch (error) {
|
|
2105
2140
|
const axiosError = error;
|
|
@@ -4459,17 +4494,33 @@ function validateUrl(url) {
|
|
|
4459
4494
|
throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
|
|
4460
4495
|
}
|
|
4461
4496
|
}
|
|
4462
|
-
function
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4466
|
-
const
|
|
4467
|
-
|
|
4468
|
-
|
|
4497
|
+
/**
 * Derives the directory prefix used for "subpages" scope checks from a URL pathname.
 *
 * - ""                      -> "/"
 * - path ending in "/"      -> returned unchanged
 * - last segment is a file  -> the file name is dropped ("/docs/index.html" -> "/docs/")
 * - anything else           -> treated as a directory, "/" is appended
 *
 * A segment only counts as a file when it ends in an extension-like suffix:
 * a dot followed by alphanumerics containing at least one letter (".html", ".mp3").
 * A bare "contains a dot" test would also match version-style directories such as
 * "/en/5.0" or "/lib/v1.2.3" and dot-directories such as "/.well-known", which
 * silently widens the scope to the parent directory.
 *
 * @param {string} pathname - The `pathname` component of a URL.
 * @returns {string} A directory path that always ends with "/".
 */
function computeBaseDirectory(pathname) {
  if (pathname === "") return "/";
  if (pathname.endsWith("/")) return pathname;
  const lastSegment = pathname.split("/").at(-1) || "";
  // Extension-like suffix: final ".xyz" made of alphanumerics with at least one letter.
  const looksLikeFile = /\.[a-z0-9]*[a-z][a-z0-9]*$/i.test(lastSegment);
  if (looksLikeFile) {
    // Drop the trailing file name, keep the containing directory.
    return pathname.replace(/\/[^/]*$/, "/");
  }
  return `${pathname}/`;
}
|
|
4470
|
-
function
|
|
4471
|
-
|
|
4472
|
-
|
|
4507
|
+
/**
 * Decides whether `targetUrl` lies inside the crawl scope anchored at `baseUrl`.
 *
 * URLs with different protocols are never in scope. Otherwise:
 * - "subpages": same hostname and the target path starts with the base directory
 *               computed by `computeBaseDirectory(baseUrl.pathname)`.
 * - "hostname": same hostname.
 * - "domain":   same registrable domain, approximated by the last two labels.
 * - any other value: not in scope.
 *
 * @param {URL} baseUrl - URL the scope is anchored to.
 * @param {URL} targetUrl - Candidate URL to test.
 * @param {string} scope - "subpages", "hostname" or "domain".
 * @returns {boolean} True when the target is inside the requested scope.
 */
function isInScope(baseUrl, targetUrl, scope) {
  if (baseUrl.protocol !== targetUrl.protocol) return false;
  switch (scope) {
    case "subpages": {
      if (baseUrl.hostname !== targetUrl.hostname) return false;
      const baseDir = computeBaseDirectory(baseUrl.pathname);
      return targetUrl.pathname.startsWith(baseDir);
    }
    case "hostname":
      return baseUrl.hostname === targetUrl.hostname;
    case "domain": {
      const getDomain = (host) => {
        // A trailing root dot ("example.com.") would otherwise yield "com." for every host.
        const name = host.endsWith(".") ? host.slice(0, -1) : host;
        // IP literals have no registrable domain: taking the last two octets would
        // make unrelated addresses (192.168.1.1 vs 10.0.1.1) look like the same site.
        const isIpLiteral = /^\d{1,3}(?:\.\d{1,3}){3}$/.test(name) || name.includes(":");
        if (isIpLiteral) return name;
        // NOTE(review): last-two-labels is only an approximation; multi-label public
        // suffixes such as "co.uk" still collapse to the suffix itself. A public-suffix
        // lookup would be needed to handle those.
        return name.split(".").slice(-2).join(".");
      };
      return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
    }
    default:
      return false;
  }
}
|
|
4474
4525
|
function isRegexPattern(pattern) {
|
|
4475
4526
|
return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
|
|
@@ -4517,24 +4568,6 @@ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
|
|
|
4517
4568
|
if (!includePatterns || includePatterns.length === 0) return true;
|
|
4518
4569
|
return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
|
|
4519
4570
|
}
|
|
4520
|
-
function isInScope(baseUrl, targetUrl, scope) {
|
|
4521
|
-
if (baseUrl.protocol !== targetUrl.protocol) return false;
|
|
4522
|
-
switch (scope) {
|
|
4523
|
-
case "subpages": {
|
|
4524
|
-
if (baseUrl.hostname !== targetUrl.hostname) return false;
|
|
4525
|
-
const baseDir = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : baseUrl.pathname.replace(/\/[^/]*$/, "/");
|
|
4526
|
-
return targetUrl.pathname.startsWith(baseDir);
|
|
4527
|
-
}
|
|
4528
|
-
case "hostname":
|
|
4529
|
-
return baseUrl.hostname === targetUrl.hostname;
|
|
4530
|
-
case "domain": {
|
|
4531
|
-
const getDomain = (host) => host.split(".").slice(-2).join(".");
|
|
4532
|
-
return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
|
|
4533
|
-
}
|
|
4534
|
-
default:
|
|
4535
|
-
return false;
|
|
4536
|
-
}
|
|
4537
|
-
}
|
|
4538
4571
|
const DEFAULT_MAX_DEPTH = 3;
|
|
4539
4572
|
const DEFAULT_CONCURRENCY = 3;
|
|
4540
4573
|
class BaseScraperStrategy {
|
|
@@ -4543,6 +4576,8 @@ class BaseScraperStrategy {
|
|
|
4543
4576
|
totalDiscovered = 0;
|
|
4544
4577
|
// Track total URLs discovered (unlimited)
|
|
4545
4578
|
effectiveTotal = 0;
|
|
4579
|
+
// Track effective total (limited by maxPages)
|
|
4580
|
+
canonicalBaseUrl;
|
|
4546
4581
|
options;
|
|
4547
4582
|
constructor(options = {}) {
|
|
4548
4583
|
this.options = options;
|
|
@@ -4554,7 +4589,7 @@ class BaseScraperStrategy {
|
|
|
4554
4589
|
shouldProcessUrl(url, options) {
|
|
4555
4590
|
if (options.scope) {
|
|
4556
4591
|
try {
|
|
4557
|
-
const base = new URL$1(options.url);
|
|
4592
|
+
const base = this.canonicalBaseUrl ?? new URL$1(options.url);
|
|
4558
4593
|
const target = new URL$1(url);
|
|
4559
4594
|
if (!isInScope(base, target, options.scope)) return false;
|
|
4560
4595
|
} catch {
|
|
@@ -4577,6 +4612,23 @@ class BaseScraperStrategy {
|
|
|
4577
4612
|
}
|
|
4578
4613
|
try {
|
|
4579
4614
|
const result = await this.processItem(item, options, void 0, signal);
|
|
4615
|
+
if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
|
|
4616
|
+
try {
|
|
4617
|
+
const finalUrlStr = result.finalUrl;
|
|
4618
|
+
const original = new URL$1(options.url);
|
|
4619
|
+
const finalUrlObj = new URL$1(finalUrlStr);
|
|
4620
|
+
if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
|
|
4621
|
+
this.canonicalBaseUrl = finalUrlObj;
|
|
4622
|
+
logger.debug(
|
|
4623
|
+
`Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
|
|
4624
|
+
);
|
|
4625
|
+
} else {
|
|
4626
|
+
this.canonicalBaseUrl = original;
|
|
4627
|
+
}
|
|
4628
|
+
} catch {
|
|
4629
|
+
this.canonicalBaseUrl = new URL$1(options.url);
|
|
4630
|
+
}
|
|
4631
|
+
}
|
|
4580
4632
|
if (result.document) {
|
|
4581
4633
|
this.pageCount++;
|
|
4582
4634
|
logger.info(
|
|
@@ -4637,7 +4689,8 @@ class BaseScraperStrategy {
|
|
|
4637
4689
|
this.pageCount = 0;
|
|
4638
4690
|
this.totalDiscovered = 1;
|
|
4639
4691
|
this.effectiveTotal = 1;
|
|
4640
|
-
|
|
4692
|
+
this.canonicalBaseUrl = new URL$1(options.url);
|
|
4693
|
+
let baseUrl = this.canonicalBaseUrl;
|
|
4641
4694
|
const queue = [{ url: options.url, depth: 0 }];
|
|
4642
4695
|
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
4643
4696
|
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
@@ -4658,6 +4711,7 @@ class BaseScraperStrategy {
|
|
|
4658
4711
|
queue.length
|
|
4659
4712
|
);
|
|
4660
4713
|
const batch = queue.splice(0, batchSize);
|
|
4714
|
+
baseUrl = this.canonicalBaseUrl ?? baseUrl;
|
|
4661
4715
|
const newUrls = await this.processBatch(
|
|
4662
4716
|
batch,
|
|
4663
4717
|
baseUrl,
|
|
@@ -4690,22 +4744,7 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
4690
4744
|
return false;
|
|
4691
4745
|
}
|
|
4692
4746
|
}
|
|
4693
|
-
|
|
4694
|
-
* Determines if a target URL should be followed based on the scope setting.
|
|
4695
|
-
*/
|
|
4696
|
-
isInScope(baseUrl, targetUrl, scope) {
|
|
4697
|
-
try {
|
|
4698
|
-
if (scope === "domain") {
|
|
4699
|
-
return hasSameDomain(baseUrl, targetUrl);
|
|
4700
|
-
}
|
|
4701
|
-
if (scope === "hostname") {
|
|
4702
|
-
return hasSameHostname(baseUrl, targetUrl);
|
|
4703
|
-
}
|
|
4704
|
-
return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
|
|
4705
|
-
} catch {
|
|
4706
|
-
return false;
|
|
4707
|
-
}
|
|
4708
|
-
}
|
|
4747
|
+
// Removed custom isInScope logic; using shared scope utility for consistent behavior
|
|
4709
4748
|
/**
|
|
4710
4749
|
* Processes a single queue item by fetching its content and processing it through pipelines.
|
|
4711
4750
|
* @param item - The queue item to process.
|
|
@@ -4746,12 +4785,12 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
4746
4785
|
);
|
|
4747
4786
|
return { document: void 0, links: processed.links };
|
|
4748
4787
|
}
|
|
4749
|
-
const baseUrl = new URL(options.url);
|
|
4788
|
+
const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
|
|
4750
4789
|
const filteredLinks = processed.links.filter((link) => {
|
|
4751
4790
|
try {
|
|
4752
4791
|
const targetUrl = new URL(link);
|
|
4753
4792
|
const scope = options.scope || "subpages";
|
|
4754
|
-
return
|
|
4793
|
+
return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
4755
4794
|
} catch {
|
|
4756
4795
|
return false;
|
|
4757
4796
|
}
|
|
@@ -4767,7 +4806,8 @@ class WebScraperStrategy extends BaseScraperStrategy {
|
|
|
4767
4806
|
...processed.metadata
|
|
4768
4807
|
}
|
|
4769
4808
|
},
|
|
4770
|
-
links: filteredLinks
|
|
4809
|
+
links: filteredLinks,
|
|
4810
|
+
finalUrl: rawContent.source
|
|
4771
4811
|
};
|
|
4772
4812
|
} catch (error) {
|
|
4773
4813
|
logger.error(`❌ Failed processing page ${url}: ${error}`);
|