@arabold/docs-mcp-server 1.19.0 → 1.20.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -41,7 +41,7 @@ import "better-sqlite3";
41
41
  import "sqlite-vec";
42
42
  import { execSync } from "node:child_process";
43
43
  import { v4 } from "uuid";
44
- import psl from "psl";
44
+ import "psl";
45
45
  import { minimatch } from "minimatch";
46
46
  const LogLevel = {
47
47
  ERROR: 0,
@@ -101,7 +101,7 @@ const logger = {
101
101
  }
102
102
  }
103
103
  };
104
- const version = "1.18.0";
104
+ const version = "1.19.0";
105
105
  const packageJson = {
106
106
  version
107
107
  };
@@ -328,14 +328,43 @@ class HtmlLinkExtractorMiddleware {
328
328
  return;
329
329
  }
330
330
  try {
331
+ let docBase = context.source;
332
+ try {
333
+ const baseEl = $("base[href]").first();
334
+ const rawBase = baseEl.attr("href");
335
+ if (rawBase && rawBase.trim() !== "") {
336
+ try {
337
+ const trimmed = rawBase.trim();
338
+ const candidate = new URL(trimmed, context.source);
339
+ const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(trimmed);
340
+ const protocolRelative = trimmed.startsWith("//");
341
+ const firstSlash = trimmed.indexOf("/");
342
+ const firstColon = trimmed.indexOf(":");
343
+ const colonBeforeSlash = firstColon !== -1 && (firstSlash === -1 || firstColon < firstSlash);
344
+ const suspiciousColon = colonBeforeSlash && !hasScheme && !protocolRelative;
345
+ if (suspiciousColon || trimmed.startsWith(":")) {
346
+ logger.debug(
347
+ `Ignoring suspicious <base href> value (colon misuse): ${rawBase}`
348
+ );
349
+ } else {
350
+ docBase = candidate.href;
351
+ }
352
+ } catch {
353
+ logger.debug(`Ignoring invalid <base href> value: ${rawBase}`);
354
+ }
355
+ }
356
+ } catch {
357
+ }
331
358
  const linkElements = $("a[href]");
332
- logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
359
+ logger.debug(
360
+ `Found ${linkElements.length} potential links in ${context.source} (base=${docBase})`
361
+ );
333
362
  const extractedLinks = [];
334
363
  linkElements.each((_index, element) => {
335
364
  const href = $(element).attr("href");
336
365
  if (href && href.trim() !== "") {
337
366
  try {
338
- const urlObj = new URL(href, context.source);
367
+ const urlObj = new URL(href, docBase);
339
368
  if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
340
369
  logger.debug(`Ignoring link with invalid protocol: ${href}`);
341
370
  return;
@@ -2094,12 +2123,18 @@ class HttpFetcher {
2094
2123
  } else {
2095
2124
  content = Buffer.from(response.data);
2096
2125
  }
2126
+ const finalUrl = (
2127
+ // Node follow-redirects style
2128
+ response.request?.res?.responseUrl || // Some adapters may expose directly
2129
+ response.request?.responseUrl || // Fallback to axios recorded config URL
2130
+ response.config?.url || source
2131
+ );
2097
2132
  return {
2098
2133
  content,
2099
2134
  mimeType,
2100
2135
  charset,
2101
2136
  encoding: contentEncoding,
2102
- source
2137
+ source: finalUrl
2103
2138
  };
2104
2139
  } catch (error) {
2105
2140
  const axiosError = error;
@@ -4459,17 +4494,33 @@ function validateUrl(url) {
4459
4494
  throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
4460
4495
  }
4461
4496
  }
4462
- function hasSameHostname(urlA, urlB) {
4463
- return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase();
4464
- }
4465
- function hasSameDomain(urlA, urlB) {
4466
- const domainA = psl.get(urlA.hostname.toLowerCase());
4467
- const domainB = psl.get(urlB.hostname.toLowerCase());
4468
- return domainA !== null && domainA === domainB;
4497
+ function computeBaseDirectory(pathname) {
4498
+ if (pathname === "") return "/";
4499
+ if (pathname.endsWith("/")) return pathname;
4500
+ const lastSegment = pathname.split("/").at(-1) || "";
4501
+ const looksLikeFile = lastSegment.includes(".");
4502
+ if (looksLikeFile) {
4503
+ return pathname.replace(/\/[^/]*$/, "/");
4504
+ }
4505
+ return `${pathname}/`;
4469
4506
  }
4470
- function isSubpath(baseUrl, targetUrl) {
4471
- const basePath = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : `${baseUrl.pathname}/`;
4472
- return targetUrl.pathname.startsWith(basePath);
4507
+ function isInScope(baseUrl, targetUrl, scope) {
4508
+ if (baseUrl.protocol !== targetUrl.protocol) return false;
4509
+ switch (scope) {
4510
+ case "subpages": {
4511
+ if (baseUrl.hostname !== targetUrl.hostname) return false;
4512
+ const baseDir = computeBaseDirectory(baseUrl.pathname);
4513
+ return targetUrl.pathname.startsWith(baseDir);
4514
+ }
4515
+ case "hostname":
4516
+ return baseUrl.hostname === targetUrl.hostname;
4517
+ case "domain": {
4518
+ const getDomain = (host) => host.split(".").slice(-2).join(".");
4519
+ return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
4520
+ }
4521
+ default:
4522
+ return false;
4523
+ }
4473
4524
  }
4474
4525
  function isRegexPattern(pattern) {
4475
4526
  return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/");
@@ -4517,24 +4568,6 @@ function shouldIncludeUrl(url, includePatterns, excludePatterns) {
4517
4568
  if (!includePatterns || includePatterns.length === 0) return true;
4518
4569
  return matchesAnyPattern(normalizedPath, includePatterns) || (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false);
4519
4570
  }
4520
- function isInScope(baseUrl, targetUrl, scope) {
4521
- if (baseUrl.protocol !== targetUrl.protocol) return false;
4522
- switch (scope) {
4523
- case "subpages": {
4524
- if (baseUrl.hostname !== targetUrl.hostname) return false;
4525
- const baseDir = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : baseUrl.pathname.replace(/\/[^/]*$/, "/");
4526
- return targetUrl.pathname.startsWith(baseDir);
4527
- }
4528
- case "hostname":
4529
- return baseUrl.hostname === targetUrl.hostname;
4530
- case "domain": {
4531
- const getDomain = (host) => host.split(".").slice(-2).join(".");
4532
- return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname);
4533
- }
4534
- default:
4535
- return false;
4536
- }
4537
- }
4538
4571
  const DEFAULT_MAX_DEPTH = 3;
4539
4572
  const DEFAULT_CONCURRENCY = 3;
4540
4573
  class BaseScraperStrategy {
@@ -4543,6 +4576,8 @@ class BaseScraperStrategy {
4543
4576
  totalDiscovered = 0;
4544
4577
  // Track total URLs discovered (unlimited)
4545
4578
  effectiveTotal = 0;
4579
+ // Track effective total (limited by maxPages)
4580
+ canonicalBaseUrl;
4546
4581
  options;
4547
4582
  constructor(options = {}) {
4548
4583
  this.options = options;
@@ -4554,7 +4589,7 @@ class BaseScraperStrategy {
4554
4589
  shouldProcessUrl(url, options) {
4555
4590
  if (options.scope) {
4556
4591
  try {
4557
- const base = new URL$1(options.url);
4592
+ const base = this.canonicalBaseUrl ?? new URL$1(options.url);
4558
4593
  const target = new URL$1(url);
4559
4594
  if (!isInScope(base, target, options.scope)) return false;
4560
4595
  } catch {
@@ -4577,6 +4612,23 @@ class BaseScraperStrategy {
4577
4612
  }
4578
4613
  try {
4579
4614
  const result = await this.processItem(item, options, void 0, signal);
4615
+ if (item.depth === 0 && !this.canonicalBaseUrl && result?.finalUrl) {
4616
+ try {
4617
+ const finalUrlStr = result.finalUrl;
4618
+ const original = new URL$1(options.url);
4619
+ const finalUrlObj = new URL$1(finalUrlStr);
4620
+ if (finalUrlObj.href !== original.href && (finalUrlObj.protocol === "http:" || finalUrlObj.protocol === "https:")) {
4621
+ this.canonicalBaseUrl = finalUrlObj;
4622
+ logger.debug(
4623
+ `Updated scope base after redirect: ${original.href} -> ${finalUrlObj.href}`
4624
+ );
4625
+ } else {
4626
+ this.canonicalBaseUrl = original;
4627
+ }
4628
+ } catch {
4629
+ this.canonicalBaseUrl = new URL$1(options.url);
4630
+ }
4631
+ }
4580
4632
  if (result.document) {
4581
4633
  this.pageCount++;
4582
4634
  logger.info(
@@ -4637,7 +4689,8 @@ class BaseScraperStrategy {
4637
4689
  this.pageCount = 0;
4638
4690
  this.totalDiscovered = 1;
4639
4691
  this.effectiveTotal = 1;
4640
- const baseUrl = new URL$1(options.url);
4692
+ this.canonicalBaseUrl = new URL$1(options.url);
4693
+ let baseUrl = this.canonicalBaseUrl;
4641
4694
  const queue = [{ url: options.url, depth: 0 }];
4642
4695
  this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
4643
4696
  const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
@@ -4658,6 +4711,7 @@ class BaseScraperStrategy {
4658
4711
  queue.length
4659
4712
  );
4660
4713
  const batch = queue.splice(0, batchSize);
4714
+ baseUrl = this.canonicalBaseUrl ?? baseUrl;
4661
4715
  const newUrls = await this.processBatch(
4662
4716
  batch,
4663
4717
  baseUrl,
@@ -4690,22 +4744,7 @@ class WebScraperStrategy extends BaseScraperStrategy {
4690
4744
  return false;
4691
4745
  }
4692
4746
  }
4693
- /**
4694
- * Determines if a target URL should be followed based on the scope setting.
4695
- */
4696
- isInScope(baseUrl, targetUrl, scope) {
4697
- try {
4698
- if (scope === "domain") {
4699
- return hasSameDomain(baseUrl, targetUrl);
4700
- }
4701
- if (scope === "hostname") {
4702
- return hasSameHostname(baseUrl, targetUrl);
4703
- }
4704
- return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
4705
- } catch {
4706
- return false;
4707
- }
4708
- }
4747
+ // Removed custom isInScope logic; using shared scope utility for consistent behavior
4709
4748
  /**
4710
4749
  * Processes a single queue item by fetching its content and processing it through pipelines.
4711
4750
  * @param item - The queue item to process.
@@ -4746,12 +4785,12 @@ class WebScraperStrategy extends BaseScraperStrategy {
4746
4785
  );
4747
4786
  return { document: void 0, links: processed.links };
4748
4787
  }
4749
- const baseUrl = new URL(options.url);
4788
+ const baseUrl = item.depth === 0 ? new URL(rawContent.source) : this.canonicalBaseUrl ?? new URL(options.url);
4750
4789
  const filteredLinks = processed.links.filter((link) => {
4751
4790
  try {
4752
4791
  const targetUrl = new URL(link);
4753
4792
  const scope = options.scope || "subpages";
4754
- return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
4793
+ return isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
4755
4794
  } catch {
4756
4795
  return false;
4757
4796
  }
@@ -4767,7 +4806,8 @@ class WebScraperStrategy extends BaseScraperStrategy {
4767
4806
  ...processed.metadata
4768
4807
  }
4769
4808
  },
4770
- links: filteredLinks
4809
+ links: filteredLinks,
4810
+ finalUrl: rawContent.source
4771
4811
  };
4772
4812
  } catch (error) {
4773
4813
  logger.error(`❌ Failed processing page ${url}: ${error}`);