@arabold/docs-mcp-server 1.4.5 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -210,8 +210,8 @@ function v4(options, buf, offset) {
210
210
  }
211
211
  var v4_default = v4;
212
212
 
213
- // src/scraper/fetcher/HttpFetcher.ts
214
- import axios from "axios";
213
+ // src/utils/url.ts
214
+ import psl from "psl";
215
215
 
216
216
  // src/utils/errors.ts
217
217
  var ScraperError = class extends Error {
@@ -231,8 +231,79 @@ var InvalidUrlError = class extends ScraperError {
231
231
  super(`Invalid URL: ${url}`, false, cause);
232
232
  }
233
233
  };
234
/**
 * Error raised when a request answers with a redirect.
 *
 * Carries the URL that was requested, the URL the server pointed to, and the
 * HTTP status code, so callers can decide how to handle the redirect.
 * The second argument passed to the base class is always `false`.
 */
var RedirectError = class extends ScraperError {
  /**
   * @param originalUrl URL that was requested.
   * @param redirectUrl URL the response redirected to.
   * @param statusCode  HTTP status code of the redirect response.
   */
  constructor(originalUrl, redirectUrl, statusCode) {
    const message = `Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`;
    super(message, false);
    // Expose the redirect details as own properties, in the same order as before.
    Object.assign(this, { originalUrl, redirectUrl, statusCode });
  }
};
245
+
246
+ // src/utils/url.ts
247
/** Defaults applied by normalizeUrl; caller-supplied options override them key by key. */
var defaultNormalizerOptions = {
  ignoreCase: true,
  removeHash: true,
  removeTrailingSlash: true,
  removeQuery: false,
  removeIndex: true
};

/**
 * Produces a canonical string form of a URL.
 *
 * Depending on the merged options this drops a trailing `index.<ext>` file
 * name, strips trailing slashes (except for the root path), removes the query
 * and/or the fragment, and lower-cases the complete result (path and query
 * included, not just the host).
 *
 * @param url     URL to normalize.
 * @param options Overrides merged on top of defaultNormalizerOptions.
 * @returns The normalized URL, or `url` unchanged when it cannot be parsed.
 */
function normalizeUrl(url, options = defaultNormalizerOptions) {
  try {
    const source = new URL(url);
    const settings = Object.assign({}, defaultNormalizerOptions, options);

    // Work on a copy holding only origin + path; query and hash are re-attached below.
    const target = new URL(`${source.origin}${source.pathname}`);

    if (settings.removeIndex) {
      target.pathname = target.pathname.replace(/\/index\.(html|htm|asp|php|jsp)$/i, "/");
    }

    // Never strip the single "/" of a root path.
    const hasMoreThanRoot = target.pathname.length > 1;
    if (settings.removeTrailingSlash && hasMoreThanRoot) {
      target.pathname = target.pathname.replace(/\/+$/, "");
    }

    const pieces = [
      target.origin,
      target.pathname,
      settings.removeQuery ? "" : source.search,
      settings.removeHash ? "" : source.hash
    ];
    const assembled = pieces.join("");
    return settings.ignoreCase ? assembled.toLowerCase() : assembled;
  } catch {
    // Unparseable input is handed back untouched.
    return url;
  }
}
285
/**
 * Checks that `url` can be parsed by the URL constructor.
 *
 * @param url Candidate URL string.
 * @throws InvalidUrlError when parsing fails; the parser's error is passed
 *         along as the cause when it is an Error instance.
 */
function validateUrl(url) {
  try {
    new URL(url);
  } catch (parseFailure) {
    const cause = parseFailure instanceof Error ? parseFailure : void 0;
    throw new InvalidUrlError(url, cause);
  }
}
292
/**
 * Tells whether two URLs point at the same host, ignoring letter case.
 *
 * @param urlA First URL (only `hostname` is read).
 * @param urlB Second URL (only `hostname` is read).
 * @returns true when both hostnames are equal after lower-casing.
 */
function hasSameHostname(urlA, urlB) {
  const [hostA, hostB] = [urlA, urlB].map((u) => u.hostname.toLowerCase());
  return hostA === hostB;
}
295
/**
 * Tells whether two URLs belong to the same registrable domain
 * (e.g. `docs.example.com` and `api.example.com`).
 *
 * Hosts without a registrable domain are compared by exact hostname instead:
 * IP literals (a public-suffix lookup is meaningless for them) and hosts for
 * which `psl.get` returns null, such as single-label names like `localhost`.
 * Previously such hosts were never considered to share a domain, not even
 * with themselves.
 *
 * @param urlA First URL (only `hostname` is read).
 * @param urlB Second URL (only `hostname` is read).
 * @returns true when both URLs share a registrable domain, or, for hosts
 *          without one, when their hostnames are identical.
 */
function hasSameDomain(urlA, urlB) {
  const hostA = urlA.hostname.toLowerCase();
  const hostB = urlB.hostname.toLowerCase();

  // Dotted-decimal IPv4, or bracketed IPv6 as produced by URL#hostname.
  const isIpLiteral = (host) => /^\d{1,3}(\.\d{1,3}){3}$/.test(host) || host.startsWith("[");
  if (isIpLiteral(hostA) || isIpLiteral(hostB)) {
    return hostA === hostB;
  }

  const domainA = psl.get(hostA);
  const domainB = psl.get(hostB);
  if (domainA === null || domainB === null) {
    // No registrable domain could be derived; fall back to an exact host match.
    return hostA === hostB;
  }
  return domainA === domainB;
}
300
/**
 * Tells whether `targetUrl`'s path is the base path itself or lies beneath it.
 *
 * Both paths are compared with a trailing slash so that:
 *  - `/docs` does not match the sibling `/docs-old`, and
 *  - the base page itself matches (`/docs` vs `/docs` or `/docs/`), which the
 *    previous implementation rejected when the base had no trailing slash.
 *
 * Only `pathname` is inspected; callers that care about the host must check
 * it separately.
 *
 * @param baseUrl   URL whose path defines the scope.
 * @param targetUrl URL to test.
 * @returns true when the target path equals or is nested under the base path.
 */
function isSubpath(baseUrl, targetUrl) {
  const withTrailingSlash = (path) => (path.endsWith("/") ? path : `${path}/`);
  const basePath = withTrailingSlash(baseUrl.pathname);
  const targetPath = withTrailingSlash(targetUrl.pathname);
  return targetPath.startsWith(basePath);
}
234
304
 
235
305
  // src/scraper/fetcher/HttpFetcher.ts
306
+ import axios from "axios";
236
307
  var HttpFetcher = class {
237
308
  MAX_RETRIES = 6;
238
309
  BASE_DELAY = 1e3;
@@ -246,16 +317,20 @@ var HttpFetcher = class {
246
317
  async fetch(source, options) {
247
318
  const maxRetries = options?.maxRetries ?? this.MAX_RETRIES;
248
319
  const baseDelay = options?.retryDelay ?? this.BASE_DELAY;
320
+ const followRedirects = options?.followRedirects ?? true;
249
321
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
250
322
  try {
251
- const response = await axios.get(source, {
323
+ const config = {
252
324
  responseType: "arraybuffer",
253
325
  // For handling both text and binary
254
326
  headers: options?.headers,
255
327
  timeout: options?.timeout,
256
- signal: options?.signal
328
+ signal: options?.signal,
257
329
  // Pass signal to axios
258
- });
330
+ // Axios follows redirects by default, we need to explicitly disable it if needed
331
+ maxRedirects: followRedirects ? 5 : 0
332
+ };
333
+ const response = await axios.get(source, config);
259
334
  return {
260
335
  content: response.data,
261
336
  mimeType: response.headers["content-type"] || "application/octet-stream",
@@ -266,6 +341,12 @@ var HttpFetcher = class {
266
341
  const axiosError = error;
267
342
  const status = axiosError.response?.status;
268
343
  const code = axiosError.code;
344
+ if (!followRedirects && status && status >= 300 && status < 400) {
345
+ const location = axiosError.response?.headers?.location;
346
+ if (location) {
347
+ throw new RedirectError(source, location, status);
348
+ }
349
+ }
269
350
  if (attempt < maxRetries && (status === void 0 || status >= 500 && status < 600)) {
270
351
  const delay = baseDelay * 2 ** attempt;
271
352
  logger.warn(
@@ -355,53 +436,6 @@ var CancellationError = class extends PipelineError {
355
436
  }
356
437
  };
357
438
 
358
- // src/utils/url.ts
359
- var defaultNormalizerOptions = {
360
- ignoreCase: true,
361
- removeHash: true,
362
- removeTrailingSlash: true,
363
- removeQuery: false,
364
- removeIndex: true
365
- };
366
- function normalizeUrl(url, options = defaultNormalizerOptions) {
367
- try {
368
- const parsedUrl = new URL(url);
369
- const finalOptions = { ...defaultNormalizerOptions, ...options };
370
- const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
371
- if (finalOptions.removeIndex) {
372
- normalized.pathname = normalized.pathname.replace(
373
- /\/index\.(html|htm|asp|php|jsp)$/i,
374
- "/"
375
- );
376
- }
377
- if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
378
- normalized.pathname = normalized.pathname.replace(/\/+$/, "");
379
- }
380
- const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
381
- const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
382
- let result = normalized.origin + normalized.pathname;
383
- if (preservedSearch) {
384
- result += preservedSearch;
385
- }
386
- if (preservedHash) {
387
- result += preservedHash;
388
- }
389
- if (finalOptions.ignoreCase) {
390
- result = result.toLowerCase();
391
- }
392
- return result;
393
- } catch {
394
- return url;
395
- }
396
- }
397
- function validateUrl(url) {
398
- try {
399
- new URL(url);
400
- } catch (error) {
401
- throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
402
- }
403
- }
404
-
405
439
  // src/scraper/processor/HtmlProcessor.ts
406
440
  import createDOMPurify from "dompurify";
407
441
  import { JSDOM } from "jsdom";
@@ -736,11 +770,18 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
736
770
  return false;
737
771
  }
738
772
  }
739
- isSubpage(baseUrl, targetUrl) {
773
+ /**
774
+ * Determines if a target URL should be followed based on the scope setting.
775
+ */
776
+ isInScope(baseUrl, targetUrl, scope) {
740
777
  try {
741
- const basePath = baseUrl.origin + baseUrl.pathname;
742
- const targetPath = targetUrl.origin + targetUrl.pathname;
743
- return targetPath.startsWith(basePath);
778
+ if (scope === "domain") {
779
+ return hasSameDomain(baseUrl, targetUrl);
780
+ }
781
+ if (scope === "hostname") {
782
+ return hasSameHostname(baseUrl, targetUrl);
783
+ }
784
+ return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
744
785
  } catch {
745
786
  return false;
746
787
  }
@@ -748,17 +789,19 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
748
789
  async processItem(item, options, _progressCallback, signal) {
749
790
  const { url } = item;
750
791
  try {
751
- const rawContent = await this.httpFetcher.fetch(url, { signal });
792
+ const fetchOptions = {
793
+ signal,
794
+ followRedirects: options.followRedirects
795
+ };
796
+ const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
752
797
  const processor = this.getProcessor(rawContent.mimeType);
753
798
  const result = await processor.process(rawContent);
754
799
  const baseUrl = new URL(options.url);
755
800
  const links = result.links.filter((link) => {
756
801
  try {
757
802
  const targetUrl = new URL(link, baseUrl);
758
- if (targetUrl.origin !== baseUrl.origin) {
759
- return false;
760
- }
761
- return (!options.subpagesOnly || this.isSubpage(baseUrl, targetUrl)) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
803
+ const scope = options.scope || "subpages";
804
+ return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
762
805
  } catch {
763
806
  return false;
764
807
  }
@@ -1460,7 +1503,8 @@ var ScrapeTool = class {
1460
1503
  url,
1461
1504
  library,
1462
1505
  version: internalVersion,
1463
- subpagesOnly: scraperOptions?.subpagesOnly ?? true,
1506
+ scope: scraperOptions?.scope ?? "subpages",
1507
+ followRedirects: scraperOptions?.followRedirects ?? true,
1464
1508
  maxPages: scraperOptions?.maxPages ?? 100,
1465
1509
  maxDepth: scraperOptions?.maxDepth ?? 3,
1466
1510
  // maxConcurrency is handled by the manager itself now
@@ -11524,4 +11568,4 @@ export {
11524
11568
  RemoveTool,
11525
11569
  DocumentManagementService
11526
11570
  };
11527
- //# sourceMappingURL=chunk-BD7OFN4H.js.map
11571
+ //# sourceMappingURL=chunk-2YTVPKP5.js.map