@arabold/docs-mcp-server 1.4.4 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-BD7OFN4H.js → chunk-2YTVPKP5.js} +107 -63
- package/dist/chunk-2YTVPKP5.js.map +1 -0
- package/dist/cli.js +24 -4
- package/dist/cli.js.map +1 -1
- package/dist/server.js +6 -4
- package/dist/server.js.map +1 -1
- package/package.json +3 -1
- package/dist/chunk-BD7OFN4H.js.map +0 -1
|
@@ -210,8 +210,8 @@ function v4(options, buf, offset) {
|
|
|
210
210
|
}
|
|
211
211
|
var v4_default = v4;
|
|
212
212
|
|
|
213
|
-
// src/scraper/fetcher/HttpFetcher.ts
|
|
214
|
-
import axios from "axios";
|
|
213
|
+
// src/utils/url.ts
|
|
214
|
+
import psl from "psl";
|
|
215
215
|
|
|
216
216
|
// src/utils/errors.ts
|
|
217
217
|
var ScraperError = class extends Error {
|
|
@@ -231,8 +231,79 @@ var InvalidUrlError = class extends ScraperError {
|
|
|
231
231
|
super(`Invalid URL: ${url}`, false, cause);
|
|
232
232
|
}
|
|
233
233
|
};
|
|
234
|
+
var RedirectError = class extends ScraperError {
|
|
235
|
+
constructor(originalUrl, redirectUrl, statusCode) {
|
|
236
|
+
super(
|
|
237
|
+
`Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`,
|
|
238
|
+
false
|
|
239
|
+
);
|
|
240
|
+
this.originalUrl = originalUrl;
|
|
241
|
+
this.redirectUrl = redirectUrl;
|
|
242
|
+
this.statusCode = statusCode;
|
|
243
|
+
}
|
|
244
|
+
};
|
|
245
|
+
|
|
246
|
+
// src/utils/url.ts
|
|
247
|
+
var defaultNormalizerOptions = {
|
|
248
|
+
ignoreCase: true,
|
|
249
|
+
removeHash: true,
|
|
250
|
+
removeTrailingSlash: true,
|
|
251
|
+
removeQuery: false,
|
|
252
|
+
removeIndex: true
|
|
253
|
+
};
|
|
254
|
+
function normalizeUrl(url, options = defaultNormalizerOptions) {
|
|
255
|
+
try {
|
|
256
|
+
const parsedUrl = new URL(url);
|
|
257
|
+
const finalOptions = { ...defaultNormalizerOptions, ...options };
|
|
258
|
+
const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
|
|
259
|
+
if (finalOptions.removeIndex) {
|
|
260
|
+
normalized.pathname = normalized.pathname.replace(
|
|
261
|
+
/\/index\.(html|htm|asp|php|jsp)$/i,
|
|
262
|
+
"/"
|
|
263
|
+
);
|
|
264
|
+
}
|
|
265
|
+
if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
|
|
266
|
+
normalized.pathname = normalized.pathname.replace(/\/+$/, "");
|
|
267
|
+
}
|
|
268
|
+
const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
|
|
269
|
+
const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
|
|
270
|
+
let result = normalized.origin + normalized.pathname;
|
|
271
|
+
if (preservedSearch) {
|
|
272
|
+
result += preservedSearch;
|
|
273
|
+
}
|
|
274
|
+
if (preservedHash) {
|
|
275
|
+
result += preservedHash;
|
|
276
|
+
}
|
|
277
|
+
if (finalOptions.ignoreCase) {
|
|
278
|
+
result = result.toLowerCase();
|
|
279
|
+
}
|
|
280
|
+
return result;
|
|
281
|
+
} catch {
|
|
282
|
+
return url;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
function validateUrl(url) {
|
|
286
|
+
try {
|
|
287
|
+
new URL(url);
|
|
288
|
+
} catch (error) {
|
|
289
|
+
throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
function hasSameHostname(urlA, urlB) {
|
|
293
|
+
return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase();
|
|
294
|
+
}
|
|
295
|
+
function hasSameDomain(urlA, urlB) {
|
|
296
|
+
const domainA = psl.get(urlA.hostname.toLowerCase());
|
|
297
|
+
const domainB = psl.get(urlB.hostname.toLowerCase());
|
|
298
|
+
return domainA !== null && domainA === domainB;
|
|
299
|
+
}
|
|
300
|
+
function isSubpath(baseUrl, targetUrl) {
|
|
301
|
+
const basePath = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : `${baseUrl.pathname}/`;
|
|
302
|
+
return targetUrl.pathname.startsWith(basePath);
|
|
303
|
+
}
|
|
234
304
|
|
|
235
305
|
// src/scraper/fetcher/HttpFetcher.ts
|
|
306
|
+
import axios from "axios";
|
|
236
307
|
var HttpFetcher = class {
|
|
237
308
|
MAX_RETRIES = 6;
|
|
238
309
|
BASE_DELAY = 1e3;
|
|
@@ -246,16 +317,20 @@ var HttpFetcher = class {
|
|
|
246
317
|
async fetch(source, options) {
|
|
247
318
|
const maxRetries = options?.maxRetries ?? this.MAX_RETRIES;
|
|
248
319
|
const baseDelay = options?.retryDelay ?? this.BASE_DELAY;
|
|
320
|
+
const followRedirects = options?.followRedirects ?? true;
|
|
249
321
|
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
250
322
|
try {
|
|
251
|
-
const response = await axios.get(source, {
|
|
323
|
+
const config = {
|
|
252
324
|
responseType: "arraybuffer",
|
|
253
325
|
// For handling both text and binary
|
|
254
326
|
headers: options?.headers,
|
|
255
327
|
timeout: options?.timeout,
|
|
256
|
-
signal: options?.signal
|
|
328
|
+
signal: options?.signal,
|
|
257
329
|
// Pass signal to axios
|
|
258
|
-
|
|
330
|
+
// Axios follows redirects by default, we need to explicitly disable it if needed
|
|
331
|
+
maxRedirects: followRedirects ? 5 : 0
|
|
332
|
+
};
|
|
333
|
+
const response = await axios.get(source, config);
|
|
259
334
|
return {
|
|
260
335
|
content: response.data,
|
|
261
336
|
mimeType: response.headers["content-type"] || "application/octet-stream",
|
|
@@ -266,6 +341,12 @@ var HttpFetcher = class {
|
|
|
266
341
|
const axiosError = error;
|
|
267
342
|
const status = axiosError.response?.status;
|
|
268
343
|
const code = axiosError.code;
|
|
344
|
+
if (!followRedirects && status && status >= 300 && status < 400) {
|
|
345
|
+
const location = axiosError.response?.headers?.location;
|
|
346
|
+
if (location) {
|
|
347
|
+
throw new RedirectError(source, location, status);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
269
350
|
if (attempt < maxRetries && (status === void 0 || status >= 500 && status < 600)) {
|
|
270
351
|
const delay = baseDelay * 2 ** attempt;
|
|
271
352
|
logger.warn(
|
|
@@ -355,53 +436,6 @@ var CancellationError = class extends PipelineError {
|
|
|
355
436
|
}
|
|
356
437
|
};
|
|
357
438
|
|
|
358
|
-
// src/utils/url.ts
|
|
359
|
-
var defaultNormalizerOptions = {
|
|
360
|
-
ignoreCase: true,
|
|
361
|
-
removeHash: true,
|
|
362
|
-
removeTrailingSlash: true,
|
|
363
|
-
removeQuery: false,
|
|
364
|
-
removeIndex: true
|
|
365
|
-
};
|
|
366
|
-
function normalizeUrl(url, options = defaultNormalizerOptions) {
|
|
367
|
-
try {
|
|
368
|
-
const parsedUrl = new URL(url);
|
|
369
|
-
const finalOptions = { ...defaultNormalizerOptions, ...options };
|
|
370
|
-
const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
|
|
371
|
-
if (finalOptions.removeIndex) {
|
|
372
|
-
normalized.pathname = normalized.pathname.replace(
|
|
373
|
-
/\/index\.(html|htm|asp|php|jsp)$/i,
|
|
374
|
-
"/"
|
|
375
|
-
);
|
|
376
|
-
}
|
|
377
|
-
if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
|
|
378
|
-
normalized.pathname = normalized.pathname.replace(/\/+$/, "");
|
|
379
|
-
}
|
|
380
|
-
const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
|
|
381
|
-
const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
|
|
382
|
-
let result = normalized.origin + normalized.pathname;
|
|
383
|
-
if (preservedSearch) {
|
|
384
|
-
result += preservedSearch;
|
|
385
|
-
}
|
|
386
|
-
if (preservedHash) {
|
|
387
|
-
result += preservedHash;
|
|
388
|
-
}
|
|
389
|
-
if (finalOptions.ignoreCase) {
|
|
390
|
-
result = result.toLowerCase();
|
|
391
|
-
}
|
|
392
|
-
return result;
|
|
393
|
-
} catch {
|
|
394
|
-
return url;
|
|
395
|
-
}
|
|
396
|
-
}
|
|
397
|
-
function validateUrl(url) {
|
|
398
|
-
try {
|
|
399
|
-
new URL(url);
|
|
400
|
-
} catch (error) {
|
|
401
|
-
throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
|
|
402
|
-
}
|
|
403
|
-
}
|
|
404
|
-
|
|
405
439
|
// src/scraper/processor/HtmlProcessor.ts
|
|
406
440
|
import createDOMPurify from "dompurify";
|
|
407
441
|
import { JSDOM } from "jsdom";
|
|
@@ -736,11 +770,18 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
736
770
|
return false;
|
|
737
771
|
}
|
|
738
772
|
}
|
|
739
|
-
|
|
773
|
+
/**
|
|
774
|
+
* Determines if a target URL should be followed based on the scope setting.
|
|
775
|
+
*/
|
|
776
|
+
isInScope(baseUrl, targetUrl, scope) {
|
|
740
777
|
try {
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
778
|
+
if (scope === "domain") {
|
|
779
|
+
return hasSameDomain(baseUrl, targetUrl);
|
|
780
|
+
}
|
|
781
|
+
if (scope === "hostname") {
|
|
782
|
+
return hasSameHostname(baseUrl, targetUrl);
|
|
783
|
+
}
|
|
784
|
+
return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
|
|
744
785
|
} catch {
|
|
745
786
|
return false;
|
|
746
787
|
}
|
|
@@ -748,17 +789,19 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
748
789
|
async processItem(item, options, _progressCallback, signal) {
|
|
749
790
|
const { url } = item;
|
|
750
791
|
try {
|
|
751
|
-
const
|
|
792
|
+
const fetchOptions = {
|
|
793
|
+
signal,
|
|
794
|
+
followRedirects: options.followRedirects
|
|
795
|
+
};
|
|
796
|
+
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
752
797
|
const processor = this.getProcessor(rawContent.mimeType);
|
|
753
798
|
const result = await processor.process(rawContent);
|
|
754
799
|
const baseUrl = new URL(options.url);
|
|
755
800
|
const links = result.links.filter((link) => {
|
|
756
801
|
try {
|
|
757
802
|
const targetUrl = new URL(link, baseUrl);
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
}
|
|
761
|
-
return (!options.subpagesOnly || this.isSubpage(baseUrl, targetUrl)) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
803
|
+
const scope = options.scope || "subpages";
|
|
804
|
+
return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
762
805
|
} catch {
|
|
763
806
|
return false;
|
|
764
807
|
}
|
|
@@ -1460,7 +1503,8 @@ var ScrapeTool = class {
|
|
|
1460
1503
|
url,
|
|
1461
1504
|
library,
|
|
1462
1505
|
version: internalVersion,
|
|
1463
|
-
|
|
1506
|
+
scope: scraperOptions?.scope ?? "subpages",
|
|
1507
|
+
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
1464
1508
|
maxPages: scraperOptions?.maxPages ?? 100,
|
|
1465
1509
|
maxDepth: scraperOptions?.maxDepth ?? 3,
|
|
1466
1510
|
// maxConcurrency is handled by the manager itself now
|
|
@@ -11524,4 +11568,4 @@ export {
|
|
|
11524
11568
|
RemoveTool,
|
|
11525
11569
|
DocumentManagementService
|
|
11526
11570
|
};
|
|
11527
|
-
//# sourceMappingURL=chunk-BD7OFN4H.js.map
|
|
11571
|
+
//# sourceMappingURL=chunk-2YTVPKP5.js.map
|