@arabold/docs-mcp-server 1.8.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -2
- package/dist/{chunk-ADZQJG2M.js → chunk-VTO2ED43.js} +764 -254
- package/dist/chunk-VTO2ED43.js.map +1 -0
- package/dist/cli.js +60 -16
- package/dist/cli.js.map +1 -1
- package/dist/server.js +7 -9
- package/dist/server.js.map +1 -1
- package/package.json +5 -6
- package/dist/chunk-ADZQJG2M.js.map +0 -1
|
@@ -100,6 +100,11 @@ var require_extend = __commonJS({
|
|
|
100
100
|
}
|
|
101
101
|
});
|
|
102
102
|
|
|
103
|
+
// src/config.ts
|
|
104
|
+
// Default limits declared by src/config.ts.
// `var` is kept because this is bundler output whose declarations are hoisted.
var DEFAULT_MAX_PAGES = 1000;
var DEFAULT_MAX_DEPTH = 3;
var DEFAULT_MAX_CONCURRENCY = 3;
|
|
107
|
+
|
|
103
108
|
// src/utils/logger.ts
|
|
104
109
|
var currentLogLevel = 2 /* INFO */;
|
|
105
110
|
function setLogLevel(level) {
|
|
@@ -287,216 +292,13 @@ var FileFetcher = class {
|
|
|
287
292
|
}
|
|
288
293
|
};
|
|
289
294
|
|
|
290
|
-
// src/scraper/
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
selectorsToRemove = [
|
|
298
|
-
"nav",
|
|
299
|
-
"footer",
|
|
300
|
-
"script",
|
|
301
|
-
"style",
|
|
302
|
-
"noscript",
|
|
303
|
-
"svg",
|
|
304
|
-
"link",
|
|
305
|
-
"meta",
|
|
306
|
-
"iframe",
|
|
307
|
-
"header",
|
|
308
|
-
"button",
|
|
309
|
-
"input",
|
|
310
|
-
"textarea",
|
|
311
|
-
"select",
|
|
312
|
-
// "form", // Known issue: Some pages use alerts for important content
|
|
313
|
-
".ads",
|
|
314
|
-
".advertisement",
|
|
315
|
-
".banner",
|
|
316
|
-
".cookie-banner",
|
|
317
|
-
".cookie-consent",
|
|
318
|
-
".hidden",
|
|
319
|
-
".hide",
|
|
320
|
-
".modal",
|
|
321
|
-
".nav-bar",
|
|
322
|
-
".overlay",
|
|
323
|
-
".popup",
|
|
324
|
-
".promo",
|
|
325
|
-
".mw-editsection",
|
|
326
|
-
".side-bar",
|
|
327
|
-
".social-share",
|
|
328
|
-
".sticky",
|
|
329
|
-
"#ads",
|
|
330
|
-
"#banner",
|
|
331
|
-
"#cookieBanner",
|
|
332
|
-
"#modal",
|
|
333
|
-
"#nav",
|
|
334
|
-
"#overlay",
|
|
335
|
-
"#popup",
|
|
336
|
-
"#sidebar",
|
|
337
|
-
"#socialMediaBox",
|
|
338
|
-
"#stickyHeader",
|
|
339
|
-
"#ad-container",
|
|
340
|
-
".ad-container",
|
|
341
|
-
".login-form",
|
|
342
|
-
".signup-form",
|
|
343
|
-
".tooltip",
|
|
344
|
-
".dropdown-menu",
|
|
345
|
-
// ".alert", // Known issue: Some pages use alerts for important content
|
|
346
|
-
".breadcrumb",
|
|
347
|
-
".pagination",
|
|
348
|
-
// '[role="alert"]', // Known issue: Some pages use alerts for important content
|
|
349
|
-
'[role="banner"]',
|
|
350
|
-
'[role="dialog"]',
|
|
351
|
-
'[role="alertdialog"]',
|
|
352
|
-
'[role="region"][aria-label*="skip" i]',
|
|
353
|
-
'[aria-modal="true"]',
|
|
354
|
-
".noprint"
|
|
355
|
-
];
|
|
356
|
-
constructor(options) {
|
|
357
|
-
this.turndownService = new TurndownService({
|
|
358
|
-
headingStyle: "atx",
|
|
359
|
-
hr: "---",
|
|
360
|
-
bulletListMarker: "-",
|
|
361
|
-
codeBlockStyle: "fenced",
|
|
362
|
-
emDelimiter: "_",
|
|
363
|
-
strongDelimiter: "**",
|
|
364
|
-
linkStyle: "inlined"
|
|
365
|
-
});
|
|
366
|
-
this.turndownService.addRule("pre", {
|
|
367
|
-
filter: ["pre"],
|
|
368
|
-
replacement: (content3, node2) => {
|
|
369
|
-
const element = node2;
|
|
370
|
-
let language = element.getAttribute("data-language") || "";
|
|
371
|
-
if (!language) {
|
|
372
|
-
const highlightElement = element.closest(
|
|
373
|
-
'[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
|
|
374
|
-
);
|
|
375
|
-
if (highlightElement) {
|
|
376
|
-
const className = highlightElement.className;
|
|
377
|
-
const match = className.match(
|
|
378
|
-
/(?:highlight-source-|highlight-|language-)(\w+)/
|
|
379
|
-
);
|
|
380
|
-
if (match) {
|
|
381
|
-
language = match[1];
|
|
382
|
-
}
|
|
383
|
-
}
|
|
384
|
-
}
|
|
385
|
-
const text3 = (() => {
|
|
386
|
-
const clone = element.cloneNode(true);
|
|
387
|
-
const brElements = Array.from(clone.querySelectorAll("br"));
|
|
388
|
-
for (const br of brElements) {
|
|
389
|
-
br.replaceWith("\n");
|
|
390
|
-
}
|
|
391
|
-
return clone.textContent;
|
|
392
|
-
})();
|
|
393
|
-
return `
|
|
394
|
-
\`\`\`${language}
|
|
395
|
-
${text3}
|
|
396
|
-
\`\`\`
|
|
397
|
-
`;
|
|
398
|
-
}
|
|
399
|
-
});
|
|
400
|
-
this.turndownService.addRule("table", {
|
|
401
|
-
filter: ["table"],
|
|
402
|
-
replacement: (content3) => {
|
|
403
|
-
const cleanedContent = content3.replace(/\n+/g, "\n");
|
|
404
|
-
return `
|
|
405
|
-
|
|
406
|
-
${cleanedContent}
|
|
407
|
-
|
|
408
|
-
`;
|
|
409
|
-
}
|
|
410
|
-
});
|
|
411
|
-
this.options = options || {};
|
|
412
|
-
}
|
|
413
|
-
canProcess(content3) {
|
|
414
|
-
return content3.mimeType.startsWith("text/html");
|
|
415
|
-
}
|
|
416
|
-
async process(content3) {
|
|
417
|
-
if (!this.canProcess(content3)) {
|
|
418
|
-
throw new ScraperError(
|
|
419
|
-
`HtmlProcessor cannot process content of type ${content3.mimeType}`,
|
|
420
|
-
false
|
|
421
|
-
);
|
|
422
|
-
}
|
|
423
|
-
const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
|
|
424
|
-
const titleMatch = htmlContent.match(/<title>([^<]+)<\/title>/i);
|
|
425
|
-
const title = titleMatch?.[1] || "Untitled";
|
|
426
|
-
const window = new JSDOM(content3.content, { url: content3.source }).window;
|
|
427
|
-
const purify = createDOMPurify(window);
|
|
428
|
-
const purifiedContent = purify.sanitize(htmlContent, {
|
|
429
|
-
WHOLE_DOCUMENT: true,
|
|
430
|
-
RETURN_DOM: true
|
|
431
|
-
});
|
|
432
|
-
const linkElements = purifiedContent.querySelectorAll("a[href]");
|
|
433
|
-
let links = [];
|
|
434
|
-
if (this.options.extractLinks !== false) {
|
|
435
|
-
links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
|
|
436
|
-
try {
|
|
437
|
-
return new URL(href, content3.source).href;
|
|
438
|
-
} catch {
|
|
439
|
-
return null;
|
|
440
|
-
}
|
|
441
|
-
}).filter((url) => url !== null);
|
|
442
|
-
}
|
|
443
|
-
const selectorsToRemove = [
|
|
444
|
-
...this.options.excludeSelectors || [],
|
|
445
|
-
...this.selectorsToRemove
|
|
446
|
-
];
|
|
447
|
-
for (const selector of selectorsToRemove) {
|
|
448
|
-
const elements = purifiedContent.querySelectorAll(selector);
|
|
449
|
-
for (const el of elements) {
|
|
450
|
-
el.remove();
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
const cleanedContent = purifiedContent.innerHTML;
|
|
454
|
-
const markdown = this.turndownService.turndown(cleanedContent || "").trim();
|
|
455
|
-
if (!markdown) {
|
|
456
|
-
throw new ScraperError("No valid content found", false);
|
|
457
|
-
}
|
|
458
|
-
return {
|
|
459
|
-
content: markdown,
|
|
460
|
-
title,
|
|
461
|
-
source: content3.source,
|
|
462
|
-
links,
|
|
463
|
-
metadata: {}
|
|
464
|
-
};
|
|
465
|
-
}
|
|
466
|
-
};
|
|
467
|
-
|
|
468
|
-
// src/scraper/processor/MarkdownProcessor.ts
|
|
469
|
-
/**
 * Processor that passes Markdown (and plain text) through unchanged while
 * deriving a document title from the first level-1 heading.
 */
var MarkdownProcessor = class {
  /**
   * Reports whether this processor accepts the given raw content.
   * @param content3 Raw content exposing `mimeType` and `source`.
   * @returns `true` for `text/markdown`, `text/plain`, or a source ending in ".md".
   */
  canProcess(content3) {
    return (
      content3.mimeType === "text/markdown" ||
      content3.mimeType === "text/plain" || // Treat plain text as markdown
      content3.source.endsWith(".md")
    );
  }

  /**
   * Converts raw content into a scraped-document record.
   * @param content3 Raw content; `content` may be a string or a Buffer, decoded
   *   with `content3.encoding` (falling back to "utf-8").
   * @returns An object with `content`, `title`, `source`, `links` (always empty)
   *   and `metadata` (always empty).
   * @throws ScraperError when the content type is unsupported or the text is blank.
   */
  async process(content3) {
    if (!this.canProcess(content3)) {
      throw new ScraperError(
        `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
        false
      );
    }
    const markdownContent =
      typeof content3.content === "string"
        ? content3.content
        : content3.content.toString(content3.encoding || "utf-8");
    if (!markdownContent.trim()) {
      throw new ScraperError("Empty Markdown content", false);
    }
    const title = this.extractTitle(markdownContent) || "Untitled";
    return {
      content: markdownContent,
      title,
      source: content3.source,
      links: [],
      // TODO: Extract links from Markdown
      metadata: {}
    };
  }

  /**
   * Finds the text of the first non-empty level-1 ATX heading.
   * @param markdown Markdown source text.
   * @returns The trimmed heading text, or `null` when there is none.
   */
  extractTitle(markdown) {
    // Only spaces/tabs may separate "#" from the title. The previous `\s+`
    // could consume a newline, so an empty "#" line captured the NEXT line
    // as the title. `\S` additionally skips headings with no text.
    const match = markdown.match(/^#[ \t]+(\S.*)$/m);
    return match ? match[1].trim() : null;
  }
};
|
|
295
|
+
// src/scraper/types.ts
|
|
296
|
+
// Enum-like lookup of the supported scrape modes (name -> string value).
var ScrapeMode = /* @__PURE__ */ ((modes) =>
  Object.assign(modes, {
    Fetch: "fetch",
    Playwright: "playwright",
    Auto: "auto"
  }))(ScrapeMode || {});
|
|
500
302
|
|
|
501
303
|
// node_modules/uuid/dist/esm-node/stringify.js
|
|
502
304
|
var byteToHex = [];
|
|
@@ -605,6 +407,541 @@ function isSubpath(baseUrl, targetUrl) {
|
|
|
605
407
|
return targetUrl.pathname.startsWith(basePath);
|
|
606
408
|
}
|
|
607
409
|
|
|
410
|
+
// src/scraper/middleware/ContentProcessorPipeline.ts
|
|
411
|
+
/**
 * Runs an ordered list of middleware over a single shared context object.
 * Each middleware receives the context plus a `next` callback that starts
 * the following middleware.
 */
var ContentProcessingPipeline = class {
  middleware;

  /**
   * @param middleware Middleware instances, executed in array order.
   */
  constructor(middleware) {
    this.middleware = middleware;
  }

  /**
   * Executes the chain against `initialContext`.
   * Anything a middleware throws is converted to an `Error` if needed,
   * appended to `initialContext.errors`, and logged as a warning; it is not
   * rethrown.
   * @param initialContext Context object shared (and mutated) by all middleware.
   * @returns The same context object once the chain has settled.
   */
  async run(initialContext) {
    // Highest position that has been started; guards against a middleware
    // invoking its `next` callback more than once.
    let lastStarted = -1;

    const runFrom = async (position) => {
      if (position <= lastStarted) {
        throw new Error("next() called multiple times");
      }
      lastStarted = position;

      const current = this.middleware[position];
      if (current) {
        try {
          await current.process(initialContext, () => runFrom(position + 1));
        } catch (error) {
          const recorded = error instanceof Error ? error : new Error(String(error));
          initialContext.errors.push(recorded);
          logger.warn(`Error in middleware pipeline: ${error}`);
        }
      }
    };

    await runFrom(0);
    return initialContext;
  }
};
|
|
450
|
+
|
|
451
|
+
// src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
|
|
452
|
+
import * as cheerio from "cheerio";
|
|
453
|
+
/**
 * Middleware that parses HTML content with Cheerio and stores the resulting
 * handle on `context.dom` for later middleware.
 */
var HtmlCheerioParserMiddleware = class {
  /**
   * Non-HTML content is passed straight to `next`. For HTML, the content is
   * decoded to a string (Buffers are read as UTF-8), parsed, and `next` is
   * invoked. If anything inside that step throws, the error is logged,
   * recorded on `context.errors`, and the chain is not continued.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const isHtml = context.contentType.startsWith("text/html");
    if (isHtml) {
      let htmlString;
      if (typeof context.content === "string") {
        htmlString = context.content;
      } else {
        htmlString = Buffer.from(context.content).toString("utf-8");
      }

      try {
        logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
        context.dom = cheerio.load(htmlString);
        await next();
      } catch (error) {
        logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
        const failure =
          error instanceof Error
            ? error
            : new Error(`Cheerio HTML parsing failed: ${String(error)}`);
        context.errors.push(failure);
      }
      return;
    }

    await next();
  }
};
|
|
474
|
+
|
|
475
|
+
// src/utils/dom.ts
|
|
476
|
+
import { JSDOM, VirtualConsole } from "jsdom";
|
|
477
|
+
/**
 * Builds a JSDOM instance whose default virtual console has no-op listeners
 * registered for the "error", "warn", "info", "debug" and "log" events.
 * @param html Markup handed to the JSDOM constructor.
 * @param options Optional JSDOM options; these are spread last, so a caller
 *   supplied `virtualConsole` replaces the default one.
 * @returns The constructed JSDOM instance.
 */
function createJSDOM(html, options) {
  const silentConsole = new VirtualConsole();
  for (const channel of ["error", "warn", "info", "debug", "log"]) {
    silentConsole.on(channel, () => {
    });
  }
  return new JSDOM(html, { virtualConsole: silentConsole, ...options });
}
|
|
495
|
+
|
|
496
|
+
// src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
|
|
497
|
+
/**
 * Middleware that collects the absolute URLs of all `a[href]` elements found
 * through `context.dom` and stores them, de-duplicated, on `context.links`.
 */
var HtmlLinkExtractorMiddleware = class {
  /**
   * Resolves every non-blank `href` against `context.source`, keeps only
   * http:, https: and file: URLs, and writes the unique results to
   * `context.links` in first-seen order. Failures are logged and recorded on
   * `context.errors`; `next` is always called.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;

    if ($) {
      try {
        const anchors = $("a[href]");
        logger.debug(`Found ${anchors.length} potential links in ${context.source}`);

        // A Set keeps first-seen order while dropping repeats.
        const uniqueLinks = new Set();
        anchors.each((position, anchor) => {
          const href = $(anchor).attr("href");
          if (!href || href.trim() === "") {
            return;
          }
          let resolved;
          try {
            resolved = new URL(href, context.source);
          } catch (e) {
            logger.debug(`Ignoring invalid URL syntax: ${href}`);
            return;
          }
          if (["http:", "https:", "file:"].includes(resolved.protocol)) {
            uniqueLinks.add(resolved.href);
          } else {
            logger.debug(`Ignoring link with invalid protocol: ${href}`);
          }
        });

        context.links = [...uniqueLinks];
        logger.debug(
          `Extracted ${context.links.length} unique, valid links from ${context.source}`
        );
      } catch (error) {
        logger.error(`Error extracting links from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract links from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }

    await next();
  }
};
|
|
548
|
+
|
|
549
|
+
// src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
|
|
550
|
+
/**
 * Middleware that derives a page title from the parsed HTML and stores it on
 * `context.metadata.title`.
 */
var HtmlMetadataExtractorMiddleware = class {
  /**
   * Uses the first `<title>` text, then the first `<h1>` text, then the
   * literal "Untitled"; whitespace runs are collapsed to single spaces.
   * Failures are logged and recorded on `context.errors`; `next` is always
   * called.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;

    if ($) {
      try {
        const firstText = (selector) => $(selector).first().text().trim();
        // `||` short-circuits, so <h1> is only consulted when <title> is empty.
        const rawTitle = firstText("title") || firstText("h1") || "Untitled";
        const title = rawTitle.replace(/\s+/g, " ").trim();
        context.metadata.title = title;
        logger.debug(`Extracted title: "${title}" from ${context.source}`);
      } catch (error) {
        logger.error(`Error extracting metadata from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract metadata from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }

    await next();
  }
};
|
|
587
|
+
|
|
588
|
+
// src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
|
|
589
|
+
import { chromium } from "playwright";
|
|
590
|
+
/**
 * Middleware that renders HTML content in a Playwright-controlled Chromium
 * page and replaces `context.content` with the rendered markup.
 */
var HtmlPlaywrightMiddleware = class {
  browser = null;
  // Launch currently in flight, shared by concurrent callers so that only a
  // single browser is started even when several pages are processed at once.
  pendingLaunch = null;

  /**
   * Returns a connected browser, launching one when none is available.
   * Concurrent callers share the same launch instead of each starting (and
   * leaking) their own browser process.
   * @returns The connected Playwright browser instance.
   */
  async ensureBrowser() {
    if (this.browser?.isConnected()) {
      return this.browser;
    }
    if (!this.pendingLaunch) {
      this.pendingLaunch = this.launchBrowser().finally(() => {
        // Cleared on success and on failure, so a failed launch can be retried.
        this.pendingLaunch = null;
      });
    }
    return this.pendingLaunch;
  }

  /**
   * Launches Chromium with the whitespace-separated arguments found in the
   * PLAYWRIGHT_LAUNCH_ARGS environment variable and stores it on `this.browser`.
   * @returns The newly launched browser.
   */
  async launchBrowser() {
    // Split on any whitespace run and drop empties: an empty variable or
    // repeated spaces must not turn into "" arguments for the browser.
    const launchArgs = (process.env.PLAYWRIGHT_LAUNCH_ARGS ?? "").split(/\s+/).filter(Boolean);
    logger.debug(
      `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
    );
    const browser = await chromium.launch({ channel: "chromium", args: launchArgs });
    browser.on("disconnected", () => {
      logger.debug("Playwright browser instance disconnected.");
      // Only forget the browser this handler belongs to; a replacement may
      // already have been launched by the time an old instance disconnects.
      if (this.browser === browser) {
        this.browser = null;
      }
    });
    this.browser = browser;
    return browser;
  }

  /**
   * Closes the Playwright browser instance if it exists.
   * Should be called during application shutdown.
   */
  async closeBrowser() {
    if (this.browser?.isConnected()) {
      logger.debug("Closing Playwright browser instance...");
      await this.browser.close();
      this.browser = null;
    }
  }

  /**
   * Renders HTML content with Playwright when the scrape mode is
   * "playwright" or "auto" (the default); otherwise passes through.
   * The already-fetched content is served to the page for the source URL,
   * image/stylesheet/font/media requests are aborted, and all other requests
   * continue. Rendering failures are logged and recorded on
   * `context.errors`; `next` is always called.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */;
    const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */;
    if (!shouldRunPlaywright) {
      logger.debug(
        `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
      );
      await next();
      return;
    }
    logger.debug(
      `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
    );
    let page = null;
    let renderedHtml = null;
    try {
      const browser = await this.ensureBrowser();
      page = await browser.newPage();
      logger.debug(`Playwright: Processing ${context.source}`);
      await page.route("**/*", (route) => {
        // Serve the content we already fetched instead of hitting the network again.
        if (route.request().url() === context.source) {
          return route.fulfill({
            status: 200,
            contentType: context.contentType,
            body: context.content
          });
        }
        const resourceType = route.request().resourceType();
        if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
          return route.abort();
        }
        return route.continue();
      });
      await page.goto(context.source, {
        waitUntil: "load"
      });
      renderedHtml = await page.content();
      logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
    } catch (error) {
      logger.error(`Playwright failed to render ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
      );
    } finally {
      if (page) {
        await this.disposePage(page, context.source);
      }
    }
    if (renderedHtml !== null) {
      context.content = renderedHtml;
      logger.debug(
        `Playwright middleware updated content for ${context.source}. Proceeding.`
      );
    } else {
      logger.warn(
        `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
      );
    }
    await next();
  }

  /**
   * Releases a page without letting cleanup failures escape: the close is
   * attempted even when route removal fails, and neither failure prevents
   * the rest of the pipeline from running.
   * @param page The Playwright page to dispose of.
   * @param source Source URL, used for log messages only.
   */
  async disposePage(page, source) {
    try {
      await page.unroute("**/*");
    } catch (error) {
      logger.warn(`Playwright: failed to remove routes for ${source}: ${error}`);
    }
    try {
      await page.close();
    } catch (error) {
      logger.warn(`Playwright: failed to close page for ${source}: ${error}`);
    }
  }
};
|
|
687
|
+
|
|
688
|
+
// src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
|
|
689
|
+
/**
 * Middleware that removes unwanted elements (navigation, scripts, ads,
 * dialogs, ...) from the parsed HTML held in `context.dom`.
 */
var HtmlSanitizerMiddleware = class {
  // Default selectors to remove
  defaultSelectorsToRemove = [
    "nav",
    "footer",
    "script",
    "style",
    "noscript",
    "svg",
    "link",
    "meta",
    "iframe",
    "header",
    "button",
    "input",
    "textarea",
    "select",
    // "form", // Keep commented
    ".ads",
    ".advertisement",
    ".banner",
    ".cookie-banner",
    ".cookie-consent",
    ".hidden",
    ".hide",
    ".modal",
    ".nav-bar",
    ".overlay",
    ".popup",
    ".promo",
    ".mw-editsection",
    ".side-bar",
    ".social-share",
    ".sticky",
    "#ads",
    "#banner",
    "#cookieBanner",
    "#modal",
    "#nav",
    "#overlay",
    "#popup",
    "#sidebar",
    "#socialMediaBox",
    "#stickyHeader",
    "#ad-container",
    ".ad-container",
    ".login-form",
    ".signup-form",
    ".tooltip",
    ".dropdown-menu",
    // ".alert", // Keep commented
    ".breadcrumb",
    ".pagination",
    // '[role="alert"]', // Keep commented
    '[role="banner"]',
    '[role="dialog"]',
    '[role="alertdialog"]',
    '[role="region"][aria-label*="skip" i]',
    '[aria-modal="true"]',
    ".noprint"
  ];

  /**
   * Removes every element matching the caller's `excludeSelectors` followed
   * by the default selectors. A selector that makes the query throw is
   * logged, recorded on `context.errors`, and skipped; the remaining
   * selectors are still applied. `next` is always called.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if (!$) {
      if (context.contentType.startsWith("text/html")) {
        logger.warn(
          `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
        );
      }
      await next();
      return;
    }
    try {
      const selectorsToRemove = [
        // Options are optional (other middleware reads them with `?.` too);
        // without them the defaults must still be applied rather than the
        // whole sanitization step failing on a TypeError.
        ...context.options?.excludeSelectors || [],
        ...this.defaultSelectorsToRemove
      ];
      logger.debug(
        `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
      );
      let removedCount = 0;
      for (const selector of selectorsToRemove) {
        try {
          const elements = $(selector);
          const count = elements.length;
          if (count > 0) {
            elements.remove();
            removedCount += count;
          }
        } catch (selectorError) {
          logger.warn(
            `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
          );
          context.errors.push(
            new Error(`Invalid selector "${selector}": ${selectorError}`)
          );
        }
      }
      logger.debug(`Removed ${removedCount} elements for ${context.source}`);
    } catch (error) {
      logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
      );
    }
    await next();
  }
};
|
|
798
|
+
|
|
799
|
+
// src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
|
|
800
|
+
import { gfm } from "@joplin/turndown-plugin-gfm";
|
|
801
|
+
import TurndownService from "turndown";
|
|
802
|
+
/**
 * Middleware that converts the parsed HTML in `context.dom` to Markdown using
 * Turndown (with the GFM plugin) and stores the result on `context.content`.
 */
var HtmlToMarkdownMiddleware = class {
  turndownService;

  constructor() {
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    this.turndownService.use(gfm);
    this.addCustomRules();
  }

  /**
   * Registers the custom `pre` rule, which emits fenced code blocks tagged
   * with a language taken from `data-language` or from a
   * `highlight-source-*` / `highlight-*` / `language-*` class on an ancestor
   * or descendant element.
   */
  addCustomRules() {
    const languageHostSelector =
      '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]';
    this.turndownService.addRule("pre", {
      filter: ["pre"],
      replacement: (content3, node2) => {
        const element = node2;
        let language = element.getAttribute("data-language") || "";
        if (!language) {
          const highlightElement =
            element.closest(languageHostSelector) ||
            element.querySelector(languageHostSelector);
          if (highlightElement) {
            const match = highlightElement.className.match(
              /(?:highlight-source-|highlight-|language-)(\w+)/
            );
            if (match) language = match[1];
          }
        }
        // <br> elements carry the line breaks in some highlighters; turn them
        // into real newlines before reading the text.
        for (const br of element.querySelectorAll("br")) {
          br.replaceWith("\n");
        }
        const code = (element.textContent || "").replace(/^\n+|\n+$/g, "");
        // A line inside the code that itself starts with a backtick fence
        // would terminate a 3-backtick block early, so make the outer fence
        // longer than any such run.
        let fenceLength = 3;
        for (const run of code.matchAll(/^`{3,}/gm)) {
          fenceLength = Math.max(fenceLength, run[0].length + 1);
        }
        const fence = "`".repeat(fenceLength);
        return `\n${fence}${language}\n${code}\n${fence}\n`;
      }
    });
  }

  /**
   * Processes the context to convert the sanitized HTML to Markdown.
   * The `<body>` markup is used when present, otherwise the whole document.
   * On success `context.content` becomes the trimmed Markdown (possibly the
   * empty string, which is logged as a warning) and `context.contentType`
   * becomes "text/markdown". Conversion failures are logged and recorded on
   * `context.errors`; `next` is always called.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if (!$) {
      if (context.contentType.startsWith("text/html")) {
        logger.warn(
          `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
        );
      }
      await next();
      return;
    }
    try {
      logger.debug(`Converting HTML content to Markdown for ${context.source}`);
      const htmlToConvert = $("body").html() || $.html();
      const markdown = this.turndownService.turndown(htmlToConvert).trim();
      if (!markdown) {
        logger.warn(
          `HTML to Markdown conversion resulted in empty content for ${context.source}.`
        );
      }
      context.content = markdown;
      context.contentType = "text/markdown";
      if (markdown) {
        logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
      }
    } catch (error) {
      logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
      context.errors.push(
        new Error(
          `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
        )
      );
    }
    await next();
  }
};
|
|
893
|
+
|
|
894
|
+
// src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
|
|
895
|
+
/**
 * Placeholder middleware for Markdown link extraction. It extracts nothing;
 * it only guarantees that `context.links` is an array for Markdown content.
 */
var MarkdownLinkExtractorMiddleware = class {
  /**
   * For "text/markdown" content, replaces a non-array `context.links` with an
   * empty array. An existing array is left untouched. Always calls `next`.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const isMarkdown = context.contentType === "text/markdown";
    if (isMarkdown && !Array.isArray(context.links)) {
      context.links = [];
    }
    await next();
  }
};
|
|
910
|
+
|
|
911
|
+
// src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
|
|
912
|
+
/**
 * Middleware that sets `context.metadata.title` for Markdown and plain-text
 * content.
 */
var MarkdownMetadataExtractorMiddleware = class {
  /**
   * Processes the context to extract the title from Markdown.
   * Applies to "text/markdown" and "text/plain" content only. Non-string
   * content is decoded as UTF-8 and written back to `context.content`.
   * For Markdown the title is the first non-empty level-1 ATX heading;
   * otherwise (and for plain text) it is "Untitled". Failures are recorded on
   * `context.errors`; `next` is always called.
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (context.contentType === "text/markdown" || context.contentType === "text/plain") {
      try {
        const textContent =
          typeof context.content === "string"
            ? context.content
            : Buffer.from(context.content).toString("utf-8");
        if (typeof context.content !== "string") {
          context.content = textContent;
        }
        let title = "Untitled";
        if (context.contentType === "text/markdown") {
          // Only spaces/tabs may separate "#" from the title. The previous
          // `\s+` could consume a newline, so an empty "#" line captured the
          // NEXT line as the title. `\S` also skips headings with no text.
          const match = textContent.match(/^#[ \t]+(\S.*)$/m);
          if (match?.[1]) {
            title = match[1].trim();
          }
        }
        context.metadata.title = title;
      } catch (error) {
        context.errors.push(
          new Error(
            `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
          )
        );
      }
    }
    await next();
  }
};
|
|
944
|
+
|
|
608
945
|
// src/scraper/strategies/BaseScraperStrategy.ts
|
|
609
946
|
import { URL as URL2 } from "node:url";
|
|
610
947
|
|
|
@@ -629,8 +966,8 @@ var CancellationError = class extends PipelineError {
|
|
|
629
966
|
};
|
|
630
967
|
|
|
631
968
|
// src/scraper/strategies/BaseScraperStrategy.ts
|
|
632
|
-
var
|
|
633
|
-
var
|
|
969
|
+
var DEFAULT_MAX_PAGES2 = 100;
|
|
970
|
+
var DEFAULT_MAX_DEPTH2 = 3;
|
|
634
971
|
var DEFAULT_CONCURRENCY = 3;
|
|
635
972
|
var BaseScraperStrategy = class {
|
|
636
973
|
visited = /* @__PURE__ */ new Set();
|
|
@@ -639,19 +976,14 @@ var BaseScraperStrategy = class {
|
|
|
639
976
|
constructor(options = {}) {
|
|
640
977
|
this.options = options;
|
|
641
978
|
}
|
|
642
|
-
getProcessor
|
|
643
|
-
if (mimeType.startsWith("text/html")) {
|
|
644
|
-
return new HtmlProcessor();
|
|
645
|
-
}
|
|
646
|
-
return new MarkdownProcessor();
|
|
647
|
-
}
|
|
979
|
+
// Removed getProcessor method as processing is now handled by strategies using middleware pipelines
|
|
648
980
|
async processBatch(batch, baseUrl, options, progressCallback, signal) {
|
|
649
981
|
const results = await Promise.all(
|
|
650
982
|
batch.map(async (item) => {
|
|
651
983
|
if (signal?.aborted) {
|
|
652
984
|
throw new CancellationError("Scraping cancelled during batch processing");
|
|
653
985
|
}
|
|
654
|
-
const maxDepth = options.maxDepth ??
|
|
986
|
+
const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH2;
|
|
655
987
|
if (item.depth > maxDepth) {
|
|
656
988
|
return [];
|
|
657
989
|
}
|
|
@@ -659,7 +991,7 @@ var BaseScraperStrategy = class {
|
|
|
659
991
|
const result = await this.processItem(item, options, void 0, signal);
|
|
660
992
|
if (result.document) {
|
|
661
993
|
this.pageCount++;
|
|
662
|
-
const maxPages = options.maxPages ??
|
|
994
|
+
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
|
|
663
995
|
logger.info(
|
|
664
996
|
`\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
|
|
665
997
|
);
|
|
@@ -711,7 +1043,7 @@ var BaseScraperStrategy = class {
|
|
|
711
1043
|
const baseUrl = new URL2(options.url);
|
|
712
1044
|
const queue = [{ url: options.url, depth: 0 }];
|
|
713
1045
|
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
714
|
-
const maxPages = options.maxPages ??
|
|
1046
|
+
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
|
|
715
1047
|
const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
|
|
716
1048
|
while (queue.length > 0 && this.pageCount < maxPages) {
|
|
717
1049
|
if (signal?.aborted) {
|
|
@@ -745,9 +1077,12 @@ var BaseScraperStrategy = class {
|
|
|
745
1077
|
var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
746
1078
|
httpFetcher = new HttpFetcher();
|
|
747
1079
|
shouldFollowLinkFn;
|
|
1080
|
+
playwrightMiddleware;
|
|
1081
|
+
// Add member
|
|
748
1082
|
constructor(options = {}) {
|
|
749
1083
|
super({ urlNormalizerOptions: options.urlNormalizerOptions });
|
|
750
1084
|
this.shouldFollowLinkFn = options.shouldFollowLink;
|
|
1085
|
+
this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
751
1086
|
}
|
|
752
1087
|
canHandle(url) {
|
|
753
1088
|
try {
|
|
@@ -781,12 +1116,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
781
1116
|
followRedirects: options.followRedirects
|
|
782
1117
|
};
|
|
783
1118
|
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
784
|
-
const
|
|
785
|
-
|
|
1119
|
+
const initialContext = {
|
|
1120
|
+
content: rawContent.content,
|
|
1121
|
+
contentType: rawContent.mimeType,
|
|
1122
|
+
source: rawContent.source,
|
|
1123
|
+
// Use the final source URL after redirects
|
|
1124
|
+
metadata: {},
|
|
1125
|
+
links: [],
|
|
1126
|
+
errors: [],
|
|
1127
|
+
options,
|
|
1128
|
+
fetcher: this.httpFetcher
|
|
1129
|
+
};
|
|
1130
|
+
let pipeline;
|
|
1131
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1132
|
+
const htmlPipelineSteps = [
|
|
1133
|
+
this.playwrightMiddleware,
|
|
1134
|
+
// Use the instance member
|
|
1135
|
+
// TODO: Add HtmlJsExecutorMiddleware here if needed based on options
|
|
1136
|
+
new HtmlCheerioParserMiddleware(),
|
|
1137
|
+
// Always runs after content is finalized
|
|
1138
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1139
|
+
new HtmlLinkExtractorMiddleware(),
|
|
1140
|
+
new HtmlSanitizerMiddleware(),
|
|
1141
|
+
// Element remover
|
|
1142
|
+
new HtmlToMarkdownMiddleware()
|
|
1143
|
+
];
|
|
1144
|
+
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1145
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1146
|
+
pipeline = new ContentProcessingPipeline([
|
|
1147
|
+
new MarkdownMetadataExtractorMiddleware(),
|
|
1148
|
+
new MarkdownLinkExtractorMiddleware()
|
|
1149
|
+
// Placeholder for now
|
|
1150
|
+
]);
|
|
1151
|
+
} else {
|
|
1152
|
+
logger.warn(
|
|
1153
|
+
`Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
|
|
1154
|
+
);
|
|
1155
|
+
return { document: void 0, links: [] };
|
|
1156
|
+
}
|
|
1157
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1158
|
+
for (const err of finalContext.errors) {
|
|
1159
|
+
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1160
|
+
}
|
|
1161
|
+
if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
|
|
1162
|
+
logger.warn(`No processable content found for ${url} after pipeline execution.`);
|
|
1163
|
+
return { document: void 0, links: finalContext.links };
|
|
1164
|
+
}
|
|
786
1165
|
const baseUrl = new URL(options.url);
|
|
787
|
-
const
|
|
1166
|
+
const filteredLinks = finalContext.links.filter((link) => {
|
|
788
1167
|
try {
|
|
789
|
-
const targetUrl = new URL(link
|
|
1168
|
+
const targetUrl = new URL(link);
|
|
790
1169
|
const scope = options.scope || "subpages";
|
|
791
1170
|
return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
792
1171
|
} catch {
|
|
@@ -795,21 +1174,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
795
1174
|
});
|
|
796
1175
|
return {
|
|
797
1176
|
document: {
|
|
798
|
-
content:
|
|
1177
|
+
content: finalContext.content,
|
|
1178
|
+
// Final processed content (Markdown)
|
|
799
1179
|
metadata: {
|
|
800
|
-
url:
|
|
801
|
-
|
|
1180
|
+
url: finalContext.source,
|
|
1181
|
+
// URL after redirects
|
|
1182
|
+
// Ensure title is a string, default to "Untitled"
|
|
1183
|
+
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
802
1184
|
library: options.library,
|
|
803
1185
|
version: options.version
|
|
1186
|
+
// Add other metadata from context if needed
|
|
804
1187
|
}
|
|
805
1188
|
},
|
|
806
|
-
links
|
|
1189
|
+
links: filteredLinks
|
|
1190
|
+
// Use the filtered links
|
|
807
1191
|
};
|
|
808
1192
|
} catch (error) {
|
|
809
|
-
logger.error(`Failed
|
|
1193
|
+
logger.error(`Failed processing page ${url}: ${error}`);
|
|
810
1194
|
throw error;
|
|
811
1195
|
}
|
|
812
1196
|
}
|
|
1197
|
+
/**
|
|
1198
|
+
* Overrides the base scrape method to ensure the Playwright browser is closed
|
|
1199
|
+
* after the scraping process completes or errors out.
|
|
1200
|
+
*/
|
|
1201
|
+
async scrape(options, progressCallback, signal) {
|
|
1202
|
+
try {
|
|
1203
|
+
await super.scrape(options, progressCallback, signal);
|
|
1204
|
+
} finally {
|
|
1205
|
+
await this.playwrightMiddleware.closeBrowser();
|
|
1206
|
+
}
|
|
1207
|
+
}
|
|
813
1208
|
};
|
|
814
1209
|
|
|
815
1210
|
// src/scraper/strategies/GitHubScraperStrategy.ts
|
|
@@ -879,18 +1274,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
|
|
|
879
1274
|
}
|
|
880
1275
|
logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
|
|
881
1276
|
const rawContent = await this.fileFetcher.fetch(item.url);
|
|
882
|
-
const
|
|
883
|
-
|
|
1277
|
+
const initialContext = {
|
|
1278
|
+
content: rawContent.content,
|
|
1279
|
+
contentType: rawContent.mimeType,
|
|
1280
|
+
source: rawContent.source,
|
|
1281
|
+
// file:// URL
|
|
1282
|
+
metadata: {},
|
|
1283
|
+
links: [],
|
|
1284
|
+
// LocalFileStrategy doesn't extract links from file content itself
|
|
1285
|
+
errors: [],
|
|
1286
|
+
options
|
|
1287
|
+
// Pass the full options object
|
|
1288
|
+
};
|
|
1289
|
+
let pipeline;
|
|
1290
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1291
|
+
pipeline = new ContentProcessingPipeline([
|
|
1292
|
+
new HtmlCheerioParserMiddleware(),
|
|
1293
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1294
|
+
// No HtmlLinkExtractorMiddleware needed for local files
|
|
1295
|
+
new HtmlSanitizerMiddleware(),
|
|
1296
|
+
new HtmlToMarkdownMiddleware()
|
|
1297
|
+
]);
|
|
1298
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
|
|
1299
|
+
initialContext.contentType.startsWith("text/")) {
|
|
1300
|
+
pipeline = new ContentProcessingPipeline([
|
|
1301
|
+
new MarkdownMetadataExtractorMiddleware()
|
|
1302
|
+
// No MarkdownLinkExtractorMiddleware needed for local files
|
|
1303
|
+
]);
|
|
1304
|
+
} else {
|
|
1305
|
+
logger.warn(
|
|
1306
|
+
`Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
|
|
1307
|
+
);
|
|
1308
|
+
return { document: void 0, links: [] };
|
|
1309
|
+
}
|
|
1310
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1311
|
+
for (const err of finalContext.errors) {
|
|
1312
|
+
logger.warn(`Processing error for ${filePath}: ${err.message}`);
|
|
1313
|
+
}
|
|
1314
|
+
const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
|
|
884
1315
|
return {
|
|
885
1316
|
document: {
|
|
886
|
-
|
|
1317
|
+
// Use the potentially empty string content
|
|
1318
|
+
content: finalContentString,
|
|
887
1319
|
metadata: {
|
|
888
|
-
url:
|
|
889
|
-
|
|
1320
|
+
url: finalContext.source,
|
|
1321
|
+
// Use context source (file:// URL)
|
|
1322
|
+
// Ensure title is a string, default to "Untitled"
|
|
1323
|
+
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
890
1324
|
library: options.library,
|
|
891
1325
|
version: options.version
|
|
892
1326
|
}
|
|
893
1327
|
}
|
|
1328
|
+
// No links returned from file content processing
|
|
894
1329
|
};
|
|
895
1330
|
}
|
|
896
1331
|
async scrape(options, progressCallback, signal) {
|
|
@@ -1003,7 +1438,7 @@ var PipelineWorker = class {
|
|
|
1003
1438
|
async executeJob(job, callbacks) {
|
|
1004
1439
|
const { id: jobId, library, version, options, abortController } = job;
|
|
1005
1440
|
const signal = abortController.signal;
|
|
1006
|
-
logger.
|
|
1441
|
+
logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
|
|
1007
1442
|
try {
|
|
1008
1443
|
await this.scraperService.scrape(
|
|
1009
1444
|
options,
|
|
@@ -1323,14 +1758,13 @@ var LibraryNotFoundError = class extends ToolError {
|
|
|
1323
1758
|
|
|
1324
1759
|
// src/tools/FetchUrlTool.ts
|
|
1325
1760
|
var FetchUrlTool = class {
|
|
1326
|
-
constructor(httpFetcher, fileFetcher, processor) {
|
|
1327
|
-
this.processor = processor;
|
|
1328
|
-
this.fetchers = [httpFetcher, fileFetcher];
|
|
1329
|
-
}
|
|
1330
1761
|
/**
|
|
1331
1762
|
* Collection of fetchers that will be tried in order for a given URL.
|
|
1332
1763
|
*/
|
|
1333
1764
|
fetchers;
|
|
1765
|
+
constructor(httpFetcher, fileFetcher) {
|
|
1766
|
+
this.fetchers = [httpFetcher, fileFetcher];
|
|
1767
|
+
}
|
|
1334
1768
|
/**
|
|
1335
1769
|
* Fetches content from a URL and converts it to Markdown.
|
|
1336
1770
|
* Supports both HTTP/HTTPS URLs and local file URLs (file://).
|
|
@@ -1338,7 +1772,7 @@ var FetchUrlTool = class {
|
|
|
1338
1772
|
* @throws {ToolError} If fetching or processing fails
|
|
1339
1773
|
*/
|
|
1340
1774
|
async execute(options) {
|
|
1341
|
-
const { url } = options;
|
|
1775
|
+
const { url, scrapeMode = "auto" /* Auto */ } = options;
|
|
1342
1776
|
const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
|
|
1343
1777
|
const fetcherIndex = canFetchResults.findIndex((result) => result === true);
|
|
1344
1778
|
if (fetcherIndex === -1) {
|
|
@@ -1348,18 +1782,88 @@ var FetchUrlTool = class {
|
|
|
1348
1782
|
);
|
|
1349
1783
|
}
|
|
1350
1784
|
const fetcher = this.fetchers[fetcherIndex];
|
|
1785
|
+
const playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
1351
1786
|
try {
|
|
1352
1787
|
logger.info(`\u{1F4E1} Fetching ${url}...`);
|
|
1353
1788
|
const rawContent = await fetcher.fetch(url, {
|
|
1354
1789
|
followRedirects: options.followRedirects ?? true,
|
|
1355
1790
|
maxRetries: 3
|
|
1791
|
+
// Keep retries for fetching
|
|
1356
1792
|
});
|
|
1357
|
-
logger.info("\u{1F504}
|
|
1358
|
-
const
|
|
1359
|
-
|
|
1360
|
-
|
|
1793
|
+
logger.info("\u{1F504} Processing content...");
|
|
1794
|
+
const initialContext = {
|
|
1795
|
+
content: rawContent.content,
|
|
1796
|
+
contentType: rawContent.mimeType,
|
|
1797
|
+
source: rawContent.source,
|
|
1798
|
+
metadata: {},
|
|
1799
|
+
links: [],
|
|
1800
|
+
// Links not needed for this tool's output
|
|
1801
|
+
errors: [],
|
|
1802
|
+
fetcher,
|
|
1803
|
+
// Create a minimal ScraperOptions object for the context
|
|
1804
|
+
options: {
|
|
1805
|
+
url,
|
|
1806
|
+
// Use the input URL
|
|
1807
|
+
library: "",
|
|
1808
|
+
// Not applicable for this tool
|
|
1809
|
+
version: "",
|
|
1810
|
+
// Use empty string instead of undefined
|
|
1811
|
+
// Default other options as needed by middleware
|
|
1812
|
+
maxDepth: 0,
|
|
1813
|
+
maxPages: 1,
|
|
1814
|
+
maxConcurrency: 1,
|
|
1815
|
+
scope: "subpages",
|
|
1816
|
+
// Default, though not used for single page fetch
|
|
1817
|
+
followRedirects: options.followRedirects ?? true,
|
|
1818
|
+
excludeSelectors: void 0,
|
|
1819
|
+
// Not currently configurable via this tool
|
|
1820
|
+
ignoreErrors: false,
|
|
1821
|
+
scrapeMode
|
|
1822
|
+
// Pass the scrapeMode
|
|
1823
|
+
}
|
|
1824
|
+
};
|
|
1825
|
+
let pipeline;
|
|
1826
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1827
|
+
const htmlPipelineSteps = [
|
|
1828
|
+
playwrightMiddleware,
|
|
1829
|
+
// Use the instantiated middleware
|
|
1830
|
+
new HtmlCheerioParserMiddleware(),
|
|
1831
|
+
// Always runs after content is finalized
|
|
1832
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1833
|
+
// Keep for potential future use
|
|
1834
|
+
// No Link Extractor needed for this tool
|
|
1835
|
+
new HtmlSanitizerMiddleware(),
|
|
1836
|
+
// Element remover
|
|
1837
|
+
new HtmlToMarkdownMiddleware()
|
|
1838
|
+
];
|
|
1839
|
+
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1840
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1841
|
+
pipeline = new ContentProcessingPipeline([
|
|
1842
|
+
new MarkdownMetadataExtractorMiddleware()
|
|
1843
|
+
// Extract title (though not used)
|
|
1844
|
+
// No further processing needed for Markdown/Plain text for this tool
|
|
1845
|
+
]);
|
|
1846
|
+
} else {
|
|
1847
|
+
logger.warn(
|
|
1848
|
+
`Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
|
|
1849
|
+
);
|
|
1850
|
+
const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
|
|
1851
|
+
return contentString;
|
|
1852
|
+
}
|
|
1853
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1854
|
+
for (const err of finalContext.errors) {
|
|
1855
|
+
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1856
|
+
}
|
|
1857
|
+
if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
|
|
1858
|
+
throw new ToolError(
|
|
1859
|
+
`Processing resulted in empty content for ${url}`,
|
|
1860
|
+
this.constructor.name
|
|
1861
|
+
);
|
|
1862
|
+
}
|
|
1863
|
+
logger.info(`\u2705 Successfully processed ${url}`);
|
|
1864
|
+
return finalContext.content;
|
|
1361
1865
|
} catch (error) {
|
|
1362
|
-
if (error instanceof ScraperError) {
|
|
1866
|
+
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
1363
1867
|
throw new ToolError(
|
|
1364
1868
|
`Failed to fetch or process URL: ${error.message}`,
|
|
1365
1869
|
this.constructor.name
|
|
@@ -1369,6 +1873,8 @@ var FetchUrlTool = class {
|
|
|
1369
1873
|
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
1370
1874
|
this.constructor.name
|
|
1371
1875
|
);
|
|
1876
|
+
} finally {
|
|
1877
|
+
await playwrightMiddleware.closeBrowser();
|
|
1372
1878
|
}
|
|
1373
1879
|
}
|
|
1374
1880
|
};
|
|
@@ -1489,10 +1995,12 @@ var ScrapeTool = class {
|
|
|
1489
1995
|
version: internalVersion,
|
|
1490
1996
|
scope: scraperOptions?.scope ?? "subpages",
|
|
1491
1997
|
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
1492
|
-
maxPages: scraperOptions?.maxPages ??
|
|
1493
|
-
maxDepth: scraperOptions?.maxDepth ??
|
|
1494
|
-
|
|
1495
|
-
ignoreErrors: scraperOptions?.ignoreErrors ?? true
|
|
1998
|
+
maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
|
|
1999
|
+
maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
|
|
2000
|
+
maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
|
|
2001
|
+
ignoreErrors: scraperOptions?.ignoreErrors ?? true,
|
|
2002
|
+
scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
|
|
2003
|
+
// Pass scrapeMode enum
|
|
1496
2004
|
});
|
|
1497
2005
|
logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
|
|
1498
2006
|
options.onProgress?.({
|
|
@@ -1780,7 +2288,6 @@ import Fuse from "fuse.js";
|
|
|
1780
2288
|
import semver3 from "semver";
|
|
1781
2289
|
|
|
1782
2290
|
// src/splitter/SemanticMarkdownSplitter.ts
|
|
1783
|
-
import { JSDOM as JSDOM2 } from "jsdom";
|
|
1784
2291
|
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
|
|
1785
2292
|
import remarkGfm from "remark-gfm";
|
|
1786
2293
|
import remarkHtml from "remark-html";
|
|
@@ -10597,7 +11104,7 @@ ${"```"}`;
|
|
|
10597
11104
|
* Parse HTML
|
|
10598
11105
|
*/
|
|
10599
11106
|
async parseHtml(html) {
|
|
10600
|
-
const { window } =
|
|
11107
|
+
const { window } = createJSDOM(html);
|
|
10601
11108
|
return window.document;
|
|
10602
11109
|
}
|
|
10603
11110
|
};
|
|
@@ -11566,11 +12073,14 @@ var DocumentManagementService = class {
|
|
|
11566
12073
|
};
|
|
11567
12074
|
|
|
11568
12075
|
export {
|
|
12076
|
+
DEFAULT_MAX_PAGES,
|
|
12077
|
+
DEFAULT_MAX_DEPTH,
|
|
12078
|
+
DEFAULT_MAX_CONCURRENCY,
|
|
11569
12079
|
setLogLevel,
|
|
11570
12080
|
logger,
|
|
11571
12081
|
HttpFetcher,
|
|
11572
12082
|
FileFetcher,
|
|
11573
|
-
|
|
12083
|
+
ScrapeMode,
|
|
11574
12084
|
PipelineJobStatus,
|
|
11575
12085
|
PipelineManager,
|
|
11576
12086
|
CancelJobTool,
|
|
@@ -11585,4 +12095,4 @@ export {
|
|
|
11585
12095
|
SearchTool,
|
|
11586
12096
|
DocumentManagementService
|
|
11587
12097
|
};
|
|
11588
|
-
//# sourceMappingURL=chunk-
|
|
12098
|
+
//# sourceMappingURL=chunk-VTO2ED43.js.map
|