@arabold/docs-mcp-server 1.9.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/dist/{chunk-A5FW7XVC.js → chunk-VTO2ED43.js} +748 -245
- package/dist/chunk-VTO2ED43.js.map +1 -0
- package/dist/cli.js +44 -15
- package/dist/cli.js.map +1 -1
- package/dist/server.js +3 -7
- package/dist/server.js.map +1 -1
- package/package.json +5 -6
- package/dist/chunk-A5FW7XVC.js.map +0 -1
|
@@ -292,215 +292,13 @@ var FileFetcher = class {
|
|
|
292
292
|
}
|
|
293
293
|
};
|
|
294
294
|
|
|
295
|
-
// src/scraper/
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
selectorsToRemove = [
|
|
303
|
-
"nav",
|
|
304
|
-
"footer",
|
|
305
|
-
"script",
|
|
306
|
-
"style",
|
|
307
|
-
"noscript",
|
|
308
|
-
"svg",
|
|
309
|
-
"link",
|
|
310
|
-
"meta",
|
|
311
|
-
"iframe",
|
|
312
|
-
"header",
|
|
313
|
-
"button",
|
|
314
|
-
"input",
|
|
315
|
-
"textarea",
|
|
316
|
-
"select",
|
|
317
|
-
// "form", // Known issue: Some pages use alerts for important content
|
|
318
|
-
".ads",
|
|
319
|
-
".advertisement",
|
|
320
|
-
".banner",
|
|
321
|
-
".cookie-banner",
|
|
322
|
-
".cookie-consent",
|
|
323
|
-
".hidden",
|
|
324
|
-
".hide",
|
|
325
|
-
".modal",
|
|
326
|
-
".nav-bar",
|
|
327
|
-
".overlay",
|
|
328
|
-
".popup",
|
|
329
|
-
".promo",
|
|
330
|
-
".mw-editsection",
|
|
331
|
-
".side-bar",
|
|
332
|
-
".social-share",
|
|
333
|
-
".sticky",
|
|
334
|
-
"#ads",
|
|
335
|
-
"#banner",
|
|
336
|
-
"#cookieBanner",
|
|
337
|
-
"#modal",
|
|
338
|
-
"#nav",
|
|
339
|
-
"#overlay",
|
|
340
|
-
"#popup",
|
|
341
|
-
"#sidebar",
|
|
342
|
-
"#socialMediaBox",
|
|
343
|
-
"#stickyHeader",
|
|
344
|
-
"#ad-container",
|
|
345
|
-
".ad-container",
|
|
346
|
-
".login-form",
|
|
347
|
-
".signup-form",
|
|
348
|
-
".tooltip",
|
|
349
|
-
".dropdown-menu",
|
|
350
|
-
// ".alert", // Known issue: Some pages use alerts for important content
|
|
351
|
-
".breadcrumb",
|
|
352
|
-
".pagination",
|
|
353
|
-
// '[role="alert"]', // Known issue: Some pages use alerts for important content
|
|
354
|
-
'[role="banner"]',
|
|
355
|
-
'[role="dialog"]',
|
|
356
|
-
'[role="alertdialog"]',
|
|
357
|
-
'[role="region"][aria-label*="skip" i]',
|
|
358
|
-
'[aria-modal="true"]',
|
|
359
|
-
".noprint"
|
|
360
|
-
];
|
|
361
|
-
constructor(options) {
|
|
362
|
-
this.turndownService = new TurndownService({
|
|
363
|
-
headingStyle: "atx",
|
|
364
|
-
hr: "---",
|
|
365
|
-
bulletListMarker: "-",
|
|
366
|
-
codeBlockStyle: "fenced",
|
|
367
|
-
emDelimiter: "_",
|
|
368
|
-
strongDelimiter: "**",
|
|
369
|
-
linkStyle: "inlined"
|
|
370
|
-
});
|
|
371
|
-
this.turndownService.addRule("pre", {
|
|
372
|
-
filter: ["pre"],
|
|
373
|
-
replacement: (content3, node2) => {
|
|
374
|
-
const element = node2;
|
|
375
|
-
let language = element.getAttribute("data-language") || "";
|
|
376
|
-
if (!language) {
|
|
377
|
-
const highlightElement = element.closest(
|
|
378
|
-
'[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
|
|
379
|
-
);
|
|
380
|
-
if (highlightElement) {
|
|
381
|
-
const className = highlightElement.className;
|
|
382
|
-
const match = className.match(
|
|
383
|
-
/(?:highlight-source-|highlight-|language-)(\w+)/
|
|
384
|
-
);
|
|
385
|
-
if (match) {
|
|
386
|
-
language = match[1];
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
}
|
|
390
|
-
const text3 = (() => {
|
|
391
|
-
const clone = element.cloneNode(true);
|
|
392
|
-
const brElements = Array.from(clone.querySelectorAll("br"));
|
|
393
|
-
for (const br of brElements) {
|
|
394
|
-
br.replaceWith("\n");
|
|
395
|
-
}
|
|
396
|
-
return clone.textContent;
|
|
397
|
-
})();
|
|
398
|
-
return `
|
|
399
|
-
\`\`\`${language}
|
|
400
|
-
${text3}
|
|
401
|
-
\`\`\`
|
|
402
|
-
`;
|
|
403
|
-
}
|
|
404
|
-
});
|
|
405
|
-
this.turndownService.addRule("table", {
|
|
406
|
-
filter: ["table"],
|
|
407
|
-
replacement: (content3) => {
|
|
408
|
-
const cleanedContent = content3.replace(/\n+/g, "\n");
|
|
409
|
-
return `
|
|
410
|
-
|
|
411
|
-
${cleanedContent}
|
|
412
|
-
|
|
413
|
-
`;
|
|
414
|
-
}
|
|
415
|
-
});
|
|
416
|
-
this.options = options || {};
|
|
417
|
-
}
|
|
418
|
-
canProcess(content3) {
|
|
419
|
-
return content3.mimeType.startsWith("text/html");
|
|
420
|
-
}
|
|
421
|
-
async process(content3) {
|
|
422
|
-
if (!this.canProcess(content3)) {
|
|
423
|
-
throw new ScraperError(
|
|
424
|
-
`HtmlProcessor cannot process content of type ${content3.mimeType}`,
|
|
425
|
-
false
|
|
426
|
-
);
|
|
427
|
-
}
|
|
428
|
-
const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
|
|
429
|
-
const window = new JSDOM(htmlContent, { url: content3.source }).window;
|
|
430
|
-
const title = window.document.title || "Untitled";
|
|
431
|
-
const purify = createDOMPurify(window);
|
|
432
|
-
const purifiedContent = purify.sanitize(htmlContent, {
|
|
433
|
-
WHOLE_DOCUMENT: true,
|
|
434
|
-
RETURN_DOM: true
|
|
435
|
-
});
|
|
436
|
-
const linkElements = purifiedContent.querySelectorAll("a[href]");
|
|
437
|
-
let links = [];
|
|
438
|
-
if (this.options.extractLinks !== false) {
|
|
439
|
-
links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
|
|
440
|
-
try {
|
|
441
|
-
return new URL(href, content3.source).href;
|
|
442
|
-
} catch {
|
|
443
|
-
return null;
|
|
444
|
-
}
|
|
445
|
-
}).filter((url) => url !== null);
|
|
446
|
-
}
|
|
447
|
-
const selectorsToRemove = [
|
|
448
|
-
...this.options.excludeSelectors || [],
|
|
449
|
-
...this.selectorsToRemove
|
|
450
|
-
];
|
|
451
|
-
for (const selector of selectorsToRemove) {
|
|
452
|
-
const elements = purifiedContent.querySelectorAll(selector);
|
|
453
|
-
for (const el of elements) {
|
|
454
|
-
el.remove();
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
const cleanedContent = purifiedContent.innerHTML;
|
|
458
|
-
const markdown = this.turndownService.turndown(cleanedContent || "").trim();
|
|
459
|
-
if (!markdown) {
|
|
460
|
-
throw new ScraperError("No valid content found", false);
|
|
461
|
-
}
|
|
462
|
-
return {
|
|
463
|
-
content: markdown,
|
|
464
|
-
title,
|
|
465
|
-
source: content3.source,
|
|
466
|
-
links,
|
|
467
|
-
metadata: {}
|
|
468
|
-
};
|
|
469
|
-
}
|
|
470
|
-
};
|
|
471
|
-
|
|
472
|
-
// src/scraper/processor/MarkdownProcessor.ts
|
|
473
|
-
/**
 * Pass-through processor for Markdown (and plain-text) documents.
 *
 * The content is returned unchanged; only a title is derived from the first
 * level-1 ATX heading. No links are extracted.
 */
var MarkdownProcessor = class {
  /**
   * Tells whether this processor accepts the given raw content.
   * @param content3 Raw content carrying `mimeType` and `source`.
   * @returns `true` for `text/markdown`, `text/plain`, or a source ending in `.md`.
   */
  canProcess(content3) {
    const { mimeType } = content3;
    if (mimeType === "text/markdown" || mimeType === "text/plain") {
      // Plain text is handled as if it were Markdown.
      return true;
    }
    return content3.source.endsWith(".md");
  }

  /**
   * Decodes the content (if it is not already a string) and wraps it in a result object.
   * @param content3 Raw content with `content`, `mimeType`, `source` and optional `encoding`.
   * @returns An object with `content`, `title`, `source`, `links` and `metadata`.
   * @throws ScraperError when the content type is not accepted or the text is blank.
   */
  async process(content3) {
    if (!this.canProcess(content3)) {
      throw new ScraperError(
        `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
        false
      );
    }

    const raw = content3.content;
    const markdownContent = typeof raw === "string" ? raw : raw.toString(content3.encoding || "utf-8");
    if (!markdownContent.trim()) {
      throw new ScraperError("Empty Markdown content", false);
    }

    return {
      content: markdownContent,
      title: this.extractTitle(markdownContent) || "Untitled",
      source: content3.source,
      // TODO: Extract links from Markdown
      links: [],
      metadata: {}
    };
  }

  /**
   * Finds the first `# heading` line.
   * @param markdown Markdown text to scan.
   * @returns The trimmed heading text, or `null` when there is no such line.
   */
  extractTitle(markdown) {
    const heading = markdown.match(/^#\s+(.*)$/m);
    if (!heading) {
      return null;
    }
    return heading[1].trim();
  }
};
|
|
295
|
+
// src/scraper/types.ts
|
|
296
|
+
/**
 * String constants accepted for the `scrapeMode` option:
 * `Fetch` = "fetch", `Playwright` = "playwright", `Auto` = "auto".
 */
// `var` is required here: the initializer reads the hoisted (still undefined)
// binding via `ScrapeMode || {}`, which would throw with `let`/`const`.
var ScrapeMode = /* @__PURE__ */ ((modes) =>
  Object.assign(modes, {
    Fetch: "fetch",
    Playwright: "playwright",
    Auto: "auto"
  }))(ScrapeMode || {});
|
|
504
302
|
|
|
505
303
|
// node_modules/uuid/dist/esm-node/stringify.js
|
|
506
304
|
var byteToHex = [];
|
|
@@ -609,6 +407,541 @@ function isSubpath(baseUrl, targetUrl) {
|
|
|
609
407
|
return targetUrl.pathname.startsWith(basePath);
|
|
610
408
|
}
|
|
611
409
|
|
|
410
|
+
// src/scraper/middleware/ContentProcessorPipeline.ts
|
|
411
|
+
/**
 * Chains middleware objects over a single shared context.
 *
 * Each middleware receives the context and a `next` callback that invokes the
 * following middleware. Anything a middleware throws is converted to an `Error`,
 * appended to `context.errors`, and logged as a warning.
 */
var ContentProcessingPipeline = class {
  middleware;

  /**
   * @param middleware Ordered array of objects exposing `process(context, next)`.
   */
  constructor(middleware) {
    this.middleware = middleware;
  }

  /**
   * Runs the chain starting at the first middleware.
   * @param initialContext Context object; must carry an `errors` array. It is mutated in place.
   * @returns A promise resolving to the same context object once the chain has finished.
   */
  async run(initialContext) {
    // Highest position dispatched so far; used to detect a repeated `next()` call.
    let furthestPosition = -1;

    const invoke = async (position) => {
      const alreadyDispatched = position <= furthestPosition;
      if (alreadyDispatched) {
        throw new Error("next() called multiple times");
      }
      furthestPosition = position;

      const current = this.middleware[position];
      if (current) {
        const next = invoke.bind(null, position + 1);
        try {
          await current.process(initialContext, next);
        } catch (error) {
          const recorded = error instanceof Error ? error : new Error(String(error));
          initialContext.errors.push(recorded);
          logger.warn(`Error in middleware pipeline: ${error}`);
        }
      }
    };

    await invoke(0);
    return initialContext;
  }
};
|
|
450
|
+
|
|
451
|
+
// src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
|
|
452
|
+
import * as cheerio from "cheerio";
|
|
453
|
+
/**
 * Parses HTML content with Cheerio and stores the resulting root function on
 * `context.dom` for later middleware.
 */
var HtmlCheerioParserMiddleware = class {
  /**
   * @param context Processing context; reads `contentType`, `content`, `source`
   *   and writes `dom` (and `errors` on failure).
   * @param next Invokes the next middleware.
   *
   * Non-HTML content is passed straight through. When parsing fails the error is
   * recorded on `context.errors` and the chain stops here (`next` is not called).
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
    try {
      logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
      context.dom = cheerio.load(htmlString);
    } catch (error) {
      logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
      );
      return;
    }
    // Kept outside the try block so a rejection from downstream middleware is
    // not misreported as a Cheerio parsing failure.
    await next();
  }
};
|
|
474
|
+
|
|
475
|
+
// src/utils/dom.ts
|
|
476
|
+
import { JSDOM, VirtualConsole } from "jsdom";
|
|
477
|
+
/**
 * Builds a JSDOM instance whose default virtual console has only no-op
 * listeners registered for "error", "warn", "info", "debug" and "log".
 *
 * @param html Markup handed to the JSDOM constructor.
 * @param options Optional JSDOM options; any key given here (including
 *   `virtualConsole`) overrides the default.
 * @returns The new JSDOM instance.
 */
function createJSDOM(html, options) {
  const silentConsole = new VirtualConsole();
  for (const channel of ["error", "warn", "info", "debug", "log"]) {
    silentConsole.on(channel, () => {
    });
  }
  return new JSDOM(html, { virtualConsole: silentConsole, ...options });
}
|
|
495
|
+
|
|
496
|
+
// src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
|
|
497
|
+
/**
 * Collects the targets of `<a href>` elements from the parsed DOM.
 */
var HtmlLinkExtractorMiddleware = class {
  /**
   * Resolves every non-blank `href` against `context.source`, keeps only
   * `http:`, `https:` and `file:` URLs, de-duplicates them and stores the
   * result in `context.links`. `next` is always called, even after a failure.
   *
   * @param context The current processing context (reads `dom`, `source`, `contentType`).
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;

    if ($) {
      try {
        const anchors = $("a[href]");
        logger.debug(`Found ${anchors.length} potential links in ${context.source}`);

        // A Set keeps first-seen order while dropping duplicates.
        const resolved = new Set();
        anchors.each((_position, anchor) => {
          const href = $(anchor).attr("href");
          if (!href || href.trim() === "") {
            return;
          }
          try {
            const target = new URL(href, context.source);
            if (["http:", "https:", "file:"].includes(target.protocol)) {
              resolved.add(target.href);
            } else {
              logger.debug(`Ignoring link with invalid protocol: ${href}`);
            }
          } catch {
            logger.debug(`Ignoring invalid URL syntax: ${href}`);
          }
        });

        context.links = [...resolved];
        logger.debug(
          `Extracted ${context.links.length} unique, valid links from ${context.source}`
        );
      } catch (error) {
        logger.error(`Error extracting links from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract links from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }

    await next();
  }
};
|
|
548
|
+
|
|
549
|
+
// src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
|
|
550
|
+
/**
 * Derives a page title from the parsed DOM and stores it in `context.metadata.title`.
 */
var HtmlMetadataExtractorMiddleware = class {
  /**
   * Uses the first `<title>` text, falling back to the first `<h1>` and then
   * to "Untitled"; runs of whitespace are collapsed to single spaces.
   * `next` is always called, even after a failure.
   *
   * @param context The current processing context (reads `dom`, writes `metadata.title`).
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;

    if ($) {
      try {
        const firstText = (selector) => $(selector).first().text().trim();
        const rawTitle = firstText("title") || firstText("h1") || "Untitled";
        const title = rawTitle.replace(/\s+/g, " ").trim();
        context.metadata.title = title;
        logger.debug(`Extracted title: "${title}" from ${context.source}`);
      } catch (error) {
        logger.error(`Error extracting metadata from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract metadata from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }

    await next();
  }
};
|
|
587
|
+
|
|
588
|
+
// src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
|
|
589
|
+
import { chromium } from "playwright";
|
|
590
|
+
/**
 * Renders HTML content in a Chromium page driven by Playwright and replaces
 * `context.content` with the rendered markup.
 *
 * One browser instance is launched on demand and reused for later calls.
 */
var HtmlPlaywrightMiddleware = class {
  browser = null;
  // In-flight launch, shared so that concurrent callers do not each start a browser.
  browserLaunch = null;

  /**
   * Returns a connected browser, launching one if necessary.
   * Concurrent callers share a single launch.
   * @returns A promise resolving to the Playwright browser instance.
   */
  async ensureBrowser() {
    if (this.browser?.isConnected()) {
      return this.browser;
    }
    if (!this.browserLaunch) {
      this.browserLaunch = this.launchBrowser().finally(() => {
        this.browserLaunch = null;
      });
    }
    return this.browserLaunch;
  }

  /**
   * Launches Chromium with the arguments from the `PLAYWRIGHT_LAUNCH_ARGS`
   * environment variable (whitespace separated) and stores it on `this.browser`.
   * @returns A promise resolving to the newly launched browser.
   */
  async launchBrowser() {
    // Split on any whitespace and drop empties so that an empty variable or
    // repeated spaces never produce "" arguments.
    const launchArgs = (process.env.PLAYWRIGHT_LAUNCH_ARGS ?? "").split(/\s+/).filter(Boolean);
    logger.debug(
      `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
    );
    const browser = await chromium.launch({ channel: "chromium", args: launchArgs });
    browser.on("disconnected", () => {
      logger.debug("Playwright browser instance disconnected.");
      // Only clear the reference if it still points at the browser that went away.
      if (this.browser === browser) {
        this.browser = null;
      }
    });
    this.browser = browser;
    return browser;
  }

  /**
   * Closes the Playwright browser instance if it exists.
   * Should be called during application shutdown.
   */
  async closeBrowser() {
    if (this.browser?.isConnected()) {
      logger.debug("Closing Playwright browser instance...");
      await this.browser.close();
      this.browser = null;
    }
  }

  /**
   * Renders `context.source` in a page, serving the already-fetched
   * `context.content` for the main request, then stores the rendered HTML.
   *
   * Skipped for non-HTML content and when `options.scrapeMode` is neither
   * "playwright" nor "auto" (the default). Rendering failures are recorded on
   * `context.errors`; `next` is called in every case.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */;
    const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */;
    if (!shouldRunPlaywright) {
      logger.debug(
        `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
      );
      await next();
      return;
    }
    logger.debug(
      `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
    );
    let page = null;
    let renderedHtml = null;
    try {
      const browser = await this.ensureBrowser();
      page = await browser.newPage();
      logger.debug(`Playwright: Processing ${context.source}`);
      await page.route("**/*", (route) => {
        // Answer the main document request from the content we already have.
        if (route.request().url() === context.source) {
          return route.fulfill({
            status: 200,
            contentType: context.contentType,
            body: context.content
          });
        }
        // Skip resources that do not affect the rendered markup.
        const resourceType = route.request().resourceType();
        if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
          return route.abort();
        }
        return route.continue();
      });
      await page.goto(context.source, {
        waitUntil: "load"
      });
      renderedHtml = await page.content();
      logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
    } catch (error) {
      logger.error(`Playwright failed to render ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
      );
    } finally {
      if (page) {
        // Cleanup is best effort: a failure here must neither leak the page
        // nor prevent the rest of the pipeline from running.
        try {
          await page.unroute("**/*");
        } catch (cleanupError) {
          logger.debug(`Playwright: failed to remove routes for ${context.source}: ${cleanupError}`);
        }
        try {
          await page.close();
        } catch (cleanupError) {
          logger.warn(`Playwright: failed to close page for ${context.source}: ${cleanupError}`);
        }
      }
    }
    if (renderedHtml !== null) {
      context.content = renderedHtml;
      logger.debug(
        `Playwright middleware updated content for ${context.source}. Proceeding.`
      );
    } else {
      logger.warn(
        `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
      );
    }
    await next();
  }
};
|
|
687
|
+
|
|
688
|
+
// src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
|
|
689
|
+
/**
 * Removes unwanted elements (navigation, scripts, ads, dialogs, ...) from the
 * parsed DOM before it is converted further.
 */
var HtmlSanitizerMiddleware = class {
  // Default selectors to remove.
  // "form", ".alert" and '[role="alert"]' are deliberately not listed.
  defaultSelectorsToRemove = [
    "nav",
    "footer",
    "script",
    "style",
    "noscript",
    "svg",
    "link",
    "meta",
    "iframe",
    "header",
    "button",
    "input",
    "textarea",
    "select",
    ".ads",
    ".advertisement",
    ".banner",
    ".cookie-banner",
    ".cookie-consent",
    ".hidden",
    ".hide",
    ".modal",
    ".nav-bar",
    ".overlay",
    ".popup",
    ".promo",
    ".mw-editsection",
    ".side-bar",
    ".social-share",
    ".sticky",
    "#ads",
    "#banner",
    "#cookieBanner",
    "#modal",
    "#nav",
    "#overlay",
    "#popup",
    "#sidebar",
    "#socialMediaBox",
    "#stickyHeader",
    "#ad-container",
    ".ad-container",
    ".login-form",
    ".signup-form",
    ".tooltip",
    ".dropdown-menu",
    ".breadcrumb",
    ".pagination",
    '[role="banner"]',
    '[role="dialog"]',
    '[role="alertdialog"]',
    '[role="region"][aria-label*="skip" i]',
    '[aria-modal="true"]',
    ".noprint"
  ];

  /**
   * Removes every element matching `options.excludeSelectors` (if any) and the
   * default selector list from `context.dom`.
   *
   * A selector that makes the DOM query throw is recorded on `context.errors`
   * and skipped; the remaining selectors are still applied. `next` is always called.
   *
   * @param context The current processing context (reads `dom`, `options`, `source`).
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if (!$) {
      if (context.contentType.startsWith("text/html")) {
        logger.warn(
          `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
        );
      }
      await next();
      return;
    }
    try {
      const selectorsToRemove = [
        // `options` may be absent; without the optional chain the TypeError
        // would abort removal of the default selectors as well.
        ...(context.options?.excludeSelectors || []),
        ...this.defaultSelectorsToRemove
      ];
      logger.debug(
        `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
      );
      let removedCount = 0;
      for (const selector of selectorsToRemove) {
        try {
          const elements = $(selector);
          const count = elements.length;
          if (count > 0) {
            elements.remove();
            removedCount += count;
          }
        } catch (selectorError) {
          logger.warn(
            `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
          );
          context.errors.push(
            new Error(`Invalid selector "${selector}": ${selectorError}`)
          );
        }
      }
      logger.debug(`Removed ${removedCount} elements for ${context.source}`);
    } catch (error) {
      logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
      );
    }
    await next();
  }
};
|
|
798
|
+
|
|
799
|
+
// src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
|
|
800
|
+
import { gfm } from "@joplin/turndown-plugin-gfm";
|
|
801
|
+
import TurndownService from "turndown";
|
|
802
|
+
/**
 * Converts the parsed HTML DOM to Markdown with Turndown (plus the GFM plugin)
 * and stores the result in `context.content`.
 */
var HtmlToMarkdownMiddleware = class {
  turndownService;

  constructor() {
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    this.turndownService.use(gfm);
    this.addCustomRules();
  }

  /**
   * Registers a `pre` rule that emits a fenced code block. The language comes
   * from `data-language`, or from a `highlight-source-*`, `highlight-*` or
   * `language-*` class on an ancestor (checked first) or descendant.
   */
  addCustomRules() {
    const languageHintSelector = '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]';

    const detectLanguage = (pre) => {
      const declared = pre.getAttribute("data-language") || "";
      if (declared) {
        return declared;
      }
      const hinted = pre.closest(languageHintSelector) || pre.querySelector(languageHintSelector);
      if (!hinted) {
        return "";
      }
      const found = hinted.className.match(
        /(?:highlight-source-|highlight-|language-)(\w+)/
      );
      return found ? found[1] : "";
    };

    this.turndownService.addRule("pre", {
      filter: ["pre"],
      replacement: (_content, node2) => {
        const language = detectLanguage(node2);

        // Turn <br> into real newlines so they survive the textContent read.
        const lineBreaks = node2.querySelectorAll("br");
        if (lineBreaks.length > 0) {
          for (const lineBreak of lineBreaks) {
            lineBreak.replaceWith("\n");
          }
        }

        const code = (node2.textContent || "").replace(/^\n+|\n+$/g, "");
        return `\n\`\`\`${language}\n${code}\n\`\`\`\n`;
      }
    });
  }

  /**
   * Converts `<body>` (or the whole document when the body is empty) to
   * Markdown. On success `context.content` is replaced and `contentType`
   * becomes "text/markdown"; an empty conversion result stores "". Failures are
   * recorded on `context.errors`. `next` is always called.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;

    if ($) {
      try {
        logger.debug(`Converting HTML content to Markdown for ${context.source}`);
        const htmlToConvert = $("body").html() || $.html();
        const markdown = this.turndownService.turndown(htmlToConvert).trim();
        if (markdown) {
          context.content = markdown;
          context.contentType = "text/markdown";
          logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
        } else {
          logger.warn(`HTML to Markdown conversion resulted in empty content for ${context.source}.`);
          context.content = "";
          context.contentType = "text/markdown";
        }
      } catch (error) {
        logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to convert HTML to Markdown: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
      );
    }

    await next();
  }
};
|
|
893
|
+
|
|
894
|
+
// src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
|
|
895
|
+
/**
 * Placeholder for Markdown link extraction: no links are extracted yet.
 */
var MarkdownLinkExtractorMiddleware = class {
  /**
   * For "text/markdown" content, makes sure `context.links` is an array
   * (replacing any non-array value with `[]`), then continues the chain.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const needsDefault = context.contentType === "text/markdown" && !Array.isArray(context.links);
    if (needsDefault) {
      context.links = [];
    }
    await next();
  }
};
|
|
910
|
+
|
|
911
|
+
// src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
|
|
912
|
+
/**
 * Sets `context.metadata.title` for Markdown and plain-text content.
 */
var MarkdownMetadataExtractorMiddleware = class {
  /**
   * For "text/markdown" the title is the first `# heading` line; otherwise
   * (and for "text/plain") it is "Untitled". Non-string content is decoded as
   * UTF-8 and written back to `context.content`. Other content types are left
   * untouched. `next` is always called.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const isMarkdown = context.contentType === "text/markdown";

    if (isMarkdown || context.contentType === "text/plain") {
      try {
        let text = context.content;
        if (typeof text !== "string") {
          text = Buffer.from(text).toString("utf-8");
          context.content = text;
        }
        const heading = isMarkdown ? text.match(/^#\s+(.*)$/m) : null;
        context.metadata.title = heading?.[1] ? heading[1].trim() : "Untitled";
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract metadata from Markdown: ${reason}`));
      }
    }

    await next();
  }
};
|
|
944
|
+
|
|
612
945
|
// src/scraper/strategies/BaseScraperStrategy.ts
|
|
613
946
|
import { URL as URL2 } from "node:url";
|
|
614
947
|
|
|
@@ -643,12 +976,7 @@ var BaseScraperStrategy = class {
|
|
|
643
976
|
constructor(options = {}) {
|
|
644
977
|
this.options = options;
|
|
645
978
|
}
|
|
646
|
-
getProcessor
|
|
647
|
-
if (mimeType.startsWith("text/html")) {
|
|
648
|
-
return new HtmlProcessor();
|
|
649
|
-
}
|
|
650
|
-
return new MarkdownProcessor();
|
|
651
|
-
}
|
|
979
|
+
// Removed getProcessor method as processing is now handled by strategies using middleware pipelines
|
|
652
980
|
async processBatch(batch, baseUrl, options, progressCallback, signal) {
|
|
653
981
|
const results = await Promise.all(
|
|
654
982
|
batch.map(async (item) => {
|
|
@@ -749,9 +1077,12 @@ var BaseScraperStrategy = class {
|
|
|
749
1077
|
var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
750
1078
|
httpFetcher = new HttpFetcher();
|
|
751
1079
|
shouldFollowLinkFn;
|
|
1080
|
+
playwrightMiddleware;
|
|
1081
|
+
// Add member
|
|
752
1082
|
constructor(options = {}) {
|
|
753
1083
|
super({ urlNormalizerOptions: options.urlNormalizerOptions });
|
|
754
1084
|
this.shouldFollowLinkFn = options.shouldFollowLink;
|
|
1085
|
+
this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
755
1086
|
}
|
|
756
1087
|
canHandle(url) {
|
|
757
1088
|
try {
|
|
@@ -785,12 +1116,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
785
1116
|
followRedirects: options.followRedirects
|
|
786
1117
|
};
|
|
787
1118
|
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
788
|
-
const
|
|
789
|
-
|
|
1119
|
+
const initialContext = {
|
|
1120
|
+
content: rawContent.content,
|
|
1121
|
+
contentType: rawContent.mimeType,
|
|
1122
|
+
source: rawContent.source,
|
|
1123
|
+
// Use the final source URL after redirects
|
|
1124
|
+
metadata: {},
|
|
1125
|
+
links: [],
|
|
1126
|
+
errors: [],
|
|
1127
|
+
options,
|
|
1128
|
+
fetcher: this.httpFetcher
|
|
1129
|
+
};
|
|
1130
|
+
let pipeline;
|
|
1131
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1132
|
+
const htmlPipelineSteps = [
|
|
1133
|
+
this.playwrightMiddleware,
|
|
1134
|
+
// Use the instance member
|
|
1135
|
+
// TODO: Add HtmlJsExecutorMiddleware here if needed based on options
|
|
1136
|
+
new HtmlCheerioParserMiddleware(),
|
|
1137
|
+
// Always runs after content is finalized
|
|
1138
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1139
|
+
new HtmlLinkExtractorMiddleware(),
|
|
1140
|
+
new HtmlSanitizerMiddleware(),
|
|
1141
|
+
// Element remover
|
|
1142
|
+
new HtmlToMarkdownMiddleware()
|
|
1143
|
+
];
|
|
1144
|
+
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1145
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1146
|
+
pipeline = new ContentProcessingPipeline([
|
|
1147
|
+
new MarkdownMetadataExtractorMiddleware(),
|
|
1148
|
+
new MarkdownLinkExtractorMiddleware()
|
|
1149
|
+
// Placeholder for now
|
|
1150
|
+
]);
|
|
1151
|
+
} else {
|
|
1152
|
+
logger.warn(
|
|
1153
|
+
`Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
|
|
1154
|
+
);
|
|
1155
|
+
return { document: void 0, links: [] };
|
|
1156
|
+
}
|
|
1157
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1158
|
+
for (const err of finalContext.errors) {
|
|
1159
|
+
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1160
|
+
}
|
|
1161
|
+
if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
|
|
1162
|
+
logger.warn(`No processable content found for ${url} after pipeline execution.`);
|
|
1163
|
+
return { document: void 0, links: finalContext.links };
|
|
1164
|
+
}
|
|
790
1165
|
const baseUrl = new URL(options.url);
|
|
791
|
-
const
|
|
1166
|
+
const filteredLinks = finalContext.links.filter((link) => {
|
|
792
1167
|
try {
|
|
793
|
-
const targetUrl = new URL(link
|
|
1168
|
+
const targetUrl = new URL(link);
|
|
794
1169
|
const scope = options.scope || "subpages";
|
|
795
1170
|
return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
796
1171
|
} catch {
|
|
@@ -799,21 +1174,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
799
1174
|
});
|
|
800
1175
|
return {
|
|
801
1176
|
document: {
|
|
802
|
-
content:
|
|
1177
|
+
content: finalContext.content,
|
|
1178
|
+
// Final processed content (Markdown)
|
|
803
1179
|
metadata: {
|
|
804
|
-
url:
|
|
805
|
-
|
|
1180
|
+
url: finalContext.source,
|
|
1181
|
+
// URL after redirects
|
|
1182
|
+
// Ensure title is a string, default to "Untitled"
|
|
1183
|
+
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
806
1184
|
library: options.library,
|
|
807
1185
|
version: options.version
|
|
1186
|
+
// Add other metadata from context if needed
|
|
808
1187
|
}
|
|
809
1188
|
},
|
|
810
|
-
links
|
|
1189
|
+
links: filteredLinks
|
|
1190
|
+
// Use the filtered links
|
|
811
1191
|
};
|
|
812
1192
|
} catch (error) {
|
|
813
|
-
logger.error(`Failed
|
|
1193
|
+
logger.error(`Failed processing page ${url}: ${error}`);
|
|
814
1194
|
throw error;
|
|
815
1195
|
}
|
|
816
1196
|
}
|
|
1197
|
+
/**
|
|
1198
|
+
* Overrides the base scrape method to ensure the Playwright browser is closed
|
|
1199
|
+
* after the scraping process completes or errors out.
|
|
1200
|
+
*/
|
|
1201
|
+
async scrape(options, progressCallback, signal) {
|
|
1202
|
+
try {
|
|
1203
|
+
await super.scrape(options, progressCallback, signal);
|
|
1204
|
+
} finally {
|
|
1205
|
+
await this.playwrightMiddleware.closeBrowser();
|
|
1206
|
+
}
|
|
1207
|
+
}
|
|
817
1208
|
};
|
|
818
1209
|
|
|
819
1210
|
// src/scraper/strategies/GitHubScraperStrategy.ts
|
|
@@ -883,18 +1274,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
|
|
|
883
1274
|
}
|
|
884
1275
|
logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
|
|
885
1276
|
const rawContent = await this.fileFetcher.fetch(item.url);
|
|
886
|
-
const
|
|
887
|
-
|
|
1277
|
+
const initialContext = {
|
|
1278
|
+
content: rawContent.content,
|
|
1279
|
+
contentType: rawContent.mimeType,
|
|
1280
|
+
source: rawContent.source,
|
|
1281
|
+
// file:// URL
|
|
1282
|
+
metadata: {},
|
|
1283
|
+
links: [],
|
|
1284
|
+
// LocalFileStrategy doesn't extract links from file content itself
|
|
1285
|
+
errors: [],
|
|
1286
|
+
options
|
|
1287
|
+
// Pass the full options object
|
|
1288
|
+
};
|
|
1289
|
+
let pipeline;
|
|
1290
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1291
|
+
pipeline = new ContentProcessingPipeline([
|
|
1292
|
+
new HtmlCheerioParserMiddleware(),
|
|
1293
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1294
|
+
// No HtmlLinkExtractorMiddleware needed for local files
|
|
1295
|
+
new HtmlSanitizerMiddleware(),
|
|
1296
|
+
new HtmlToMarkdownMiddleware()
|
|
1297
|
+
]);
|
|
1298
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
|
|
1299
|
+
initialContext.contentType.startsWith("text/")) {
|
|
1300
|
+
pipeline = new ContentProcessingPipeline([
|
|
1301
|
+
new MarkdownMetadataExtractorMiddleware()
|
|
1302
|
+
// No MarkdownLinkExtractorMiddleware needed for local files
|
|
1303
|
+
]);
|
|
1304
|
+
} else {
|
|
1305
|
+
logger.warn(
|
|
1306
|
+
`Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
|
|
1307
|
+
);
|
|
1308
|
+
return { document: void 0, links: [] };
|
|
1309
|
+
}
|
|
1310
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1311
|
+
for (const err of finalContext.errors) {
|
|
1312
|
+
logger.warn(`Processing error for ${filePath}: ${err.message}`);
|
|
1313
|
+
}
|
|
1314
|
+
const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
|
|
888
1315
|
return {
|
|
889
1316
|
document: {
|
|
890
|
-
|
|
1317
|
+
// Use the potentially empty string content
|
|
1318
|
+
content: finalContentString,
|
|
891
1319
|
metadata: {
|
|
892
|
-
url:
|
|
893
|
-
|
|
1320
|
+
url: finalContext.source,
|
|
1321
|
+
// Use context source (file:// URL)
|
|
1322
|
+
// Ensure title is a string, default to "Untitled"
|
|
1323
|
+
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
894
1324
|
library: options.library,
|
|
895
1325
|
version: options.version
|
|
896
1326
|
}
|
|
897
1327
|
}
|
|
1328
|
+
// No links returned from file content processing
|
|
898
1329
|
};
|
|
899
1330
|
}
|
|
900
1331
|
async scrape(options, progressCallback, signal) {
|
|
@@ -1007,7 +1438,7 @@ var PipelineWorker = class {
|
|
|
1007
1438
|
async executeJob(job, callbacks) {
|
|
1008
1439
|
const { id: jobId, library, version, options, abortController } = job;
|
|
1009
1440
|
const signal = abortController.signal;
|
|
1010
|
-
logger.
|
|
1441
|
+
logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
|
|
1011
1442
|
try {
|
|
1012
1443
|
await this.scraperService.scrape(
|
|
1013
1444
|
options,
|
|
@@ -1327,14 +1758,13 @@ var LibraryNotFoundError = class extends ToolError {
|
|
|
1327
1758
|
|
|
1328
1759
|
// src/tools/FetchUrlTool.ts
|
|
1329
1760
|
var FetchUrlTool = class {
|
|
1330
|
-
constructor(httpFetcher, fileFetcher, processor) {
|
|
1331
|
-
this.processor = processor;
|
|
1332
|
-
this.fetchers = [httpFetcher, fileFetcher];
|
|
1333
|
-
}
|
|
1334
1761
|
/**
|
|
1335
1762
|
* Collection of fetchers that will be tried in order for a given URL.
|
|
1336
1763
|
*/
|
|
1337
1764
|
fetchers;
|
|
1765
|
+
constructor(httpFetcher, fileFetcher) {
|
|
1766
|
+
this.fetchers = [httpFetcher, fileFetcher];
|
|
1767
|
+
}
|
|
1338
1768
|
/**
|
|
1339
1769
|
* Fetches content from a URL and converts it to Markdown.
|
|
1340
1770
|
* Supports both HTTP/HTTPS URLs and local file URLs (file://).
|
|
@@ -1342,7 +1772,7 @@ var FetchUrlTool = class {
|
|
|
1342
1772
|
* @throws {ToolError} If fetching or processing fails
|
|
1343
1773
|
*/
|
|
1344
1774
|
async execute(options) {
|
|
1345
|
-
const { url } = options;
|
|
1775
|
+
const { url, scrapeMode = "auto" /* Auto */ } = options;
|
|
1346
1776
|
const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
|
|
1347
1777
|
const fetcherIndex = canFetchResults.findIndex((result) => result === true);
|
|
1348
1778
|
if (fetcherIndex === -1) {
|
|
@@ -1352,18 +1782,88 @@ var FetchUrlTool = class {
|
|
|
1352
1782
|
);
|
|
1353
1783
|
}
|
|
1354
1784
|
const fetcher = this.fetchers[fetcherIndex];
|
|
1785
|
+
const playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
1355
1786
|
try {
|
|
1356
1787
|
logger.info(`\u{1F4E1} Fetching ${url}...`);
|
|
1357
1788
|
const rawContent = await fetcher.fetch(url, {
|
|
1358
1789
|
followRedirects: options.followRedirects ?? true,
|
|
1359
1790
|
maxRetries: 3
|
|
1791
|
+
// Keep retries for fetching
|
|
1360
1792
|
});
|
|
1361
|
-
logger.info("\u{1F504}
|
|
1362
|
-
const
|
|
1363
|
-
|
|
1364
|
-
|
|
1793
|
+
logger.info("\u{1F504} Processing content...");
|
|
1794
|
+
const initialContext = {
|
|
1795
|
+
content: rawContent.content,
|
|
1796
|
+
contentType: rawContent.mimeType,
|
|
1797
|
+
source: rawContent.source,
|
|
1798
|
+
metadata: {},
|
|
1799
|
+
links: [],
|
|
1800
|
+
// Links not needed for this tool's output
|
|
1801
|
+
errors: [],
|
|
1802
|
+
fetcher,
|
|
1803
|
+
// Create a minimal ScraperOptions object for the context
|
|
1804
|
+
options: {
|
|
1805
|
+
url,
|
|
1806
|
+
// Use the input URL
|
|
1807
|
+
library: "",
|
|
1808
|
+
// Not applicable for this tool
|
|
1809
|
+
version: "",
|
|
1810
|
+
// Use empty string instead of undefined
|
|
1811
|
+
// Default other options as needed by middleware
|
|
1812
|
+
maxDepth: 0,
|
|
1813
|
+
maxPages: 1,
|
|
1814
|
+
maxConcurrency: 1,
|
|
1815
|
+
scope: "subpages",
|
|
1816
|
+
// Default, though not used for single page fetch
|
|
1817
|
+
followRedirects: options.followRedirects ?? true,
|
|
1818
|
+
excludeSelectors: void 0,
|
|
1819
|
+
// Not currently configurable via this tool
|
|
1820
|
+
ignoreErrors: false,
|
|
1821
|
+
scrapeMode
|
|
1822
|
+
// Pass the scrapeMode
|
|
1823
|
+
}
|
|
1824
|
+
};
|
|
1825
|
+
let pipeline;
|
|
1826
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1827
|
+
const htmlPipelineSteps = [
|
|
1828
|
+
playwrightMiddleware,
|
|
1829
|
+
// Use the instantiated middleware
|
|
1830
|
+
new HtmlCheerioParserMiddleware(),
|
|
1831
|
+
// Always runs after content is finalized
|
|
1832
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1833
|
+
// Keep for potential future use
|
|
1834
|
+
// No Link Extractor needed for this tool
|
|
1835
|
+
new HtmlSanitizerMiddleware(),
|
|
1836
|
+
// Element remover
|
|
1837
|
+
new HtmlToMarkdownMiddleware()
|
|
1838
|
+
];
|
|
1839
|
+
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1840
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1841
|
+
pipeline = new ContentProcessingPipeline([
|
|
1842
|
+
new MarkdownMetadataExtractorMiddleware()
|
|
1843
|
+
// Extract title (though not used)
|
|
1844
|
+
// No further processing needed for Markdown/Plain text for this tool
|
|
1845
|
+
]);
|
|
1846
|
+
} else {
|
|
1847
|
+
logger.warn(
|
|
1848
|
+
`Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
|
|
1849
|
+
);
|
|
1850
|
+
const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
|
|
1851
|
+
return contentString;
|
|
1852
|
+
}
|
|
1853
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1854
|
+
for (const err of finalContext.errors) {
|
|
1855
|
+
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1856
|
+
}
|
|
1857
|
+
if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
|
|
1858
|
+
throw new ToolError(
|
|
1859
|
+
`Processing resulted in empty content for ${url}`,
|
|
1860
|
+
this.constructor.name
|
|
1861
|
+
);
|
|
1862
|
+
}
|
|
1863
|
+
logger.info(`\u2705 Successfully processed ${url}`);
|
|
1864
|
+
return finalContext.content;
|
|
1365
1865
|
} catch (error) {
|
|
1366
|
-
if (error instanceof ScraperError) {
|
|
1866
|
+
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
1367
1867
|
throw new ToolError(
|
|
1368
1868
|
`Failed to fetch or process URL: ${error.message}`,
|
|
1369
1869
|
this.constructor.name
|
|
@@ -1373,6 +1873,8 @@ var FetchUrlTool = class {
|
|
|
1373
1873
|
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
1374
1874
|
this.constructor.name
|
|
1375
1875
|
);
|
|
1876
|
+
} finally {
|
|
1877
|
+
await playwrightMiddleware.closeBrowser();
|
|
1376
1878
|
}
|
|
1377
1879
|
}
|
|
1378
1880
|
};
|
|
@@ -1496,7 +1998,9 @@ var ScrapeTool = class {
|
|
|
1496
1998
|
maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
|
|
1497
1999
|
maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
|
|
1498
2000
|
maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
|
|
1499
|
-
ignoreErrors: scraperOptions?.ignoreErrors ?? true
|
|
2001
|
+
ignoreErrors: scraperOptions?.ignoreErrors ?? true,
|
|
2002
|
+
scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
|
|
2003
|
+
// Pass scrapeMode enum
|
|
1500
2004
|
});
|
|
1501
2005
|
logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
|
|
1502
2006
|
options.onProgress?.({
|
|
@@ -1784,7 +2288,6 @@ import Fuse from "fuse.js";
|
|
|
1784
2288
|
import semver3 from "semver";
|
|
1785
2289
|
|
|
1786
2290
|
// src/splitter/SemanticMarkdownSplitter.ts
|
|
1787
|
-
import { JSDOM as JSDOM2 } from "jsdom";
|
|
1788
2291
|
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
|
|
1789
2292
|
import remarkGfm from "remark-gfm";
|
|
1790
2293
|
import remarkHtml from "remark-html";
|
|
@@ -10601,7 +11104,7 @@ ${"```"}`;
|
|
|
10601
11104
|
* Parse HTML
|
|
10602
11105
|
*/
|
|
10603
11106
|
async parseHtml(html) {
|
|
10604
|
-
const { window } =
|
|
11107
|
+
const { window } = createJSDOM(html);
|
|
10605
11108
|
return window.document;
|
|
10606
11109
|
}
|
|
10607
11110
|
};
|
|
@@ -11577,7 +12080,7 @@ export {
|
|
|
11577
12080
|
logger,
|
|
11578
12081
|
HttpFetcher,
|
|
11579
12082
|
FileFetcher,
|
|
11580
|
-
|
|
12083
|
+
ScrapeMode,
|
|
11581
12084
|
PipelineJobStatus,
|
|
11582
12085
|
PipelineManager,
|
|
11583
12086
|
CancelJobTool,
|
|
@@ -11592,4 +12095,4 @@ export {
|
|
|
11592
12095
|
SearchTool,
|
|
11593
12096
|
DocumentManagementService
|
|
11594
12097
|
};
|
|
11595
|
-
//# sourceMappingURL=chunk-
|
|
12098
|
+
//# sourceMappingURL=chunk-VTO2ED43.js.map
|