@arabold/docs-mcp-server 1.9.0 → 1.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +85 -241
- package/dist/{chunk-A5FW7XVC.js → chunk-VF2RUEVV.js} +779 -280
- package/dist/chunk-VF2RUEVV.js.map +1 -0
- package/dist/cli.js +46 -17
- package/dist/cli.js.map +1 -1
- package/dist/server.js +567 -366
- package/dist/server.js.map +1 -1
- package/package.json +6 -7
- package/dist/chunk-A5FW7XVC.js.map +0 -1
|
@@ -100,11 +100,6 @@ var require_extend = __commonJS({
|
|
|
100
100
|
}
|
|
101
101
|
});
|
|
102
102
|
|
|
103
|
-
// src/config.ts
|
|
104
|
-
var DEFAULT_MAX_PAGES = 1e3;
|
|
105
|
-
var DEFAULT_MAX_DEPTH = 3;
|
|
106
|
-
var DEFAULT_MAX_CONCURRENCY = 3;
|
|
107
|
-
|
|
108
103
|
// src/utils/logger.ts
|
|
109
104
|
var currentLogLevel = 2 /* INFO */;
|
|
110
105
|
function setLogLevel(level) {
|
|
@@ -292,215 +287,13 @@ var FileFetcher = class {
|
|
|
292
287
|
}
|
|
293
288
|
};
|
|
294
289
|
|
|
295
|
-
// src/scraper/
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
selectorsToRemove = [
|
|
303
|
-
"nav",
|
|
304
|
-
"footer",
|
|
305
|
-
"script",
|
|
306
|
-
"style",
|
|
307
|
-
"noscript",
|
|
308
|
-
"svg",
|
|
309
|
-
"link",
|
|
310
|
-
"meta",
|
|
311
|
-
"iframe",
|
|
312
|
-
"header",
|
|
313
|
-
"button",
|
|
314
|
-
"input",
|
|
315
|
-
"textarea",
|
|
316
|
-
"select",
|
|
317
|
-
// "form", // Known issue: Some pages use alerts for important content
|
|
318
|
-
".ads",
|
|
319
|
-
".advertisement",
|
|
320
|
-
".banner",
|
|
321
|
-
".cookie-banner",
|
|
322
|
-
".cookie-consent",
|
|
323
|
-
".hidden",
|
|
324
|
-
".hide",
|
|
325
|
-
".modal",
|
|
326
|
-
".nav-bar",
|
|
327
|
-
".overlay",
|
|
328
|
-
".popup",
|
|
329
|
-
".promo",
|
|
330
|
-
".mw-editsection",
|
|
331
|
-
".side-bar",
|
|
332
|
-
".social-share",
|
|
333
|
-
".sticky",
|
|
334
|
-
"#ads",
|
|
335
|
-
"#banner",
|
|
336
|
-
"#cookieBanner",
|
|
337
|
-
"#modal",
|
|
338
|
-
"#nav",
|
|
339
|
-
"#overlay",
|
|
340
|
-
"#popup",
|
|
341
|
-
"#sidebar",
|
|
342
|
-
"#socialMediaBox",
|
|
343
|
-
"#stickyHeader",
|
|
344
|
-
"#ad-container",
|
|
345
|
-
".ad-container",
|
|
346
|
-
".login-form",
|
|
347
|
-
".signup-form",
|
|
348
|
-
".tooltip",
|
|
349
|
-
".dropdown-menu",
|
|
350
|
-
// ".alert", // Known issue: Some pages use alerts for important content
|
|
351
|
-
".breadcrumb",
|
|
352
|
-
".pagination",
|
|
353
|
-
// '[role="alert"]', // Known issue: Some pages use alerts for important content
|
|
354
|
-
'[role="banner"]',
|
|
355
|
-
'[role="dialog"]',
|
|
356
|
-
'[role="alertdialog"]',
|
|
357
|
-
'[role="region"][aria-label*="skip" i]',
|
|
358
|
-
'[aria-modal="true"]',
|
|
359
|
-
".noprint"
|
|
360
|
-
];
|
|
361
|
-
constructor(options) {
|
|
362
|
-
this.turndownService = new TurndownService({
|
|
363
|
-
headingStyle: "atx",
|
|
364
|
-
hr: "---",
|
|
365
|
-
bulletListMarker: "-",
|
|
366
|
-
codeBlockStyle: "fenced",
|
|
367
|
-
emDelimiter: "_",
|
|
368
|
-
strongDelimiter: "**",
|
|
369
|
-
linkStyle: "inlined"
|
|
370
|
-
});
|
|
371
|
-
this.turndownService.addRule("pre", {
|
|
372
|
-
filter: ["pre"],
|
|
373
|
-
replacement: (content3, node2) => {
|
|
374
|
-
const element = node2;
|
|
375
|
-
let language = element.getAttribute("data-language") || "";
|
|
376
|
-
if (!language) {
|
|
377
|
-
const highlightElement = element.closest(
|
|
378
|
-
'[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
|
|
379
|
-
);
|
|
380
|
-
if (highlightElement) {
|
|
381
|
-
const className = highlightElement.className;
|
|
382
|
-
const match = className.match(
|
|
383
|
-
/(?:highlight-source-|highlight-|language-)(\w+)/
|
|
384
|
-
);
|
|
385
|
-
if (match) {
|
|
386
|
-
language = match[1];
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
}
|
|
390
|
-
const text3 = (() => {
|
|
391
|
-
const clone = element.cloneNode(true);
|
|
392
|
-
const brElements = Array.from(clone.querySelectorAll("br"));
|
|
393
|
-
for (const br of brElements) {
|
|
394
|
-
br.replaceWith("\n");
|
|
395
|
-
}
|
|
396
|
-
return clone.textContent;
|
|
397
|
-
})();
|
|
398
|
-
return `
|
|
399
|
-
\`\`\`${language}
|
|
400
|
-
${text3}
|
|
401
|
-
\`\`\`
|
|
402
|
-
`;
|
|
403
|
-
}
|
|
404
|
-
});
|
|
405
|
-
this.turndownService.addRule("table", {
|
|
406
|
-
filter: ["table"],
|
|
407
|
-
replacement: (content3) => {
|
|
408
|
-
const cleanedContent = content3.replace(/\n+/g, "\n");
|
|
409
|
-
return `
|
|
410
|
-
|
|
411
|
-
${cleanedContent}
|
|
412
|
-
|
|
413
|
-
`;
|
|
414
|
-
}
|
|
415
|
-
});
|
|
416
|
-
this.options = options || {};
|
|
417
|
-
}
|
|
418
|
-
canProcess(content3) {
|
|
419
|
-
return content3.mimeType.startsWith("text/html");
|
|
420
|
-
}
|
|
421
|
-
async process(content3) {
|
|
422
|
-
if (!this.canProcess(content3)) {
|
|
423
|
-
throw new ScraperError(
|
|
424
|
-
`HtmlProcessor cannot process content of type ${content3.mimeType}`,
|
|
425
|
-
false
|
|
426
|
-
);
|
|
427
|
-
}
|
|
428
|
-
const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
|
|
429
|
-
const window = new JSDOM(htmlContent, { url: content3.source }).window;
|
|
430
|
-
const title = window.document.title || "Untitled";
|
|
431
|
-
const purify = createDOMPurify(window);
|
|
432
|
-
const purifiedContent = purify.sanitize(htmlContent, {
|
|
433
|
-
WHOLE_DOCUMENT: true,
|
|
434
|
-
RETURN_DOM: true
|
|
435
|
-
});
|
|
436
|
-
const linkElements = purifiedContent.querySelectorAll("a[href]");
|
|
437
|
-
let links = [];
|
|
438
|
-
if (this.options.extractLinks !== false) {
|
|
439
|
-
links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
|
|
440
|
-
try {
|
|
441
|
-
return new URL(href, content3.source).href;
|
|
442
|
-
} catch {
|
|
443
|
-
return null;
|
|
444
|
-
}
|
|
445
|
-
}).filter((url) => url !== null);
|
|
446
|
-
}
|
|
447
|
-
const selectorsToRemove = [
|
|
448
|
-
...this.options.excludeSelectors || [],
|
|
449
|
-
...this.selectorsToRemove
|
|
450
|
-
];
|
|
451
|
-
for (const selector of selectorsToRemove) {
|
|
452
|
-
const elements = purifiedContent.querySelectorAll(selector);
|
|
453
|
-
for (const el of elements) {
|
|
454
|
-
el.remove();
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
const cleanedContent = purifiedContent.innerHTML;
|
|
458
|
-
const markdown = this.turndownService.turndown(cleanedContent || "").trim();
|
|
459
|
-
if (!markdown) {
|
|
460
|
-
throw new ScraperError("No valid content found", false);
|
|
461
|
-
}
|
|
462
|
-
return {
|
|
463
|
-
content: markdown,
|
|
464
|
-
title,
|
|
465
|
-
source: content3.source,
|
|
466
|
-
links,
|
|
467
|
-
metadata: {}
|
|
468
|
-
};
|
|
469
|
-
}
|
|
470
|
-
};
|
|
471
|
-
|
|
472
|
-
// src/scraper/processor/MarkdownProcessor.ts
|
|
473
|
-
/**
 * Processor for Markdown (and plain-text) documents.
 *
 * Passes the text through unchanged and derives a title from the first
 * level-one heading, falling back to "Untitled".
 */
var MarkdownProcessor = class {
  /**
   * Reports whether this processor accepts the given raw content.
   * @param content3 Raw content carrying `mimeType` and `source`.
   * @returns `true` for `text/markdown`, `text/plain`, or a source ending in `.md`.
   */
  canProcess(content3) {
    if (content3.mimeType === "text/markdown") {
      return true;
    }
    // Plain text is treated as Markdown.
    if (content3.mimeType === "text/plain") {
      return true;
    }
    return content3.source.endsWith(".md");
  }

  /**
   * Converts raw content into a processed document.
   * @param content3 Raw content; `content` may be a string or an object whose
   *   `toString(encoding)` yields the text.
   * @returns An object with `content`, `title`, `source`, `links` and `metadata`.
   * @throws ScraperError when the content type is unsupported or the text is blank.
   */
  async process(content3) {
    if (!this.canProcess(content3)) {
      throw new ScraperError(
        `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
        false
      );
    }

    const raw = content3.content;
    const markdownContent = typeof raw === "string" ? raw : raw.toString(content3.encoding || "utf-8");
    if (markdownContent.trim() === "") {
      throw new ScraperError("Empty Markdown content", false);
    }

    return {
      content: markdownContent,
      title: this.extractTitle(markdownContent) || "Untitled",
      source: content3.source,
      // TODO: Extract links from Markdown
      links: [],
      metadata: {}
    };
  }

  /**
   * Finds the first `# Heading` line in the text.
   * @param markdown Markdown text to scan.
   * @returns The trimmed heading text, or `null` when there is none.
   */
  extractTitle(markdown) {
    const heading = markdown.match(/^#\s+(.*)$/m);
    if (!heading) {
      return null;
    }
    return heading[1].trim();
  }
};
|
|
290
|
+
// src/scraper/types.ts
|
|
291
|
+
/**
 * Scrape mode identifiers. Each key maps to the lowercase string value
 * used in options (`"fetch"`, `"playwright"`, `"auto"`).
 */
var ScrapeMode = {
  Fetch: "fetch",
  Playwright: "playwright",
  Auto: "auto"
};
|
|
504
297
|
|
|
505
298
|
// node_modules/uuid/dist/esm-node/stringify.js
|
|
506
299
|
var byteToHex = [];
|
|
@@ -609,6 +402,552 @@ function isSubpath(baseUrl, targetUrl) {
|
|
|
609
402
|
return targetUrl.pathname.startsWith(basePath);
|
|
610
403
|
}
|
|
611
404
|
|
|
405
|
+
// src/scraper/middleware/ContentProcessorPipeline.ts
|
|
406
|
+
/**
 * Runs an ordered list of middleware over a shared, mutable context.
 *
 * Each middleware receives the context and a `next` callback that invokes
 * the following middleware. Anything thrown by a middleware is recorded on
 * `context.errors` and logged as a warning instead of being rethrown.
 */
var ContentProcessingPipeline = class {
  middleware;

  /**
   * @param middleware Middleware instances to execute, in order.
   */
  constructor(middleware) {
    this.middleware = middleware;
  }

  /**
   * Executes the pipeline.
   * @param initialContext Context handed to every middleware; must expose an `errors` array.
   * @returns The same context object once the chain has finished.
   */
  async run(initialContext) {
    // Highest position dispatched so far; detects a middleware calling next() twice.
    let lastDispatched = -1;

    const invoke = async (position) => {
      if (position <= lastDispatched) {
        throw new Error("next() called multiple times");
      }
      lastDispatched = position;

      const current = this.middleware[position];
      if (!current) {
        // Past the end of the chain: nothing left to run.
        return;
      }

      try {
        await current.process(initialContext, () => invoke(position + 1));
      } catch (error) {
        const recorded = error instanceof Error ? error : new Error(String(error));
        initialContext.errors.push(recorded);
        logger.warn(`Error in middleware pipeline: ${error}`);
      }
    };

    await invoke(0);
    return initialContext;
  }
};
|
|
445
|
+
|
|
446
|
+
// src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
|
|
447
|
+
import * as cheerio from "cheerio";
|
|
448
|
+
/**
 * Middleware that parses HTML content with Cheerio and stores the resulting
 * API object on `context.dom` for later middleware.
 */
var HtmlCheerioParserMiddleware = class {
  /**
   * Parses `context.content` when the content type is HTML.
   *
   * Non-HTML content is passed straight to the next middleware. When parsing
   * fails the error is logged and recorded on `context.errors`, and the rest
   * of the chain is not invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
    try {
      logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
      context.dom = cheerio.load(htmlString);
    } catch (error) {
      logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
      );
      return;
    }
    // Kept outside the try block so a failure in later middleware is never
    // misreported as a Cheerio parsing failure.
    await next();
  }
};
|
|
469
|
+
|
|
470
|
+
// src/utils/dom.ts
|
|
471
|
+
import { JSDOM, VirtualConsole } from "jsdom";
|
|
472
|
+
/**
 * Creates a JSDOM instance whose virtual console discards all output.
 *
 * No-op listeners are attached for the "error", "warn", "info", "debug" and
 * "log" events. Caller-supplied options are merged over the default, so an
 * explicit `virtualConsole` in `options` replaces the silent one.
 *
 * @param html Markup handed to the JSDOM constructor.
 * @param options Optional JSDOM constructor options.
 * @returns The new JSDOM instance.
 */
function createJSDOM(html, options) {
  const silentConsole = new VirtualConsole();
  const ignore = () => {
  };
  for (const channel of ["error", "warn", "info", "debug", "log"]) {
    silentConsole.on(channel, ignore);
  }
  return new JSDOM(html, { virtualConsole: silentConsole, ...options });
}
|
|
490
|
+
|
|
491
|
+
// src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
|
|
492
|
+
/**
 * Middleware that collects the hyperlinks found in the parsed HTML document
 * and stores them, de-duplicated, on `context.links`.
 */
var HtmlLinkExtractorMiddleware = class {
  /**
   * Resolves every `a[href]` against `context.source` and keeps the ones
   * that use the http:, https: or file: protocol.
   *
   * When `context.dom` is missing nothing is extracted; a warning is logged
   * if the content type is HTML. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if ($) {
      try {
        const anchors = $("a[href]");
        logger.debug(`Found ${anchors.length} potential links in ${context.source}`);
        const allowedProtocols = ["http:", "https:", "file:"];
        const collected = [];
        anchors.each((_position, anchor) => {
          const href = $(anchor).attr("href");
          if (!href || href.trim() === "") {
            return;
          }
          let resolved;
          try {
            resolved = new URL(href, context.source);
          } catch {
            logger.debug(`Ignoring invalid URL syntax: ${href}`);
            return;
          }
          if (!allowedProtocols.includes(resolved.protocol)) {
            logger.debug(`Ignoring link with invalid protocol: ${href}`);
            return;
          }
          collected.push(resolved.href);
        });
        // A Set drops duplicates while keeping first-seen order.
        context.links = Array.from(new Set(collected));
        logger.debug(
          `Extracted ${context.links.length} unique, valid links from ${context.source}`
        );
      } catch (error) {
        logger.error(`Error extracting links from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract links from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }
    await next();
  }
};
|
|
543
|
+
|
|
544
|
+
// src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
|
|
545
|
+
/**
 * Middleware that derives a document title from the parsed HTML and stores
 * it on `context.metadata.title`.
 */
var HtmlMetadataExtractorMiddleware = class {
  /**
   * Uses the text of the first `<title>`, then the first `<h1>`, then the
   * literal "Untitled"; runs of whitespace are collapsed to single spaces.
   *
   * When `context.dom` is missing nothing is extracted; a warning is logged
   * if the content type is HTML. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if ($) {
      try {
        const fromTitleTag = $("title").first().text().trim();
        // The <h1> is only consulted when the <title> text is empty.
        const rawTitle = fromTitleTag || $("h1").first().text().trim() || "Untitled";
        const normalized = rawTitle.replace(/\s+/g, " ").trim();
        context.metadata.title = normalized;
        logger.debug(`Extracted title: "${normalized}" from ${context.source}`);
      } catch (error) {
        logger.error(`Error extracting metadata from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract metadata from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }
    await next();
  }
};
|
|
582
|
+
|
|
583
|
+
// src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
|
|
584
|
+
import { chromium } from "playwright";
|
|
585
|
+
/**
 * Middleware that renders HTML content in a Playwright-driven Chromium page
 * and replaces `context.content` with the rendered markup.
 *
 * One browser instance is kept per middleware instance and reused across
 * calls; call `closeBrowser()` to release it.
 */
var HtmlPlaywrightMiddleware = class {
  browser = null;
  // Launch currently in flight, shared so concurrent callers do not each
  // start (and then leak) their own browser.
  browserLaunch = null;

  /**
   * Returns a connected browser, launching one if necessary.
   * Concurrent callers share a single launch.
   * @returns The connected Playwright browser.
   */
  async ensureBrowser() {
    if (this.browser?.isConnected()) {
      return this.browser;
    }
    if (!this.browserLaunch) {
      this.browserLaunch = this.launchBrowser().finally(() => {
        this.browserLaunch = null;
      });
    }
    return this.browserLaunch;
  }

  /**
   * Launches Chromium with the whitespace-separated arguments found in the
   * PLAYWRIGHT_LAUNCH_ARGS environment variable and stores it on `this.browser`.
   * @returns The newly launched browser.
   */
  async launchBrowser() {
    // Split on any whitespace and drop empty tokens so an empty variable or
    // repeated spaces never produce "" arguments.
    const launchArgs = (process.env.PLAYWRIGHT_LAUNCH_ARGS ?? "").split(/\s+/).filter(Boolean);
    logger.debug(
      `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
    );
    const browser = await chromium.launch({ channel: "chromium", args: launchArgs });
    browser.on("disconnected", () => {
      logger.debug("Playwright browser instance disconnected.");
      // Only forget the browser this handler belongs to; a newer instance
      // may already have replaced it.
      if (this.browser === browser) {
        this.browser = null;
      }
    });
    this.browser = browser;
    return browser;
  }

  /**
   * Closes the Playwright browser instance if it exists and is connected.
   * Should be called during application shutdown.
   */
  async closeBrowser() {
    const browser = this.browser;
    if (browser?.isConnected()) {
      logger.debug("Closing Playwright browser instance...");
      await browser.close();
      this.browser = null;
    }
  }

  /**
   * Removes the route handler and closes the page. Failures are logged and
   * not rethrown, so cleanup problems cannot discard rendered content.
   * @param page The Playwright page to dispose of.
   * @param source Source URL, used in log messages only.
   */
  async disposePage(page, source) {
    try {
      await page.unroute("**/*");
    } catch (error) {
      logger.warn(`Playwright: Failed to remove routes for ${source}: ${error}`);
    }
    try {
      await page.close();
    } catch (error) {
      logger.warn(`Playwright: Failed to close page for ${source}: ${error}`);
    }
  }

  /**
   * Renders the content with Playwright when the content type is HTML and
   * the scrape mode is "playwright" or "auto" (the default when unset).
   *
   * The request for `context.source` is fulfilled from `context.content`;
   * image, stylesheet, font and media requests are aborted; everything else
   * continues normally. Rendering errors are logged and recorded on
   * `context.errors`, leaving the content unchanged. The next middleware is
   * always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */;
    const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */;
    if (!shouldRunPlaywright) {
      logger.debug(
        `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
      );
      await next();
      return;
    }
    logger.debug(
      `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
    );
    let page = null;
    let renderedHtml = null;
    try {
      const browser = await this.ensureBrowser();
      page = await browser.newPage();
      logger.debug(`Playwright: Processing ${context.source}`);
      await page.route("**/*", (route) => {
        // Serve the already-fetched document instead of fetching it again.
        if (route.request().url() === context.source) {
          return route.fulfill({
            status: 200,
            contentType: context.contentType,
            body: context.content
          });
        }
        const resourceType = route.request().resourceType();
        if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
          return route.abort();
        }
        return route.continue();
      });
      await page.goto(context.source, {
        waitUntil: "load"
      });
      renderedHtml = await page.content();
      logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
    } catch (error) {
      logger.error(`Playwright failed to render ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
      );
    } finally {
      if (page) {
        await this.disposePage(page, context.source);
      }
    }
    if (renderedHtml !== null) {
      context.content = renderedHtml;
      logger.debug(
        `Playwright middleware updated content for ${context.source}. Proceeding.`
      );
    } else {
      logger.warn(
        `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
      );
    }
    await next();
  }
};
|
|
682
|
+
|
|
683
|
+
// src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
|
|
684
|
+
/**
 * Middleware that strips unwanted elements (navigation, ads, forms controls,
 * overlays, ...) from the parsed HTML document held in `context.dom`.
 */
var HtmlSanitizerMiddleware = class {
  // Default selectors to remove
  defaultSelectorsToRemove = [
    "nav",
    "footer",
    "script",
    "style",
    "noscript",
    "svg",
    "link",
    "meta",
    "iframe",
    "header",
    "button",
    "input",
    "textarea",
    "select",
    // "form", // Keep commented
    ".ads",
    ".advertisement",
    ".banner",
    ".cookie-banner",
    ".cookie-consent",
    ".hidden",
    ".hide",
    ".modal",
    ".nav-bar",
    ".overlay",
    ".popup",
    ".promo",
    ".mw-editsection",
    ".side-bar",
    ".social-share",
    ".sticky",
    "#ads",
    "#banner",
    "#cookieBanner",
    "#modal",
    "#nav",
    "#overlay",
    "#popup",
    "#sidebar",
    "#socialMediaBox",
    "#stickyHeader",
    "#ad-container",
    ".ad-container",
    ".login-form",
    ".signup-form",
    ".tooltip",
    ".dropdown-menu",
    // ".alert", // Keep commented
    ".breadcrumb",
    ".pagination",
    // '[role="alert"]', // Keep commented
    '[role="banner"]',
    '[role="dialog"]',
    '[role="alertdialog"]',
    '[role="region"][aria-label*="skip" i]',
    '[aria-modal="true"]',
    ".noprint"
  ];

  /**
   * Removes every element matching `context.options.excludeSelectors`
   * (when provided) followed by the default selector list.
   *
   * A selector that throws is logged, recorded on `context.errors` and
   * skipped; the remaining selectors are still applied. When `context.dom`
   * is missing nothing is removed; a warning is logged if the content type
   * is HTML. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if (!$) {
      if (context.contentType.startsWith("text/html")) {
        logger.warn(
          `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
        );
      }
      await next();
      return;
    }
    try {
      const selectorsToRemove = [
        // Options may be absent from the context; the defaults must still
        // be applied in that case instead of aborting the whole cleanup.
        ...context.options?.excludeSelectors || [],
        ...this.defaultSelectorsToRemove
      ];
      logger.debug(
        `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
      );
      let removedCount = 0;
      for (const selector of selectorsToRemove) {
        try {
          const elements = $(selector);
          const count = elements.length;
          if (count > 0) {
            elements.remove();
            removedCount += count;
          }
        } catch (selectorError) {
          logger.warn(
            `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
          );
          context.errors.push(
            new Error(`Invalid selector "${selector}": ${selectorError}`)
          );
        }
      }
      logger.debug(`Removed ${removedCount} elements for ${context.source}`);
    } catch (error) {
      logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
      );
    }
    await next();
  }
};
|
|
793
|
+
|
|
794
|
+
// src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
|
|
795
|
+
import { gfm } from "@joplin/turndown-plugin-gfm";
|
|
796
|
+
import TurndownService from "turndown";
|
|
797
|
+
/**
 * Middleware that converts the parsed HTML document into Markdown using
 * Turndown (with the GFM plugin) and stores the result on the context.
 */
var HtmlToMarkdownMiddleware = class {
  turndownService;

  /** Builds the Turndown converter and registers the custom rules. */
  constructor() {
    const converterOptions = {
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    };
    this.turndownService = new TurndownService(converterOptions);
    this.turndownService.use(gfm);
    this.addCustomRules();
  }

  /**
   * Registers two rules on the converter:
   * - "pre": emits a fenced code block, taking the language from
   *   `data-language` or from a `highlight-source-*`, `highlight-*` or
   *   `language-*` class on an ancestor or descendant.
   * - "anchor": emits `[text](href)`, drops links whose text is empty or
   *   "#", and emits plain text when there is no href.
   */
  addCustomRules() {
    const languageHintSelector = '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]';

    const renderCodeBlock = (_content, preNode) => {
      let language = preNode.getAttribute("data-language") || "";
      if (!language) {
        const hinted = preNode.closest(languageHintSelector) || preNode.querySelector(languageHintSelector);
        const found = hinted ? hinted.className.match(/(?:highlight-source-|highlight-|language-)(\w+)/) : null;
        if (found) {
          language = found[1];
        }
      }
      // Turn <br> elements into newlines so textContent keeps the line breaks.
      for (const lineBreak of Array.from(preNode.querySelectorAll("br"))) {
        lineBreak.replaceWith("\n");
      }
      const code = (preNode.textContent || "").replace(/^\n+|\n+$/g, "");
      return ["", "```" + language, code, "```", ""].join("\n");
    };

    const renderAnchor = (linkText, anchorNode) => {
      const href = anchorNode.getAttribute("href");
      if (!linkText || linkText === "#") {
        return "";
      }
      return href ? `[${linkText}](${href})` : linkText;
    };

    this.turndownService.addRule("pre", { filter: ["pre"], replacement: renderCodeBlock });
    this.turndownService.addRule("anchor", { filter: ["a"], replacement: renderAnchor });
  }

  /**
   * Converts the `<body>` markup (or the whole document when the body markup
   * is empty) to Markdown, then sets `context.content` to the trimmed result
   * and `context.contentType` to "text/markdown". An empty result is logged
   * as a warning. Conversion errors are logged and recorded on
   * `context.errors`. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if ($) {
      try {
        logger.debug(`Converting HTML content to Markdown for ${context.source}`);
        const htmlToConvert = $("body").html() || $.html();
        const markdown = this.turndownService.turndown(htmlToConvert).trim();
        const isEmpty = !markdown;
        if (isEmpty) {
          logger.warn(`HTML to Markdown conversion resulted in empty content for ${context.source}.`);
        }
        context.content = isEmpty ? "" : markdown;
        context.contentType = "text/markdown";
        if (!isEmpty) {
          logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
        }
      } catch (error) {
        logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to convert HTML to Markdown: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
      );
    }
    await next();
  }
};
|
|
899
|
+
|
|
900
|
+
// src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
|
|
901
|
+
/**
 * Placeholder middleware for Markdown link extraction. It extracts nothing;
 * it only guarantees that `context.links` is an array for Markdown content.
 */
var MarkdownLinkExtractorMiddleware = class {
  /**
   * Initialises `context.links` to an empty array when the content type is
   * exactly "text/markdown" and `links` is not already an array, then
   * invokes the next middleware.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const isMarkdown = context.contentType === "text/markdown";
    if (isMarkdown && !Array.isArray(context.links)) {
      context.links = [];
    }
    await next();
  }
};
|
|
916
|
+
|
|
917
|
+
// src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
|
|
918
|
+
/**
 * Middleware that derives a title for Markdown and plain-text content and
 * stores it on `context.metadata.title`.
 */
var MarkdownMetadataExtractorMiddleware = class {
  /**
   * For "text/markdown" the title is the first `# Heading` line; for
   * "text/plain", or when no heading is found, it is "Untitled". Non-string
   * content is decoded as UTF-8 and written back to `context.content`.
   * Other content types are left untouched. Errors are recorded on
   * `context.errors`. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const handled = ["text/markdown", "text/plain"].includes(context.contentType);
    if (handled) {
      try {
        let textContent = context.content;
        if (typeof textContent !== "string") {
          textContent = Buffer.from(textContent).toString("utf-8");
          context.content = textContent;
        }
        // Only Markdown is scanned for a heading; plain text stays untitled.
        const heading = context.contentType === "text/markdown" ? textContent.match(/^#\s+(.*)$/m) : null;
        context.metadata.title = heading?.[1] ? heading[1].trim() : "Untitled";
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract metadata from Markdown: ${reason}`));
      }
    }
    await next();
  }
};
|
|
950
|
+
|
|
612
951
|
// src/scraper/strategies/BaseScraperStrategy.ts
|
|
613
952
|
import { URL as URL2 } from "node:url";
|
|
614
953
|
|
|
@@ -633,8 +972,8 @@ var CancellationError = class extends PipelineError {
|
|
|
633
972
|
};
|
|
634
973
|
|
|
635
974
|
// src/scraper/strategies/BaseScraperStrategy.ts
|
|
636
|
-
var
|
|
637
|
-
var
|
|
975
|
+
var DEFAULT_MAX_PAGES = 100;
|
|
976
|
+
var DEFAULT_MAX_DEPTH = 3;
|
|
638
977
|
var DEFAULT_CONCURRENCY = 3;
|
|
639
978
|
var BaseScraperStrategy = class {
|
|
640
979
|
visited = /* @__PURE__ */ new Set();
|
|
@@ -643,19 +982,14 @@ var BaseScraperStrategy = class {
|
|
|
643
982
|
constructor(options = {}) {
|
|
644
983
|
this.options = options;
|
|
645
984
|
}
|
|
646
|
-
getProcessor
|
|
647
|
-
if (mimeType.startsWith("text/html")) {
|
|
648
|
-
return new HtmlProcessor();
|
|
649
|
-
}
|
|
650
|
-
return new MarkdownProcessor();
|
|
651
|
-
}
|
|
985
|
+
// Removed getProcessor method as processing is now handled by strategies using middleware pipelines
|
|
652
986
|
async processBatch(batch, baseUrl, options, progressCallback, signal) {
|
|
653
987
|
const results = await Promise.all(
|
|
654
988
|
batch.map(async (item) => {
|
|
655
989
|
if (signal?.aborted) {
|
|
656
990
|
throw new CancellationError("Scraping cancelled during batch processing");
|
|
657
991
|
}
|
|
658
|
-
const maxDepth = options.maxDepth ??
|
|
992
|
+
const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
|
|
659
993
|
if (item.depth > maxDepth) {
|
|
660
994
|
return [];
|
|
661
995
|
}
|
|
@@ -663,7 +997,7 @@ var BaseScraperStrategy = class {
|
|
|
663
997
|
const result = await this.processItem(item, options, void 0, signal);
|
|
664
998
|
if (result.document) {
|
|
665
999
|
this.pageCount++;
|
|
666
|
-
const maxPages = options.maxPages ??
|
|
1000
|
+
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
667
1001
|
logger.info(
|
|
668
1002
|
`\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
|
|
669
1003
|
);
|
|
@@ -715,7 +1049,7 @@ var BaseScraperStrategy = class {
|
|
|
715
1049
|
const baseUrl = new URL2(options.url);
|
|
716
1050
|
const queue = [{ url: options.url, depth: 0 }];
|
|
717
1051
|
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
718
|
-
const maxPages = options.maxPages ??
|
|
1052
|
+
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
719
1053
|
const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
|
|
720
1054
|
while (queue.length > 0 && this.pageCount < maxPages) {
|
|
721
1055
|
if (signal?.aborted) {
|
|
@@ -749,9 +1083,12 @@ var BaseScraperStrategy = class {
|
|
|
749
1083
|
var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
750
1084
|
httpFetcher = new HttpFetcher();
|
|
751
1085
|
shouldFollowLinkFn;
|
|
1086
|
+
playwrightMiddleware;
|
|
1087
|
+
// Add member
|
|
752
1088
|
constructor(options = {}) {
|
|
753
1089
|
super({ urlNormalizerOptions: options.urlNormalizerOptions });
|
|
754
1090
|
this.shouldFollowLinkFn = options.shouldFollowLink;
|
|
1091
|
+
this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
755
1092
|
}
|
|
756
1093
|
canHandle(url) {
|
|
757
1094
|
try {
|
|
@@ -785,12 +1122,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
785
1122
|
followRedirects: options.followRedirects
|
|
786
1123
|
};
|
|
787
1124
|
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
788
|
-
const
|
|
789
|
-
|
|
1125
|
+
const initialContext = {
|
|
1126
|
+
content: rawContent.content,
|
|
1127
|
+
contentType: rawContent.mimeType,
|
|
1128
|
+
source: rawContent.source,
|
|
1129
|
+
// Use the final source URL after redirects
|
|
1130
|
+
metadata: {},
|
|
1131
|
+
links: [],
|
|
1132
|
+
errors: [],
|
|
1133
|
+
options,
|
|
1134
|
+
fetcher: this.httpFetcher
|
|
1135
|
+
};
|
|
1136
|
+
let pipeline;
|
|
1137
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1138
|
+
const htmlPipelineSteps = [
|
|
1139
|
+
this.playwrightMiddleware,
|
|
1140
|
+
// Use the instance member
|
|
1141
|
+
// TODO: Add HtmlJsExecutorMiddleware here if needed based on options
|
|
1142
|
+
new HtmlCheerioParserMiddleware(),
|
|
1143
|
+
// Always runs after content is finalized
|
|
1144
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1145
|
+
new HtmlLinkExtractorMiddleware(),
|
|
1146
|
+
new HtmlSanitizerMiddleware(),
|
|
1147
|
+
// Element remover
|
|
1148
|
+
new HtmlToMarkdownMiddleware()
|
|
1149
|
+
];
|
|
1150
|
+
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1151
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1152
|
+
pipeline = new ContentProcessingPipeline([
|
|
1153
|
+
new MarkdownMetadataExtractorMiddleware(),
|
|
1154
|
+
new MarkdownLinkExtractorMiddleware()
|
|
1155
|
+
// Placeholder for now
|
|
1156
|
+
]);
|
|
1157
|
+
} else {
|
|
1158
|
+
logger.warn(
|
|
1159
|
+
`Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
|
|
1160
|
+
);
|
|
1161
|
+
return { document: void 0, links: [] };
|
|
1162
|
+
}
|
|
1163
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1164
|
+
for (const err of finalContext.errors) {
|
|
1165
|
+
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1166
|
+
}
|
|
1167
|
+
if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
|
|
1168
|
+
logger.warn(`No processable content found for ${url} after pipeline execution.`);
|
|
1169
|
+
return { document: void 0, links: finalContext.links };
|
|
1170
|
+
}
|
|
790
1171
|
const baseUrl = new URL(options.url);
|
|
791
|
-
const
|
|
1172
|
+
const filteredLinks = finalContext.links.filter((link) => {
|
|
792
1173
|
try {
|
|
793
|
-
const targetUrl = new URL(link
|
|
1174
|
+
const targetUrl = new URL(link);
|
|
794
1175
|
const scope = options.scope || "subpages";
|
|
795
1176
|
return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
796
1177
|
} catch {
|
|
@@ -799,21 +1180,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
|
|
|
799
1180
|
});
|
|
800
1181
|
return {
|
|
801
1182
|
document: {
|
|
802
|
-
content:
|
|
1183
|
+
content: finalContext.content,
|
|
1184
|
+
// Final processed content (Markdown)
|
|
803
1185
|
metadata: {
|
|
804
|
-
url:
|
|
805
|
-
|
|
1186
|
+
url: finalContext.source,
|
|
1187
|
+
// URL after redirects
|
|
1188
|
+
// Ensure title is a string, default to "Untitled"
|
|
1189
|
+
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
806
1190
|
library: options.library,
|
|
807
1191
|
version: options.version
|
|
1192
|
+
// Add other metadata from context if needed
|
|
808
1193
|
}
|
|
809
1194
|
},
|
|
810
|
-
links
|
|
1195
|
+
links: filteredLinks
|
|
1196
|
+
// Use the filtered links
|
|
811
1197
|
};
|
|
812
1198
|
} catch (error) {
|
|
813
|
-
logger.error(`Failed
|
|
1199
|
+
logger.error(`Failed processing page ${url}: ${error}`);
|
|
814
1200
|
throw error;
|
|
815
1201
|
}
|
|
816
1202
|
}
|
|
1203
|
+
/**
|
|
1204
|
+
* Overrides the base scrape method to ensure the Playwright browser is closed
|
|
1205
|
+
* after the scraping process completes or errors out.
|
|
1206
|
+
*/
|
|
1207
|
+
async scrape(options, progressCallback, signal) {
|
|
1208
|
+
try {
|
|
1209
|
+
await super.scrape(options, progressCallback, signal);
|
|
1210
|
+
} finally {
|
|
1211
|
+
await this.playwrightMiddleware.closeBrowser();
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
817
1214
|
};
|
|
818
1215
|
|
|
819
1216
|
// src/scraper/strategies/GitHubScraperStrategy.ts
|
|
@@ -883,18 +1280,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
|
|
|
883
1280
|
}
|
|
884
1281
|
logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
|
|
885
1282
|
const rawContent = await this.fileFetcher.fetch(item.url);
|
|
886
|
-
const
|
|
887
|
-
|
|
1283
|
+
const initialContext = {
|
|
1284
|
+
content: rawContent.content,
|
|
1285
|
+
contentType: rawContent.mimeType,
|
|
1286
|
+
source: rawContent.source,
|
|
1287
|
+
// file:// URL
|
|
1288
|
+
metadata: {},
|
|
1289
|
+
links: [],
|
|
1290
|
+
// LocalFileStrategy doesn't extract links from file content itself
|
|
1291
|
+
errors: [],
|
|
1292
|
+
options
|
|
1293
|
+
// Pass the full options object
|
|
1294
|
+
};
|
|
1295
|
+
let pipeline;
|
|
1296
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1297
|
+
pipeline = new ContentProcessingPipeline([
|
|
1298
|
+
new HtmlCheerioParserMiddleware(),
|
|
1299
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1300
|
+
// No HtmlLinkExtractorMiddleware needed for local files
|
|
1301
|
+
new HtmlSanitizerMiddleware(),
|
|
1302
|
+
new HtmlToMarkdownMiddleware()
|
|
1303
|
+
]);
|
|
1304
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
|
|
1305
|
+
initialContext.contentType.startsWith("text/")) {
|
|
1306
|
+
pipeline = new ContentProcessingPipeline([
|
|
1307
|
+
new MarkdownMetadataExtractorMiddleware()
|
|
1308
|
+
// No MarkdownLinkExtractorMiddleware needed for local files
|
|
1309
|
+
]);
|
|
1310
|
+
} else {
|
|
1311
|
+
logger.warn(
|
|
1312
|
+
`Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
|
|
1313
|
+
);
|
|
1314
|
+
return { document: void 0, links: [] };
|
|
1315
|
+
}
|
|
1316
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1317
|
+
for (const err of finalContext.errors) {
|
|
1318
|
+
logger.warn(`Processing error for ${filePath}: ${err.message}`);
|
|
1319
|
+
}
|
|
1320
|
+
const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
|
|
888
1321
|
return {
|
|
889
1322
|
document: {
|
|
890
|
-
|
|
1323
|
+
// Use the potentially empty string content
|
|
1324
|
+
content: finalContentString,
|
|
891
1325
|
metadata: {
|
|
892
|
-
url:
|
|
893
|
-
|
|
1326
|
+
url: finalContext.source,
|
|
1327
|
+
// Use context source (file:// URL)
|
|
1328
|
+
// Ensure title is a string, default to "Untitled"
|
|
1329
|
+
title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
|
|
894
1330
|
library: options.library,
|
|
895
1331
|
version: options.version
|
|
896
1332
|
}
|
|
897
1333
|
}
|
|
1334
|
+
// No links returned from file content processing
|
|
898
1335
|
};
|
|
899
1336
|
}
|
|
900
1337
|
async scrape(options, progressCallback, signal) {
|
|
@@ -1007,7 +1444,7 @@ var PipelineWorker = class {
|
|
|
1007
1444
|
async executeJob(job, callbacks) {
|
|
1008
1445
|
const { id: jobId, library, version, options, abortController } = job;
|
|
1009
1446
|
const signal = abortController.signal;
|
|
1010
|
-
logger.
|
|
1447
|
+
logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
|
|
1011
1448
|
try {
|
|
1012
1449
|
await this.scraperService.scrape(
|
|
1013
1450
|
options,
|
|
@@ -1327,14 +1764,13 @@ var LibraryNotFoundError = class extends ToolError {
|
|
|
1327
1764
|
|
|
1328
1765
|
// src/tools/FetchUrlTool.ts
|
|
1329
1766
|
var FetchUrlTool = class {
|
|
1330
|
-
constructor(httpFetcher, fileFetcher, processor) {
|
|
1331
|
-
this.processor = processor;
|
|
1332
|
-
this.fetchers = [httpFetcher, fileFetcher];
|
|
1333
|
-
}
|
|
1334
1767
|
/**
|
|
1335
1768
|
* Collection of fetchers that will be tried in order for a given URL.
|
|
1336
1769
|
*/
|
|
1337
1770
|
fetchers;
|
|
1771
|
+
constructor(httpFetcher, fileFetcher) {
|
|
1772
|
+
this.fetchers = [httpFetcher, fileFetcher];
|
|
1773
|
+
}
|
|
1338
1774
|
/**
|
|
1339
1775
|
* Fetches content from a URL and converts it to Markdown.
|
|
1340
1776
|
* Supports both HTTP/HTTPS URLs and local file URLs (file://).
|
|
@@ -1342,7 +1778,7 @@ var FetchUrlTool = class {
|
|
|
1342
1778
|
* @throws {ToolError} If fetching or processing fails
|
|
1343
1779
|
*/
|
|
1344
1780
|
async execute(options) {
|
|
1345
|
-
const { url } = options;
|
|
1781
|
+
const { url, scrapeMode = "auto" /* Auto */ } = options;
|
|
1346
1782
|
const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
|
|
1347
1783
|
const fetcherIndex = canFetchResults.findIndex((result) => result === true);
|
|
1348
1784
|
if (fetcherIndex === -1) {
|
|
@@ -1352,18 +1788,88 @@ var FetchUrlTool = class {
|
|
|
1352
1788
|
);
|
|
1353
1789
|
}
|
|
1354
1790
|
const fetcher = this.fetchers[fetcherIndex];
|
|
1791
|
+
const playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
1355
1792
|
try {
|
|
1356
1793
|
logger.info(`\u{1F4E1} Fetching ${url}...`);
|
|
1357
1794
|
const rawContent = await fetcher.fetch(url, {
|
|
1358
1795
|
followRedirects: options.followRedirects ?? true,
|
|
1359
1796
|
maxRetries: 3
|
|
1797
|
+
// Keep retries for fetching
|
|
1360
1798
|
});
|
|
1361
|
-
logger.info("\u{1F504}
|
|
1362
|
-
const
|
|
1363
|
-
|
|
1364
|
-
|
|
1799
|
+
logger.info("\u{1F504} Processing content...");
|
|
1800
|
+
const initialContext = {
|
|
1801
|
+
content: rawContent.content,
|
|
1802
|
+
contentType: rawContent.mimeType,
|
|
1803
|
+
source: rawContent.source,
|
|
1804
|
+
metadata: {},
|
|
1805
|
+
links: [],
|
|
1806
|
+
// Links not needed for this tool's output
|
|
1807
|
+
errors: [],
|
|
1808
|
+
fetcher,
|
|
1809
|
+
// Create a minimal ScraperOptions object for the context
|
|
1810
|
+
options: {
|
|
1811
|
+
url,
|
|
1812
|
+
// Use the input URL
|
|
1813
|
+
library: "",
|
|
1814
|
+
// Not applicable for this tool
|
|
1815
|
+
version: "",
|
|
1816
|
+
// Use empty string instead of undefined
|
|
1817
|
+
// Default other options as needed by middleware
|
|
1818
|
+
maxDepth: 0,
|
|
1819
|
+
maxPages: 1,
|
|
1820
|
+
maxConcurrency: 1,
|
|
1821
|
+
scope: "subpages",
|
|
1822
|
+
// Default, though not used for single page fetch
|
|
1823
|
+
followRedirects: options.followRedirects ?? true,
|
|
1824
|
+
excludeSelectors: void 0,
|
|
1825
|
+
// Not currently configurable via this tool
|
|
1826
|
+
ignoreErrors: false,
|
|
1827
|
+
scrapeMode
|
|
1828
|
+
// Pass the scrapeMode
|
|
1829
|
+
}
|
|
1830
|
+
};
|
|
1831
|
+
let pipeline;
|
|
1832
|
+
if (initialContext.contentType.startsWith("text/html")) {
|
|
1833
|
+
const htmlPipelineSteps = [
|
|
1834
|
+
playwrightMiddleware,
|
|
1835
|
+
// Use the instantiated middleware
|
|
1836
|
+
new HtmlCheerioParserMiddleware(),
|
|
1837
|
+
// Always runs after content is finalized
|
|
1838
|
+
new HtmlMetadataExtractorMiddleware(),
|
|
1839
|
+
// Keep for potential future use
|
|
1840
|
+
// No Link Extractor needed for this tool
|
|
1841
|
+
new HtmlSanitizerMiddleware(),
|
|
1842
|
+
// Element remover
|
|
1843
|
+
new HtmlToMarkdownMiddleware()
|
|
1844
|
+
];
|
|
1845
|
+
pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
|
|
1846
|
+
} else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
|
|
1847
|
+
pipeline = new ContentProcessingPipeline([
|
|
1848
|
+
new MarkdownMetadataExtractorMiddleware()
|
|
1849
|
+
// Extract title (though not used)
|
|
1850
|
+
// No further processing needed for Markdown/Plain text for this tool
|
|
1851
|
+
]);
|
|
1852
|
+
} else {
|
|
1853
|
+
logger.warn(
|
|
1854
|
+
`Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
|
|
1855
|
+
);
|
|
1856
|
+
const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
|
|
1857
|
+
return contentString;
|
|
1858
|
+
}
|
|
1859
|
+
const finalContext = await pipeline.run(initialContext);
|
|
1860
|
+
for (const err of finalContext.errors) {
|
|
1861
|
+
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1862
|
+
}
|
|
1863
|
+
if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
|
|
1864
|
+
throw new ToolError(
|
|
1865
|
+
`Processing resulted in empty content for ${url}`,
|
|
1866
|
+
this.constructor.name
|
|
1867
|
+
);
|
|
1868
|
+
}
|
|
1869
|
+
logger.info(`\u2705 Successfully processed ${url}`);
|
|
1870
|
+
return finalContext.content;
|
|
1365
1871
|
} catch (error) {
|
|
1366
|
-
if (error instanceof ScraperError) {
|
|
1872
|
+
if (error instanceof ScraperError || error instanceof ToolError) {
|
|
1367
1873
|
throw new ToolError(
|
|
1368
1874
|
`Failed to fetch or process URL: ${error.message}`,
|
|
1369
1875
|
this.constructor.name
|
|
@@ -1373,6 +1879,8 @@ var FetchUrlTool = class {
|
|
|
1373
1879
|
`Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
|
|
1374
1880
|
this.constructor.name
|
|
1375
1881
|
);
|
|
1882
|
+
} finally {
|
|
1883
|
+
await playwrightMiddleware.closeBrowser();
|
|
1376
1884
|
}
|
|
1377
1885
|
}
|
|
1378
1886
|
};
|
|
@@ -1439,6 +1947,13 @@ var ListLibrariesTool = class {
|
|
|
1439
1947
|
}
|
|
1440
1948
|
};
|
|
1441
1949
|
|
|
1950
|
+
// src/utils/config.ts
|
|
1951
|
+
var DEFAULT_MAX_PAGES2 = 1e3;
|
|
1952
|
+
var DEFAULT_MAX_DEPTH2 = 3;
|
|
1953
|
+
var DEFAULT_MAX_CONCURRENCY = 3;
|
|
1954
|
+
var DEFAULT_PROTOCOL = "stdio";
|
|
1955
|
+
var DEFAULT_HTTP_PORT = 8e3;
|
|
1956
|
+
|
|
1442
1957
|
// src/tools/ScrapeTool.ts
|
|
1443
1958
|
import * as semver2 from "semver";
|
|
1444
1959
|
var ScrapeTool = class {
|
|
@@ -1493,10 +2008,12 @@ var ScrapeTool = class {
|
|
|
1493
2008
|
version: internalVersion,
|
|
1494
2009
|
scope: scraperOptions?.scope ?? "subpages",
|
|
1495
2010
|
followRedirects: scraperOptions?.followRedirects ?? true,
|
|
1496
|
-
maxPages: scraperOptions?.maxPages ??
|
|
1497
|
-
maxDepth: scraperOptions?.maxDepth ??
|
|
2011
|
+
maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES2,
|
|
2012
|
+
maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH2,
|
|
1498
2013
|
maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
|
|
1499
|
-
ignoreErrors: scraperOptions?.ignoreErrors ?? true
|
|
2014
|
+
ignoreErrors: scraperOptions?.ignoreErrors ?? true,
|
|
2015
|
+
scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
|
|
2016
|
+
// Pass scrapeMode enum
|
|
1500
2017
|
});
|
|
1501
2018
|
logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
|
|
1502
2019
|
options.onProgress?.({
|
|
@@ -1576,26 +2093,6 @@ var SearchTool = class {
|
|
|
1576
2093
|
logger.info(`\u2705 Found ${results.length} matching results`);
|
|
1577
2094
|
return { results };
|
|
1578
2095
|
} catch (error) {
|
|
1579
|
-
if (error instanceof LibraryNotFoundError) {
|
|
1580
|
-
logger.info(`\u2139\uFE0F Library not found: ${error.message}`);
|
|
1581
|
-
return {
|
|
1582
|
-
results: [],
|
|
1583
|
-
error: {
|
|
1584
|
-
message: error.message,
|
|
1585
|
-
suggestions: error.suggestions
|
|
1586
|
-
}
|
|
1587
|
-
};
|
|
1588
|
-
}
|
|
1589
|
-
if (error instanceof VersionNotFoundError) {
|
|
1590
|
-
logger.info(`\u2139\uFE0F Version not found: ${error.message}`);
|
|
1591
|
-
return {
|
|
1592
|
-
results: [],
|
|
1593
|
-
error: {
|
|
1594
|
-
message: error.message,
|
|
1595
|
-
availableVersions: error.availableVersions
|
|
1596
|
-
}
|
|
1597
|
-
};
|
|
1598
|
-
}
|
|
1599
2096
|
logger.error(
|
|
1600
2097
|
`\u274C Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
1601
2098
|
);
|
|
@@ -1784,7 +2281,6 @@ import Fuse from "fuse.js";
|
|
|
1784
2281
|
import semver3 from "semver";
|
|
1785
2282
|
|
|
1786
2283
|
// src/splitter/SemanticMarkdownSplitter.ts
|
|
1787
|
-
import { JSDOM as JSDOM2 } from "jsdom";
|
|
1788
2284
|
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
|
|
1789
2285
|
import remarkGfm from "remark-gfm";
|
|
1790
2286
|
import remarkHtml from "remark-html";
|
|
@@ -10601,7 +11097,7 @@ ${"```"}`;
|
|
|
10601
11097
|
* Parse HTML
|
|
10602
11098
|
*/
|
|
10603
11099
|
async parseHtml(html) {
|
|
10604
|
-
const { window } =
|
|
11100
|
+
const { window } = createJSDOM(html);
|
|
10605
11101
|
return window.document;
|
|
10606
11102
|
}
|
|
10607
11103
|
};
|
|
@@ -11570,26 +12066,29 @@ var DocumentManagementService = class {
|
|
|
11570
12066
|
};
|
|
11571
12067
|
|
|
11572
12068
|
export {
|
|
11573
|
-
DEFAULT_MAX_PAGES,
|
|
11574
|
-
DEFAULT_MAX_DEPTH,
|
|
11575
|
-
DEFAULT_MAX_CONCURRENCY,
|
|
11576
12069
|
setLogLevel,
|
|
11577
12070
|
logger,
|
|
11578
12071
|
HttpFetcher,
|
|
11579
12072
|
FileFetcher,
|
|
11580
|
-
|
|
12073
|
+
ScrapeMode,
|
|
11581
12074
|
PipelineJobStatus,
|
|
11582
12075
|
PipelineManager,
|
|
11583
12076
|
CancelJobTool,
|
|
11584
12077
|
VersionNotFoundError,
|
|
12078
|
+
LibraryNotFoundError,
|
|
11585
12079
|
FetchUrlTool,
|
|
11586
12080
|
FindVersionTool,
|
|
11587
12081
|
GetJobInfoTool,
|
|
11588
12082
|
ListJobsTool,
|
|
11589
12083
|
ListLibrariesTool,
|
|
11590
12084
|
RemoveTool,
|
|
12085
|
+
DEFAULT_MAX_PAGES2 as DEFAULT_MAX_PAGES,
|
|
12086
|
+
DEFAULT_MAX_DEPTH2 as DEFAULT_MAX_DEPTH,
|
|
12087
|
+
DEFAULT_MAX_CONCURRENCY,
|
|
12088
|
+
DEFAULT_PROTOCOL,
|
|
12089
|
+
DEFAULT_HTTP_PORT,
|
|
11591
12090
|
ScrapeTool,
|
|
11592
12091
|
SearchTool,
|
|
11593
12092
|
DocumentManagementService
|
|
11594
12093
|
};
|
|
11595
|
-
//# sourceMappingURL=chunk-
|
|
12094
|
+
//# sourceMappingURL=chunk-VF2RUEVV.js.map
|