npm - @mdream/crawl - Version diff - 0.8.3 → 0.8.5 - Mend

@mdream/crawl 0.8.3 → 0.8.5

This diff shows the differences between the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (5)

package/dist/_chunks/{crawl-BwURA9nQ.mjs → crawl-BtuYX2_u.mjs} +15 -23
package/dist/cli.mjs +40 -22
package/dist/index.d.mts +1 -0
package/dist/index.mjs +1 -1
package/package.json +3 -2

package/dist/_chunks/{crawl-BwURA9nQ.mjs → crawl-BtuYX2_u.mjs} RENAMED Viewed

@@ -5,24 +5,10 @@ import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawl
 import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
 import { withMinimalPreset } from "mdream/preset/minimal";
 import { dirname, join, normalize, resolve } from "pathe";
+import { withHttps } from "ufo";
 import picomatch from "picomatch";
 import { extractionPlugin } from "mdream/plugins";
-//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
-const r = String.fromCharCode;
-const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
-function withHttps(input) {
-	return withProtocol(input, "https://");
-}
-function withProtocol(input, protocol) {
-	let match = input.match(PROTOCOL_REGEX);
-	if (!match) match = input.match(/^\/{2,}/);
-	if (!match) return protocol + input;
-	return protocol + input.slice(match[0].length);
-}
-const protocolRelative = Symbol.for("ufo:protocolRelative");
-//#endregion
 //#region src/glob-utils.ts
 /**
 * Parse a URL that may contain glob patterns
@@ -194,7 +180,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
 	return urls;
 }
 async function crawlAndGenerate(options, onProgress) {
-	const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
+	const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
 	const outputDir = resolve(normalize(rawOutputDir));
 	if (verbose) log.setLevel(log.LEVELS.INFO);
 	else log.setLevel(log.LEVELS.OFF);
@@ -449,8 +435,14 @@ async function crawlAndGenerate(options, onProgress) {
 		respectRobotsTxtFile: true
 	};
 	if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
-	if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
-	else crawler = new HttpCrawler(crawlerOptions);
+	if (driver === "playwright") {
+		const playwrightOptions = crawlerOptions;
+		if (useChrome) playwrightOptions.launchContext = {
+			...playwrightOptions.launchContext,
+			useChrome
+		};
+		crawler = new PlaywrightCrawler(playwrightOptions);
+	} else crawler = new HttpCrawler(crawlerOptions);
 	const initialRequests = startingUrls.map((url) => ({
 		url,
 		userData: { depth: 0 }
@@ -461,14 +453,14 @@ async function crawlAndGenerate(options, onProgress) {
 	await crawler.run(initialRequests);
 	progress.crawling.status = "completed";
 	onProgress?.(progress);
-	if (results.some((r$1) => r$1.success)) {
+	if (results.some((r) => r.success)) {
 		progress.generation.status = "generating";
 		onProgress?.(progress);
-		const successfulResults = results.filter((r$1) => r$1.success);
+		const successfulResults = results.filter((r) => r.success);
 		const firstUrl = new URL(withHttps(urls[0]));
 		const origin$1 = firstUrl.origin;
-		const homePageResult = successfulResults.find((r$1) => {
-			const resultUrl = new URL(withHttps(r$1.url));
+		const homePageResult = successfulResults.find((r) => {
+			const resultUrl = new URL(withHttps(r.url));
 			return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
 		});
 		const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
@@ -522,4 +514,4 @@ async function crawlAndGenerate(options, onProgress) {
 }
 //#endregion
-export { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps };
+export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };

package/dist/cli.mjs CHANGED Viewed

@@ -1,8 +1,10 @@
-import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-BwURA9nQ.mjs";
+import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
 import { readFileSync } from "node:fs";
 import * as p$1 from "@clack/prompts";
 import * as p from "@clack/prompts";
+import { PlaywrightCrawler } from "crawlee";
 import { dirname, join, resolve } from "pathe";
+import { withHttps } from "ufo";
 import { fileURLToPath } from "node:url";
 import { addDependency } from "nypm";
@@ -22,21 +24,15 @@ async function promptPlaywrightInstall() {
 	});
 	if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
 	const s = p$1.spinner();
-	s.start("Installing Playwright...");
+	s.start("Installing Playwright globally...");
 	try {
-		await addDependency("playwright", { workspace: true });
+		await addDependency("playwright", { global: true });
 		s.stop("Playwright installed successfully!");
 		return true;
-	} catch {
-		try {
-			await addDependency("playwright");
-			s.stop("Playwright installed successfully!");
-			return true;
-		} catch (fallbackError) {
-			s.stop("Failed to install Playwright");
-			p$1.log.error(`Installation failed: ${fallbackError}`);
-			return false;
-		}
+	} catch (fallbackError) {
+		s.stop("Failed to install Playwright");
+		p$1.log.error(`Installation failed: ${fallbackError}`);
+		return false;
 	}
 }
 async function ensurePlaywrightInstalled() {
@@ -50,6 +46,24 @@ async function ensurePlaywrightInstalled() {
 	}
 	return true;
 }
+async function isUseChromeSupported() {
+	try {
+		const crawler = new PlaywrightCrawler({
+			launchContext: { useChrome: true },
+			requestHandler: async () => {},
+			maxRequestsPerCrawl: 1
+		});
+		const page = await crawler.browserPool.newPage();
+		await page.evaluate(() => {
+			return window.navigator.userAgent;
+		});
+		await page.close();
+		await crawler.browserPool.closeAllBrowsers();
+		crawler.stop();
+		return true;
+	} catch {}
+	return false;
+}
 //#endregion
 //#region src/cli.ts
@@ -400,10 +414,17 @@ async function main() {
 	} else options = await interactiveCrawl();
 	if (!options) process.exit(0);
 	if (options.driver === "playwright") {
-		const playwrightInstalled = await ensurePlaywrightInstalled();
-		if (!playwrightInstalled) {
-			p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
-			process.exit(1);
+		const chromeSupported = await isUseChromeSupported();
+		if (chromeSupported) {
+			options.useChrome = true;
+			p.log.info("System Chrome detected and enabled.");
+		} else {
+			const playwrightInstalled = await ensurePlaywrightInstalled();
+			if (!playwrightInstalled) {
+				p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
+				process.exit(1);
+			}
+			p.log.info("Using global playwright instance.");
 		}
 	}
 	const s = p.spinner();
@@ -451,11 +472,8 @@ async function main() {
 		if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
 		if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
 	}
-	if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
-	else {
-		await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
-		if (successful === 0) process.exit(1);
-	}
+	await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
+	process.exit(0);
 }
 main().catch((error) => {
 	p.log.error(`Unexpected error: ${error}`);

package/dist/index.d.mts CHANGED Viewed

@@ -9,6 +9,7 @@ interface CrawlOptions {
   origin?: string;
   chunkSize?: number;
   driver?: 'http' | 'playwright';
+  useChrome?: boolean;
   followLinks?: boolean;
   maxDepth?: number;
   globPatterns?: ParsedUrlPattern[];

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { crawlAndGenerate } from "./_chunks/crawl-BwURA9nQ.mjs";
+import { crawlAndGenerate } from "./_chunks/crawl-BtuYX2_u.mjs";
 import { writeFile } from "node:fs/promises";
 import { basename, sep } from "pathe";

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "0.8.3",
+  "version": "0.8.5",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -50,7 +50,8 @@
     "nypm": "^0.6.0",
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
-    "mdream": "0.8.3"
+    "ufo": "^1.6.1",
+    "mdream": "0.8.5"
   },
   "devDependencies": {
     "@types/picomatch": "^4.0.1"