@mdream/crawl 0.13.2 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,22 +16,19 @@ import { extractionPlugin } from "mdream/plugins";
16
16
  * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
17
17
  */
18
18
  function parseUrlPattern(input) {
19
- const hasGlob = input.includes("*") || input.includes("?") || input.includes("[");
20
- if (!hasGlob) return {
19
+ if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
21
20
  baseUrl: input,
22
21
  pattern: "",
23
22
  isGlob: false
24
23
  };
25
24
  try {
26
- const urlWithProtocol = input.startsWith("http") ? input : `https://${input}`;
27
- const urlWithoutGlob = urlWithProtocol.replace(/\*.*$/, "");
25
+ const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
28
26
  const url = new URL(urlWithoutGlob);
29
27
  const baseUrl = `${url.protocol}//${url.host}`;
30
28
  const patternStart = input.indexOf(url.host) + url.host.length;
31
- const pattern = input.substring(patternStart);
32
29
  return {
33
30
  baseUrl,
34
- pattern,
31
+ pattern: input.substring(patternStart),
35
32
  isGlob: true
36
33
  };
37
34
  } catch {
@@ -46,8 +43,7 @@ function matchesGlobPattern(url, parsedPattern) {
46
43
  try {
47
44
  const urlObj = new URL(url);
48
45
  const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
49
- const urlBase = `${urlObj.protocol}//${urlObj.host}`;
50
- if (urlBase !== parsedPattern.baseUrl) return false;
46
+ if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
51
47
  let pattern = parsedPattern.pattern;
52
48
  if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
53
49
  const base = pattern.slice(0, -1);
@@ -86,10 +82,7 @@ function isUrlExcluded(url, excludePatterns) {
86
82
  if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
87
83
  return url === pattern;
88
84
  }
89
- if (pattern.startsWith("/")) {
90
- const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
91
- return picomatch(adjustedPattern)(urlPath);
92
- }
85
+ if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
93
86
  return picomatch(pattern)(urlPath) || picomatch(pattern)(urlPath.substring(1));
94
87
  });
95
88
  } catch {
@@ -102,7 +95,7 @@ function isUrlExcluded(url, excludePatterns) {
102
95
  function validateGlobPattern(pattern) {
103
96
  try {
104
97
  parseUrlPattern(pattern);
105
- return void 0;
98
+ return;
106
99
  } catch (error) {
107
100
  return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
108
101
  }
@@ -116,35 +109,34 @@ function extractMetadata(html, url) {
116
109
  let description = "";
117
110
  let keywords = "";
118
111
  let author = "";
119
- const extractionPluginInstance = extractionPlugin({
120
- "a[href]": (element) => {
121
- const href = element.attributes?.href;
122
- if (href) try {
123
- const absoluteUrl = new URL(href, url).href;
124
- if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
125
- } catch {}
126
- },
127
- "title": (element) => {
128
- if (!title && element.textContent) title = element.textContent.trim();
129
- },
130
- "meta[name=\"description\"]": (element) => {
131
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
132
- },
133
- "meta[property=\"og:description\"]": (element) => {
134
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
135
- },
136
- "meta[name=\"keywords\"]": (element) => {
137
- if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
138
- },
139
- "meta[name=\"author\"]": (element) => {
140
- if (!author && element.attributes?.content) author = element.attributes.content.trim();
141
- },
142
- "meta[property=\"og:title\"]": (element) => {
143
- if (!title && element.attributes?.content) title = element.attributes.content.trim();
144
- }
145
- });
146
112
  htmlToMarkdown(html, {
147
- plugins: [extractionPluginInstance],
113
+ plugins: [extractionPlugin({
114
+ "a[href]": (element) => {
115
+ const href = element.attributes?.href;
116
+ if (href) try {
117
+ const absoluteUrl = new URL(href, url).href;
118
+ if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
119
+ } catch {}
120
+ },
121
+ "title": (element) => {
122
+ if (!title && element.textContent) title = element.textContent.trim();
123
+ },
124
+ "meta[name=\"description\"]": (element) => {
125
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
126
+ },
127
+ "meta[property=\"og:description\"]": (element) => {
128
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
129
+ },
130
+ "meta[name=\"keywords\"]": (element) => {
131
+ if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
132
+ },
133
+ "meta[name=\"author\"]": (element) => {
134
+ if (!author && element.attributes?.content) author = element.attributes.content.trim();
135
+ },
136
+ "meta[property=\"og:title\"]": (element) => {
137
+ if (!title && element.attributes?.content) title = element.attributes.content.trim();
138
+ }
139
+ })],
148
140
  origin: new URL(url).origin
149
141
  });
150
142
  return {
@@ -260,8 +252,7 @@ async function crawlAndGenerate(options, onProgress) {
260
252
  robotsResponse = null;
261
253
  }
262
254
  if (robotsResponse?.ok) {
263
- const robotsContent = await robotsResponse.text();
264
- const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
255
+ const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
265
256
  if (sitemapMatches && sitemapMatches.length > 0) {
266
257
  progress.sitemap.found = sitemapMatches.length;
267
258
  progress.sitemap.status = "processing";
@@ -273,8 +264,7 @@ async function crawlAndGenerate(options, onProgress) {
273
264
  url: sitemapUrl,
274
265
  success: true
275
266
  });
276
- const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
277
- if (hasGlobPatterns) {
267
+ if (patterns.some((p$1) => p$1.isGlob)) {
278
268
  const filteredUrls = robotsUrls.filter((url) => {
279
269
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
280
270
  });
@@ -310,8 +300,7 @@ async function crawlAndGenerate(options, onProgress) {
310
300
  url: mainSitemapUrl,
311
301
  success: true
312
302
  });
313
- const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
314
- if (hasGlobPatterns) {
303
+ if (patterns.some((p$1) => p$1.isGlob)) {
315
304
  const filteredUrls = sitemapUrls.filter((url) => {
316
305
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
317
306
  });
@@ -350,8 +339,7 @@ async function crawlAndGenerate(options, onProgress) {
350
339
  url: sitemapUrl,
351
340
  success: true
352
341
  });
353
- const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
354
- if (hasGlobPatterns) {
342
+ if (patterns.some((p$1) => p$1.isGlob)) {
355
343
  const filteredUrls = altUrls.filter((url) => {
356
344
  return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
357
345
  });
@@ -405,7 +393,7 @@ async function crawlAndGenerate(options, onProgress) {
405
393
  }
406
394
  if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
407
395
  const results = [];
408
- const processedUrls = new Set();
396
+ const processedUrls = /* @__PURE__ */ new Set();
409
397
  const shouldCrawlUrl = (url) => {
410
398
  if (isUrlExcluded(url, exclude)) return false;
411
399
  if (!patterns.some((p$1) => p$1.isGlob)) return true;
@@ -432,36 +420,25 @@ async function crawlAndGenerate(options, onProgress) {
432
420
  if (!title) title = metadata.title;
433
421
  const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
434
422
  const pageOrigin = origin || new URL(request.loadedUrl).origin;
435
- if (onPage && shouldProcessMarkdown) {
436
- const pageData = {
437
- url: request.loadedUrl,
438
- html,
439
- title,
440
- metadata,
441
- origin: pageOrigin
442
- };
443
- await onPage(pageData);
444
- }
423
+ if (onPage && shouldProcessMarkdown) await onPage({
424
+ url: request.loadedUrl,
425
+ html,
426
+ title,
427
+ metadata,
428
+ origin: pageOrigin
429
+ });
445
430
  let md = "";
446
431
  if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
447
432
  let filePath;
448
- if (shouldProcessMarkdown) {
433
+ if (shouldProcessMarkdown && generateIndividualMd) {
449
434
  const urlObj = new URL(request.loadedUrl);
450
- const urlPath = urlObj.pathname === "/" ? "/index" : urlObj.pathname;
451
- const pathSegments = urlPath.replace(/\/$/, "").split("/").filter((seg) => seg.length > 0);
452
- const safeSegments = pathSegments.map((seg) => seg.replace(/[^\w\-]/g, "-"));
453
- const filename = safeSegments.length > 0 ? safeSegments.join("/") : "index";
454
- const safeFilename = normalize(`${filename}.md`);
455
- filePath = join(outputDir, safeFilename);
456
- if (generateIndividualMd) {
457
- const fileDir = dirname(filePath);
458
- if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
459
- await writeFile(filePath, md, "utf-8");
460
- }
435
+ const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
436
+ filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
437
+ const fileDir = dirname(filePath);
438
+ if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
439
+ await writeFile(filePath, md, "utf-8");
461
440
  }
462
- const normalizedUrl = request.loadedUrl.replace(/\/$/, "");
463
- const normalizedHomePageUrl = homePageUrl.replace(/\/$/, "");
464
- const isHomePage = normalizedUrl === normalizedHomePageUrl;
441
+ const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
465
442
  if (shouldProcessMarkdown || isHomePage) {
466
443
  const result = {
467
444
  url: request.loadedUrl,
@@ -578,25 +555,21 @@ async function crawlAndGenerate(options, onProgress) {
578
555
  onProgress?.(progress);
579
556
  const contentResults = successfulResults.filter((result) => {
580
557
  if (!result.content) return false;
581
- const trimmedContent = result.content.trim();
582
- const contentWithoutFrontmatter = trimmedContent.replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim();
583
- return contentWithoutFrontmatter.length > 10;
584
- });
585
- const seenUrls = new Set();
586
- const deduplicatedResults = contentResults.filter((result) => {
587
- if (seenUrls.has(result.url)) return false;
588
- seenUrls.add(result.url);
589
- return true;
558
+ return result.content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim().length > 10;
590
559
  });
591
- const processedFiles = deduplicatedResults.map((result) => ({
592
- filePath: result.filePath,
593
- title: result.title,
594
- content: result.content,
595
- url: result.url,
596
- metadata: result.metadata
597
- }));
560
+ const seenUrls = /* @__PURE__ */ new Set();
598
561
  const llmsResult = await generateLlmsTxtArtifacts({
599
- files: processedFiles,
562
+ files: contentResults.filter((result) => {
563
+ if (seenUrls.has(result.url)) return false;
564
+ seenUrls.add(result.url);
565
+ return true;
566
+ }).map((result) => ({
567
+ filePath: result.filePath,
568
+ title: result.title,
569
+ content: result.content,
570
+ url: result.url,
571
+ metadata: result.metadata
572
+ })),
600
573
  siteName,
601
574
  description,
602
575
  origin: origin$1 || firstUrl.origin,
@@ -622,4 +595,4 @@ async function crawlAndGenerate(options, onProgress) {
622
595
  }
623
596
 
624
597
  //#endregion
625
- export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
598
+ export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
package/dist/cli.mjs CHANGED
@@ -1,6 +1,5 @@
1
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-DEZX9kH_.mjs";
1
+ import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
2
2
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
3
- import * as p$1 from "@clack/prompts";
4
3
  import * as p from "@clack/prompts";
5
4
  import { PlaywrightCrawler } from "crawlee";
6
5
  import { dirname, join, resolve } from "pathe";
@@ -18,12 +17,12 @@ async function checkPlaywrightInstallation() {
18
17
  }
19
18
  }
20
19
  async function promptPlaywrightInstall() {
21
- const shouldInstall = await p$1.confirm({
20
+ const shouldInstall = await p.confirm({
22
21
  message: "Playwright is required for the Playwright driver. Install it now?",
23
22
  initialValue: true
24
23
  });
25
- if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
26
- const s = p$1.spinner();
24
+ if (p.isCancel(shouldInstall) || !shouldInstall) return false;
25
+ const s = p.spinner();
27
26
  s.start("Installing Playwright globally...");
28
27
  try {
29
28
  await addDependency("playwright", { global: true });
@@ -31,17 +30,15 @@ async function promptPlaywrightInstall() {
31
30
  return true;
32
31
  } catch (fallbackError) {
33
32
  s.stop("Failed to install Playwright");
34
- p$1.log.error(`Installation failed: ${fallbackError}`);
33
+ p.log.error(`Installation failed: ${fallbackError}`);
35
34
  return false;
36
35
  }
37
36
  }
38
37
  async function ensurePlaywrightInstalled() {
39
- const isInstalled = await checkPlaywrightInstallation();
40
- if (isInstalled) return true;
41
- p$1.log.warn("Playwright driver selected but Playwright is not installed.");
42
- const installed = await promptPlaywrightInstall();
43
- if (!installed) {
44
- p$1.log.error("Cannot proceed with Playwright driver without Playwright installed.");
38
+ if (await checkPlaywrightInstallation()) return true;
39
+ p.log.warn("Playwright driver selected but Playwright is not installed.");
40
+ if (!await promptPlaywrightInstall()) {
41
+ p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
45
42
  return false;
46
43
  }
47
44
  return true;
@@ -67,10 +64,8 @@ async function isUseChromeSupported() {
67
64
 
68
65
  //#endregion
69
66
  //#region src/cli.ts
70
- const __dirname = dirname(fileURLToPath(import.meta.url));
71
- const packageJsonPath = join(__dirname, "..", "package.json");
72
- const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
73
- const version = packageJson.version;
67
+ const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
68
+ const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
74
69
  function checkOutputDirectoryPermissions(outputDir) {
75
70
  try {
76
71
  mkdirSync(outputDir, { recursive: true });
@@ -116,8 +111,7 @@ async function interactiveCrawl() {
116
111
  const globError = validateGlobPattern(url);
117
112
  if (globError) return globError;
118
113
  try {
119
- const parsed = parseUrlPattern(url);
120
- if (!parsed.isGlob) try {
114
+ if (!parseUrlPattern(url).isGlob) try {
121
115
  new URL(withHttps(url));
122
116
  } catch {
123
117
  return `Invalid URL: ${withHttps(url)}`;
@@ -194,7 +188,7 @@ async function interactiveCrawl() {
194
188
  const url = new URL(withHttps(firstUrl));
195
189
  return `${url.protocol}//${url.host}`;
196
190
  } catch {
197
- return void 0;
191
+ return;
198
192
  }
199
193
  })();
200
194
  const outputFormats = advancedOptions.outputFormats.map((f) => {
@@ -216,7 +210,7 @@ async function interactiveCrawl() {
216
210
  inferredOrigin && `Origin: ${inferredOrigin}`
217
211
  ].filter(Boolean);
218
212
  p.note(summary.join("\n"), "Crawl Configuration");
219
- if (advancedOptions.skipSitemap && globPatterns.some((p$2) => p$2.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
213
+ if (advancedOptions.skipSitemap && globPatterns.some((p$1) => p$1.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
220
214
  return {
221
215
  urls,
222
216
  outputDir: resolve(outputDir),
@@ -381,7 +375,7 @@ Examples:
381
375
  const urlObj = new URL(withHttps(url));
382
376
  return `${urlObj.protocol}//${urlObj.host}`;
383
377
  } catch {
384
- return void 0;
378
+ return;
385
379
  }
386
380
  })();
387
381
  const siteNameOverride = getArgValue("--site-name");
@@ -439,19 +433,15 @@ async function main() {
439
433
  if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
440
434
  process.exit(1);
441
435
  }
442
- if (options.driver === "playwright") {
443
- const chromeSupported = await isUseChromeSupported();
444
- if (chromeSupported) {
445
- options.useChrome = true;
446
- p.log.info("System Chrome detected and enabled.");
447
- } else {
448
- const playwrightInstalled = await ensurePlaywrightInstalled();
449
- if (!playwrightInstalled) {
450
- p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
451
- process.exit(1);
452
- }
453
- p.log.info("Using global playwright instance.");
436
+ if (options.driver === "playwright") if (await isUseChromeSupported()) {
437
+ options.useChrome = true;
438
+ p.log.info("System Chrome detected and enabled.");
439
+ } else {
440
+ if (!await ensurePlaywrightInstalled()) {
441
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
442
+ process.exit(1);
454
443
  }
444
+ p.log.info("Using global playwright instance.");
455
445
  }
456
446
  const s = p.spinner();
457
447
  s.start("Starting crawl...");
@@ -475,9 +465,7 @@ async function main() {
475
465
  }
476
466
  });
477
467
  s.stop();
478
- const endTime = Date.now();
479
- const durationMs = endTime - startTime;
480
- const durationSeconds = durationMs / 1e3;
468
+ const durationSeconds = (Date.now() - startTime) / 1e3;
481
469
  const successful = results.filter((r) => r.success).length;
482
470
  const failed = results.filter((r) => !r.success).length;
483
471
  const failedResults = results.filter((r) => !r.success);
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate } from "./_chunks/crawl-DEZX9kH_.mjs";
1
+ import { t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
4
 
@@ -19,8 +19,7 @@ async function generateLlmsTxt(options) {
19
19
  if (result.filePath) {
20
20
  const mdSeparator = `${sep}md${sep}`;
21
21
  const mdIndex = result.filePath.indexOf(mdSeparator);
22
- const relativePath = mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath);
23
- const linkPath = relativePath.split(sep).join("/");
22
+ const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
24
23
  content += `- [${title}](md/${linkPath}): ${result.url}\n`;
25
24
  } else {
26
25
  const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.13.2",
4
+ "version": "0.14.0",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -51,12 +51,12 @@
51
51
  },
52
52
  "dependencies": {
53
53
  "@clack/prompts": "^0.11.0",
54
- "crawlee": "^3.15.1",
54
+ "crawlee": "^3.15.3",
55
55
  "nypm": "^0.6.2",
56
56
  "pathe": "^2.0.3",
57
57
  "picomatch": "^4.0.3",
58
58
  "ufo": "^1.6.1",
59
- "mdream": "0.13.2"
59
+ "mdream": "0.14.0"
60
60
  },
61
61
  "devDependencies": {
62
62
  "@types/picomatch": "^4.0.2"