npm - @mdream/crawl - Versions diffs - 0.16.0 → 0.17.1 - Mend

@mdream/crawl 0.16.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/_chunks/crawl.mjs CHANGED Viewed

@@ -9,6 +9,13 @@ import { dirname, join, normalize, resolve } from "pathe";
 import { withHttps } from "ufo";
 import picomatch from "picomatch";
 import { extractionPlugin } from "mdream/plugins";
+//#region src/glob-utils.ts
+const GLOB_STRIP_TAIL_RE = /\*.*$/;
+const GLOB_CHAR_RE = /[*?[]/;
+/**
+* Parse a URL that may contain glob patterns
+* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
+*/
 function parseUrlPattern(input) {
 	if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
 		baseUrl: input,
@@ -16,7 +23,7 @@ function parseUrlPattern(input) {
 		isGlob: false
 	};
 	try {
-		const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(/\*.*$/, "");
+		const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
 		const url = new URL(urlWithoutGlob);
 		const baseUrl = `${url.protocol}//${url.host}`;
 		const patternStart = input.indexOf(url.host) + url.host.length;
@@ -29,6 +36,9 @@ function parseUrlPattern(input) {
 		throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
 	}
 }
+/**
+* Check if a URL matches a glob pattern
+*/
 function matchesGlobPattern(url, parsedPattern) {
 	if (!parsedPattern.isGlob) return true;
 	try {
@@ -45,16 +55,23 @@ function matchesGlobPattern(url, parsedPattern) {
 		return false;
 	}
 }
+/**
+* Get the starting URL for crawling from a glob pattern
+* For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com/docs/ (the path up to the last "/" before the first glob character)
+*/
 function getStartingUrl(parsedPattern) {
 	if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
 	const pattern = parsedPattern.pattern;
-	const firstGlobIndex = pattern.search(/[*?[]/);
+	const firstGlobIndex = pattern.search(GLOB_CHAR_RE);
 	if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
 	const beforeGlob = pattern.substring(0, firstGlobIndex);
 	const lastSlash = beforeGlob.lastIndexOf("/");
 	const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
 	return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
 }
+/**
+* Check if a URL should be excluded based on exclude patterns
+*/
 function isUrlExcluded(url, excludePatterns) {
 	if (!excludePatterns || excludePatterns.length === 0) return false;
 	try {
@@ -73,6 +90,15 @@ function isUrlExcluded(url, excludePatterns) {
 		return false;
 	}
 }
+/**
+* Check if a string looks like sitemap XML content, i.e. it contains a "<urlset" or "<sitemapindex" tag (not an HTML page or other non-sitemap response)
+*/
+function isValidSitemapXml(content) {
+	return content.includes("<urlset") || content.includes("<sitemapindex");
+}
+/**
+* Validate glob pattern syntax
+*/
 function validateGlobPattern(pattern) {
 	try {
 		parseUrlPattern(pattern);
@@ -81,6 +107,8 @@ function validateGlobPattern(pattern) {
 		return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
 	}
 }
+//#endregion
+//#region src/metadata-extractor.ts
 function extractMetadata(html, url) {
 	const links = [];
 	let title = "";
@@ -133,6 +161,15 @@ function extractMetadata(html, url) {
 		})
 	};
 }
+//#endregion
+//#region src/crawl.ts
+const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
+const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
+const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
+const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
+const URL_TRAILING_SLASH_RE = /\/$/;
+const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
+const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
 async function loadSitemapWithoutRetries(sitemapUrl) {
 	const controller = new AbortController();
 	const timeoutId = setTimeout(() => controller.abort(), 1e4);
@@ -144,12 +181,13 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
 		clearTimeout(timeoutId);
 		if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
 		const xmlContent = await response.text();
+		if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
 		if (xmlContent.includes("<sitemapindex")) {
-			const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
+			SITEMAP_INDEX_LOC_RE.lastIndex = 0;
 			const childSitemaps = [];
 			let match;
 			while (true) {
-				match = sitemapIndexRegex.exec(xmlContent);
+				match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
 				if (match === null) break;
 				let url = match[1];
 				if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -165,10 +203,10 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
 			return allUrls;
 		} else {
 			const urls = [];
-			const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
+			SITEMAP_URL_LOC_RE.lastIndex = 0;
 			let match;
 			while (true) {
-				match = urlRegex.exec(xmlContent);
+				match = SITEMAP_URL_LOC_RE.exec(xmlContent);
 				if (match === null) break;
 				let url = match[1];
 				if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
@@ -227,12 +265,12 @@ async function crawlAndGenerate(options, onProgress) {
 			robotsResponse = null;
 		}
 		if (robotsResponse?.ok) {
-			const sitemapMatches = (await robotsResponse.text()).match(/Sitemap:\s*(.*)/gi);
+			const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
 			if (sitemapMatches && sitemapMatches.length > 0) {
 				progress.sitemap.found = sitemapMatches.length;
 				progress.sitemap.status = "processing";
 				onProgress?.(progress);
-				const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
+				const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
 				for (const sitemapUrl of robotsSitemaps) try {
 					const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
 					sitemapAttempts.push({
@@ -403,17 +441,17 @@ async function crawlAndGenerate(options, onProgress) {
 				origin: pageOrigin
 			});
 			let md = "";
-			if (shouldProcessMarkdown && (!onPage || generateIndividualMd)) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
+			if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
 			let filePath;
 			if (shouldProcessMarkdown && generateIndividualMd) {
 				const urlObj = new URL(request.loadedUrl);
-				const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(/\/$/, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(/[^\w\-]/g, "-"));
+				const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
 				filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
 				const fileDir = dirname(filePath);
 				if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
 				await writeFile(filePath, md, "utf-8");
 			}
-			const isHomePage = request.loadedUrl.replace(/\/$/, "") === homePageUrl.replace(/\/$/, "");
+			const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
 			if (shouldProcessMarkdown || isHomePage) {
 				const result = {
 					url: request.loadedUrl,
@@ -530,7 +568,7 @@ async function crawlAndGenerate(options, onProgress) {
 			onProgress?.(progress);
 			const contentResults = successfulResults.filter((result) => {
 				if (!result.content) return false;
-				return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
+				return result.content.trim().replace(FRONTMATTER_BLOCK_RE, "").trim().length > 10;
 			});
 			const seenUrls = /* @__PURE__ */ new Set();
 			const llmsResult = await generateLlmsTxtArtifacts({
@@ -568,4 +606,5 @@ async function crawlAndGenerate(options, onProgress) {
 	await purgeDefaultStorages();
 	return results;
 }
+//#endregion
 export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };

package/dist/cli.mjs CHANGED Viewed

@@ -6,6 +6,7 @@ import { dirname, join, resolve } from "pathe";
 import { withHttps } from "ufo";
 import { fileURLToPath } from "node:url";
 import { addDependency } from "nypm";
+//#region src/playwright-utils.ts
 async function checkPlaywrightInstallation() {
 	try {
 		await import("playwright");
@@ -59,6 +60,8 @@ async function isUseChromeSupported() {
 	} catch {}
 	return false;
 }
+//#endregion
+//#region src/cli.ts
 const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
 const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
 function checkOutputDirectoryPermissions(outputDir) {
@@ -488,4 +491,5 @@ main().catch((error) => {
 	p.log.error(`Unexpected error: ${error}`);
 	process.exit(1);
 });
+//#endregion
 export {};

package/dist/index.mjs CHANGED Viewed

@@ -1,6 +1,8 @@
 import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
 import { writeFile } from "node:fs/promises";
 import { basename, sep } from "pathe";
+//#region src/llms-txt.ts
+const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
 async function generateLlmsTxt(options) {
 	const { siteName, description, results, outputPath } = options;
 	let content = `# ${siteName}\n\n`;
@@ -40,7 +42,7 @@ async function generateLlmsFullTxt(options) {
 			} catch {
 				title = result.title || result.url;
 			}
-			const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
+			const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
 			content += `- [${title}](#${anchor})\n`;
 		}
 		content += `\n---\n\n`;
@@ -58,4 +60,5 @@ async function generateLlmsFullTxt(options) {
 	}
 	await writeFile(outputPath, content, "utf-8");
 }
+//#endregion
 export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "0.16.0",
+  "version": "0.17.1",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -50,13 +50,13 @@
     }
   },
   "dependencies": {
-    "@clack/prompts": "^1.0.1",
+    "@clack/prompts": "^1.1.0",
     "crawlee": "^3.16.0",
     "nypm": "^0.6.5",
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
     "ufo": "^1.6.3",
-    "mdream": "0.16.0"
+    "mdream": "0.17.1"
   },
   "devDependencies": {
     "@types/picomatch": "^4.0.2"