npm - @mdream/crawl - Versions diffs - 0.15.2 → 0.16.0 - Mend

@mdream/crawl 0.15.2 → 0.16.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (4)

package/dist/_chunks/{crawl-BInMcRnS.mjs → crawl.mjs} +12 -39
package/dist/cli.mjs +5 -11
package/dist/index.mjs +4 -8
package/package.json +6 -6

package/dist/_chunks/{crawl-BInMcRnS.mjs → crawl.mjs} RENAMED Viewed

@@ -9,12 +9,6 @@ import { dirname, join, normalize, resolve } from "pathe";
 import { withHttps } from "ufo";
 import picomatch from "picomatch";
 import { extractionPlugin } from "mdream/plugins";
-//#region src/glob-utils.ts
-/**
-* Parse a URL that may contain glob patterns
-* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
-*/
 function parseUrlPattern(input) {
 	if (!(input.includes("*") || input.includes("?") || input.includes("["))) return {
 		baseUrl: input,
@@ -35,9 +29,6 @@ function parseUrlPattern(input) {
 		throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
 	}
 }
-/**
-* Check if a URL matches a glob pattern
-*/
 function matchesGlobPattern(url, parsedPattern) {
 	if (!parsedPattern.isGlob) return true;
 	try {
@@ -54,10 +45,6 @@ function matchesGlobPattern(url, parsedPattern) {
 		return false;
 	}
 }
-/**
-* Get the starting URL for crawling from a glob pattern
-* For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
-*/
 function getStartingUrl(parsedPattern) {
 	if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
 	const pattern = parsedPattern.pattern;
@@ -68,9 +55,6 @@ function getStartingUrl(parsedPattern) {
 	const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
 	return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
 }
-/**
-* Check if a URL should be excluded based on exclude patterns
-*/
 function isUrlExcluded(url, excludePatterns) {
 	if (!excludePatterns || excludePatterns.length === 0) return false;
 	try {
@@ -89,9 +73,6 @@ function isUrlExcluded(url, excludePatterns) {
 		return false;
 	}
 }
-/**
-* Validate glob pattern syntax
-*/
 function validateGlobPattern(pattern) {
 	try {
 		parseUrlPattern(pattern);
@@ -100,9 +81,6 @@ function validateGlobPattern(pattern) {
 		return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
 	}
 }
-//#endregion
-//#region src/metadata-extractor.ts
 function extractMetadata(html, url) {
 	const links = [];
 	let title = "";
@@ -155,9 +133,6 @@ function extractMetadata(html, url) {
 		})
 	};
 }
-//#endregion
-//#region src/crawl.ts
 async function loadSitemapWithoutRetries(sitemapUrl) {
 	const controller = new AbortController();
 	const timeoutId = setTimeout(() => controller.abort(), 1e4);
@@ -247,7 +222,7 @@ async function crawlAndGenerate(options, onProgress) {
 				headers: { "User-Agent": "mdream-crawler/1.0" }
 			});
 			clearTimeout(robotsTimeoutId);
-		} catch (error) {
+		} catch {
 			clearTimeout(robotsTimeoutId);
 			robotsResponse = null;
 		}
@@ -264,7 +239,7 @@ async function crawlAndGenerate(options, onProgress) {
 						url: sitemapUrl,
 						success: true
 					});
-					if (patterns.some((p$1) => p$1.isGlob)) {
+					if (patterns.some((p) => p.isGlob)) {
 						const filteredUrls = robotsUrls.filter((url) => {
 							return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
 						});
@@ -300,7 +275,7 @@ async function crawlAndGenerate(options, onProgress) {
 				url: mainSitemapUrl,
 				success: true
 			});
-			if (patterns.some((p$1) => p$1.isGlob)) {
+			if (patterns.some((p) => p.isGlob)) {
 				const filteredUrls = sitemapUrls.filter((url) => {
 					return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
 				});
@@ -339,7 +314,7 @@ async function crawlAndGenerate(options, onProgress) {
 						url: sitemapUrl,
 						success: true
 					});
-					if (patterns.some((p$1) => p$1.isGlob)) {
+					if (patterns.some((p) => p.isGlob)) {
 						const filteredUrls = altUrls.filter((url) => {
 							return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
 						});
@@ -360,11 +335,11 @@ async function crawlAndGenerate(options, onProgress) {
 							break;
 						}
 					}
-				} catch (error$1) {
+				} catch (error) {
 					sitemapAttempts.push({
 						url: sitemapUrl,
 						success: false,
-						error: error$1 instanceof Error ? error$1.message : "Unknown error"
+						error: error instanceof Error ? error.message : "Unknown error"
 					});
 				}
 			}
@@ -396,7 +371,7 @@ async function crawlAndGenerate(options, onProgress) {
 	const processedUrls = /* @__PURE__ */ new Set();
 	const shouldCrawlUrl = (url) => {
 		if (isUrlExcluded(url, exclude)) return false;
-		if (!patterns.some((p$1) => p$1.isGlob)) return true;
+		if (!patterns.some((p) => p.isGlob)) return true;
 		return patterns.some((pattern) => matchesGlobPattern(url, pattern));
 	};
 	const createRequestHandler = (crawlerType) => {
@@ -543,10 +518,10 @@ async function crawlAndGenerate(options, onProgress) {
 		onProgress?.(progress);
 		const successfulResults = results.filter((r) => r.success);
 		const firstUrl = new URL(withHttps(urls[0]));
-		const origin$1 = firstUrl.origin;
+		const origin = firstUrl.origin;
 		const homePageResult = successfulResults.find((r) => {
 			const resultUrl = new URL(withHttps(r.url));
-			return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
+			return resultUrl.href === origin || resultUrl.href === `${origin}/`;
 		});
 		const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
 		const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
@@ -555,7 +530,7 @@ async function crawlAndGenerate(options, onProgress) {
 			onProgress?.(progress);
 			const contentResults = successfulResults.filter((result) => {
 				if (!result.content) return false;
-				return result.content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim().length > 10;
+				return result.content.trim().replace(/^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/, "").trim().length > 10;
 			});
 			const seenUrls = /* @__PURE__ */ new Set();
 			const llmsResult = await generateLlmsTxtArtifacts({
@@ -572,7 +547,7 @@ async function crawlAndGenerate(options, onProgress) {
 				})),
 				siteName,
 				description,
-				origin: origin$1 || firstUrl.origin,
+				origin: origin || firstUrl.origin,
 				generateFull: generateLlmsFullTxt,
 				outputDir
 			});
@@ -593,6 +568,4 @@ async function crawlAndGenerate(options, onProgress) {
 	await purgeDefaultStorages();
 	return results;
 }
-//#endregion
-export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };
+export { parseUrlPattern as n, validateGlobPattern as r, crawlAndGenerate as t };

package/dist/cli.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
+import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
 import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
 import * as p from "@clack/prompts";
 import { PlaywrightCrawler } from "crawlee";
@@ -6,8 +6,6 @@ import { dirname, join, resolve } from "pathe";
 import { withHttps } from "ufo";
 import { fileURLToPath } from "node:url";
 import { addDependency } from "nypm";
-//#region src/playwright-utils.ts
 async function checkPlaywrightInstallation() {
 	try {
 		await import("playwright");
@@ -61,9 +59,6 @@ async function isUseChromeSupported() {
 	} catch {}
 	return false;
 }
-//#endregion
-//#region src/cli.ts
 const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
 const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
 function checkOutputDirectoryPermissions(outputDir) {
@@ -106,8 +101,8 @@ async function interactiveCrawl() {
 		placeholder: "e.g. docs.example.com, site.com/docs/**",
 		validate: (value) => {
 			if (!value) return "Please enter at least one URL";
-			const urls$1 = value.split(",").map((url) => url.trim());
-			for (const url of urls$1) {
+			const urls = value.split(",").map((url) => url.trim());
+			for (const url of urls) {
 				const globError = validateGlobPattern(url);
 				if (globError) return globError;
 				try {
@@ -210,7 +205,7 @@ async function interactiveCrawl() {
 		inferredOrigin && `Origin: ${inferredOrigin}`
 	].filter(Boolean);
 	p.note(summary.join("\n"), "Crawl Configuration");
-	if (advancedOptions.skipSitemap && globPatterns.some((p$1) => p$1.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
+	if (advancedOptions.skipSitemap && globPatterns.some((p) => p.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
 	return {
 		urls,
 		outputDir: resolve(outputDir),
@@ -493,5 +488,4 @@ main().catch((error) => {
 	p.log.error(`Unexpected error: ${error}`);
 	process.exit(1);
 });
-//#endregion
+export {};

package/dist/index.mjs CHANGED Viewed

@@ -1,8 +1,6 @@
-import { t as crawlAndGenerate } from "./_chunks/crawl-BInMcRnS.mjs";
+import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
 import { writeFile } from "node:fs/promises";
 import { basename, sep } from "pathe";
-//#region src/llms-txt.ts
 async function generateLlmsTxt(options) {
 	const { siteName, description, results, outputPath } = options;
 	let content = `# ${siteName}\n\n`;
@@ -22,8 +20,8 @@ async function generateLlmsTxt(options) {
 				const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
 				content += `- [${title}](md/${linkPath}): ${result.url}\n`;
 			} else {
-				const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
-				content += `- [${title}](${result.url})${description$1 ? `: ${description$1}` : ""}\n`;
+				const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
+				content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
 			}
 		}
 	}
@@ -60,6 +58,4 @@ async function generateLlmsFullTxt(options) {
 	}
 	await writeFile(outputPath, content, "utf-8");
 }
-//#endregion
-export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
+export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "0.15.2",
+  "version": "0.16.0",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -50,13 +50,13 @@
     }
   },
   "dependencies": {
-    "@clack/prompts": "^0.11.0",
-    "crawlee": "^3.15.3",
-    "nypm": "^0.6.2",
+    "@clack/prompts": "^1.0.1",
+    "crawlee": "^3.16.0",
+    "nypm": "^0.6.5",
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
-    "ufo": "^1.6.1",
-    "mdream": "0.15.2"
+    "ufo": "^1.6.3",
+    "mdream": "0.16.0"
   },
   "devDependencies": {
     "@types/picomatch": "^4.0.2"