@mdream/crawl 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -166,21 +166,56 @@ function extractMetadata(html, url) {
  //#endregion
  //#region src/crawl.ts
  async function loadSitemapWithoutRetries(sitemapUrl) {
- const response = await fetch(sitemapUrl);
- if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
- const xmlContent = await response.text();
- const urls = [];
- const urlRegex = /<loc>(.*?)<\/loc>/g;
- let match;
- while (true) {
- match = urlRegex.exec(xmlContent);
- if (match === null) break;
- urls.push(match[1]);
+ const controller = new AbortController();
+ const timeoutId = setTimeout(() => controller.abort(), 1e4);
+ try {
+ const response = await fetch(sitemapUrl, {
+ signal: controller.signal,
+ headers: { "User-Agent": "mdream-crawler/1.0" }
+ });
+ clearTimeout(timeoutId);
+ if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
+ const xmlContent = await response.text();
+ if (xmlContent.includes("<sitemapindex")) {
+ const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
+ const childSitemaps = [];
+ let match;
+ while (true) {
+ match = sitemapIndexRegex.exec(xmlContent);
+ if (match === null) break;
+ let url = match[1];
+ if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
+ childSitemaps.push(url);
+ }
+ const allUrls = [];
+ for (const childSitemapUrl of childSitemaps) try {
+ const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
+ allUrls.push(...childUrls);
+ } catch (error) {
+ console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
+ }
+ return allUrls;
+ } else {
+ const urls = [];
+ const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
+ let match;
+ while (true) {
+ match = urlRegex.exec(xmlContent);
+ if (match === null) break;
+ let url = match[1];
+ if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
+ urls.push(url);
+ }
+ return urls;
+ }
+ } catch (error) {
+ clearTimeout(timeoutId);
+ if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
+ throw error;
  }
- return urls;
  }
  async function crawlAndGenerate(options, onProgress) {
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false } = options;
  const outputDir = resolve(normalize(rawOutputDir));
  if (verbose) log.setLevel(log.LEVELS.INFO);
  else log.setLevel(log.LEVELS.OFF);
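
The rewritten loader above adds a 10-second abort timeout and a `mdream-crawler/1.0` User-Agent, recursively expands `<sitemapindex>` documents into their child sitemaps, and strips `<![CDATA[...]]>` wrappers from `<loc>` values. A minimal standalone sketch of just the parsing step, using the same regexes as the bundled code; the sample XML and helper name are illustrative, not part of the package API:

```ts
// Sketch of the <loc> extraction and CDATA stripping seen in the diff above.
function extractLocs(xml: string, entryRegex: RegExp): string[] {
  const locs: string[] = []
  let match: RegExpExecArray | null
  while ((match = entryRegex.exec(xml)) !== null) {
    let url = match[1]
    // Drop an optional CDATA wrapper, keeping only the URL.
    if (url.startsWith('<![CDATA[') && url.endsWith(']]>'))
      url = url.slice(9, -3)
    locs.push(url)
  }
  return locs
}

// Hypothetical sitemap index: its <loc> entries point at child sitemaps,
// each of which the real loader would fetch and parse recursively.
const sampleIndex = '<sitemapindex><sitemap><loc><![CDATA[https://example.com/sitemap-posts.xml]]></loc></sitemap></sitemapindex>'
const childSitemaps = sampleIndex.includes('<sitemapindex')
  ? extractLocs(sampleIndex, /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs)
  : extractLocs(sampleIndex, /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs)
// childSitemaps === ['https://example.com/sitemap-posts.xml']
```

For a plain `<urlset>` document the second regex is used instead, and the extracted `<loc>` values are returned directly as page URLs.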
@@ -205,13 +240,25 @@ async function crawlAndGenerate(options, onProgress) {
  generation: { status: "idle" }
  };
  const sitemapAttempts = [];
- if (startingUrls.length > 0) {
+ if (startingUrls.length > 0 && !skipSitemap) {
  const baseUrl = new URL(startingUrls[0]).origin;
  const homePageUrl = baseUrl;
  onProgress?.(progress);
  const robotsUrl = new URL("/robots.txt", baseUrl).toString();
- const robotsResponse = await fetch(robotsUrl);
- if (robotsResponse.ok) {
+ const robotsController = new AbortController();
+ const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
+ let robotsResponse;
+ try {
+ robotsResponse = await fetch(robotsUrl, {
+ signal: robotsController.signal,
+ headers: { "User-Agent": "mdream-crawler/1.0" }
+ });
+ clearTimeout(robotsTimeoutId);
+ } catch (error) {
+ clearTimeout(robotsTimeoutId);
+ robotsResponse = null;
+ }
+ if (robotsResponse?.ok) {
  const robotsContent = await robotsResponse.text();
  const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
  if (sitemapMatches && sitemapMatches.length > 0) {
@@ -348,6 +395,12 @@ async function crawlAndGenerate(options, onProgress) {
  progress.sitemap.status = "completed";
  progress.crawling.total = startingUrls.length;
  onProgress?.(progress);
+ } else if (skipSitemap && startingUrls.length > 0) {
+ progress.sitemap.status = "completed";
+ progress.sitemap.found = 0;
+ progress.sitemap.processed = 0;
+ progress.crawling.total = startingUrls.length;
+ onProgress?.(progress);
  }
  if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
  const results = [];
@@ -428,11 +481,46 @@ async function crawlAndGenerate(options, onProgress) {
  let crawler;
  const crawlerOptions = {
  requestHandler: createRequestHandler(driver),
- errorHandler: async ({ request, response }) => {
- if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
+ errorHandler: async ({ request, response, error }) => {
+ if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
+ if (response?.statusCode && response?.statusCode >= 400) {
+ request.noRetry = true;
+ const result = {
+ url: request.url,
+ title: "",
+ content: "",
+ timestamp: Date.now(),
+ success: false,
+ error: `HTTP ${response.statusCode}`,
+ metadata: {
+ title: "",
+ description: "",
+ links: []
+ },
+ depth: request.userData?.depth || 0
+ };
+ results.push(result);
+ } else if (error) {
+ request.noRetry = true;
+ const result = {
+ url: request.url,
+ title: "",
+ content: "",
+ timestamp: Date.now(),
+ success: false,
+ error: error.message || "Unknown error",
+ metadata: {
+ title: "",
+ description: "",
+ links: []
+ },
+ depth: request.userData?.depth || 0
+ };
+ results.push(result);
+ }
  },
  maxRequestsPerCrawl,
- respectRobotsTxtFile: true
+ respectRobotsTxtFile: false
  };
  if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
  if (driver === "playwright") {
@@ -450,7 +538,15 @@ async function crawlAndGenerate(options, onProgress) {
  progress.crawling.status = "processing";
  progress.crawling.total = startingUrls.length;
  onProgress?.(progress);
- await crawler.run(initialRequests);
+ try {
+ await crawler.run(initialRequests);
+ } catch (error) {
+ if (verbose) {
+ console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
+ console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
+ }
+ throw error;
+ }
  progress.crawling.status = "completed";
  onProgress?.(progress);
  if (results.some((r) => r.success)) {
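
Unlike 0.9.1, where the error handler only disabled retries, failed requests are now pushed onto `results` so they show up in the final report instead of disappearing. The record shape below is inferred from the fields assigned in the handler above; the interface name and exact types are illustrative and not exported by `@mdream/crawl`:

```ts
// Inferred from the errorHandler in the diff; not a published type.
interface CrawledPageResult {
  url: string
  title: string
  content: string
  timestamp: number
  success: boolean
  error?: string // e.g. "HTTP 404" or a network error message
  metadata: { title: string, description: string, links: string[] }
  depth: number
}

// What the handler records for a request that returned a 404.
const failed: CrawledPageResult = {
  url: 'https://example.com/missing',
  title: '',
  content: '',
  timestamp: Date.now(),
  success: false,
  error: 'HTTP 404',
  metadata: { title: '', description: '', links: [] },
  depth: 0,
}
```

Note also that `respectRobotsTxtFile` is now `false`: robots.txt is still fetched during the package's own sitemap discovery step, but crawlee itself no longer enforces it when processing requests.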
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-D8WIR9L5.mjs";
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
  import * as p$1 from "@clack/prompts";
  import * as p from "@clack/prompts";
@@ -156,29 +156,35 @@ async function interactiveCrawl() {
  p.cancel("Operation cancelled.");
  process.exit(0);
  } });
- const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
- message: "Select output formats:",
- options: [
- {
- value: "llms.txt",
- label: "llms.txt (basic format)",
- hint: "Recommended"
- },
- {
- value: "llms-full.txt",
- label: "llms-full.txt (extended format)"
- },
- {
- value: "markdown",
- label: "Individual Markdown files"
- }
- ],
- initialValues: [
- "llms.txt",
- "llms-full.txt",
- "markdown"
- ]
- }) }, { onCancel: () => {
+ const advancedOptions = await p.group({
+ outputFormats: () => p.multiselect({
+ message: "Select output formats:",
+ options: [
+ {
+ value: "llms.txt",
+ label: "llms.txt (basic format)",
+ hint: "Recommended"
+ },
+ {
+ value: "llms-full.txt",
+ label: "llms-full.txt (extended format)"
+ },
+ {
+ value: "markdown",
+ label: "Individual Markdown files"
+ }
+ ],
+ initialValues: [
+ "llms.txt",
+ "llms-full.txt",
+ "markdown"
+ ]
+ }),
+ skipSitemap: () => p.confirm({
+ message: "Skip sitemap.xml and robots.txt discovery?",
+ initialValue: false
+ })
+ }, { onCancel: () => {
  p.cancel("Operation cancelled.");
  process.exit(0);
  } });
@@ -206,10 +212,11 @@ async function interactiveCrawl() {
  `Max pages: Unlimited`,
  `Follow links: Yes (depth 3)`,
  `Output formats: ${outputFormats.join(", ")}`,
- `Sitemap discovery: Automatic`,
+ `Sitemap discovery: ${advancedOptions.skipSitemap ? "Skipped" : "Automatic"}`,
  inferredOrigin && `Origin: ${inferredOrigin}`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Crawl Configuration");
+ if (advancedOptions.skipSitemap && globPatterns.some((p$2) => p$2.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
  return {
  urls,
  outputDir: resolve(outputDir),
@@ -222,7 +229,8 @@ async function interactiveCrawl() {
  origin: inferredOrigin,
  globPatterns,
  verbose: false,
- maxDepth: 3
+ maxDepth: 3,
+ skipSitemap: advancedOptions.skipSitemap
  };
  }
  async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
@@ -258,17 +266,19 @@ Options:
  --max-pages <number> Maximum pages to crawl (default: unlimited)
  --crawl-delay <seconds> Crawl delay in seconds
  --exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
+ --skip-sitemap Skip sitemap.xml and robots.txt discovery
  -v, --verbose Enable verbose logging
  -h, --help Show this help message
  --version Show version number

- Note: Sitemap discovery and robots.txt checking are automatic
+ Note: Sitemap discovery and robots.txt checking are automatic unless --skip-sitemap is used.

  Examples:
  @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
  @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
  @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
  @mdream/crawl -u example.com --verbose
+ @mdream/crawl -u example.com --skip-sitemap
  `);
  process.exit(0);
  }
@@ -378,6 +388,8 @@ Examples:
  const descriptionOverride = getArgValue("--description");
  const patterns = [parsed];
  const verbose = args.includes("--verbose") || args.includes("-v");
+ const skipSitemap = args.includes("--skip-sitemap");
+ if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
  return {
  urls: [url],
  outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
@@ -394,7 +406,8 @@ Examples:
  globPatterns: patterns,
  crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
  exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
- verbose
+ verbose,
+ skipSitemap
  };
  }
  async function main() {
@@ -414,6 +427,7 @@ async function main() {
  `Depth: ${options.maxDepth}`,
  `Formats: ${formats.join(", ")}`,
  options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
+ options.skipSitemap && `Skip sitemap: Yes`,
  options.verbose && `Verbose: Enabled`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Configuration");
package/dist/index.d.mts CHANGED
@@ -18,6 +18,7 @@ interface CrawlOptions {
  siteNameOverride?: string;
  descriptionOverride?: string;
  verbose?: boolean;
+ skipSitemap?: boolean;
  }
  interface ParsedUrlPattern {
  baseUrl: string;
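
The new `skipSitemap` flag is also exposed on `CrawlOptions` for programmatic use. A hedged usage sketch, assuming `crawlAndGenerate` is re-exported from the package entry (as `dist/index.mjs` below suggests) and using only option and progress fields that appear in this diff:

```ts
import { crawlAndGenerate } from '@mdream/crawl'

await crawlAndGenerate({
  urls: ['https://example.com'],
  outputDir: './output',
  generateLlmsTxt: true,
  maxDepth: 2,
  // New in 0.10.0: bypass sitemap.xml and robots.txt discovery entirely.
  skipSitemap: true,
}, (progress) => {
  // progress.sitemap and progress.crawling status fields are set in the diff above.
  console.log(progress.crawling.status, progress.sitemap.status)
})
```

With `skipSitemap: true`, the sitemap phase is immediately marked completed with zero URLs found and the crawl proceeds from the starting URLs alone, which is why the CLI warns that glob URL patterns may not discover all matching pages.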
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate } from "./_chunks/crawl-BtuYX2_u.mjs";
+ import { crawlAndGenerate } from "./_chunks/crawl-D8WIR9L5.mjs";
  import { writeFile } from "node:fs/promises";
  import { basename, sep } from "pathe";
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@mdream/crawl",
  "type": "module",
- "version": "0.9.1",
+ "version": "0.10.0",
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
  "author": {
  "name": "Harlan Wilton",
@@ -46,15 +46,15 @@
  },
  "dependencies": {
  "@clack/prompts": "^0.11.0",
- "crawlee": "^3.14.0",
- "nypm": "^0.6.0",
+ "crawlee": "^3.14.1",
+ "nypm": "^0.6.1",
  "pathe": "^2.0.3",
  "picomatch": "^4.0.3",
  "ufo": "^1.6.1",
- "mdream": "0.9.1"
+ "mdream": "0.10.0"
  },
  "devDependencies": {
- "@types/picomatch": "^4.0.1"
+ "@types/picomatch": "^4.0.2"
  },
  "scripts": {
  "build": "obuild",