npm - @mdream/crawl - Versions diffs - 0.17.0 → 1.0.0-beta.10 - Mend

@mdream/crawl 0.17.0 → 1.0.0-beta.10

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (5)

package/README.md CHANGED Viewed

@@ -31,7 +31,7 @@ The crawler will automatically discover and follow internal links to crawl entir
 You can also use @mdream/crawl programmatically in your Node.js applications:
 ```typescript
-import { crawlAndGenerate, generateLlmsTxt } from '@mdream/crawl'
+import { crawlAndGenerate } from '@mdream/crawl'
 // Crawl entire websites programmatically
 const results = await crawlAndGenerate({
@@ -44,16 +44,10 @@ const results = await crawlAndGenerate({
   driver: 'http', // or 'playwright' for JS-heavy sites
   verbose: true
 })
-// Generate llms.txt manually from existing results
-await generateLlmsTxt({
-  siteName: 'Example Site',
-  description: 'Documentation for Example Site',
-  results: crawlResults,
-  outputPath: './output/llms.txt'
-})
 ```
+> **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
 ## Output
 The crawler generates comprehensive output from entire websites:

package/dist/_chunks/crawl.mjs CHANGED Viewed

@@ -1,16 +1,17 @@
 import { existsSync, mkdirSync } from "node:fs";
 import { writeFile } from "node:fs/promises";
 import * as p from "@clack/prompts";
+import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
 import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
 import { htmlToMarkdown } from "mdream";
-import { generateLlmsTxtArtifacts } from "mdream/llms-txt";
-import { withMinimalPreset } from "mdream/preset/minimal";
 import { dirname, join, normalize, resolve } from "pathe";
 import { withHttps } from "ufo";
 import picomatch from "picomatch";
-import { extractionPlugin } from "mdream/plugins";
 //#region src/glob-utils.ts
-const GLOB_STRIP_TAIL_RE = /\*.*$/;
+function stripGlobTail(s) {
+	const idx = s.indexOf("*");
+	return idx === -1 ? s : s.slice(0, idx);
+}
 const GLOB_CHAR_RE = /[*?[]/;
 /**
 * Parse a URL that may contain glob patterns
@@ -23,7 +24,7 @@ function parseUrlPattern(input) {
 		isGlob: false
 	};
 	try {
-		const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
+		const urlWithoutGlob = stripGlobTail(input.startsWith("http") ? input : `https://${input}`);
 		const url = new URL(urlWithoutGlob);
 		const baseUrl = `${url.protocol}//${url.host}`;
 		const patternStart = input.indexOf(url.host) + url.host.length;
@@ -91,6 +92,12 @@ function isUrlExcluded(url, excludePatterns) {
 	}
 }
 /**
+* Check if a string is valid sitemap XML content (not an HTML page or other non-sitemap response)
+*/
+function isValidSitemapXml(content) {
+	return content.includes("<urlset") || content.includes("<sitemapindex");
+}
+/**
 * Validate glob pattern syntax
 */
 function validateGlobPattern(pattern) {
@@ -110,40 +117,40 @@ function extractMetadata(html, url) {
 	let keywords = "";
 	let author = "";
 	htmlToMarkdown(html, {
-		plugins: [extractionPlugin({
-			"a[href]": (element) => {
-				const href = element.attributes?.href;
+		origin: new URL(url).origin,
+		extraction: {
+			"a[href]": (el) => {
+				const href = el.attributes.href;
 				if (href) try {
 					const absoluteUrl = new URL(href, url).href;
 					if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
 				} catch {}
 			},
-			"title": (element) => {
-				if (!title && element.textContent) title = element.textContent.trim();
+			"title": (el) => {
+				if (!title) title = el.textContent;
 			},
-			"meta[name=\"description\"]": (element) => {
-				if (!description && element.attributes?.content) description = element.attributes.content.trim();
+			"meta[name=\"description\"]": (el) => {
+				if (!description) description = el.attributes.content || "";
 			},
-			"meta[property=\"og:description\"]": (element) => {
-				if (!description && element.attributes?.content) description = element.attributes.content.trim();
+			"meta[property=\"og:description\"]": (el) => {
+				if (!description) description = el.attributes.content || "";
 			},
-			"meta[name=\"keywords\"]": (element) => {
-				if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
+			"meta[name=\"keywords\"]": (el) => {
+				if (!keywords) keywords = el.attributes.content || "";
 			},
-			"meta[name=\"author\"]": (element) => {
-				if (!author && element.attributes?.content) author = element.attributes.content.trim();
+			"meta[name=\"author\"]": (el) => {
+				if (!author) author = el.attributes.content || "";
 			},
-			"meta[property=\"og:title\"]": (element) => {
-				if (!title && element.attributes?.content) title = element.attributes.content.trim();
+			"meta[property=\"og:title\"]": (el) => {
+				if (!title) title = el.attributes.content || "";
 			}
-		})],
-		origin: new URL(url).origin
+		}
 	});
 	return {
-		title: title || new URL(url).pathname,
-		description: description || void 0,
-		keywords: keywords || void 0,
-		author: author || void 0,
+		title: title.trim() || new URL(url).pathname,
+		description: description.trim() || void 0,
+		keywords: keywords.trim() || void 0,
+		author: author.trim() || void 0,
 		links: links.filter((link) => {
 			try {
 				const linkUrl = new URL(link);
@@ -175,6 +182,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
 		clearTimeout(timeoutId);
 		if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
 		const xmlContent = await response.text();
+		if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
 		if (xmlContent.includes("<sitemapindex")) {
 			SITEMAP_INDEX_LOC_RE.lastIndex = 0;
 			const childSitemaps = [];
@@ -434,7 +442,7 @@ async function crawlAndGenerate(options, onProgress) {
 				origin: pageOrigin
 			});
 			let md = "";
-			if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
+			if (shouldProcessMarkdown) md = htmlToMarkdown(html, { origin: pageOrigin });
 			let filePath;
 			if (shouldProcessMarkdown && generateIndividualMd) {
 				const urlObj = new URL(request.loadedUrl);

package/dist/index.d.mts CHANGED Viewed

@@ -51,12 +51,6 @@ interface CrawlResult {
   metadata?: PageMetadata;
   depth?: number;
 }
-interface LlmsTxtOptions {
-  siteName: string;
-  description?: string;
-  results: CrawlResult[];
-  outputPath: string;
-}
 //#endregion
 //#region src/crawl.d.ts
 interface CrawlProgress {
@@ -78,8 +72,4 @@ interface CrawlProgress {
 }
 declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
 //#endregion
-//#region src/llms-txt.d.ts
-declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
-declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
-//#endregion
-export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
+export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };

package/dist/index.mjs CHANGED Viewed

@@ -1,64 +1,2 @@
 import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
-import { writeFile } from "node:fs/promises";
-import { basename, sep } from "pathe";
-//#region src/llms-txt.ts
-const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
-async function generateLlmsTxt(options) {
-	const { siteName, description, results, outputPath } = options;
-	let content = `# ${siteName}\n\n`;
-	if (description) content += `> ${description}\n\n`;
-	if (results.length > 0) {
-		content += `## Pages\n\n`;
-		for (const result of results) {
-			let title;
-			try {
-				title = result.title || new URL(result.url).pathname;
-			} catch {
-				title = result.title || result.url;
-			}
-			if (result.filePath) {
-				const mdSeparator = `${sep}md${sep}`;
-				const mdIndex = result.filePath.indexOf(mdSeparator);
-				const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
-				content += `- [${title}](md/${linkPath}): ${result.url}\n`;
-			} else {
-				const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
-				content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
-			}
-		}
-	}
-	await writeFile(outputPath, content, "utf-8");
-}
-async function generateLlmsFullTxt(options) {
-	const { siteName, description, results, outputPath } = options;
-	let content = `# ${siteName}\n\n`;
-	if (description) content += `> ${description}\n\n`;
-	if (results.length > 0) {
-		content += `## Table of Contents\n\n`;
-		for (const result of results) {
-			let title;
-			try {
-				title = result.title || new URL(result.url).pathname;
-			} catch {
-				title = result.title || result.url;
-			}
-			const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
-			content += `- [${title}](#${anchor})\n`;
-		}
-		content += `\n---\n\n`;
-		for (const result of results) {
-			let title;
-			try {
-				title = result.title || new URL(result.url).pathname;
-			} catch {
-				title = result.title || result.url;
-			}
-			content += `## ${title}\n\n`;
-			content += `**URL:** ${result.url}\n\n`;
-			content += `${result.content}\n\n---\n\n`;
-		}
-	}
-	await writeFile(outputPath, content, "utf-8");
-}
-//#endregion
-export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
+export { crawlAndGenerate };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "0.17.0",
+  "version": "1.0.0-beta.10",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -56,7 +56,8 @@
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
     "ufo": "^1.6.3",
-    "mdream": "0.17.0"
+    "mdream": "1.0.0-beta.10",
+    "@mdream/js": "1.0.0-beta.10"
   },
   "devDependencies": {
     "@types/picomatch": "^4.0.2"