npm - @mdream/crawl - Versions diffs - 1.0.0-beta.11 → 1.0.0-beta.14 - Mend

@mdream/crawl 1.0.0-beta.11 → 1.0.0-beta.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/README.md CHANGED Viewed

@@ -1,106 +1,456 @@
 # @mdream/crawl
-Multi-page website crawler that generates comprehensive llms.txt files by following internal links and processing entire websites using mdream HTML-to-Markdown conversion.
+Multi-page website crawler that generates [llms.txt](https://llmstxt.org/) files. Follows internal links and converts HTML to Markdown using [mdream](../mdream).
-> **Note**: For single-page HTML-to-Markdown conversion, use the [`mdream`](../mdream) binary instead. `@mdream/crawl` is specifically designed for crawling entire websites with multiple pages.
+## Setup
-## Installation
+```bash
+npm install @mdream/crawl
+```
+For JavaScript-heavy sites that require browser rendering, install the optional Playwright dependencies:
+```bash
+npm install crawlee playwright
+```
+## CLI Usage
+### Interactive Mode
+Run without arguments to start the interactive prompt-based interface:
+```bash
+npx @mdream/crawl
+```
+### Direct Mode
+Pass arguments directly to skip interactive prompts:
+```bash
+npx @mdream/crawl -u https://docs.example.com
+```
+### CLI Options
+| Flag | Alias | Description | Default |
+|------|-------|-------------|---------|
+| `--url <url>` | `-u` | Website URL to crawl (supports glob patterns) | Required |
+| `--output <dir>` | `-o` | Output directory | `output` |
+| `--depth <number>` | `-d` | Crawl depth (0 for single page, max 10) | `3` |
+| `--single-page` | | Only process the given URL(s), no crawling. Alias for `--depth 0` | |
+| `--driver <type>` | | Crawler driver: `http` or `playwright` | `http` |
+| `--artifacts <list>` | | Comma-separated output formats: `llms.txt`, `llms-full.txt`, `markdown` | all three |
+| `--origin <url>` | | Origin URL for resolving relative paths (overrides auto-detection) | auto-detected |
+| `--site-name <name>` | | Override the auto-extracted site name used in llms.txt | auto-extracted |
+| `--description <desc>` | | Override the auto-extracted site description used in llms.txt | auto-extracted |
+| `--max-pages <number>` | | Maximum pages to crawl | unlimited |
+| `--crawl-delay <seconds>` | | Delay between requests in seconds | from `robots.txt` or none |
+| `--exclude <pattern>` | | Exclude URLs matching glob patterns (repeatable) | none |
+| `--skip-sitemap` | | Skip `sitemap.xml` and `robots.txt` discovery | `false` |
+| `--allow-subdomains` | | Crawl across subdomains of the same root domain | `false` |
+| `--verbose` | `-v` | Enable verbose logging | `false` |
+| `--help` | `-h` | Show help message | |
+| `--version` | | Show version number | |
+### CLI Examples
 ```bash
-npm install @mdream/crawl@beta
+# Basic crawl with specific artifacts
+npx @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
+# Shallow crawl (depth 2) with only llms-full.txt output
+npx @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
+# Exclude admin and API routes
+npx @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
+# Single page mode (no link following)
+npx @mdream/crawl -u example.com/pricing --single-page
+# Use Playwright for JavaScript-heavy sites
+npx @mdream/crawl -u example.com --driver playwright
+# Skip sitemap discovery with verbose output
+npx @mdream/crawl -u example.com --skip-sitemap --verbose
+# Crawl across subdomains (docs.example.com, blog.example.com, etc.)
+npx @mdream/crawl -u example.com --allow-subdomains
+# Override site metadata
+npx @mdream/crawl -u example.com --site-name "My Company" --description "Company documentation"
 ```
-## Usage
+## Glob Patterns
-Simply run the command to start the interactive multi-page website crawler:
+URLs support glob patterns for targeted crawling. When a glob pattern is provided, the crawler uses sitemap discovery to find all matching URLs.
 ```bash
-npx @mdream/crawl@beta
+# Crawl only the /docs/ section
+npx @mdream/crawl -u "docs.example.com/docs/**"
+# Crawl pages matching a prefix
+npx @mdream/crawl -u "example.com/blog/2024*"
 ```
-The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
-- ✨ Beautiful prompts powered by Clack
-- 🎯 Step-by-step configuration guidance
-- ✅ Input validation and helpful hints
-- 📋 Configuration summary before crawling
-- 🎉 Clean result display with progress indicators
-- 🧹 Automatic cleanup of crawler storage
+Patterns are matched against the URL pathname using [picomatch](https://github.com/micromatch/picomatch) syntax. A trailing single `*` (e.g. `/fieldtypes*`) automatically expands to match both the path itself and all subdirectories.
+## Programmatic API
-## Programmatic Usage
+### `crawlAndGenerate(options, onProgress?)`
-You can also use @mdream/crawl programmatically in your Node.js applications:
+The main entry point for programmatic use. Returns a `Promise<CrawlResult[]>`.
 ```typescript
 import { crawlAndGenerate } from '@mdream/crawl'
-// Crawl entire websites programmatically
 const results = await crawlAndGenerate({
-  urls: ['https://docs.example.com'], // Starting URLs for website crawling
+  urls: ['https://docs.example.com'],
+  outputDir: './output',
+})
+```
+### `CrawlOptions`
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `urls` | `string[]` | Required | Starting URLs for crawling |
+| `outputDir` | `string` | Required | Directory to write output files |
+| `driver` | `'http' \| 'playwright'` | `'http'` | Crawler driver to use |
+| `maxRequestsPerCrawl` | `number` | `Number.MAX_SAFE_INTEGER` | Maximum total pages to crawl |
+| `followLinks` | `boolean` | `false` | Whether to follow internal links discovered on pages |
+| `maxDepth` | `number` | `1` | Maximum link-following depth. `0` enables single-page mode |
+| `generateLlmsTxt` | `boolean` | `true` | Generate an `llms.txt` file |
+| `generateLlmsFullTxt` | `boolean` | `false` | Generate an `llms-full.txt` file with full page content |
+| `generateIndividualMd` | `boolean` | `true` | Write individual `.md` files for each page |
+| `origin` | `string` | auto-detected | Origin URL for resolving relative paths in HTML |
+| `siteNameOverride` | `string` | auto-extracted | Override the site name in the generated `llms.txt` |
+| `descriptionOverride` | `string` | auto-extracted | Override the site description in the generated `llms.txt` |
+| `globPatterns` | `ParsedUrlPattern[]` | `[]` | Pre-parsed URL glob patterns (advanced usage) |
+| `exclude` | `string[]` | `[]` | Glob patterns for URLs to exclude |
+| `crawlDelay` | `number` | from `robots.txt` | Delay between requests in seconds |
+| `skipSitemap` | `boolean` | `false` | Skip `sitemap.xml` and `robots.txt` discovery |
+| `allowSubdomains` | `boolean` | `false` | Crawl across subdomains of the same root domain (e.g. `docs.example.com` + `blog.example.com`). Output files are namespaced by hostname to avoid collisions |
+| `useChrome` | `boolean` | `false` | Use system Chrome instead of Playwright's bundled browser (Playwright driver only) |
+| `chunkSize` | `number` | | Chunk size passed to mdream for markdown conversion |
+| `verbose` | `boolean` | `false` | Enable verbose error logging |
+| `hooks` | `Partial<CrawlHooks>` | | Hook functions for the crawl pipeline (see [Hooks](#hooks)) |
+| `onPage` | `(page: PageData) => Promise<void> \| void` | | **Deprecated.** Use `hooks['crawl:page']` instead. Still works for backwards compatibility |
+### `CrawlResult`
+```typescript
+interface CrawlResult {
+  url: string
+  title: string
+  content: string
+  filePath?: string // Set when generateIndividualMd is true
+  timestamp: number // Unix timestamp of processing time
+  success: boolean
+  error?: string // Set when success is false
+  metadata?: PageMetadata
+  depth?: number // Link-following depth at which this page was found
+}
+interface PageMetadata {
+  title: string
+  description?: string
+  keywords?: string
+  author?: string
+  links: string[] // Internal links discovered on the page
+}
+```
+### `PageData`
+The shape passed to the `crawl:page` hook and to the deprecated `onPage` callback:
+```typescript
+interface PageData {
+  url: string
+  html: string // Raw HTML (empty string if content was already markdown)
+  title: string
+  metadata: PageMetadata
+  origin: string
+}
+```
+### Progress Callback
+The optional second argument to `crawlAndGenerate` receives progress updates:
+```typescript
+await crawlAndGenerate(options, (progress) => {
+  // progress.sitemap.status: 'discovering' | 'processing' | 'completed'
+  // progress.sitemap.found: number of sitemap URLs found
+  // progress.sitemap.processed: number of URLs after filtering
+  // progress.crawling.status: 'starting' | 'processing' | 'completed'
+  // progress.crawling.total: total URLs to process
+  // progress.crawling.processed: pages completed so far
+  // progress.crawling.failed: pages that errored
+  // progress.crawling.currentUrl: URL currently being fetched
+  // progress.crawling.latency: { total, min, max, count } in ms
+  // progress.generation.status: 'idle' | 'generating' | 'completed'
+  // progress.generation.current: description of current generation step
+})
+```
+### Examples
+#### Custom page processing with `onPage`
+```typescript
+import { crawlAndGenerate } from '@mdream/crawl'
+const pages = []
+await crawlAndGenerate({
+  urls: ['https://docs.example.com'],
+  outputDir: './output',
+  generateIndividualMd: false,
+  generateLlmsTxt: false,
+  onPage: (page) => {
+    pages.push({
+      url: page.url,
+      title: page.title,
+      description: page.metadata.description,
+    })
+  },
+})
+console.log(`Discovered ${pages.length} pages`)
+```
+#### Glob filtering with exclusions
+```typescript
+import { crawlAndGenerate } from '@mdream/crawl'
+await crawlAndGenerate({
+  urls: ['https://example.com/docs/**'],
+  outputDir: './docs-output',
+  exclude: ['/docs/deprecated/*', '/docs/internal/*'],
+  followLinks: true,
+  maxDepth: 2,
+})
+```
+#### Crawling across subdomains
+```typescript
+await crawlAndGenerate({
+  urls: ['https://example.com'],
+  outputDir: './output',
+  allowSubdomains: true, // Will also crawl docs.example.com, blog.example.com, etc.
+  followLinks: true,
+  maxDepth: 2,
+})
+```
+#### Single-page mode
+Set `maxDepth: 0` to process only the provided URLs; sitemap discovery and link following are both skipped:
+```typescript
+await crawlAndGenerate({
+  urls: ['https://example.com/pricing', 'https://example.com/about'],
+  outputDir: './output',
+  maxDepth: 0,
+})
+```
+## Config File
+Create a `mdream.config.ts` (or `.js`, `.mjs`) in your project root to set defaults and register hooks. Loaded via [c12](https://github.com/unjs/c12).
+```typescript
+import { defineConfig } from '@mdream/crawl'
+export default defineConfig({
+  exclude: ['*/admin/*', '*/internal/*'],
+  driver: 'http',
+  maxDepth: 3,
+  hooks: {
+    'crawl:page': (page) => {
+      // Strip branding from all page titles
+      page.title = page.title.replace(/ \| My Brand$/, '')
+    },
+  },
+})
+```
+CLI arguments override config file values. Array options like `exclude` are concatenated (config + CLI).
+### Config Options
+| Option | Type | Description |
+|--------|------|-------------|
+| `exclude` | `string[]` | Glob patterns for URLs to exclude |
+| `driver` | `'http' \| 'playwright'` | Crawler driver |
+| `maxDepth` | `number` | Maximum crawl depth |
+| `maxPages` | `number` | Maximum pages to crawl |
+| `crawlDelay` | `number` | Delay between requests (seconds) |
+| `skipSitemap` | `boolean` | Skip sitemap discovery |
+| `allowSubdomains` | `boolean` | Crawl across subdomains |
+| `verbose` | `boolean` | Enable verbose logging |
+| `artifacts` | `string[]` | Output formats: `llms.txt`, `llms-full.txt`, `markdown` |
+| `hooks` | `object` | Hook functions (see below) |
+## Hooks
+Four hooks let you intercept and transform data at each stage of the crawl pipeline. Hooks receive mutable objects; mutate them in place to transform the output.
+### `crawl:url`
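+Called before fetching a URL. Set `ctx.skip = true` to skip it entirely. With the HTTP driver this saves the network request; with the Playwright driver the hook runs inside the request handler, after the page has already been requested, so skipping only avoids further processing.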
+```typescript
+defineConfig({
+  hooks: {
+    'crawl:url': (ctx) => {
+      // Skip large asset pages
+      if (ctx.url.includes('/assets/') || ctx.url.includes('/downloads/'))
+        ctx.skip = true
+    },
+  },
+})
+```
+### `crawl:page`
+Called after HTML-to-Markdown conversion, before storage. Mutate `page.title` or other fields. This replaces the `onPage` callback (which still works for backwards compatibility).
+```typescript
+defineConfig({
+  hooks: {
+    'crawl:page': (page) => {
+      // page.url, page.html, page.title, page.metadata, page.origin
+      page.title = page.title.replace(/ - Docs$/, '')
+    },
+  },
+})
+```
+### `crawl:content`
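+Called before a page's markdown file is written to disk, so it only runs when individual markdown files are generated (`generateIndividualMd`). Transform the final output content or change the file path.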
+```typescript
+defineConfig({
+  hooks: {
+    'crawl:content': (ctx) => {
+      // ctx.url, ctx.title, ctx.content, ctx.filePath
+      ctx.content = ctx.content.replace(/CONFIDENTIAL/g, '[REDACTED]')
+      ctx.filePath = ctx.filePath.replace('.md', '.mdx')
+    },
+  },
+})
+```
+### `crawl:done`
+Called after all pages are crawled, before `llms.txt` generation. Filter or reorder results.
+```typescript
+defineConfig({
+  hooks: {
+    'crawl:done': (ctx) => {
+      // Remove short pages from the final output
+      const filtered = ctx.results.filter(r => r.content.length > 100)
+      ctx.results.length = 0
+      ctx.results.push(...filtered)
+    },
+  },
+})
+```
+### Programmatic Hooks
+Hooks can also be passed directly to `crawlAndGenerate`:
+```typescript
+import { crawlAndGenerate } from '@mdream/crawl'
+await crawlAndGenerate({
+  urls: ['https://example.com'],
   outputDir: './output',
-  maxRequestsPerCrawl: 100, // Maximum pages per website
-  generateLlmsTxt: true,
-  followLinks: true, // Follow internal links to crawl entire site
-  maxDepth: 3, // How deep to follow links
-  driver: 'http', // or 'playwright' for JS-heavy sites
-  verbose: true
+  hooks: {
+    'crawl:page': (page) => {
+      page.title = page.title.replace(/ \| Brand$/, '')
+    },
+    'crawl:done': (ctx) => {
+      ctx.results.sort((a, b) => a.url.localeCompare(b.url))
+    },
+  },
 })
 ```
+## Crawl Drivers
+### HTTP Driver (default)
+Uses [`ofetch`](https://github.com/unjs/ofetch) for page fetching with up to 20 concurrent requests.
+- Automatic retry (2 retries with 500ms delay)
+- 10 second request timeout
+- Respects `Retry-After` headers on 429 responses (automatically adjusts crawl delay)
+- Detects `text/markdown` content types and skips HTML-to-Markdown conversion
 ### Playwright Driver
-The default HTTP driver works for most sites. For JavaScript-heavy sites that require a browser, install the optional dependencies:
+For sites that require a browser to render content. Requires `crawlee` and `playwright` as peer dependencies (see [Setup](#setup)).
 ```bash
-npm install crawlee playwright
+npx @mdream/crawl -u example.com --driver playwright
+```
+```typescript
+await crawlAndGenerate({
+  urls: ['https://spa-app.example.com'],
+  outputDir: './output',
+  driver: 'playwright',
+})
 ```
-Then use `--driver playwright` or `driver: 'playwright'` in the API.
+Waits for `networkidle` before extracting content. Automatically detects and uses system Chrome when available, falling back to Playwright's bundled browser.
-> **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
+## Sitemap and Robots.txt Discovery
-## Output
+By default, the crawler performs sitemap discovery before crawling:
-The crawler generates comprehensive output from entire websites:
+1. Fetches `robots.txt` to find `Sitemap:` directives and `Crawl-delay` values
+2. Loads sitemaps referenced in `robots.txt`
+3. Falls back to `/sitemap.xml`
+4. Tries common alternatives: `/sitemap_index.xml`, `/sitemaps.xml`, `/sitemap-index.xml`
+5. Supports sitemap index files (recursively loads child sitemaps)
+6. Filters discovered URLs against glob patterns and exclusion rules
-1. **Markdown files** - One `.md` file per crawled page with clean markdown content
-2. **llms.txt** - Comprehensive site overview file following the [llms.txt specification](https://llmstxt.org/)
+The home page is always included for metadata extraction (site name, description).
-### Example llms.txt output
+Disable with `--skip-sitemap` or `skipSitemap: true`.
-```markdown
-# example.com
+## Output Formats
-## Pages
+### Individual Markdown Files
-- [Example Domain](https---example-com-.md): https://example.com/
-- [About Us](https---example-com-about.md): https://example.com/about
-```
+One `.md` file per crawled page, written to the output directory preserving the URL path structure. For example, `https://example.com/docs/getting-started` becomes `output/docs/getting-started.md`.
-## Features
+### llms.txt
-- ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
-- ✅ **Purely Interactive**: No complex command-line options to remember
-- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites (requires `crawlee` and `playwright`)
-- ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
-- ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
-- ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
-- ✅ **Configurable Depth Crawling**: Follow links with customizable depth limits (1-10 levels)
-- ✅ **Clean Markdown Conversion**: Powered by mdream's HTML-to-Markdown engine
-- ✅ **Performance Optimized**: HTTP crawler is 5-10x faster than browser-based crawling
-- ✅ **Beautiful Output**: Clean result display with progress indicators
-- ✅ **Automatic Cleanup**: Purges crawler storage after completion
-- ✅ **TypeScript Support**: Full type definitions with excellent IDE support
+A site overview file following the [llms.txt specification](https://llmstxt.org/), listing all crawled pages with titles and links to their markdown files.
-## Use Cases
+```markdown
+# example.com
-Perfect for:
-- 📚 **Documentation Sites**: Crawl entire documentation websites (GitBook, Docusaurus, etc.)
-- 🏢 **Company Websites**: Generate comprehensive site overviews for LLM context
-- 📝 **Blogs**: Process entire blog archives with proper categorization
-- 🔗 **Multi-Page Resources**: Any website where you need all pages, not just one
+## Pages
-**Not suitable for**: Single-page conversions (use `mdream` binary instead)
+- [Example Domain](index.md): https://example.com/
+- [About Us](about.md): https://example.com/about
+```
-## License
+### llms-full.txt
-MIT
+Same structure as `llms.txt` but includes the full markdown content of every page inline.

package/dist/_chunks/crawl.mjs CHANGED Viewed

@@ -2,11 +2,13 @@ import { mkdirSync } from "node:fs";
 import { mkdir, writeFile } from "node:fs/promises";
 import * as p from "@clack/prompts";
 import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
+import { createHooks } from "hookable";
 import { htmlToMarkdown } from "mdream";
 import { ofetch } from "ofetch";
 import { dirname, join, normalize, resolve } from "pathe";
 import { withHttps } from "ufo";
 import picomatch from "picomatch";
+import { getDomain } from "tldts";
 //#region src/glob-utils.ts
 function stripGlobTail(s) {
 	const idx = s.indexOf("*");
@@ -14,6 +16,14 @@ function stripGlobTail(s) {
 }
 const GLOB_CHAR_RE = /[*?[]/;
 /**
+* Extract the registrable domain from a hostname using the public suffix list.
+* Handles multi-part TLDs (.co.uk, .github.io, etc.) correctly.
+* Returns the hostname unchanged for IPs or when parsing fails.
+*/
+function getRegistrableDomain(hostname) {
+	return getDomain(hostname, { allowPrivateDomains: true }) || hostname;
+}
+/**
 * Parse a URL that may contain glob patterns
 * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
 */
@@ -40,12 +50,15 @@ function parseUrlPattern(input) {
 /**
 * Check if a URL matches a glob pattern
 */
-function matchesGlobPattern(url, parsedPattern) {
+function matchesGlobPattern(url, parsedPattern, allowSubdomains = false) {
 	if (!parsedPattern.isGlob) return true;
 	try {
 		const urlObj = new URL(url);
 		const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
-		if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
+		if (allowSubdomains) {
+			const patternUrl = new URL(parsedPattern.baseUrl);
+			if (getRegistrableDomain(urlObj.hostname) !== getRegistrableDomain(patternUrl.hostname)) return false;
+		} else if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
 		let pattern = parsedPattern.pattern;
 		if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
 			const base = pattern.slice(0, -1);
@@ -73,7 +86,7 @@ function getStartingUrl(parsedPattern) {
 /**
 * Check if a URL should be excluded based on exclude patterns
 */
-function isUrlExcluded(url, excludePatterns) {
+function isUrlExcluded(url, excludePatterns, allowSubdomains = false) {
 	if (!excludePatterns || excludePatterns.length === 0) return false;
 	try {
 		const urlObj = new URL(url);
@@ -81,7 +94,7 @@ function isUrlExcluded(url, excludePatterns) {
 		return excludePatterns.some((pattern) => {
 			if (pattern.includes("://")) {
 				const parsedPattern = parseUrlPattern(pattern);
-				if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
+				if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern, allowSubdomains);
 				return url === pattern;
 			}
 			if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
@@ -159,7 +172,7 @@ async function loadSitemap(sitemapUrl) {
 	}
 	return urls;
 }
-function extractMetadataInline(parsedUrl) {
+function extractMetadataInline(parsedUrl, allowedDomains) {
 	const links = /* @__PURE__ */ new Set();
 	let title = "";
 	let description = "";
@@ -172,8 +185,12 @@ function extractMetadataInline(parsedUrl) {
 			"a[href]": (el) => {
 				const href = el.attributes.href;
 				if (href) try {
-					const absoluteUrl = new URL(href, url).href;
-					if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
+					const resolved = new URL(href, url);
+					const absoluteUrl = resolved.href;
+					if (allowedDomains) {
+						const domain = getRegistrableDomain(resolved.hostname);
+						if (domain && allowedDomains.has(domain)) links.add(absoluteUrl);
+					} else if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
 				} catch {}
 			},
 			"title": (el) => {
@@ -204,9 +221,9 @@ function extractMetadataInline(parsedUrl) {
 		})
 	};
 }
-function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns) {
-	if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern)));
-	return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude));
+function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns, allowSubdomains = false) {
+	if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude, allowSubdomains) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern, allowSubdomains)));
+	return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude, allowSubdomains));
 }
 async function runConcurrent(items, concurrency, fn) {
 	let idx = 0;
@@ -216,7 +233,11 @@ async function runConcurrent(items, concurrency, fn) {
 	await Promise.all(workers);
 }
 async function crawlAndGenerate(options, onProgress) {
-	const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
+	const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, allowSubdomains = false, hooks: hooksConfig, onPage } = options;
+	const hooks = createHooks();
+	if (hooksConfig) hooks.addHooks(hooksConfig);
+	if (onPage) hooks.hook("crawl:page", onPage);
+	const singlePageMode = maxDepth === 0;
 	const outputDir = resolve(normalize(rawOutputDir));
 	let crawlDelay = userCrawlDelay;
 	let patterns;
@@ -248,7 +269,7 @@ async function crawlAndGenerate(options, onProgress) {
 		generation: { status: "idle" }
 	};
 	const sitemapAttempts = [];
-	if (startingUrls.length > 0 && !skipSitemap) {
+	if (startingUrls.length > 0 && !skipSitemap && !singlePageMode) {
 		const baseUrl = new URL(startingUrls[0]).origin;
 		const homePageUrl = baseUrl;
 		onProgress?.(progress);
@@ -265,7 +286,7 @@ async function crawlAndGenerate(options, onProgress) {
 			const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
 			if (crawlDelayMatch) {
 				crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
-				p.log(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
+				p.log.info(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
 			}
 		}
 		if (robotsContent) {
@@ -281,7 +302,7 @@ async function crawlAndGenerate(options, onProgress) {
 						url: sitemapUrl,
 						success: true
 					});
-					const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns);
+					const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
 					if (hasGlobPatterns) {
 						startingUrls = filteredUrls;
 						progress.sitemap.processed = filteredUrls.length;
@@ -310,7 +331,7 @@ async function crawlAndGenerate(options, onProgress) {
 				url: mainSitemapUrl,
 				success: true
 			});
-			const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns);
+			const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
 			if (hasGlobPatterns) {
 				startingUrls = filteredUrls;
 				progress.sitemap.found = sitemapUrls.length;
@@ -342,7 +363,7 @@ async function crawlAndGenerate(options, onProgress) {
 						url: sitemapUrl,
 						success: true
 					});
-					const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns);
+					const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
 					if (hasGlobPatterns) {
 						startingUrls = filteredUrls;
 						progress.sitemap.found = altUrls.length;
@@ -380,7 +401,7 @@ async function crawlAndGenerate(options, onProgress) {
 		progress.sitemap.status = "completed";
 		progress.crawling.total = startingUrls.length;
 		onProgress?.(progress);
-	} else if (skipSitemap && startingUrls.length > 0) {
+	} else if ((skipSitemap || singlePageMode) && startingUrls.length > 0) {
 		progress.sitemap.status = "completed";
 		progress.sitemap.found = 0;
 		progress.sitemap.processed = 0;
@@ -390,10 +411,24 @@ async function crawlAndGenerate(options, onProgress) {
 	mkdirSync(outputDir, { recursive: true });
 	const results = [];
 	const processedUrls = /* @__PURE__ */ new Set();
+	const allowedRegistrableDomains = allowSubdomains ? new Set(startingUrls.map((u) => {
+		try {
+			return getRegistrableDomain(new URL(u).hostname);
+		} catch {
+			return "";
+		}
+	}).filter(Boolean)) : void 0;
 	const shouldCrawlUrl = (url) => {
-		if (isUrlExcluded(url, exclude)) return false;
-		if (!hasGlobPatterns) return true;
-		return patterns.some((pattern) => matchesGlobPattern(url, pattern));
+		if (isUrlExcluded(url, exclude, allowSubdomains)) return false;
+		if (!hasGlobPatterns) {
+			if (allowedRegistrableDomains) try {
+				return allowedRegistrableDomains.has(getRegistrableDomain(new URL(url).hostname));
+			} catch {
+				return false;
+			}
+			return true;
+		}
+		return patterns.some((pattern) => matchesGlobPattern(url, pattern, allowSubdomains));
 	};
 	const recordLatency = (ms) => {
 		const lat = progress.crawling.latency;
@@ -418,25 +453,41 @@ async function crawlAndGenerate(options, onProgress) {
 				links: []
 			};
 		} else {
-			const { extraction, getMetadata } = extractMetadataInline(parsedUrl);
+			const { extraction, getMetadata } = extractMetadataInline(parsedUrl, allowedRegistrableDomains);
 			md = htmlToMarkdown(content, {
 				origin: pageOrigin,
 				extraction
 			});
 			metadata = getMetadata();
 		}
-		const title = initialTitle || metadata.title;
-		if (onPage && shouldProcessMarkdown) await onPage({
-			url,
-			html: isMarkdown ? "" : content,
-			title,
-			metadata,
-			origin: pageOrigin
-		});
+		let title = initialTitle || metadata.title;
+		if (shouldProcessMarkdown) {
+			const pageData = {
+				url,
+				html: isMarkdown ? "" : content,
+				title,
+				metadata,
+				origin: pageOrigin
+			};
+			await hooks.callHook("crawl:page", pageData);
+			title = pageData.title;
+		}
 		let filePath;
 		if (shouldProcessMarkdown && generateIndividualMd) {
-			const safeSegments = (parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
+			const urlPath = parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname;
+			const hostPrefix = allowSubdomains ? [parsedUrl.hostname.replace(URL_PATH_UNSAFE_CHARS_RE, "-")] : [];
+			const pathSegments = urlPath.replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0);
+			const safeSegments = [...hostPrefix, ...pathSegments.map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"))];
 			filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
+			const contentCtx = {
+				url,
+				title,
+				content: md,
+				filePath
+			};
+			await hooks.callHook("crawl:content", contentCtx);
+			md = contentCtx.content;
+			filePath = contentCtx.filePath;
 			const fileDir = dirname(filePath);
 			if (fileDir && !createdDirs.has(fileDir)) {
 				await mkdir(fileDir, { recursive: true });
@@ -460,7 +511,7 @@ async function crawlAndGenerate(options, onProgress) {
 			progress.crawling.processed = results.length;
 			onProgress?.(progress);
 		}
-		if (followLinks && depth < maxDepth) {
+		if (followLinks && !singlePageMode && depth < maxDepth) {
 			const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
 			for (const link of filteredLinks) processedUrls.add(link);
 		}
@@ -477,6 +528,12 @@ async function crawlAndGenerate(options, onProgress) {
 			requestHandler: async ({ request, page }) => {
 				progress.crawling.currentUrl = request.loadedUrl;
 				onProgress?.(progress);
+				const urlCtx = {
+					url: request.loadedUrl,
+					skip: false
+				};
+				await hooks.callHook("crawl:url", urlCtx);
+				if (urlCtx.skip) return;
 				const fetchStart = Date.now();
 				await page.waitForLoadState("networkidle");
 				const title = await page.title();
@@ -519,8 +576,10 @@ async function crawlAndGenerate(options, onProgress) {
 		try {
 			await crawler.run(initialRequests);
 		} catch (error) {
+			const msg = error instanceof Error ? error.message : "";
+			if (msg.includes("wmic") || msg.includes("ENOENT")) throw new Error(`Crawlee failed to spawn a system process (${msg}). On Windows 11+, wmic.exe is no longer available. Upgrade crawlee to >=3.16.0 or use the HTTP driver instead (--driver http).`);
 			if (verbose) {
-				console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
+				console.error(`[CRAWLER ERROR] ${msg || "Unknown error"}`);
 				console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
 			}
 			throw error;
@@ -533,6 +592,12 @@ async function crawlAndGenerate(options, onProgress) {
 			const delay = crawlDelay;
 			await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
 		}
+		const urlCtx = {
+			url,
+			skip: false
+		};
+		await hooks.callHook("crawl:url", urlCtx);
+		if (urlCtx.skip) return;
 		try {
 			const fetchStart = Date.now();
 			const response = await ofetch.raw(url, {
@@ -576,6 +641,7 @@ async function crawlAndGenerate(options, onProgress) {
 	});
 	progress.crawling.status = "completed";
 	onProgress?.(progress);
+	await hooks.callHook("crawl:done", { results });
 	if (results.some((r) => r.success)) {
 		progress.generation.status = "generating";
 		onProgress?.(progress);

package/dist/cli.mjs CHANGED Viewed

@@ -4,6 +4,16 @@ import * as p from "@clack/prompts";
 import { dirname, join, resolve } from "pathe";
 import { withHttps } from "ufo";
 import { fileURLToPath } from "node:url";
+import { loadConfig } from "c12";
+//#region src/config.ts
+async function loadMdreamConfig(cwd) {
+	const { config } = await loadConfig({
+		name: "mdream",
+		cwd
+	});
+	return config || {};
+}
+//#endregion
 //#region src/cli.ts
 const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
 const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
@@ -198,7 +208,8 @@ Usage:
 Options:
   -u, --url <url>              Website URL to crawl
   -o, --output <dir>           Output directory (default: output)
-  -d, --depth <number>         Crawl depth (default: 3)
+  -d, --depth <number>         Crawl depth, 0 for single page (default: 3)
+  --single-page                Only process the given URL(s), no crawling (alias for --depth 0)
   --driver <http|playwright>   Crawler driver (default: http)
   --artifacts <list>           Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
   --origin <url>               Origin URL for resolving relative paths (overrides auto-detection)
@@ -208,6 +219,7 @@ Options:
   --crawl-delay <seconds>     Crawl delay in seconds
   --exclude <pattern>         Exclude URLs matching glob patterns (can be used multiple times)
   --skip-sitemap              Skip sitemap.xml and robots.txt discovery
+  --allow-subdomains          Crawl across subdomains of the same root domain
   -v, --verbose               Enable verbose logging
   -h, --help                  Show this help message
   --version                   Show version number
@@ -220,6 +232,7 @@ Examples:
   @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
   @mdream/crawl -u example.com --verbose
   @mdream/crawl -u example.com --skip-sitemap
+  @mdream/crawl -u example.com --driver playwright --single-page
 `);
 		process.exit(0);
 	}
@@ -273,10 +286,10 @@ Examples:
 			process.exit(1);
 		}
 	}
-	const depthStr = getArgValue("--depth") || getArgValue("-d") || "3";
-	const depth = Number.parseInt(depthStr);
-	if (Number.isNaN(depth) || depth < 1 || depth > 10) {
-		p.log.error("Error: Depth must be between 1 and 10");
+	const depthStr = args.includes("--single-page") ? "0" : getArgValue("--depth") || getArgValue("-d") || "3";
+	const depth = Number(depthStr);
+	if (!Number.isInteger(depth) || depth < 0 || depth > 10) {
+		p.log.error("Error: Depth must be an integer between 0 and 10");
 		process.exit(1);
 	}
 	const driver = getArgValue("--driver");
@@ -330,13 +343,14 @@ Examples:
 	const patterns = [parsed];
 	const verbose = args.includes("--verbose") || args.includes("-v");
 	const skipSitemap = args.includes("--skip-sitemap");
+	const allowSubdomains = args.includes("--allow-subdomains");
 	if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
 	return {
 		urls: [url],
 		outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
 		driver: driver || "http",
 		maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
-		followLinks: true,
+		followLinks: depth > 0,
 		maxDepth: depth,
 		generateLlmsTxt: artifacts.includes("llms.txt"),
 		generateLlmsFullTxt: artifacts.includes("llms-full.txt"),
@@ -348,14 +362,28 @@ Examples:
 		crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
 		exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
 		verbose,
-		skipSitemap
+		skipSitemap,
+		allowSubdomains
 	};
 }
 async function main() {
 	const cliOptions = parseCliArgs();
+	const fileConfig = await loadMdreamConfig();
 	let options;
 	if (cliOptions) {
-		options = cliOptions;
+		const configExclude = fileConfig.exclude || [];
+		const cliExclude = cliOptions.exclude || [];
+		options = {
+			...cliOptions,
+			driver: cliOptions.driver || fileConfig.driver || "http",
+			maxDepth: cliOptions.maxDepth ?? fileConfig.maxDepth,
+			crawlDelay: cliOptions.crawlDelay ?? fileConfig.crawlDelay,
+			skipSitemap: cliOptions.skipSitemap || fileConfig.skipSitemap || false,
+			allowSubdomains: cliOptions.allowSubdomains || fileConfig.allowSubdomains || false,
+			verbose: cliOptions.verbose || fileConfig.verbose || false,
+			exclude: configExclude.length > 0 || cliExclude.length > 0 ? [...configExclude, ...cliExclude] : void 0,
+			hooks: fileConfig.hooks
+		};
 		p.intro(`☁️  mdream v${version}`);
 		const formats = [];
 		if (options.generateLlmsTxt) formats.push("llms.txt");
@@ -369,6 +397,7 @@ async function main() {
 			`Formats: ${formats.join(", ")}`,
 			options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
 			options.skipSitemap && `Skip sitemap: Yes`,
+			options.allowSubdomains && `Allow subdomains: Yes`,
 			options.verbose && `Verbose: Enabled`
 		].filter(Boolean);
 		p.note(summary.join("\n"), "Configuration");
@@ -447,7 +476,9 @@ async function main() {
 	process.exit(0);
 }
 main().catch((error) => {
-	p.log.error(`Unexpected error: ${error}`);
+	const msg = error instanceof Error ? error.message : String(error);
+	if (msg.includes("wmic") || msg.includes("ENOENT") && process.platform === "win32") p.log.error("Crawlee failed because wmic.exe is not available on this system. Windows 11 removed wmic.exe, which older crawlee versions depend on for memory monitoring.\nFix: upgrade crawlee to >=3.16.0 or switch to the HTTP driver (--driver http).");
+	else p.log.error(`Unexpected error: ${msg}`);
 	process.exit(1);
 });
 //#endregion

package/dist/index.d.mts CHANGED Viewed

@@ -6,6 +6,22 @@ interface PageData {
   metadata: PageMetadata;
   origin: string;
 }
+interface CrawlHooks {
+  'crawl:url': (ctx: {
+    url: string;
+    skip: boolean;
+  }) => void | Promise<void>;
+  'crawl:page': (page: PageData) => void | Promise<void>;
+  'crawl:content': (ctx: {
+    url: string;
+    title: string;
+    content: string;
+    filePath: string;
+  }) => void | Promise<void>;
+  'crawl:done': (ctx: {
+    results: CrawlResult[];
+  }) => void | Promise<void>;
+}
 interface CrawlOptions {
   urls: string[];
   outputDir: string;
@@ -26,8 +42,23 @@ interface CrawlOptions {
   descriptionOverride?: string;
   verbose?: boolean;
   skipSitemap?: boolean;
+  allowSubdomains?: boolean;
+  hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
   onPage?: (page: PageData) => Promise<void> | void;
 }
+interface MdreamCrawlConfig {
+  exclude?: string[];
+  driver?: 'http' | 'playwright';
+  maxDepth?: number;
+  maxPages?: number;
+  crawlDelay?: number;
+  skipSitemap?: boolean;
+  allowSubdomains?: boolean;
+  verbose?: boolean;
+  artifacts?: ('llms.txt' | 'llms-full.txt' | 'markdown')[];
+  hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
+}
+declare function defineConfig(config: MdreamCrawlConfig): MdreamCrawlConfig;
 interface ParsedUrlPattern {
   baseUrl: string;
   pattern: string;
@@ -79,4 +110,4 @@ interface CrawlProgress {
 }
 declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
 //#endregion
-export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
+export { type CrawlHooks, type CrawlOptions, type CrawlResult, type MdreamCrawlConfig, type PageData, crawlAndGenerate, defineConfig };

package/dist/index.mjs CHANGED Viewed

@@ -1,2 +1,7 @@
 import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
-export { crawlAndGenerate };
+//#region src/types.ts
+function defineConfig(config) {
+	return config;
+}
+//#endregion
+export { crawlAndGenerate, defineConfig };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "1.0.0-beta.11",
+  "version": "1.0.0-beta.14",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -55,13 +55,16 @@
   },
   "dependencies": {
     "@clack/prompts": "^1.1.0",
+    "c12": "^3.0.4",
+    "hookable": "^5.5.3",
     "nypm": "^0.6.5",
     "ofetch": "^1.5.1",
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
+    "tldts": "^7.0.26",
     "ufo": "^1.6.3",
-    "@mdream/js": "1.0.0-beta.11",
-    "mdream": "1.0.0-beta.11"
+    "@mdream/js": "1.0.0-beta.14",
+    "mdream": "1.0.0-beta.14"
   },
   "devDependencies": {
     "@types/picomatch": "^4.0.2"