@mdream/crawl 1.0.0-beta.11 → 1.0.0-beta.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +411 -61
- package/dist/_chunks/crawl.mjs +98 -32
- package/dist/cli.mjs +40 -9
- package/dist/index.d.mts +32 -1
- package/dist/index.mjs +6 -1
- package/package.json +6 -3
package/README.md
CHANGED
|
@@ -1,106 +1,456 @@
|
|
|
1
1
|
# @mdream/crawl
|
|
2
2
|
|
|
3
|
-
Multi-page website crawler that generates
|
|
3
|
+
Multi-page website crawler that generates [llms.txt](https://llmstxt.org/) files. Follows internal links and converts HTML to Markdown using [mdream](../mdream).
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Setup
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
```bash
|
|
8
|
+
npm install @mdream/crawl
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
For JavaScript-heavy sites that require browser rendering, install the optional Playwright dependencies:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npm install crawlee playwright
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## CLI Usage
|
|
18
|
+
|
|
19
|
+
### Interactive Mode
|
|
20
|
+
|
|
21
|
+
Run without arguments to start the interactive prompt-based interface:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npx @mdream/crawl
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Direct Mode
|
|
28
|
+
|
|
29
|
+
Pass arguments directly to skip interactive prompts:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npx @mdream/crawl -u https://docs.example.com
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### CLI Options
|
|
36
|
+
|
|
37
|
+
| Flag | Alias | Description | Default |
|
|
38
|
+
|------|-------|-------------|---------|
|
|
39
|
+
| `--url <url>` | `-u` | Website URL to crawl (supports glob patterns) | Required |
|
|
40
|
+
| `--output <dir>` | `-o` | Output directory | `output` |
|
|
41
|
+
| `--depth <number>` | `-d` | Crawl depth (0 for single page, max 10) | `3` |
|
|
42
|
+
| `--single-page` | | Only process the given URL(s), no crawling. Alias for `--depth 0` | |
|
|
43
|
+
| `--driver <type>` | | Crawler driver: `http` or `playwright` | `http` |
|
|
44
|
+
| `--artifacts <list>` | | Comma-separated output formats: `llms.txt`, `llms-full.txt`, `markdown` | all three |
|
|
45
|
+
| `--origin <url>` | | Origin URL for resolving relative paths (overrides auto-detection) | auto-detected |
|
|
46
|
+
| `--site-name <name>` | | Override the auto-extracted site name used in llms.txt | auto-extracted |
|
|
47
|
+
| `--description <desc>` | | Override the auto-extracted site description used in llms.txt | auto-extracted |
|
|
48
|
+
| `--max-pages <number>` | | Maximum pages to crawl | unlimited |
|
|
49
|
+
| `--crawl-delay <seconds>` | | Delay between requests in seconds | from `robots.txt` or none |
|
|
50
|
+
| `--exclude <pattern>` | | Exclude URLs matching glob patterns (repeatable) | none |
|
|
51
|
+
| `--skip-sitemap` | | Skip `sitemap.xml` and `robots.txt` discovery | `false` |
|
|
52
|
+
| `--allow-subdomains` | | Crawl across subdomains of the same root domain | `false` |
|
|
53
|
+
| `--verbose` | `-v` | Enable verbose logging | `false` |
|
|
54
|
+
| `--help` | `-h` | Show help message | |
|
|
55
|
+
| `--version` | | Show version number | |
|
|
56
|
+
|
|
57
|
+
### CLI Examples
|
|
8
58
|
|
|
9
59
|
```bash
|
|
10
|
-
|
|
60
|
+
# Basic crawl with specific artifacts
|
|
61
|
+
npx @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
|
|
62
|
+
|
|
63
|
+
# Shallow crawl (depth 2) with only llms-full.txt output
|
|
64
|
+
npx @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
|
|
65
|
+
|
|
66
|
+
# Exclude admin and API routes
|
|
67
|
+
npx @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
|
|
68
|
+
|
|
69
|
+
# Single page mode (no link following)
|
|
70
|
+
npx @mdream/crawl -u example.com/pricing --single-page
|
|
71
|
+
|
|
72
|
+
# Use Playwright for JavaScript-heavy sites
|
|
73
|
+
npx @mdream/crawl -u example.com --driver playwright
|
|
74
|
+
|
|
75
|
+
# Skip sitemap discovery with verbose output
|
|
76
|
+
npx @mdream/crawl -u example.com --skip-sitemap --verbose
|
|
77
|
+
|
|
78
|
+
# Crawl across subdomains (docs.example.com, blog.example.com, etc.)
|
|
79
|
+
npx @mdream/crawl -u example.com --allow-subdomains
|
|
80
|
+
|
|
81
|
+
# Override site metadata
|
|
82
|
+
npx @mdream/crawl -u example.com --site-name "My Company" --description "Company documentation"
|
|
11
83
|
```
|
|
12
84
|
|
|
13
|
-
##
|
|
85
|
+
## Glob Patterns
|
|
14
86
|
|
|
15
|
-
|
|
87
|
+
URLs support glob patterns for targeted crawling. When a glob pattern is provided, the crawler uses sitemap discovery to find all matching URLs.
|
|
16
88
|
|
|
17
89
|
```bash
|
|
18
|
-
|
|
90
|
+
# Crawl only the /docs/ section
|
|
91
|
+
npx @mdream/crawl -u "docs.example.com/docs/**"
|
|
92
|
+
|
|
93
|
+
# Crawl pages matching a prefix
|
|
94
|
+
npx @mdream/crawl -u "example.com/blog/2024*"
|
|
19
95
|
```
|
|
20
96
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
- ✅ Input validation and helpful hints
|
|
25
|
-
- ๐ Configuration summary before crawling
|
|
26
|
-
- ๐ Clean result display with progress indicators
|
|
27
|
-
- 🧹 Automatic cleanup of crawler storage
|
|
97
|
+
Patterns are matched against the URL pathname using [picomatch](https://github.com/micromatch/picomatch) syntax. A trailing single `*` (e.g. `/fieldtypes*`) automatically expands to match both the path itself and all subdirectories.
|
|
98
|
+
|
|
99
|
+
## Programmatic API
|
|
28
100
|
|
|
29
|
-
|
|
101
|
+
### `crawlAndGenerate(options, onProgress?)`
|
|
30
102
|
|
|
31
|
-
|
|
103
|
+
The main entry point for programmatic use. Returns a `Promise<CrawlResult[]>`.
|
|
32
104
|
|
|
33
105
|
```typescript
|
|
34
106
|
import { crawlAndGenerate } from '@mdream/crawl'
|
|
35
107
|
|
|
36
|
-
// Crawl entire websites programmatically
|
|
37
108
|
const results = await crawlAndGenerate({
|
|
38
|
-
urls: ['https://docs.example.com'],
|
|
109
|
+
urls: ['https://docs.example.com'],
|
|
110
|
+
outputDir: './output',
|
|
111
|
+
})
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### `CrawlOptions`
|
|
115
|
+
|
|
116
|
+
| Option | Type | Default | Description |
|
|
117
|
+
|--------|------|---------|-------------|
|
|
118
|
+
| `urls` | `string[]` | Required | Starting URLs for crawling |
|
|
119
|
+
| `outputDir` | `string` | Required | Directory to write output files |
|
|
120
|
+
| `driver` | `'http' \| 'playwright'` | `'http'` | Crawler driver to use |
|
|
121
|
+
| `maxRequestsPerCrawl` | `number` | `Number.MAX_SAFE_INTEGER` | Maximum total pages to crawl |
|
|
122
|
+
| `followLinks` | `boolean` | `false` | Whether to follow internal links discovered on pages |
|
|
123
|
+
| `maxDepth` | `number` | `1` | Maximum link-following depth. `0` enables single-page mode |
|
|
124
|
+
| `generateLlmsTxt` | `boolean` | `true` | Generate an `llms.txt` file |
|
|
125
|
+
| `generateLlmsFullTxt` | `boolean` | `false` | Generate an `llms-full.txt` file with full page content |
|
|
126
|
+
| `generateIndividualMd` | `boolean` | `true` | Write individual `.md` files for each page |
|
|
127
|
+
| `origin` | `string` | auto-detected | Origin URL for resolving relative paths in HTML |
|
|
128
|
+
| `siteNameOverride` | `string` | auto-extracted | Override the site name in the generated `llms.txt` |
|
|
129
|
+
| `descriptionOverride` | `string` | auto-extracted | Override the site description in the generated `llms.txt` |
|
|
130
|
+
| `globPatterns` | `ParsedUrlPattern[]` | `[]` | Pre-parsed URL glob patterns (advanced usage) |
|
|
131
|
+
| `exclude` | `string[]` | `[]` | Glob patterns for URLs to exclude |
|
|
132
|
+
| `crawlDelay` | `number` | from `robots.txt` | Delay between requests in seconds |
|
|
133
|
+
| `skipSitemap` | `boolean` | `false` | Skip `sitemap.xml` and `robots.txt` discovery |
|
|
134
|
+
| `allowSubdomains` | `boolean` | `false` | Crawl across subdomains of the same root domain (e.g. `docs.example.com` + `blog.example.com`). Output files are namespaced by hostname to avoid collisions |
|
|
135
|
+
| `useChrome` | `boolean` | `false` | Use system Chrome instead of Playwright's bundled browser (Playwright driver only) |
|
|
136
|
+
| `chunkSize` | `number` | | Chunk size passed to mdream for markdown conversion |
|
|
137
|
+
| `verbose` | `boolean` | `false` | Enable verbose error logging |
|
|
138
|
+
| `hooks` | `Partial<CrawlHooks>` | | Hook functions for the crawl pipeline (see [Hooks](#hooks)) |
|
|
139
|
+
| `onPage` | `(page: PageData) => Promise<void> \| void` | | **Deprecated.** Use `hooks['crawl:page']` instead. Still works for backwards compatibility |
|
|
140
|
+
|
|
141
|
+
### `CrawlResult`
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
interface CrawlResult {
|
|
145
|
+
url: string
|
|
146
|
+
title: string
|
|
147
|
+
content: string
|
|
148
|
+
filePath?: string // Set when generateIndividualMd is true
|
|
149
|
+
timestamp: number // Unix timestamp of processing time
|
|
150
|
+
success: boolean
|
|
151
|
+
error?: string // Set when success is false
|
|
152
|
+
metadata?: PageMetadata
|
|
153
|
+
depth?: number // Link-following depth at which this page was found
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
interface PageMetadata {
|
|
157
|
+
title: string
|
|
158
|
+
description?: string
|
|
159
|
+
keywords?: string
|
|
160
|
+
author?: string
|
|
161
|
+
links: string[] // Internal links discovered on the page
|
|
162
|
+
}
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### `PageData`
|
|
166
|
+
|
|
167
|
+
The shape passed to the `onPage` callback:
|
|
168
|
+
|
|
169
|
+
```typescript
|
|
170
|
+
interface PageData {
|
|
171
|
+
url: string
|
|
172
|
+
html: string // Raw HTML (empty string if content was already markdown)
|
|
173
|
+
title: string
|
|
174
|
+
metadata: PageMetadata
|
|
175
|
+
origin: string
|
|
176
|
+
}
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Progress Callback
|
|
180
|
+
|
|
181
|
+
The optional second argument to `crawlAndGenerate` receives progress updates:
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
await crawlAndGenerate(options, (progress) => {
|
|
185
|
+
// progress.sitemap.status: 'discovering' | 'processing' | 'completed'
|
|
186
|
+
// progress.sitemap.found: number of sitemap URLs found
|
|
187
|
+
// progress.sitemap.processed: number of URLs after filtering
|
|
188
|
+
|
|
189
|
+
// progress.crawling.status: 'starting' | 'processing' | 'completed'
|
|
190
|
+
// progress.crawling.total: total URLs to process
|
|
191
|
+
// progress.crawling.processed: pages completed so far
|
|
192
|
+
// progress.crawling.failed: pages that errored
|
|
193
|
+
// progress.crawling.currentUrl: URL currently being fetched
|
|
194
|
+
// progress.crawling.latency: { total, min, max, count } in ms
|
|
195
|
+
|
|
196
|
+
// progress.generation.status: 'idle' | 'generating' | 'completed'
|
|
197
|
+
// progress.generation.current: description of current generation step
|
|
198
|
+
})
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Examples
|
|
202
|
+
|
|
203
|
+
#### Custom page processing with `onPage`
|
|
204
|
+
|
|
205
|
+
```typescript
|
|
206
|
+
import { crawlAndGenerate } from '@mdream/crawl'
|
|
207
|
+
|
|
208
|
+
const pages = []
|
|
209
|
+
|
|
210
|
+
await crawlAndGenerate({
|
|
211
|
+
urls: ['https://docs.example.com'],
|
|
212
|
+
outputDir: './output',
|
|
213
|
+
generateIndividualMd: false,
|
|
214
|
+
generateLlmsTxt: false,
|
|
215
|
+
onPage: (page) => {
|
|
216
|
+
pages.push({
|
|
217
|
+
url: page.url,
|
|
218
|
+
title: page.title,
|
|
219
|
+
description: page.metadata.description,
|
|
220
|
+
})
|
|
221
|
+
},
|
|
222
|
+
})
|
|
223
|
+
|
|
224
|
+
console.log(`Discovered ${pages.length} pages`)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
#### Glob filtering with exclusions
|
|
228
|
+
|
|
229
|
+
```typescript
|
|
230
|
+
import { crawlAndGenerate } from '@mdream/crawl'
|
|
231
|
+
|
|
232
|
+
await crawlAndGenerate({
|
|
233
|
+
urls: ['https://example.com/docs/**'],
|
|
234
|
+
outputDir: './docs-output',
|
|
235
|
+
exclude: ['/docs/deprecated/*', '/docs/internal/*'],
|
|
236
|
+
followLinks: true,
|
|
237
|
+
maxDepth: 2,
|
|
238
|
+
})
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
#### Crawling across subdomains
|
|
242
|
+
|
|
243
|
+
```typescript
|
|
244
|
+
await crawlAndGenerate({
|
|
245
|
+
urls: ['https://example.com'],
|
|
246
|
+
outputDir: './output',
|
|
247
|
+
allowSubdomains: true, // Will also crawl docs.example.com, blog.example.com, etc.
|
|
248
|
+
followLinks: true,
|
|
249
|
+
maxDepth: 2,
|
|
250
|
+
})
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
#### Single-page mode
|
|
254
|
+
|
|
255
|
+
Set `maxDepth: 0` to process only the provided URLs without crawling or link following:
|
|
256
|
+
|
|
257
|
+
```typescript
|
|
258
|
+
await crawlAndGenerate({
|
|
259
|
+
urls: ['https://example.com/pricing', 'https://example.com/about'],
|
|
260
|
+
outputDir: './output',
|
|
261
|
+
maxDepth: 0,
|
|
262
|
+
})
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## Config File
|
|
266
|
+
|
|
267
|
+
Create a `mdream.config.ts` (or `.js`, `.mjs`) in your project root to set defaults and register hooks. Loaded via [c12](https://github.com/unjs/c12).
|
|
268
|
+
|
|
269
|
+
```typescript
|
|
270
|
+
import { defineConfig } from '@mdream/crawl'
|
|
271
|
+
|
|
272
|
+
export default defineConfig({
|
|
273
|
+
exclude: ['*/admin/*', '*/internal/*'],
|
|
274
|
+
driver: 'http',
|
|
275
|
+
maxDepth: 3,
|
|
276
|
+
hooks: {
|
|
277
|
+
'crawl:page': (page) => {
|
|
278
|
+
// Strip branding from all page titles
|
|
279
|
+
page.title = page.title.replace(/ \| My Brand$/, '')
|
|
280
|
+
},
|
|
281
|
+
},
|
|
282
|
+
})
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
CLI arguments override config file values. Array options like `exclude` are concatenated (config + CLI).
|
|
286
|
+
|
|
287
|
+
### Config Options
|
|
288
|
+
|
|
289
|
+
| Option | Type | Description |
|
|
290
|
+
|--------|------|-------------|
|
|
291
|
+
| `exclude` | `string[]` | Glob patterns for URLs to exclude |
|
|
292
|
+
| `driver` | `'http' \| 'playwright'` | Crawler driver |
|
|
293
|
+
| `maxDepth` | `number` | Maximum crawl depth |
|
|
294
|
+
| `maxPages` | `number` | Maximum pages to crawl |
|
|
295
|
+
| `crawlDelay` | `number` | Delay between requests (seconds) |
|
|
296
|
+
| `skipSitemap` | `boolean` | Skip sitemap discovery |
|
|
297
|
+
| `allowSubdomains` | `boolean` | Crawl across subdomains |
|
|
298
|
+
| `verbose` | `boolean` | Enable verbose logging |
|
|
299
|
+
| `artifacts` | `string[]` | Output formats: `llms.txt`, `llms-full.txt`, `markdown` |
|
|
300
|
+
| `hooks` | `object` | Hook functions (see below) |
|
|
301
|
+
|
|
302
|
+
## Hooks
|
|
303
|
+
|
|
304
|
+
Four hooks let you intercept and transform data at each stage of the crawl pipeline. Hooks receive mutable objects. Mutate in-place to transform output.
|
|
305
|
+
|
|
306
|
+
### `crawl:url`
|
|
307
|
+
|
|
308
|
+
Called before fetching a URL. Set `ctx.skip = true` to skip it entirely (saves the network request).
|
|
309
|
+
|
|
310
|
+
```typescript
|
|
311
|
+
defineConfig({
|
|
312
|
+
hooks: {
|
|
313
|
+
'crawl:url': (ctx) => {
|
|
314
|
+
// Skip large asset pages
|
|
315
|
+
if (ctx.url.includes('/assets/') || ctx.url.includes('/downloads/'))
|
|
316
|
+
ctx.skip = true
|
|
317
|
+
},
|
|
318
|
+
},
|
|
319
|
+
})
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### `crawl:page`
|
|
323
|
+
|
|
324
|
+
Called after HTML-to-Markdown conversion, before storage. Mutate `page.title` or other fields. This replaces the `onPage` callback (which still works for backwards compatibility).
|
|
325
|
+
|
|
326
|
+
```typescript
|
|
327
|
+
defineConfig({
|
|
328
|
+
hooks: {
|
|
329
|
+
'crawl:page': (page) => {
|
|
330
|
+
// page.url, page.html, page.title, page.metadata, page.origin
|
|
331
|
+
page.title = page.title.replace(/ - Docs$/, '')
|
|
332
|
+
},
|
|
333
|
+
},
|
|
334
|
+
})
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
### `crawl:content`
|
|
338
|
+
|
|
339
|
+
Called before markdown is written to disk. Transform the final output content or change the file path.
|
|
340
|
+
|
|
341
|
+
```typescript
|
|
342
|
+
defineConfig({
|
|
343
|
+
hooks: {
|
|
344
|
+
'crawl:content': (ctx) => {
|
|
345
|
+
// ctx.url, ctx.title, ctx.content, ctx.filePath
|
|
346
|
+
ctx.content = ctx.content.replace(/CONFIDENTIAL/g, '[REDACTED]')
|
|
347
|
+
ctx.filePath = ctx.filePath.replace('.md', '.mdx')
|
|
348
|
+
},
|
|
349
|
+
},
|
|
350
|
+
})
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
### `crawl:done`
|
|
354
|
+
|
|
355
|
+
Called after all pages are crawled, before `llms.txt` generation. Filter or reorder results.
|
|
356
|
+
|
|
357
|
+
```typescript
|
|
358
|
+
defineConfig({
|
|
359
|
+
hooks: {
|
|
360
|
+
'crawl:done': (ctx) => {
|
|
361
|
+
// Remove short pages from the final output
|
|
362
|
+
const filtered = ctx.results.filter(r => r.content.length > 100)
|
|
363
|
+
ctx.results.length = 0
|
|
364
|
+
ctx.results.push(...filtered)
|
|
365
|
+
},
|
|
366
|
+
},
|
|
367
|
+
})
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
### Programmatic Hooks
|
|
371
|
+
|
|
372
|
+
Hooks can also be passed directly to `crawlAndGenerate`:
|
|
373
|
+
|
|
374
|
+
```typescript
|
|
375
|
+
import { crawlAndGenerate } from '@mdream/crawl'
|
|
376
|
+
|
|
377
|
+
await crawlAndGenerate({
|
|
378
|
+
urls: ['https://example.com'],
|
|
39
379
|
outputDir: './output',
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
380
|
+
hooks: {
|
|
381
|
+
'crawl:page': (page) => {
|
|
382
|
+
page.title = page.title.replace(/ \| Brand$/, '')
|
|
383
|
+
},
|
|
384
|
+
'crawl:done': (ctx) => {
|
|
385
|
+
ctx.results.sort((a, b) => a.url.localeCompare(b.url))
|
|
386
|
+
},
|
|
387
|
+
},
|
|
46
388
|
})
|
|
47
389
|
```
|
|
48
390
|
|
|
391
|
+
## Crawl Drivers
|
|
392
|
+
|
|
393
|
+
### HTTP Driver (default)
|
|
394
|
+
|
|
395
|
+
Uses [`ofetch`](https://github.com/unjs/ofetch) for page fetching with up to 20 concurrent requests.
|
|
396
|
+
|
|
397
|
+
- Automatic retry (2 retries with 500ms delay)
|
|
398
|
+
- 10 second request timeout
|
|
399
|
+
- Respects `Retry-After` headers on 429 responses (automatically adjusts crawl delay)
|
|
400
|
+
- Detects `text/markdown` content types and skips HTML-to-Markdown conversion
|
|
401
|
+
|
|
49
402
|
### Playwright Driver
|
|
50
403
|
|
|
51
|
-
|
|
404
|
+
For sites that require a browser to render content. Requires `crawlee` and `playwright` as peer dependencies (see [Setup](#setup)).
|
|
52
405
|
|
|
53
406
|
```bash
|
|
54
|
-
|
|
407
|
+
npx @mdream/crawl -u example.com --driver playwright
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
```typescript
|
|
411
|
+
await crawlAndGenerate({
|
|
412
|
+
urls: ['https://spa-app.example.com'],
|
|
413
|
+
outputDir: './output',
|
|
414
|
+
driver: 'playwright',
|
|
415
|
+
})
|
|
55
416
|
```
|
|
56
417
|
|
|
57
|
-
|
|
418
|
+
Waits for `networkidle` before extracting content. Automatically detects and uses system Chrome when available, falling back to Playwright's bundled browser.
|
|
58
419
|
|
|
59
|
-
|
|
420
|
+
## Sitemap and Robots.txt Discovery
|
|
60
421
|
|
|
61
|
-
|
|
422
|
+
By default, the crawler performs sitemap discovery before crawling:
|
|
62
423
|
|
|
63
|
-
|
|
424
|
+
1. Fetches `robots.txt` to find `Sitemap:` directives and `Crawl-delay` values
|
|
425
|
+
2. Loads sitemaps referenced in `robots.txt`
|
|
426
|
+
3. Falls back to `/sitemap.xml`
|
|
427
|
+
4. Tries common alternatives: `/sitemap_index.xml`, `/sitemaps.xml`, `/sitemap-index.xml`
|
|
428
|
+
5. Supports sitemap index files (recursively loads child sitemaps)
|
|
429
|
+
6. Filters discovered URLs against glob patterns and exclusion rules
|
|
64
430
|
|
|
65
|
-
|
|
66
|
-
2. **llms.txt** - Comprehensive site overview file following the [llms.txt specification](https://llmstxt.org/)
|
|
431
|
+
The home page is always included for metadata extraction (site name, description).
|
|
67
432
|
|
|
68
|
-
|
|
433
|
+
Disable with `--skip-sitemap` or `skipSitemap: true`.
|
|
69
434
|
|
|
70
|
-
|
|
71
|
-
# example.com
|
|
435
|
+
## Output Formats
|
|
72
436
|
|
|
73
|
-
|
|
437
|
+
### Individual Markdown Files
|
|
74
438
|
|
|
75
|
-
|
|
76
|
-
- [About Us](https---example-com-about.md): https://example.com/about
|
|
77
|
-
```
|
|
439
|
+
One `.md` file per crawled page, written to the output directory preserving the URL path structure. For example, `https://example.com/docs/getting-started` becomes `output/docs/getting-started.md`.
|
|
78
440
|
|
|
79
|
-
|
|
441
|
+
### llms.txt
|
|
80
442
|
|
|
81
|
-
|
|
82
|
-
- ✅ **Purely Interactive**: No complex command-line options to remember
|
|
83
|
-
- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites (requires `crawlee` and `playwright`)
|
|
84
|
-
- ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
|
|
85
|
-
- ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
|
|
86
|
-
- ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
|
|
87
|
-
- ✅ **Configurable Depth Crawling**: Follow links with customizable depth limits (1-10 levels)
|
|
88
|
-
- ✅ **Clean Markdown Conversion**: Powered by mdream's HTML-to-Markdown engine
|
|
89
|
-
- ✅ **Performance Optimized**: HTTP crawler is 5-10x faster than browser-based crawling
|
|
90
|
-
- ✅ **Beautiful Output**: Clean result display with progress indicators
|
|
91
|
-
- ✅ **Automatic Cleanup**: Purges crawler storage after completion
|
|
92
|
-
- ✅ **TypeScript Support**: Full type definitions with excellent IDE support
|
|
443
|
+
A site overview file following the [llms.txt specification](https://llmstxt.org/), listing all crawled pages with titles and links to their markdown files.
|
|
93
444
|
|
|
94
|
-
|
|
445
|
+
```markdown
|
|
446
|
+
# example.com
|
|
95
447
|
|
|
96
|
-
|
|
97
|
-
- ๐ **Documentation Sites**: Crawl entire documentation websites (GitBook, Docusaurus, etc.)
|
|
98
|
-
- 🏢 **Company Websites**: Generate comprehensive site overviews for LLM context
|
|
99
|
-
- ๐ **Blogs**: Process entire blog archives with proper categorization
|
|
100
|
-
- ๐ **Multi-Page Resources**: Any website where you need all pages, not just one
|
|
448
|
+
## Pages
|
|
101
449
|
|
|
102
|
-
|
|
450
|
+
- [Example Domain](index.md): https://example.com/
|
|
451
|
+
- [About Us](about.md): https://example.com/about
|
|
452
|
+
```
|
|
103
453
|
|
|
104
|
-
|
|
454
|
+
### llms-full.txt
|
|
105
455
|
|
|
106
|
-
|
|
456
|
+
Same structure as `llms.txt` but includes the full markdown content of every page inline.
|
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -2,11 +2,13 @@ import { mkdirSync } from "node:fs";
|
|
|
2
2
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
4
|
import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
|
|
5
|
+
import { createHooks } from "hookable";
|
|
5
6
|
import { htmlToMarkdown } from "mdream";
|
|
6
7
|
import { ofetch } from "ofetch";
|
|
7
8
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
8
9
|
import { withHttps } from "ufo";
|
|
9
10
|
import picomatch from "picomatch";
|
|
11
|
+
import { getDomain } from "tldts";
|
|
10
12
|
//#region src/glob-utils.ts
|
|
11
13
|
function stripGlobTail(s) {
|
|
12
14
|
const idx = s.indexOf("*");
|
|
@@ -14,6 +16,14 @@ function stripGlobTail(s) {
|
|
|
14
16
|
}
|
|
15
17
|
const GLOB_CHAR_RE = /[*?[]/;
|
|
16
18
|
/**
|
|
19
|
+
* Extract the registrable domain from a hostname using the public suffix list.
|
|
20
|
+
* Handles multi-part TLDs (.co.uk, .github.io, etc.) correctly.
|
|
21
|
+
* Returns the hostname unchanged for IPs or when parsing fails.
|
|
22
|
+
*/
|
|
23
|
+
function getRegistrableDomain(hostname) {
|
|
24
|
+
return getDomain(hostname, { allowPrivateDomains: true }) || hostname;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
17
27
|
* Parse a URL that may contain glob patterns
|
|
18
28
|
* Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
|
|
19
29
|
*/
|
|
@@ -40,12 +50,15 @@ function parseUrlPattern(input) {
|
|
|
40
50
|
/**
|
|
41
51
|
* Check if a URL matches a glob pattern
|
|
42
52
|
*/
|
|
43
|
-
function matchesGlobPattern(url, parsedPattern) {
|
|
53
|
+
function matchesGlobPattern(url, parsedPattern, allowSubdomains = false) {
|
|
44
54
|
if (!parsedPattern.isGlob) return true;
|
|
45
55
|
try {
|
|
46
56
|
const urlObj = new URL(url);
|
|
47
57
|
const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
|
|
48
|
-
if (
|
|
58
|
+
if (allowSubdomains) {
|
|
59
|
+
const patternUrl = new URL(parsedPattern.baseUrl);
|
|
60
|
+
if (getRegistrableDomain(urlObj.hostname) !== getRegistrableDomain(patternUrl.hostname)) return false;
|
|
61
|
+
} else if (`${urlObj.protocol}//${urlObj.host}` !== parsedPattern.baseUrl) return false;
|
|
49
62
|
let pattern = parsedPattern.pattern;
|
|
50
63
|
if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
|
|
51
64
|
const base = pattern.slice(0, -1);
|
|
@@ -73,7 +86,7 @@ function getStartingUrl(parsedPattern) {
|
|
|
73
86
|
/**
|
|
74
87
|
* Check if a URL should be excluded based on exclude patterns
|
|
75
88
|
*/
|
|
76
|
-
function isUrlExcluded(url, excludePatterns) {
|
|
89
|
+
function isUrlExcluded(url, excludePatterns, allowSubdomains = false) {
|
|
77
90
|
if (!excludePatterns || excludePatterns.length === 0) return false;
|
|
78
91
|
try {
|
|
79
92
|
const urlObj = new URL(url);
|
|
@@ -81,7 +94,7 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
81
94
|
return excludePatterns.some((pattern) => {
|
|
82
95
|
if (pattern.includes("://")) {
|
|
83
96
|
const parsedPattern = parseUrlPattern(pattern);
|
|
84
|
-
if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
|
|
97
|
+
if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern, allowSubdomains);
|
|
85
98
|
return url === pattern;
|
|
86
99
|
}
|
|
87
100
|
if (pattern.startsWith("/")) return picomatch(pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern)(urlPath);
|
|
@@ -159,7 +172,7 @@ async function loadSitemap(sitemapUrl) {
|
|
|
159
172
|
}
|
|
160
173
|
return urls;
|
|
161
174
|
}
|
|
162
|
-
function extractMetadataInline(parsedUrl) {
|
|
175
|
+
function extractMetadataInline(parsedUrl, allowedDomains) {
|
|
163
176
|
const links = /* @__PURE__ */ new Set();
|
|
164
177
|
let title = "";
|
|
165
178
|
let description = "";
|
|
@@ -172,8 +185,12 @@ function extractMetadataInline(parsedUrl) {
|
|
|
172
185
|
"a[href]": (el) => {
|
|
173
186
|
const href = el.attributes.href;
|
|
174
187
|
if (href) try {
|
|
175
|
-
const
|
|
176
|
-
|
|
188
|
+
const resolved = new URL(href, url);
|
|
189
|
+
const absoluteUrl = resolved.href;
|
|
190
|
+
if (allowedDomains) {
|
|
191
|
+
const domain = getRegistrableDomain(resolved.hostname);
|
|
192
|
+
if (domain && allowedDomains.has(domain)) links.add(absoluteUrl);
|
|
193
|
+
} else if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
|
|
177
194
|
} catch {}
|
|
178
195
|
},
|
|
179
196
|
"title": (el) => {
|
|
@@ -204,9 +221,9 @@ function extractMetadataInline(parsedUrl) {
|
|
|
204
221
|
})
|
|
205
222
|
};
|
|
206
223
|
}
|
|
207
|
-
function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns) {
|
|
208
|
-
if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern)));
|
|
209
|
-
return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude));
|
|
224
|
+
function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns, allowSubdomains = false) {
|
|
225
|
+
if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude, allowSubdomains) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern, allowSubdomains)));
|
|
226
|
+
return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude, allowSubdomains));
|
|
210
227
|
}
|
|
211
228
|
async function runConcurrent(items, concurrency, fn) {
|
|
212
229
|
let idx = 0;
|
|
@@ -216,7 +233,11 @@ async function runConcurrent(items, concurrency, fn) {
|
|
|
216
233
|
await Promise.all(workers);
|
|
217
234
|
}
|
|
218
235
|
async function crawlAndGenerate(options, onProgress) {
|
|
219
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
|
|
236
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, allowSubdomains = false, hooks: hooksConfig, onPage } = options;
|
|
237
|
+
const hooks = createHooks();
|
|
238
|
+
if (hooksConfig) hooks.addHooks(hooksConfig);
|
|
239
|
+
if (onPage) hooks.hook("crawl:page", onPage);
|
|
240
|
+
const singlePageMode = maxDepth === 0;
|
|
220
241
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
221
242
|
let crawlDelay = userCrawlDelay;
|
|
222
243
|
let patterns;
|
|
@@ -248,7 +269,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
248
269
|
generation: { status: "idle" }
|
|
249
270
|
};
|
|
250
271
|
const sitemapAttempts = [];
|
|
251
|
-
if (startingUrls.length > 0 && !skipSitemap) {
|
|
272
|
+
if (startingUrls.length > 0 && !skipSitemap && !singlePageMode) {
|
|
252
273
|
const baseUrl = new URL(startingUrls[0]).origin;
|
|
253
274
|
const homePageUrl = baseUrl;
|
|
254
275
|
onProgress?.(progress);
|
|
@@ -265,7 +286,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
265
286
|
const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
|
|
266
287
|
if (crawlDelayMatch) {
|
|
267
288
|
crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
|
|
268
|
-
p.log(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
|
|
289
|
+
p.log.info(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
|
|
269
290
|
}
|
|
270
291
|
}
|
|
271
292
|
if (robotsContent) {
|
|
@@ -281,7 +302,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
281
302
|
url: sitemapUrl,
|
|
282
303
|
success: true
|
|
283
304
|
});
|
|
284
|
-
const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns);
|
|
305
|
+
const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
|
|
285
306
|
if (hasGlobPatterns) {
|
|
286
307
|
startingUrls = filteredUrls;
|
|
287
308
|
progress.sitemap.processed = filteredUrls.length;
|
|
@@ -310,7 +331,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
310
331
|
url: mainSitemapUrl,
|
|
311
332
|
success: true
|
|
312
333
|
});
|
|
313
|
-
const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns);
|
|
334
|
+
const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
|
|
314
335
|
if (hasGlobPatterns) {
|
|
315
336
|
startingUrls = filteredUrls;
|
|
316
337
|
progress.sitemap.found = sitemapUrls.length;
|
|
@@ -342,7 +363,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
342
363
|
url: sitemapUrl,
|
|
343
364
|
success: true
|
|
344
365
|
});
|
|
345
|
-
const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns);
|
|
366
|
+
const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns, allowSubdomains);
|
|
346
367
|
if (hasGlobPatterns) {
|
|
347
368
|
startingUrls = filteredUrls;
|
|
348
369
|
progress.sitemap.found = altUrls.length;
|
|
@@ -380,7 +401,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
380
401
|
progress.sitemap.status = "completed";
|
|
381
402
|
progress.crawling.total = startingUrls.length;
|
|
382
403
|
onProgress?.(progress);
|
|
383
|
-
} else if (skipSitemap && startingUrls.length > 0) {
|
|
404
|
+
} else if ((skipSitemap || singlePageMode) && startingUrls.length > 0) {
|
|
384
405
|
progress.sitemap.status = "completed";
|
|
385
406
|
progress.sitemap.found = 0;
|
|
386
407
|
progress.sitemap.processed = 0;
|
|
@@ -390,10 +411,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
390
411
|
mkdirSync(outputDir, { recursive: true });
|
|
391
412
|
const results = [];
|
|
392
413
|
const processedUrls = /* @__PURE__ */ new Set();
|
|
414
|
+
const allowedRegistrableDomains = allowSubdomains ? new Set(startingUrls.map((u) => {
|
|
415
|
+
try {
|
|
416
|
+
return getRegistrableDomain(new URL(u).hostname);
|
|
417
|
+
} catch {
|
|
418
|
+
return "";
|
|
419
|
+
}
|
|
420
|
+
}).filter(Boolean)) : void 0;
|
|
393
421
|
const shouldCrawlUrl = (url) => {
|
|
394
|
-
if (isUrlExcluded(url, exclude)) return false;
|
|
395
|
-
if (!hasGlobPatterns)
|
|
396
|
-
|
|
422
|
+
if (isUrlExcluded(url, exclude, allowSubdomains)) return false;
|
|
423
|
+
if (!hasGlobPatterns) {
|
|
424
|
+
if (allowedRegistrableDomains) try {
|
|
425
|
+
return allowedRegistrableDomains.has(getRegistrableDomain(new URL(url).hostname));
|
|
426
|
+
} catch {
|
|
427
|
+
return false;
|
|
428
|
+
}
|
|
429
|
+
return true;
|
|
430
|
+
}
|
|
431
|
+
return patterns.some((pattern) => matchesGlobPattern(url, pattern, allowSubdomains));
|
|
397
432
|
};
|
|
398
433
|
const recordLatency = (ms) => {
|
|
399
434
|
const lat = progress.crawling.latency;
|
|
@@ -418,25 +453,41 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
418
453
|
links: []
|
|
419
454
|
};
|
|
420
455
|
} else {
|
|
421
|
-
const { extraction, getMetadata } = extractMetadataInline(parsedUrl);
|
|
456
|
+
const { extraction, getMetadata } = extractMetadataInline(parsedUrl, allowedRegistrableDomains);
|
|
422
457
|
md = htmlToMarkdown(content, {
|
|
423
458
|
origin: pageOrigin,
|
|
424
459
|
extraction
|
|
425
460
|
});
|
|
426
461
|
metadata = getMetadata();
|
|
427
462
|
}
|
|
428
|
-
|
|
429
|
-
if (
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
463
|
+
let title = initialTitle || metadata.title;
|
|
464
|
+
if (shouldProcessMarkdown) {
|
|
465
|
+
const pageData = {
|
|
466
|
+
url,
|
|
467
|
+
html: isMarkdown ? "" : content,
|
|
468
|
+
title,
|
|
469
|
+
metadata,
|
|
470
|
+
origin: pageOrigin
|
|
471
|
+
};
|
|
472
|
+
await hooks.callHook("crawl:page", pageData);
|
|
473
|
+
title = pageData.title;
|
|
474
|
+
}
|
|
436
475
|
let filePath;
|
|
437
476
|
if (shouldProcessMarkdown && generateIndividualMd) {
|
|
438
|
-
const
|
|
477
|
+
const urlPath = parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname;
|
|
478
|
+
const hostPrefix = allowSubdomains ? [parsedUrl.hostname.replace(URL_PATH_UNSAFE_CHARS_RE, "-")] : [];
|
|
479
|
+
const pathSegments = urlPath.replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0);
|
|
480
|
+
const safeSegments = [...hostPrefix, ...pathSegments.map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"))];
|
|
439
481
|
filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
|
|
482
|
+
const contentCtx = {
|
|
483
|
+
url,
|
|
484
|
+
title,
|
|
485
|
+
content: md,
|
|
486
|
+
filePath
|
|
487
|
+
};
|
|
488
|
+
await hooks.callHook("crawl:content", contentCtx);
|
|
489
|
+
md = contentCtx.content;
|
|
490
|
+
filePath = contentCtx.filePath;
|
|
440
491
|
const fileDir = dirname(filePath);
|
|
441
492
|
if (fileDir && !createdDirs.has(fileDir)) {
|
|
442
493
|
await mkdir(fileDir, { recursive: true });
|
|
@@ -460,7 +511,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
460
511
|
progress.crawling.processed = results.length;
|
|
461
512
|
onProgress?.(progress);
|
|
462
513
|
}
|
|
463
|
-
if (followLinks && depth < maxDepth) {
|
|
514
|
+
if (followLinks && !singlePageMode && depth < maxDepth) {
|
|
464
515
|
const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
|
|
465
516
|
for (const link of filteredLinks) processedUrls.add(link);
|
|
466
517
|
}
|
|
@@ -477,6 +528,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
477
528
|
requestHandler: async ({ request, page }) => {
|
|
478
529
|
progress.crawling.currentUrl = request.loadedUrl;
|
|
479
530
|
onProgress?.(progress);
|
|
531
|
+
const urlCtx = {
|
|
532
|
+
url: request.loadedUrl,
|
|
533
|
+
skip: false
|
|
534
|
+
};
|
|
535
|
+
await hooks.callHook("crawl:url", urlCtx);
|
|
536
|
+
if (urlCtx.skip) return;
|
|
480
537
|
const fetchStart = Date.now();
|
|
481
538
|
await page.waitForLoadState("networkidle");
|
|
482
539
|
const title = await page.title();
|
|
@@ -519,8 +576,10 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
519
576
|
try {
|
|
520
577
|
await crawler.run(initialRequests);
|
|
521
578
|
} catch (error) {
|
|
579
|
+
const msg = error instanceof Error ? error.message : "";
|
|
580
|
+
if (msg.includes("wmic") || msg.includes("ENOENT")) throw new Error(`Crawlee failed to spawn a system process (${msg}). On Windows 11+, wmic.exe is no longer available. Upgrade crawlee to >=3.16.0 or use the HTTP driver instead (--driver http).`);
|
|
522
581
|
if (verbose) {
|
|
523
|
-
console.error(`[CRAWLER ERROR] ${
|
|
582
|
+
console.error(`[CRAWLER ERROR] ${msg || "Unknown error"}`);
|
|
524
583
|
console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
|
|
525
584
|
}
|
|
526
585
|
throw error;
|
|
@@ -533,6 +592,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
533
592
|
const delay = crawlDelay;
|
|
534
593
|
await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
|
|
535
594
|
}
|
|
595
|
+
const urlCtx = {
|
|
596
|
+
url,
|
|
597
|
+
skip: false
|
|
598
|
+
};
|
|
599
|
+
await hooks.callHook("crawl:url", urlCtx);
|
|
600
|
+
if (urlCtx.skip) return;
|
|
536
601
|
try {
|
|
537
602
|
const fetchStart = Date.now();
|
|
538
603
|
const response = await ofetch.raw(url, {
|
|
@@ -576,6 +641,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
576
641
|
});
|
|
577
642
|
progress.crawling.status = "completed";
|
|
578
643
|
onProgress?.(progress);
|
|
644
|
+
await hooks.callHook("crawl:done", { results });
|
|
579
645
|
if (results.some((r) => r.success)) {
|
|
580
646
|
progress.generation.status = "generating";
|
|
581
647
|
onProgress?.(progress);
|
package/dist/cli.mjs
CHANGED
|
@@ -4,6 +4,16 @@ import * as p from "@clack/prompts";
|
|
|
4
4
|
import { dirname, join, resolve } from "pathe";
|
|
5
5
|
import { withHttps } from "ufo";
|
|
6
6
|
import { fileURLToPath } from "node:url";
|
|
7
|
+
import { loadConfig } from "c12";
|
|
8
|
+
//#region src/config.ts
|
|
9
|
+
async function loadMdreamConfig(cwd) {
|
|
10
|
+
const { config } = await loadConfig({
|
|
11
|
+
name: "mdream",
|
|
12
|
+
cwd
|
|
13
|
+
});
|
|
14
|
+
return config || {};
|
|
15
|
+
}
|
|
16
|
+
//#endregion
|
|
7
17
|
//#region src/cli.ts
|
|
8
18
|
const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
|
|
9
19
|
const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
|
|
@@ -198,7 +208,8 @@ Usage:
|
|
|
198
208
|
Options:
|
|
199
209
|
-u, --url <url> Website URL to crawl
|
|
200
210
|
-o, --output <dir> Output directory (default: output)
|
|
201
|
-
-d, --depth <number> Crawl depth (default: 3)
|
|
211
|
+
-d, --depth <number> Crawl depth, 0 for single page (default: 3)
|
|
212
|
+
--single-page Only process the given URL(s), no crawling (alias for --depth 0)
|
|
202
213
|
--driver <http|playwright> Crawler driver (default: http)
|
|
203
214
|
--artifacts <list> Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
|
|
204
215
|
--origin <url> Origin URL for resolving relative paths (overrides auto-detection)
|
|
@@ -208,6 +219,7 @@ Options:
|
|
|
208
219
|
--crawl-delay <seconds> Crawl delay in seconds
|
|
209
220
|
--exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
|
|
210
221
|
--skip-sitemap Skip sitemap.xml and robots.txt discovery
|
|
222
|
+
--allow-subdomains Crawl across subdomains of the same root domain
|
|
211
223
|
-v, --verbose Enable verbose logging
|
|
212
224
|
-h, --help Show this help message
|
|
213
225
|
--version Show version number
|
|
@@ -220,6 +232,7 @@ Examples:
|
|
|
220
232
|
@mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
|
|
221
233
|
@mdream/crawl -u example.com --verbose
|
|
222
234
|
@mdream/crawl -u example.com --skip-sitemap
|
|
235
|
+
@mdream/crawl -u example.com --driver playwright --single-page
|
|
223
236
|
`);
|
|
224
237
|
process.exit(0);
|
|
225
238
|
}
|
|
@@ -273,10 +286,10 @@ Examples:
|
|
|
273
286
|
process.exit(1);
|
|
274
287
|
}
|
|
275
288
|
}
|
|
276
|
-
const depthStr = getArgValue("--depth") || getArgValue("-d") || "3";
|
|
277
|
-
const depth = Number
|
|
278
|
-
if (Number.
|
|
279
|
-
p.log.error("Error: Depth must be between
|
|
289
|
+
const depthStr = args.includes("--single-page") ? "0" : getArgValue("--depth") || getArgValue("-d") || "3";
|
|
290
|
+
const depth = Number(depthStr);
|
|
291
|
+
if (!Number.isInteger(depth) || depth < 0 || depth > 10) {
|
|
292
|
+
p.log.error("Error: Depth must be an integer between 0 and 10");
|
|
280
293
|
process.exit(1);
|
|
281
294
|
}
|
|
282
295
|
const driver = getArgValue("--driver");
|
|
@@ -330,13 +343,14 @@ Examples:
|
|
|
330
343
|
const patterns = [parsed];
|
|
331
344
|
const verbose = args.includes("--verbose") || args.includes("-v");
|
|
332
345
|
const skipSitemap = args.includes("--skip-sitemap");
|
|
346
|
+
const allowSubdomains = args.includes("--allow-subdomains");
|
|
333
347
|
if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
|
|
334
348
|
return {
|
|
335
349
|
urls: [url],
|
|
336
350
|
outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
|
|
337
351
|
driver: driver || "http",
|
|
338
352
|
maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
|
|
339
|
-
followLinks:
|
|
353
|
+
followLinks: depth > 0,
|
|
340
354
|
maxDepth: depth,
|
|
341
355
|
generateLlmsTxt: artifacts.includes("llms.txt"),
|
|
342
356
|
generateLlmsFullTxt: artifacts.includes("llms-full.txt"),
|
|
@@ -348,14 +362,28 @@ Examples:
|
|
|
348
362
|
crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
|
|
349
363
|
exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
|
|
350
364
|
verbose,
|
|
351
|
-
skipSitemap
|
|
365
|
+
skipSitemap,
|
|
366
|
+
allowSubdomains
|
|
352
367
|
};
|
|
353
368
|
}
|
|
354
369
|
async function main() {
|
|
355
370
|
const cliOptions = parseCliArgs();
|
|
371
|
+
const fileConfig = await loadMdreamConfig();
|
|
356
372
|
let options;
|
|
357
373
|
if (cliOptions) {
|
|
358
|
-
|
|
374
|
+
const configExclude = fileConfig.exclude || [];
|
|
375
|
+
const cliExclude = cliOptions.exclude || [];
|
|
376
|
+
options = {
|
|
377
|
+
...cliOptions,
|
|
378
|
+
driver: cliOptions.driver || fileConfig.driver || "http",
|
|
379
|
+
maxDepth: cliOptions.maxDepth ?? fileConfig.maxDepth,
|
|
380
|
+
crawlDelay: cliOptions.crawlDelay ?? fileConfig.crawlDelay,
|
|
381
|
+
skipSitemap: cliOptions.skipSitemap || fileConfig.skipSitemap || false,
|
|
382
|
+
allowSubdomains: cliOptions.allowSubdomains || fileConfig.allowSubdomains || false,
|
|
383
|
+
verbose: cliOptions.verbose || fileConfig.verbose || false,
|
|
384
|
+
exclude: configExclude.length > 0 || cliExclude.length > 0 ? [...configExclude, ...cliExclude] : void 0,
|
|
385
|
+
hooks: fileConfig.hooks
|
|
386
|
+
};
|
|
359
387
|
p.intro(`โ๏ธ mdream v${version}`);
|
|
360
388
|
const formats = [];
|
|
361
389
|
if (options.generateLlmsTxt) formats.push("llms.txt");
|
|
@@ -369,6 +397,7 @@ async function main() {
|
|
|
369
397
|
`Formats: ${formats.join(", ")}`,
|
|
370
398
|
options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
|
|
371
399
|
options.skipSitemap && `Skip sitemap: Yes`,
|
|
400
|
+
options.allowSubdomains && `Allow subdomains: Yes`,
|
|
372
401
|
options.verbose && `Verbose: Enabled`
|
|
373
402
|
].filter(Boolean);
|
|
374
403
|
p.note(summary.join("\n"), "Configuration");
|
|
@@ -447,7 +476,9 @@ async function main() {
|
|
|
447
476
|
process.exit(0);
|
|
448
477
|
}
|
|
449
478
|
main().catch((error) => {
|
|
450
|
-
|
|
479
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
480
|
+
if (msg.includes("wmic") || msg.includes("ENOENT") && process.platform === "win32") p.log.error("Crawlee failed because wmic.exe is not available on this system. Windows 11 removed wmic.exe, which older crawlee versions depend on for memory monitoring.\nFix: upgrade crawlee to >=3.16.0 or switch to the HTTP driver (--driver http).");
|
|
481
|
+
else p.log.error(`Unexpected error: ${msg}`);
|
|
451
482
|
process.exit(1);
|
|
452
483
|
});
|
|
453
484
|
//#endregion
|
package/dist/index.d.mts
CHANGED
|
@@ -6,6 +6,22 @@ interface PageData {
|
|
|
6
6
|
metadata: PageMetadata;
|
|
7
7
|
origin: string;
|
|
8
8
|
}
|
|
9
|
+
interface CrawlHooks {
|
|
10
|
+
'crawl:url': (ctx: {
|
|
11
|
+
url: string;
|
|
12
|
+
skip: boolean;
|
|
13
|
+
}) => void | Promise<void>;
|
|
14
|
+
'crawl:page': (page: PageData) => void | Promise<void>;
|
|
15
|
+
'crawl:content': (ctx: {
|
|
16
|
+
url: string;
|
|
17
|
+
title: string;
|
|
18
|
+
content: string;
|
|
19
|
+
filePath: string;
|
|
20
|
+
}) => void | Promise<void>;
|
|
21
|
+
'crawl:done': (ctx: {
|
|
22
|
+
results: CrawlResult[];
|
|
23
|
+
}) => void | Promise<void>;
|
|
24
|
+
}
|
|
9
25
|
interface CrawlOptions {
|
|
10
26
|
urls: string[];
|
|
11
27
|
outputDir: string;
|
|
@@ -26,8 +42,23 @@ interface CrawlOptions {
|
|
|
26
42
|
descriptionOverride?: string;
|
|
27
43
|
verbose?: boolean;
|
|
28
44
|
skipSitemap?: boolean;
|
|
45
|
+
allowSubdomains?: boolean;
|
|
46
|
+
hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
|
|
29
47
|
onPage?: (page: PageData) => Promise<void> | void;
|
|
30
48
|
}
|
|
49
|
+
interface MdreamCrawlConfig {
|
|
50
|
+
exclude?: string[];
|
|
51
|
+
driver?: 'http' | 'playwright';
|
|
52
|
+
maxDepth?: number;
|
|
53
|
+
maxPages?: number;
|
|
54
|
+
crawlDelay?: number;
|
|
55
|
+
skipSitemap?: boolean;
|
|
56
|
+
allowSubdomains?: boolean;
|
|
57
|
+
verbose?: boolean;
|
|
58
|
+
artifacts?: ('llms.txt' | 'llms-full.txt' | 'markdown')[];
|
|
59
|
+
hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
|
|
60
|
+
}
|
|
61
|
+
declare function defineConfig(config: MdreamCrawlConfig): MdreamCrawlConfig;
|
|
31
62
|
interface ParsedUrlPattern {
|
|
32
63
|
baseUrl: string;
|
|
33
64
|
pattern: string;
|
|
@@ -79,4 +110,4 @@ interface CrawlProgress {
|
|
|
79
110
|
}
|
|
80
111
|
declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
|
|
81
112
|
//#endregion
|
|
82
|
-
export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
|
|
113
|
+
export { type CrawlHooks, type CrawlOptions, type CrawlResult, type MdreamCrawlConfig, type PageData, crawlAndGenerate, defineConfig };
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "1.0.0-beta.
|
|
4
|
+
"version": "1.0.0-beta.14",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -55,13 +55,16 @@
|
|
|
55
55
|
},
|
|
56
56
|
"dependencies": {
|
|
57
57
|
"@clack/prompts": "^1.1.0",
|
|
58
|
+
"c12": "^3.0.4",
|
|
59
|
+
"hookable": "^5.5.3",
|
|
58
60
|
"nypm": "^0.6.5",
|
|
59
61
|
"ofetch": "^1.5.1",
|
|
60
62
|
"pathe": "^2.0.3",
|
|
61
63
|
"picomatch": "^4.0.3",
|
|
64
|
+
"tldts": "^7.0.26",
|
|
62
65
|
"ufo": "^1.6.3",
|
|
63
|
-
"@mdream/js": "1.0.0-beta.
|
|
64
|
-
"mdream": "1.0.0-beta.
|
|
66
|
+
"@mdream/js": "1.0.0-beta.14",
|
|
67
|
+
"mdream": "1.0.0-beta.14"
|
|
65
68
|
},
|
|
66
69
|
"devDependencies": {
|
|
67
70
|
"@types/picomatch": "^4.0.2"
|