@mdream/crawl 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +9 -0
- package/README.md +102 -0
- package/bin/mdream-crawl.mjs +3 -0
- package/dist/_chunks/crawl-NJU1Dyc-.mjs +445 -0
- package/dist/cli.d.mts +1 -0
- package/dist/cli.mjs +458 -0
- package/dist/index.d.mts +74 -0
- package/dist/index.mjs +66 -0
- package/package.json +62 -0
package/LICENSE.md
ADDED
@@ -0,0 +1,9 @@
MIT License

Copyright (c) 2025 Harlan Wilton

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,102 @@
# @mdream/crawl

Multi-page website crawler that generates comprehensive llms.txt files by following internal links and processing entire websites using mdream HTML-to-Markdown conversion.

> **Note**: For single-page HTML-to-Markdown conversion, use the [`mdream`](../mdream) binary instead. `@mdream/crawl` is specifically designed for crawling entire websites with multiple pages.

## Installation

```bash
npm install @mdream/crawl
```

## Usage

Simply run the command to start the interactive multi-page website crawler:

```bash
npx @mdream/crawl
```

The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
- ✨ Beautiful prompts powered by Clack
- 🎯 Step-by-step configuration guidance
- ✅ Input validation and helpful hints
- 📋 Configuration summary before crawling
- 📊 Clean result display with progress indicators
- 🧹 Automatic cleanup of crawler storage
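
If you prefer to skip the prompts, the same crawl can be driven non-interactively with flags (run `npx @mdream/crawl --help` for the full list); for example:

```bash
# One-shot crawl: two levels deep, emitting llms.txt plus per-page Markdown
npx @mdream/crawl -u docs.example.com --depth 2 --artifacts "llms.txt,markdown"
```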

## Programmatic Usage

You can also use @mdream/crawl programmatically in your Node.js applications:

```typescript
import { crawlAndGenerate, generateLlmsTxt } from '@mdream/crawl'

// Crawl entire websites programmatically
const results = await crawlAndGenerate({
  urls: ['https://docs.example.com'], // Starting URLs for website crawling
  outputDir: './output',
  maxRequestsPerCrawl: 100, // Maximum pages per website
  generateLlmsTxt: true,
  followLinks: true, // Follow internal links to crawl the entire site
  maxDepth: 3, // How deep to follow links
  driver: 'http' // or 'playwright' for JS-heavy sites
})

// Generate llms.txt manually from existing results
await generateLlmsTxt({
  siteName: 'Example Site',
  description: 'Documentation for Example Site',
  results, // crawl results returned by crawlAndGenerate above
  outputPath: './output/llms.txt'
})
```
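
`crawlAndGenerate` also accepts an optional progress callback as its second argument (the bundled CLI uses it to drive its spinner); a minimal sketch based on the `CrawlProgress` type shipped in `dist/index.d.mts`:

```typescript
import { crawlAndGenerate } from '@mdream/crawl'

await crawlAndGenerate(
  {
    urls: ['https://docs.example.com'],
    outputDir: './output',
    followLinks: true,
    maxDepth: 3
  },
  (progress) => {
    // progress.sitemap, progress.crawling and progress.generation report each phase
    if (progress.crawling.status === 'processing')
      console.log(`crawled ${progress.crawling.processed}/${progress.crawling.total} pages`)
    if (progress.generation.status === 'generating')
      console.log(progress.generation.current ?? 'generating artifacts')
  }
)
```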

## Output

The crawler generates comprehensive output from entire websites:

1. **Markdown files** - One `.md` file per crawled page with clean markdown content
2. **llms.txt** - Comprehensive site overview file following the [llms.txt specification](https://llmstxt.org/)

### Example llms.txt output

```markdown
# example.com

## Pages

- [Example Domain](https---example-com-.md): https://example.com/
- [About Us](https---example-com-about.md): https://example.com/about
```

## Features

- ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
- ✅ **Purely Interactive**: No complex command-line options to remember
- ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites
- ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
- ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
- ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
- ✅ **Configurable Depth Crawling**: Follow links with customizable depth limits (1-10 levels)
- ✅ **Clean Markdown Conversion**: Powered by mdream's HTML-to-Markdown engine
- ✅ **Performance Optimized**: HTTP crawler is 5-10x faster than browser-based crawling
- ✅ **Beautiful Output**: Clean result display with progress indicators
- ✅ **Automatic Cleanup**: Purges crawler storage after completion
- ✅ **TypeScript Support**: Full type definitions with excellent IDE support

## Use Cases

Perfect for:
- 📚 **Documentation Sites**: Crawl entire documentation websites (GitBook, Docusaurus, etc.)
- 🏢 **Company Websites**: Generate comprehensive site overviews for LLM context
- 📝 **Blogs**: Process entire blog archives with proper categorization
- 🌐 **Multi-Page Resources**: Any website where you need all pages, not just one

**Not suitable for**: Single-page conversions (use `mdream` binary instead)

## License

MIT
package/dist/_chunks/crawl-NJU1Dyc-.mjs
ADDED
@@ -0,0 +1,445 @@
import { existsSync, mkdirSync } from "node:fs";
import { writeFile } from "node:fs/promises";
import { HttpCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
import { withMinimalPreset } from "mdream/preset/minimal";
import { dirname, join, normalize, resolve } from "pathe";
import { minimatch } from "minimatch";
import { extractionPlugin } from "mdream/plugins";

//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
const r = String.fromCharCode;
const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
function withHttps(input) {
  return withProtocol(input, "https://");
}
function withProtocol(input, protocol) {
  let match = input.match(PROTOCOL_REGEX);
  if (!match) match = input.match(/^\/{2,}/);
  if (!match) return protocol + input;
  return protocol + input.slice(match[0].length);
}
const protocolRelative = Symbol.for("ufo:protocolRelative");

//#endregion
//#region src/glob-utils.ts
/**
 * Parse a URL that may contain glob patterns
 * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
 */
function parseUrlPattern(input) {
  const hasGlob = input.includes("*") || input.includes("?") || input.includes("[");
  if (!hasGlob) return {
    baseUrl: input,
    pattern: "",
    isGlob: false
  };
  try {
    const urlWithProtocol = input.startsWith("http") ? input : `https://${input}`;
    const urlWithoutGlob = urlWithProtocol.replace(/\*.*$/, "");
    const url = new URL(urlWithoutGlob);
    const baseUrl = `${url.protocol}//${url.host}`;
    const patternStart = input.indexOf(url.host) + url.host.length;
    const pattern = input.substring(patternStart);
    return {
      baseUrl,
      pattern,
      isGlob: true
    };
  } catch (error) {
    throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
  }
}
/**
 * Check if a URL matches a glob pattern
 */
function matchesGlobPattern(url, parsedPattern) {
  if (!parsedPattern.isGlob) return true;
  try {
    const urlObj = new URL(url);
    const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
    const urlBase = `${urlObj.protocol}//${urlObj.host}`;
    if (urlBase !== parsedPattern.baseUrl) return false;
    return minimatch(urlPath, parsedPattern.pattern);
  } catch {
    return false;
  }
}
/**
 * Get the starting URL for crawling from a glob pattern
 * For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
 */
function getStartingUrl(parsedPattern) {
  if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
  const pattern = parsedPattern.pattern;
  const firstGlobIndex = pattern.search(/[*?[]/);
  if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
  const beforeGlob = pattern.substring(0, firstGlobIndex);
  const lastSlash = beforeGlob.lastIndexOf("/");
  const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
  return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
}
/**
 * Check if a URL should be excluded based on exclude patterns
 */
function isUrlExcluded(url, excludePatterns) {
  if (!excludePatterns || excludePatterns.length === 0) return false;
  try {
    const urlObj = new URL(url);
    const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
    return excludePatterns.some((pattern) => {
      if (pattern.includes("://")) {
        const parsedPattern = parseUrlPattern(pattern);
        if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
        return url === pattern;
      }
      if (pattern.startsWith("/")) {
        const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
        return minimatch(urlPath, adjustedPattern);
      }
      return minimatch(urlPath, pattern) || minimatch(urlPath.substring(1), pattern);
    });
  } catch {
    return false;
  }
}
/**
 * Validate glob pattern syntax
 */
function validateGlobPattern(pattern) {
  try {
    parseUrlPattern(pattern);
    return void 0;
  } catch (error) {
    return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
  }
}

//#endregion
//#region src/metadata-extractor.ts
function extractMetadata(html, url) {
  const links = [];
  let title = "";
  let description = "";
  let keywords = "";
  let author = "";
  const extractionPluginInstance = extractionPlugin({
    "a[href]": (element) => {
      const href = element.attributes?.href;
      if (href) try {
        const absoluteUrl = new URL(href, url).href;
        if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
      } catch {}
    },
    "title": (element) => {
      if (!title && element.textContent) title = element.textContent.trim();
    },
    "meta[name=\"description\"]": (element) => {
      if (!description && element.attributes?.content) description = element.attributes.content.trim();
    },
    "meta[property=\"og:description\"]": (element) => {
      if (!description && element.attributes?.content) description = element.attributes.content.trim();
    },
    "meta[name=\"keywords\"]": (element) => {
      if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
    },
    "meta[name=\"author\"]": (element) => {
      if (!author && element.attributes?.content) author = element.attributes.content.trim();
    },
    "meta[property=\"og:title\"]": (element) => {
      if (!title && element.attributes?.content) title = element.attributes.content.trim();
    }
  });
  htmlToMarkdown(html, {
    plugins: [extractionPluginInstance],
    origin: new URL(url).origin
  });
  return {
    title: title || new URL(url).pathname,
    description: description || void 0,
    keywords: keywords || void 0,
    author: author || void 0,
    links: links.filter((link) => {
      try {
        const linkUrl = new URL(link);
        const baseUrl = new URL(url);
        return linkUrl.hostname === baseUrl.hostname;
      } catch {
        return false;
      }
    })
  };
}

//#endregion
//#region src/crawl.ts
async function crawlAndGenerate(options, onProgress) {
  const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride } = options;
  const outputDir = resolve(normalize(rawOutputDir));
  let patterns;
  try {
    patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
  } catch (error) {
    throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  let startingUrls = patterns.map(getStartingUrl);
  const progress = {
    sitemap: {
      status: "discovering",
      found: 0,
      processed: 0
    },
    crawling: {
      status: "starting",
      total: 0,
      processed: 0
    },
    generation: { status: "idle" }
  };
  if (startingUrls.length > 0) {
    const baseUrl = new URL(startingUrls[0]).origin;
    const homePageUrl = baseUrl;
    onProgress?.(progress);
    const robotsUrl = new URL("/robots.txt", baseUrl).toString();
    const robotsResponse = await fetch(robotsUrl);
    if (robotsResponse.ok) {
      const robotsContent = await robotsResponse.text();
      const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
      if (sitemapMatches && sitemapMatches.length > 0) {
        progress.sitemap.found = sitemapMatches.length;
        progress.sitemap.status = "processing";
        onProgress?.(progress);
        const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
        for (const sitemapUrl of robotsSitemaps) try {
          const { urls: robotsUrls } = await Sitemap.load(sitemapUrl);
          const hasGlobPatterns = patterns.some((p) => p.isGlob);
          if (hasGlobPatterns) {
            const filteredUrls = robotsUrls.filter((url) => {
              return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
            });
            startingUrls = filteredUrls;
            progress.sitemap.processed = filteredUrls.length;
            onProgress?.(progress);
            break;
          } else {
            const filteredUrls = robotsUrls.filter((url) => {
              return !isUrlExcluded(url, exclude);
            });
            if (filteredUrls.length > 0) {
              startingUrls = filteredUrls;
              progress.sitemap.processed = filteredUrls.length;
              onProgress?.(progress);
              break;
            }
          }
        } catch {
          continue;
        }
      }
    }
    try {
      const { urls: sitemapUrls } = await Sitemap.load(`${baseUrl}/sitemap.xml`);
      const hasGlobPatterns = patterns.some((p) => p.isGlob);
      if (hasGlobPatterns) {
        const filteredUrls = sitemapUrls.filter((url) => {
          return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
        });
        startingUrls = filteredUrls;
        progress.sitemap.found = sitemapUrls.length;
        progress.sitemap.processed = filteredUrls.length;
        onProgress?.(progress);
      } else {
        const filteredUrls = sitemapUrls.filter((url) => {
          return !isUrlExcluded(url, exclude);
        });
        if (filteredUrls.length > 0) {
          startingUrls = filteredUrls;
          progress.sitemap.found = sitemapUrls.length;
          progress.sitemap.processed = filteredUrls.length;
          onProgress?.(progress);
        }
      }
    } catch {
      const commonSitemaps = [
        `${baseUrl}/sitemap_index.xml`,
        `${baseUrl}/sitemaps.xml`,
        `${baseUrl}/sitemap-index.xml`
      ];
      for (const sitemapUrl of commonSitemaps) try {
        const { urls: altUrls } = await Sitemap.load(sitemapUrl);
        const hasGlobPatterns = patterns.some((p) => p.isGlob);
        if (hasGlobPatterns) {
          const filteredUrls = altUrls.filter((url) => {
            return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
          });
          startingUrls = filteredUrls;
          progress.sitemap.found = altUrls.length;
          progress.sitemap.processed = filteredUrls.length;
          onProgress?.(progress);
          break;
        } else {
          const filteredUrls = altUrls.filter((url) => {
            return !isUrlExcluded(url, exclude);
          });
          if (filteredUrls.length > 0) {
            startingUrls = filteredUrls;
            progress.sitemap.found = altUrls.length;
            progress.sitemap.processed = filteredUrls.length;
            onProgress?.(progress);
            break;
          }
        }
      } catch {
        continue;
      }
    }
    if (!startingUrls.includes(homePageUrl)) startingUrls.unshift(homePageUrl);
    progress.sitemap.status = "completed";
    progress.crawling.total = startingUrls.length;
    onProgress?.(progress);
  }
  if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
  const results = [];
  const processedUrls = new Set();
  const shouldCrawlUrl = (url) => {
    if (isUrlExcluded(url, exclude)) return false;
    if (!patterns.some((p) => p.isGlob)) return true;
    return patterns.some((pattern) => matchesGlobPattern(url, pattern));
  };
  const createRequestHandler = (crawlerType) => {
    return async ({ request, body, page, enqueueLinks }) => {
      const startTime = Date.now();
      progress.crawling.currentUrl = request.loadedUrl;
      onProgress?.(progress);
      const baseUrl = new URL(startingUrls[0]).origin;
      const homePageUrl = baseUrl;
      let html;
      let title;
      if (crawlerType === "playwright") {
        await page.waitForLoadState("networkidle");
        title = await page.title();
        html = await page.innerHTML("html");
      } else {
        html = typeof body === "string" ? body : body.toString();
        title = "";
      }
      const metadata = extractMetadata(html, request.loadedUrl);
      if (!title) title = metadata.title;
      const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
      let md = "";
      if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: origin || new URL(request.loadedUrl).origin }));
      let filePath;
      if (shouldProcessMarkdown) {
        const urlObj = new URL(request.loadedUrl);
        const urlPath = urlObj.pathname === "/" ? "/index" : urlObj.pathname;
        const pathSegments = urlPath.replace(/\/$/, "").split("/").filter((seg) => seg.length > 0);
        const safeSegments = pathSegments.map((seg) => seg.replace(/[^\w\-]/g, "-"));
        const filename = safeSegments.length > 0 ? safeSegments.join("/") : "index";
        const safeFilename = normalize(`${filename}.md`);
        filePath = join(outputDir, "md", safeFilename);
        if (generateIndividualMd) {
          const fileDir = dirname(filePath);
          if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
          await writeFile(filePath, md, "utf-8");
        }
      }
      const isHomePage = request.loadedUrl === homePageUrl;
      if (shouldProcessMarkdown || isHomePage) {
        const result = {
          url: request.loadedUrl,
          title,
          content: md,
          filePath: generateIndividualMd && shouldProcessMarkdown ? filePath : void 0,
          timestamp: startTime,
          success: true,
          metadata,
          depth: request.userData?.depth || 0
        };
        results.push(result);
        progress.crawling.processed = results.length;
        onProgress?.(progress);
      }
      if (followLinks && (request.userData?.depth || 0) < maxDepth) {
        const currentDepth = (request.userData?.depth || 0) + 1;
        const filteredLinks = metadata.links.filter((link) => {
          return shouldCrawlUrl(link);
        });
        if (enqueueLinks) await enqueueLinks({
          urls: filteredLinks,
          userData: { depth: currentDepth }
        });
        else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
      }
    };
  };
  let crawler;
  const crawlerOptions = {
    requestHandler: createRequestHandler(driver),
    maxRequestsPerCrawl,
    respectRobotsTxtFile: true
  };
  if (crawlDelay) crawlerOptions.requestHandlerTimeoutMillis = crawlDelay * 1e3;
  if (driver === "playwright") {
    const { PlaywrightCrawler: PlaywrightCrawlerClass } = await import("crawlee");
    crawler = new PlaywrightCrawlerClass(crawlerOptions);
  } else crawler = new HttpCrawler(crawlerOptions);
  const initialRequests = startingUrls.map((url) => ({
    url,
    userData: { depth: 0 }
  }));
  progress.crawling.status = "processing";
  progress.crawling.total = startingUrls.length;
  onProgress?.(progress);
  await crawler.run(initialRequests);
  progress.crawling.status = "completed";
  onProgress?.(progress);
  if (results.some((r$1) => r$1.success)) {
    progress.generation.status = "generating";
    onProgress?.(progress);
    const successfulResults = results.filter((r$1) => r$1.success);
    const firstUrl = new URL(withHttps(urls[0]));
    const homePageResult = successfulResults.find((r$1) => {
      const resultUrl = new URL(withHttps(r$1.url));
      const homeUrl = new URL(withHttps(urls[0]));
      return resultUrl.href === homeUrl.href;
    });
    const siteName = siteNameOverride || homePageResult?.metadata?.title || firstUrl.hostname;
    const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
    if (generateLlmsTxt || generateLlmsFullTxt) {
      progress.generation.current = "Generating llms.txt files";
      onProgress?.(progress);
      const contentResults = successfulResults.filter((result) => result.content && result.content.trim().length > 0);
      const processedFiles = contentResults.map((result) => ({
        filePath: result.filePath,
        title: result.title,
        content: result.content,
        url: result.url,
        metadata: result.metadata
      }));
      const llmsResult = await generateLlmsTxtArtifacts({
        files: processedFiles,
        siteName,
        description,
        origin: origin || firstUrl.origin,
        generateFull: generateLlmsFullTxt
      });
      if (generateLlmsTxt) {
        progress.generation.current = "Writing llms.txt";
        onProgress?.(progress);
        await writeFile(join(outputDir, "llms.txt"), llmsResult.llmsTxt, "utf-8");
      }
      if (generateLlmsFullTxt && llmsResult.llmsFullTxt) {
        progress.generation.current = "Writing llms-full.txt";
        onProgress?.(progress);
        await writeFile(join(outputDir, "llms-full.txt"), llmsResult.llmsFullTxt, "utf-8");
      }
    }
    progress.generation.status = "completed";
    onProgress?.(progress);
  }
  await purgeDefaultStorages();
  return results;
}

//#endregion
export { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps };
package/dist/cli.d.mts
ADDED
@@ -0,0 +1 @@
export { };
package/dist/cli.mjs
ADDED
@@ -0,0 +1,458 @@
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-NJU1Dyc-.mjs";
import { readFileSync } from "node:fs";
import { dirname, join, resolve } from "pathe";
import { fileURLToPath } from "node:url";
import * as p$1 from "@clack/prompts";
import * as p from "@clack/prompts";
import { addDependency } from "nypm";

//#region src/playwright-utils.ts
async function checkPlaywrightInstallation() {
  try {
    await import("playwright");
    return true;
  } catch {
    return false;
  }
}
async function promptPlaywrightInstall() {
  const shouldInstall = await p$1.confirm({
    message: "Playwright is required for the Playwright driver. Install it now?",
    initialValue: true
  });
  if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
  const s = p$1.spinner();
  s.start("Installing Playwright...");
  try {
    await addDependency("playwright", { workspace: true });
    s.stop("Playwright installed successfully!");
    return true;
  } catch {
    try {
      await addDependency("playwright");
      s.stop("Playwright installed successfully!");
      return true;
    } catch (fallbackError) {
      s.stop("Failed to install Playwright");
      p$1.log.error(`Installation failed: ${fallbackError}`);
      return false;
    }
  }
}
async function ensurePlaywrightInstalled() {
  const isInstalled = await checkPlaywrightInstallation();
  if (isInstalled) return true;
  p$1.log.warn("Playwright driver selected but Playwright is not installed.");
  const installed = await promptPlaywrightInstall();
  if (!installed) {
    p$1.log.error("Cannot proceed with Playwright driver without Playwright installed.");
    return false;
  }
  return true;
}

//#endregion
//#region src/cli.ts
const __dirname = dirname(fileURLToPath(import.meta.url));
const packageJsonPath = join(__dirname, "..", "package.json");
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
const version = packageJson.version;
async function interactiveCrawl() {
  console.clear();
  p.intro("☁️ @mdream/crawl");
  const urlsInput = await p.text({
    message: "Enter starting URL for crawling (supports glob patterns):",
    placeholder: "e.g. docs.example.com, site.com/docs/**",
    validate: (value) => {
      if (!value) return "Please enter at least one URL";
      const urls$1 = value.split(",").map((url) => url.trim());
      for (const url of urls$1) {
        const globError = validateGlobPattern(url);
        if (globError) return globError;
        try {
          const parsed = parseUrlPattern(url);
          if (!parsed.isGlob) try {
            new URL(withHttps(url));
          } catch {
            return `Invalid URL: ${withHttps(url)}`;
          }
        } catch (error) {
          return error instanceof Error ? error.message : "Invalid URL pattern";
        }
      }
    }
  });
  if (p.isCancel(urlsInput)) {
    p.cancel("Operation cancelled.");
    return null;
  }
  const urls = urlsInput.split(",").map((url) => url.trim());
  let globPatterns;
  try {
    globPatterns = urls.map(parseUrlPattern);
  } catch (error) {
    p.cancel(error instanceof Error ? error.message : "Invalid URL pattern");
    return null;
  }
  const outputDir = ".";
  const crawlerOptions = await p.group({
    driver: () => p.select({
      message: "Select crawler driver:",
      options: [{
        value: "http",
        label: "HTTP Crawler (Fast, for static content)",
        hint: "Recommended"
      }, {
        value: "playwright",
        label: "Playwright (Slower, supports JavaScript)"
      }],
      initialValue: "http"
    }),
    maxDepth: () => p.text({
      message: "Clicks to page (crawl depth):",
      placeholder: "3",
      defaultValue: "3",
      validate: (value) => {
        const num = Number.parseInt(value);
        if (Number.isNaN(num) || num < 1 || num > 10) return "Depth must be between 1 and 10";
      }
    })
  }, { onCancel: () => {
    p.cancel("Operation cancelled.");
    process.exit(0);
  } });
  const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
    message: "Select output formats:",
    options: [
      {
        value: "llms.txt",
        label: "llms.txt (basic format)",
        hint: "Recommended"
      },
      {
        value: "llms-full.txt",
        label: "llms-full.txt (extended format)"
      },
      {
        value: "markdown",
        label: "Individual Markdown files"
      }
    ],
    initialValues: [
      "llms.txt",
      "llms-full.txt",
      "markdown"
    ]
  }) }, { onCancel: () => {
    p.cancel("Operation cancelled.");
    process.exit(0);
  } });
  const firstUrl = urls[0];
  const inferredOrigin = (() => {
    try {
      const url = new URL(withHttps(firstUrl));
      return `${url.protocol}//${url.host}`;
    } catch {
      return void 0;
    }
  })();
  const outputFormats = advancedOptions.outputFormats.map((f) => {
    switch (f) {
      case "llms.txt": return "llms.txt";
      case "llms-full.txt": return "llms-full.txt";
      case "markdown": return "Individual MD files";
      default: return f;
    }
  });
  const summary = [
    `URLs: ${urls.join(", ")}`,
    `Output: ${outputDir}`,
    `Driver: ${crawlerOptions.driver}`,
    `Max pages: Unlimited`,
    `Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
    `Output formats: ${outputFormats.join(", ")}`,
    `Sitemap discovery: Automatic`,
    inferredOrigin && `Origin: ${inferredOrigin}`
  ].filter(Boolean);
  p.note(summary.join("\n"), "Crawl Configuration");
  const shouldProceed = await p.confirm({
    message: "Start crawling?",
    initialValue: true
  });
  if (p.isCancel(shouldProceed) || !shouldProceed) {
    p.cancel("Crawl cancelled.");
    return null;
  }
  return {
    urls,
    outputDir: resolve(outputDir),
    driver: crawlerOptions.driver,
    maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
    followLinks: true,
    maxDepth: Number.parseInt(crawlerOptions.maxDepth),
    generateLlmsTxt: advancedOptions.outputFormats.includes("llms.txt"),
    generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
    generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
    origin: inferredOrigin,
    globPatterns
  };
}
async function showCrawlResults(successful, failed, outputDir, generatedFiles) {
  const messages = [];
  if (successful > 0) messages.push(`✅ ${successful} pages processed successfully`);
  if (failed > 0) messages.push(`❌ ${failed} pages failed`);
  if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
  messages.push(`📁 Output: ${outputDir}`);
  p.note(messages.join("\n"), "Crawl Results");
  if (successful > 0) p.outro("🎉 Crawling completed successfully!");
  else p.outro("❌ Crawling failed - no pages processed");
}
function parseCliArgs() {
  const args = process.argv.slice(2);
  if (args.includes("--help") || args.includes("-h")) {
    console.log(`
@mdream/crawl v${version}

Multi-page website crawler that generates comprehensive llms.txt files

Usage:
  @mdream/crawl [options] <url>  Crawl a website with CLI flags
  @mdream/crawl                  Start interactive mode

Options:
  -u, --url <url>              Website URL to crawl
  -o, --output <dir>           Output directory (default: .)
  -d, --depth <number>         Crawl depth (default: 3)
  --driver <http|playwright>   Crawler driver (default: http)
  --artifacts <list>           Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
  --origin <url>               Origin URL for resolving relative paths (overrides auto-detection)
  --site-name <name>           Override site name (overrides auto-extracted title)
  --description <desc>         Override site description (overrides auto-extracted description)
  --max-pages <number>         Maximum pages to crawl (default: unlimited)
  --crawl-delay <seconds>      Crawl delay in seconds
  --exclude <pattern>          Exclude URLs matching glob patterns (can be used multiple times)
  -h, --help                   Show this help message
  --version                    Show version number

Note: Sitemap discovery and robots.txt checking are automatic

Examples:
  @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
  @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
  @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
`);
    process.exit(0);
  }
  if (args.includes("--version")) {
    console.log(`@mdream/crawl v${version}`);
    process.exit(0);
  }
  if (args.length === 0) return null;
  const getArgValue = (flag) => {
    const index = args.findIndex((arg) => arg === flag || arg === flag.replace("--", "-"));
    return index >= 0 && index + 1 < args.length ? args[index + 1] : void 0;
  };
  const getArgValues = (flag) => {
    const values = [];
    for (let i = 0; i < args.length; i++) if (args[i] === flag || args[i] === flag.replace("--", "-")) {
      if (i + 1 < args.length && !args[i + 1].startsWith("-")) values.push(args[i + 1]);
    }
    return values;
  };
  const urlFromFlag = getArgValue("--url") || getArgValue("-u");
  const urlFromArgs = args.find((arg) => !arg.startsWith("-") && !args[args.indexOf(arg) - 1]?.startsWith("-"));
  const url = urlFromFlag || urlFromArgs;
  if (!url) {
    p.log.error("Error: URL is required when using CLI arguments");
    p.log.info("Use --help for usage information or run without arguments for interactive mode");
    process.exit(1);
  }
  const globError = validateGlobPattern(url);
  if (globError) {
    p.log.error(`Error: ${globError}`);
    process.exit(1);
  }
  let parsed;
  try {
    parsed = parseUrlPattern(url);
  } catch (error) {
    p.log.error(`Error: ${error instanceof Error ? error.message : "Invalid URL pattern"}`);
    process.exit(1);
  }
  if (!parsed.isGlob) try {
    new URL(withHttps(url));
  } catch {
    p.log.error(`Error: Invalid URL: ${withHttps(url)}`);
    process.exit(1);
  }
  const excludePatterns = getArgValues("--exclude");
  for (const pattern of excludePatterns) {
    const excludeError = validateGlobPattern(pattern);
    if (excludeError) {
      p.log.error(`Error in exclude pattern: ${excludeError}`);
      process.exit(1);
    }
  }
  const depthStr = getArgValue("--depth") || getArgValue("-d") || "3";
  const depth = Number.parseInt(depthStr);
  if (Number.isNaN(depth) || depth < 1 || depth > 10) {
    p.log.error("Error: Depth must be between 1 and 10");
    process.exit(1);
  }
  const driver = getArgValue("--driver");
  if (driver && driver !== "http" && driver !== "playwright") {
    p.log.error("Error: Driver must be either \"http\" or \"playwright\"");
    process.exit(1);
  }
  const maxPagesStr = getArgValue("--max-pages");
  if (maxPagesStr) {
    const maxPages = Number.parseInt(maxPagesStr);
    if (Number.isNaN(maxPages) || maxPages < 1) {
      p.log.error("Error: Max pages must be a positive number");
      process.exit(1);
    }
  }
  const crawlDelayStr = getArgValue("--crawl-delay");
  if (crawlDelayStr) {
    const crawlDelay = Number.parseInt(crawlDelayStr);
    if (Number.isNaN(crawlDelay) || crawlDelay < 0) {
      p.log.error("Error: Crawl delay must be a non-negative number");
      process.exit(1);
    }
  }
  const artifactsStr = getArgValue("--artifacts");
  const artifacts = artifactsStr ? artifactsStr.split(",").map((a) => a.trim()) : [
    "llms.txt",
    "llms-full.txt",
    "markdown"
  ];
  const validArtifacts = [
    "llms.txt",
    "llms-full.txt",
    "markdown"
  ];
  for (const artifact of artifacts) if (!validArtifacts.includes(artifact)) {
    p.log.error(`Error: Invalid artifact '${artifact}'. Valid options: ${validArtifacts.join(", ")}`);
    process.exit(1);
  }
  const originOverride = getArgValue("--origin");
  const inferredOrigin = (() => {
    if (originOverride) return originOverride;
    try {
      const urlObj = new URL(withHttps(url));
      return `${urlObj.protocol}//${urlObj.host}`;
    } catch {
      return void 0;
    }
  })();
  const siteNameOverride = getArgValue("--site-name");
  const descriptionOverride = getArgValue("--description");
  const patterns = [parsed];
  return {
    urls: [url],
    outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "."),
    driver: driver || "http",
    maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
    followLinks: true,
    maxDepth: depth,
    generateLlmsTxt: artifacts.includes("llms.txt"),
    generateLlmsFullTxt: artifacts.includes("llms-full.txt"),
    generateIndividualMd: artifacts.includes("markdown"),
    siteNameOverride,
    descriptionOverride,
    origin: inferredOrigin,
    globPatterns: patterns,
    crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
    exclude: excludePatterns.length > 0 ? excludePatterns : void 0
  };
}
async function main() {
  const cliOptions = parseCliArgs();
  let options;
  if (cliOptions) {
    options = cliOptions;
    p.intro(`☁️ mdream v${version}`);
    const formats = [];
    if (options.generateLlmsTxt) formats.push("llms.txt");
    if (options.generateLlmsFullTxt) formats.push("llms-full.txt");
    if (options.generateIndividualMd) formats.push("Individual MD files");
    const summary = [
      `URL: ${options.urls.join(", ")}`,
      `Output: ${options.outputDir}`,
      `Driver: ${options.driver}`,
      `Depth: ${options.maxDepth}`,
      `Formats: ${formats.join(", ")}`,
      options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`
    ].filter(Boolean);
    p.note(summary.join("\n"), "Configuration");
  } else options = await interactiveCrawl();
  if (!options) process.exit(0);
  if (options.driver === "playwright") {
    const playwrightInstalled = await ensurePlaywrightInstalled();
    if (!playwrightInstalled) {
      p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
      process.exit(1);
    }
  }
  const s = p.spinner();
  s.start("Starting crawl...");
  const results = await crawlAndGenerate(options, (progress) => {
    if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
    else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
    else if (progress.crawling.status === "processing") {
      const processedCount = progress.crawling.processed;
      const totalCount = progress.crawling.total;
      const currentUrl = progress.crawling.currentUrl;
      if (currentUrl) {
        const shortUrl = currentUrl.length > 60 ? `${currentUrl.substring(0, 57)}...` : currentUrl;
        if (processedCount > totalCount) s.message(`Crawling ${processedCount}: ${shortUrl}`);
        else s.message(`Crawling ${processedCount}/${totalCount}: ${shortUrl}`);
      } else if (processedCount > totalCount) s.message(`Crawling... ${processedCount} pages`);
      else s.message(`Crawling... ${processedCount}/${totalCount} pages`);
    } else if (progress.generation.status === "generating") {
      const current = progress.generation.current || "Generating files";
      s.message(current);
    }
  });
  s.stop("Crawl completed!");
  const successful = results.filter((r) => r.success).length;
  const failed = results.filter((r) => !r.success).length;
  const failedResults = results.filter((r) => !r.success);
  if (failed > 0 && cliOptions) {
    p.log.error("Failed URLs:");
    failedResults.forEach((result) => {
      p.log.error(`  ${result.url}: ${result.error || "Unknown error"}`);
    });
  } else if (failed > 0) {
    console.log("\nFailed URLs:");
    failedResults.forEach((result) => {
      console.log(`  - ${result.url}: ${result.error || "Unknown error"}`);
    });
  }
  const generatedFiles = [];
  if (successful > 0) {
    if (options.generateLlmsTxt) generatedFiles.push("llms.txt");
    if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
    if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
  }
  if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles);
  else {
    const messages = [];
    if (successful > 0) messages.push(`✅ ${successful} pages processed`);
    if (failed > 0) messages.push(`❌ ${failed} pages failed`);
    if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
    messages.push(`📁 Output: ${options.outputDir}`);
    p.note(messages.join("\n"), "Results");
    if (successful > 0) p.outro("🎉 Crawling completed!");
    else {
      p.outro("❌ Crawling failed - no pages processed");
      process.exit(1);
    }
  }
}
main().catch((error) => {
  p.log.error(`Unexpected error: ${error}`);
  process.exit(1);
});

//#endregion
package/dist/index.d.mts
ADDED
@@ -0,0 +1,74 @@
//#region src/types.d.ts
interface CrawlOptions {
  urls: string[];
  outputDir: string;
  maxRequestsPerCrawl?: number;
  generateLlmsTxt?: boolean;
  generateLlmsFullTxt?: boolean;
  generateIndividualMd?: boolean;
  origin?: string;
  chunkSize?: number;
  driver?: 'http' | 'playwright';
  followLinks?: boolean;
  maxDepth?: number;
  globPatterns?: ParsedUrlPattern[];
  crawlDelay?: number;
  exclude?: string[];
  siteNameOverride?: string;
  descriptionOverride?: string;
}
interface ParsedUrlPattern {
  baseUrl: string;
  pattern: string;
  isGlob: boolean;
}
interface PageMetadata {
  title: string;
  description?: string;
  keywords?: string;
  author?: string;
  links: string[];
}
interface CrawlResult {
  url: string;
  title: string;
  content: string;
  filePath?: string;
  timestamp: number;
  success: boolean;
  error?: string;
  metadata?: PageMetadata;
  depth?: number;
}
interface LlmsTxtOptions {
  siteName: string;
  description?: string;
  results: CrawlResult[];
  outputPath: string;
}
//#endregion
//#region src/crawl.d.ts
interface CrawlProgress {
  sitemap: {
    status: 'discovering' | 'processing' | 'completed';
    found: number;
    processed: number;
  };
  crawling: {
    status: 'starting' | 'processing' | 'completed';
    total: number;
    processed: number;
    currentUrl?: string;
  };
  generation: {
    status: 'idle' | 'generating' | 'completed';
    current?: string;
  };
}
declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
//#endregion
//#region src/llms-txt.d.ts
declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
//#endregion
export { CrawlOptions, CrawlResult, LlmsTxtOptions, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/dist/index.mjs
ADDED
@@ -0,0 +1,66 @@
import { crawlAndGenerate } from "./_chunks/crawl-NJU1Dyc-.mjs";
import { writeFile } from "node:fs/promises";
import { basename, sep } from "pathe";

//#region src/llms-txt.ts
async function generateLlmsTxt(options) {
  const { siteName, description, results, outputPath } = options;
  let content = `# ${siteName}\n\n`;
  if (description) content += `> ${description}\n\n`;
  if (results.length > 0) {
    content += `## Pages\n\n`;
    for (const result of results) {
      let title;
      try {
        title = result.title || new URL(result.url).pathname;
      } catch {
        title = result.title || result.url;
      }
      if (result.filePath) {
        const mdSeparator = `${sep}md${sep}`;
        const mdIndex = result.filePath.indexOf(mdSeparator);
        const relativePath = mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath);
        const linkPath = relativePath.split(sep).join("/");
        content += `- [${title}](md/${linkPath}): ${result.url}\n`;
      } else {
        const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
        content += `- [${title}](${result.url})${description$1 ? `: ${description$1}` : ""}\n`;
      }
    }
  }
  await writeFile(outputPath, content, "utf-8");
}
async function generateLlmsFullTxt(options) {
  const { siteName, description, results, outputPath } = options;
  let content = `# ${siteName}\n\n`;
  if (description) content += `> ${description}\n\n`;
  if (results.length > 0) {
    content += `## Table of Contents\n\n`;
    for (const result of results) {
      let title;
      try {
        title = result.title || new URL(result.url).pathname;
      } catch {
        title = result.title || result.url;
      }
      const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
      content += `- [${title}](#${anchor})\n`;
    }
    content += `\n---\n\n`;
    for (const result of results) {
      let title;
      try {
        title = result.title || new URL(result.url).pathname;
      } catch {
        title = result.title || result.url;
      }
      content += `## ${title}\n\n`;
      content += `**URL:** ${result.url}\n\n`;
      content += `${result.content}\n\n---\n\n`;
    }
  }
  await writeFile(outputPath, content, "utf-8");
}

//#endregion
export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/package.json
ADDED
@@ -0,0 +1,62 @@
{
  "name": "@mdream/crawl",
  "type": "module",
  "version": "0.7.0",
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
  "author": {
    "name": "Harlan Wilton",
    "email": "harlan@harlanzw.com",
    "url": "https://harlanzw.com/"
  },
  "license": "MIT",
  "exports": {
    ".": {
      "types": "./dist/index.d.mts",
      "import": {
        "types": "./dist/index.d.mts",
        "default": "./dist/index.mjs"
      },
      "default": "./dist/index.mjs"
    },
    "./cli": {
      "types": "./dist/cli.d.mts",
      "import": {
        "types": "./dist/cli.d.mts",
        "default": "./dist/cli.mjs"
      },
      "default": "./dist/cli.mjs"
    }
  },
  "main": "./dist/index.mjs",
  "types": "./dist/index.d.mts",
  "bin": {
    "@mdream/crawl": "./bin/mdream-crawl.mjs"
  },
  "files": [
    "bin",
    "dist"
  ],
  "peerDependencies": {
    "playwright": "^1.53.2"
  },
  "peerDependenciesMeta": {
    "playwright": {
      "optional": true
    }
  },
  "dependencies": {
    "@clack/prompts": "^0.11.0",
    "crawlee": "^3.13.9",
    "minimatch": "^10.0.3",
    "nypm": "^0.6.0",
    "pathe": "^2.0.3",
    "mdream": "0.7.0"
  },
  "scripts": {
    "build": "obuild",
    "typecheck": "tsc --noEmit",
    "dev:prepare": "obuild --stub",
    "test": "vitest test",
    "test:attw": "attw --pack"
  }
}