@mdream/crawl 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.md ADDED
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Harlan Wilton
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,102 @@
1
+ # @mdream/crawl
2
+
3
+ Multi-page website crawler that follows internal links, converts each page to Markdown with mdream, and generates comprehensive llms.txt files for entire websites.
4
+
5
+ > **Note**: For single-page HTML-to-Markdown conversion, use the [`mdream`](../mdream) binary instead. `@mdream/crawl` is specifically designed for crawling entire websites with multiple pages.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ npm install @mdream/crawl
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ Simply run the command to start the interactive multi-page website crawler:
16
+
17
+ ```bash
18
+ npx @mdream/crawl
19
+ ```
20
+
21
+ The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
22
+ - ✨ Beautiful prompts powered by Clack
23
+ - 🎯 Step-by-step configuration guidance
24
+ - ✅ Input validation and helpful hints
25
+ - 📋 Configuration summary before crawling
26
+ - 🎉 Clean result display with progress indicators
27
+ - 🧹 Automatic cleanup of crawler storage
28
+
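+ For scripted runs, the CLI also accepts flags (run `npx @mdream/crawl --help` for the full list). The examples below are adapted from the help output shipped in `dist/cli.mjs`; the domains are placeholders:
+
+ ```bash
+ # Crawl a docs site two levels deep and only emit llms-full.txt
+ npx @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
+
+ # Skip admin and API routes while crawling
+ npx @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
+ ```
+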
29
+ ## Programmatic Usage
30
+
31
+ You can also use @mdream/crawl programmatically in your Node.js applications:
32
+
33
+ ```typescript
34
+ import { crawlAndGenerate, generateLlmsTxt } from '@mdream/crawl'
35
+
36
+ // Crawl entire websites programmatically
37
+ const results = await crawlAndGenerate({
38
+ urls: ['https://docs.example.com'], // Starting URLs for website crawling
39
+ outputDir: './output',
40
+ maxRequestsPerCrawl: 100, // Maximum pages per website
41
+ generateLlmsTxt: true,
42
+ followLinks: true, // Follow internal links to crawl entire site
43
+ maxDepth: 3, // How deep to follow links
44
+ driver: 'http' // or 'playwright' for JS-heavy sites
46
+ })
47
+
48
+ // Generate llms.txt manually from existing results
49
+ await generateLlmsTxt({
50
+ siteName: 'Example Site',
51
+ description: 'Documentation for Example Site',
52
+ results,
53
+ outputPath: './output/llms.txt'
54
+ })
55
+ ```
56
+
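+ `crawlAndGenerate` also accepts an optional progress callback as its second argument and resolves to an array of per-page results (see the `CrawlProgress` and `CrawlResult` types in `dist/index.d.mts`). A minimal sketch, reusing the option names from the example above:
+
+ ```typescript
+ import { crawlAndGenerate } from '@mdream/crawl'
+
+ const results = await crawlAndGenerate(
+   {
+     urls: ['https://docs.example.com'], // placeholder starting URL
+     outputDir: './output',
+     followLinks: true,
+     maxDepth: 3
+   },
+   (progress) => {
+     // Report crawl progress as pages are processed
+     if (progress.crawling.status === 'processing')
+       console.log(`Crawled ${progress.crawling.processed}/${progress.crawling.total} pages`)
+   }
+ )
+
+ // Summarize the run and log any failed pages
+ const failed = results.filter(r => !r.success)
+ console.log(`Done: ${results.length - failed.length} ok, ${failed.length} failed`)
+ for (const r of failed)
+   console.error(`  ${r.url}: ${r.error ?? 'Unknown error'}`)
+ ```
+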
57
+ ## Output
58
+
59
+ The crawler generates comprehensive output from entire websites:
60
+
61
+ 1. **Markdown files** - One `.md` file per crawled page with clean markdown content
62
+ 2. **llms.txt** - Comprehensive site overview file following the [llms.txt specification](https://llmstxt.org/)
+ 3. **llms-full.txt** - Optional extended file that also inlines the full Markdown content of each page
63
+
64
+ ### Example llms.txt output
65
+
66
+ ```markdown
67
+ # example.com
68
+
69
+ ## Pages
70
+
71
+ - [Example Domain](https---example-com-.md): https://example.com/
72
+ - [About Us](https---example-com-about.md): https://example.com/about
73
+ ```
74
+
75
+ ## Features
76
+
77
+ - ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
78
+ - ✅ **Interactive by Default**: Run with no arguments for guided prompts, or pass CLI flags for scripted runs
79
+ - ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites
80
+ - ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
81
+ - ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
82
+ - ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
83
+ - ✅ **Configurable Depth Crawling**: Follow links with customizable depth limits (1-10 levels)
84
+ - ✅ **Clean Markdown Conversion**: Powered by mdream's HTML-to-Markdown engine
85
+ - ✅ **Performance Optimized**: HTTP crawler is 5-10x faster than browser-based crawling
86
+ - ✅ **Beautiful Output**: Clean result display with progress indicators
87
+ - ✅ **Automatic Cleanup**: Purges crawler storage after completion
88
+ - ✅ **TypeScript Support**: Full type definitions with excellent IDE support
89
+
90
+ ## Use Cases
91
+
92
+ Perfect for:
93
+ - 📚 **Documentation Sites**: Crawl entire documentation websites (GitBook, Docusaurus, etc.)
94
+ - 🏢 **Company Websites**: Generate comprehensive site overviews for LLM context
95
+ - 📝 **Blogs**: Process entire blog archives with proper categorization
96
+ - 🔗 **Multi-Page Resources**: Any website where you need all pages, not just one
97
+
98
+ **Not suitable for**: Single-page conversions (use the `mdream` binary instead)
99
+
100
+ ## License
101
+
102
+ MIT
package/bin/mdream-crawl.mjs ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+
3
+ import('../dist/cli.mjs')
package/dist/_chunks/crawl-NJU1Dyc-.mjs ADDED
@@ -0,0 +1,445 @@
1
+ import { existsSync, mkdirSync } from "node:fs";
2
+ import { writeFile } from "node:fs/promises";
3
+ import { HttpCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
4
+ import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
5
+ import { withMinimalPreset } from "mdream/preset/minimal";
6
+ import { dirname, join, normalize, resolve } from "pathe";
7
+ import { minimatch } from "minimatch";
8
+ import { extractionPlugin } from "mdream/plugins";
9
+
10
+ //#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
11
+ const r = String.fromCharCode;
12
+ const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
13
+ function withHttps(input) {
14
+ return withProtocol(input, "https://");
15
+ }
16
+ function withProtocol(input, protocol) {
17
+ let match = input.match(PROTOCOL_REGEX);
18
+ if (!match) match = input.match(/^\/{2,}/);
19
+ if (!match) return protocol + input;
20
+ return protocol + input.slice(match[0].length);
21
+ }
22
+ const protocolRelative = Symbol.for("ufo:protocolRelative");
23
+
24
+ //#endregion
25
+ //#region src/glob-utils.ts
26
+ /**
27
+ * Parse a URL that may contain glob patterns
28
+ * Example: https://nuxtseo.com/docs/** -> { baseUrl: "https://nuxtseo.com", pattern: "/docs/**", isGlob: true }
29
+ */
30
+ function parseUrlPattern(input) {
31
+ const hasGlob = input.includes("*") || input.includes("?") || input.includes("[");
32
+ if (!hasGlob) return {
33
+ baseUrl: input,
34
+ pattern: "",
35
+ isGlob: false
36
+ };
37
+ try {
38
+ const urlWithProtocol = input.startsWith("http") ? input : `https://${input}`;
39
+ const urlWithoutGlob = urlWithProtocol.replace(/\*.*$/, "");
40
+ const url = new URL(urlWithoutGlob);
41
+ const baseUrl = `${url.protocol}//${url.host}`;
42
+ const patternStart = input.indexOf(url.host) + url.host.length;
43
+ const pattern = input.substring(patternStart);
44
+ return {
45
+ baseUrl,
46
+ pattern,
47
+ isGlob: true
48
+ };
49
+ } catch (error) {
50
+ throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
51
+ }
52
+ }
53
+ /**
54
+ * Check if a URL matches a glob pattern
55
+ */
56
+ function matchesGlobPattern(url, parsedPattern) {
57
+ if (!parsedPattern.isGlob) return true;
58
+ try {
59
+ const urlObj = new URL(url);
60
+ const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
61
+ const urlBase = `${urlObj.protocol}//${urlObj.host}`;
62
+ if (urlBase !== parsedPattern.baseUrl) return false;
63
+ return minimatch(urlPath, parsedPattern.pattern);
64
+ } catch {
65
+ return false;
66
+ }
67
+ }
68
+ /**
69
+ * Get the starting URL for crawling from a glob pattern
70
+ * For https://nuxtseo.com/docs/**, we want to start at https://nuxtseo.com
71
+ */
72
+ function getStartingUrl(parsedPattern) {
73
+ if (!parsedPattern.isGlob) return withHttps(parsedPattern.baseUrl);
74
+ const pattern = parsedPattern.pattern;
75
+ const firstGlobIndex = pattern.search(/[*?[]/);
76
+ if (firstGlobIndex === -1) return withHttps(parsedPattern.baseUrl + pattern);
77
+ const beforeGlob = pattern.substring(0, firstGlobIndex);
78
+ const lastSlash = beforeGlob.lastIndexOf("/");
79
+ const pathBeforeGlob = lastSlash >= 0 ? beforeGlob.substring(0, lastSlash + 1) : "/";
80
+ return withHttps(parsedPattern.baseUrl + pathBeforeGlob);
81
+ }
82
+ /**
83
+ * Check if a URL should be excluded based on exclude patterns
84
+ */
85
+ function isUrlExcluded(url, excludePatterns) {
86
+ if (!excludePatterns || excludePatterns.length === 0) return false;
87
+ try {
88
+ const urlObj = new URL(url);
89
+ const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
90
+ return excludePatterns.some((pattern) => {
91
+ if (pattern.includes("://")) {
92
+ const parsedPattern = parseUrlPattern(pattern);
93
+ if (parsedPattern.isGlob) return matchesGlobPattern(url, parsedPattern);
94
+ return url === pattern;
95
+ }
96
+ if (pattern.startsWith("/")) {
97
+ const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
98
+ return minimatch(urlPath, adjustedPattern);
99
+ }
100
+ return minimatch(urlPath, pattern) || minimatch(urlPath.substring(1), pattern);
101
+ });
102
+ } catch {
103
+ return false;
104
+ }
105
+ }
106
+ /**
107
+ * Validate glob pattern syntax
108
+ */
109
+ function validateGlobPattern(pattern) {
110
+ try {
111
+ parseUrlPattern(pattern);
112
+ return void 0;
113
+ } catch (error) {
114
+ return `Invalid glob pattern: ${error instanceof Error ? error.message : error}`;
115
+ }
116
+ }
117
+
118
+ //#endregion
119
+ //#region src/metadata-extractor.ts
120
+ function extractMetadata(html, url) {
121
+ const links = [];
122
+ let title = "";
123
+ let description = "";
124
+ let keywords = "";
125
+ let author = "";
126
+ const extractionPluginInstance = extractionPlugin({
127
+ "a[href]": (element) => {
128
+ const href = element.attributes?.href;
129
+ if (href) try {
130
+ const absoluteUrl = new URL(href, url).href;
131
+ if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
132
+ } catch {}
133
+ },
134
+ "title": (element) => {
135
+ if (!title && element.textContent) title = element.textContent.trim();
136
+ },
137
+ "meta[name=\"description\"]": (element) => {
138
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
139
+ },
140
+ "meta[property=\"og:description\"]": (element) => {
141
+ if (!description && element.attributes?.content) description = element.attributes.content.trim();
142
+ },
143
+ "meta[name=\"keywords\"]": (element) => {
144
+ if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
145
+ },
146
+ "meta[name=\"author\"]": (element) => {
147
+ if (!author && element.attributes?.content) author = element.attributes.content.trim();
148
+ },
149
+ "meta[property=\"og:title\"]": (element) => {
150
+ if (!title && element.attributes?.content) title = element.attributes.content.trim();
151
+ }
152
+ });
153
+ htmlToMarkdown(html, {
154
+ plugins: [extractionPluginInstance],
155
+ origin: new URL(url).origin
156
+ });
157
+ return {
158
+ title: title || new URL(url).pathname,
159
+ description: description || void 0,
160
+ keywords: keywords || void 0,
161
+ author: author || void 0,
162
+ links: links.filter((link) => {
163
+ try {
164
+ const linkUrl = new URL(link);
165
+ const baseUrl = new URL(url);
166
+ return linkUrl.hostname === baseUrl.hostname;
167
+ } catch {
168
+ return false;
169
+ }
170
+ })
171
+ };
172
+ }
173
+
174
+ //#endregion
175
+ //#region src/crawl.ts
176
+ async function crawlAndGenerate(options, onProgress) {
177
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride } = options;
178
+ const outputDir = resolve(normalize(rawOutputDir));
179
+ let patterns;
180
+ try {
181
+ patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
182
+ } catch (error) {
183
+ throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
184
+ }
185
+ let startingUrls = patterns.map(getStartingUrl);
186
+ const progress = {
187
+ sitemap: {
188
+ status: "discovering",
189
+ found: 0,
190
+ processed: 0
191
+ },
192
+ crawling: {
193
+ status: "starting",
194
+ total: 0,
195
+ processed: 0
196
+ },
197
+ generation: { status: "idle" }
198
+ };
199
+ if (startingUrls.length > 0) {
200
+ const baseUrl = new URL(startingUrls[0]).origin;
201
+ const homePageUrl = baseUrl;
202
+ onProgress?.(progress);
203
+ const robotsUrl = new URL("/robots.txt", baseUrl).toString();
204
+ const robotsResponse = await fetch(robotsUrl);
205
+ if (robotsResponse.ok) {
206
+ const robotsContent = await robotsResponse.text();
207
+ const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
208
+ if (sitemapMatches && sitemapMatches.length > 0) {
209
+ progress.sitemap.found = sitemapMatches.length;
210
+ progress.sitemap.status = "processing";
211
+ onProgress?.(progress);
212
+ const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
213
+ for (const sitemapUrl of robotsSitemaps) try {
214
+ const { urls: robotsUrls } = await Sitemap.load(sitemapUrl);
215
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
216
+ if (hasGlobPatterns) {
217
+ const filteredUrls = robotsUrls.filter((url) => {
218
+ return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
219
+ });
220
+ startingUrls = filteredUrls;
221
+ progress.sitemap.processed = filteredUrls.length;
222
+ onProgress?.(progress);
223
+ break;
224
+ } else {
225
+ const filteredUrls = robotsUrls.filter((url) => {
226
+ return !isUrlExcluded(url, exclude);
227
+ });
228
+ if (filteredUrls.length > 0) {
229
+ startingUrls = filteredUrls;
230
+ progress.sitemap.processed = filteredUrls.length;
231
+ onProgress?.(progress);
232
+ break;
233
+ }
234
+ }
235
+ } catch {
236
+ continue;
237
+ }
238
+ }
239
+ }
240
+ try {
241
+ const { urls: sitemapUrls } = await Sitemap.load(`${baseUrl}/sitemap.xml`);
242
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
243
+ if (hasGlobPatterns) {
244
+ const filteredUrls = sitemapUrls.filter((url) => {
245
+ return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
246
+ });
247
+ startingUrls = filteredUrls;
248
+ progress.sitemap.found = sitemapUrls.length;
249
+ progress.sitemap.processed = filteredUrls.length;
250
+ onProgress?.(progress);
251
+ } else {
252
+ const filteredUrls = sitemapUrls.filter((url) => {
253
+ return !isUrlExcluded(url, exclude);
254
+ });
255
+ if (filteredUrls.length > 0) {
256
+ startingUrls = filteredUrls;
257
+ progress.sitemap.found = sitemapUrls.length;
258
+ progress.sitemap.processed = filteredUrls.length;
259
+ onProgress?.(progress);
260
+ }
261
+ }
262
+ } catch {
263
+ const commonSitemaps = [
264
+ `${baseUrl}/sitemap_index.xml`,
265
+ `${baseUrl}/sitemaps.xml`,
266
+ `${baseUrl}/sitemap-index.xml`
267
+ ];
268
+ for (const sitemapUrl of commonSitemaps) try {
269
+ const { urls: altUrls } = await Sitemap.load(sitemapUrl);
270
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
271
+ if (hasGlobPatterns) {
272
+ const filteredUrls = altUrls.filter((url) => {
273
+ return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
274
+ });
275
+ startingUrls = filteredUrls;
276
+ progress.sitemap.found = altUrls.length;
277
+ progress.sitemap.processed = filteredUrls.length;
278
+ onProgress?.(progress);
279
+ break;
280
+ } else {
281
+ const filteredUrls = altUrls.filter((url) => {
282
+ return !isUrlExcluded(url, exclude);
283
+ });
284
+ if (filteredUrls.length > 0) {
285
+ startingUrls = filteredUrls;
286
+ progress.sitemap.found = altUrls.length;
287
+ progress.sitemap.processed = filteredUrls.length;
288
+ onProgress?.(progress);
289
+ break;
290
+ }
291
+ }
292
+ } catch {
293
+ continue;
294
+ }
295
+ }
296
+ if (!startingUrls.includes(homePageUrl)) startingUrls.unshift(homePageUrl);
297
+ progress.sitemap.status = "completed";
298
+ progress.crawling.total = startingUrls.length;
299
+ onProgress?.(progress);
300
+ }
301
+ if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
302
+ const results = [];
303
+ const processedUrls = new Set();
304
+ const shouldCrawlUrl = (url) => {
305
+ if (isUrlExcluded(url, exclude)) return false;
306
+ if (!patterns.some((p) => p.isGlob)) return true;
307
+ return patterns.some((pattern) => matchesGlobPattern(url, pattern));
308
+ };
309
+ const createRequestHandler = (crawlerType) => {
310
+ return async ({ request, body, page, enqueueLinks }) => {
311
+ const startTime = Date.now();
312
+ progress.crawling.currentUrl = request.loadedUrl;
313
+ onProgress?.(progress);
314
+ const baseUrl = new URL(startingUrls[0]).origin;
315
+ const homePageUrl = baseUrl;
316
+ let html;
317
+ let title;
318
+ if (crawlerType === "playwright") {
319
+ await page.waitForLoadState("networkidle");
320
+ title = await page.title();
321
+ html = await page.innerHTML("html");
322
+ } else {
323
+ html = typeof body === "string" ? body : body.toString();
324
+ title = "";
325
+ }
326
+ const metadata = extractMetadata(html, request.loadedUrl);
327
+ if (!title) title = metadata.title;
328
+ const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
329
+ let md = "";
330
+ if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: origin || new URL(request.loadedUrl).origin }));
331
+ let filePath;
332
+ if (shouldProcessMarkdown) {
333
+ const urlObj = new URL(request.loadedUrl);
334
+ const urlPath = urlObj.pathname === "/" ? "/index" : urlObj.pathname;
335
+ const pathSegments = urlPath.replace(/\/$/, "").split("/").filter((seg) => seg.length > 0);
336
+ const safeSegments = pathSegments.map((seg) => seg.replace(/[^\w\-]/g, "-"));
337
+ const filename = safeSegments.length > 0 ? safeSegments.join("/") : "index";
338
+ const safeFilename = normalize(`${filename}.md`);
339
+ filePath = join(outputDir, "md", safeFilename);
340
+ if (generateIndividualMd) {
341
+ const fileDir = dirname(filePath);
342
+ if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
343
+ await writeFile(filePath, md, "utf-8");
344
+ }
345
+ }
346
+ const isHomePage = request.loadedUrl === homePageUrl;
347
+ if (shouldProcessMarkdown || isHomePage) {
348
+ const result = {
349
+ url: request.loadedUrl,
350
+ title,
351
+ content: md,
352
+ filePath: generateIndividualMd && shouldProcessMarkdown ? filePath : void 0,
353
+ timestamp: startTime,
354
+ success: true,
355
+ metadata,
356
+ depth: request.userData?.depth || 0
357
+ };
358
+ results.push(result);
359
+ progress.crawling.processed = results.length;
360
+ onProgress?.(progress);
361
+ }
362
+ if (followLinks && (request.userData?.depth || 0) < maxDepth) {
363
+ const currentDepth = (request.userData?.depth || 0) + 1;
364
+ const filteredLinks = metadata.links.filter((link) => {
365
+ return shouldCrawlUrl(link);
366
+ });
367
+ if (enqueueLinks) await enqueueLinks({
368
+ urls: filteredLinks,
369
+ userData: { depth: currentDepth }
370
+ });
371
+ else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
372
+ }
373
+ };
374
+ };
375
+ let crawler;
376
+ const crawlerOptions = {
377
+ requestHandler: createRequestHandler(driver),
378
+ maxRequestsPerCrawl,
379
+ respectRobotsTxtFile: true
380
+ };
381
+ if (crawlDelay) crawlerOptions.requestHandlerTimeoutMillis = crawlDelay * 1e3;
382
+ if (driver === "playwright") {
383
+ const { PlaywrightCrawler: PlaywrightCrawlerClass } = await import("crawlee");
384
+ crawler = new PlaywrightCrawlerClass(crawlerOptions);
385
+ } else crawler = new HttpCrawler(crawlerOptions);
386
+ const initialRequests = startingUrls.map((url) => ({
387
+ url,
388
+ userData: { depth: 0 }
389
+ }));
390
+ progress.crawling.status = "processing";
391
+ progress.crawling.total = startingUrls.length;
392
+ onProgress?.(progress);
393
+ await crawler.run(initialRequests);
394
+ progress.crawling.status = "completed";
395
+ onProgress?.(progress);
396
+ if (results.some((r$1) => r$1.success)) {
397
+ progress.generation.status = "generating";
398
+ onProgress?.(progress);
399
+ const successfulResults = results.filter((r$1) => r$1.success);
400
+ const firstUrl = new URL(withHttps(urls[0]));
401
+ const homePageResult = successfulResults.find((r$1) => {
402
+ const resultUrl = new URL(withHttps(r$1.url));
403
+ const homeUrl = new URL(withHttps(urls[0]));
404
+ return resultUrl.href === homeUrl.href;
405
+ });
406
+ const siteName = siteNameOverride || homePageResult?.metadata?.title || firstUrl.hostname;
407
+ const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
408
+ if (generateLlmsTxt || generateLlmsFullTxt) {
409
+ progress.generation.current = "Generating llms.txt files";
410
+ onProgress?.(progress);
411
+ const contentResults = successfulResults.filter((result) => result.content && result.content.trim().length > 0);
412
+ const processedFiles = contentResults.map((result) => ({
413
+ filePath: result.filePath,
414
+ title: result.title,
415
+ content: result.content,
416
+ url: result.url,
417
+ metadata: result.metadata
418
+ }));
419
+ const llmsResult = await generateLlmsTxtArtifacts({
420
+ files: processedFiles,
421
+ siteName,
422
+ description,
423
+ origin: origin || firstUrl.origin,
424
+ generateFull: generateLlmsFullTxt
425
+ });
426
+ if (generateLlmsTxt) {
427
+ progress.generation.current = "Writing llms.txt";
428
+ onProgress?.(progress);
429
+ await writeFile(join(outputDir, "llms.txt"), llmsResult.llmsTxt, "utf-8");
430
+ }
431
+ if (generateLlmsFullTxt && llmsResult.llmsFullTxt) {
432
+ progress.generation.current = "Writing llms-full.txt";
433
+ onProgress?.(progress);
434
+ await writeFile(join(outputDir, "llms-full.txt"), llmsResult.llmsFullTxt, "utf-8");
435
+ }
436
+ }
437
+ progress.generation.status = "completed";
438
+ onProgress?.(progress);
439
+ }
440
+ await purgeDefaultStorages();
441
+ return results;
442
+ }
443
+
444
+ //#endregion
445
+ export { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps };
package/dist/cli.d.mts ADDED
@@ -0,0 +1 @@
1
+ export { };
package/dist/cli.mjs ADDED
@@ -0,0 +1,458 @@
1
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-NJU1Dyc-.mjs";
2
+ import { readFileSync } from "node:fs";
3
+ import { dirname, join, resolve } from "pathe";
4
+ import { fileURLToPath } from "node:url";
5
+ import * as p$1 from "@clack/prompts";
6
+ import * as p from "@clack/prompts";
7
+ import { addDependency } from "nypm";
8
+
9
+ //#region src/playwright-utils.ts
10
+ async function checkPlaywrightInstallation() {
11
+ try {
12
+ await import("playwright");
13
+ return true;
14
+ } catch {
15
+ return false;
16
+ }
17
+ }
18
+ async function promptPlaywrightInstall() {
19
+ const shouldInstall = await p$1.confirm({
20
+ message: "Playwright is required for the Playwright driver. Install it now?",
21
+ initialValue: true
22
+ });
23
+ if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
24
+ const s = p$1.spinner();
25
+ s.start("Installing Playwright...");
26
+ try {
27
+ await addDependency("playwright", { workspace: true });
28
+ s.stop("Playwright installed successfully!");
29
+ return true;
30
+ } catch {
31
+ try {
32
+ await addDependency("playwright");
33
+ s.stop("Playwright installed successfully!");
34
+ return true;
35
+ } catch (fallbackError) {
36
+ s.stop("Failed to install Playwright");
37
+ p$1.log.error(`Installation failed: ${fallbackError}`);
38
+ return false;
39
+ }
40
+ }
41
+ }
42
+ async function ensurePlaywrightInstalled() {
43
+ const isInstalled = await checkPlaywrightInstallation();
44
+ if (isInstalled) return true;
45
+ p$1.log.warn("Playwright driver selected but Playwright is not installed.");
46
+ const installed = await promptPlaywrightInstall();
47
+ if (!installed) {
48
+ p$1.log.error("Cannot proceed with Playwright driver without Playwright installed.");
49
+ return false;
50
+ }
51
+ return true;
52
+ }
53
+
54
+ //#endregion
55
+ //#region src/cli.ts
56
+ const __dirname = dirname(fileURLToPath(import.meta.url));
57
+ const packageJsonPath = join(__dirname, "..", "package.json");
58
+ const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
59
+ const version = packageJson.version;
60
+ async function interactiveCrawl() {
61
+ console.clear();
62
+ p.intro("☁️ @mdream/crawl");
63
+ const urlsInput = await p.text({
64
+ message: "Enter starting URL for crawling (supports glob patterns):",
65
+ placeholder: "e.g. docs.example.com, site.com/docs/**",
66
+ validate: (value) => {
67
+ if (!value) return "Please enter at least one URL";
68
+ const urls$1 = value.split(",").map((url) => url.trim());
69
+ for (const url of urls$1) {
70
+ const globError = validateGlobPattern(url);
71
+ if (globError) return globError;
72
+ try {
73
+ const parsed = parseUrlPattern(url);
74
+ if (!parsed.isGlob) try {
75
+ new URL(withHttps(url));
76
+ } catch {
77
+ return `Invalid URL: ${withHttps(url)}`;
78
+ }
79
+ } catch (error) {
80
+ return error instanceof Error ? error.message : "Invalid URL pattern";
81
+ }
82
+ }
83
+ }
84
+ });
85
+ if (p.isCancel(urlsInput)) {
86
+ p.cancel("Operation cancelled.");
87
+ return null;
88
+ }
89
+ const urls = urlsInput.split(",").map((url) => url.trim());
90
+ let globPatterns;
91
+ try {
92
+ globPatterns = urls.map(parseUrlPattern);
93
+ } catch (error) {
94
+ p.cancel(error instanceof Error ? error.message : "Invalid URL pattern");
95
+ return null;
96
+ }
97
+ const outputDir = ".";
98
+ const crawlerOptions = await p.group({
99
+ driver: () => p.select({
100
+ message: "Select crawler driver:",
101
+ options: [{
102
+ value: "http",
103
+ label: "HTTP Crawler (Fast, for static content)",
104
+ hint: "Recommended"
105
+ }, {
106
+ value: "playwright",
107
+ label: "Playwright (Slower, supports JavaScript)"
108
+ }],
109
+ initialValue: "http"
110
+ }),
111
+ maxDepth: () => p.text({
112
+ message: "Clicks to page (crawl depth):",
113
+ placeholder: "3",
114
+ defaultValue: "3",
115
+ validate: (value) => {
116
+ const num = Number.parseInt(value);
117
+ if (Number.isNaN(num) || num < 1 || num > 10) return "Depth must be between 1 and 10";
118
+ }
119
+ })
120
+ }, { onCancel: () => {
121
+ p.cancel("Operation cancelled.");
122
+ process.exit(0);
123
+ } });
124
+ const advancedOptions = await p.group({ outputFormats: () => p.multiselect({
125
+ message: "Select output formats:",
126
+ options: [
127
+ {
128
+ value: "llms.txt",
129
+ label: "llms.txt (basic format)",
130
+ hint: "Recommended"
131
+ },
132
+ {
133
+ value: "llms-full.txt",
134
+ label: "llms-full.txt (extended format)"
135
+ },
136
+ {
137
+ value: "markdown",
138
+ label: "Individual Markdown files"
139
+ }
140
+ ],
141
+ initialValues: [
142
+ "llms.txt",
143
+ "llms-full.txt",
144
+ "markdown"
145
+ ]
146
+ }) }, { onCancel: () => {
147
+ p.cancel("Operation cancelled.");
148
+ process.exit(0);
149
+ } });
150
+ const firstUrl = urls[0];
151
+ const inferredOrigin = (() => {
152
+ try {
153
+ const url = new URL(withHttps(firstUrl));
154
+ return `${url.protocol}//${url.host}`;
155
+ } catch {
156
+ return void 0;
157
+ }
158
+ })();
159
+ const outputFormats = advancedOptions.outputFormats.map((f) => {
160
+ switch (f) {
161
+ case "llms.txt": return "llms.txt";
162
+ case "llms-full.txt": return "llms-full.txt";
163
+ case "markdown": return "Individual MD files";
164
+ default: return f;
165
+ }
166
+ });
167
+ const summary = [
168
+ `URLs: ${urls.join(", ")}`,
169
+ `Output: ${outputDir}`,
170
+ `Driver: ${crawlerOptions.driver}`,
171
+ `Max pages: Unlimited`,
172
+ `Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
173
+ `Output formats: ${outputFormats.join(", ")}`,
174
+ `Sitemap discovery: Automatic`,
175
+ inferredOrigin && `Origin: ${inferredOrigin}`
176
+ ].filter(Boolean);
177
+ p.note(summary.join("\n"), "Crawl Configuration");
178
+ const shouldProceed = await p.confirm({
179
+ message: "Start crawling?",
180
+ initialValue: true
181
+ });
182
+ if (p.isCancel(shouldProceed) || !shouldProceed) {
183
+ p.cancel("Crawl cancelled.");
184
+ return null;
185
+ }
186
+ return {
187
+ urls,
188
+ outputDir: resolve(outputDir),
189
+ driver: crawlerOptions.driver,
190
+ maxRequestsPerCrawl: Number.MAX_SAFE_INTEGER,
191
+ followLinks: true,
192
+ maxDepth: Number.parseInt(crawlerOptions.maxDepth),
193
+ generateLlmsTxt: advancedOptions.outputFormats.includes("llms.txt"),
194
+ generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
195
+ generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
196
+ origin: inferredOrigin,
197
+ globPatterns
198
+ };
199
+ }
200
+ async function showCrawlResults(successful, failed, outputDir, generatedFiles) {
201
+ const messages = [];
202
+ if (successful > 0) messages.push(`✅ ${successful} pages processed successfully`);
203
+ if (failed > 0) messages.push(`❌ ${failed} pages failed`);
204
+ if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
205
+ messages.push(`📁 Output: ${outputDir}`);
206
+ p.note(messages.join("\n"), "Crawl Results");
207
+ if (successful > 0) p.outro("🎉 Crawling completed successfully!");
208
+ else p.outro("❌ Crawling failed - no pages processed");
209
+ }
210
+ function parseCliArgs() {
211
+ const args = process.argv.slice(2);
212
+ if (args.includes("--help") || args.includes("-h")) {
213
+ console.log(`
214
+ @mdream/crawl v${version}
215
+
216
+ Multi-page website crawler that generates comprehensive llms.txt files
217
+
218
+ Usage:
219
+ @mdream/crawl [options] <url> Crawl a website with CLI flags
220
+ @mdream/crawl Start interactive mode
221
+
222
+ Options:
223
+ -u, --url <url> Website URL to crawl
224
+ -o, --output <dir> Output directory (default: .)
225
+ -d, --depth <number> Crawl depth (default: 3)
226
+ --driver <http|playwright> Crawler driver (default: http)
227
+ --artifacts <list> Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
228
+ --origin <url> Origin URL for resolving relative paths (overrides auto-detection)
229
+ --site-name <name> Override site name (overrides auto-extracted title)
230
+ --description <desc> Override site description (overrides auto-extracted description)
231
+ --max-pages <number> Maximum pages to crawl (default: unlimited)
232
+ --crawl-delay <seconds> Crawl delay in seconds
233
+ --exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
234
+ -h, --help Show this help message
235
+ --version Show version number
236
+
237
+ Note: Sitemap discovery and robots.txt checking are automatic
238
+
239
+ Examples:
240
+ @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
241
+ @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
242
+ @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
243
+ `);
244
+ process.exit(0);
245
+ }
246
+ if (args.includes("--version")) {
247
+ console.log(`@mdream/crawl v${version}`);
248
+ process.exit(0);
249
+ }
250
+ if (args.length === 0) return null;
251
+ const getArgValue = (flag) => {
252
+ const index = args.findIndex((arg) => arg === flag || arg === flag.replace("--", "-"));
253
+ return index >= 0 && index + 1 < args.length ? args[index + 1] : void 0;
254
+ };
255
+ const getArgValues = (flag) => {
256
+ const values = [];
257
+ for (let i = 0; i < args.length; i++) if (args[i] === flag || args[i] === flag.replace("--", "-")) {
258
+ if (i + 1 < args.length && !args[i + 1].startsWith("-")) values.push(args[i + 1]);
259
+ }
260
+ return values;
261
+ };
262
+ const urlFromFlag = getArgValue("--url") || getArgValue("-u");
263
+ const urlFromArgs = args.find((arg) => !arg.startsWith("-") && !args[args.indexOf(arg) - 1]?.startsWith("-"));
264
+ const url = urlFromFlag || urlFromArgs;
265
+ if (!url) {
266
+ p.log.error("Error: URL is required when using CLI arguments");
267
+ p.log.info("Use --help for usage information or run without arguments for interactive mode");
268
+ process.exit(1);
269
+ }
270
+ const globError = validateGlobPattern(url);
271
+ if (globError) {
272
+ p.log.error(`Error: ${globError}`);
273
+ process.exit(1);
274
+ }
275
+ let parsed;
276
+ try {
277
+ parsed = parseUrlPattern(url);
278
+ } catch (error) {
279
+ p.log.error(`Error: ${error instanceof Error ? error.message : "Invalid URL pattern"}`);
280
+ process.exit(1);
281
+ }
282
+ if (!parsed.isGlob) try {
283
+ new URL(withHttps(url));
284
+ } catch {
285
+ p.log.error(`Error: Invalid URL: ${withHttps(url)}`);
286
+ process.exit(1);
287
+ }
288
+ const excludePatterns = getArgValues("--exclude");
289
+ for (const pattern of excludePatterns) {
290
+ const excludeError = validateGlobPattern(pattern);
291
+ if (excludeError) {
292
+ p.log.error(`Error in exclude pattern: ${excludeError}`);
293
+ process.exit(1);
294
+ }
295
+ }
296
+ const depthStr = getArgValue("--depth") || getArgValue("-d") || "3";
297
+ const depth = Number.parseInt(depthStr);
298
+ if (Number.isNaN(depth) || depth < 1 || depth > 10) {
299
+ p.log.error("Error: Depth must be between 1 and 10");
300
+ process.exit(1);
301
+ }
302
+ const driver = getArgValue("--driver");
303
+ if (driver && driver !== "http" && driver !== "playwright") {
304
+ p.log.error("Error: Driver must be either \"http\" or \"playwright\"");
305
+ process.exit(1);
306
+ }
307
+ const maxPagesStr = getArgValue("--max-pages");
308
+ if (maxPagesStr) {
309
+ const maxPages = Number.parseInt(maxPagesStr);
310
+ if (Number.isNaN(maxPages) || maxPages < 1) {
311
+ p.log.error("Error: Max pages must be a positive number");
312
+ process.exit(1);
313
+ }
314
+ }
315
+ const crawlDelayStr = getArgValue("--crawl-delay");
316
+ if (crawlDelayStr) {
317
+ const crawlDelay = Number.parseInt(crawlDelayStr);
318
+ if (Number.isNaN(crawlDelay) || crawlDelay < 0) {
319
+ p.log.error("Error: Crawl delay must be a non-negative number");
320
+ process.exit(1);
321
+ }
322
+ }
323
+ const artifactsStr = getArgValue("--artifacts");
324
+ const artifacts = artifactsStr ? artifactsStr.split(",").map((a) => a.trim()) : [
325
+ "llms.txt",
326
+ "llms-full.txt",
327
+ "markdown"
328
+ ];
329
+ const validArtifacts = [
330
+ "llms.txt",
331
+ "llms-full.txt",
332
+ "markdown"
333
+ ];
334
+ for (const artifact of artifacts) if (!validArtifacts.includes(artifact)) {
335
+ p.log.error(`Error: Invalid artifact '${artifact}'. Valid options: ${validArtifacts.join(", ")}`);
336
+ process.exit(1);
337
+ }
338
+ const originOverride = getArgValue("--origin");
339
+ const inferredOrigin = (() => {
340
+ if (originOverride) return originOverride;
341
+ try {
342
+ const urlObj = new URL(withHttps(url));
343
+ return `${urlObj.protocol}//${urlObj.host}`;
344
+ } catch {
345
+ return void 0;
346
+ }
347
+ })();
348
+ const siteNameOverride = getArgValue("--site-name");
349
+ const descriptionOverride = getArgValue("--description");
350
+ const patterns = [parsed];
351
+ return {
352
+ urls: [url],
353
+ outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "."),
354
+ driver: driver || "http",
355
+ maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
356
+ followLinks: true,
357
+ maxDepth: depth,
358
+ generateLlmsTxt: artifacts.includes("llms.txt"),
359
+ generateLlmsFullTxt: artifacts.includes("llms-full.txt"),
360
+ generateIndividualMd: artifacts.includes("markdown"),
361
+ siteNameOverride,
362
+ descriptionOverride,
363
+ origin: inferredOrigin,
364
+ globPatterns: patterns,
365
+ crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
366
+ exclude: excludePatterns.length > 0 ? excludePatterns : void 0
367
+ };
368
+ }
369
+ async function main() {
370
+ const cliOptions = parseCliArgs();
371
+ let options;
372
+ if (cliOptions) {
373
+ options = cliOptions;
374
+ p.intro(`☁️ mdream v${version}`);
375
+ const formats = [];
376
+ if (options.generateLlmsTxt) formats.push("llms.txt");
377
+ if (options.generateLlmsFullTxt) formats.push("llms-full.txt");
378
+ if (options.generateIndividualMd) formats.push("Individual MD files");
379
+ const summary = [
380
+ `URL: ${options.urls.join(", ")}`,
381
+ `Output: ${options.outputDir}`,
382
+ `Driver: ${options.driver}`,
383
+ `Depth: ${options.maxDepth}`,
384
+ `Formats: ${formats.join(", ")}`,
385
+ options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`
386
+ ].filter(Boolean);
387
+ p.note(summary.join("\n"), "Configuration");
388
+ } else options = await interactiveCrawl();
389
+ if (!options) process.exit(0);
390
+ if (options.driver === "playwright") {
391
+ const playwrightInstalled = await ensurePlaywrightInstalled();
392
+ if (!playwrightInstalled) {
393
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
394
+ process.exit(1);
395
+ }
396
+ }
397
+ const s = p.spinner();
398
+ s.start("Starting crawl...");
399
+ const results = await crawlAndGenerate(options, (progress) => {
400
+ if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
401
+ else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
402
+ else if (progress.crawling.status === "processing") {
403
+ const processedCount = progress.crawling.processed;
404
+ const totalCount = progress.crawling.total;
405
+ const currentUrl = progress.crawling.currentUrl;
406
+ if (currentUrl) {
407
+ const shortUrl = currentUrl.length > 60 ? `${currentUrl.substring(0, 57)}...` : currentUrl;
408
+ if (processedCount > totalCount) s.message(`Crawling ${processedCount}: ${shortUrl}`);
409
+ else s.message(`Crawling ${processedCount}/${totalCount}: ${shortUrl}`);
410
+ } else if (processedCount > totalCount) s.message(`Crawling... ${processedCount} pages`);
411
+ else s.message(`Crawling... ${processedCount}/${totalCount} pages`);
412
+ } else if (progress.generation.status === "generating") {
413
+ const current = progress.generation.current || "Generating files";
414
+ s.message(current);
415
+ }
416
+ });
417
+ s.stop("Crawl completed!");
418
+ const successful = results.filter((r) => r.success).length;
419
+ const failed = results.filter((r) => !r.success).length;
420
+ const failedResults = results.filter((r) => !r.success);
421
+ if (failed > 0 && cliOptions) {
422
+ p.log.error("Failed URLs:");
423
+ failedResults.forEach((result) => {
424
+ p.log.error(` ${result.url}: ${result.error || "Unknown error"}`);
425
+ });
426
+ } else if (failed > 0) {
427
+ console.log("\nFailed URLs:");
428
+ failedResults.forEach((result) => {
429
+ console.log(` - ${result.url}: ${result.error || "Unknown error"}`);
430
+ });
431
+ }
432
+ const generatedFiles = [];
433
+ if (successful > 0) {
434
+ if (options.generateLlmsTxt) generatedFiles.push("llms.txt");
435
+ if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
436
+ if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
437
+ }
438
+ if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles);
439
+ else {
440
+ const messages = [];
441
+ if (successful > 0) messages.push(`✅ ${successful} pages processed`);
442
+ if (failed > 0) messages.push(`❌ ${failed} pages failed`);
443
+ if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
444
+ messages.push(`📁 Output: ${options.outputDir}`);
445
+ p.note(messages.join("\n"), "Results");
446
+ if (successful > 0) p.outro("🎉 Crawling completed!");
447
+ else {
448
+ p.outro("❌ Crawling failed - no pages processed");
449
+ process.exit(1);
450
+ }
451
+ }
452
+ }
453
+ main().catch((error) => {
454
+ p.log.error(`Unexpected error: ${error}`);
455
+ process.exit(1);
456
+ });
457
+
458
+ //#endregion
package/dist/index.d.mts ADDED
@@ -0,0 +1,74 @@
1
+ //#region src/types.d.ts
2
+ interface CrawlOptions {
3
+ urls: string[];
4
+ outputDir: string;
5
+ maxRequestsPerCrawl?: number;
6
+ generateLlmsTxt?: boolean;
7
+ generateLlmsFullTxt?: boolean;
8
+ generateIndividualMd?: boolean;
9
+ origin?: string;
10
+ chunkSize?: number;
11
+ driver?: 'http' | 'playwright';
12
+ followLinks?: boolean;
13
+ maxDepth?: number;
14
+ globPatterns?: ParsedUrlPattern[];
15
+ crawlDelay?: number;
16
+ exclude?: string[];
17
+ siteNameOverride?: string;
18
+ descriptionOverride?: string;
19
+ }
20
+ interface ParsedUrlPattern {
21
+ baseUrl: string;
22
+ pattern: string;
23
+ isGlob: boolean;
24
+ }
25
+ interface PageMetadata {
26
+ title: string;
27
+ description?: string;
28
+ keywords?: string;
29
+ author?: string;
30
+ links: string[];
31
+ }
32
+ interface CrawlResult {
33
+ url: string;
34
+ title: string;
35
+ content: string;
36
+ filePath?: string;
37
+ timestamp: number;
38
+ success: boolean;
39
+ error?: string;
40
+ metadata?: PageMetadata;
41
+ depth?: number;
42
+ }
43
+ interface LlmsTxtOptions {
44
+ siteName: string;
45
+ description?: string;
46
+ results: CrawlResult[];
47
+ outputPath: string;
48
+ }
49
+ //#endregion
50
+ //#region src/crawl.d.ts
51
+ interface CrawlProgress {
52
+ sitemap: {
53
+ status: 'discovering' | 'processing' | 'completed';
54
+ found: number;
55
+ processed: number;
56
+ };
57
+ crawling: {
58
+ status: 'starting' | 'processing' | 'completed';
59
+ total: number;
60
+ processed: number;
61
+ currentUrl?: string;
62
+ };
63
+ generation: {
64
+ status: 'idle' | 'generating' | 'completed';
65
+ current?: string;
66
+ };
67
+ }
68
+ declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
69
+ //#endregion
70
+ //#region src/llms-txt.d.ts
71
+ declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
72
+ declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
73
+ //#endregion
74
+ export { CrawlOptions, CrawlResult, LlmsTxtOptions, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/dist/index.mjs ADDED
@@ -0,0 +1,66 @@
1
+ import { crawlAndGenerate } from "./_chunks/crawl-NJU1Dyc-.mjs";
2
+ import { writeFile } from "node:fs/promises";
3
+ import { basename, sep } from "pathe";
4
+
5
+ //#region src/llms-txt.ts
6
+ async function generateLlmsTxt(options) {
7
+ const { siteName, description, results, outputPath } = options;
8
+ let content = `# ${siteName}\n\n`;
9
+ if (description) content += `> ${description}\n\n`;
10
+ if (results.length > 0) {
11
+ content += `## Pages\n\n`;
12
+ for (const result of results) {
13
+ let title;
14
+ try {
15
+ title = result.title || new URL(result.url).pathname;
16
+ } catch {
17
+ title = result.title || result.url;
18
+ }
19
+ if (result.filePath) {
20
+ const mdSeparator = `${sep}md${sep}`;
21
+ const mdIndex = result.filePath.indexOf(mdSeparator);
22
+ const relativePath = mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath);
23
+ const linkPath = relativePath.split(sep).join("/");
24
+ content += `- [${title}](md/${linkPath}): ${result.url}\n`;
25
+ } else {
26
+ const description$1 = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
27
+ content += `- [${title}](${result.url})${description$1 ? `: ${description$1}` : ""}\n`;
28
+ }
29
+ }
30
+ }
31
+ await writeFile(outputPath, content, "utf-8");
32
+ }
33
+ async function generateLlmsFullTxt(options) {
34
+ const { siteName, description, results, outputPath } = options;
35
+ let content = `# ${siteName}\n\n`;
36
+ if (description) content += `> ${description}\n\n`;
37
+ if (results.length > 0) {
38
+ content += `## Table of Contents\n\n`;
39
+ for (const result of results) {
40
+ let title;
41
+ try {
42
+ title = result.title || new URL(result.url).pathname;
43
+ } catch {
44
+ title = result.title || result.url;
45
+ }
46
+ const anchor = title.toLowerCase().replace(/[^a-z0-9]/g, "-");
47
+ content += `- [${title}](#${anchor})\n`;
48
+ }
49
+ content += `\n---\n\n`;
50
+ for (const result of results) {
51
+ let title;
52
+ try {
53
+ title = result.title || new URL(result.url).pathname;
54
+ } catch {
55
+ title = result.title || result.url;
56
+ }
57
+ content += `## ${title}\n\n`;
58
+ content += `**URL:** ${result.url}\n\n`;
59
+ content += `${result.content}\n\n---\n\n`;
60
+ }
61
+ }
62
+ await writeFile(outputPath, content, "utf-8");
63
+ }
64
+
65
+ //#endregion
66
+ export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
package/package.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "name": "@mdream/crawl",
3
+ "type": "module",
4
+ "version": "0.7.0",
5
+ "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
+ "author": {
7
+ "name": "Harlan Wilton",
8
+ "email": "harlan@harlanzw.com",
9
+ "url": "https://harlanzw.com/"
10
+ },
11
+ "license": "MIT",
12
+ "exports": {
13
+ ".": {
14
+ "types": "./dist/index.d.mts",
15
+ "import": {
16
+ "types": "./dist/index.d.mts",
17
+ "default": "./dist/index.mjs"
18
+ },
19
+ "default": "./dist/index.mjs"
20
+ },
21
+ "./cli": {
22
+ "types": "./dist/cli.d.mts",
23
+ "import": {
24
+ "types": "./dist/cli.d.mts",
25
+ "default": "./dist/cli.mjs"
26
+ },
27
+ "default": "./dist/cli.mjs"
28
+ }
29
+ },
30
+ "main": "./dist/index.mjs",
31
+ "types": "./dist/index.d.mts",
32
+ "bin": {
33
+ "@mdream/crawl": "./bin/mdream-crawl.mjs"
34
+ },
35
+ "files": [
36
+ "bin",
37
+ "dist"
38
+ ],
39
+ "peerDependencies": {
40
+ "playwright": "^1.53.2"
41
+ },
42
+ "peerDependenciesMeta": {
43
+ "playwright": {
44
+ "optional": true
45
+ }
46
+ },
47
+ "dependencies": {
48
+ "@clack/prompts": "^0.11.0",
49
+ "crawlee": "^3.13.9",
50
+ "minimatch": "^10.0.3",
51
+ "nypm": "^0.6.0",
52
+ "pathe": "^2.0.3",
53
+ "mdream": "0.7.0"
54
+ },
55
+ "scripts": {
56
+ "build": "obuild",
57
+ "typecheck": "tsc --noEmit",
58
+ "dev:prepare": "obuild --stub",
59
+ "test": "vitest test",
60
+ "test:attw": "attw --pack"
61
+ }
62
+ }