@mdream/crawl 0.17.1 → 1.0.0-beta.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,7 +7,7 @@ Multi-page website crawler that generates comprehensive llms.txt files by follow
7
7
  ## Installation
8
8
 
9
9
  ```bash
10
- npm install @mdream/crawl
10
+ npm install @mdream/crawl@beta
11
11
  ```
12
12
 
13
13
  ## Usage
@@ -15,7 +15,7 @@ npm install @mdream/crawl
15
15
  Simply run the command to start the interactive multi-page website crawler:
16
16
 
17
17
  ```bash
18
- npx @mdream/crawl
18
+ npx @mdream/crawl@beta
19
19
  ```
20
20
 
21
21
  The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
@@ -31,7 +31,7 @@ The crawler will automatically discover and follow internal links to crawl entir
31
31
  You can also use @mdream/crawl programmatically in your Node.js applications:
32
32
 
33
33
  ```typescript
34
- import { crawlAndGenerate, generateLlmsTxt } from '@mdream/crawl'
34
+ import { crawlAndGenerate } from '@mdream/crawl'
35
35
 
36
36
  // Crawl entire websites programmatically
37
37
  const results = await crawlAndGenerate({
@@ -44,16 +44,20 @@ const results = await crawlAndGenerate({
44
44
  driver: 'http', // or 'playwright' for JS-heavy sites
45
45
  verbose: true
46
46
  })
47
+ ```
47
48
 
48
- // Generate llms.txt manually from existing results
49
- await generateLlmsTxt({
50
- siteName: 'Example Site',
51
- description: 'Documentation for Example Site',
52
- results: crawlResults,
53
- outputPath: './output/llms.txt'
54
- })
49
+ ### Playwright Driver
50
+
51
+ The default HTTP driver works for most sites. For JavaScript-heavy sites that require a browser, install the optional dependencies:
52
+
53
+ ```bash
54
+ npm install crawlee playwright
55
55
  ```
56
56
 
57
+ Then use `--driver playwright` or `driver: 'playwright'` in the API.
58
+
59
+ > **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.
60
+
57
61
  ## Output
58
62
 
59
63
  The crawler generates comprehensive output from entire websites:
@@ -76,7 +80,7 @@ The crawler generates comprehensive output from entire websites:
76
80
 
77
81
  - ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
78
82
  - ✅ **Purely Interactive**: No complex command-line options to remember
79
- - ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites
83
+ - ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites (requires `crawlee` and `playwright`)
80
84
  - ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
81
85
  - ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
82
86
  - ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
@@ -1,16 +1,17 @@
1
- import { existsSync, mkdirSync } from "node:fs";
2
- import { writeFile } from "node:fs/promises";
1
+ import { mkdirSync } from "node:fs";
2
+ import { mkdir, writeFile } from "node:fs/promises";
3
3
  import * as p from "@clack/prompts";
4
- import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
4
+ import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
5
5
  import { htmlToMarkdown } from "mdream";
6
- import { generateLlmsTxtArtifacts } from "mdream/llms-txt";
7
- import { withMinimalPreset } from "mdream/preset/minimal";
6
+ import { ofetch } from "ofetch";
8
7
  import { dirname, join, normalize, resolve } from "pathe";
9
8
  import { withHttps } from "ufo";
10
9
  import picomatch from "picomatch";
11
- import { extractionPlugin } from "mdream/plugins";
12
10
  //#region src/glob-utils.ts
13
- const GLOB_STRIP_TAIL_RE = /\*.*$/;
11
+ function stripGlobTail(s) {
12
+ const idx = s.indexOf("*");
13
+ return idx === -1 ? s : s.slice(0, idx);
14
+ }
14
15
  const GLOB_CHAR_RE = /[*?[]/;
15
16
  /**
16
17
  * Parse a URL that may contain glob patterns
@@ -23,7 +24,7 @@ function parseUrlPattern(input) {
23
24
  isGlob: false
24
25
  };
25
26
  try {
26
- const urlWithoutGlob = (input.startsWith("http") ? input : `https://${input}`).replace(GLOB_STRIP_TAIL_RE, "");
27
+ const urlWithoutGlob = stripGlobTail(input.startsWith("http") ? input : `https://${input}`);
27
28
  const url = new URL(urlWithoutGlob);
28
29
  const baseUrl = `${url.protocol}//${url.host}`;
29
30
  const patternStart = input.indexOf(url.host) + url.host.length;
@@ -108,123 +109,116 @@ function validateGlobPattern(pattern) {
108
109
  }
109
110
  }
110
111
  //#endregion
111
- //#region src/metadata-extractor.ts
112
- function extractMetadata(html, url) {
113
- const links = [];
112
+ //#region src/crawl.ts
113
+ const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
114
+ const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
115
+ const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
116
+ const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
117
+ const ROBOTS_CRAWL_DELAY_RE = /Crawl-delay:\s*(\d+(?:\.\d+)?)/i;
118
+ const URL_TRAILING_SLASH_RE = /\/$/;
119
+ const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
120
+ const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
121
+ const FETCH_HEADERS = {
122
+ "User-Agent": "mdream-crawler/1.0",
123
+ "Accept": "text/html,application/xhtml+xml,text/markdown"
124
+ };
125
+ const DEFAULT_CONCURRENCY = 20;
126
+ function extractCdataUrl(url) {
127
+ if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) return url.slice(9, -3);
128
+ return url;
129
+ }
130
+ async function loadSitemap(sitemapUrl) {
131
+ const xmlContent = await ofetch(sitemapUrl, {
132
+ headers: FETCH_HEADERS,
133
+ timeout: 1e4,
134
+ responseType: "text",
135
+ retry: 0
136
+ });
137
+ if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
138
+ if (xmlContent.includes("<sitemapindex")) {
139
+ SITEMAP_INDEX_LOC_RE.lastIndex = 0;
140
+ const childSitemaps = [];
141
+ let match;
142
+ while (true) {
143
+ match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
144
+ if (match === null) break;
145
+ childSitemaps.push(extractCdataUrl(match[1]));
146
+ }
147
+ const childResults = await Promise.allSettled(childSitemaps.map((url) => loadSitemap(url)));
148
+ const allUrls = [];
149
+ for (const result of childResults) if (result.status === "fulfilled") allUrls.push(...result.value);
150
+ return allUrls;
151
+ }
152
+ const urls = [];
153
+ SITEMAP_URL_LOC_RE.lastIndex = 0;
154
+ let match;
155
+ while (true) {
156
+ match = SITEMAP_URL_LOC_RE.exec(xmlContent);
157
+ if (match === null) break;
158
+ urls.push(extractCdataUrl(match[1]));
159
+ }
160
+ return urls;
161
+ }
162
+ function extractMetadataInline(parsedUrl) {
163
+ const links = /* @__PURE__ */ new Set();
114
164
  let title = "";
115
165
  let description = "";
116
166
  let keywords = "";
117
167
  let author = "";
118
- htmlToMarkdown(html, {
119
- plugins: [extractionPlugin({
120
- "a[href]": (element) => {
121
- const href = element.attributes?.href;
168
+ const url = parsedUrl.href;
169
+ const originPrefix = `${parsedUrl.origin}/`;
170
+ return {
171
+ extraction: {
172
+ "a[href]": (el) => {
173
+ const href = el.attributes.href;
122
174
  if (href) try {
123
175
  const absoluteUrl = new URL(href, url).href;
124
- if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
176
+ if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
125
177
  } catch {}
126
178
  },
127
- "title": (element) => {
128
- if (!title && element.textContent) title = element.textContent.trim();
179
+ "title": (el) => {
180
+ if (!title) title = el.textContent;
129
181
  },
130
- "meta[name=\"description\"]": (element) => {
131
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
182
+ "meta[name=\"description\"]": (el) => {
183
+ if (!description) description = el.attributes.content || "";
132
184
  },
133
- "meta[property=\"og:description\"]": (element) => {
134
- if (!description && element.attributes?.content) description = element.attributes.content.trim();
185
+ "meta[property=\"og:description\"]": (el) => {
186
+ if (!description) description = el.attributes.content || "";
135
187
  },
136
- "meta[name=\"keywords\"]": (element) => {
137
- if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
188
+ "meta[name=\"keywords\"]": (el) => {
189
+ if (!keywords) keywords = el.attributes.content || "";
138
190
  },
139
- "meta[name=\"author\"]": (element) => {
140
- if (!author && element.attributes?.content) author = element.attributes.content.trim();
191
+ "meta[name=\"author\"]": (el) => {
192
+ if (!author) author = el.attributes.content || "";
141
193
  },
142
- "meta[property=\"og:title\"]": (element) => {
143
- if (!title && element.attributes?.content) title = element.attributes.content.trim();
144
- }
145
- })],
146
- origin: new URL(url).origin
147
- });
148
- return {
149
- title: title || new URL(url).pathname,
150
- description: description || void 0,
151
- keywords: keywords || void 0,
152
- author: author || void 0,
153
- links: links.filter((link) => {
154
- try {
155
- const linkUrl = new URL(link);
156
- const baseUrl = new URL(url);
157
- return linkUrl.hostname === baseUrl.hostname;
158
- } catch {
159
- return false;
194
+ "meta[property=\"og:title\"]": (el) => {
195
+ if (!title) title = el.attributes.content || "";
160
196
  }
197
+ },
198
+ getMetadata: () => ({
199
+ title: title.trim() || parsedUrl.pathname,
200
+ description: description.trim() || void 0,
201
+ keywords: keywords.trim() || void 0,
202
+ author: author.trim() || void 0,
203
+ links: [...links]
161
204
  })
162
205
  };
163
206
  }
164
- //#endregion
165
- //#region src/crawl.ts
166
- const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
167
- const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
168
- const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
169
- const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
170
- const URL_TRAILING_SLASH_RE = /\/$/;
171
- const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
172
- const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
173
- async function loadSitemapWithoutRetries(sitemapUrl) {
174
- const controller = new AbortController();
175
- const timeoutId = setTimeout(() => controller.abort(), 1e4);
176
- try {
177
- const response = await fetch(sitemapUrl, {
178
- signal: controller.signal,
179
- headers: { "User-Agent": "mdream-crawler/1.0" }
180
- });
181
- clearTimeout(timeoutId);
182
- if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
183
- const xmlContent = await response.text();
184
- if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
185
- if (xmlContent.includes("<sitemapindex")) {
186
- SITEMAP_INDEX_LOC_RE.lastIndex = 0;
187
- const childSitemaps = [];
188
- let match;
189
- while (true) {
190
- match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
191
- if (match === null) break;
192
- let url = match[1];
193
- if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
194
- childSitemaps.push(url);
195
- }
196
- const allUrls = [];
197
- for (const childSitemapUrl of childSitemaps) try {
198
- const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
199
- allUrls.push(...childUrls);
200
- } catch (error) {
201
- console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
202
- }
203
- return allUrls;
204
- } else {
205
- const urls = [];
206
- SITEMAP_URL_LOC_RE.lastIndex = 0;
207
- let match;
208
- while (true) {
209
- match = SITEMAP_URL_LOC_RE.exec(xmlContent);
210
- if (match === null) break;
211
- let url = match[1];
212
- if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
213
- urls.push(url);
214
- }
215
- return urls;
216
- }
217
- } catch (error) {
218
- clearTimeout(timeoutId);
219
- if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
220
- throw error;
221
- }
207
+ function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns) {
208
+ if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern)));
209
+ return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude));
210
+ }
211
+ async function runConcurrent(items, concurrency, fn) {
212
+ let idx = 0;
213
+ const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => {
214
+ while (idx < items.length) await fn(items[idx++]);
215
+ });
216
+ await Promise.all(workers);
222
217
  }
223
218
  async function crawlAndGenerate(options, onProgress) {
224
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
219
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
225
220
  const outputDir = resolve(normalize(rawOutputDir));
226
- if (verbose) log.setLevel(log.LEVELS.INFO);
227
- else log.setLevel(log.LEVELS.OFF);
221
+ let crawlDelay = userCrawlDelay;
228
222
  let patterns;
229
223
  try {
230
224
  patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
@@ -232,6 +226,7 @@ async function crawlAndGenerate(options, onProgress) {
232
226
  throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
233
227
  }
234
228
  let startingUrls = patterns.map(getStartingUrl);
229
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
235
230
  const progress = {
236
231
  sitemap: {
237
232
  status: "discovering",
@@ -241,7 +236,14 @@ async function crawlAndGenerate(options, onProgress) {
241
236
  crawling: {
242
237
  status: "starting",
243
238
  total: 0,
244
- processed: 0
239
+ processed: 0,
240
+ failed: 0,
241
+ latency: {
242
+ total: 0,
243
+ min: Infinity,
244
+ max: 0,
245
+ count: 0
246
+ }
245
247
  },
246
248
  generation: { status: "idle" }
247
249
  };
@@ -250,51 +252,46 @@ async function crawlAndGenerate(options, onProgress) {
250
252
  const baseUrl = new URL(startingUrls[0]).origin;
251
253
  const homePageUrl = baseUrl;
252
254
  onProgress?.(progress);
253
- const robotsUrl = new URL("/robots.txt", baseUrl).toString();
254
- const robotsController = new AbortController();
255
- const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
256
- let robotsResponse;
255
+ let robotsContent = null;
257
256
  try {
258
- robotsResponse = await fetch(robotsUrl, {
259
- signal: robotsController.signal,
260
- headers: { "User-Agent": "mdream-crawler/1.0" }
257
+ robotsContent = await ofetch(`${baseUrl}/robots.txt`, {
258
+ headers: FETCH_HEADERS,
259
+ timeout: 1e4,
260
+ responseType: "text",
261
+ retry: 0
261
262
  });
262
- clearTimeout(robotsTimeoutId);
263
- } catch {
264
- clearTimeout(robotsTimeoutId);
265
- robotsResponse = null;
263
+ } catch {}
264
+ if (robotsContent && !crawlDelay) {
265
+ const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
266
+ if (crawlDelayMatch) {
267
+ crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
268
+ p.log(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
269
+ }
266
270
  }
267
- if (robotsResponse?.ok) {
268
- const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
271
+ if (robotsContent) {
272
+ const sitemapMatches = robotsContent.match(ROBOTS_SITEMAP_RE);
269
273
  if (sitemapMatches && sitemapMatches.length > 0) {
270
274
  progress.sitemap.found = sitemapMatches.length;
271
275
  progress.sitemap.status = "processing";
272
276
  onProgress?.(progress);
273
277
  const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
274
278
  for (const sitemapUrl of robotsSitemaps) try {
275
- const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
279
+ const robotsUrls = await loadSitemap(sitemapUrl);
276
280
  sitemapAttempts.push({
277
281
  url: sitemapUrl,
278
282
  success: true
279
283
  });
280
- if (patterns.some((p) => p.isGlob)) {
281
- const filteredUrls = robotsUrls.filter((url) => {
282
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
283
- });
284
+ const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns);
285
+ if (hasGlobPatterns) {
286
+ startingUrls = filteredUrls;
287
+ progress.sitemap.processed = filteredUrls.length;
288
+ onProgress?.(progress);
289
+ break;
290
+ } else if (filteredUrls.length > 0) {
284
291
  startingUrls = filteredUrls;
285
292
  progress.sitemap.processed = filteredUrls.length;
286
293
  onProgress?.(progress);
287
294
  break;
288
- } else {
289
- const filteredUrls = robotsUrls.filter((url) => {
290
- return !isUrlExcluded(url, exclude);
291
- });
292
- if (filteredUrls.length > 0) {
293
- startingUrls = filteredUrls;
294
- progress.sitemap.processed = filteredUrls.length;
295
- onProgress?.(progress);
296
- break;
297
- }
298
295
  }
299
296
  } catch (error) {
300
297
  sitemapAttempts.push({
@@ -308,31 +305,24 @@ async function crawlAndGenerate(options, onProgress) {
308
305
  let mainSitemapProcessed = false;
309
306
  const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
310
307
  try {
311
- const sitemapUrls = await loadSitemapWithoutRetries(mainSitemapUrl);
308
+ const sitemapUrls = await loadSitemap(mainSitemapUrl);
312
309
  sitemapAttempts.push({
313
310
  url: mainSitemapUrl,
314
311
  success: true
315
312
  });
316
- if (patterns.some((p) => p.isGlob)) {
317
- const filteredUrls = sitemapUrls.filter((url) => {
318
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
319
- });
313
+ const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns);
314
+ if (hasGlobPatterns) {
315
+ startingUrls = filteredUrls;
316
+ progress.sitemap.found = sitemapUrls.length;
317
+ progress.sitemap.processed = filteredUrls.length;
318
+ onProgress?.(progress);
319
+ mainSitemapProcessed = true;
320
+ } else if (filteredUrls.length > 0) {
320
321
  startingUrls = filteredUrls;
321
322
  progress.sitemap.found = sitemapUrls.length;
322
323
  progress.sitemap.processed = filteredUrls.length;
323
324
  onProgress?.(progress);
324
325
  mainSitemapProcessed = true;
325
- } else {
326
- const filteredUrls = sitemapUrls.filter((url) => {
327
- return !isUrlExcluded(url, exclude);
328
- });
329
- if (filteredUrls.length > 0) {
330
- startingUrls = filteredUrls;
331
- progress.sitemap.found = sitemapUrls.length;
332
- progress.sitemap.processed = filteredUrls.length;
333
- onProgress?.(progress);
334
- mainSitemapProcessed = true;
335
- }
336
326
  }
337
327
  } catch (error) {
338
328
  sitemapAttempts.push({
@@ -347,31 +337,24 @@ async function crawlAndGenerate(options, onProgress) {
347
337
  `${baseUrl}/sitemap-index.xml`
348
338
  ];
349
339
  for (const sitemapUrl of commonSitemaps) try {
350
- const altUrls = await loadSitemapWithoutRetries(sitemapUrl);
340
+ const altUrls = await loadSitemap(sitemapUrl);
351
341
  sitemapAttempts.push({
352
342
  url: sitemapUrl,
353
343
  success: true
354
344
  });
355
- if (patterns.some((p) => p.isGlob)) {
356
- const filteredUrls = altUrls.filter((url) => {
357
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
358
- });
345
+ const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns);
346
+ if (hasGlobPatterns) {
347
+ startingUrls = filteredUrls;
348
+ progress.sitemap.found = altUrls.length;
349
+ progress.sitemap.processed = filteredUrls.length;
350
+ onProgress?.(progress);
351
+ break;
352
+ } else if (filteredUrls.length > 0) {
359
353
  startingUrls = filteredUrls;
360
354
  progress.sitemap.found = altUrls.length;
361
355
  progress.sitemap.processed = filteredUrls.length;
362
356
  onProgress?.(progress);
363
357
  break;
364
- } else {
365
- const filteredUrls = altUrls.filter((url) => {
366
- return !isUrlExcluded(url, exclude);
367
- });
368
- if (filteredUrls.length > 0) {
369
- startingUrls = filteredUrls;
370
- progress.sitemap.found = altUrls.length;
371
- progress.sitemap.processed = filteredUrls.length;
372
- onProgress?.(progress);
373
- break;
374
- }
375
358
  }
376
359
  } catch (error) {
377
360
  sitemapAttempts.push({
@@ -404,151 +387,193 @@ async function crawlAndGenerate(options, onProgress) {
404
387
  progress.crawling.total = startingUrls.length;
405
388
  onProgress?.(progress);
406
389
  }
407
- if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
390
+ mkdirSync(outputDir, { recursive: true });
408
391
  const results = [];
409
392
  const processedUrls = /* @__PURE__ */ new Set();
410
393
  const shouldCrawlUrl = (url) => {
411
394
  if (isUrlExcluded(url, exclude)) return false;
412
- if (!patterns.some((p) => p.isGlob)) return true;
395
+ if (!hasGlobPatterns) return true;
413
396
  return patterns.some((pattern) => matchesGlobPattern(url, pattern));
414
397
  };
415
- const createRequestHandler = (crawlerType) => {
416
- return async ({ request, body, page, enqueueLinks, response }) => {
417
- const startTime = Date.now();
418
- progress.crawling.currentUrl = request.loadedUrl;
419
- onProgress?.(progress);
420
- if (response?.statusCode && (response.statusCode < 200 || response.statusCode >= 300)) return;
421
- const homePageUrl = new URL(startingUrls[0]).origin;
422
- let html;
423
- let title;
424
- if (crawlerType === "playwright") {
425
- await page.waitForLoadState("networkidle");
426
- title = await page.title();
427
- html = await page.innerHTML("html");
428
- } else {
429
- html = typeof body === "string" ? body : body.toString();
430
- title = "";
398
+ const recordLatency = (ms) => {
399
+ const lat = progress.crawling.latency;
400
+ lat.total += ms;
401
+ lat.count++;
402
+ if (ms < lat.min) lat.min = ms;
403
+ if (ms > lat.max) lat.max = ms;
404
+ };
405
+ const normalizedHomePageUrl = (startingUrls.length > 0 ? new URL(startingUrls[0]).origin : "").replace(URL_TRAILING_SLASH_RE, "");
406
+ const createdDirs = /* @__PURE__ */ new Set();
407
+ const sharedOrigin = origin || "";
408
+ const processPage = async (url, content, initialTitle, depth, isMarkdown = false) => {
409
+ const parsedUrl = new URL(url);
410
+ const shouldProcessMarkdown = shouldCrawlUrl(url);
411
+ const pageOrigin = sharedOrigin || parsedUrl.origin;
412
+ let md;
413
+ let metadata;
414
+ if (isMarkdown) {
415
+ md = content;
416
+ metadata = {
417
+ title: initialTitle || parsedUrl.pathname,
418
+ links: []
419
+ };
420
+ } else {
421
+ const { extraction, getMetadata } = extractMetadataInline(parsedUrl);
422
+ md = htmlToMarkdown(content, {
423
+ origin: pageOrigin,
424
+ extraction
425
+ });
426
+ metadata = getMetadata();
427
+ }
428
+ const title = initialTitle || metadata.title;
429
+ if (onPage && shouldProcessMarkdown) await onPage({
430
+ url,
431
+ html: isMarkdown ? "" : content,
432
+ title,
433
+ metadata,
434
+ origin: pageOrigin
435
+ });
436
+ let filePath;
437
+ if (shouldProcessMarkdown && generateIndividualMd) {
438
+ const safeSegments = (parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
439
+ filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
440
+ const fileDir = dirname(filePath);
441
+ if (fileDir && !createdDirs.has(fileDir)) {
442
+ await mkdir(fileDir, { recursive: true });
443
+ createdDirs.add(fileDir);
431
444
  }
432
- const metadata = extractMetadata(html, request.loadedUrl);
433
- if (!title) title = metadata.title;
434
- const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
435
- const pageOrigin = origin || new URL(request.loadedUrl).origin;
436
- if (onPage && shouldProcessMarkdown) await onPage({
437
- url: request.loadedUrl,
438
- html,
445
+ await writeFile(filePath, md, "utf-8");
446
+ }
447
+ const isHomePage = parsedUrl.pathname === "/" && parsedUrl.origin === normalizedHomePageUrl;
448
+ if (shouldProcessMarkdown || isHomePage) {
449
+ const result = {
450
+ url,
439
451
  title,
452
+ content: md,
453
+ filePath: shouldProcessMarkdown ? filePath : void 0,
454
+ timestamp: Date.now(),
455
+ success: true,
440
456
  metadata,
441
- origin: pageOrigin
442
- });
443
- let md = "";
444
- if (shouldProcessMarkdown) md = htmlToMarkdown(html, withMinimalPreset({ origin: pageOrigin }));
445
- let filePath;
446
- if (shouldProcessMarkdown && generateIndividualMd) {
447
- const urlObj = new URL(request.loadedUrl);
448
- const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
449
- filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
450
- const fileDir = dirname(filePath);
451
- if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
452
- await writeFile(filePath, md, "utf-8");
453
- }
454
- const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
455
- if (shouldProcessMarkdown || isHomePage) {
456
- const result = {
457
- url: request.loadedUrl,
458
- title,
459
- content: md,
460
- filePath: shouldProcessMarkdown ? filePath : void 0,
461
- timestamp: startTime,
462
- success: true,
463
- metadata,
464
- depth: request.userData?.depth || 0
465
- };
466
- results.push(result);
467
- progress.crawling.processed = results.length;
468
- onProgress?.(progress);
469
- }
470
- if (followLinks && (request.userData?.depth || 0) < maxDepth) {
471
- const currentDepth = (request.userData?.depth || 0) + 1;
472
- const filteredLinks = metadata.links.filter((link) => {
473
- return shouldCrawlUrl(link);
474
- });
475
- if (enqueueLinks) await enqueueLinks({
476
- urls: filteredLinks,
477
- userData: { depth: currentDepth }
478
- });
479
- else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
480
- }
481
- };
457
+ depth
458
+ };
459
+ results.push(result);
460
+ progress.crawling.processed = results.length;
461
+ onProgress?.(progress);
462
+ }
463
+ if (followLinks && depth < maxDepth) {
464
+ const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
465
+ for (const link of filteredLinks) processedUrls.add(link);
466
+ }
482
467
  };
483
- let crawler;
484
- const crawlerOptions = {
485
- requestHandler: createRequestHandler(driver),
486
- errorHandler: async ({ request, response, error }) => {
487
- if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
488
- if (response?.statusCode && response?.statusCode >= 400) {
489
- request.noRetry = true;
490
- const result = {
491
- url: request.url,
492
- title: "",
493
- content: "",
494
- timestamp: Date.now(),
495
- success: false,
496
- error: `HTTP ${response.statusCode}`,
497
- metadata: {
498
- title: "",
499
- description: "",
500
- links: []
501
- },
502
- depth: request.userData?.depth || 0
503
- };
504
- results.push(result);
505
- } else if (error) {
468
+ const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
469
+ progress.crawling.status = "processing";
470
+ progress.crawling.total = urlsToProcess.length;
471
+ onProgress?.(progress);
472
+ if (driver === "playwright") {
473
+ const { log, PlaywrightCrawler, purgeDefaultStorages } = await import("crawlee");
474
+ if (verbose) log.setLevel(log.LEVELS.INFO);
475
+ else log.setLevel(log.LEVELS.OFF);
476
+ const crawlerOptions = {
477
+ requestHandler: async ({ request, page }) => {
478
+ progress.crawling.currentUrl = request.loadedUrl;
479
+ onProgress?.(progress);
480
+ const fetchStart = Date.now();
481
+ await page.waitForLoadState("networkidle");
482
+ const title = await page.title();
483
+ const html = await page.innerHTML("html");
484
+ recordLatency(Date.now() - fetchStart);
485
+ await processPage(request.loadedUrl, html, title, request.userData?.depth || 0);
486
+ },
487
+ errorHandler: async ({ request, response, error }) => {
488
+ if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
506
489
  request.noRetry = true;
507
- const result = {
490
+ progress.crawling.failed++;
491
+ results.push({
508
492
  url: request.url,
509
493
  title: "",
510
494
  content: "",
511
495
  timestamp: Date.now(),
512
496
  success: false,
513
- error: error.message || "Unknown error",
497
+ error: response?.statusCode ? `HTTP ${response.statusCode}` : error?.message || "Unknown error",
514
498
  metadata: {
515
499
  title: "",
516
500
  description: "",
517
501
  links: []
518
502
  },
519
503
  depth: request.userData?.depth || 0
520
- };
521
- results.push(result);
522
- }
523
- },
524
- maxRequestsPerCrawl,
525
- respectRobotsTxtFile: false
526
- };
527
- if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
528
- if (driver === "playwright") {
529
- const playwrightOptions = crawlerOptions;
530
- if (useChrome) playwrightOptions.launchContext = {
531
- ...playwrightOptions.launchContext,
504
+ });
505
+ },
506
+ maxRequestsPerCrawl,
507
+ respectRobotsTxtFile: false
508
+ };
509
+ if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
510
+ if (useChrome) crawlerOptions.launchContext = {
511
+ ...crawlerOptions.launchContext,
532
512
  useChrome
533
513
  };
534
- crawler = new PlaywrightCrawler(playwrightOptions);
535
- } else crawler = new HttpCrawler(crawlerOptions);
536
- const initialRequests = startingUrls.map((url) => ({
537
- url,
538
- userData: { depth: 0 }
539
- }));
540
- progress.crawling.status = "processing";
541
- progress.crawling.total = startingUrls.length;
542
- onProgress?.(progress);
543
- try {
544
- await crawler.run(initialRequests);
545
- } catch (error) {
546
- if (verbose) {
547
- console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
548
- console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
514
+ const crawler = new PlaywrightCrawler(crawlerOptions);
515
+ const initialRequests = urlsToProcess.map((url) => ({
516
+ url,
517
+ userData: { depth: 0 }
518
+ }));
519
+ try {
520
+ await crawler.run(initialRequests);
521
+ } catch (error) {
522
+ if (verbose) {
523
+ console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
524
+ console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
525
+ }
526
+ throw error;
549
527
  }
550
- throw error;
551
- }
528
+ await purgeDefaultStorages();
529
+ } else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
530
+ progress.crawling.currentUrl = url;
531
+ onProgress?.(progress);
532
+ if (crawlDelay) {
533
+ const delay = crawlDelay;
534
+ await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
535
+ }
536
+ try {
537
+ const fetchStart = Date.now();
538
+ const response = await ofetch.raw(url, {
539
+ headers: FETCH_HEADERS,
540
+ responseType: "text",
541
+ retry: 2,
542
+ retryDelay: 500,
543
+ timeout: 1e4,
544
+ onResponseError({ response }) {
545
+ if (response.status === 429) {
546
+ const retryAfter = response.headers.get("retry-after");
547
+ const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
548
+ if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
549
+ }
550
+ }
551
+ });
552
+ recordLatency(Date.now() - fetchStart);
553
+ const body = response._data ?? "";
554
+ const contentType = response.headers.get("content-type") || "";
555
+ await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
556
+ } catch (error) {
557
+ if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
558
+ progress.crawling.failed++;
559
+ results.push({
560
+ url,
561
+ title: "",
562
+ content: "",
563
+ timestamp: Date.now(),
564
+ success: false,
565
+ error: error instanceof Error ? error.message : "Unknown error",
566
+ metadata: {
567
+ title: "",
568
+ description: "",
569
+ links: []
570
+ },
571
+ depth: 0
572
+ });
573
+ progress.crawling.processed = results.length;
574
+ onProgress?.(progress);
575
+ }
576
+ });
552
577
  progress.crawling.status = "completed";
553
578
  onProgress?.(progress);
554
579
  if (results.some((r) => r.success)) {
@@ -556,10 +581,10 @@ async function crawlAndGenerate(options, onProgress) {
556
581
  onProgress?.(progress);
557
582
  const successfulResults = results.filter((r) => r.success);
558
583
  const firstUrl = new URL(withHttps(urls[0]));
559
- const origin = firstUrl.origin;
584
+ const originUrl = firstUrl.origin;
560
585
  const homePageResult = successfulResults.find((r) => {
561
586
  const resultUrl = new URL(withHttps(r.url));
562
- return resultUrl.href === origin || resultUrl.href === `${origin}/`;
587
+ return resultUrl.href === originUrl || resultUrl.href === `${originUrl}/`;
563
588
  });
564
589
  const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
565
590
  const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
@@ -585,7 +610,7 @@ async function crawlAndGenerate(options, onProgress) {
585
610
  })),
586
611
  siteName,
587
612
  description,
588
- origin: origin || firstUrl.origin,
613
+ origin: originUrl || firstUrl.origin,
589
614
  generateFull: generateLlmsFullTxt,
590
615
  outputDir
591
616
  });
@@ -603,7 +628,6 @@ async function crawlAndGenerate(options, onProgress) {
603
628
  progress.generation.status = "completed";
604
629
  onProgress?.(progress);
605
630
  }
606
- await purgeDefaultStorages();
607
631
  return results;
608
632
  }
609
633
  //#endregion
@@ -0,0 +1,59 @@
1
+ import * as p from "@clack/prompts";
2
+ import { addDependency } from "nypm";
3
+ //#region src/playwright-utils.ts
4
+ async function checkPlaywrightInstallation() {
5
+ try {
6
+ await import("playwright");
7
+ return true;
8
+ } catch {
9
+ return false;
10
+ }
11
+ }
12
+ async function promptPlaywrightInstall() {
13
+ const shouldInstall = await p.confirm({
14
+ message: "Playwright is required for the Playwright driver. Install it now?",
15
+ initialValue: true
16
+ });
17
+ if (p.isCancel(shouldInstall) || !shouldInstall) return false;
18
+ const s = p.spinner();
19
+ s.start("Installing Playwright globally...");
20
+ try {
21
+ await addDependency("playwright", { global: true });
22
+ s.stop("Playwright installed successfully!");
23
+ return true;
24
+ } catch (fallbackError) {
25
+ s.stop("Failed to install Playwright");
26
+ p.log.error(`Installation failed: ${fallbackError}`);
27
+ return false;
28
+ }
29
+ }
30
+ async function ensurePlaywrightInstalled() {
31
+ if (await checkPlaywrightInstallation()) return true;
32
+ p.log.warn("Playwright driver selected but Playwright is not installed.");
33
+ if (!await promptPlaywrightInstall()) {
34
+ p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
35
+ return false;
36
+ }
37
+ return true;
38
+ }
39
+ async function isUseChromeSupported() {
40
+ try {
41
+ const { PlaywrightCrawler } = await import("crawlee");
42
+ const crawler = new PlaywrightCrawler({
43
+ launchContext: { useChrome: true },
44
+ requestHandler: async () => {},
45
+ maxRequestsPerCrawl: 1
46
+ });
47
+ const page = await crawler.browserPool.newPage();
48
+ await page.evaluate(() => {
49
+ return window.navigator.userAgent;
50
+ });
51
+ await page.close();
52
+ await crawler.browserPool.closeAllBrowsers();
53
+ crawler.stop();
54
+ return true;
55
+ } catch {}
56
+ return false;
57
+ }
58
+ //#endregion
59
+ export { ensurePlaywrightInstalled, isUseChromeSupported };
package/dist/cli.mjs CHANGED
@@ -1,66 +1,9 @@
1
1
  import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
2
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
3
3
  import * as p from "@clack/prompts";
4
- import { PlaywrightCrawler } from "crawlee";
5
4
  import { dirname, join, resolve } from "pathe";
6
5
  import { withHttps } from "ufo";
7
6
  import { fileURLToPath } from "node:url";
8
- import { addDependency } from "nypm";
9
- //#region src/playwright-utils.ts
10
- async function checkPlaywrightInstallation() {
11
- try {
12
- await import("playwright");
13
- return true;
14
- } catch {
15
- return false;
16
- }
17
- }
18
- async function promptPlaywrightInstall() {
19
- const shouldInstall = await p.confirm({
20
- message: "Playwright is required for the Playwright driver. Install it now?",
21
- initialValue: true
22
- });
23
- if (p.isCancel(shouldInstall) || !shouldInstall) return false;
24
- const s = p.spinner();
25
- s.start("Installing Playwright globally...");
26
- try {
27
- await addDependency("playwright", { global: true });
28
- s.stop("Playwright installed successfully!");
29
- return true;
30
- } catch (fallbackError) {
31
- s.stop("Failed to install Playwright");
32
- p.log.error(`Installation failed: ${fallbackError}`);
33
- return false;
34
- }
35
- }
36
- async function ensurePlaywrightInstalled() {
37
- if (await checkPlaywrightInstallation()) return true;
38
- p.log.warn("Playwright driver selected but Playwright is not installed.");
39
- if (!await promptPlaywrightInstall()) {
40
- p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
41
- return false;
42
- }
43
- return true;
44
- }
45
- async function isUseChromeSupported() {
46
- try {
47
- const crawler = new PlaywrightCrawler({
48
- launchContext: { useChrome: true },
49
- requestHandler: async () => {},
50
- maxRequestsPerCrawl: 1
51
- });
52
- const page = await crawler.browserPool.newPage();
53
- await page.evaluate(() => {
54
- return window.navigator.userAgent;
55
- });
56
- await page.close();
57
- await crawler.browserPool.closeAllBrowsers();
58
- crawler.stop();
59
- return true;
60
- } catch {}
61
- return false;
62
- }
63
- //#endregion
64
7
  //#region src/cli.ts
65
8
  const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
66
9
  const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
@@ -225,11 +168,17 @@ async function interactiveCrawl() {
225
168
  skipSitemap: advancedOptions.skipSitemap
226
169
  };
227
170
  }
228
- async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
171
+ async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds, latency) {
229
172
  const messages = [];
230
173
  const durationStr = `${durationSeconds.toFixed(1)}s`;
231
- const stats = failed > 0 ? `${successful} pages, ${failed} failed` : `${successful} pages`;
232
- messages.push(`📄 ${stats} • ⏱️ ${durationStr}`);
174
+ messages.push(`📄 ${successful} pages \u00B7 ⏱️ ${durationStr}`);
175
+ if (failed > 0) messages.push(`⚠️ ${failed} failed`);
176
+ if (latency && latency.count > 0) {
177
+ const avg = Math.round(latency.total / latency.count);
178
+ const min = latency.min === Infinity ? 0 : Math.round(latency.min);
179
+ const max = Math.round(latency.max);
180
+ messages.push(`🏓 avg ${avg}ms \u00B7 min ${min}ms \u00B7 max ${max}ms`);
181
+ }
233
182
  messages.push(`📦 ${generatedFiles.join(", ")}`);
234
183
  messages.push(`📁 ${outputDir}`);
235
184
  p.note(messages.join("\n"), "✅ Complete");
@@ -431,36 +380,46 @@ async function main() {
431
380
  if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
432
381
  process.exit(1);
433
382
  }
434
- if (options.driver === "playwright") if (await isUseChromeSupported()) {
435
- options.useChrome = true;
436
- p.log.info("System Chrome detected and enabled.");
437
- } else {
438
- if (!await ensurePlaywrightInstalled()) {
439
- p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
383
+ if (options.driver === "playwright") {
384
+ try {
385
+ await import("crawlee");
386
+ } catch {
387
+ p.log.error("The Playwright driver requires crawlee. Install it with: npm install crawlee");
440
388
  process.exit(1);
441
389
  }
442
- p.log.info("Using global playwright instance.");
390
+ const { ensurePlaywrightInstalled, isUseChromeSupported } = await import("./_chunks/playwright-utils.mjs");
391
+ if (await isUseChromeSupported()) {
392
+ options.useChrome = true;
393
+ p.log.info("System Chrome detected and enabled.");
394
+ } else {
395
+ if (!await ensurePlaywrightInstalled()) {
396
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
397
+ process.exit(1);
398
+ }
399
+ p.log.info("Using global playwright instance.");
400
+ }
443
401
  }
444
402
  const s = p.spinner();
445
- s.start("Starting crawl...");
403
+ s.start("Discovering sitemaps");
446
404
  const startTime = Date.now();
405
+ let crawlStartTime = 0;
406
+ let lastProgress;
447
407
  const results = await crawlAndGenerate(options, (progress) => {
448
- if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
408
+ lastProgress = progress;
409
+ if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps");
449
410
  else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
450
411
  else if (progress.crawling.status === "processing") {
451
- const processedCount = progress.crawling.processed;
452
- const totalCount = progress.crawling.total;
453
- const currentUrl = progress.crawling.currentUrl;
454
- if (currentUrl) {
455
- const shortUrl = currentUrl.length > 60 ? `${currentUrl.substring(0, 57)}...` : currentUrl;
456
- if (processedCount > totalCount) s.message(`Crawling ${processedCount}: ${shortUrl}`);
457
- else s.message(`Crawling ${processedCount}/${totalCount}: ${shortUrl}`);
458
- } else if (processedCount > totalCount) s.message(`Crawling... ${processedCount} pages`);
459
- else s.message(`Crawling... ${processedCount}/${totalCount} pages`);
460
- } else if (progress.generation.status === "generating") {
461
- const current = progress.generation.current || "Generating files";
462
- s.message(current);
463
- }
412
+ if (!crawlStartTime) crawlStartTime = Date.now();
413
+ const processed = progress.crawling.processed;
414
+ const total = progress.crawling.total;
415
+ const failed = progress.crawling.failed;
416
+ const elapsed = (Date.now() - crawlStartTime) / 1e3;
417
+ const rate = elapsed > .1 ? Math.round(processed / elapsed) : 0;
418
+ let msg = processed > total ? `Crawling ${processed} pages` : `Crawling ${processed}/${total}`;
419
+ if (rate > 0) msg += ` \u00B7 ${rate}/s`;
420
+ if (failed > 0) msg += ` \u00B7 ${failed} failed`;
421
+ s.message(msg);
422
+ } else if (progress.generation.status === "generating") s.message(progress.generation.current || "Generating files");
464
423
  });
465
424
  s.stop();
466
425
  const durationSeconds = (Date.now() - startTime) / 1e3;
@@ -484,7 +443,7 @@ async function main() {
484
443
  if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
485
444
  if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
486
445
  }
487
- await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
446
+ await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds, lastProgress?.crawling.latency);
488
447
  process.exit(0);
489
448
  }
490
449
  main().catch((error) => {
package/dist/index.d.mts CHANGED
@@ -51,12 +51,6 @@ interface CrawlResult {
51
51
  metadata?: PageMetadata;
52
52
  depth?: number;
53
53
  }
54
- interface LlmsTxtOptions {
55
- siteName: string;
56
- description?: string;
57
- results: CrawlResult[];
58
- outputPath: string;
59
- }
60
54
  //#endregion
61
55
  //#region src/crawl.d.ts
62
56
  interface CrawlProgress {
@@ -69,7 +63,14 @@ interface CrawlProgress {
69
63
  status: 'starting' | 'processing' | 'completed';
70
64
  total: number;
71
65
  processed: number;
72
- currentUrl?: string;
66
+ failed: number;
67
+ currentUrl?: string; /** Page fetch latency stats in ms */
68
+ latency: {
69
+ total: number;
70
+ min: number;
71
+ max: number;
72
+ count: number;
73
+ };
73
74
  };
74
75
  generation: {
75
76
  status: 'idle' | 'generating' | 'completed';
@@ -78,8 +79,4 @@ interface CrawlProgress {
78
79
  }
79
80
  declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
80
81
  //#endregion
81
- //#region src/llms-txt.d.ts
82
- declare function generateLlmsTxt(options: LlmsTxtOptions): Promise<void>;
83
- declare function generateLlmsFullTxt(options: LlmsTxtOptions): Promise<void>;
84
- //#endregion
85
- export { type CrawlOptions, type CrawlResult, type LlmsTxtOptions, type PageData, crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
82
+ export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
package/dist/index.mjs CHANGED
@@ -1,64 +1,2 @@
1
1
  import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
- import { writeFile } from "node:fs/promises";
3
- import { basename, sep } from "pathe";
4
- //#region src/llms-txt.ts
5
- const ANCHOR_UNSAFE_CHARS_RE = /[^a-z0-9]/g;
6
- async function generateLlmsTxt(options) {
7
- const { siteName, description, results, outputPath } = options;
8
- let content = `# ${siteName}\n\n`;
9
- if (description) content += `> ${description}\n\n`;
10
- if (results.length > 0) {
11
- content += `## Pages\n\n`;
12
- for (const result of results) {
13
- let title;
14
- try {
15
- title = result.title || new URL(result.url).pathname;
16
- } catch {
17
- title = result.title || result.url;
18
- }
19
- if (result.filePath) {
20
- const mdSeparator = `${sep}md${sep}`;
21
- const mdIndex = result.filePath.indexOf(mdSeparator);
22
- const linkPath = (mdIndex !== -1 ? result.filePath.substring(mdIndex + mdSeparator.length) : basename(result.filePath)).split(sep).join("/");
23
- content += `- [${title}](md/${linkPath}): ${result.url}\n`;
24
- } else {
25
- const description = result.metadata?.description ? result.metadata.description.split("\n")[0].substring(0, 100) + (result.metadata.description.length > 100 ? "..." : "") : "";
26
- content += `- [${title}](${result.url})${description ? `: ${description}` : ""}\n`;
27
- }
28
- }
29
- }
30
- await writeFile(outputPath, content, "utf-8");
31
- }
32
- async function generateLlmsFullTxt(options) {
33
- const { siteName, description, results, outputPath } = options;
34
- let content = `# ${siteName}\n\n`;
35
- if (description) content += `> ${description}\n\n`;
36
- if (results.length > 0) {
37
- content += `## Table of Contents\n\n`;
38
- for (const result of results) {
39
- let title;
40
- try {
41
- title = result.title || new URL(result.url).pathname;
42
- } catch {
43
- title = result.title || result.url;
44
- }
45
- const anchor = title.toLowerCase().replace(ANCHOR_UNSAFE_CHARS_RE, "-");
46
- content += `- [${title}](#${anchor})\n`;
47
- }
48
- content += `\n---\n\n`;
49
- for (const result of results) {
50
- let title;
51
- try {
52
- title = result.title || new URL(result.url).pathname;
53
- } catch {
54
- title = result.title || result.url;
55
- }
56
- content += `## ${title}\n\n`;
57
- content += `**URL:** ${result.url}\n\n`;
58
- content += `${result.content}\n\n---\n\n`;
59
- }
60
- }
61
- await writeFile(outputPath, content, "utf-8");
62
- }
63
- //#endregion
64
- export { crawlAndGenerate, generateLlmsFullTxt, generateLlmsTxt };
2
+ export { crawlAndGenerate };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.17.1",
4
+ "version": "1.0.0-beta.11",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -42,21 +42,26 @@
42
42
  "dist"
43
43
  ],
44
44
  "peerDependencies": {
45
+ "crawlee": "^3.16.0",
45
46
  "playwright": "^1.53.2"
46
47
  },
47
48
  "peerDependenciesMeta": {
49
+ "crawlee": {
50
+ "optional": true
51
+ },
48
52
  "playwright": {
49
53
  "optional": true
50
54
  }
51
55
  },
52
56
  "dependencies": {
53
57
  "@clack/prompts": "^1.1.0",
54
- "crawlee": "^3.16.0",
55
58
  "nypm": "^0.6.5",
59
+ "ofetch": "^1.5.1",
56
60
  "pathe": "^2.0.3",
57
61
  "picomatch": "^4.0.3",
58
62
  "ufo": "^1.6.3",
59
- "mdream": "0.17.1"
63
+ "@mdream/js": "1.0.0-beta.11",
64
+ "mdream": "1.0.0-beta.11"
60
65
  },
61
66
  "devDependencies": {
62
67
  "@types/picomatch": "^4.0.2"