@mdream/crawl 1.0.0-beta.10 → 1.0.0-beta.11

package/README.md CHANGED
@@ -7,7 +7,7 @@ Multi-page website crawler that generates comprehensive llms.txt files by follow
  ## Installation

  ```bash
- npm install @mdream/crawl
+ npm install @mdream/crawl@beta
  ```

  ## Usage
@@ -15,7 +15,7 @@ npm install @mdream/crawl
  Simply run the command to start the interactive multi-page website crawler:

  ```bash
- npx @mdream/crawl
+ npx @mdream/crawl@beta
  ```

  The crawler will automatically discover and follow internal links to crawl entire websites. The interactive interface provides:
@@ -46,6 +46,16 @@ const results = await crawlAndGenerate({
  })
  ```

+ ### Playwright Driver
+
+ The default HTTP driver works for most sites. For JavaScript-heavy sites that require a browser, install the optional dependencies:
+
+ ```bash
+ npm install crawlee playwright
+ ```
+
+ Then use `--driver playwright` or `driver: 'playwright'` in the API.
+
  > **Note**: llms.txt artifact generation is handled by [`@mdream/js/llms-txt`](../js). The crawl package uses it internally when `generateLlmsTxt: true`.

  ## Output
@@ -70,7 +80,7 @@ The crawler generates comprehensive output from entire websites:

  - ✅ **Multi-Page Website Crawling**: Designed specifically for crawling entire websites by following internal links
  - ✅ **Purely Interactive**: No complex command-line options to remember
- - ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites
+ - ✅ **Dual Crawler Support**: Fast HTTP crawler (default) + Playwright for JavaScript-heavy sites (requires `crawlee` and `playwright`)
  - ✅ **Smart Link Discovery**: Uses mdream's extraction plugin to find and follow internal links
  - ✅ **Rich Metadata Extraction**: Extracts titles, descriptions, keywords, and author info from all pages
  - ✅ **Comprehensive llms.txt Generation**: Creates complete site documentation files
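To make the driver switch described in the README hunk above concrete, here is a minimal programmatic sketch. It assumes `crawlAndGenerate` is imported from the package root, as in the README's usage example, and that the `urls`, `outputDir`, `driver`, and `generateLlmsTxt` options match the bundled implementation shown later in this diff; treat it as illustrative rather than the package's documented example.

```ts
import { crawlAndGenerate } from '@mdream/crawl'

// Illustrative sketch: opt into the Playwright driver described above.
// Requires the optional peers `crawlee` and `playwright` to be installed.
const results = await crawlAndGenerate({
  urls: ['https://example.com'],
  outputDir: './output',
  driver: 'playwright', // defaults to 'http' when omitted
  generateLlmsTxt: true,
})
console.log(`${results.length} pages crawled`)
```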
package/dist/_chunks/crawl.mjs CHANGED
@@ -1,9 +1,9 @@
- import { existsSync, mkdirSync } from "node:fs";
- import { writeFile } from "node:fs/promises";
+ import { mkdirSync } from "node:fs";
+ import { mkdir, writeFile } from "node:fs/promises";
  import * as p from "@clack/prompts";
  import { generateLlmsTxtArtifacts } from "@mdream/js/llms-txt";
- import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
  import { htmlToMarkdown } from "mdream";
+ import { ofetch } from "ofetch";
  import { dirname, join, normalize, resolve } from "pathe";
  import { withHttps } from "ufo";
  import picomatch from "picomatch";
@@ -109,21 +109,71 @@ function validateGlobPattern(pattern) {
  }
  }
  //#endregion
- //#region src/metadata-extractor.ts
- function extractMetadata(html, url) {
- const links = [];
+ //#region src/crawl.ts
+ const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
+ const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
+ const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
+ const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
+ const ROBOTS_CRAWL_DELAY_RE = /Crawl-delay:\s*(\d+(?:\.\d+)?)/i;
+ const URL_TRAILING_SLASH_RE = /\/$/;
+ const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
+ const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
+ const FETCH_HEADERS = {
+ "User-Agent": "mdream-crawler/1.0",
+ "Accept": "text/html,application/xhtml+xml,text/markdown"
+ };
+ const DEFAULT_CONCURRENCY = 20;
+ function extractCdataUrl(url) {
+ if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) return url.slice(9, -3);
+ return url;
+ }
+ async function loadSitemap(sitemapUrl) {
+ const xmlContent = await ofetch(sitemapUrl, {
+ headers: FETCH_HEADERS,
+ timeout: 1e4,
+ responseType: "text",
+ retry: 0
+ });
+ if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
+ if (xmlContent.includes("<sitemapindex")) {
+ SITEMAP_INDEX_LOC_RE.lastIndex = 0;
+ const childSitemaps = [];
+ let match;
+ while (true) {
+ match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
+ if (match === null) break;
+ childSitemaps.push(extractCdataUrl(match[1]));
+ }
+ const childResults = await Promise.allSettled(childSitemaps.map((url) => loadSitemap(url)));
+ const allUrls = [];
+ for (const result of childResults) if (result.status === "fulfilled") allUrls.push(...result.value);
+ return allUrls;
+ }
+ const urls = [];
+ SITEMAP_URL_LOC_RE.lastIndex = 0;
+ let match;
+ while (true) {
+ match = SITEMAP_URL_LOC_RE.exec(xmlContent);
+ if (match === null) break;
+ urls.push(extractCdataUrl(match[1]));
+ }
+ return urls;
+ }
+ function extractMetadataInline(parsedUrl) {
+ const links = /* @__PURE__ */ new Set();
  let title = "";
  let description = "";
  let keywords = "";
  let author = "";
- htmlToMarkdown(html, {
- origin: new URL(url).origin,
+ const url = parsedUrl.href;
+ const originPrefix = `${parsedUrl.origin}/`;
+ return {
  extraction: {
  "a[href]": (el) => {
  const href = el.attributes.href;
  if (href) try {
  const absoluteUrl = new URL(href, url).href;
- if (!links.includes(absoluteUrl)) links.push(absoluteUrl);
+ if (absoluteUrl.startsWith(originPrefix) || absoluteUrl === parsedUrl.origin) links.add(absoluteUrl);
  } catch {}
  },
  "title": (el) => {
@@ -144,88 +194,31 @@ function extractMetadata(html, url) {
  "meta[property=\"og:title\"]": (el) => {
  if (!title) title = el.attributes.content || "";
  }
- }
- });
- return {
- title: title.trim() || new URL(url).pathname,
- description: description.trim() || void 0,
- keywords: keywords.trim() || void 0,
- author: author.trim() || void 0,
- links: links.filter((link) => {
- try {
- const linkUrl = new URL(link);
- const baseUrl = new URL(url);
- return linkUrl.hostname === baseUrl.hostname;
- } catch {
- return false;
- }
+ },
+ getMetadata: () => ({
+ title: title.trim() || parsedUrl.pathname,
+ description: description.trim() || void 0,
+ keywords: keywords.trim() || void 0,
+ author: author.trim() || void 0,
+ links: [...links]
  })
  };
  }
- //#endregion
- //#region src/crawl.ts
- const SITEMAP_INDEX_LOC_RE = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
- const SITEMAP_URL_LOC_RE = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
- const ROBOTS_SITEMAP_RE = /Sitemap:\s*(.*)/gi;
- const ROBOTS_SITEMAP_PREFIX_RE = /Sitemap:\s*/i;
- const URL_TRAILING_SLASH_RE = /\/$/;
- const URL_PATH_UNSAFE_CHARS_RE = /[^\w\-]/g;
- const FRONTMATTER_BLOCK_RE = /^---[^\n]*\n[\s\S]*?\n---[^\n]*\n?/;
- async function loadSitemapWithoutRetries(sitemapUrl) {
- const controller = new AbortController();
- const timeoutId = setTimeout(() => controller.abort(), 1e4);
- try {
- const response = await fetch(sitemapUrl, {
- signal: controller.signal,
- headers: { "User-Agent": "mdream-crawler/1.0" }
- });
- clearTimeout(timeoutId);
- if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
- const xmlContent = await response.text();
- if (!isValidSitemapXml(xmlContent)) throw new Error("Response is not a valid sitemap XML");
- if (xmlContent.includes("<sitemapindex")) {
- SITEMAP_INDEX_LOC_RE.lastIndex = 0;
- const childSitemaps = [];
- let match;
- while (true) {
- match = SITEMAP_INDEX_LOC_RE.exec(xmlContent);
- if (match === null) break;
- let url = match[1];
- if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
- childSitemaps.push(url);
- }
- const allUrls = [];
- for (const childSitemapUrl of childSitemaps) try {
- const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
- allUrls.push(...childUrls);
- } catch (error) {
- console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
- }
- return allUrls;
- } else {
- const urls = [];
- SITEMAP_URL_LOC_RE.lastIndex = 0;
- let match;
- while (true) {
- match = SITEMAP_URL_LOC_RE.exec(xmlContent);
- if (match === null) break;
- let url = match[1];
- if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
- urls.push(url);
- }
- return urls;
- }
- } catch (error) {
- clearTimeout(timeoutId);
- if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
- throw error;
- }
+ function filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, allPatterns) {
+ if (hasGlobPatterns) return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude) && allPatterns.some((pattern) => matchesGlobPattern(url, pattern)));
+ return sitemapUrls.filter((url) => !isUrlExcluded(url, exclude));
+ }
+ async function runConcurrent(items, concurrency, fn) {
+ let idx = 0;
+ const workers = Array.from({ length: Math.min(concurrency, items.length) }, async () => {
+ while (idx < items.length) await fn(items[idx++]);
+ });
+ await Promise.all(workers);
  }
  async function crawlAndGenerate(options, onProgress) {
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay: userCrawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false, onPage } = options;
  const outputDir = resolve(normalize(rawOutputDir));
- if (verbose) log.setLevel(log.LEVELS.INFO);
- else log.setLevel(log.LEVELS.OFF);
+ let crawlDelay = userCrawlDelay;
  let patterns;
  try {
  patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
@@ -233,6 +226,7 @@ async function crawlAndGenerate(options, onProgress) {
  throw new Error(`Invalid URL pattern: ${error instanceof Error ? error.message : "Unknown error"}`);
  }
  let startingUrls = patterns.map(getStartingUrl);
+ const hasGlobPatterns = patterns.some((p) => p.isGlob);
  const progress = {
  sitemap: {
  status: "discovering",
@@ -242,7 +236,14 @@ async function crawlAndGenerate(options, onProgress) {
  crawling: {
  status: "starting",
  total: 0,
- processed: 0
+ processed: 0,
+ failed: 0,
+ latency: {
+ total: 0,
+ min: Infinity,
+ max: 0,
+ count: 0
+ }
  },
  generation: { status: "idle" }
  };
@@ -251,51 +252,46 @@ async function crawlAndGenerate(options, onProgress) {
  const baseUrl = new URL(startingUrls[0]).origin;
  const homePageUrl = baseUrl;
  onProgress?.(progress);
- const robotsUrl = new URL("/robots.txt", baseUrl).toString();
- const robotsController = new AbortController();
- const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
- let robotsResponse;
+ let robotsContent = null;
  try {
- robotsResponse = await fetch(robotsUrl, {
- signal: robotsController.signal,
- headers: { "User-Agent": "mdream-crawler/1.0" }
+ robotsContent = await ofetch(`${baseUrl}/robots.txt`, {
+ headers: FETCH_HEADERS,
+ timeout: 1e4,
+ responseType: "text",
+ retry: 0
  });
- clearTimeout(robotsTimeoutId);
- } catch {
- clearTimeout(robotsTimeoutId);
- robotsResponse = null;
+ } catch {}
+ if (robotsContent && !crawlDelay) {
+ const crawlDelayMatch = robotsContent.match(ROBOTS_CRAWL_DELAY_RE);
+ if (crawlDelayMatch) {
+ crawlDelay = Number.parseFloat(crawlDelayMatch[1]);
+ p.log(`[ROBOTS] Crawl-delay: ${crawlDelay}s`);
+ }
  }
- if (robotsResponse?.ok) {
- const sitemapMatches = (await robotsResponse.text()).match(ROBOTS_SITEMAP_RE);
+ if (robotsContent) {
+ const sitemapMatches = robotsContent.match(ROBOTS_SITEMAP_RE);
  if (sitemapMatches && sitemapMatches.length > 0) {
  progress.sitemap.found = sitemapMatches.length;
  progress.sitemap.status = "processing";
  onProgress?.(progress);
  const robotsSitemaps = sitemapMatches.map((match) => match.replace(ROBOTS_SITEMAP_PREFIX_RE, "").trim());
  for (const sitemapUrl of robotsSitemaps) try {
- const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
+ const robotsUrls = await loadSitemap(sitemapUrl);
  sitemapAttempts.push({
  url: sitemapUrl,
  success: true
  });
- if (patterns.some((p) => p.isGlob)) {
- const filteredUrls = robotsUrls.filter((url) => {
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
- });
+ const filteredUrls = filterSitemapUrls(robotsUrls, hasGlobPatterns, exclude, patterns);
+ if (hasGlobPatterns) {
+ startingUrls = filteredUrls;
+ progress.sitemap.processed = filteredUrls.length;
+ onProgress?.(progress);
+ break;
+ } else if (filteredUrls.length > 0) {
  startingUrls = filteredUrls;
  progress.sitemap.processed = filteredUrls.length;
  onProgress?.(progress);
  break;
- } else {
- const filteredUrls = robotsUrls.filter((url) => {
- return !isUrlExcluded(url, exclude);
- });
- if (filteredUrls.length > 0) {
- startingUrls = filteredUrls;
- progress.sitemap.processed = filteredUrls.length;
- onProgress?.(progress);
- break;
- }
  }
  } catch (error) {
  sitemapAttempts.push({
@@ -309,31 +305,24 @@ async function crawlAndGenerate(options, onProgress) {
  let mainSitemapProcessed = false;
  const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
  try {
- const sitemapUrls = await loadSitemapWithoutRetries(mainSitemapUrl);
+ const sitemapUrls = await loadSitemap(mainSitemapUrl);
  sitemapAttempts.push({
  url: mainSitemapUrl,
  success: true
  });
- if (patterns.some((p) => p.isGlob)) {
- const filteredUrls = sitemapUrls.filter((url) => {
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
- });
+ const filteredUrls = filterSitemapUrls(sitemapUrls, hasGlobPatterns, exclude, patterns);
+ if (hasGlobPatterns) {
+ startingUrls = filteredUrls;
+ progress.sitemap.found = sitemapUrls.length;
+ progress.sitemap.processed = filteredUrls.length;
+ onProgress?.(progress);
+ mainSitemapProcessed = true;
+ } else if (filteredUrls.length > 0) {
  startingUrls = filteredUrls;
  progress.sitemap.found = sitemapUrls.length;
  progress.sitemap.processed = filteredUrls.length;
  onProgress?.(progress);
  mainSitemapProcessed = true;
- } else {
- const filteredUrls = sitemapUrls.filter((url) => {
- return !isUrlExcluded(url, exclude);
- });
- if (filteredUrls.length > 0) {
- startingUrls = filteredUrls;
- progress.sitemap.found = sitemapUrls.length;
- progress.sitemap.processed = filteredUrls.length;
- onProgress?.(progress);
- mainSitemapProcessed = true;
- }
  }
  } catch (error) {
  sitemapAttempts.push({
@@ -348,31 +337,24 @@ async function crawlAndGenerate(options, onProgress) {
  `${baseUrl}/sitemap-index.xml`
  ];
  for (const sitemapUrl of commonSitemaps) try {
- const altUrls = await loadSitemapWithoutRetries(sitemapUrl);
+ const altUrls = await loadSitemap(sitemapUrl);
  sitemapAttempts.push({
  url: sitemapUrl,
  success: true
  });
- if (patterns.some((p) => p.isGlob)) {
- const filteredUrls = altUrls.filter((url) => {
- return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
- });
+ const filteredUrls = filterSitemapUrls(altUrls, hasGlobPatterns, exclude, patterns);
+ if (hasGlobPatterns) {
+ startingUrls = filteredUrls;
+ progress.sitemap.found = altUrls.length;
+ progress.sitemap.processed = filteredUrls.length;
+ onProgress?.(progress);
+ break;
+ } else if (filteredUrls.length > 0) {
  startingUrls = filteredUrls;
  progress.sitemap.found = altUrls.length;
  progress.sitemap.processed = filteredUrls.length;
  onProgress?.(progress);
  break;
- } else {
- const filteredUrls = altUrls.filter((url) => {
- return !isUrlExcluded(url, exclude);
- });
- if (filteredUrls.length > 0) {
- startingUrls = filteredUrls;
- progress.sitemap.found = altUrls.length;
- progress.sitemap.processed = filteredUrls.length;
- onProgress?.(progress);
- break;
- }
  }
  } catch (error) {
  sitemapAttempts.push({
@@ -405,151 +387,193 @@ async function crawlAndGenerate(options, onProgress) {
  progress.crawling.total = startingUrls.length;
  onProgress?.(progress);
  }
- if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
+ mkdirSync(outputDir, { recursive: true });
  const results = [];
  const processedUrls = /* @__PURE__ */ new Set();
  const shouldCrawlUrl = (url) => {
  if (isUrlExcluded(url, exclude)) return false;
- if (!patterns.some((p) => p.isGlob)) return true;
+ if (!hasGlobPatterns) return true;
  return patterns.some((pattern) => matchesGlobPattern(url, pattern));
  };
- const createRequestHandler = (crawlerType) => {
- return async ({ request, body, page, enqueueLinks, response }) => {
- const startTime = Date.now();
- progress.crawling.currentUrl = request.loadedUrl;
- onProgress?.(progress);
- if (response?.statusCode && (response.statusCode < 200 || response.statusCode >= 300)) return;
- const homePageUrl = new URL(startingUrls[0]).origin;
- let html;
- let title;
- if (crawlerType === "playwright") {
- await page.waitForLoadState("networkidle");
- title = await page.title();
- html = await page.innerHTML("html");
- } else {
- html = typeof body === "string" ? body : body.toString();
- title = "";
+ const recordLatency = (ms) => {
+ const lat = progress.crawling.latency;
+ lat.total += ms;
+ lat.count++;
+ if (ms < lat.min) lat.min = ms;
+ if (ms > lat.max) lat.max = ms;
+ };
+ const normalizedHomePageUrl = (startingUrls.length > 0 ? new URL(startingUrls[0]).origin : "").replace(URL_TRAILING_SLASH_RE, "");
+ const createdDirs = /* @__PURE__ */ new Set();
+ const sharedOrigin = origin || "";
+ const processPage = async (url, content, initialTitle, depth, isMarkdown = false) => {
+ const parsedUrl = new URL(url);
+ const shouldProcessMarkdown = shouldCrawlUrl(url);
+ const pageOrigin = sharedOrigin || parsedUrl.origin;
+ let md;
+ let metadata;
+ if (isMarkdown) {
+ md = content;
+ metadata = {
+ title: initialTitle || parsedUrl.pathname,
+ links: []
+ };
+ } else {
+ const { extraction, getMetadata } = extractMetadataInline(parsedUrl);
+ md = htmlToMarkdown(content, {
+ origin: pageOrigin,
+ extraction
+ });
+ metadata = getMetadata();
+ }
+ const title = initialTitle || metadata.title;
+ if (onPage && shouldProcessMarkdown) await onPage({
+ url,
+ html: isMarkdown ? "" : content,
+ title,
+ metadata,
+ origin: pageOrigin
+ });
+ let filePath;
+ if (shouldProcessMarkdown && generateIndividualMd) {
+ const safeSegments = (parsedUrl.pathname === "/" ? "/index" : parsedUrl.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
+ filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
+ const fileDir = dirname(filePath);
+ if (fileDir && !createdDirs.has(fileDir)) {
+ await mkdir(fileDir, { recursive: true });
+ createdDirs.add(fileDir);
  }
- const metadata = extractMetadata(html, request.loadedUrl);
- if (!title) title = metadata.title;
- const shouldProcessMarkdown = shouldCrawlUrl(request.loadedUrl);
- const pageOrigin = origin || new URL(request.loadedUrl).origin;
- if (onPage && shouldProcessMarkdown) await onPage({
- url: request.loadedUrl,
- html,
+ await writeFile(filePath, md, "utf-8");
+ }
+ const isHomePage = parsedUrl.pathname === "/" && parsedUrl.origin === normalizedHomePageUrl;
+ if (shouldProcessMarkdown || isHomePage) {
+ const result = {
+ url,
  title,
+ content: md,
+ filePath: shouldProcessMarkdown ? filePath : void 0,
+ timestamp: Date.now(),
+ success: true,
  metadata,
- origin: pageOrigin
- });
- let md = "";
- if (shouldProcessMarkdown) md = htmlToMarkdown(html, { origin: pageOrigin });
- let filePath;
- if (shouldProcessMarkdown && generateIndividualMd) {
- const urlObj = new URL(request.loadedUrl);
- const safeSegments = (urlObj.pathname === "/" ? "/index" : urlObj.pathname).replace(URL_TRAILING_SLASH_RE, "").split("/").filter((seg) => seg.length > 0).map((seg) => seg.replace(URL_PATH_UNSAFE_CHARS_RE, "-"));
- filePath = join(outputDir, normalize(`${safeSegments.length > 0 ? safeSegments.join("/") : "index"}.md`));
- const fileDir = dirname(filePath);
- if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
- await writeFile(filePath, md, "utf-8");
- }
- const isHomePage = request.loadedUrl.replace(URL_TRAILING_SLASH_RE, "") === homePageUrl.replace(URL_TRAILING_SLASH_RE, "");
- if (shouldProcessMarkdown || isHomePage) {
- const result = {
- url: request.loadedUrl,
- title,
- content: md,
- filePath: shouldProcessMarkdown ? filePath : void 0,
- timestamp: startTime,
- success: true,
- metadata,
- depth: request.userData?.depth || 0
- };
- results.push(result);
- progress.crawling.processed = results.length;
- onProgress?.(progress);
- }
- if (followLinks && (request.userData?.depth || 0) < maxDepth) {
- const currentDepth = (request.userData?.depth || 0) + 1;
- const filteredLinks = metadata.links.filter((link) => {
- return shouldCrawlUrl(link);
- });
- if (enqueueLinks) await enqueueLinks({
- urls: filteredLinks,
- userData: { depth: currentDepth }
- });
- else for (const link of filteredLinks) if (!processedUrls.has(link)) processedUrls.add(link);
- }
- };
+ depth
+ };
+ results.push(result);
+ progress.crawling.processed = results.length;
+ onProgress?.(progress);
+ }
+ if (followLinks && depth < maxDepth) {
+ const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
+ for (const link of filteredLinks) processedUrls.add(link);
+ }
  };
- let crawler;
- const crawlerOptions = {
- requestHandler: createRequestHandler(driver),
- errorHandler: async ({ request, response, error }) => {
- if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
- if (response?.statusCode && response?.statusCode >= 400) {
- request.noRetry = true;
- const result = {
- url: request.url,
- title: "",
- content: "",
- timestamp: Date.now(),
- success: false,
- error: `HTTP ${response.statusCode}`,
- metadata: {
- title: "",
- description: "",
- links: []
- },
- depth: request.userData?.depth || 0
- };
- results.push(result);
- } else if (error) {
+ const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
+ progress.crawling.status = "processing";
+ progress.crawling.total = urlsToProcess.length;
+ onProgress?.(progress);
+ if (driver === "playwright") {
+ const { log, PlaywrightCrawler, purgeDefaultStorages } = await import("crawlee");
+ if (verbose) log.setLevel(log.LEVELS.INFO);
+ else log.setLevel(log.LEVELS.OFF);
+ const crawlerOptions = {
+ requestHandler: async ({ request, page }) => {
+ progress.crawling.currentUrl = request.loadedUrl;
+ onProgress?.(progress);
+ const fetchStart = Date.now();
+ await page.waitForLoadState("networkidle");
+ const title = await page.title();
+ const html = await page.innerHTML("html");
+ recordLatency(Date.now() - fetchStart);
+ await processPage(request.loadedUrl, html, title, request.userData?.depth || 0);
+ },
+ errorHandler: async ({ request, response, error }) => {
+ if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
  request.noRetry = true;
- const result = {
+ progress.crawling.failed++;
+ results.push({
  url: request.url,
  title: "",
  content: "",
  timestamp: Date.now(),
  success: false,
- error: error.message || "Unknown error",
+ error: response?.statusCode ? `HTTP ${response.statusCode}` : error?.message || "Unknown error",
  metadata: {
  title: "",
  description: "",
  links: []
  },
  depth: request.userData?.depth || 0
- };
- results.push(result);
- }
- },
- maxRequestsPerCrawl,
- respectRobotsTxtFile: false
- };
- if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
- if (driver === "playwright") {
- const playwrightOptions = crawlerOptions;
- if (useChrome) playwrightOptions.launchContext = {
- ...playwrightOptions.launchContext,
+ });
+ },
+ maxRequestsPerCrawl,
+ respectRobotsTxtFile: false
+ };
+ if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
+ if (useChrome) crawlerOptions.launchContext = {
+ ...crawlerOptions.launchContext,
  useChrome
  };
- crawler = new PlaywrightCrawler(playwrightOptions);
- } else crawler = new HttpCrawler(crawlerOptions);
- const initialRequests = startingUrls.map((url) => ({
- url,
- userData: { depth: 0 }
- }));
- progress.crawling.status = "processing";
- progress.crawling.total = startingUrls.length;
- onProgress?.(progress);
- try {
- await crawler.run(initialRequests);
- } catch (error) {
- if (verbose) {
- console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
- console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
+ const crawler = new PlaywrightCrawler(crawlerOptions);
+ const initialRequests = urlsToProcess.map((url) => ({
+ url,
+ userData: { depth: 0 }
+ }));
+ try {
+ await crawler.run(initialRequests);
+ } catch (error) {
+ if (verbose) {
+ console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
+ console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
+ }
+ throw error;
  }
- throw error;
- }
+ await purgeDefaultStorages();
+ } else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
+ progress.crawling.currentUrl = url;
+ onProgress?.(progress);
+ if (crawlDelay) {
+ const delay = crawlDelay;
+ await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
+ }
+ try {
+ const fetchStart = Date.now();
+ const response = await ofetch.raw(url, {
+ headers: FETCH_HEADERS,
+ responseType: "text",
+ retry: 2,
+ retryDelay: 500,
+ timeout: 1e4,
+ onResponseError({ response }) {
+ if (response.status === 429) {
+ const retryAfter = response.headers.get("retry-after");
+ const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
+ if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
+ }
+ }
+ });
+ recordLatency(Date.now() - fetchStart);
+ const body = response._data ?? "";
+ const contentType = response.headers.get("content-type") || "";
+ await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
+ } catch (error) {
+ if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
+ progress.crawling.failed++;
+ results.push({
+ url,
+ title: "",
+ content: "",
+ timestamp: Date.now(),
+ success: false,
+ error: error instanceof Error ? error.message : "Unknown error",
+ metadata: {
+ title: "",
+ description: "",
+ links: []
+ },
+ depth: 0
+ });
+ progress.crawling.processed = results.length;
+ onProgress?.(progress);
+ }
+ });
  progress.crawling.status = "completed";
  onProgress?.(progress);
  if (results.some((r) => r.success)) {
@@ -557,10 +581,10 @@ async function crawlAndGenerate(options, onProgress) {
  onProgress?.(progress);
  const successfulResults = results.filter((r) => r.success);
  const firstUrl = new URL(withHttps(urls[0]));
- const origin = firstUrl.origin;
+ const originUrl = firstUrl.origin;
  const homePageResult = successfulResults.find((r) => {
  const resultUrl = new URL(withHttps(r.url));
- return resultUrl.href === origin || resultUrl.href === `${origin}/`;
+ return resultUrl.href === originUrl || resultUrl.href === `${originUrl}/`;
  });
  const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
  const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
@@ -586,7 +610,7 @@ async function crawlAndGenerate(options, onProgress) {
  })),
  siteName,
  description,
- origin: origin || firstUrl.origin,
+ origin: originUrl || firstUrl.origin,
  generateFull: generateLlmsFullTxt,
  outputDir
  });
@@ -604,7 +628,6 @@ async function crawlAndGenerate(options, onProgress) {
  progress.generation.status = "completed";
  onProgress?.(progress);
  }
- await purgeDefaultStorages();
  return results;
  }
  //#endregion
package/dist/_chunks/playwright-utils.mjs ADDED
@@ -0,0 +1,59 @@
+ import * as p from "@clack/prompts";
+ import { addDependency } from "nypm";
+ //#region src/playwright-utils.ts
+ async function checkPlaywrightInstallation() {
+ try {
+ await import("playwright");
+ return true;
+ } catch {
+ return false;
+ }
+ }
+ async function promptPlaywrightInstall() {
+ const shouldInstall = await p.confirm({
+ message: "Playwright is required for the Playwright driver. Install it now?",
+ initialValue: true
+ });
+ if (p.isCancel(shouldInstall) || !shouldInstall) return false;
+ const s = p.spinner();
+ s.start("Installing Playwright globally...");
+ try {
+ await addDependency("playwright", { global: true });
+ s.stop("Playwright installed successfully!");
+ return true;
+ } catch (fallbackError) {
+ s.stop("Failed to install Playwright");
+ p.log.error(`Installation failed: ${fallbackError}`);
+ return false;
+ }
+ }
+ async function ensurePlaywrightInstalled() {
+ if (await checkPlaywrightInstallation()) return true;
+ p.log.warn("Playwright driver selected but Playwright is not installed.");
+ if (!await promptPlaywrightInstall()) {
+ p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
+ return false;
+ }
+ return true;
+ }
+ async function isUseChromeSupported() {
+ try {
+ const { PlaywrightCrawler } = await import("crawlee");
+ const crawler = new PlaywrightCrawler({
+ launchContext: { useChrome: true },
+ requestHandler: async () => {},
+ maxRequestsPerCrawl: 1
+ });
+ const page = await crawler.browserPool.newPage();
+ await page.evaluate(() => {
+ return window.navigator.userAgent;
+ });
+ await page.close();
+ await crawler.browserPool.closeAllBrowsers();
+ crawler.stop();
+ return true;
+ } catch {}
+ return false;
+ }
+ //#endregion
+ export { ensurePlaywrightInstalled, isUseChromeSupported };
package/dist/cli.mjs CHANGED
@@ -1,66 +1,9 @@
  import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
  import * as p from "@clack/prompts";
- import { PlaywrightCrawler } from "crawlee";
  import { dirname, join, resolve } from "pathe";
  import { withHttps } from "ufo";
  import { fileURLToPath } from "node:url";
- import { addDependency } from "nypm";
- //#region src/playwright-utils.ts
- async function checkPlaywrightInstallation() {
- try {
- await import("playwright");
- return true;
- } catch {
- return false;
- }
- }
- async function promptPlaywrightInstall() {
- const shouldInstall = await p.confirm({
- message: "Playwright is required for the Playwright driver. Install it now?",
- initialValue: true
- });
- if (p.isCancel(shouldInstall) || !shouldInstall) return false;
- const s = p.spinner();
- s.start("Installing Playwright globally...");
- try {
- await addDependency("playwright", { global: true });
- s.stop("Playwright installed successfully!");
- return true;
- } catch (fallbackError) {
- s.stop("Failed to install Playwright");
- p.log.error(`Installation failed: ${fallbackError}`);
- return false;
- }
- }
- async function ensurePlaywrightInstalled() {
- if (await checkPlaywrightInstallation()) return true;
- p.log.warn("Playwright driver selected but Playwright is not installed.");
- if (!await promptPlaywrightInstall()) {
- p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
- return false;
- }
- return true;
- }
- async function isUseChromeSupported() {
- try {
- const crawler = new PlaywrightCrawler({
- launchContext: { useChrome: true },
- requestHandler: async () => {},
- maxRequestsPerCrawl: 1
- });
- const page = await crawler.browserPool.newPage();
- await page.evaluate(() => {
- return window.navigator.userAgent;
- });
- await page.close();
- await crawler.browserPool.closeAllBrowsers();
- crawler.stop();
- return true;
- } catch {}
- return false;
- }
- //#endregion
  //#region src/cli.ts
  const packageJsonPath = join(dirname(fileURLToPath(import.meta.url)), "..", "package.json");
  const version = JSON.parse(readFileSync(packageJsonPath, "utf-8")).version;
@@ -225,11 +168,17 @@ async function interactiveCrawl() {
  skipSitemap: advancedOptions.skipSitemap
  };
  }
- async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
+ async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds, latency) {
  const messages = [];
  const durationStr = `${durationSeconds.toFixed(1)}s`;
- const stats = failed > 0 ? `${successful} pages, ${failed} failed` : `${successful} pages`;
- messages.push(`📄 ${stats} • ⏱️ ${durationStr}`);
+ messages.push(`📄 ${successful} pages \u00B7 ⏱️ ${durationStr}`);
+ if (failed > 0) messages.push(`⚠️ ${failed} failed`);
+ if (latency && latency.count > 0) {
+ const avg = Math.round(latency.total / latency.count);
+ const min = latency.min === Infinity ? 0 : Math.round(latency.min);
+ const max = Math.round(latency.max);
+ messages.push(`🏓 avg ${avg}ms \u00B7 min ${min}ms \u00B7 max ${max}ms`);
+ }
  messages.push(`📦 ${generatedFiles.join(", ")}`);
  messages.push(`📁 ${outputDir}`);
  p.note(messages.join("\n"), "✅ Complete");
@@ -431,36 +380,46 @@ async function main() {
  if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
  process.exit(1);
  }
- if (options.driver === "playwright") if (await isUseChromeSupported()) {
- options.useChrome = true;
- p.log.info("System Chrome detected and enabled.");
- } else {
- if (!await ensurePlaywrightInstalled()) {
- p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
+ if (options.driver === "playwright") {
+ try {
+ await import("crawlee");
+ } catch {
+ p.log.error("The Playwright driver requires crawlee. Install it with: npm install crawlee");
  process.exit(1);
  }
- p.log.info("Using global playwright instance.");
+ const { ensurePlaywrightInstalled, isUseChromeSupported } = await import("./_chunks/playwright-utils.mjs");
+ if (await isUseChromeSupported()) {
+ options.useChrome = true;
+ p.log.info("System Chrome detected and enabled.");
+ } else {
+ if (!await ensurePlaywrightInstalled()) {
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
+ process.exit(1);
+ }
+ p.log.info("Using global playwright instance.");
+ }
  }
  const s = p.spinner();
- s.start("Starting crawl...");
+ s.start("Discovering sitemaps");
  const startTime = Date.now();
+ let crawlStartTime = 0;
+ let lastProgress;
  const results = await crawlAndGenerate(options, (progress) => {
- if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
+ lastProgress = progress;
+ if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps");
  else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
  else if (progress.crawling.status === "processing") {
- const processedCount = progress.crawling.processed;
- const totalCount = progress.crawling.total;
- const currentUrl = progress.crawling.currentUrl;
- if (currentUrl) {
- const shortUrl = currentUrl.length > 60 ? `${currentUrl.substring(0, 57)}...` : currentUrl;
- if (processedCount > totalCount) s.message(`Crawling ${processedCount}: ${shortUrl}`);
- else s.message(`Crawling ${processedCount}/${totalCount}: ${shortUrl}`);
- } else if (processedCount > totalCount) s.message(`Crawling... ${processedCount} pages`);
- else s.message(`Crawling... ${processedCount}/${totalCount} pages`);
- } else if (progress.generation.status === "generating") {
- const current = progress.generation.current || "Generating files";
- s.message(current);
- }
+ if (!crawlStartTime) crawlStartTime = Date.now();
+ const processed = progress.crawling.processed;
+ const total = progress.crawling.total;
+ const failed = progress.crawling.failed;
+ const elapsed = (Date.now() - crawlStartTime) / 1e3;
+ const rate = elapsed > .1 ? Math.round(processed / elapsed) : 0;
+ let msg = processed > total ? `Crawling ${processed} pages` : `Crawling ${processed}/${total}`;
+ if (rate > 0) msg += ` \u00B7 ${rate}/s`;
+ if (failed > 0) msg += ` \u00B7 ${failed} failed`;
+ s.message(msg);
+ } else if (progress.generation.status === "generating") s.message(progress.generation.current || "Generating files");
  });
  s.stop();
  const durationSeconds = (Date.now() - startTime) / 1e3;
@@ -484,7 +443,7 @@ async function main() {
  if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
  if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
  }
- await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
+ await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds, lastProgress?.crawling.latency);
  process.exit(0);
  }
  main().catch((error) => {
package/dist/index.d.mts CHANGED
@@ -63,7 +63,14 @@ interface CrawlProgress {
  status: 'starting' | 'processing' | 'completed';
  total: number;
  processed: number;
- currentUrl?: string;
+ failed: number;
+ currentUrl?: string; /** Page fetch latency stats in ms */
+ latency: {
+ total: number;
+ min: number;
+ max: number;
+ count: number;
+ };
  };
  generation: {
  status: 'idle' | 'generating' | 'completed';
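To illustrate the new `failed` and `latency` fields added to `CrawlProgress` above, here is a rough sketch of reading them from the progress callback, summarising the counters the same way the bundled CLI does (avg = total / count). It assumes `crawlAndGenerate` is exported from the package root and accepts the progress callback as its second argument, as in the dist bundle earlier in this diff; treat it as illustrative only.

```ts
import { crawlAndGenerate } from '@mdream/crawl'

// Illustrative sketch: report running fetch-latency stats during a crawl.
const results = await crawlAndGenerate(
  { urls: ['https://example.com'], outputDir: './output' },
  (progress) => {
    const { count, total, min, max } = progress.crawling.latency
    if (count > 0) {
      const avg = Math.round(total / count)
      console.log(`fetch latency: avg ${avg}ms, min ${Math.round(min)}ms, max ${Math.round(max)}ms`)
    }
  },
)
console.log(`${results.length} pages processed`)
```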
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@mdream/crawl",
  "type": "module",
- "version": "1.0.0-beta.10",
+ "version": "1.0.0-beta.11",
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
  "author": {
  "name": "Harlan Wilton",
@@ -42,22 +42,26 @@
  "dist"
  ],
  "peerDependencies": {
+ "crawlee": "^3.16.0",
  "playwright": "^1.53.2"
  },
  "peerDependenciesMeta": {
+ "crawlee": {
+ "optional": true
+ },
  "playwright": {
  "optional": true
  }
  },
  "dependencies": {
  "@clack/prompts": "^1.1.0",
- "crawlee": "^3.16.0",
  "nypm": "^0.6.5",
+ "ofetch": "^1.5.1",
  "pathe": "^2.0.3",
  "picomatch": "^4.0.3",
  "ufo": "^1.6.3",
- "mdream": "1.0.0-beta.10",
- "@mdream/js": "1.0.0-beta.10"
+ "@mdream/js": "1.0.0-beta.11",
+ "mdream": "1.0.0-beta.11"
  },
  "devDependencies": {
  "@types/picomatch": "^4.0.2"