@mdream/crawl 1.0.0-beta.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
+ import * as p from "@clack/prompts";
2
+ import { addDependency } from "nypm";
3
+ //#region src/playwright-utils.ts
4
+ async function checkPlaywrightInstallation() {
5
+ try {
6
+ await import("playwright");
7
+ return true;
8
+ } catch {
9
+ return false;
10
+ }
11
+ }
12
+ async function promptPlaywrightInstall() {
13
+ const shouldInstall = await p.confirm({
14
+ message: "Playwright is required for the Playwright driver. Install it now?",
15
+ initialValue: true
16
+ });
17
+ if (p.isCancel(shouldInstall) || !shouldInstall) return false;
18
+ const s = p.spinner();
19
+ s.start("Installing Playwright globally...");
20
+ try {
21
+ await addDependency("playwright", { global: true });
22
+ s.stop("Playwright installed successfully!");
23
+ return true;
24
+ } catch (fallbackError) {
25
+ s.stop("Failed to install Playwright");
26
+ p.log.error(`Installation failed: ${fallbackError}`);
27
+ return false;
28
+ }
29
+ }
30
+ async function ensurePlaywrightInstalled() {
31
+ if (await checkPlaywrightInstallation()) return true;
32
+ p.log.warn("Playwright driver selected but Playwright is not installed.");
33
+ if (!await promptPlaywrightInstall()) {
34
+ p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
35
+ return false;
36
+ }
37
+ return true;
38
+ }
39
+ async function isUseChromeSupported() {
40
+ try {
41
+ const { PlaywrightCrawler } = await import("crawlee");
42
+ const crawler = new PlaywrightCrawler({
43
+ launchContext: { useChrome: true },
44
+ requestHandler: async () => {},
45
+ maxRequestsPerCrawl: 1
46
+ });
47
+ const page = await crawler.browserPool.newPage();
48
+ await page.evaluate(() => {
49
+ return window.navigator.userAgent;
50
+ });
51
+ await page.close();
52
+ await crawler.browserPool.closeAllBrowsers();
53
+ crawler.stop();
54
+ return true;
55
+ } catch {}
56
+ return false;
57
+ }
58
+ //#endregion
59
+ export { ensurePlaywrightInstalled, isUseChromeSupported };
package/dist/cli.mjs CHANGED
@@ -1,64 +1,17 @@
1
1
  import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
2
  import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
3
3
  import * as p from "@clack/prompts";
4
- import { PlaywrightCrawler } from "crawlee";
5
4
  import { dirname, join, resolve } from "pathe";
6
5
  import { withHttps } from "ufo";
7
6
  import { fileURLToPath } from "node:url";
8
- import { addDependency } from "nypm";
9
- //#region src/playwright-utils.ts
10
- async function checkPlaywrightInstallation() {
11
- try {
12
- await import("playwright");
13
- return true;
14
- } catch {
15
- return false;
16
- }
17
- }
18
- async function promptPlaywrightInstall() {
19
- const shouldInstall = await p.confirm({
20
- message: "Playwright is required for the Playwright driver. Install it now?",
21
- initialValue: true
7
+ import { loadConfig } from "c12";
8
+ //#region src/config.ts
9
+ async function loadMdreamConfig(cwd) {
10
+ const { config } = await loadConfig({
11
+ name: "mdream",
12
+ cwd
22
13
  });
23
- if (p.isCancel(shouldInstall) || !shouldInstall) return false;
24
- const s = p.spinner();
25
- s.start("Installing Playwright globally...");
26
- try {
27
- await addDependency("playwright", { global: true });
28
- s.stop("Playwright installed successfully!");
29
- return true;
30
- } catch (fallbackError) {
31
- s.stop("Failed to install Playwright");
32
- p.log.error(`Installation failed: ${fallbackError}`);
33
- return false;
34
- }
35
- }
36
- async function ensurePlaywrightInstalled() {
37
- if (await checkPlaywrightInstallation()) return true;
38
- p.log.warn("Playwright driver selected but Playwright is not installed.");
39
- if (!await promptPlaywrightInstall()) {
40
- p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
41
- return false;
42
- }
43
- return true;
44
- }
45
- async function isUseChromeSupported() {
46
- try {
47
- const crawler = new PlaywrightCrawler({
48
- launchContext: { useChrome: true },
49
- requestHandler: async () => {},
50
- maxRequestsPerCrawl: 1
51
- });
52
- const page = await crawler.browserPool.newPage();
53
- await page.evaluate(() => {
54
- return window.navigator.userAgent;
55
- });
56
- await page.close();
57
- await crawler.browserPool.closeAllBrowsers();
58
- crawler.stop();
59
- return true;
60
- } catch {}
61
- return false;
14
+ return config || {};
62
15
  }
63
16
  //#endregion
64
17
  //#region src/cli.ts
@@ -225,11 +178,17 @@ async function interactiveCrawl() {
225
178
  skipSitemap: advancedOptions.skipSitemap
226
179
  };
227
180
  }
228
- async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
181
+ async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds, latency) {
229
182
  const messages = [];
230
183
  const durationStr = `${durationSeconds.toFixed(1)}s`;
231
- const stats = failed > 0 ? `${successful} pages, ${failed} failed` : `${successful} pages`;
232
- messages.push(`📄 ${stats} • ⏱️ ${durationStr}`);
184
+ messages.push(`📄 ${successful} pages \u00B7 ⏱️ ${durationStr}`);
185
+ if (failed > 0) messages.push(`⚠️ ${failed} failed`);
186
+ if (latency && latency.count > 0) {
187
+ const avg = Math.round(latency.total / latency.count);
188
+ const min = latency.min === Infinity ? 0 : Math.round(latency.min);
189
+ const max = Math.round(latency.max);
190
+ messages.push(`🏓 avg ${avg}ms \u00B7 min ${min}ms \u00B7 max ${max}ms`);
191
+ }
233
192
  messages.push(`📦 ${generatedFiles.join(", ")}`);
234
193
  messages.push(`📁 ${outputDir}`);
235
194
  p.note(messages.join("\n"), "✅ Complete");
@@ -249,7 +208,8 @@ Usage:
249
208
  Options:
250
209
  -u, --url <url> Website URL to crawl
251
210
  -o, --output <dir> Output directory (default: output)
252
- -d, --depth <number> Crawl depth (default: 3)
211
+ -d, --depth <number> Crawl depth, 0 for single page (default: 3)
212
+ --single-page Only process the given URL(s), no crawling (alias for --depth 0)
253
213
  --driver <http|playwright> Crawler driver (default: http)
254
214
  --artifacts <list> Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
255
215
  --origin <url> Origin URL for resolving relative paths (overrides auto-detection)
@@ -259,6 +219,7 @@ Options:
259
219
  --crawl-delay <seconds> Crawl delay in seconds
260
220
  --exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
261
221
  --skip-sitemap Skip sitemap.xml and robots.txt discovery
222
+ --allow-subdomains Crawl across subdomains of the same root domain
262
223
  -v, --verbose Enable verbose logging
263
224
  -h, --help Show this help message
264
225
  --version Show version number
@@ -271,6 +232,7 @@ Examples:
271
232
  @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
272
233
  @mdream/crawl -u example.com --verbose
273
234
  @mdream/crawl -u example.com --skip-sitemap
235
+ @mdream/crawl -u example.com --driver playwright --single-page
274
236
  `);
275
237
  process.exit(0);
276
238
  }
@@ -324,10 +286,10 @@ Examples:
324
286
  process.exit(1);
325
287
  }
326
288
  }
327
- const depthStr = getArgValue("--depth") || getArgValue("-d") || "3";
328
- const depth = Number.parseInt(depthStr);
329
- if (Number.isNaN(depth) || depth < 1 || depth > 10) {
330
- p.log.error("Error: Depth must be between 1 and 10");
289
+ const depthStr = args.includes("--single-page") ? "0" : getArgValue("--depth") || getArgValue("-d") || "3";
290
+ const depth = Number(depthStr);
291
+ if (!Number.isInteger(depth) || depth < 0 || depth > 10) {
292
+ p.log.error("Error: Depth must be an integer between 0 and 10");
331
293
  process.exit(1);
332
294
  }
333
295
  const driver = getArgValue("--driver");
@@ -381,13 +343,14 @@ Examples:
381
343
  const patterns = [parsed];
382
344
  const verbose = args.includes("--verbose") || args.includes("-v");
383
345
  const skipSitemap = args.includes("--skip-sitemap");
346
+ const allowSubdomains = args.includes("--allow-subdomains");
384
347
  if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
385
348
  return {
386
349
  urls: [url],
387
350
  outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
388
351
  driver: driver || "http",
389
352
  maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
390
- followLinks: true,
353
+ followLinks: depth > 0,
391
354
  maxDepth: depth,
392
355
  generateLlmsTxt: artifacts.includes("llms.txt"),
393
356
  generateLlmsFullTxt: artifacts.includes("llms-full.txt"),
@@ -399,14 +362,28 @@ Examples:
399
362
  crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
400
363
  exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
401
364
  verbose,
402
- skipSitemap
365
+ skipSitemap,
366
+ allowSubdomains
403
367
  };
404
368
  }
405
369
  async function main() {
406
370
  const cliOptions = parseCliArgs();
371
+ const fileConfig = await loadMdreamConfig();
407
372
  let options;
408
373
  if (cliOptions) {
409
- options = cliOptions;
374
+ const configExclude = fileConfig.exclude || [];
375
+ const cliExclude = cliOptions.exclude || [];
376
+ options = {
377
+ ...cliOptions,
378
+ driver: cliOptions.driver || fileConfig.driver || "http",
379
+ maxDepth: cliOptions.maxDepth ?? fileConfig.maxDepth,
380
+ crawlDelay: cliOptions.crawlDelay ?? fileConfig.crawlDelay,
381
+ skipSitemap: cliOptions.skipSitemap || fileConfig.skipSitemap || false,
382
+ allowSubdomains: cliOptions.allowSubdomains || fileConfig.allowSubdomains || false,
383
+ verbose: cliOptions.verbose || fileConfig.verbose || false,
384
+ exclude: configExclude.length > 0 || cliExclude.length > 0 ? [...configExclude, ...cliExclude] : void 0,
385
+ hooks: fileConfig.hooks
386
+ };
410
387
  p.intro(`☁️ mdream v${version}`);
411
388
  const formats = [];
412
389
  if (options.generateLlmsTxt) formats.push("llms.txt");
@@ -420,6 +397,7 @@ async function main() {
420
397
  `Formats: ${formats.join(", ")}`,
421
398
  options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
422
399
  options.skipSitemap && `Skip sitemap: Yes`,
400
+ options.allowSubdomains && `Allow subdomains: Yes`,
423
401
  options.verbose && `Verbose: Enabled`
424
402
  ].filter(Boolean);
425
403
  p.note(summary.join("\n"), "Configuration");
@@ -431,36 +409,46 @@ async function main() {
431
409
  if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
432
410
  process.exit(1);
433
411
  }
434
- if (options.driver === "playwright") if (await isUseChromeSupported()) {
435
- options.useChrome = true;
436
- p.log.info("System Chrome detected and enabled.");
437
- } else {
438
- if (!await ensurePlaywrightInstalled()) {
439
- p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
412
+ if (options.driver === "playwright") {
413
+ try {
414
+ await import("crawlee");
415
+ } catch {
416
+ p.log.error("The Playwright driver requires crawlee. Install it with: npm install crawlee");
440
417
  process.exit(1);
441
418
  }
442
- p.log.info("Using global playwright instance.");
419
+ const { ensurePlaywrightInstalled, isUseChromeSupported } = await import("./_chunks/playwright-utils.mjs");
420
+ if (await isUseChromeSupported()) {
421
+ options.useChrome = true;
422
+ p.log.info("System Chrome detected and enabled.");
423
+ } else {
424
+ if (!await ensurePlaywrightInstalled()) {
425
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
426
+ process.exit(1);
427
+ }
428
+ p.log.info("Using global playwright instance.");
429
+ }
443
430
  }
444
431
  const s = p.spinner();
445
- s.start("Starting crawl...");
432
+ s.start("Discovering sitemaps");
446
433
  const startTime = Date.now();
434
+ let crawlStartTime = 0;
435
+ let lastProgress;
447
436
  const results = await crawlAndGenerate(options, (progress) => {
448
- if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
437
+ lastProgress = progress;
438
+ if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps");
449
439
  else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
450
440
  else if (progress.crawling.status === "processing") {
451
- const processedCount = progress.crawling.processed;
452
- const totalCount = progress.crawling.total;
453
- const currentUrl = progress.crawling.currentUrl;
454
- if (currentUrl) {
455
- const shortUrl = currentUrl.length > 60 ? `${currentUrl.substring(0, 57)}...` : currentUrl;
456
- if (processedCount > totalCount) s.message(`Crawling ${processedCount}: ${shortUrl}`);
457
- else s.message(`Crawling ${processedCount}/${totalCount}: ${shortUrl}`);
458
- } else if (processedCount > totalCount) s.message(`Crawling... ${processedCount} pages`);
459
- else s.message(`Crawling... ${processedCount}/${totalCount} pages`);
460
- } else if (progress.generation.status === "generating") {
461
- const current = progress.generation.current || "Generating files";
462
- s.message(current);
463
- }
441
+ if (!crawlStartTime) crawlStartTime = Date.now();
442
+ const processed = progress.crawling.processed;
443
+ const total = progress.crawling.total;
444
+ const failed = progress.crawling.failed;
445
+ const elapsed = (Date.now() - crawlStartTime) / 1e3;
446
+ const rate = elapsed > .1 ? Math.round(processed / elapsed) : 0;
447
+ let msg = processed > total ? `Crawling ${processed} pages` : `Crawling ${processed}/${total}`;
448
+ if (rate > 0) msg += ` \u00B7 ${rate}/s`;
449
+ if (failed > 0) msg += ` \u00B7 ${failed} failed`;
450
+ s.message(msg);
451
+ } else if (progress.generation.status === "generating") s.message(progress.generation.current || "Generating files");
464
452
  });
465
453
  s.stop();
466
454
  const durationSeconds = (Date.now() - startTime) / 1e3;
@@ -484,11 +472,13 @@ async function main() {
484
472
  if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
485
473
  if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
486
474
  }
487
- await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
475
+ await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds, lastProgress?.crawling.latency);
488
476
  process.exit(0);
489
477
  }
490
478
  main().catch((error) => {
491
- p.log.error(`Unexpected error: ${error}`);
479
+ const msg = error instanceof Error ? error.message : String(error);
480
+ if (msg.includes("wmic") || msg.includes("ENOENT") && process.platform === "win32") p.log.error("Crawlee failed because wmic.exe is not available on this system. Windows 11 removed wmic.exe, which older crawlee versions depend on for memory monitoring.\nFix: upgrade crawlee to >=3.16.0 or switch to the HTTP driver (--driver http).");
481
+ else p.log.error(`Unexpected error: ${msg}`);
492
482
  process.exit(1);
493
483
  });
494
484
  //#endregion
package/dist/index.d.mts CHANGED
@@ -6,6 +6,22 @@ interface PageData {
6
6
  metadata: PageMetadata;
7
7
  origin: string;
8
8
  }
9
+ interface CrawlHooks {
10
+ 'crawl:url': (ctx: {
11
+ url: string;
12
+ skip: boolean;
13
+ }) => void | Promise<void>;
14
+ 'crawl:page': (page: PageData) => void | Promise<void>;
15
+ 'crawl:content': (ctx: {
16
+ url: string;
17
+ title: string;
18
+ content: string;
19
+ filePath: string;
20
+ }) => void | Promise<void>;
21
+ 'crawl:done': (ctx: {
22
+ results: CrawlResult[];
23
+ }) => void | Promise<void>;
24
+ }
9
25
  interface CrawlOptions {
10
26
  urls: string[];
11
27
  outputDir: string;
@@ -26,8 +42,23 @@ interface CrawlOptions {
26
42
  descriptionOverride?: string;
27
43
  verbose?: boolean;
28
44
  skipSitemap?: boolean;
45
+ allowSubdomains?: boolean;
46
+ hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
29
47
  onPage?: (page: PageData) => Promise<void> | void;
30
48
  }
49
+ interface MdreamCrawlConfig {
50
+ exclude?: string[];
51
+ driver?: 'http' | 'playwright';
52
+ maxDepth?: number;
53
+ maxPages?: number;
54
+ crawlDelay?: number;
55
+ skipSitemap?: boolean;
56
+ allowSubdomains?: boolean;
57
+ verbose?: boolean;
58
+ artifacts?: ('llms.txt' | 'llms-full.txt' | 'markdown')[];
59
+ hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
60
+ }
61
+ declare function defineConfig(config: MdreamCrawlConfig): MdreamCrawlConfig;
31
62
  interface ParsedUrlPattern {
32
63
  baseUrl: string;
33
64
  pattern: string;
@@ -63,7 +94,14 @@ interface CrawlProgress {
63
94
  status: 'starting' | 'processing' | 'completed';
64
95
  total: number;
65
96
  processed: number;
66
- currentUrl?: string;
97
+ failed: number;
98
+ currentUrl?: string; /** Page fetch latency stats in ms */
99
+ latency: {
100
+ total: number;
101
+ min: number;
102
+ max: number;
103
+ count: number;
104
+ };
67
105
  };
68
106
  generation: {
69
107
  status: 'idle' | 'generating' | 'completed';
@@ -72,4 +110,4 @@ interface CrawlProgress {
72
110
  }
73
111
  declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
74
112
  //#endregion
75
- export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
113
+ export { type CrawlHooks, type CrawlOptions, type CrawlResult, type MdreamCrawlConfig, type PageData, crawlAndGenerate, defineConfig };
package/dist/index.mjs CHANGED
@@ -1,2 +1,7 @@
1
1
  import { t as crawlAndGenerate } from "./_chunks/crawl.mjs";
2
- export { crawlAndGenerate };
2
+ //#region src/types.ts
3
+ function defineConfig(config) {
4
+ return config;
5
+ }
6
+ //#endregion
7
+ export { crawlAndGenerate, defineConfig };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "1.0.0-beta.9",
4
+ "version": "1.0.1",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -42,22 +42,29 @@
42
42
  "dist"
43
43
  ],
44
44
  "peerDependencies": {
45
+ "crawlee": "^3.16.0",
45
46
  "playwright": "^1.53.2"
46
47
  },
47
48
  "peerDependenciesMeta": {
49
+ "crawlee": {
50
+ "optional": true
51
+ },
48
52
  "playwright": {
49
53
  "optional": true
50
54
  }
51
55
  },
52
56
  "dependencies": {
53
57
  "@clack/prompts": "^1.1.0",
54
- "crawlee": "^3.16.0",
58
+ "c12": "^3.0.4",
59
+ "hookable": "^5.5.3",
55
60
  "nypm": "^0.6.5",
61
+ "ofetch": "^1.5.1",
56
62
  "pathe": "^2.0.3",
57
63
  "picomatch": "^4.0.3",
64
+ "tldts": "^7.0.26",
58
65
  "ufo": "^1.6.3",
59
- "@mdream/js": "1.0.0-beta.9",
60
- "mdream": "1.0.0-beta.9"
66
+ "@mdream/js": "1.0.1",
67
+ "mdream": "1.0.1"
61
68
  },
62
69
  "devDependencies": {
63
70
  "@types/picomatch": "^4.0.2"