@mdream/crawl 0.8.2 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -194,7 +194,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
194
194
  return urls;
195
195
  }
196
196
  async function crawlAndGenerate(options, onProgress) {
197
- const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
197
+ const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
198
198
  const outputDir = resolve(normalize(rawOutputDir));
199
199
  if (verbose) log.setLevel(log.LEVELS.INFO);
200
200
  else log.setLevel(log.LEVELS.OFF);
@@ -449,8 +449,14 @@ async function crawlAndGenerate(options, onProgress) {
449
449
  respectRobotsTxtFile: true
450
450
  };
451
451
  if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
452
- if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
453
- else crawler = new HttpCrawler(crawlerOptions);
452
+ if (driver === "playwright") {
453
+ const playwrightOptions = crawlerOptions;
454
+ if (useChrome) playwrightOptions.launchContext = {
455
+ ...playwrightOptions.launchContext,
456
+ useChrome
457
+ };
458
+ crawler = new PlaywrightCrawler(playwrightOptions);
459
+ } else crawler = new HttpCrawler(crawlerOptions);
454
460
  const initialRequests = startingUrls.map((url) => ({
455
461
  url,
456
462
  userData: { depth: 0 }
package/dist/cli.mjs CHANGED
@@ -1,7 +1,8 @@
1
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-BwURA9nQ.mjs";
1
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-DYXGzu7W.mjs";
2
2
  import { readFileSync } from "node:fs";
3
3
  import * as p$1 from "@clack/prompts";
4
4
  import * as p from "@clack/prompts";
5
+ import { PlaywrightCrawler } from "crawlee";
5
6
  import { dirname, join, resolve } from "pathe";
6
7
  import { fileURLToPath } from "node:url";
7
8
  import { addDependency } from "nypm";
@@ -22,21 +23,15 @@ async function promptPlaywrightInstall() {
22
23
  });
23
24
  if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
24
25
  const s = p$1.spinner();
25
- s.start("Installing Playwright...");
26
+ s.start("Installing Playwright globally...");
26
27
  try {
27
- await addDependency("playwright", { workspace: true });
28
+ await addDependency("playwright", { global: true });
28
29
  s.stop("Playwright installed successfully!");
29
30
  return true;
30
- } catch {
31
- try {
32
- await addDependency("playwright");
33
- s.stop("Playwright installed successfully!");
34
- return true;
35
- } catch (fallbackError) {
36
- s.stop("Failed to install Playwright");
37
- p$1.log.error(`Installation failed: ${fallbackError}`);
38
- return false;
39
- }
31
+ } catch (fallbackError) {
32
+ s.stop("Failed to install Playwright");
33
+ p$1.log.error(`Installation failed: ${fallbackError}`);
34
+ return false;
40
35
  }
41
36
  }
42
37
  async function ensurePlaywrightInstalled() {
@@ -50,6 +45,24 @@ async function ensurePlaywrightInstalled() {
50
45
  }
51
46
  return true;
52
47
  }
48
+ async function isUseChromeSupported() {
49
+ try {
50
+ const crawler = new PlaywrightCrawler({
51
+ launchContext: { useChrome: true },
52
+ requestHandler: async () => {},
53
+ maxRequestsPerCrawl: 1
54
+ });
55
+ const page = await crawler.browserPool.newPage();
56
+ await page.evaluate(() => {
57
+ return window.navigator.userAgent;
58
+ });
59
+ await page.close();
60
+ await crawler.browserPool.closeAllBrowsers();
61
+ crawler.stop();
62
+ return true;
63
+ } catch {}
64
+ return false;
65
+ }
53
66
 
54
67
  //#endregion
55
68
  //#region src/cli.ts
@@ -400,10 +413,17 @@ async function main() {
400
413
  } else options = await interactiveCrawl();
401
414
  if (!options) process.exit(0);
402
415
  if (options.driver === "playwright") {
403
- const playwrightInstalled = await ensurePlaywrightInstalled();
404
- if (!playwrightInstalled) {
405
- p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
406
- process.exit(1);
416
+ const chromeSupported = await isUseChromeSupported();
417
+ if (chromeSupported) {
418
+ options.useChrome = true;
419
+ p.log.info("System Chrome detected and enabled.");
420
+ } else {
421
+ const playwrightInstalled = await ensurePlaywrightInstalled();
422
+ if (!playwrightInstalled) {
423
+ p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
424
+ process.exit(1);
425
+ }
426
+ p.log.info("Using global playwright instance.");
407
427
  }
408
428
  }
409
429
  const s = p.spinner();
@@ -451,11 +471,8 @@ async function main() {
451
471
  if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
452
472
  if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
453
473
  }
454
- if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
455
- else {
456
- await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
457
- if (successful === 0) process.exit(1);
458
- }
474
+ await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
475
+ process.exit(0);
459
476
  }
460
477
  main().catch((error) => {
461
478
  p.log.error(`Unexpected error: ${error}`);
package/dist/index.d.mts CHANGED
@@ -9,6 +9,7 @@ interface CrawlOptions {
9
9
  origin?: string;
10
10
  chunkSize?: number;
11
11
  driver?: 'http' | 'playwright';
12
+ useChrome?: boolean;
12
13
  followLinks?: boolean;
13
14
  maxDepth?: number;
14
15
  globPatterns?: ParsedUrlPattern[];
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { crawlAndGenerate } from "./_chunks/crawl-BwURA9nQ.mjs";
1
+ import { crawlAndGenerate } from "./_chunks/crawl-DYXGzu7W.mjs";
2
2
  import { writeFile } from "node:fs/promises";
3
3
  import { basename, sep } from "pathe";
4
4
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "0.8.2",
4
+ "version": "0.8.4",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -50,7 +50,7 @@
50
50
  "nypm": "^0.6.0",
51
51
  "pathe": "^2.0.3",
52
52
  "picomatch": "^4.0.3",
53
- "mdream": "0.8.2"
53
+ "mdream": "0.8.4"
54
54
  },
55
55
  "devDependencies": {
56
56
  "@types/picomatch": "^4.0.1"