@mdream/crawl 0.8.3 → 0.8.5

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -5,24 +5,10 @@ import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
  import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
  import { withMinimalPreset } from "mdream/preset/minimal";
  import { dirname, join, normalize, resolve } from "pathe";
+ import { withHttps } from "ufo";
  import picomatch from "picomatch";
  import { extractionPlugin } from "mdream/plugins";

- //#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
- const r = String.fromCharCode;
- const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
- function withHttps(input) {
-   return withProtocol(input, "https://");
- }
- function withProtocol(input, protocol) {
-   let match = input.match(PROTOCOL_REGEX);
-   if (!match) match = input.match(/^\/{2,}/);
-   if (!match) return protocol + input;
-   return protocol + input.slice(match[0].length);
- }
- const protocolRelative = Symbol.for("ufo:protocolRelative");
-
- //#endregion
  //#region src/glob-utils.ts
  /**
   * Parse a URL that may contain glob patterns
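In 0.8.3 this chunk carried an inlined copy of ufo's `withHttps`; 0.8.5 imports it from `ufo` directly (now a declared dependency, see the `package.json` hunk at the end). The removed code documents the behavior: prepend `https://` when the input has no protocol, otherwise replace the existing protocol or protocol-relative `//` prefix. A minimal illustration, assuming the published `ufo` export behaves like the vendored copy above:

```ts
import { withHttps } from "ufo";

withHttps("example.com");        // "https://example.com"  (no protocol: prefixed)
withHttps("http://example.com"); // "https://example.com"  (protocol replaced)
withHttps("//example.com");      // "https://example.com"  (protocol-relative resolved)
```

Note that `withHttps` also disappears from this chunk's export list (final hunk of this file), so consumers must import it from `ufo`, as the CLI now does.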
@@ -194,7 +180,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
    return urls;
  }
  async function crawlAndGenerate(options, onProgress) {
-   const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
+   const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
    const outputDir = resolve(normalize(rawOutputDir));
    if (verbose) log.setLevel(log.LEVELS.INFO);
    else log.setLevel(log.LEVELS.OFF);
@@ -449,8 +435,14 @@ async function crawlAndGenerate(options, onProgress) {
      respectRobotsTxtFile: true
    };
    if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
-   if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
-   else crawler = new HttpCrawler(crawlerOptions);
+   if (driver === "playwright") {
+     const playwrightOptions = crawlerOptions;
+     if (useChrome) playwrightOptions.launchContext = {
+       ...playwrightOptions.launchContext,
+       useChrome
+     };
+     crawler = new PlaywrightCrawler(playwrightOptions);
+   } else crawler = new HttpCrawler(crawlerOptions);
    const initialRequests = startingUrls.map((url) => ({
      url,
      userData: { depth: 0 }
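With the new `useChrome` option (destructured in the previous hunk), the Playwright branch merges the flag into Crawlee's `launchContext`, which tells Playwright to launch the system Chrome rather than its bundled Chromium. A caller-side sketch, assuming the package entry re-exports `crawlAndGenerate`; the option names come from the destructuring above:

```ts
import { crawlAndGenerate } from "@mdream/crawl";

await crawlAndGenerate({
  urls: ["example.com"],
  outputDir: "./output",
  driver: "playwright",
  useChrome: true, // forwarded into PlaywrightCrawler's launchContext
});
```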
@@ -461,14 +453,14 @@ async function crawlAndGenerate(options, onProgress) {
    await crawler.run(initialRequests);
    progress.crawling.status = "completed";
    onProgress?.(progress);
-   if (results.some((r$1) => r$1.success)) {
+   if (results.some((r) => r.success)) {
      progress.generation.status = "generating";
      onProgress?.(progress);
-     const successfulResults = results.filter((r$1) => r$1.success);
+     const successfulResults = results.filter((r) => r.success);
      const firstUrl = new URL(withHttps(urls[0]));
      const origin$1 = firstUrl.origin;
-     const homePageResult = successfulResults.find((r$1) => {
-       const resultUrl = new URL(withHttps(r$1.url));
+     const homePageResult = successfulResults.find((r) => {
+       const resultUrl = new URL(withHttps(r.url));
        return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
      });
      const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
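This hunk only renames the callback parameter `r$1` back to `r`, which the bundler had suffixed in 0.8.3 to avoid colliding with the vendored ufo declaration `const r = String.fromCharCode`. The home-page match itself is unchanged: normalize with `withHttps`, then compare against the origin with and without a trailing slash. The two comparisons exist because WHATWG `URL` adds a trailing slash on the root path:

```ts
new URL(withHttps("example.com/docs")).origin; // "https://example.com"
new URL(withHttps("example.com")).href;        // "https://example.com/" (note the slash)
```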
@@ -522,4 +514,4 @@ async function crawlAndGenerate(options, onProgress) {
  }

  //#endregion
- export { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps };
+ export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
package/dist/cli.mjs CHANGED
@@ -1,8 +1,10 @@
- import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-BwURA9nQ.mjs";
+ import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
  import { readFileSync } from "node:fs";
  import * as p$1 from "@clack/prompts";
  import * as p from "@clack/prompts";
+ import { PlaywrightCrawler } from "crawlee";
  import { dirname, join, resolve } from "pathe";
+ import { withHttps } from "ufo";
  import { fileURLToPath } from "node:url";
  import { addDependency } from "nypm";

@@ -22,21 +24,15 @@ async function promptPlaywrightInstall() {
    });
    if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
    const s = p$1.spinner();
-   s.start("Installing Playwright...");
+   s.start("Installing Playwright globally...");
    try {
-     await addDependency("playwright", { workspace: true });
+     await addDependency("playwright", { global: true });
      s.stop("Playwright installed successfully!");
      return true;
-   } catch {
-     try {
-       await addDependency("playwright");
-       s.stop("Playwright installed successfully!");
-       return true;
-     } catch (fallbackError) {
-       s.stop("Failed to install Playwright");
-       p$1.log.error(`Installation failed: ${fallbackError}`);
-       return false;
-     }
+   } catch (fallbackError) {
+     s.stop("Failed to install Playwright");
+     p$1.log.error(`Installation failed: ${fallbackError}`);
+     return false;
    }
  }
  async function ensurePlaywrightInstalled() {
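The 0.8.3 flow attempted a workspace install of Playwright and fell back to a plain local install on failure; 0.8.5 does a single global install. For reference, the call in isolation (nypm resolves the detected package manager, so under npm this is roughly `npm install -g playwright`):

```ts
import { addDependency } from "nypm";

// Matches the install path in the hunk above; throws if installation fails.
await addDependency("playwright", { global: true });
```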
@@ -50,6 +46,24 @@ async function ensurePlaywrightInstalled() {
    }
    return true;
  }
+ async function isUseChromeSupported() {
+   try {
+     const crawler = new PlaywrightCrawler({
+       launchContext: { useChrome: true },
+       requestHandler: async () => {},
+       maxRequestsPerCrawl: 1
+     });
+     const page = await crawler.browserPool.newPage();
+     await page.evaluate(() => {
+       return window.navigator.userAgent;
+     });
+     await page.close();
+     await crawler.browserPool.closeAllBrowsers();
+     crawler.stop();
+     return true;
+   } catch {}
+   return false;
+ }

  //#endregion
  //#region src/cli.ts
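The new `isUseChromeSupported` probe detects a usable system Chrome by actually launching one: it builds a throwaway `PlaywrightCrawler` with `launchContext: { useChrome: true }`, opens a page through the browser pool, evaluates the user agent, then tears everything down; any throw is swallowed and treated as "no system Chrome". Conceptually it is equivalent to this standalone Playwright sketch (illustration only; `channel: "chrome"` is Playwright's own way to target the system Chrome install):

```ts
import { chromium } from "playwright";

async function hasSystemChrome(): Promise<boolean> {
  try {
    // Launching on the "chrome" channel fails when no system Chrome is installed.
    const browser = await chromium.launch({ channel: "chrome" });
    await browser.close();
    return true;
  } catch {
    return false;
  }
}
```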
@@ -400,10 +414,17 @@ async function main() {
    } else options = await interactiveCrawl();
    if (!options) process.exit(0);
    if (options.driver === "playwright") {
-     const playwrightInstalled = await ensurePlaywrightInstalled();
-     if (!playwrightInstalled) {
-       p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
-       process.exit(1);
+     const chromeSupported = await isUseChromeSupported();
+     if (chromeSupported) {
+       options.useChrome = true;
+       p.log.info("System Chrome detected and enabled.");
+     } else {
+       const playwrightInstalled = await ensurePlaywrightInstalled();
+       if (!playwrightInstalled) {
+         p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
+         process.exit(1);
+       }
+       p.log.info("Using global playwright instance.");
      }
    }
    const s = p.spinner();
@@ -451,11 +472,8 @@ async function main() {
      if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
      if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
    }
-   if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
-   else {
-     await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
-     if (successful === 0) process.exit(1);
-   }
+   await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
+   process.exit(0);
  }
  main().catch((error) => {
    p.log.error(`Unexpected error: ${error}`);
package/dist/index.d.mts CHANGED
@@ -9,6 +9,7 @@ interface CrawlOptions {
    origin?: string;
    chunkSize?: number;
    driver?: 'http' | 'playwright';
+   useChrome?: boolean;
    followLinks?: boolean;
    maxDepth?: number;
    globPatterns?: ParsedUrlPattern[];
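The declaration change makes `useChrome` part of the public `CrawlOptions` surface. A typed usage sketch, assuming `CrawlOptions` is exported from the package's types; `urls` and `outputDir` are taken from the destructuring in the chunk diff above:

```ts
import type { CrawlOptions } from "@mdream/crawl";

const options: CrawlOptions = {
  urls: ["https://example.com"],
  outputDir: "./output",
  driver: "playwright",
  useChrome: true, // only consulted when driver is "playwright"
};
```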
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
- import { crawlAndGenerate } from "./_chunks/crawl-BwURA9nQ.mjs";
+ import { crawlAndGenerate } from "./_chunks/crawl-BtuYX2_u.mjs";
  import { writeFile } from "node:fs/promises";
  import { basename, sep } from "pathe";

package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@mdream/crawl",
    "type": "module",
-   "version": "0.8.3",
+   "version": "0.8.5",
    "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
    "author": {
      "name": "Harlan Wilton",
@@ -50,7 +50,8 @@
    "nypm": "^0.6.0",
    "pathe": "^2.0.3",
    "picomatch": "^4.0.3",
-   "mdream": "0.8.3"
+   "ufo": "^1.6.1",
+   "mdream": "0.8.5"
  },
  "devDependencies": {
    "@types/picomatch": "^4.0.1"