@mdream/crawl 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -194,7 +194,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
194
194
|
return urls;
|
|
195
195
|
}
|
|
196
196
|
async function crawlAndGenerate(options, onProgress) {
|
|
197
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
|
|
197
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
|
|
198
198
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
199
199
|
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
200
200
|
else log.setLevel(log.LEVELS.OFF);
|
|
@@ -449,8 +449,14 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
449
449
|
respectRobotsTxtFile: true
|
|
450
450
|
};
|
|
451
451
|
if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
|
|
452
|
-
if (driver === "playwright")
|
|
453
|
-
|
|
452
|
+
if (driver === "playwright") {
|
|
453
|
+
const playwrightOptions = crawlerOptions;
|
|
454
|
+
if (useChrome) playwrightOptions.launchContext = {
|
|
455
|
+
...playwrightOptions.launchContext,
|
|
456
|
+
useChrome
|
|
457
|
+
};
|
|
458
|
+
crawler = new PlaywrightCrawler(playwrightOptions);
|
|
459
|
+
} else crawler = new HttpCrawler(crawlerOptions);
|
|
454
460
|
const initialRequests = startingUrls.map((url) => ({
|
|
455
461
|
url,
|
|
456
462
|
userData: { depth: 0 }
|
package/dist/cli.mjs
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-DYXGzu7W.mjs";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
3
|
import * as p$1 from "@clack/prompts";
|
|
4
4
|
import * as p from "@clack/prompts";
|
|
5
|
+
import { PlaywrightCrawler } from "crawlee";
|
|
5
6
|
import { dirname, join, resolve } from "pathe";
|
|
6
7
|
import { fileURLToPath } from "node:url";
|
|
7
8
|
import { addDependency } from "nypm";
|
|
@@ -22,21 +23,15 @@ async function promptPlaywrightInstall() {
|
|
|
22
23
|
});
|
|
23
24
|
if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
24
25
|
const s = p$1.spinner();
|
|
25
|
-
s.start("Installing Playwright...");
|
|
26
|
+
s.start("Installing Playwright globally...");
|
|
26
27
|
try {
|
|
27
|
-
await addDependency("playwright", {
|
|
28
|
+
await addDependency("playwright", { global: true });
|
|
28
29
|
s.stop("Playwright installed successfully!");
|
|
29
30
|
return true;
|
|
30
|
-
} catch {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
return true;
|
|
35
|
-
} catch (fallbackError) {
|
|
36
|
-
s.stop("Failed to install Playwright");
|
|
37
|
-
p$1.log.error(`Installation failed: ${fallbackError}`);
|
|
38
|
-
return false;
|
|
39
|
-
}
|
|
31
|
+
} catch (fallbackError) {
|
|
32
|
+
s.stop("Failed to install Playwright");
|
|
33
|
+
p$1.log.error(`Installation failed: ${fallbackError}`);
|
|
34
|
+
return false;
|
|
40
35
|
}
|
|
41
36
|
}
|
|
42
37
|
async function ensurePlaywrightInstalled() {
|
|
@@ -50,6 +45,24 @@ async function ensurePlaywrightInstalled() {
|
|
|
50
45
|
}
|
|
51
46
|
return true;
|
|
52
47
|
}
|
|
48
|
+
async function isUseChromeSupported() {
|
|
49
|
+
try {
|
|
50
|
+
const crawler = new PlaywrightCrawler({
|
|
51
|
+
launchContext: { useChrome: true },
|
|
52
|
+
requestHandler: async () => {},
|
|
53
|
+
maxRequestsPerCrawl: 1
|
|
54
|
+
});
|
|
55
|
+
const page = await crawler.browserPool.newPage();
|
|
56
|
+
await page.evaluate(() => {
|
|
57
|
+
return window.navigator.userAgent;
|
|
58
|
+
});
|
|
59
|
+
await page.close();
|
|
60
|
+
await crawler.browserPool.closeAllBrowsers();
|
|
61
|
+
crawler.stop();
|
|
62
|
+
return true;
|
|
63
|
+
} catch {}
|
|
64
|
+
return false;
|
|
65
|
+
}
|
|
53
66
|
|
|
54
67
|
//#endregion
|
|
55
68
|
//#region src/cli.ts
|
|
@@ -400,10 +413,17 @@ async function main() {
|
|
|
400
413
|
} else options = await interactiveCrawl();
|
|
401
414
|
if (!options) process.exit(0);
|
|
402
415
|
if (options.driver === "playwright") {
|
|
403
|
-
const
|
|
404
|
-
if (
|
|
405
|
-
|
|
406
|
-
|
|
416
|
+
const chromeSupported = await isUseChromeSupported();
|
|
417
|
+
if (chromeSupported) {
|
|
418
|
+
options.useChrome = true;
|
|
419
|
+
p.log.info("System Chrome detected and enabled.");
|
|
420
|
+
} else {
|
|
421
|
+
const playwrightInstalled = await ensurePlaywrightInstalled();
|
|
422
|
+
if (!playwrightInstalled) {
|
|
423
|
+
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
424
|
+
process.exit(1);
|
|
425
|
+
}
|
|
426
|
+
p.log.info("Using global playwright instance.");
|
|
407
427
|
}
|
|
408
428
|
}
|
|
409
429
|
const s = p.spinner();
|
|
@@ -451,11 +471,8 @@ async function main() {
|
|
|
451
471
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
452
472
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
453
473
|
}
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
457
|
-
if (successful === 0) process.exit(1);
|
|
458
|
-
}
|
|
474
|
+
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
475
|
+
process.exit(0);
|
|
459
476
|
}
|
|
460
477
|
main().catch((error) => {
|
|
461
478
|
p.log.error(`Unexpected error: ${error}`);
|
package/dist/index.d.mts
CHANGED
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.8.3",
|
|
4
|
+
"version": "0.8.4",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
"nypm": "^0.6.0",
|
|
51
51
|
"pathe": "^2.0.3",
|
|
52
52
|
"picomatch": "^4.0.3",
|
|
53
|
-
"mdream": "0.8.3"
|
|
53
|
+
"mdream": "0.8.4"
|
|
54
54
|
},
|
|
55
55
|
"devDependencies": {
|
|
56
56
|
"@types/picomatch": "^4.0.1"
|