@mdream/crawl 0.8.3 → 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-BwURA9nQ.mjs → crawl-BtuYX2_u.mjs} +15 -23
- package/dist/cli.mjs +40 -22
- package/dist/index.d.mts +1 -0
- package/dist/index.mjs +1 -1
- package/package.json +3 -2
|
@@ -5,24 +5,10 @@ import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawl
|
|
|
5
5
|
import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
|
|
6
6
|
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
7
7
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
8
|
+
import { withHttps } from "ufo";
|
|
8
9
|
import picomatch from "picomatch";
|
|
9
10
|
import { extractionPlugin } from "mdream/plugins";
|
|
10
11
|
|
|
11
|
-
//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
|
|
12
|
-
const r = String.fromCharCode;
|
|
13
|
-
const PROTOCOL_REGEX = /^[\s\w\0+.-]{2,}:([/\\]{2})?/;
|
|
14
|
-
function withHttps(input) {
|
|
15
|
-
return withProtocol(input, "https://");
|
|
16
|
-
}
|
|
17
|
-
function withProtocol(input, protocol) {
|
|
18
|
-
let match = input.match(PROTOCOL_REGEX);
|
|
19
|
-
if (!match) match = input.match(/^\/{2,}/);
|
|
20
|
-
if (!match) return protocol + input;
|
|
21
|
-
return protocol + input.slice(match[0].length);
|
|
22
|
-
}
|
|
23
|
-
const protocolRelative = Symbol.for("ufo:protocolRelative");
|
|
24
|
-
|
|
25
|
-
//#endregion
|
|
26
12
|
//#region src/glob-utils.ts
|
|
27
13
|
/**
|
|
28
14
|
* Parse a URL that may contain glob patterns
|
|
@@ -194,7 +180,7 @@ async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
|
194
180
|
return urls;
|
|
195
181
|
}
|
|
196
182
|
async function crawlAndGenerate(options, onProgress) {
|
|
197
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
|
|
183
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
|
|
198
184
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
199
185
|
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
200
186
|
else log.setLevel(log.LEVELS.OFF);
|
|
@@ -449,8 +435,14 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
449
435
|
respectRobotsTxtFile: true
|
|
450
436
|
};
|
|
451
437
|
if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
|
|
452
|
-
if (driver === "playwright")
|
|
453
|
-
|
|
438
|
+
if (driver === "playwright") {
|
|
439
|
+
const playwrightOptions = crawlerOptions;
|
|
440
|
+
if (useChrome) playwrightOptions.launchContext = {
|
|
441
|
+
...playwrightOptions.launchContext,
|
|
442
|
+
useChrome
|
|
443
|
+
};
|
|
444
|
+
crawler = new PlaywrightCrawler(playwrightOptions);
|
|
445
|
+
} else crawler = new HttpCrawler(crawlerOptions);
|
|
454
446
|
const initialRequests = startingUrls.map((url) => ({
|
|
455
447
|
url,
|
|
456
448
|
userData: { depth: 0 }
|
|
@@ -461,14 +453,14 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
461
453
|
await crawler.run(initialRequests);
|
|
462
454
|
progress.crawling.status = "completed";
|
|
463
455
|
onProgress?.(progress);
|
|
464
|
-
if (results.some((r
|
|
456
|
+
if (results.some((r) => r.success)) {
|
|
465
457
|
progress.generation.status = "generating";
|
|
466
458
|
onProgress?.(progress);
|
|
467
|
-
const successfulResults = results.filter((r
|
|
459
|
+
const successfulResults = results.filter((r) => r.success);
|
|
468
460
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
469
461
|
const origin$1 = firstUrl.origin;
|
|
470
|
-
const homePageResult = successfulResults.find((r
|
|
471
|
-
const resultUrl = new URL(withHttps(r
|
|
462
|
+
const homePageResult = successfulResults.find((r) => {
|
|
463
|
+
const resultUrl = new URL(withHttps(r.url));
|
|
472
464
|
return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
|
|
473
465
|
});
|
|
474
466
|
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
@@ -522,4 +514,4 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
522
514
|
}
|
|
523
515
|
|
|
524
516
|
//#endregion
|
|
525
|
-
export { crawlAndGenerate, parseUrlPattern, validateGlobPattern
|
|
517
|
+
export { crawlAndGenerate, parseUrlPattern, validateGlobPattern };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-BtuYX2_u.mjs";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
3
|
import * as p$1 from "@clack/prompts";
|
|
4
4
|
import * as p from "@clack/prompts";
|
|
5
|
+
import { PlaywrightCrawler } from "crawlee";
|
|
5
6
|
import { dirname, join, resolve } from "pathe";
|
|
7
|
+
import { withHttps } from "ufo";
|
|
6
8
|
import { fileURLToPath } from "node:url";
|
|
7
9
|
import { addDependency } from "nypm";
|
|
8
10
|
|
|
@@ -22,21 +24,15 @@ async function promptPlaywrightInstall() {
|
|
|
22
24
|
});
|
|
23
25
|
if (p$1.isCancel(shouldInstall) || !shouldInstall) return false;
|
|
24
26
|
const s = p$1.spinner();
|
|
25
|
-
s.start("Installing Playwright...");
|
|
27
|
+
s.start("Installing Playwright globally...");
|
|
26
28
|
try {
|
|
27
|
-
await addDependency("playwright", {
|
|
29
|
+
await addDependency("playwright", { global: true });
|
|
28
30
|
s.stop("Playwright installed successfully!");
|
|
29
31
|
return true;
|
|
30
|
-
} catch {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
return true;
|
|
35
|
-
} catch (fallbackError) {
|
|
36
|
-
s.stop("Failed to install Playwright");
|
|
37
|
-
p$1.log.error(`Installation failed: ${fallbackError}`);
|
|
38
|
-
return false;
|
|
39
|
-
}
|
|
32
|
+
} catch (fallbackError) {
|
|
33
|
+
s.stop("Failed to install Playwright");
|
|
34
|
+
p$1.log.error(`Installation failed: ${fallbackError}`);
|
|
35
|
+
return false;
|
|
40
36
|
}
|
|
41
37
|
}
|
|
42
38
|
async function ensurePlaywrightInstalled() {
|
|
@@ -50,6 +46,24 @@ async function ensurePlaywrightInstalled() {
|
|
|
50
46
|
}
|
|
51
47
|
return true;
|
|
52
48
|
}
|
|
49
|
+
async function isUseChromeSupported() {
|
|
50
|
+
try {
|
|
51
|
+
const crawler = new PlaywrightCrawler({
|
|
52
|
+
launchContext: { useChrome: true },
|
|
53
|
+
requestHandler: async () => {},
|
|
54
|
+
maxRequestsPerCrawl: 1
|
|
55
|
+
});
|
|
56
|
+
const page = await crawler.browserPool.newPage();
|
|
57
|
+
await page.evaluate(() => {
|
|
58
|
+
return window.navigator.userAgent;
|
|
59
|
+
});
|
|
60
|
+
await page.close();
|
|
61
|
+
await crawler.browserPool.closeAllBrowsers();
|
|
62
|
+
crawler.stop();
|
|
63
|
+
return true;
|
|
64
|
+
} catch {}
|
|
65
|
+
return false;
|
|
66
|
+
}
|
|
53
67
|
|
|
54
68
|
//#endregion
|
|
55
69
|
//#region src/cli.ts
|
|
@@ -400,10 +414,17 @@ async function main() {
|
|
|
400
414
|
} else options = await interactiveCrawl();
|
|
401
415
|
if (!options) process.exit(0);
|
|
402
416
|
if (options.driver === "playwright") {
|
|
403
|
-
const
|
|
404
|
-
if (
|
|
405
|
-
|
|
406
|
-
|
|
417
|
+
const chromeSupported = await isUseChromeSupported();
|
|
418
|
+
if (chromeSupported) {
|
|
419
|
+
options.useChrome = true;
|
|
420
|
+
p.log.info("System Chrome detected and enabled.");
|
|
421
|
+
} else {
|
|
422
|
+
const playwrightInstalled = await ensurePlaywrightInstalled();
|
|
423
|
+
if (!playwrightInstalled) {
|
|
424
|
+
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
425
|
+
process.exit(1);
|
|
426
|
+
}
|
|
427
|
+
p.log.info("Using global playwright instance.");
|
|
407
428
|
}
|
|
408
429
|
}
|
|
409
430
|
const s = p.spinner();
|
|
@@ -451,11 +472,8 @@ async function main() {
|
|
|
451
472
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
452
473
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
453
474
|
}
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
457
|
-
if (successful === 0) process.exit(1);
|
|
458
|
-
}
|
|
475
|
+
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
476
|
+
process.exit(0);
|
|
459
477
|
}
|
|
460
478
|
main().catch((error) => {
|
|
461
479
|
p.log.error(`Unexpected error: ${error}`);
|
package/dist/index.d.mts
CHANGED
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.8.
|
|
4
|
+
"version": "0.8.5",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -50,7 +50,8 @@
|
|
|
50
50
|
"nypm": "^0.6.0",
|
|
51
51
|
"pathe": "^2.0.3",
|
|
52
52
|
"picomatch": "^4.0.3",
|
|
53
|
-
"
|
|
53
|
+
"ufo": "^1.6.1",
|
|
54
|
+
"mdream": "0.8.5"
|
|
54
55
|
},
|
|
55
56
|
"devDependencies": {
|
|
56
57
|
"@types/picomatch": "^4.0.1"
|