@mdream/crawl 1.0.0-beta.9 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +416 -56
- package/dist/_chunks/crawl.mjs +366 -277
- package/dist/_chunks/playwright-utils.mjs +59 -0
- package/dist/cli.mjs +79 -89
- package/dist/index.d.mts +40 -2
- package/dist/index.mjs +6 -1
- package/package.json +11 -4
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import * as p from "@clack/prompts";
|
|
2
|
+
import { addDependency } from "nypm";
|
|
3
|
+
//#region src/playwright-utils.ts
|
|
4
|
+
/**
 * Reports whether the `playwright` package can be loaded from this process.
 *
 * @returns `true` when the dynamic import resolves, `false` when it rejects
 * for any reason.
 */
async function checkPlaywrightInstallation() {
	let available = true;
	try {
		await import("playwright");
	} catch {
		// Any import failure is reported as "not installed".
		available = false;
	}
	return available;
}
|
|
12
|
+
/**
 * Asks the user whether Playwright should be installed and, if they agree,
 * installs it globally while showing a spinner.
 *
 * @returns `true` only when the install call completes; `false` when the
 * prompt is cancelled or declined, or when the install throws.
 */
async function promptPlaywrightInstall() {
	const answer = await p.confirm({
		message: "Playwright is required for the Playwright driver. Install it now?",
		initialValue: true
	});
	// A cancelled prompt and an explicit "no" are handled the same way.
	const declined = p.isCancel(answer) || !answer;
	if (declined) return false;

	const progress = p.spinner();
	progress.start("Installing Playwright globally...");
	try {
		await addDependency("playwright", { global: true });
		progress.stop("Playwright installed successfully!");
		return true;
	} catch (installError) {
		// Stop the spinner first so the error line is not drawn over it.
		progress.stop("Failed to install Playwright");
		p.log.error(`Installation failed: ${installError}`);
		return false;
	}
}
|
|
30
|
+
/**
 * Makes sure Playwright is available, offering to install it when it is not.
 *
 * @returns `true` when Playwright was already importable or the prompted
 * install succeeded; `false` otherwise (after logging an error).
 */
async function ensurePlaywrightInstalled() {
	const alreadyInstalled = await checkPlaywrightInstallation();
	if (alreadyInstalled) return true;

	p.log.warn("Playwright driver selected but Playwright is not installed.");
	const installedNow = await promptPlaywrightInstall();
	if (installedNow) return true;

	p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
	return false;
}
|
|
39
|
+
/**
 * Probes whether crawlee's `PlaywrightCrawler` can drive the system Chrome
 * (`launchContext.useChrome`) by opening a page and evaluating a trivial
 * script in it.
 *
 * Cleanup now runs on every path once the crawler exists. Previously a
 * failure in `page.evaluate` or `page.close` skipped `closeAllBrowsers()`,
 * leaving a launched browser behind.
 *
 * @returns `true` when the probe and the cleanup both succeed; `false` when
 * crawlee cannot be imported, the probe throws, or cleanup throws.
 */
async function isUseChromeSupported() {
	let crawler;
	let supported = false;
	try {
		const { PlaywrightCrawler } = await import("crawlee");
		crawler = new PlaywrightCrawler({
			launchContext: { useChrome: true },
			requestHandler: async () => {},
			maxRequestsPerCrawl: 1
		});
		const page = await crawler.browserPool.newPage();
		// Running any script proves the browser actually started and is usable.
		await page.evaluate(() => {
			return window.navigator.userAgent;
		});
		await page.close();
		supported = true;
	} catch {
		// Best-effort probe: any failure simply means "not supported".
		supported = false;
	}
	if (crawler) {
		try {
			// Also closes a page left open by a failed probe step.
			await crawler.browserPool.closeAllBrowsers();
			crawler.stop();
		} catch {
			// Preserve the original contract: a failing cleanup reports false.
			supported = false;
		}
	}
	return supported;
}
|
|
58
|
+
//#endregion
|
|
59
|
+
export { ensurePlaywrightInstalled, isUseChromeSupported };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,64 +1,17 @@
|
|
|
1
1
|
import { n as parseUrlPattern, r as validateGlobPattern, t as crawlAndGenerate } from "./_chunks/crawl.mjs";
|
|
2
2
|
import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
3
3
|
import * as p from "@clack/prompts";
|
|
4
|
-
import { PlaywrightCrawler } from "crawlee";
|
|
5
4
|
import { dirname, join, resolve } from "pathe";
|
|
6
5
|
import { withHttps } from "ufo";
|
|
7
6
|
import { fileURLToPath } from "node:url";
|
|
8
|
-
import {
|
|
9
|
-
//#region src/
|
|
10
|
-
async function
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
} catch {
|
|
15
|
-
return false;
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
async function promptPlaywrightInstall() {
|
|
19
|
-
const shouldInstall = await p.confirm({
|
|
20
|
-
message: "Playwright is required for the Playwright driver. Install it now?",
|
|
21
|
-
initialValue: true
|
|
7
|
+
import { loadConfig } from "c12";
|
|
8
|
+
//#region src/config.ts
|
|
9
|
+
async function loadMdreamConfig(cwd) {
|
|
10
|
+
const { config } = await loadConfig({
|
|
11
|
+
name: "mdream",
|
|
12
|
+
cwd
|
|
22
13
|
});
|
|
23
|
-
|
|
24
|
-
const s = p.spinner();
|
|
25
|
-
s.start("Installing Playwright globally...");
|
|
26
|
-
try {
|
|
27
|
-
await addDependency("playwright", { global: true });
|
|
28
|
-
s.stop("Playwright installed successfully!");
|
|
29
|
-
return true;
|
|
30
|
-
} catch (fallbackError) {
|
|
31
|
-
s.stop("Failed to install Playwright");
|
|
32
|
-
p.log.error(`Installation failed: ${fallbackError}`);
|
|
33
|
-
return false;
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
async function ensurePlaywrightInstalled() {
|
|
37
|
-
if (await checkPlaywrightInstallation()) return true;
|
|
38
|
-
p.log.warn("Playwright driver selected but Playwright is not installed.");
|
|
39
|
-
if (!await promptPlaywrightInstall()) {
|
|
40
|
-
p.log.error("Cannot proceed with Playwright driver without Playwright installed.");
|
|
41
|
-
return false;
|
|
42
|
-
}
|
|
43
|
-
return true;
|
|
44
|
-
}
|
|
45
|
-
async function isUseChromeSupported() {
|
|
46
|
-
try {
|
|
47
|
-
const crawler = new PlaywrightCrawler({
|
|
48
|
-
launchContext: { useChrome: true },
|
|
49
|
-
requestHandler: async () => {},
|
|
50
|
-
maxRequestsPerCrawl: 1
|
|
51
|
-
});
|
|
52
|
-
const page = await crawler.browserPool.newPage();
|
|
53
|
-
await page.evaluate(() => {
|
|
54
|
-
return window.navigator.userAgent;
|
|
55
|
-
});
|
|
56
|
-
await page.close();
|
|
57
|
-
await crawler.browserPool.closeAllBrowsers();
|
|
58
|
-
crawler.stop();
|
|
59
|
-
return true;
|
|
60
|
-
} catch {}
|
|
61
|
-
return false;
|
|
14
|
+
return config || {};
|
|
62
15
|
}
|
|
63
16
|
//#endregion
|
|
64
17
|
//#region src/cli.ts
|
|
@@ -225,11 +178,17 @@ async function interactiveCrawl() {
|
|
|
225
178
|
skipSitemap: advancedOptions.skipSitemap
|
|
226
179
|
};
|
|
227
180
|
}
|
|
228
|
-
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
|
|
181
|
+
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds, latency) {
|
|
229
182
|
const messages = [];
|
|
230
183
|
const durationStr = `${durationSeconds.toFixed(1)}s`;
|
|
231
|
-
|
|
232
|
-
messages.push(
|
|
184
|
+
messages.push(`📄 ${successful} pages \u00B7 ⏱️ ${durationStr}`);
|
|
185
|
+
if (failed > 0) messages.push(`⚠️ ${failed} failed`);
|
|
186
|
+
if (latency && latency.count > 0) {
|
|
187
|
+
const avg = Math.round(latency.total / latency.count);
|
|
188
|
+
const min = latency.min === Infinity ? 0 : Math.round(latency.min);
|
|
189
|
+
const max = Math.round(latency.max);
|
|
190
|
+
messages.push(`🏓 avg ${avg}ms \u00B7 min ${min}ms \u00B7 max ${max}ms`);
|
|
191
|
+
}
|
|
233
192
|
messages.push(`📦 ${generatedFiles.join(", ")}`);
|
|
234
193
|
messages.push(`📁 ${outputDir}`);
|
|
235
194
|
p.note(messages.join("\n"), "✅ Complete");
|
|
@@ -249,7 +208,8 @@ Usage:
|
|
|
249
208
|
Options:
|
|
250
209
|
-u, --url <url> Website URL to crawl
|
|
251
210
|
-o, --output <dir> Output directory (default: output)
|
|
252
|
-
-d, --depth <number> Crawl depth (default: 3)
|
|
211
|
+
-d, --depth <number> Crawl depth, 0 for single page (default: 3)
|
|
212
|
+
--single-page Only process the given URL(s), no crawling (alias for --depth 0)
|
|
253
213
|
--driver <http|playwright> Crawler driver (default: http)
|
|
254
214
|
--artifacts <list> Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
|
|
255
215
|
--origin <url> Origin URL for resolving relative paths (overrides auto-detection)
|
|
@@ -259,6 +219,7 @@ Options:
|
|
|
259
219
|
--crawl-delay <seconds> Crawl delay in seconds
|
|
260
220
|
--exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
|
|
261
221
|
--skip-sitemap Skip sitemap.xml and robots.txt discovery
|
|
222
|
+
--allow-subdomains Crawl across subdomains of the same root domain
|
|
262
223
|
-v, --verbose Enable verbose logging
|
|
263
224
|
-h, --help Show this help message
|
|
264
225
|
--version Show version number
|
|
@@ -271,6 +232,7 @@ Examples:
|
|
|
271
232
|
@mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
|
|
272
233
|
@mdream/crawl -u example.com --verbose
|
|
273
234
|
@mdream/crawl -u example.com --skip-sitemap
|
|
235
|
+
@mdream/crawl -u example.com --driver playwright --single-page
|
|
274
236
|
`);
|
|
275
237
|
process.exit(0);
|
|
276
238
|
}
|
|
@@ -324,10 +286,10 @@ Examples:
|
|
|
324
286
|
process.exit(1);
|
|
325
287
|
}
|
|
326
288
|
}
|
|
327
|
-
const depthStr = getArgValue("--depth") || getArgValue("-d") || "3";
|
|
328
|
-
const depth = Number
|
|
329
|
-
if (Number.
|
|
330
|
-
p.log.error("Error: Depth must be between
|
|
289
|
+
const depthStr = args.includes("--single-page") ? "0" : getArgValue("--depth") || getArgValue("-d") || "3";
|
|
290
|
+
const depth = Number(depthStr);
|
|
291
|
+
if (!Number.isInteger(depth) || depth < 0 || depth > 10) {
|
|
292
|
+
p.log.error("Error: Depth must be an integer between 0 and 10");
|
|
331
293
|
process.exit(1);
|
|
332
294
|
}
|
|
333
295
|
const driver = getArgValue("--driver");
|
|
@@ -381,13 +343,14 @@ Examples:
|
|
|
381
343
|
const patterns = [parsed];
|
|
382
344
|
const verbose = args.includes("--verbose") || args.includes("-v");
|
|
383
345
|
const skipSitemap = args.includes("--skip-sitemap");
|
|
346
|
+
const allowSubdomains = args.includes("--allow-subdomains");
|
|
384
347
|
if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
|
|
385
348
|
return {
|
|
386
349
|
urls: [url],
|
|
387
350
|
outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
|
|
388
351
|
driver: driver || "http",
|
|
389
352
|
maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
|
|
390
|
-
followLinks:
|
|
353
|
+
followLinks: depth > 0,
|
|
391
354
|
maxDepth: depth,
|
|
392
355
|
generateLlmsTxt: artifacts.includes("llms.txt"),
|
|
393
356
|
generateLlmsFullTxt: artifacts.includes("llms-full.txt"),
|
|
@@ -399,14 +362,28 @@ Examples:
|
|
|
399
362
|
crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
|
|
400
363
|
exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
|
|
401
364
|
verbose,
|
|
402
|
-
skipSitemap
|
|
365
|
+
skipSitemap,
|
|
366
|
+
allowSubdomains
|
|
403
367
|
};
|
|
404
368
|
}
|
|
405
369
|
async function main() {
|
|
406
370
|
const cliOptions = parseCliArgs();
|
|
371
|
+
const fileConfig = await loadMdreamConfig();
|
|
407
372
|
let options;
|
|
408
373
|
if (cliOptions) {
|
|
409
|
-
|
|
374
|
+
const configExclude = fileConfig.exclude || [];
|
|
375
|
+
const cliExclude = cliOptions.exclude || [];
|
|
376
|
+
options = {
|
|
377
|
+
...cliOptions,
|
|
378
|
+
driver: cliOptions.driver || fileConfig.driver || "http",
|
|
379
|
+
maxDepth: cliOptions.maxDepth ?? fileConfig.maxDepth,
|
|
380
|
+
crawlDelay: cliOptions.crawlDelay ?? fileConfig.crawlDelay,
|
|
381
|
+
skipSitemap: cliOptions.skipSitemap || fileConfig.skipSitemap || false,
|
|
382
|
+
allowSubdomains: cliOptions.allowSubdomains || fileConfig.allowSubdomains || false,
|
|
383
|
+
verbose: cliOptions.verbose || fileConfig.verbose || false,
|
|
384
|
+
exclude: configExclude.length > 0 || cliExclude.length > 0 ? [...configExclude, ...cliExclude] : void 0,
|
|
385
|
+
hooks: fileConfig.hooks
|
|
386
|
+
};
|
|
410
387
|
p.intro(`☁️ mdream v${version}`);
|
|
411
388
|
const formats = [];
|
|
412
389
|
if (options.generateLlmsTxt) formats.push("llms.txt");
|
|
@@ -420,6 +397,7 @@ async function main() {
|
|
|
420
397
|
`Formats: ${formats.join(", ")}`,
|
|
421
398
|
options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
|
|
422
399
|
options.skipSitemap && `Skip sitemap: Yes`,
|
|
400
|
+
options.allowSubdomains && `Allow subdomains: Yes`,
|
|
423
401
|
options.verbose && `Verbose: Enabled`
|
|
424
402
|
].filter(Boolean);
|
|
425
403
|
p.note(summary.join("\n"), "Configuration");
|
|
@@ -431,36 +409,46 @@ async function main() {
|
|
|
431
409
|
if (permCheck.error?.includes("Permission denied")) p.log.info("Tip: Try running with elevated privileges (e.g., sudo) or change the output directory permissions.");
|
|
432
410
|
process.exit(1);
|
|
433
411
|
}
|
|
434
|
-
if (options.driver === "playwright")
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
412
|
+
if (options.driver === "playwright") {
|
|
413
|
+
try {
|
|
414
|
+
await import("crawlee");
|
|
415
|
+
} catch {
|
|
416
|
+
p.log.error("The Playwright driver requires crawlee. Install it with: npm install crawlee");
|
|
440
417
|
process.exit(1);
|
|
441
418
|
}
|
|
442
|
-
|
|
419
|
+
const { ensurePlaywrightInstalled, isUseChromeSupported } = await import("./_chunks/playwright-utils.mjs");
|
|
420
|
+
if (await isUseChromeSupported()) {
|
|
421
|
+
options.useChrome = true;
|
|
422
|
+
p.log.info("System Chrome detected and enabled.");
|
|
423
|
+
} else {
|
|
424
|
+
if (!await ensurePlaywrightInstalled()) {
|
|
425
|
+
p.log.error("Cannot proceed without Playwright. Please install it manually or use the HTTP driver instead.");
|
|
426
|
+
process.exit(1);
|
|
427
|
+
}
|
|
428
|
+
p.log.info("Using global playwright instance.");
|
|
429
|
+
}
|
|
443
430
|
}
|
|
444
431
|
const s = p.spinner();
|
|
445
|
-
s.start("
|
|
432
|
+
s.start("Discovering sitemaps");
|
|
446
433
|
const startTime = Date.now();
|
|
434
|
+
let crawlStartTime = 0;
|
|
435
|
+
let lastProgress;
|
|
447
436
|
const results = await crawlAndGenerate(options, (progress) => {
|
|
448
|
-
|
|
437
|
+
lastProgress = progress;
|
|
438
|
+
if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps");
|
|
449
439
|
else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
|
|
450
440
|
else if (progress.crawling.status === "processing") {
|
|
451
|
-
|
|
452
|
-
const
|
|
453
|
-
const
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
s.message(current);
|
|
463
|
-
}
|
|
441
|
+
if (!crawlStartTime) crawlStartTime = Date.now();
|
|
442
|
+
const processed = progress.crawling.processed;
|
|
443
|
+
const total = progress.crawling.total;
|
|
444
|
+
const failed = progress.crawling.failed;
|
|
445
|
+
const elapsed = (Date.now() - crawlStartTime) / 1e3;
|
|
446
|
+
const rate = elapsed > .1 ? Math.round(processed / elapsed) : 0;
|
|
447
|
+
let msg = processed > total ? `Crawling ${processed} pages` : `Crawling ${processed}/${total}`;
|
|
448
|
+
if (rate > 0) msg += ` \u00B7 ${rate}/s`;
|
|
449
|
+
if (failed > 0) msg += ` \u00B7 ${failed} failed`;
|
|
450
|
+
s.message(msg);
|
|
451
|
+
} else if (progress.generation.status === "generating") s.message(progress.generation.current || "Generating files");
|
|
464
452
|
});
|
|
465
453
|
s.stop();
|
|
466
454
|
const durationSeconds = (Date.now() - startTime) / 1e3;
|
|
@@ -484,11 +472,13 @@ async function main() {
|
|
|
484
472
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
485
473
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
486
474
|
}
|
|
487
|
-
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
475
|
+
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds, lastProgress?.crawling.latency);
|
|
488
476
|
process.exit(0);
|
|
489
477
|
}
|
|
490
478
|
main().catch((error) => {
|
|
491
|
-
|
|
479
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
480
|
+
if (msg.includes("wmic") || msg.includes("ENOENT") && process.platform === "win32") p.log.error("Crawlee failed because wmic.exe is not available on this system. Windows 11 removed wmic.exe, which older crawlee versions depend on for memory monitoring.\nFix: upgrade crawlee to >=3.16.0 or switch to the HTTP driver (--driver http).");
|
|
481
|
+
else p.log.error(`Unexpected error: ${msg}`);
|
|
492
482
|
process.exit(1);
|
|
493
483
|
});
|
|
494
484
|
//#endregion
|
package/dist/index.d.mts
CHANGED
|
@@ -6,6 +6,22 @@ interface PageData {
|
|
|
6
6
|
metadata: PageMetadata;
|
|
7
7
|
origin: string;
|
|
8
8
|
}
|
|
9
|
+
/**
 * Maps each crawl hook name to the signature of its listener.
 * Every listener may return either nothing or a promise.
 */
interface CrawlHooks {
  /**
   * Receives a context with the `url` and a `skip` flag.
   * NOTE(review): presumably a listener sets `skip` to exclude the URL — confirm against the crawler implementation.
   */
  'crawl:url': (ctx: { url: string; skip: boolean }) => void | Promise<void>;
  /** Receives the `PageData` of a page. */
  'crawl:page': (page: PageData) => void | Promise<void>;
  /** Receives the `url`, `title`, `content` and `filePath` of a page. */
  'crawl:content': (ctx: { url: string; title: string; content: string; filePath: string }) => void | Promise<void>;
  /** Receives the list of crawl results. */
  'crawl:done': (ctx: { results: CrawlResult[] }) => void | Promise<void>;
}
|
|
9
25
|
interface CrawlOptions {
|
|
10
26
|
urls: string[];
|
|
11
27
|
outputDir: string;
|
|
@@ -26,8 +42,23 @@ interface CrawlOptions {
|
|
|
26
42
|
descriptionOverride?: string;
|
|
27
43
|
verbose?: boolean;
|
|
28
44
|
skipSitemap?: boolean;
|
|
45
|
+
allowSubdomains?: boolean;
|
|
46
|
+
hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
|
|
29
47
|
onPage?: (page: PageData) => Promise<void> | void;
|
|
30
48
|
}
|
|
49
|
+
/**
 * Shape of the object accepted by `defineConfig`. Every field is optional.
 */
interface MdreamCrawlConfig {
  /** List of exclude patterns. */
  exclude?: Array<string>;
  /** Which crawler driver to use. */
  driver?: 'http' | 'playwright';
  /** Maximum crawl depth. */
  maxDepth?: number;
  /** Page limit. NOTE(review): presumably caps the number of crawled pages — confirm against the crawler. */
  maxPages?: number;
  /** Crawl delay. NOTE(review): unit is presumably seconds, as in the CLI flag — confirm. */
  crawlDelay?: number;
  /** Whether to skip sitemap discovery. */
  skipSitemap?: boolean;
  /** Whether subdomains may be crawled. */
  allowSubdomains?: boolean;
  /** Whether verbose logging is enabled. */
  verbose?: boolean;
  /** Which output artifacts to generate. */
  artifacts?: Array<'llms.txt' | 'llms-full.txt' | 'markdown'>;
  /** Hook listeners keyed by hook name; each entry is one listener or an array of them. */
  hooks?: Partial<{ [K in keyof CrawlHooks]: CrawlHooks[K] | CrawlHooks[K][] }>;
}
|
|
61
|
+
/**
 * Accepts an `MdreamCrawlConfig` and returns an `MdreamCrawlConfig`.
 * NOTE(review): presumably an identity helper that gives config files type checking — the implementation is not visible here; confirm.
 */
declare function defineConfig(config: MdreamCrawlConfig): MdreamCrawlConfig;
|
|
31
62
|
interface ParsedUrlPattern {
|
|
32
63
|
baseUrl: string;
|
|
33
64
|
pattern: string;
|
|
@@ -63,7 +94,14 @@ interface CrawlProgress {
|
|
|
63
94
|
status: 'starting' | 'processing' | 'completed';
|
|
64
95
|
total: number;
|
|
65
96
|
processed: number;
|
|
66
|
-
|
|
97
|
+
failed: number;
|
|
98
|
+
currentUrl?: string; /** Page fetch latency stats in ms */
|
|
99
|
+
latency: {
|
|
100
|
+
total: number;
|
|
101
|
+
min: number;
|
|
102
|
+
max: number;
|
|
103
|
+
count: number;
|
|
104
|
+
};
|
|
67
105
|
};
|
|
68
106
|
generation: {
|
|
69
107
|
status: 'idle' | 'generating' | 'completed';
|
|
@@ -72,4 +110,4 @@ interface CrawlProgress {
|
|
|
72
110
|
}
|
|
73
111
|
declare function crawlAndGenerate(options: CrawlOptions, onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult[]>;
|
|
74
112
|
//#endregion
|
|
75
|
-
export { type CrawlOptions, type CrawlResult, type PageData, crawlAndGenerate };
|
|
113
|
+
export { type CrawlHooks, type CrawlOptions, type CrawlResult, type MdreamCrawlConfig, type PageData, crawlAndGenerate, defineConfig };
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "1.0.
|
|
4
|
+
"version": "1.0.1",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -42,22 +42,29 @@
|
|
|
42
42
|
"dist"
|
|
43
43
|
],
|
|
44
44
|
"peerDependencies": {
|
|
45
|
+
"crawlee": "^3.16.0",
|
|
45
46
|
"playwright": "^1.53.2"
|
|
46
47
|
},
|
|
47
48
|
"peerDependenciesMeta": {
|
|
49
|
+
"crawlee": {
|
|
50
|
+
"optional": true
|
|
51
|
+
},
|
|
48
52
|
"playwright": {
|
|
49
53
|
"optional": true
|
|
50
54
|
}
|
|
51
55
|
},
|
|
52
56
|
"dependencies": {
|
|
53
57
|
"@clack/prompts": "^1.1.0",
|
|
54
|
-
"
|
|
58
|
+
"c12": "^3.0.4",
|
|
59
|
+
"hookable": "^5.5.3",
|
|
55
60
|
"nypm": "^0.6.5",
|
|
61
|
+
"ofetch": "^1.5.1",
|
|
56
62
|
"pathe": "^2.0.3",
|
|
57
63
|
"picomatch": "^4.0.3",
|
|
64
|
+
"tldts": "^7.0.26",
|
|
58
65
|
"ufo": "^1.6.3",
|
|
59
|
-
"@mdream/js": "1.0.
|
|
60
|
-
"mdream": "1.0.
|
|
66
|
+
"@mdream/js": "1.0.1",
|
|
67
|
+
"mdream": "1.0.1"
|
|
61
68
|
},
|
|
62
69
|
"devDependencies": {
|
|
63
70
|
"@types/picomatch": "^4.0.2"
|