@mdream/crawl 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-B5MaCj6O.mjs → crawl-BwURA9nQ.mjs} +103 -33
- package/dist/cli.mjs +28 -33
- package/dist/index.d.mts +1 -0
- package/dist/index.mjs +1 -1
- package/package.json +3 -3
package/dist/_chunks/crawl-B5MaCj6O.mjs → crawl-BwURA9nQ.mjs
CHANGED

@@ -1,6 +1,7 @@
 import { existsSync, mkdirSync } from "node:fs";
 import { writeFile } from "node:fs/promises";
-import
+import * as p from "@clack/prompts";
+import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
 import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
 import { withMinimalPreset } from "mdream/preset/minimal";
 import { dirname, join, normalize, resolve } from "pathe";
@@ -46,7 +47,7 @@ function parseUrlPattern(input) {
 pattern,
 isGlob: true
 };
-} catch
+} catch {
 throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
 }
 }
@@ -178,9 +179,25 @@ function extractMetadata(html, url) {
 
 //#endregion
 //#region src/crawl.ts
+async function loadSitemapWithoutRetries(sitemapUrl) {
+const response = await fetch(sitemapUrl);
+if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
+const xmlContent = await response.text();
+const urls = [];
+const urlRegex = /<loc>(.*?)<\/loc>/g;
+let match;
+while (true) {
+match = urlRegex.exec(xmlContent);
+if (match === null) break;
+urls.push(match[1]);
+}
+return urls;
+}
 async function crawlAndGenerate(options, onProgress) {
-const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride } = options;
+const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
 const outputDir = resolve(normalize(rawOutputDir));
+if (verbose) log.setLevel(log.LEVELS.INFO);
+else log.setLevel(log.LEVELS.OFF);
 let patterns;
 try {
 patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
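The new `loadSitemapWithoutRetries` helper fetches a sitemap once and pulls URLs out with a `<loc>` regex rather than a full XML parse, and the new `verbose` option toggles crawlee's log level. A minimal standalone sketch of the same extraction — the inline sitemap XML here is hypothetical, for illustration only:

```js
// Sketch of the <loc>-regex extraction used by loadSitemapWithoutRetries.
// The sample sitemap below is made up for illustration.
const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/intro</loc></url>
</urlset>`;

const urls = [...xmlContent.matchAll(/<loc>(.*?)<\/loc>/g)].map((m) => m[1]);
console.log(urls); // [ 'https://example.com/', 'https://example.com/docs/intro' ]
```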
@@ -201,6 +218,7 @@ async function crawlAndGenerate(options, onProgress) {
 },
 generation: { status: "idle" }
 };
+const sitemapAttempts = [];
 if (startingUrls.length > 0) {
 const baseUrl = new URL(startingUrls[0]).origin;
 const homePageUrl = baseUrl;
@@ -216,8 +234,12 @@ async function crawlAndGenerate(options, onProgress) {
 onProgress?.(progress);
 const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
 for (const sitemapUrl of robotsSitemaps) try {
-const
-
+const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
+sitemapAttempts.push({
+url: sitemapUrl,
+success: true
+});
+const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
 if (hasGlobPatterns) {
 const filteredUrls = robotsUrls.filter((url) => {
 return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
@@ -237,15 +259,24 @@ async function crawlAndGenerate(options, onProgress) {
 break;
 }
 }
-} catch {
-
+} catch (error) {
+sitemapAttempts.push({
+url: sitemapUrl,
+success: false,
+error: error instanceof Error ? error.message : "Unknown error"
+});
 }
 }
 }
 let mainSitemapProcessed = false;
+const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
 try {
-const
-
+const sitemapUrls = await loadSitemapWithoutRetries(mainSitemapUrl);
+sitemapAttempts.push({
+url: mainSitemapUrl,
+success: true
+});
+const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
 if (hasGlobPatterns) {
 const filteredUrls = sitemapUrls.filter((url) => {
 return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
@@ -267,7 +298,12 @@ async function crawlAndGenerate(options, onProgress) {
 mainSitemapProcessed = true;
 }
 }
-} catch {
+} catch (error) {
+sitemapAttempts.push({
+url: mainSitemapUrl,
+success: false,
+error: error instanceof Error ? error.message : "Unknown error"
+});
 if (!mainSitemapProcessed) {
 const commonSitemaps = [
 `${baseUrl}/sitemap_index.xml`,
@@ -275,8 +311,12 @@ async function crawlAndGenerate(options, onProgress) {
 `${baseUrl}/sitemap-index.xml`
 ];
 for (const sitemapUrl of commonSitemaps) try {
-const
-
+const altUrls = await loadSitemapWithoutRetries(sitemapUrl);
+sitemapAttempts.push({
+url: sitemapUrl,
+success: true
+});
+const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
 if (hasGlobPatterns) {
 const filteredUrls = altUrls.filter((url) => {
 return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
@@ -298,11 +338,26 @@ async function crawlAndGenerate(options, onProgress) {
 break;
 }
 }
-} catch {
-
+} catch (error$1) {
+sitemapAttempts.push({
+url: sitemapUrl,
+success: false,
+error: error$1 instanceof Error ? error$1.message : "Unknown error"
+});
 }
 }
 }
+const successfulSitemaps = sitemapAttempts.filter((a) => a.success);
+const failedSitemaps = sitemapAttempts.filter((a) => !a.success);
+if (successfulSitemaps.length > 0) {
+const sitemapUrl = successfulSitemaps[0].url;
+if (progress.sitemap.processed > 0) p.note(`Found sitemap at ${sitemapUrl} with ${progress.sitemap.processed} URLs`, "Sitemap Discovery");
+else p.note(`Found sitemap at ${sitemapUrl} but no URLs matched your search criteria`, "Sitemap Discovery");
+} else if (failedSitemaps.length > 0) {
+const firstAttempt = failedSitemaps[0];
+if (firstAttempt.error?.includes("404")) p.note(`No sitemap found, using crawler to discover pages`, "Sitemap Discovery");
+else p.note(`Could not access sitemap: ${firstAttempt.error}`, "Sitemap Discovery");
+}
 if (!startingUrls.includes(homePageUrl)) startingUrls.unshift(homePageUrl);
 progress.sitemap.status = "completed";
 progress.crawling.total = startingUrls.length;
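Each sitemap fetch (robots.txt entries, then `/sitemap.xml`, then common fallbacks) now records an attempt object, and the first success or failure drives a single `Sitemap Discovery` note instead of silent empty catches. A sketch of the attempt shape, with hypothetical values:

```js
// Hypothetical attempt records illustrating the reporting logic above.
const sitemapAttempts = [
  { url: "https://example.com/from-robots.xml", success: false, error: "Sitemap not found: 404" },
  { url: "https://example.com/sitemap.xml", success: true },
];
const successfulSitemaps = sitemapAttempts.filter((a) => a.success);
// -> note: `Found sitemap at https://example.com/sitemap.xml with <n> URLs`
console.log(successfulSitemaps[0]?.url);
```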
@@ -313,16 +368,16 @@ async function crawlAndGenerate(options, onProgress) {
 const processedUrls = new Set();
 const shouldCrawlUrl = (url) => {
 if (isUrlExcluded(url, exclude)) return false;
-if (!patterns.some((p) => p.isGlob)) return true;
+if (!patterns.some((p$1) => p$1.isGlob)) return true;
 return patterns.some((pattern) => matchesGlobPattern(url, pattern));
 };
 const createRequestHandler = (crawlerType) => {
-return async ({ request, body, page, enqueueLinks }) => {
+return async ({ request, body, page, enqueueLinks, response }) => {
 const startTime = Date.now();
 progress.crawling.currentUrl = request.loadedUrl;
 onProgress?.(progress);
-
-const homePageUrl =
+if (response?.statusCode && (response.statusCode < 200 || response.statusCode >= 300)) return;
+const homePageUrl = new URL(startingUrls[0]).origin;
 let html;
 let title;
 if (crawlerType === "playwright") {
@@ -346,20 +401,22 @@ async function crawlAndGenerate(options, onProgress) {
 const safeSegments = pathSegments.map((seg) => seg.replace(/[^\w\-]/g, "-"));
 const filename = safeSegments.length > 0 ? safeSegments.join("/") : "index";
 const safeFilename = normalize(`${filename}.md`);
-filePath = join(outputDir,
+filePath = join(outputDir, safeFilename);
 if (generateIndividualMd) {
 const fileDir = dirname(filePath);
 if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
 await writeFile(filePath, md, "utf-8");
 }
 }
-const
+const normalizedUrl = request.loadedUrl.replace(/\/$/, "");
+const normalizedHomePageUrl = homePageUrl.replace(/\/$/, "");
+const isHomePage = normalizedUrl === normalizedHomePageUrl;
 if (shouldProcessMarkdown || isHomePage) {
 const result = {
 url: request.loadedUrl,
 title,
 content: md,
-filePath:
+filePath: shouldProcessMarkdown ? filePath : void 0,
 timestamp: startTime,
 success: true,
 metadata,
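The home-page check now compares URLs with trailing slashes stripped, so `https://example.com/` and `https://example.com` are treated as the same page:

```js
// Trailing-slash normalization as used in the isHomePage check above.
const norm = (url) => url.replace(/\/$/, "");
console.log(norm("https://example.com/") === norm("https://example.com")); // true
```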
@@ -385,14 +442,15 @@ async function crawlAndGenerate(options, onProgress) {
 let crawler;
 const crawlerOptions = {
 requestHandler: createRequestHandler(driver),
+errorHandler: async ({ request, response }) => {
+if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
+},
 maxRequestsPerCrawl,
 respectRobotsTxtFile: true
 };
-if (crawlDelay) crawlerOptions.
-if (driver === "playwright")
-
-crawler = new PlaywrightCrawlerClass(crawlerOptions);
-} else crawler = new HttpCrawler(crawlerOptions);
+if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
+if (driver === "playwright") crawler = new PlaywrightCrawler(crawlerOptions);
+else crawler = new HttpCrawler(crawlerOptions);
 const initialRequests = startingUrls.map((url) => ({
 url,
 userData: { depth: 0 }
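The crawler also gains an `errorHandler` that sets `request.noRetry` on 4xx responses, so client errors fail fast instead of burning retries. A minimal standalone sketch of that crawlee pattern (the URL is a placeholder):

```js
import { HttpCrawler } from "crawlee";

// Sketch: disable retries when the server answered with a client error.
const crawler = new HttpCrawler({
  requestHandler: async ({ request }) => {
    console.log(`crawled ${request.loadedUrl}`);
  },
  errorHandler: async ({ request, response }) => {
    if (response?.statusCode && response.statusCode >= 400) request.noRetry = true;
  },
});
await crawler.run(["https://example.com"]);
```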
@@ -408,18 +466,29 @@ async function crawlAndGenerate(options, onProgress) {
 onProgress?.(progress);
 const successfulResults = results.filter((r$1) => r$1.success);
 const firstUrl = new URL(withHttps(urls[0]));
+const origin$1 = firstUrl.origin;
 const homePageResult = successfulResults.find((r$1) => {
 const resultUrl = new URL(withHttps(r$1.url));
-
-return resultUrl.href === homeUrl.href;
+return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
 });
-const siteName = siteNameOverride || homePageResult?.metadata?.title || firstUrl.hostname;
+const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
 const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
 if (generateLlmsTxt || generateLlmsFullTxt) {
 progress.generation.current = "Generating llms.txt files";
 onProgress?.(progress);
-const contentResults = successfulResults.filter((result) =>
-
+const contentResults = successfulResults.filter((result) => {
+if (!result.content) return false;
+const trimmedContent = result.content.trim();
+const contentWithoutFrontmatter = trimmedContent.replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim();
+return contentWithoutFrontmatter.length > 10;
+});
+const seenUrls = new Set();
+const deduplicatedResults = contentResults.filter((result) => {
+if (seenUrls.has(result.url)) return false;
+seenUrls.add(result.url);
+return true;
+});
+const processedFiles = deduplicatedResults.map((result) => ({
 filePath: result.filePath,
 title: result.title,
 content: result.content,
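Before generating llms.txt artifacts, results are now filtered: pages whose markdown is empty once YAML frontmatter is stripped (10 characters or fewer) are dropped, and duplicate URLs are removed. The frontmatter filter on a hypothetical near-empty page:

```js
// Hypothetical page whose markdown is frontmatter only.
const content = `---
title: Redirect stub
---
`;
const body = content.trim().replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim();
console.log(body.length > 10); // false -> excluded from llms.txt
```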
@@ -430,8 +499,9 @@ async function crawlAndGenerate(options, onProgress) {
 files: processedFiles,
 siteName,
 description,
-origin: origin || firstUrl.origin,
-generateFull: generateLlmsFullTxt
+origin: origin$1 || firstUrl.origin,
+generateFull: generateLlmsFullTxt,
+outputDir
 });
 if (generateLlmsTxt) {
 progress.generation.current = "Writing llms.txt";
package/dist/cli.mjs
CHANGED

@@ -1,9 +1,9 @@
-import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-B5MaCj6O.mjs";
+import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-BwURA9nQ.mjs";
 import { readFileSync } from "node:fs";
-import { dirname, join, resolve } from "pathe";
-import { fileURLToPath } from "node:url";
 import * as p$1 from "@clack/prompts";
 import * as p from "@clack/prompts";
+import { dirname, join, resolve } from "pathe";
+import { fileURLToPath } from "node:url";
 import { addDependency } from "nypm";
 
 //#region src/playwright-utils.ts
@@ -59,7 +59,7 @@ const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
 const version = packageJson.version;
 async function interactiveCrawl() {
 console.clear();
-p.intro(
+p.intro(`☁️ @mdream/crawl v${version}`);
 const urlsInput = await p.text({
 message: "Enter starting URL for crawling (supports glob patterns):",
 placeholder: "e.g. docs.example.com, site.com/docs/**",
@@ -94,7 +94,7 @@ async function interactiveCrawl() {
 p.cancel(error instanceof Error ? error.message : "Invalid URL pattern");
 return null;
 }
-const outputDir = "
+const outputDir = "output";
 const crawlerOptions = await p.group({
 driver: () => p.select({
 message: "Select crawler driver:",
@@ -172,7 +172,8 @@ async function interactiveCrawl() {
 `Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
 `Output formats: ${outputFormats.join(", ")}`,
 `Sitemap discovery: Automatic`,
-inferredOrigin && `Origin: ${inferredOrigin}
+inferredOrigin && `Origin: ${inferredOrigin}`,
+advancedOptions.verbose && `Verbose logging: Enabled`
 ].filter(Boolean);
 p.note(summary.join("\n"), "Crawl Configuration");
 const shouldProceed = await p.confirm({
@@ -194,19 +195,18 @@ async function interactiveCrawl() {
 generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
 generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
 origin: inferredOrigin,
-globPatterns
+globPatterns,
+verbose: advancedOptions.verbose
 };
 }
-async function showCrawlResults(successful, failed, outputDir, generatedFiles,
+async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
 const messages = [];
-
-
-
-}
-
-
-messages.push(`📁 Output: ${outputDir}`);
-p.note(messages.join("\n"), "Completed: Results");
+const durationStr = `${durationSeconds.toFixed(1)}s`;
+const stats = failed > 0 ? `${successful} pages, ${failed} failed` : `${successful} pages`;
+messages.push(`📄 ${stats} • ⏱️ ${durationStr}`);
+messages.push(`📦 ${generatedFiles.join(", ")}`);
+messages.push(`📁 ${outputDir}`);
+p.note(messages.join("\n"), "✅ Complete");
 }
 function parseCliArgs() {
 const args = process.argv.slice(2);
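`showCrawlResults` is now the single results renderer for both interactive and CLI runs. With hypothetical values — 42 pages, 2 failures, a 12.3s run, llms.txt plus markdown output — the note body would look like:

```
📄 42 pages, 2 failed • ⏱️ 12.3s
📦 llms.txt, 42 MD files
📁 /path/to/output
```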
@@ -222,7 +222,7 @@ Usage:
 
 Options:
 -u, --url <url> Website URL to crawl
--o, --output <dir> Output directory (default:
+-o, --output <dir> Output directory (default: output)
 -d, --depth <number> Crawl depth (default: 3)
 --driver <http|playwright> Crawler driver (default: http)
 --artifacts <list> Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
@@ -232,6 +232,7 @@ Options:
 --max-pages <number> Maximum pages to crawl (default: unlimited)
 --crawl-delay <seconds> Crawl delay in seconds
 --exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
+-v, --verbose Enable verbose logging
 -h, --help Show this help message
 --version Show version number
 
@@ -241,6 +242,7 @@ Examples:
 @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
 @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
 @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
+@mdream/crawl -u example.com --verbose
 `);
 process.exit(0);
 }
@@ -349,9 +351,10 @@ Examples:
 const siteNameOverride = getArgValue("--site-name");
 const descriptionOverride = getArgValue("--description");
 const patterns = [parsed];
+const verbose = args.includes("--verbose") || args.includes("-v");
 return {
 urls: [url],
-outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "
+outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
 driver: driver || "http",
 maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
 followLinks: true,
@@ -364,7 +367,8 @@ Examples:
 origin: inferredOrigin,
 globPatterns: patterns,
 crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
-exclude: excludePatterns.length > 0 ? excludePatterns : void 0
+exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
+verbose
 };
 }
 async function main() {
@@ -383,7 +387,8 @@ async function main() {
 `Driver: ${options.driver}`,
 `Depth: ${options.maxDepth}`,
 `Formats: ${formats.join(", ")}`,
-options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}
+options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
+options.verbose && `Verbose: Enabled`
 ].filter(Boolean);
 p.note(summary.join("\n"), "Configuration");
 } else options = await interactiveCrawl();
@@ -419,9 +424,7 @@ async function main() {
 s.stop();
 const endTime = Date.now();
 const durationMs = endTime - startTime;
-const durationSeconds =
-const durationMinutes = Math.floor(durationSeconds / 60);
-const remainingSeconds = durationSeconds % 60;
+const durationSeconds = durationMs / 1e3;
 const successful = results.filter((r) => r.success).length;
 const failed = results.filter((r) => !r.success).length;
 const failedResults = results.filter((r) => !r.success);
@@ -442,17 +445,9 @@ async function main() {
 if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
 if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
 }
-if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles,
+if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
 else {
-
-if (successful > 0) {
-const durationStr = durationMinutes > 0 ? `${durationMinutes}m ${remainingSeconds}s` : `${remainingSeconds}s`;
-messages.push(`✅ ${successful} pages processed in ${durationStr}`);
-}
-if (failed > 0) messages.push(`❌ ${failed} pages failed`);
-if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
-messages.push(`📁 Output: ${options.outputDir}`);
-p.note(messages.join("\n"), "Completed: Results");
+await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
 if (successful === 0) process.exit(1);
 }
 }
package/dist/index.d.mts
CHANGED
package/dist/index.mjs
CHANGED
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
 "name": "@mdream/crawl",
 "type": "module",
-"version": "0.7.1",
+"version": "0.8.0",
 "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
 "author": {
 "name": "Harlan Wilton",
@@ -46,11 +46,11 @@
 },
 "dependencies": {
 "@clack/prompts": "^0.11.0",
-"crawlee": "^3.13.
+"crawlee": "^3.13.10",
 "nypm": "^0.6.0",
 "pathe": "^2.0.3",
 "picomatch": "^4.0.2",
-"mdream": "0.
+"mdream": "0.8.0"
 },
 "devDependencies": {
 "@types/picomatch": "^4.0.0"