@mdream/crawl 0.7.2 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-DEysrw0h.mjs → crawl-BwURA9nQ.mjs} +97 -27
- package/dist/cli.mjs +57 -56
- package/dist/index.d.mts +1 -0
- package/dist/index.mjs +1 -1
- package/package.json +4 -4
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { existsSync, mkdirSync } from "node:fs";
|
|
2
2
|
import { writeFile } from "node:fs/promises";
|
|
3
|
-
import
|
|
3
|
+
import * as p from "@clack/prompts";
|
|
4
|
+
import { HttpCrawler, PlaywrightCrawler, log, purgeDefaultStorages } from "crawlee";
|
|
4
5
|
import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
|
|
5
6
|
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
6
7
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
@@ -46,7 +47,7 @@ function parseUrlPattern(input) {
|
|
|
46
47
|
pattern,
|
|
47
48
|
isGlob: true
|
|
48
49
|
};
|
|
49
|
-
} catch
|
|
50
|
+
} catch {
|
|
50
51
|
throw new Error(`Invalid URL pattern: "${input}". Please provide a valid URL with glob patterns (e.g., "example.com/docs/*" or "https://example.com/api/**").`);
|
|
51
52
|
}
|
|
52
53
|
}
|
|
@@ -178,9 +179,25 @@ function extractMetadata(html, url) {
|
|
|
178
179
|
|
|
179
180
|
//#endregion
|
|
180
181
|
//#region src/crawl.ts
|
|
182
|
+
async function loadSitemapWithoutRetries(sitemapUrl) {
|
|
183
|
+
const response = await fetch(sitemapUrl);
|
|
184
|
+
if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
|
|
185
|
+
const xmlContent = await response.text();
|
|
186
|
+
const urls = [];
|
|
187
|
+
const urlRegex = /<loc>(.*?)<\/loc>/g;
|
|
188
|
+
let match;
|
|
189
|
+
while (true) {
|
|
190
|
+
match = urlRegex.exec(xmlContent);
|
|
191
|
+
if (match === null) break;
|
|
192
|
+
urls.push(match[1]);
|
|
193
|
+
}
|
|
194
|
+
return urls;
|
|
195
|
+
}
|
|
181
196
|
async function crawlAndGenerate(options, onProgress) {
|
|
182
|
-
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride } = options;
|
|
197
|
+
const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
|
|
183
198
|
const outputDir = resolve(normalize(rawOutputDir));
|
|
199
|
+
if (verbose) log.setLevel(log.LEVELS.INFO);
|
|
200
|
+
else log.setLevel(log.LEVELS.OFF);
|
|
184
201
|
let patterns;
|
|
185
202
|
try {
|
|
186
203
|
patterns = globPatterns.length > 0 ? globPatterns : urls.map(parseUrlPattern);
|
|
@@ -201,6 +218,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
201
218
|
},
|
|
202
219
|
generation: { status: "idle" }
|
|
203
220
|
};
|
|
221
|
+
const sitemapAttempts = [];
|
|
204
222
|
if (startingUrls.length > 0) {
|
|
205
223
|
const baseUrl = new URL(startingUrls[0]).origin;
|
|
206
224
|
const homePageUrl = baseUrl;
|
|
@@ -216,8 +234,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
216
234
|
onProgress?.(progress);
|
|
217
235
|
const robotsSitemaps = sitemapMatches.map((match) => match.replace(/Sitemap:\s*/i, "").trim());
|
|
218
236
|
for (const sitemapUrl of robotsSitemaps) try {
|
|
219
|
-
const
|
|
220
|
-
|
|
237
|
+
const robotsUrls = await loadSitemapWithoutRetries(sitemapUrl);
|
|
238
|
+
sitemapAttempts.push({
|
|
239
|
+
url: sitemapUrl,
|
|
240
|
+
success: true
|
|
241
|
+
});
|
|
242
|
+
const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
|
|
221
243
|
if (hasGlobPatterns) {
|
|
222
244
|
const filteredUrls = robotsUrls.filter((url) => {
|
|
223
245
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
@@ -237,15 +259,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
237
259
|
break;
|
|
238
260
|
}
|
|
239
261
|
}
|
|
240
|
-
} catch {
|
|
241
|
-
|
|
262
|
+
} catch (error) {
|
|
263
|
+
sitemapAttempts.push({
|
|
264
|
+
url: sitemapUrl,
|
|
265
|
+
success: false,
|
|
266
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
267
|
+
});
|
|
242
268
|
}
|
|
243
269
|
}
|
|
244
270
|
}
|
|
245
271
|
let mainSitemapProcessed = false;
|
|
272
|
+
const mainSitemapUrl = `${baseUrl}/sitemap.xml`;
|
|
246
273
|
try {
|
|
247
|
-
const
|
|
248
|
-
|
|
274
|
+
const sitemapUrls = await loadSitemapWithoutRetries(mainSitemapUrl);
|
|
275
|
+
sitemapAttempts.push({
|
|
276
|
+
url: mainSitemapUrl,
|
|
277
|
+
success: true
|
|
278
|
+
});
|
|
279
|
+
const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
|
|
249
280
|
if (hasGlobPatterns) {
|
|
250
281
|
const filteredUrls = sitemapUrls.filter((url) => {
|
|
251
282
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
@@ -267,7 +298,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
267
298
|
mainSitemapProcessed = true;
|
|
268
299
|
}
|
|
269
300
|
}
|
|
270
|
-
} catch {
|
|
301
|
+
} catch (error) {
|
|
302
|
+
sitemapAttempts.push({
|
|
303
|
+
url: mainSitemapUrl,
|
|
304
|
+
success: false,
|
|
305
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
306
|
+
});
|
|
271
307
|
if (!mainSitemapProcessed) {
|
|
272
308
|
const commonSitemaps = [
|
|
273
309
|
`${baseUrl}/sitemap_index.xml`,
|
|
@@ -275,8 +311,12 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
275
311
|
`${baseUrl}/sitemap-index.xml`
|
|
276
312
|
];
|
|
277
313
|
for (const sitemapUrl of commonSitemaps) try {
|
|
278
|
-
const
|
|
279
|
-
|
|
314
|
+
const altUrls = await loadSitemapWithoutRetries(sitemapUrl);
|
|
315
|
+
sitemapAttempts.push({
|
|
316
|
+
url: sitemapUrl,
|
|
317
|
+
success: true
|
|
318
|
+
});
|
|
319
|
+
const hasGlobPatterns = patterns.some((p$1) => p$1.isGlob);
|
|
280
320
|
if (hasGlobPatterns) {
|
|
281
321
|
const filteredUrls = altUrls.filter((url) => {
|
|
282
322
|
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
@@ -298,11 +338,26 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
298
338
|
break;
|
|
299
339
|
}
|
|
300
340
|
}
|
|
301
|
-
} catch {
|
|
302
|
-
|
|
341
|
+
} catch (error$1) {
|
|
342
|
+
sitemapAttempts.push({
|
|
343
|
+
url: sitemapUrl,
|
|
344
|
+
success: false,
|
|
345
|
+
error: error$1 instanceof Error ? error$1.message : "Unknown error"
|
|
346
|
+
});
|
|
303
347
|
}
|
|
304
348
|
}
|
|
305
349
|
}
|
|
350
|
+
const successfulSitemaps = sitemapAttempts.filter((a) => a.success);
|
|
351
|
+
const failedSitemaps = sitemapAttempts.filter((a) => !a.success);
|
|
352
|
+
if (successfulSitemaps.length > 0) {
|
|
353
|
+
const sitemapUrl = successfulSitemaps[0].url;
|
|
354
|
+
if (progress.sitemap.processed > 0) p.note(`Found sitemap at ${sitemapUrl} with ${progress.sitemap.processed} URLs`, "Sitemap Discovery");
|
|
355
|
+
else p.note(`Found sitemap at ${sitemapUrl} but no URLs matched your search criteria`, "Sitemap Discovery");
|
|
356
|
+
} else if (failedSitemaps.length > 0) {
|
|
357
|
+
const firstAttempt = failedSitemaps[0];
|
|
358
|
+
if (firstAttempt.error?.includes("404")) p.note(`No sitemap found, using crawler to discover pages`, "Sitemap Discovery");
|
|
359
|
+
else p.note(`Could not access sitemap: ${firstAttempt.error}`, "Sitemap Discovery");
|
|
360
|
+
}
|
|
306
361
|
if (!startingUrls.includes(homePageUrl)) startingUrls.unshift(homePageUrl);
|
|
307
362
|
progress.sitemap.status = "completed";
|
|
308
363
|
progress.crawling.total = startingUrls.length;
|
|
@@ -313,14 +368,15 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
313
368
|
const processedUrls = new Set();
|
|
314
369
|
const shouldCrawlUrl = (url) => {
|
|
315
370
|
if (isUrlExcluded(url, exclude)) return false;
|
|
316
|
-
if (!patterns.some((p) => p.isGlob)) return true;
|
|
371
|
+
if (!patterns.some((p$1) => p$1.isGlob)) return true;
|
|
317
372
|
return patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
318
373
|
};
|
|
319
374
|
const createRequestHandler = (crawlerType) => {
|
|
320
|
-
return async ({ request, body, page, enqueueLinks }) => {
|
|
375
|
+
return async ({ request, body, page, enqueueLinks, response }) => {
|
|
321
376
|
const startTime = Date.now();
|
|
322
377
|
progress.crawling.currentUrl = request.loadedUrl;
|
|
323
378
|
onProgress?.(progress);
|
|
379
|
+
if (response?.statusCode && (response.statusCode < 200 || response.statusCode >= 300)) return;
|
|
324
380
|
const homePageUrl = new URL(startingUrls[0]).origin;
|
|
325
381
|
let html;
|
|
326
382
|
let title;
|
|
@@ -345,20 +401,22 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
345
401
|
const safeSegments = pathSegments.map((seg) => seg.replace(/[^\w\-]/g, "-"));
|
|
346
402
|
const filename = safeSegments.length > 0 ? safeSegments.join("/") : "index";
|
|
347
403
|
const safeFilename = normalize(`${filename}.md`);
|
|
348
|
-
filePath = join(outputDir,
|
|
404
|
+
filePath = join(outputDir, safeFilename);
|
|
349
405
|
if (generateIndividualMd) {
|
|
350
406
|
const fileDir = dirname(filePath);
|
|
351
407
|
if (fileDir && !existsSync(fileDir)) mkdirSync(fileDir, { recursive: true });
|
|
352
408
|
await writeFile(filePath, md, "utf-8");
|
|
353
409
|
}
|
|
354
410
|
}
|
|
355
|
-
const
|
|
411
|
+
const normalizedUrl = request.loadedUrl.replace(/\/$/, "");
|
|
412
|
+
const normalizedHomePageUrl = homePageUrl.replace(/\/$/, "");
|
|
413
|
+
const isHomePage = normalizedUrl === normalizedHomePageUrl;
|
|
356
414
|
if (shouldProcessMarkdown || isHomePage) {
|
|
357
415
|
const result = {
|
|
358
416
|
url: request.loadedUrl,
|
|
359
417
|
title,
|
|
360
418
|
content: md,
|
|
361
|
-
filePath:
|
|
419
|
+
filePath: shouldProcessMarkdown ? filePath : void 0,
|
|
362
420
|
timestamp: startTime,
|
|
363
421
|
success: true,
|
|
364
422
|
metadata,
|
|
@@ -384,7 +442,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
384
442
|
let crawler;
|
|
385
443
|
const crawlerOptions = {
|
|
386
444
|
requestHandler: createRequestHandler(driver),
|
|
387
|
-
errorHandler: async ({ request, response }
|
|
445
|
+
errorHandler: async ({ request, response }) => {
|
|
388
446
|
if (response?.statusCode && response?.statusCode >= 400) request.noRetry = true;
|
|
389
447
|
},
|
|
390
448
|
maxRequestsPerCrawl,
|
|
@@ -408,18 +466,29 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
408
466
|
onProgress?.(progress);
|
|
409
467
|
const successfulResults = results.filter((r$1) => r$1.success);
|
|
410
468
|
const firstUrl = new URL(withHttps(urls[0]));
|
|
469
|
+
const origin$1 = firstUrl.origin;
|
|
411
470
|
const homePageResult = successfulResults.find((r$1) => {
|
|
412
471
|
const resultUrl = new URL(withHttps(r$1.url));
|
|
413
|
-
|
|
414
|
-
return resultUrl.href === homeUrl.href;
|
|
472
|
+
return resultUrl.href === origin$1 || resultUrl.href === `${origin$1}/`;
|
|
415
473
|
});
|
|
416
|
-
const siteName = siteNameOverride || homePageResult?.metadata?.title || firstUrl.hostname;
|
|
474
|
+
const siteName = siteNameOverride || homePageResult?.metadata?.title || homePageResult?.title || firstUrl.hostname;
|
|
417
475
|
const description = descriptionOverride || homePageResult?.metadata?.description || successfulResults[0]?.metadata?.description;
|
|
418
476
|
if (generateLlmsTxt || generateLlmsFullTxt) {
|
|
419
477
|
progress.generation.current = "Generating llms.txt files";
|
|
420
478
|
onProgress?.(progress);
|
|
421
|
-
const contentResults = successfulResults.filter((result) =>
|
|
422
|
-
|
|
479
|
+
const contentResults = successfulResults.filter((result) => {
|
|
480
|
+
if (!result.content) return false;
|
|
481
|
+
const trimmedContent = result.content.trim();
|
|
482
|
+
const contentWithoutFrontmatter = trimmedContent.replace(/^---\s*\n(?:.*\n)*?---\s*/, "").trim();
|
|
483
|
+
return contentWithoutFrontmatter.length > 10;
|
|
484
|
+
});
|
|
485
|
+
const seenUrls = new Set();
|
|
486
|
+
const deduplicatedResults = contentResults.filter((result) => {
|
|
487
|
+
if (seenUrls.has(result.url)) return false;
|
|
488
|
+
seenUrls.add(result.url);
|
|
489
|
+
return true;
|
|
490
|
+
});
|
|
491
|
+
const processedFiles = deduplicatedResults.map((result) => ({
|
|
423
492
|
filePath: result.filePath,
|
|
424
493
|
title: result.title,
|
|
425
494
|
content: result.content,
|
|
@@ -430,8 +499,9 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
430
499
|
files: processedFiles,
|
|
431
500
|
siteName,
|
|
432
501
|
description,
|
|
433
|
-
origin: origin || firstUrl.origin,
|
|
434
|
-
generateFull: generateLlmsFullTxt
|
|
502
|
+
origin: origin$1 || firstUrl.origin,
|
|
503
|
+
generateFull: generateLlmsFullTxt,
|
|
504
|
+
outputDir
|
|
435
505
|
});
|
|
436
506
|
if (generateLlmsTxt) {
|
|
437
507
|
progress.generation.current = "Writing llms.txt";
|
package/dist/cli.mjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-BwURA9nQ.mjs";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
|
-
import { dirname, join, resolve } from "pathe";
|
|
4
|
-
import { fileURLToPath } from "node:url";
|
|
5
3
|
import * as p$1 from "@clack/prompts";
|
|
6
4
|
import * as p from "@clack/prompts";
|
|
5
|
+
import { dirname, join, resolve } from "pathe";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
7
|
import { addDependency } from "nypm";
|
|
8
8
|
|
|
9
9
|
//#region src/playwright-utils.ts
|
|
@@ -59,7 +59,7 @@ const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
|
59
59
|
const version = packageJson.version;
|
|
60
60
|
async function interactiveCrawl() {
|
|
61
61
|
console.clear();
|
|
62
|
-
p.intro(
|
|
62
|
+
p.intro(`☁️ @mdream/crawl v${version}`);
|
|
63
63
|
const urlsInput = await p.text({
|
|
64
64
|
message: "Enter starting URL for crawling (supports glob patterns):",
|
|
65
65
|
placeholder: "e.g. docs.example.com, site.com/docs/**",
|
|
@@ -94,7 +94,7 @@ async function interactiveCrawl() {
|
|
|
94
94
|
p.cancel(error instanceof Error ? error.message : "Invalid URL pattern");
|
|
95
95
|
return null;
|
|
96
96
|
}
|
|
97
|
-
const outputDir = "
|
|
97
|
+
const outputDir = "output";
|
|
98
98
|
const crawlerOptions = await p.group({
|
|
99
99
|
driver: () => p.select({
|
|
100
100
|
message: "Select crawler driver:",
|
|
@@ -121,29 +121,35 @@ async function interactiveCrawl() {
|
|
|
121
121
|
p.cancel("Operation cancelled.");
|
|
122
122
|
process.exit(0);
|
|
123
123
|
} });
|
|
124
|
-
const advancedOptions = await p.group({
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
124
|
+
const advancedOptions = await p.group({
|
|
125
|
+
outputFormats: () => p.multiselect({
|
|
126
|
+
message: "Select output formats:",
|
|
127
|
+
options: [
|
|
128
|
+
{
|
|
129
|
+
value: "llms.txt",
|
|
130
|
+
label: "llms.txt (basic format)",
|
|
131
|
+
hint: "Recommended"
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
value: "llms-full.txt",
|
|
135
|
+
label: "llms-full.txt (extended format)"
|
|
136
|
+
},
|
|
137
|
+
{
|
|
138
|
+
value: "markdown",
|
|
139
|
+
label: "Individual Markdown files"
|
|
140
|
+
}
|
|
141
|
+
],
|
|
142
|
+
initialValues: [
|
|
143
|
+
"llms.txt",
|
|
144
|
+
"llms-full.txt",
|
|
145
|
+
"markdown"
|
|
146
|
+
]
|
|
147
|
+
}),
|
|
148
|
+
verbose: () => p.confirm({
|
|
149
|
+
message: "Enable verbose logging?",
|
|
150
|
+
initialValue: false
|
|
151
|
+
})
|
|
152
|
+
}, { onCancel: () => {
|
|
147
153
|
p.cancel("Operation cancelled.");
|
|
148
154
|
process.exit(0);
|
|
149
155
|
} });
|
|
@@ -172,7 +178,8 @@ async function interactiveCrawl() {
|
|
|
172
178
|
`Follow links: Yes (depth ${crawlerOptions.maxDepth})`,
|
|
173
179
|
`Output formats: ${outputFormats.join(", ")}`,
|
|
174
180
|
`Sitemap discovery: Automatic`,
|
|
175
|
-
inferredOrigin && `Origin: ${inferredOrigin}
|
|
181
|
+
inferredOrigin && `Origin: ${inferredOrigin}`,
|
|
182
|
+
advancedOptions.verbose && `Verbose logging: Enabled`
|
|
176
183
|
].filter(Boolean);
|
|
177
184
|
p.note(summary.join("\n"), "Crawl Configuration");
|
|
178
185
|
const shouldProceed = await p.confirm({
|
|
@@ -194,19 +201,18 @@ async function interactiveCrawl() {
|
|
|
194
201
|
generateLlmsFullTxt: advancedOptions.outputFormats.includes("llms-full.txt"),
|
|
195
202
|
generateIndividualMd: advancedOptions.outputFormats.includes("markdown"),
|
|
196
203
|
origin: inferredOrigin,
|
|
197
|
-
globPatterns
|
|
204
|
+
globPatterns,
|
|
205
|
+
verbose: advancedOptions.verbose
|
|
198
206
|
};
|
|
199
207
|
}
|
|
200
|
-
async function showCrawlResults(successful, failed, outputDir, generatedFiles,
|
|
208
|
+
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
|
|
201
209
|
const messages = [];
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
messages.push(`📁 Output: ${outputDir}`);
|
|
209
|
-
p.note(messages.join("\n"), "Completed: Results");
|
|
210
|
+
const durationStr = `${durationSeconds.toFixed(1)}s`;
|
|
211
|
+
const stats = failed > 0 ? `${successful} pages, ${failed} failed` : `${successful} pages`;
|
|
212
|
+
messages.push(`📄 ${stats} • ⏱️ ${durationStr}`);
|
|
213
|
+
messages.push(`📦 ${generatedFiles.join(", ")}`);
|
|
214
|
+
messages.push(`📁 ${outputDir}`);
|
|
215
|
+
p.note(messages.join("\n"), "✅ Complete");
|
|
210
216
|
}
|
|
211
217
|
function parseCliArgs() {
|
|
212
218
|
const args = process.argv.slice(2);
|
|
@@ -222,7 +228,7 @@ Usage:
|
|
|
222
228
|
|
|
223
229
|
Options:
|
|
224
230
|
-u, --url <url> Website URL to crawl
|
|
225
|
-
-o, --output <dir> Output directory (default:
|
|
231
|
+
-o, --output <dir> Output directory (default: output)
|
|
226
232
|
-d, --depth <number> Crawl depth (default: 3)
|
|
227
233
|
--driver <http|playwright> Crawler driver (default: http)
|
|
228
234
|
--artifacts <list> Comma-separated list of artifacts: llms.txt,llms-full.txt,markdown (default: all)
|
|
@@ -232,6 +238,7 @@ Options:
|
|
|
232
238
|
--max-pages <number> Maximum pages to crawl (default: unlimited)
|
|
233
239
|
--crawl-delay <seconds> Crawl delay in seconds
|
|
234
240
|
--exclude <pattern> Exclude URLs matching glob patterns (can be used multiple times)
|
|
241
|
+
-v, --verbose Enable verbose logging
|
|
235
242
|
-h, --help Show this help message
|
|
236
243
|
--version Show version number
|
|
237
244
|
|
|
@@ -241,6 +248,7 @@ Examples:
|
|
|
241
248
|
@mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
|
|
242
249
|
@mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
|
|
243
250
|
@mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
|
|
251
|
+
@mdream/crawl -u example.com --verbose
|
|
244
252
|
`);
|
|
245
253
|
process.exit(0);
|
|
246
254
|
}
|
|
@@ -349,9 +357,10 @@ Examples:
|
|
|
349
357
|
const siteNameOverride = getArgValue("--site-name");
|
|
350
358
|
const descriptionOverride = getArgValue("--description");
|
|
351
359
|
const patterns = [parsed];
|
|
360
|
+
const verbose = args.includes("--verbose") || args.includes("-v");
|
|
352
361
|
return {
|
|
353
362
|
urls: [url],
|
|
354
|
-
outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "
|
|
363
|
+
outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
|
|
355
364
|
driver: driver || "http",
|
|
356
365
|
maxRequestsPerCrawl: Number.parseInt(maxPagesStr || String(Number.MAX_SAFE_INTEGER)),
|
|
357
366
|
followLinks: true,
|
|
@@ -364,7 +373,8 @@ Examples:
|
|
|
364
373
|
origin: inferredOrigin,
|
|
365
374
|
globPatterns: patterns,
|
|
366
375
|
crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
|
|
367
|
-
exclude: excludePatterns.length > 0 ? excludePatterns : void 0
|
|
376
|
+
exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
|
|
377
|
+
verbose
|
|
368
378
|
};
|
|
369
379
|
}
|
|
370
380
|
async function main() {
|
|
@@ -383,7 +393,8 @@ async function main() {
|
|
|
383
393
|
`Driver: ${options.driver}`,
|
|
384
394
|
`Depth: ${options.maxDepth}`,
|
|
385
395
|
`Formats: ${formats.join(", ")}`,
|
|
386
|
-
options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}
|
|
396
|
+
options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
|
|
397
|
+
options.verbose && `Verbose: Enabled`
|
|
387
398
|
].filter(Boolean);
|
|
388
399
|
p.note(summary.join("\n"), "Configuration");
|
|
389
400
|
} else options = await interactiveCrawl();
|
|
@@ -419,9 +430,7 @@ async function main() {
|
|
|
419
430
|
s.stop();
|
|
420
431
|
const endTime = Date.now();
|
|
421
432
|
const durationMs = endTime - startTime;
|
|
422
|
-
const durationSeconds =
|
|
423
|
-
const durationMinutes = Math.floor(durationSeconds / 60);
|
|
424
|
-
const remainingSeconds = durationSeconds % 60;
|
|
433
|
+
const durationSeconds = durationMs / 1e3;
|
|
425
434
|
const successful = results.filter((r) => r.success).length;
|
|
426
435
|
const failed = results.filter((r) => !r.success).length;
|
|
427
436
|
const failedResults = results.filter((r) => !r.success);
|
|
@@ -442,17 +451,9 @@ async function main() {
|
|
|
442
451
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
443
452
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
444
453
|
}
|
|
445
|
-
if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles,
|
|
454
|
+
if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
446
455
|
else {
|
|
447
|
-
|
|
448
|
-
if (successful > 0) {
|
|
449
|
-
const durationStr = durationMinutes > 0 ? `${durationMinutes}m ${remainingSeconds}s` : `${remainingSeconds}s`;
|
|
450
|
-
messages.push(`✅ ${successful} pages processed in ${durationStr}`);
|
|
451
|
-
}
|
|
452
|
-
if (failed > 0) messages.push(`❌ ${failed} pages failed`);
|
|
453
|
-
if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
|
|
454
|
-
messages.push(`📁 Output: ${options.outputDir}`);
|
|
455
|
-
p.note(messages.join("\n"), "Completed: Results");
|
|
456
|
+
await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationSeconds);
|
|
456
457
|
if (successful === 0) process.exit(1);
|
|
457
458
|
}
|
|
458
459
|
}
|
package/dist/index.d.mts
CHANGED
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.8.1",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -49,11 +49,11 @@
|
|
|
49
49
|
"crawlee": "^3.13.10",
|
|
50
50
|
"nypm": "^0.6.0",
|
|
51
51
|
"pathe": "^2.0.3",
|
|
52
|
-
"picomatch": "^4.0.
|
|
53
|
-
"mdream": "0.
|
|
52
|
+
"picomatch": "^4.0.3",
|
|
53
|
+
"mdream": "0.8.1"
|
|
54
54
|
},
|
|
55
55
|
"devDependencies": {
|
|
56
|
-
"@types/picomatch": "^4.0.
|
|
56
|
+
"@types/picomatch": "^4.0.1"
|
|
57
57
|
},
|
|
58
58
|
"scripts": {
|
|
59
59
|
"build": "obuild",
|