@mdream/crawl 0.9.1 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-BtuYX2_u.mjs → crawl-D8WIR9L5.mjs} +115 -19
- package/dist/cli.mjs +42 -28
- package/dist/index.d.mts +1 -0
- package/dist/index.mjs +1 -1
- package/package.json +5 -5
package/dist/_chunks/{crawl-BtuYX2_u.mjs → crawl-D8WIR9L5.mjs}
CHANGED
@@ -166,21 +166,56 @@ function extractMetadata(html, url) {
   //#endregion
   //#region src/crawl.ts
   async function loadSitemapWithoutRetries(sitemapUrl) {
-    const …
-    …
-    if (…
-    …
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), 1e4);
+    try {
+      const response = await fetch(sitemapUrl, {
+        signal: controller.signal,
+        headers: { "User-Agent": "mdream-crawler/1.0" }
+      });
+      clearTimeout(timeoutId);
+      if (!response.ok) throw new Error(`Sitemap not found: ${response.status}`);
+      const xmlContent = await response.text();
+      if (xmlContent.includes("<sitemapindex")) {
+        const sitemapIndexRegex = /<sitemap[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/sitemap>/gs;
+        const childSitemaps = [];
+        let match;
+        while (true) {
+          match = sitemapIndexRegex.exec(xmlContent);
+          if (match === null) break;
+          let url = match[1];
+          if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
+          childSitemaps.push(url);
+        }
+        const allUrls = [];
+        for (const childSitemapUrl of childSitemaps) try {
+          const childUrls = await loadSitemapWithoutRetries(childSitemapUrl);
+          allUrls.push(...childUrls);
+        } catch (error) {
+          console.warn(`Failed to load child sitemap ${childSitemapUrl}:`, error instanceof Error ? error.message : "Unknown error");
+        }
+        return allUrls;
+      } else {
+        const urls = [];
+        const urlRegex = /<url[^>]*>.*?<loc>(.*?)<\/loc>.*?<\/url>/gs;
+        let match;
+        while (true) {
+          match = urlRegex.exec(xmlContent);
+          if (match === null) break;
+          let url = match[1];
+          if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) url = url.slice(9, -3);
+          urls.push(url);
+        }
+        return urls;
+      }
+    } catch (error) {
+      clearTimeout(timeoutId);
+      if (error instanceof Error && error.name === "AbortError") throw new Error("Sitemap request timed out after 10 seconds");
+      throw error;
     }
-    return urls;
   }
   async function crawlAndGenerate(options, onProgress) {
-    const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false } = options;
+    const { urls, outputDir: rawOutputDir, maxRequestsPerCrawl = Number.MAX_SAFE_INTEGER, generateLlmsTxt = true, generateLlmsFullTxt = false, generateIndividualMd = true, origin, driver = "http", useChrome, followLinks = false, maxDepth = 1, globPatterns = [], crawlDelay, exclude = [], siteNameOverride, descriptionOverride, verbose = false, skipSitemap = false } = options;
     const outputDir = resolve(normalize(rawOutputDir));
     if (verbose) log.setLevel(log.LEVELS.INFO);
     else log.setLevel(log.LEVELS.OFF);
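The rewritten loadSitemapWithoutRetries above replaces the previous retry-based loader with a single attempt that is aborted after 10 seconds. For illustration only (not part of the package), a minimal standalone sketch of that timeout pattern; the helper name fetchWithTimeout is hypothetical:

    // Abort the request once `ms` elapses (the diff uses 1e4, i.e. 10 000 ms).
    async function fetchWithTimeout(url, ms = 1e4) {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), ms);
      try {
        return await fetch(url, {
          signal: controller.signal,
          headers: { "User-Agent": "mdream-crawler/1.0" }
        });
      } catch (error) {
        // fetch rejects with an AbortError after controller.abort() fires
        if (error instanceof Error && error.name === "AbortError") throw new Error(`Request timed out after ${ms}ms`);
        throw error;
      } finally {
        clearTimeout(timeoutId);
      }
    }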
@@ -205,13 +240,25 @@ async function crawlAndGenerate(options, onProgress) {
       generation: { status: "idle" }
     };
     const sitemapAttempts = [];
-    if (startingUrls.length > 0) {
+    if (startingUrls.length > 0 && !skipSitemap) {
       const baseUrl = new URL(startingUrls[0]).origin;
       const homePageUrl = baseUrl;
       onProgress?.(progress);
       const robotsUrl = new URL("/robots.txt", baseUrl).toString();
-      const …
-      …
+      const robotsController = new AbortController();
+      const robotsTimeoutId = setTimeout(() => robotsController.abort(), 1e4);
+      let robotsResponse;
+      try {
+        robotsResponse = await fetch(robotsUrl, {
+          signal: robotsController.signal,
+          headers: { "User-Agent": "mdream-crawler/1.0" }
+        });
+        clearTimeout(robotsTimeoutId);
+      } catch (error) {
+        clearTimeout(robotsTimeoutId);
+        robotsResponse = null;
+      }
+      if (robotsResponse?.ok) {
         const robotsContent = await robotsResponse.text();
         const sitemapMatches = robotsContent.match(/Sitemap:\s*(.*)/gi);
         if (sitemapMatches && sitemapMatches.length > 0) {
@@ -348,6 +395,12 @@ async function crawlAndGenerate(options, onProgress) {
       progress.sitemap.status = "completed";
       progress.crawling.total = startingUrls.length;
       onProgress?.(progress);
+    } else if (skipSitemap && startingUrls.length > 0) {
+      progress.sitemap.status = "completed";
+      progress.sitemap.found = 0;
+      progress.sitemap.processed = 0;
+      progress.crawling.total = startingUrls.length;
+      onProgress?.(progress);
     }
     if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });
     const results = [];
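The hunks above thread the new skipSitemap option through crawlAndGenerate: when set, robots.txt and sitemap.xml discovery is bypassed and the sitemap phase is reported as completed with zero URLs found. A hedged usage sketch (the root import path is an assumption; option names and progress fields come from the code above, URL and output directory are placeholders):

    import { crawlAndGenerate } from "@mdream/crawl"; // assumed entry point; the CLI imports it from the crawl chunk

    await crawlAndGenerate(
      {
        urls: ["https://example.com/docs"], // placeholder
        outputDir: "./output",
        skipSitemap: true, // new option: crawl only from the given URLs, no robots.txt/sitemap.xml lookup
        verbose: true
      },
      (progress) => console.log(progress.sitemap.status, progress.crawling.status)
    );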
@@ -428,11 +481,46 @@ async function crawlAndGenerate(options, onProgress) {
   let crawler;
   const crawlerOptions = {
     requestHandler: createRequestHandler(driver),
-    errorHandler: async ({ request, response }) => {
-      if (…
+    errorHandler: async ({ request, response, error }) => {
+      if (verbose) console.error(`[ERROR] URL: ${request.url}, Status: ${response?.statusCode || "N/A"}, Error: ${error?.message || "Unknown"}`);
+      if (response?.statusCode && response?.statusCode >= 400) {
+        request.noRetry = true;
+        const result = {
+          url: request.url,
+          title: "",
+          content: "",
+          timestamp: Date.now(),
+          success: false,
+          error: `HTTP ${response.statusCode}`,
+          metadata: {
+            title: "",
+            description: "",
+            links: []
+          },
+          depth: request.userData?.depth || 0
+        };
+        results.push(result);
+      } else if (error) {
+        request.noRetry = true;
+        const result = {
+          url: request.url,
+          title: "",
+          content: "",
+          timestamp: Date.now(),
+          success: false,
+          error: error.message || "Unknown error",
+          metadata: {
+            title: "",
+            description: "",
+            links: []
+          },
+          depth: request.userData?.depth || 0
+        };
+        results.push(result);
+      }
     },
     maxRequestsPerCrawl,
-    respectRobotsTxtFile: …
+    respectRobotsTxtFile: false
   };
   if (crawlDelay) crawlerOptions.requestHandlerTimeoutSecs = crawlDelay;
   if (driver === "playwright") {
@@ -450,7 +538,15 @@ async function crawlAndGenerate(options, onProgress) {
   progress.crawling.status = "processing";
   progress.crawling.total = startingUrls.length;
   onProgress?.(progress);
-  …
+  try {
+    await crawler.run(initialRequests);
+  } catch (error) {
+    if (verbose) {
+      console.error(`[CRAWLER ERROR] ${error instanceof Error ? error.message : "Unknown error"}`);
+      console.error(`[CRAWLER ERROR] Stack trace:`, error instanceof Error ? error.stack : "No stack trace");
+    }
+    throw error;
+  }
   progress.crawling.status = "completed";
   onProgress?.(progress);
   if (results.some((r) => r.success)) {
package/dist/cli.mjs
CHANGED
@@ -1,4 +1,4 @@
-import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-…
+import { crawlAndGenerate, parseUrlPattern, validateGlobPattern } from "./_chunks/crawl-D8WIR9L5.mjs";
 import { accessSync, constants, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
 import * as p$1 from "@clack/prompts";
 import * as p from "@clack/prompts";
@@ -156,29 +156,35 @@ async function interactiveCrawl() {
     p.cancel("Operation cancelled.");
     process.exit(0);
   } });
-  const advancedOptions = await p.group({
-    …
+  const advancedOptions = await p.group({
+    outputFormats: () => p.multiselect({
+      message: "Select output formats:",
+      options: [
+        {
+          value: "llms.txt",
+          label: "llms.txt (basic format)",
+          hint: "Recommended"
+        },
+        {
+          value: "llms-full.txt",
+          label: "llms-full.txt (extended format)"
+        },
+        {
+          value: "markdown",
+          label: "Individual Markdown files"
+        }
+      ],
+      initialValues: [
+        "llms.txt",
+        "llms-full.txt",
+        "markdown"
+      ]
+    }),
+    skipSitemap: () => p.confirm({
+      message: "Skip sitemap.xml and robots.txt discovery?",
+      initialValue: false
+    })
+  }, { onCancel: () => {
     p.cancel("Operation cancelled.");
     process.exit(0);
   } });
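With @clack/prompts, p.group resolves each prompt and returns an object keyed by the prompt names, so the confirm added above contributes a boolean next to the multiselect result. A rough sketch of the resolved shape (illustrative values, not from the diff):

    const advancedOptions = {
      outputFormats: ["llms.txt", "llms-full.txt", "markdown"], // p.multiselect resolves to the selected values
      skipSitemap: false                                        // p.confirm resolves to a boolean
    };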
@@ -206,10 +212,11 @@ async function interactiveCrawl() {
     `Max pages: Unlimited`,
     `Follow links: Yes (depth 3)`,
     `Output formats: ${outputFormats.join(", ")}`,
-    `Sitemap discovery: Automatic`,
+    `Sitemap discovery: ${advancedOptions.skipSitemap ? "Skipped" : "Automatic"}`,
     inferredOrigin && `Origin: ${inferredOrigin}`
   ].filter(Boolean);
   p.note(summary.join("\n"), "Crawl Configuration");
+  if (advancedOptions.skipSitemap && globPatterns.some((p$2) => p$2.isGlob)) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
   return {
     urls,
     outputDir: resolve(outputDir),
@@ -222,7 +229,8 @@ async function interactiveCrawl() {
     origin: inferredOrigin,
     globPatterns,
     verbose: false,
-    maxDepth: 3
+    maxDepth: 3,
+    skipSitemap: advancedOptions.skipSitemap
   };
 }
 async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationSeconds) {
@@ -258,17 +266,19 @@ Options:
   --max-pages <number>      Maximum pages to crawl (default: unlimited)
   --crawl-delay <seconds>   Crawl delay in seconds
   --exclude <pattern>       Exclude URLs matching glob patterns (can be used multiple times)
+  --skip-sitemap            Skip sitemap.xml and robots.txt discovery
   -v, --verbose             Enable verbose logging
   -h, --help                Show this help message
   --version                 Show version number

-  Note: Sitemap discovery and robots.txt checking are automatic
+  Note: Sitemap discovery and robots.txt checking are automatic unless --skip-sitemap is used.

   Examples:
     @mdream/crawl -u harlanzw.com --artifacts "llms.txt,markdown"
     @mdream/crawl --url https://docs.example.com --depth 2 --artifacts "llms-full.txt"
     @mdream/crawl -u example.com --exclude "*/admin/*" --exclude "*/api/*"
     @mdream/crawl -u example.com --verbose
+    @mdream/crawl -u example.com --skip-sitemap
   `);
   process.exit(0);
 }
@@ -378,6 +388,8 @@ Examples:
   const descriptionOverride = getArgValue("--description");
   const patterns = [parsed];
   const verbose = args.includes("--verbose") || args.includes("-v");
+  const skipSitemap = args.includes("--skip-sitemap");
+  if (skipSitemap && parsed.isGlob) p.log.warn("Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.");
   return {
     urls: [url],
     outputDir: resolve(getArgValue("--output") || getArgValue("-o") || "output"),
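Because glob URL patterns normally rely on sitemap discovery to enumerate matching pages, the new flag parsing warns when the two are combined. A hypothetical invocation (the pattern is a placeholder, following the Examples block above):

    @mdream/crawl -u "example.com/docs/*" --skip-sitemap
    # prints: Warning: Using --skip-sitemap with glob URLs may not discover all matching pages.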
@@ -394,7 +406,8 @@ Examples:
     globPatterns: patterns,
     crawlDelay: crawlDelayStr ? Number.parseInt(crawlDelayStr) : void 0,
     exclude: excludePatterns.length > 0 ? excludePatterns : void 0,
-    verbose
+    verbose,
+    skipSitemap
   };
 }
 async function main() {
@@ -414,6 +427,7 @@ async function main() {
     `Depth: ${options.maxDepth}`,
     `Formats: ${formats.join(", ")}`,
     options.exclude && options.exclude.length > 0 && `Exclude: ${options.exclude.join(", ")}`,
+    options.skipSitemap && `Skip sitemap: Yes`,
     options.verbose && `Verbose: Enabled`
   ].filter(Boolean);
   p.note(summary.join("\n"), "Configuration");
package/dist/index.d.mts
CHANGED
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@mdream/crawl",
   "type": "module",
-  "version": "0.9.1",
+  "version": "0.10.1",
   "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
   "author": {
     "name": "Harlan Wilton",
@@ -46,15 +46,15 @@
   },
   "dependencies": {
     "@clack/prompts": "^0.11.0",
-    "crawlee": "^3.14.…
-    "nypm": "^0.6.…
+    "crawlee": "^3.14.1",
+    "nypm": "^0.6.1",
     "pathe": "^2.0.3",
     "picomatch": "^4.0.3",
     "ufo": "^1.6.1",
-    "mdream": "0.…
+    "mdream": "0.10.1"
   },
   "devDependencies": {
-    "@types/picomatch": "^4.0.…
+    "@types/picomatch": "^4.0.2"
   },
   "scripts": {
     "build": "obuild",