@mdream/crawl 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{crawl-NJU1Dyc-.mjs → crawl-B5MaCj6O.mjs} +38 -28
- package/dist/cli.mjs +21 -15
- package/dist/index.mjs +1 -1
- package/package.json +6 -3
|
@@ -4,7 +4,7 @@ import { HttpCrawler, Sitemap, purgeDefaultStorages } from "crawlee";
|
|
|
4
4
|
import { generateLlmsTxtArtifacts, htmlToMarkdown } from "mdream";
|
|
5
5
|
import { withMinimalPreset } from "mdream/preset/minimal";
|
|
6
6
|
import { dirname, join, normalize, resolve } from "pathe";
|
|
7
|
-
import
|
|
7
|
+
import picomatch from "picomatch";
|
|
8
8
|
import { extractionPlugin } from "mdream/plugins";
|
|
9
9
|
|
|
10
10
|
//#region ../../node_modules/.pnpm/ufo@1.6.1/node_modules/ufo/dist/index.mjs
|
|
@@ -60,7 +60,12 @@ function matchesGlobPattern(url, parsedPattern) {
|
|
|
60
60
|
const urlPath = urlObj.pathname + urlObj.search + urlObj.hash;
|
|
61
61
|
const urlBase = `${urlObj.protocol}//${urlObj.host}`;
|
|
62
62
|
if (urlBase !== parsedPattern.baseUrl) return false;
|
|
63
|
-
|
|
63
|
+
let pattern = parsedPattern.pattern;
|
|
64
|
+
if (pattern.endsWith("*") && !pattern.endsWith("**") && !pattern.endsWith("/*")) {
|
|
65
|
+
const base = pattern.slice(0, -1);
|
|
66
|
+
pattern = `{${base},${base}/**}`;
|
|
67
|
+
}
|
|
68
|
+
return picomatch(pattern)(urlPath);
|
|
64
69
|
} catch {
|
|
65
70
|
return false;
|
|
66
71
|
}
|
|
@@ -95,9 +100,9 @@ function isUrlExcluded(url, excludePatterns) {
|
|
|
95
100
|
}
|
|
96
101
|
if (pattern.startsWith("/")) {
|
|
97
102
|
const adjustedPattern = pattern.endsWith("/*") ? pattern.replace("/*", "/**") : pattern;
|
|
98
|
-
return
|
|
103
|
+
return picomatch(adjustedPattern)(urlPath);
|
|
99
104
|
}
|
|
100
|
-
return
|
|
105
|
+
return picomatch(pattern)(urlPath) || picomatch(pattern)(urlPath.substring(1));
|
|
101
106
|
});
|
|
102
107
|
} catch {
|
|
103
108
|
return false;
|
|
@@ -237,6 +242,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
237
242
|
}
|
|
238
243
|
}
|
|
239
244
|
}
|
|
245
|
+
let mainSitemapProcessed = false;
|
|
240
246
|
try {
|
|
241
247
|
const { urls: sitemapUrls } = await Sitemap.load(`${baseUrl}/sitemap.xml`);
|
|
242
248
|
const hasGlobPatterns = patterns.some((p) => p.isGlob);
|
|
@@ -248,6 +254,7 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
248
254
|
progress.sitemap.found = sitemapUrls.length;
|
|
249
255
|
progress.sitemap.processed = filteredUrls.length;
|
|
250
256
|
onProgress?.(progress);
|
|
257
|
+
mainSitemapProcessed = true;
|
|
251
258
|
} else {
|
|
252
259
|
const filteredUrls = sitemapUrls.filter((url) => {
|
|
253
260
|
return !isUrlExcluded(url, exclude);
|
|
@@ -257,40 +264,43 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
257
264
|
progress.sitemap.found = sitemapUrls.length;
|
|
258
265
|
progress.sitemap.processed = filteredUrls.length;
|
|
259
266
|
onProgress?.(progress);
|
|
267
|
+
mainSitemapProcessed = true;
|
|
260
268
|
}
|
|
261
269
|
}
|
|
262
270
|
} catch {
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
const
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
progress.sitemap.found = altUrls.length;
|
|
277
|
-
progress.sitemap.processed = filteredUrls.length;
|
|
278
|
-
onProgress?.(progress);
|
|
279
|
-
break;
|
|
280
|
-
} else {
|
|
281
|
-
const filteredUrls = altUrls.filter((url) => {
|
|
282
|
-
return !isUrlExcluded(url, exclude);
|
|
283
|
-
});
|
|
284
|
-
if (filteredUrls.length > 0) {
|
|
271
|
+
if (!mainSitemapProcessed) {
|
|
272
|
+
const commonSitemaps = [
|
|
273
|
+
`${baseUrl}/sitemap_index.xml`,
|
|
274
|
+
`${baseUrl}/sitemaps.xml`,
|
|
275
|
+
`${baseUrl}/sitemap-index.xml`
|
|
276
|
+
];
|
|
277
|
+
for (const sitemapUrl of commonSitemaps) try {
|
|
278
|
+
const { urls: altUrls } = await Sitemap.load(sitemapUrl);
|
|
279
|
+
const hasGlobPatterns = patterns.some((p) => p.isGlob);
|
|
280
|
+
if (hasGlobPatterns) {
|
|
281
|
+
const filteredUrls = altUrls.filter((url) => {
|
|
282
|
+
return !isUrlExcluded(url, exclude) && patterns.some((pattern) => matchesGlobPattern(url, pattern));
|
|
283
|
+
});
|
|
285
284
|
startingUrls = filteredUrls;
|
|
286
285
|
progress.sitemap.found = altUrls.length;
|
|
287
286
|
progress.sitemap.processed = filteredUrls.length;
|
|
288
287
|
onProgress?.(progress);
|
|
289
288
|
break;
|
|
289
|
+
} else {
|
|
290
|
+
const filteredUrls = altUrls.filter((url) => {
|
|
291
|
+
return !isUrlExcluded(url, exclude);
|
|
292
|
+
});
|
|
293
|
+
if (filteredUrls.length > 0) {
|
|
294
|
+
startingUrls = filteredUrls;
|
|
295
|
+
progress.sitemap.found = altUrls.length;
|
|
296
|
+
progress.sitemap.processed = filteredUrls.length;
|
|
297
|
+
onProgress?.(progress);
|
|
298
|
+
break;
|
|
299
|
+
}
|
|
290
300
|
}
|
|
301
|
+
} catch {
|
|
302
|
+
continue;
|
|
291
303
|
}
|
|
292
|
-
} catch {
|
|
293
|
-
continue;
|
|
294
304
|
}
|
|
295
305
|
}
|
|
296
306
|
if (!startingUrls.includes(homePageUrl)) startingUrls.unshift(homePageUrl);
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-NJU1Dyc-.mjs";
|
|
1
|
+
import { crawlAndGenerate, parseUrlPattern, validateGlobPattern, withHttps } from "./_chunks/crawl-B5MaCj6O.mjs";
|
|
2
2
|
import { readFileSync } from "node:fs";
|
|
3
3
|
import { dirname, join, resolve } from "pathe";
|
|
4
4
|
import { fileURLToPath } from "node:url";
|
|
@@ -197,15 +197,16 @@ async function interactiveCrawl() {
|
|
|
197
197
|
globPatterns
|
|
198
198
|
};
|
|
199
199
|
}
|
|
200
|
-
async function showCrawlResults(successful, failed, outputDir, generatedFiles) {
|
|
200
|
+
async function showCrawlResults(successful, failed, outputDir, generatedFiles, durationMinutes, remainingSeconds) {
|
|
201
201
|
const messages = [];
|
|
202
|
-
if (successful > 0)
|
|
202
|
+
if (successful > 0) {
|
|
203
|
+
const durationStr = durationMinutes > 0 ? `${durationMinutes}m ${remainingSeconds}s` : `${remainingSeconds}s`;
|
|
204
|
+
messages.push(`✅ ${successful} pages processed in ${durationStr}`);
|
|
205
|
+
}
|
|
203
206
|
if (failed > 0) messages.push(`❌ ${failed} pages failed`);
|
|
204
207
|
if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
|
|
205
208
|
messages.push(`📁 Output: ${outputDir}`);
|
|
206
|
-
p.note(messages.join("\n"), "
|
|
207
|
-
if (successful > 0) p.outro("🎉 Crawling completed successfully!");
|
|
208
|
-
else p.outro("❌ Crawling failed - no pages processed");
|
|
209
|
+
p.note(messages.join("\n"), "Completed: Results");
|
|
209
210
|
}
|
|
210
211
|
function parseCliArgs() {
|
|
211
212
|
const args = process.argv.slice(2);
|
|
@@ -396,6 +397,7 @@ async function main() {
|
|
|
396
397
|
}
|
|
397
398
|
const s = p.spinner();
|
|
398
399
|
s.start("Starting crawl...");
|
|
400
|
+
const startTime = Date.now();
|
|
399
401
|
const results = await crawlAndGenerate(options, (progress) => {
|
|
400
402
|
if (progress.sitemap.status === "discovering") s.message("Discovering sitemaps...");
|
|
401
403
|
else if (progress.sitemap.status === "processing") s.message(`Processing sitemap... Found ${progress.sitemap.found} URLs`);
|
|
@@ -414,7 +416,12 @@ async function main() {
|
|
|
414
416
|
s.message(current);
|
|
415
417
|
}
|
|
416
418
|
});
|
|
417
|
-
s.stop(
|
|
419
|
+
s.stop();
|
|
420
|
+
const endTime = Date.now();
|
|
421
|
+
const durationMs = endTime - startTime;
|
|
422
|
+
const durationSeconds = Math.floor(durationMs / 1e3);
|
|
423
|
+
const durationMinutes = Math.floor(durationSeconds / 60);
|
|
424
|
+
const remainingSeconds = durationSeconds % 60;
|
|
418
425
|
const successful = results.filter((r) => r.success).length;
|
|
419
426
|
const failed = results.filter((r) => !r.success).length;
|
|
420
427
|
const failedResults = results.filter((r) => !r.success);
|
|
@@ -435,19 +442,18 @@ async function main() {
|
|
|
435
442
|
if (options.generateLlmsFullTxt) generatedFiles.push("llms-full.txt");
|
|
436
443
|
if (options.generateIndividualMd) generatedFiles.push(`${successful} MD files`);
|
|
437
444
|
}
|
|
438
|
-
if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles);
|
|
445
|
+
if (!cliOptions) await showCrawlResults(successful, failed, options.outputDir, generatedFiles, durationMinutes, remainingSeconds);
|
|
439
446
|
else {
|
|
440
447
|
const messages = [];
|
|
441
|
-
if (successful > 0)
|
|
448
|
+
if (successful > 0) {
|
|
449
|
+
const durationStr = durationMinutes > 0 ? `${durationMinutes}m ${remainingSeconds}s` : `${remainingSeconds}s`;
|
|
450
|
+
messages.push(`✅ ${successful} pages processed in ${durationStr}`);
|
|
451
|
+
}
|
|
442
452
|
if (failed > 0) messages.push(`❌ ${failed} pages failed`);
|
|
443
453
|
if (generatedFiles.length > 0) messages.push(`📄 Generated: ${generatedFiles.join(", ")}`);
|
|
444
454
|
messages.push(`📁 Output: ${options.outputDir}`);
|
|
445
|
-
p.note(messages.join("\n"), "Results");
|
|
446
|
-
if (successful
|
|
447
|
-
else {
|
|
448
|
-
p.outro("❌ Crawling failed - no pages processed");
|
|
449
|
-
process.exit(1);
|
|
450
|
-
}
|
|
455
|
+
p.note(messages.join("\n"), "Completed: Results");
|
|
456
|
+
if (successful === 0) process.exit(1);
|
|
451
457
|
}
|
|
452
458
|
}
|
|
453
459
|
main().catch((error) => {
|
package/dist/index.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.7.0",
|
|
4
|
+
"version": "0.7.1",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -47,10 +47,13 @@
|
|
|
47
47
|
"dependencies": {
|
|
48
48
|
"@clack/prompts": "^0.11.0",
|
|
49
49
|
"crawlee": "^3.13.9",
|
|
50
|
-
"minimatch": "^10.0.3",
|
|
51
50
|
"nypm": "^0.6.0",
|
|
52
51
|
"pathe": "^2.0.3",
|
|
53
|
-
"
|
|
52
|
+
"picomatch": "^4.0.2",
|
|
53
|
+
"mdream": "0.7.1"
|
|
54
|
+
},
|
|
55
|
+
"devDependencies": {
|
|
56
|
+
"@types/picomatch": "^4.0.0"
|
|
54
57
|
},
|
|
55
58
|
"scripts": {
|
|
56
59
|
"build": "obuild",
|