@mdream/crawl 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/crawl.mjs +116 -56
- package/package.json +5 -5
package/dist/_chunks/crawl.mjs
CHANGED
|
@@ -136,6 +136,20 @@ const FETCH_HEADERS = {
|
|
|
136
136
|
"Accept": "text/html,application/xhtml+xml,text/markdown"
|
|
137
137
|
};
|
|
138
138
|
const DEFAULT_CONCURRENCY = 20;
|
|
139
|
+
const IGNORED_PATH_PREFIXES = [
|
|
140
|
+
"/cdn-cgi/",
|
|
141
|
+
"/_next/",
|
|
142
|
+
"/_nuxt/",
|
|
143
|
+
"/__",
|
|
144
|
+
"/wp-admin/",
|
|
145
|
+
"/wp-json/",
|
|
146
|
+
"/wp-includes/",
|
|
147
|
+
"/wp-content/uploads/",
|
|
148
|
+
"/api/",
|
|
149
|
+
"/assets/",
|
|
150
|
+
"/static/"
|
|
151
|
+
];
|
|
152
|
+
const HTML_EXTENSIONS_RE = /\.(html?|php|aspx?|jsp)$/i;
|
|
139
153
|
function extractCdataUrl(url) {
|
|
140
154
|
if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) return url.slice(9, -3);
|
|
141
155
|
return url;
|
|
@@ -186,7 +200,8 @@ function extractMetadataInline(parsedUrl, allowedDomains) {
|
|
|
186
200
|
const href = el.attributes.href;
|
|
187
201
|
if (href) try {
|
|
188
202
|
const resolved = new URL(href, url);
|
|
189
|
-
|
|
203
|
+
resolved.hash = "";
|
|
204
|
+
const absoluteUrl = resolved.href.replace(URL_TRAILING_SLASH_RE, "") || resolved.href;
|
|
190
205
|
if (allowedDomains) {
|
|
191
206
|
const domain = getRegistrableDomain(resolved.hostname);
|
|
192
207
|
if (domain && allowedDomains.has(domain)) links.add(absoluteUrl);
|
|
@@ -418,6 +433,16 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
418
433
|
return "";
|
|
419
434
|
}
|
|
420
435
|
}).filter(Boolean)) : void 0;
|
|
436
|
+
const isContentUrl = (url) => {
|
|
437
|
+
try {
|
|
438
|
+
const pathname = new URL(url).pathname;
|
|
439
|
+
for (let i = 0; i < IGNORED_PATH_PREFIXES.length; i++) if (pathname.startsWith(IGNORED_PATH_PREFIXES[i])) return false;
|
|
440
|
+
if (pathname.lastIndexOf(".") > pathname.lastIndexOf("/")) return HTML_EXTENSIONS_RE.test(pathname);
|
|
441
|
+
return true;
|
|
442
|
+
} catch {
|
|
443
|
+
return false;
|
|
444
|
+
}
|
|
445
|
+
};
|
|
421
446
|
const shouldCrawlUrl = (url) => {
|
|
422
447
|
if (isUrlExcluded(url, exclude, allowSubdomains)) return false;
|
|
423
448
|
if (!hasGlobPatterns) {
|
|
@@ -512,11 +537,20 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
512
537
|
onProgress?.(progress);
|
|
513
538
|
}
|
|
514
539
|
if (followLinks && !singlePageMode && depth < maxDepth) {
|
|
515
|
-
const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
|
|
516
|
-
for (const link of filteredLinks) processedUrls.
|
|
540
|
+
const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link) && isContentUrl(link));
|
|
541
|
+
for (const link of filteredLinks) if (!processedUrls.has(link)) {
|
|
542
|
+
processedUrls.add(link);
|
|
543
|
+
pendingUrls.push({
|
|
544
|
+
url: link,
|
|
545
|
+
depth: depth + 1
|
|
546
|
+
});
|
|
547
|
+
}
|
|
517
548
|
}
|
|
518
549
|
};
|
|
550
|
+
const pendingUrls = [];
|
|
519
551
|
const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
|
|
552
|
+
for (const url of urlsToProcess) processedUrls.add(url.replace(URL_TRAILING_SLASH_RE, "") || url);
|
|
553
|
+
let totalProcessed = 0;
|
|
520
554
|
progress.crawling.status = "processing";
|
|
521
555
|
progress.crawling.total = urlsToProcess.length;
|
|
522
556
|
onProgress?.(progress);
|
|
@@ -569,12 +603,24 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
569
603
|
useChrome
|
|
570
604
|
};
|
|
571
605
|
const crawler = new PlaywrightCrawler(crawlerOptions);
|
|
572
|
-
const
|
|
606
|
+
const allRequests = [...urlsToProcess.map((url) => ({
|
|
573
607
|
url,
|
|
574
608
|
userData: { depth: 0 }
|
|
575
|
-
}));
|
|
609
|
+
}))];
|
|
576
610
|
try {
|
|
577
|
-
await crawler.run(
|
|
611
|
+
await crawler.run(allRequests);
|
|
612
|
+
totalProcessed += urlsToProcess.length;
|
|
613
|
+
while (pendingUrls.length > 0 && totalProcessed < maxRequestsPerCrawl) {
|
|
614
|
+
const batch = pendingUrls.splice(0, maxRequestsPerCrawl - totalProcessed);
|
|
615
|
+
progress.crawling.total += batch.length;
|
|
616
|
+
onProgress?.(progress);
|
|
617
|
+
const batchRequests = batch.map((item) => ({
|
|
618
|
+
url: item.url,
|
|
619
|
+
userData: { depth: item.depth }
|
|
620
|
+
}));
|
|
621
|
+
await crawler.run(batchRequests);
|
|
622
|
+
totalProcessed += batch.length;
|
|
623
|
+
}
|
|
578
624
|
} catch (error) {
|
|
579
625
|
const msg = error instanceof Error ? error.message : "";
|
|
580
626
|
if (msg.includes("wmic") || msg.includes("ENOENT")) throw new Error(`Crawlee failed to spawn a system process (${msg}). On Windows 11+, wmic.exe is no longer available. Upgrade crawlee to >=3.16.0 or use the HTTP driver instead (--driver http).`);
|
|
@@ -585,60 +631,74 @@ async function crawlAndGenerate(options, onProgress) {
|
|
|
585
631
|
throw error;
|
|
586
632
|
}
|
|
587
633
|
await purgeDefaultStorages();
|
|
588
|
-
} else
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
skip: false
|
|
598
|
-
};
|
|
599
|
-
await hooks.callHook("crawl:url", urlCtx);
|
|
600
|
-
if (urlCtx.skip) return;
|
|
601
|
-
try {
|
|
602
|
-
const fetchStart = Date.now();
|
|
603
|
-
const response = await ofetch.raw(url, {
|
|
604
|
-
headers: FETCH_HEADERS,
|
|
605
|
-
responseType: "text",
|
|
606
|
-
retry: 2,
|
|
607
|
-
retryDelay: 500,
|
|
608
|
-
timeout: 1e4,
|
|
609
|
-
onResponseError({ response }) {
|
|
610
|
-
if (response.status === 429) {
|
|
611
|
-
const retryAfter = response.headers.get("retry-after");
|
|
612
|
-
const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
|
|
613
|
-
if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
|
|
614
|
-
}
|
|
615
|
-
}
|
|
616
|
-
});
|
|
617
|
-
recordLatency(Date.now() - fetchStart);
|
|
618
|
-
const body = response._data ?? "";
|
|
619
|
-
const contentType = response.headers.get("content-type") || "";
|
|
620
|
-
await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
|
|
621
|
-
} catch (error) {
|
|
622
|
-
if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
|
|
623
|
-
progress.crawling.failed++;
|
|
624
|
-
results.push({
|
|
634
|
+
} else {
|
|
635
|
+
const fetchPage = async (url, depth) => {
|
|
636
|
+
progress.crawling.currentUrl = url;
|
|
637
|
+
onProgress?.(progress);
|
|
638
|
+
if (crawlDelay) {
|
|
639
|
+
const delay = crawlDelay;
|
|
640
|
+
await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
|
|
641
|
+
}
|
|
642
|
+
const urlCtx = {
|
|
625
643
|
url,
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
644
|
+
skip: false
|
|
645
|
+
};
|
|
646
|
+
await hooks.callHook("crawl:url", urlCtx);
|
|
647
|
+
if (urlCtx.skip) return;
|
|
648
|
+
try {
|
|
649
|
+
const fetchStart = Date.now();
|
|
650
|
+
const response = await ofetch.raw(url, {
|
|
651
|
+
headers: FETCH_HEADERS,
|
|
652
|
+
responseType: "text",
|
|
653
|
+
retry: 2,
|
|
654
|
+
retryDelay: 500,
|
|
655
|
+
timeout: 1e4,
|
|
656
|
+
onResponseError({ response }) {
|
|
657
|
+
if (response.status === 429) {
|
|
658
|
+
const retryAfter = response.headers.get("retry-after");
|
|
659
|
+
const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
|
|
660
|
+
if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
});
|
|
664
|
+
recordLatency(Date.now() - fetchStart);
|
|
665
|
+
const body = response._data ?? "";
|
|
666
|
+
const contentType = response.headers.get("content-type") || "";
|
|
667
|
+
const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml");
|
|
668
|
+
const isMarkdown = contentType.includes("text/markdown") || contentType.includes("text/x-markdown");
|
|
669
|
+
if (!isHtml && !isMarkdown) return;
|
|
670
|
+
await processPage(url, body, "", depth, isMarkdown);
|
|
671
|
+
} catch (error) {
|
|
672
|
+
if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
|
|
673
|
+
progress.crawling.failed++;
|
|
674
|
+
results.push({
|
|
675
|
+
url,
|
|
632
676
|
title: "",
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
677
|
+
content: "",
|
|
678
|
+
timestamp: Date.now(),
|
|
679
|
+
success: false,
|
|
680
|
+
error: error instanceof Error ? error.message : "Unknown error",
|
|
681
|
+
metadata: {
|
|
682
|
+
title: "",
|
|
683
|
+
description: "",
|
|
684
|
+
links: []
|
|
685
|
+
},
|
|
686
|
+
depth
|
|
687
|
+
});
|
|
688
|
+
progress.crawling.processed = results.length;
|
|
689
|
+
onProgress?.(progress);
|
|
690
|
+
}
|
|
691
|
+
};
|
|
692
|
+
await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, (url) => fetchPage(url, 0));
|
|
693
|
+
totalProcessed += urlsToProcess.length;
|
|
694
|
+
while (pendingUrls.length > 0 && totalProcessed < maxRequestsPerCrawl) {
|
|
695
|
+
const batch = pendingUrls.splice(0, maxRequestsPerCrawl - totalProcessed);
|
|
696
|
+
progress.crawling.total += batch.length;
|
|
639
697
|
onProgress?.(progress);
|
|
698
|
+
await runConcurrent(batch, DEFAULT_CONCURRENCY, (item) => fetchPage(item.url, item.depth));
|
|
699
|
+
totalProcessed += batch.length;
|
|
640
700
|
}
|
|
641
|
-
}
|
|
701
|
+
}
|
|
642
702
|
progress.crawling.status = "completed";
|
|
643
703
|
onProgress?.(progress);
|
|
644
704
|
await hooks.callHook("crawl:done", { results });
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mdream/crawl",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "1.0.1",
|
|
4
|
+
"version": "1.0.3",
|
|
5
5
|
"description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -55,16 +55,16 @@
|
|
|
55
55
|
},
|
|
56
56
|
"dependencies": {
|
|
57
57
|
"@clack/prompts": "^1.1.0",
|
|
58
|
-
"c12": "^3.
|
|
59
|
-
"hookable": "^
|
|
58
|
+
"c12": "^3.3.3",
|
|
59
|
+
"hookable": "^6.1.0",
|
|
60
60
|
"nypm": "^0.6.5",
|
|
61
61
|
"ofetch": "^1.5.1",
|
|
62
62
|
"pathe": "^2.0.3",
|
|
63
63
|
"picomatch": "^4.0.3",
|
|
64
64
|
"tldts": "^7.0.26",
|
|
65
65
|
"ufo": "^1.6.3",
|
|
66
|
-
"@mdream/js": "1.0.1",
|
|
67
|
-
"mdream": "1.0.1"
|
|
66
|
+
"@mdream/js": "1.0.3",
|
|
67
|
+
"mdream": "1.0.3"
|
|
68
68
|
},
|
|
69
69
|
"devDependencies": {
|
|
70
70
|
"@types/picomatch": "^4.0.2"
|