@mdream/crawl 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -136,6 +136,20 @@ const FETCH_HEADERS = {
136
136
  "Accept": "text/html,application/xhtml+xml,text/markdown"
137
137
  };
138
138
  const DEFAULT_CONCURRENCY = 20;
139
+ const IGNORED_PATH_PREFIXES = [
140
+ "/cdn-cgi/",
141
+ "/_next/",
142
+ "/_nuxt/",
143
+ "/__",
144
+ "/wp-admin/",
145
+ "/wp-json/",
146
+ "/wp-includes/",
147
+ "/wp-content/uploads/",
148
+ "/api/",
149
+ "/assets/",
150
+ "/static/"
151
+ ];
152
+ const HTML_EXTENSIONS_RE = /\.(html?|php|aspx?|jsp)$/i;
139
153
  function extractCdataUrl(url) {
140
154
  if (url.startsWith("<![CDATA[") && url.endsWith("]]>")) return url.slice(9, -3);
141
155
  return url;
@@ -186,7 +200,8 @@ function extractMetadataInline(parsedUrl, allowedDomains) {
186
200
  const href = el.attributes.href;
187
201
  if (href) try {
188
202
  const resolved = new URL(href, url);
189
- const absoluteUrl = resolved.href;
203
+ resolved.hash = "";
204
+ const absoluteUrl = resolved.href.replace(URL_TRAILING_SLASH_RE, "") || resolved.href;
190
205
  if (allowedDomains) {
191
206
  const domain = getRegistrableDomain(resolved.hostname);
192
207
  if (domain && allowedDomains.has(domain)) links.add(absoluteUrl);
@@ -418,6 +433,16 @@ async function crawlAndGenerate(options, onProgress) {
418
433
  return "";
419
434
  }
420
435
  }).filter(Boolean)) : void 0;
436
+ const isContentUrl = (url) => {
437
+ try {
438
+ const pathname = new URL(url).pathname;
439
+ for (let i = 0; i < IGNORED_PATH_PREFIXES.length; i++) if (pathname.startsWith(IGNORED_PATH_PREFIXES[i])) return false;
440
+ if (pathname.lastIndexOf(".") > pathname.lastIndexOf("/")) return HTML_EXTENSIONS_RE.test(pathname);
441
+ return true;
442
+ } catch {
443
+ return false;
444
+ }
445
+ };
421
446
  const shouldCrawlUrl = (url) => {
422
447
  if (isUrlExcluded(url, exclude, allowSubdomains)) return false;
423
448
  if (!hasGlobPatterns) {
@@ -512,11 +537,20 @@ async function crawlAndGenerate(options, onProgress) {
512
537
  onProgress?.(progress);
513
538
  }
514
539
  if (followLinks && !singlePageMode && depth < maxDepth) {
515
- const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link));
516
- for (const link of filteredLinks) processedUrls.add(link);
540
+ const filteredLinks = metadata.links.filter((link) => shouldCrawlUrl(link) && isContentUrl(link));
541
+ for (const link of filteredLinks) if (!processedUrls.has(link)) {
542
+ processedUrls.add(link);
543
+ pendingUrls.push({
544
+ url: link,
545
+ depth: depth + 1
546
+ });
547
+ }
517
548
  }
518
549
  };
550
+ const pendingUrls = [];
519
551
  const urlsToProcess = startingUrls.slice(0, maxRequestsPerCrawl);
552
+ for (const url of urlsToProcess) processedUrls.add(url.replace(URL_TRAILING_SLASH_RE, "") || url);
553
+ let totalProcessed = 0;
520
554
  progress.crawling.status = "processing";
521
555
  progress.crawling.total = urlsToProcess.length;
522
556
  onProgress?.(progress);
@@ -569,12 +603,24 @@ async function crawlAndGenerate(options, onProgress) {
569
603
  useChrome
570
604
  };
571
605
  const crawler = new PlaywrightCrawler(crawlerOptions);
572
- const initialRequests = urlsToProcess.map((url) => ({
606
+ const allRequests = [...urlsToProcess.map((url) => ({
573
607
  url,
574
608
  userData: { depth: 0 }
575
- }));
609
+ }))];
576
610
  try {
577
- await crawler.run(initialRequests);
611
+ await crawler.run(allRequests);
612
+ totalProcessed += urlsToProcess.length;
613
+ while (pendingUrls.length > 0 && totalProcessed < maxRequestsPerCrawl) {
614
+ const batch = pendingUrls.splice(0, maxRequestsPerCrawl - totalProcessed);
615
+ progress.crawling.total += batch.length;
616
+ onProgress?.(progress);
617
+ const batchRequests = batch.map((item) => ({
618
+ url: item.url,
619
+ userData: { depth: item.depth }
620
+ }));
621
+ await crawler.run(batchRequests);
622
+ totalProcessed += batch.length;
623
+ }
578
624
  } catch (error) {
579
625
  const msg = error instanceof Error ? error.message : "";
580
626
  if (msg.includes("wmic") || msg.includes("ENOENT")) throw new Error(`Crawlee failed to spawn a system process (${msg}). On Windows 11+, wmic.exe is no longer available. Upgrade crawlee to >=3.16.0 or use the HTTP driver instead (--driver http).`);
@@ -585,60 +631,74 @@ async function crawlAndGenerate(options, onProgress) {
585
631
  throw error;
586
632
  }
587
633
  await purgeDefaultStorages();
588
- } else await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, async (url) => {
589
- progress.crawling.currentUrl = url;
590
- onProgress?.(progress);
591
- if (crawlDelay) {
592
- const delay = crawlDelay;
593
- await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
594
- }
595
- const urlCtx = {
596
- url,
597
- skip: false
598
- };
599
- await hooks.callHook("crawl:url", urlCtx);
600
- if (urlCtx.skip) return;
601
- try {
602
- const fetchStart = Date.now();
603
- const response = await ofetch.raw(url, {
604
- headers: FETCH_HEADERS,
605
- responseType: "text",
606
- retry: 2,
607
- retryDelay: 500,
608
- timeout: 1e4,
609
- onResponseError({ response }) {
610
- if (response.status === 429) {
611
- const retryAfter = response.headers.get("retry-after");
612
- const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
613
- if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
614
- }
615
- }
616
- });
617
- recordLatency(Date.now() - fetchStart);
618
- const body = response._data ?? "";
619
- const contentType = response.headers.get("content-type") || "";
620
- await processPage(url, body, "", 0, contentType.includes("text/markdown") || contentType.includes("text/x-markdown"));
621
- } catch (error) {
622
- if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
623
- progress.crawling.failed++;
624
- results.push({
634
+ } else {
635
+ const fetchPage = async (url, depth) => {
636
+ progress.crawling.currentUrl = url;
637
+ onProgress?.(progress);
638
+ if (crawlDelay) {
639
+ const delay = crawlDelay;
640
+ await new Promise((resolve) => setTimeout(resolve, delay * 1e3));
641
+ }
642
+ const urlCtx = {
625
643
  url,
626
- title: "",
627
- content: "",
628
- timestamp: Date.now(),
629
- success: false,
630
- error: error instanceof Error ? error.message : "Unknown error",
631
- metadata: {
644
+ skip: false
645
+ };
646
+ await hooks.callHook("crawl:url", urlCtx);
647
+ if (urlCtx.skip) return;
648
+ try {
649
+ const fetchStart = Date.now();
650
+ const response = await ofetch.raw(url, {
651
+ headers: FETCH_HEADERS,
652
+ responseType: "text",
653
+ retry: 2,
654
+ retryDelay: 500,
655
+ timeout: 1e4,
656
+ onResponseError({ response }) {
657
+ if (response.status === 429) {
658
+ const retryAfter = response.headers.get("retry-after");
659
+ const delaySec = retryAfter ? Number.parseInt(retryAfter) || 1 : 2;
660
+ if (!crawlDelay || delaySec > crawlDelay) crawlDelay = delaySec;
661
+ }
662
+ }
663
+ });
664
+ recordLatency(Date.now() - fetchStart);
665
+ const body = response._data ?? "";
666
+ const contentType = response.headers.get("content-type") || "";
667
+ const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml");
668
+ const isMarkdown = contentType.includes("text/markdown") || contentType.includes("text/x-markdown");
669
+ if (!isHtml && !isMarkdown) return;
670
+ await processPage(url, body, "", depth, isMarkdown);
671
+ } catch (error) {
672
+ if (verbose) console.error(`[ERROR] URL: ${url}, Error: ${error instanceof Error ? error.message : "Unknown"}`);
673
+ progress.crawling.failed++;
674
+ results.push({
675
+ url,
632
676
  title: "",
633
- description: "",
634
- links: []
635
- },
636
- depth: 0
637
- });
638
- progress.crawling.processed = results.length;
677
+ content: "",
678
+ timestamp: Date.now(),
679
+ success: false,
680
+ error: error instanceof Error ? error.message : "Unknown error",
681
+ metadata: {
682
+ title: "",
683
+ description: "",
684
+ links: []
685
+ },
686
+ depth
687
+ });
688
+ progress.crawling.processed = results.length;
689
+ onProgress?.(progress);
690
+ }
691
+ };
692
+ await runConcurrent(urlsToProcess, DEFAULT_CONCURRENCY, (url) => fetchPage(url, 0));
693
+ totalProcessed += urlsToProcess.length;
694
+ while (pendingUrls.length > 0 && totalProcessed < maxRequestsPerCrawl) {
695
+ const batch = pendingUrls.splice(0, maxRequestsPerCrawl - totalProcessed);
696
+ progress.crawling.total += batch.length;
639
697
  onProgress?.(progress);
698
+ await runConcurrent(batch, DEFAULT_CONCURRENCY, (item) => fetchPage(item.url, item.depth));
699
+ totalProcessed += batch.length;
640
700
  }
641
- });
701
+ }
642
702
  progress.crawling.status = "completed";
643
703
  onProgress?.(progress);
644
704
  await hooks.callHook("crawl:done", { results });
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mdream/crawl",
3
3
  "type": "module",
4
- "version": "1.0.1",
4
+ "version": "1.0.3",
5
5
  "description": "Mdream Crawl generates comprehensive llms.txt artifacts from a single URL, using mdream to convert HTML to Markdown.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -55,16 +55,16 @@
55
55
  },
56
56
  "dependencies": {
57
57
  "@clack/prompts": "^1.1.0",
58
- "c12": "^3.0.4",
59
- "hookable": "^5.5.3",
58
+ "c12": "^3.3.3",
59
+ "hookable": "^6.1.0",
60
60
  "nypm": "^0.6.5",
61
61
  "ofetch": "^1.5.1",
62
62
  "pathe": "^2.0.3",
63
63
  "picomatch": "^4.0.3",
64
64
  "tldts": "^7.0.26",
65
65
  "ufo": "^1.6.3",
66
- "@mdream/js": "1.0.1",
67
- "mdream": "1.0.1"
66
+ "@mdream/js": "1.0.3",
67
+ "mdream": "1.0.3"
68
68
  },
69
69
  "devDependencies": {
70
70
  "@types/picomatch": "^4.0.2"