@pseolint/core 0.6.4 → 0.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/auditor.js CHANGED
@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
34
34
  import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
35
35
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
36
36
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
37
- import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
37
+ import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
38
38
  import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
39
39
  import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
40
40
  import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
@@ -615,157 +615,157 @@ sampled = false) {
615
615
  // Spam rules — always compute cross-page data, only push findings if enabled
616
616
  const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
617
617
  if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
618
- findings.push(...tag(nearDuplicate.findings));
618
+ pushAll(findings, tag(nearDuplicate.findings));
619
619
  }
620
620
  const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
621
621
  if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
622
- findings.push(...tag(entitySwap.findings));
622
+ pushAll(findings, tag(entitySwap.findings));
623
623
  }
624
624
  const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
625
625
  if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
626
- findings.push(...tag(thinContent.findings));
626
+ pushAll(findings, tag(thinContent.findings));
627
627
  }
628
628
  if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
629
- findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
629
+ pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
630
630
  }
631
631
  if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
632
- findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
632
+ pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
633
633
  }
634
634
  if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
635
- findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
635
+ pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
636
636
  }
637
637
  if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
638
- findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
638
+ pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
639
639
  }
640
640
  if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
641
- findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
641
+ pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
642
642
  }
643
643
  // Content rules
644
644
  if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
645
- findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
645
+ pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
646
646
  }
647
647
  if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
648
- findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
648
+ pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
649
649
  }
650
650
  if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
651
- findings.push(...tag(missingAuthorRule(pages)));
651
+ pushAll(findings, tag(missingAuthorRule(pages)));
652
652
  }
653
653
  if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
654
- findings.push(...tag(eeatSignalsRule(pages)));
654
+ pushAll(findings, tag(eeatSignalsRule(pages)));
655
655
  }
656
656
  // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
657
657
  // structure + image alt-text were tier-1 gaps in the blind-spot audit.
658
658
  if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
659
- findings.push(...tag(titleUniquenessRule(pages)));
659
+ pushAll(findings, tag(titleUniquenessRule(pages)));
660
660
  }
661
661
  if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
662
- findings.push(...tag(headingStructureRule(pages)));
662
+ pushAll(findings, tag(headingStructureRule(pages)));
663
663
  }
664
664
  if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
665
- findings.push(...tag(imageAltTextRule(pages)));
665
+ pushAll(findings, tag(imageAltTextRule(pages)));
666
666
  }
667
667
  if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
668
- findings.push(...tag(translationNoOpRule(pages)));
668
+ pushAll(findings, tag(translationNoOpRule(pages)));
669
669
  }
670
670
  if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
671
- findings.push(...tag(regurgitatedContentRule(pages)));
671
+ pushAll(findings, tag(regurgitatedContentRule(pages)));
672
672
  }
673
673
  if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
674
- findings.push(...tag(commonPhraseReuseRule(pages)));
674
+ pushAll(findings, tag(commonPhraseReuseRule(pages)));
675
675
  }
676
676
  if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
677
- findings.push(...tag(wikipediaParaphraseRule(pages)));
677
+ pushAll(findings, tag(wikipediaParaphraseRule(pages)));
678
678
  }
679
679
  // Link rules — use the global link graph
680
680
  if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
681
- findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
681
+ pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
682
682
  }
683
683
  if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
684
- findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
684
+ pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
685
685
  }
686
686
  if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
687
687
  if (rootUrl) {
688
- findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
688
+ pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
689
689
  }
690
690
  }
691
691
  if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
692
- findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
692
+ pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
693
693
  }
694
694
  if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
695
- findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
695
+ pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
696
696
  }
697
697
  // Tech rules
698
698
  if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
699
- findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
699
+ pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
700
700
  }
701
701
  if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
702
- findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
702
+ pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
703
703
  }
704
704
  if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
705
- findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
705
+ pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
706
706
  }
707
707
  if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
708
- findings.push(...tag(redirectChainRule(pages)));
708
+ pushAll(findings, tag(redirectChainRule(pages)));
709
709
  }
710
710
  if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
711
- findings.push(...tag(soft404Rule(pages)));
711
+ pushAll(findings, tag(soft404Rule(pages)));
712
712
  }
713
713
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
714
714
  // hreflang declarations on noindex'd pages are still bugs when they're
715
715
  // inconsistent — see auditor.test.ts "emits technical SEO findings".
716
- findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
716
+ pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
717
717
  }
718
718
  // 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
719
719
  // the v0.4.x README without ever shipping. Now it does.
720
720
  if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
721
- findings.push(...tag(ogCompletenessRule(pages)));
721
+ pushAll(findings, tag(ogCompletenessRule(pages)));
722
722
  }
723
723
  // Schema rules
724
724
  if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
725
- findings.push(...tag(jsonLdValidRule(pages)));
725
+ pushAll(findings, tag(jsonLdValidRule(pages)));
726
726
  }
727
727
  if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
728
- findings.push(...tag(requiredFieldsRule(pages)));
728
+ pushAll(findings, tag(requiredFieldsRule(pages)));
729
729
  }
730
730
  if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
731
- findings.push(...tag(schemaConsistencyRule(pages)));
731
+ pushAll(findings, tag(schemaConsistencyRule(pages)));
732
732
  }
733
733
  // AEO rules
734
734
  if (isEnabled("aeo/freshness-signals")) {
735
- findings.push(...tag(freshnessSignalsRule(pages, {
735
+ pushAll(findings, tag(freshnessSignalsRule(pages, {
736
736
  maxStaleDays: resolvedRules.freshnessMaxStaleDays,
737
737
  })));
738
738
  }
739
739
  if (isEnabled("aeo/faq-coverage")) {
740
- findings.push(...tag(faqCoverageRule(pages, {
740
+ pushAll(findings, tag(faqCoverageRule(pages, {
741
741
  minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
742
742
  })));
743
743
  }
744
744
  if (isEnabled("aeo/answer-first")) {
745
- findings.push(...tag(answerFirstRule(pages, entityPatterns, {
745
+ pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
746
746
  maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
747
747
  })));
748
748
  }
749
749
  if (isEnabled("aeo/citable-facts")) {
750
- findings.push(...tag(citableFactsRule(pages, entityPatterns, {
750
+ pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
751
751
  minFactsPerPage: resolvedRules.citableFactsMin,
752
752
  targetFactsPerPage: resolvedRules.citableFactsTarget,
753
753
  })));
754
754
  }
755
755
  if (isEnabled("aeo/content-modularity")) {
756
- findings.push(...tag(contentModularityRule(pages, {
756
+ pushAll(findings, tag(contentModularityRule(pages, {
757
757
  maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
758
758
  minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
759
759
  })));
760
760
  }
761
761
  if (isEnabled("aeo/summary-bait")) {
762
- findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
762
+ pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
763
763
  }
764
764
  // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
765
765
  // keyword-collision dropped due to high false-positive rates; see
766
766
  // 2026-04-29 v0.4 redesign spec §4.3).
767
767
  if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
768
- findings.push(...tag(urlPatternRule(pages)));
768
+ pushAll(findings, tag(urlPatternRule(pages)));
769
769
  }
770
770
  return findings;
771
771
  }
@@ -1003,6 +1003,20 @@ function withDocsUrls(findings) {
1003
1003
  }
1004
1004
  return findings;
1005
1005
  }
1006
+ /**
1007
+ * Append every element of the iterable `items` to the array `target` in place,
1008
+ * one `push` call per element; returns nothing. Prefer this to
1009
+ * `target.push(...items)` whenever `items` can be large: spread passes each
1010
+ * element as a separate call argument and V8 caps the argument count (~131072),
1011
+ * so `push(...bigArray)` throws `RangeError: Maximum call stack size exceeded`.
1012
+ * On a dense site the pairwise rules (near-duplicate / entity-swap) emit C(N,2)
1013
+ * findings, which overflowed the rule-aggregation push before enrichment was
1014
+ * reached. See tests/integration/large-corpus-no-overflow.test.ts.
1015
+ */
1016
+ function pushAll(target, items) {
1017
+ for (const item of items)
1018
+ target.push(item);
1019
+ }
1006
1020
  async function collectHtmlFiles(directory) {
1007
1021
  const entries = await readdir(directory, { withFileTypes: true });
1008
1022
  const files = await Promise.all(entries.map(async (entry) => {
@@ -1038,7 +1052,11 @@ function composeSignals(...signals) {
1038
1052
  }
1039
1053
  return ac.signal;
1040
1054
  }
1041
- async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
1055
+ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
1056
+ // Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
1057
+ // Guards against a hostile/misconfigured sitemap eating the whole byte budget
1058
+ // or memory. 0 / undefined = no cap.
1059
+ maxBytes) {
1042
1060
  try {
1043
1061
  stats.total += 1;
1044
1062
  const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
@@ -1048,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
1048
1066
  }
1049
1067
  if (r.status < 200 || r.status >= 300)
1050
1068
  return null;
1069
+ if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
1070
+ // eslint-disable-next-line no-console
1071
+ console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
1072
+ return null;
1073
+ }
1051
1074
  return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
1052
1075
  }
1053
1076
  catch (err) {
@@ -1205,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
1205
1228
  }
1206
1229
  return arr.slice(arr.length - n);
1207
1230
  }
1208
- async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
1231
+ /** sitemaps.org caps an uncompressed sitemap at 50 MB. */
1232
+ const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
1233
+ /**
1234
+ * Max `<sitemapindex>` nesting depth we recurse through. The protocol only
1235
+ * defines a single level of nesting, but some sites nest deeper; 5 is generous
1236
+ * while still bounding work (and stack) on a hostile/misconfigured index that a
1237
+ * `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
1238
+ */
1239
+ const SITEMAP_MAX_DEPTH = 5;
1240
+ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
1209
1241
  visited.add(sitemapUrl);
1210
1242
  const entries = parseSitemapUrlsWithLastmod(sitemapText);
1211
1243
  if (!isSitemapIndex(sitemapText)) {
@@ -1219,6 +1251,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1219
1251
  }
1220
1252
  return { urls, lastmodByUrl };
1221
1253
  }
1254
+ // It's a sitemap index. Stop recursing past the depth cap (the index itself
1255
+ // carries no page URLs, only child-sitemap refs, so returning empty is safe).
1256
+ if (depth >= maxDepth) {
1257
+ // eslint-disable-next-line no-console
1258
+ console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
1259
+ return { urls: [], lastmodByUrl: new Map() };
1260
+ }
1222
1261
  const allUrls = [];
1223
1262
  const allLastmodByUrl = new Map();
1224
1263
  for (const entry of entries) {
@@ -1227,14 +1266,14 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1227
1266
  throw signal.reason ?? new Error("aborted");
1228
1267
  if (visited.has(childUrl))
1229
1268
  continue;
1230
- const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
1269
+ const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1231
1270
  if (!child)
1232
1271
  continue;
1233
1272
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
1234
1273
  if (!childLike)
1235
1274
  continue;
1236
- const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
1237
- allUrls.push(...childUrls);
1275
+ const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
1276
+ pushAll(allUrls, childUrls);
1238
1277
  for (const [u, lm] of childLastmodByUrl) {
1239
1278
  allLastmodByUrl.set(u, lm);
1240
1279
  }
@@ -1243,7 +1282,7 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1243
1282
  }
1244
1283
  async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
1245
1284
  if (!origin)
1246
- return { disallow: [], crawlDelaySec: 0 };
1285
+ return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
1247
1286
  try {
1248
1287
  const robotsUrl = `${origin}/robots.txt`;
1249
1288
  const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
@@ -1253,10 +1292,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
1253
1292
  return {
1254
1293
  disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
1255
1294
  crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
1295
+ // `Sitemap:` directives are absolute URLs (not paths relative to the origin) and
1296
+ // there can be several. Surfaced so discovery can read the site's declared
1297
+ // sitemaps instead of guessing.
1298
+ sitemaps: parseSitemapDirectives(fetched.text),
1256
1299
  };
1257
1300
  }
1258
1301
  catch {
1259
- return { disallow: [], crawlDelaySec: 0 };
1302
+ return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
1260
1303
  }
1261
1304
  }
1262
1305
  function sleep(ms) {
@@ -1272,7 +1315,12 @@ function isDisallowedByRobots(urlPath, patterns) {
1272
1315
  function budgetExceeded(b) {
1273
1316
  return b.cap > 0 && b.used >= b.cap;
1274
1317
  }
1275
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
1318
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
1319
+ // Backpressure salvage: when provided, every page body that comes back is
1320
+ // pushed into this caller-owned array as it's fetched. If the watchdog aborts
1321
+ // mid-crawl and this function throws, the caller still holds the partial set
1322
+ // (the local `pages` array would otherwise be lost with the stack frame).
1323
+ pageSink) {
1276
1324
  // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
1277
1325
  // audit (source, sitemap entries, redirects, discovered links) goes through
1278
1326
  // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
@@ -1359,7 +1407,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1359
1407
  else {
1360
1408
  urlsToFetch = sampledUrls;
1361
1409
  }
1362
- const pages = [];
1410
+ // Reuse the caller's salvage sink as the live page accumulator so a
1411
+ // mid-crawl watchdog abort leaves the already-fetched pages visible to
1412
+ // the caller. Falls back to a private array when no sink is passed.
1413
+ const pages = pageSink ?? [];
1363
1414
  // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
1364
1415
  const sourceOrigin = (() => { try {
1365
1416
  return new URL(source).origin;
@@ -1469,7 +1520,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1469
1520
  }
1470
1521
  if (contentType.includes("html") || looksLikeHtml(text)) {
1471
1522
  const initialPage = { url: source, html: text };
1472
- const pages = [initialPage];
1523
+ // See note above: reuse the caller's salvage sink so a watchdog abort
1524
+ // during link-discovery crawling preserves the pages fetched so far.
1525
+ const pages = pageSink ?? [];
1526
+ pages.push(initialPage);
1473
1527
  if (crawlDiscovery) {
1474
1528
  let sourceOrigin;
1475
1529
  try {
@@ -1481,6 +1535,92 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1481
1535
  const knownCrawled = new Set([source]);
1482
1536
  const allDiscoveredUrls = new Set([source]);
1483
1537
  const maxDepth = 3;
1538
+ // Sitemap-first discovery (like Google). Before link-crawling, read the
1539
+ // sitemap(s) the site declares — link-crawl only reaches *linked* pages,
1540
+ // but a pSEO site's whole point is thousands of programmatic URLs that
1541
+ // may be sparsely linked (or behind a build-frozen, under-linked nav).
1542
+ // Sources of truth, in order:
1543
+ // 1. `Sitemap:` directives in robots.txt (there can be several)
1544
+ // 2. failing that, probe /sitemap.xml then /sitemap_index.xml
1545
+ // Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
1546
+ // link-crawl below then fills any remaining budget and dedups against
1547
+ // them. When no sitemap exists, this is a no-op and we crawl as before.
1548
+ if (sourceOrigin) {
1549
+ const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
1550
+ const probing = robotsForDiscovery.sitemaps.length === 0;
1551
+ const sitemapCandidates = probing
1552
+ ? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
1553
+ : robotsForDiscovery.sitemaps;
1554
+ const visitedSitemaps = new Set();
1555
+ const sitemapListedUrls = [];
1556
+ for (const candidate of sitemapCandidates) {
1557
+ if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
1558
+ break;
1559
+ if (visitedSitemaps.has(candidate))
1560
+ continue;
1561
+ let smText;
1562
+ let smType;
1563
+ try {
1564
+ if (validateHop)
1565
+ await validateHop(candidate);
1566
+ const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1567
+ if (!fetched)
1568
+ continue;
1569
+ smText = fetched.text;
1570
+ smType = fetched.contentType;
1571
+ }
1572
+ catch {
1573
+ continue; // SSRF refusal, network error, etc. — skip this candidate
1574
+ }
1575
+ if (!(smType.includes("xml") || looksLikeSitemap(smText)))
1576
+ continue;
1577
+ const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
1578
+ pushAll(sitemapListedUrls, discoveredSmUrls);
1579
+ // When probing the conventional paths, stop at the first that hits.
1580
+ if (probing && discoveredSmUrls.length > 0)
1581
+ break;
1582
+ }
1583
+ // Same-origin + robots-aware filter, deduped against what we have.
1584
+ const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
1585
+ if (knownCrawled.has(u))
1586
+ return false;
1587
+ try {
1588
+ const parsed = new URL(u);
1589
+ if (parsed.origin !== sourceOrigin)
1590
+ return false;
1591
+ if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
1592
+ skippedByRobots.push(u);
1593
+ return false;
1594
+ }
1595
+ return true;
1596
+ }
1597
+ catch {
1598
+ return false;
1599
+ }
1600
+ });
1601
+ for (const u of seedUrls)
1602
+ allDiscoveredUrls.add(u);
1603
+ // Cap the seed fetch. With a sampling budget, fit under it; without one
1604
+ // (the default "audit everything" path) bound by maxCrawlDiscovered, the
1605
+ // same ceiling the link-crawl honors — otherwise a homepage audit of a
1606
+ // site with a 50k-URL sitemap would try to fetch all of them (the link
1607
+ // crawl never could, so this would be an unbounded-egress regression).
1608
+ const seedToFetch = discoveryBudget > 0
1609
+ ? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
1610
+ : seedUrls.slice(0, maxCrawlDiscovered);
1611
+ if (seedToFetch.length > 0) {
1612
+ await runWithConcurrency(seedToFetch, concurrency, async (url) => {
1613
+ if (budgetExceeded(byteBudget))
1614
+ return;
1615
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
1616
+ knownCrawled.add(url);
1617
+ if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
1618
+ byteBudget.used += result.html.length;
1619
+ pages.push(result);
1620
+ }
1621
+ });
1622
+ }
1623
+ }
1484
1624
  for (let depth = 0; depth < maxDepth; depth += 1) {
1485
1625
  // Stop if we've hit the discovery budget
1486
1626
  if (discoveryBudget > 0 && pages.length >= discoveryBudget)
@@ -1542,7 +1682,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1542
1682
  knownCrawled.add(url);
1543
1683
  }
1544
1684
  });
1545
- pages.push(...newPages);
1685
+ pushAll(pages, newPages);
1546
1686
  if (newPages.length === 0)
1547
1687
  break;
1548
1688
  }
@@ -1633,6 +1773,13 @@ export async function auditSource(source, options) {
1633
1773
  const backpressureEnabled = options?.backpressure !== false;
1634
1774
  const backpressureAbort = new AbortController();
1635
1775
  let backpressureError = null;
1776
+ // Set once we've decided to salvage a partial report after a watchdog abort.
1777
+ // From that point `throwIfAborted` must NOT re-throw the backpressure error —
1778
+ // the watchdog already did its job (stopped fetching); the rest of the
1779
+ // pipeline runs over the pages collected so far and the truncation is
1780
+ // surfaced on the summary instead.
1781
+ let truncated = false;
1782
+ let truncatedReason;
1636
1783
  const signal = composeSignals(externalSignal, backpressureAbort.signal);
1637
1784
  const observer = new FetchObserver();
1638
1785
  // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
@@ -1674,12 +1821,32 @@ export async function auditSource(source, options) {
1674
1821
  backpressureAbort.abort(backpressureError);
1675
1822
  }
1676
1823
  };
1824
+ // Flip the run into salvage mode after a watchdog abort: record the reason so
1825
+ // assembly sets summary.truncated, and from here `throwIfAborted` will no
1826
+ // longer re-throw the backpressure error. Idempotent. Returns true when a
1827
+ // backpressure abort was present to salvage.
1828
+ function salvageBackpressure() {
1829
+ if (!backpressureError)
1830
+ return false;
1831
+ truncated = true;
1832
+ truncatedReason = backpressureError.message;
1833
+ return true;
1834
+ }
1677
1835
  function throwIfAborted() {
1678
- if (backpressureError)
1679
- throw backpressureError;
1836
+ // An EXTERNAL abort (ctrl-C, parent timeout) is always fatal: the caller
1837
+ // asked to stop, not to degrade. Check it first so it wins over salvage.
1680
1838
  if (externalSignal?.aborted) {
1681
1839
  throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
1682
1840
  }
1841
+ // A backpressure abort is salvageable. Once we've committed to a partial
1842
+ // report (`truncated`), swallow it and let the pipeline finish over the
1843
+ // pages collected so far. Before that commit, the loader-boundary catch
1844
+ // handles it; this guard only fires on the rare path where the loader
1845
+ // returned normally (e.g. a fetch mock that ignores the abort signal) yet
1846
+ // the watchdog still voted to abort — salvage rather than crash.
1847
+ if (backpressureError && !truncated) {
1848
+ salvageBackpressure();
1849
+ }
1683
1850
  }
1684
1851
  const resolvedRules = {
1685
1852
  nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
@@ -1834,13 +2001,26 @@ export async function auditSource(source, options) {
1834
2001
  }
1835
2002
  : undefined;
1836
2003
  const pinnedPages = [];
1837
- await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
1838
- const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
1839
- if (result) {
1840
- fetchByteBudget.used += result.html.length;
1841
- pinnedPages.push(result);
2004
+ try {
2005
+ await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
2006
+ const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
2007
+ if (result) {
2008
+ fetchByteBudget.used += result.html.length;
2009
+ pinnedPages.push(result);
2010
+ }
2011
+ });
2012
+ }
2013
+ catch (err) {
2014
+ // Same salvage contract as the sitemap/crawl path: a watchdog abort
2015
+ // mid-fetch keeps the pages already collected in `pinnedPages`. Any other
2016
+ // error (external abort, SSRF rejection) is fatal — re-throw it.
2017
+ if (err instanceof OriginDegradedError) {
2018
+ salvageBackpressure();
1842
2019
  }
1843
- });
2020
+ else {
2021
+ throw err;
2022
+ }
2023
+ }
1844
2024
  loadedPagesRaw = pinnedPages;
1845
2025
  // No sitemap context in pinned mode
1846
2026
  sitemapUrlSet = undefined;
@@ -1849,12 +2029,46 @@ export async function auditSource(source, options) {
1849
2029
  scrapePlan = undefined;
1850
2030
  }
1851
2031
  else {
1852
- const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
1853
- loadedPagesRaw = loaded.pages;
1854
- sitemapUrlSet = loaded.sitemapUrls;
1855
- sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
1856
- discoveredUrlCount = loaded.discoveredUrlCount;
1857
- scrapePlan = loaded.scrapePlan;
2032
+ // Salvage sink: loadPagesFromSource fills this incrementally as pages come
2033
+ // back. If the backpressure watchdog aborts mid-crawl the call throws an
2034
+ // OriginDegradedError and the function's own return value is lost — but the
2035
+ // already-fetched pages survive here, so we recover them and continue the
2036
+ // pipeline with a `truncated` flag instead of throwing the whole run away.
2037
+ const pageSink = [];
2038
+ try {
2039
+ const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
2040
+ loadedPagesRaw = loaded.pages;
2041
+ sitemapUrlSet = loaded.sitemapUrls;
2042
+ sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
2043
+ discoveredUrlCount = loaded.discoveredUrlCount;
2044
+ scrapePlan = loaded.scrapePlan;
2045
+ }
2046
+ catch (err) {
2047
+ // Only the watchdog abort is salvageable. An external abort (ctrl-C /
2048
+ // parent timeout) or any other error is fatal — re-throw it untouched so
2049
+ // --no-backpressure and ctrl-C behaviour are unchanged.
2050
+ if (err instanceof OriginDegradedError) {
2051
+ // Prefer the canonical backpressureError message (same object the
2052
+ // monitor raised); fall back to the caught error if somehow distinct.
2053
+ if (!salvageBackpressure()) {
2054
+ truncated = true;
2055
+ truncatedReason = err.message;
2056
+ }
2057
+ // Recover whatever was fetched before the abort. The sink is the same
2058
+ // array loadPagesFromSource was pushing into, so it holds the partial
2059
+ // page set even though the function never reached its `return`.
2060
+ loadedPagesRaw = pageSink;
2061
+ // No sitemap/discovery context survives a mid-sitemap abort; the
2062
+ // downstream classifier falls back to the loaded page URLs.
2063
+ sitemapUrlSet = undefined;
2064
+ sitemapLastmodByUrl = undefined;
2065
+ discoveredUrlCount = undefined;
2066
+ scrapePlan = undefined;
2067
+ }
2068
+ else {
2069
+ throw err;
2070
+ }
2071
+ }
1858
2072
  }
1859
2073
  // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
1860
2074
  // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
@@ -2076,29 +2290,29 @@ export async function auditSource(source, options) {
2076
2290
  // Site-wide rules (run once, outside group loop)
2077
2291
  if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
2078
2292
  const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
2079
- allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2293
+ pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2080
2294
  if (robotsTxtContent) {
2081
2295
  const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
2082
- allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2296
+ pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2083
2297
  }
2084
2298
  }
2085
2299
  // AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
2086
2300
  // and robots-compliance); page-group rule lists govern per-page AEO rules only.
2087
2301
  const llmsFindings = await llmsTxtRule(source, { timeoutMs });
2088
- allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2302
+ pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2089
2303
  if (robotsTxtContent) {
2090
2304
  const crawlerFindings = crawlerAccessRule(robotsTxtContent);
2091
- allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2305
+ pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2092
2306
  }
2093
2307
  // Data source comparison rules
2094
2308
  if (options?.dataSource?.records && options.dataSource.records.length > 0) {
2095
2309
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
2096
2310
  const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
2097
- allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2311
+ pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2098
2312
  }
2099
2313
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
2100
2314
  const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
2101
- allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2315
+ pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2102
2316
  }
2103
2317
  }
2104
2318
  for (const [groupName, groupPages] of classified) {
@@ -2118,7 +2332,7 @@ export async function auditSource(source, options) {
2118
2332
  // because the nav paths between locale-specific currency-converter URLs
2119
2333
  // were not in the pinned set).
2120
2334
  isSampledAudit || hasPinnedUrlsEarly);
2121
- allFindings.push(...findings);
2335
+ pushAll(allFindings, findings);
2122
2336
  groupPageCounts[groupName] = groupPages.length;
2123
2337
  // v0.4.3: per-group scoring uses the same site-classification profile so
2124
2338
  // group-level risk numbers reflect the same severity / confidence remaps
@@ -2136,7 +2350,7 @@ export async function auditSource(source, options) {
2136
2350
  (auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
2137
2351
  if (isValueAddEnabled) {
2138
2352
  const valueAddFindings = valueAddRule(parsedPages, allFindings);
2139
- allFindings.push(...valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2353
+ pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2140
2354
  }
2141
2355
  }
2142
2356
  // Enrich findings: cluster pairwise, detect templates, assign effort
@@ -2260,6 +2474,14 @@ export async function auditSource(source, options) {
2260
2474
  ? [...parsedPages.map((p) => p.url)].sort()
2261
2475
  : undefined,
2262
2476
  };
2477
+ // Partial-report flag: the backpressure watchdog aborted mid-crawl and we
2478
+ // salvaged whatever pages had been fetched. Consumers MUST treat coverage as
2479
+ // a lower bound (counts/verdict are partial). Only set when actually
2480
+ // truncated so complete runs keep `truncated` absent.
2481
+ if (truncated) {
2482
+ summary.truncated = true;
2483
+ summary.truncatedReason = truncatedReason;
2484
+ }
2263
2485
  if (cacheConfig) {
2264
2486
  summary.cacheStats = cacheStats;
2265
2487
  }