@pseolint/core 0.6.4 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/auditor.js CHANGED
@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
34
34
  import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
35
35
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
36
36
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
37
- import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
37
+ import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
38
38
  import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
39
39
  import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
40
40
  import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
@@ -615,157 +615,157 @@ sampled = false) {
615
615
  // Spam rules — always compute cross-page data, only push findings if enabled
616
616
  const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
617
617
  if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
618
- findings.push(...tag(nearDuplicate.findings));
618
+ pushAll(findings, tag(nearDuplicate.findings));
619
619
  }
620
620
  const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
621
621
  if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
622
- findings.push(...tag(entitySwap.findings));
622
+ pushAll(findings, tag(entitySwap.findings));
623
623
  }
624
624
  const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
625
625
  if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
626
- findings.push(...tag(thinContent.findings));
626
+ pushAll(findings, tag(thinContent.findings));
627
627
  }
628
628
  if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
629
- findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
629
+ pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
630
630
  }
631
631
  if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
632
- findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
632
+ pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
633
633
  }
634
634
  if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
635
- findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
635
+ pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
636
636
  }
637
637
  if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
638
- findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
638
+ pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
639
639
  }
640
640
  if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
641
- findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
641
+ pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
642
642
  }
643
643
  // Content rules
644
644
  if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
645
- findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
645
+ pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
646
646
  }
647
647
  if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
648
- findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
648
+ pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
649
649
  }
650
650
  if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
651
- findings.push(...tag(missingAuthorRule(pages)));
651
+ pushAll(findings, tag(missingAuthorRule(pages)));
652
652
  }
653
653
  if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
654
- findings.push(...tag(eeatSignalsRule(pages)));
654
+ pushAll(findings, tag(eeatSignalsRule(pages)));
655
655
  }
656
656
  // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
657
657
  // structure + image alt-text were tier-1 gaps in the blind-spot audit.
658
658
  if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
659
- findings.push(...tag(titleUniquenessRule(pages)));
659
+ pushAll(findings, tag(titleUniquenessRule(pages)));
660
660
  }
661
661
  if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
662
- findings.push(...tag(headingStructureRule(pages)));
662
+ pushAll(findings, tag(headingStructureRule(pages)));
663
663
  }
664
664
  if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
665
- findings.push(...tag(imageAltTextRule(pages)));
665
+ pushAll(findings, tag(imageAltTextRule(pages)));
666
666
  }
667
667
  if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
668
- findings.push(...tag(translationNoOpRule(pages)));
668
+ pushAll(findings, tag(translationNoOpRule(pages)));
669
669
  }
670
670
  if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
671
- findings.push(...tag(regurgitatedContentRule(pages)));
671
+ pushAll(findings, tag(regurgitatedContentRule(pages)));
672
672
  }
673
673
  if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
674
- findings.push(...tag(commonPhraseReuseRule(pages)));
674
+ pushAll(findings, tag(commonPhraseReuseRule(pages)));
675
675
  }
676
676
  if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
677
- findings.push(...tag(wikipediaParaphraseRule(pages)));
677
+ pushAll(findings, tag(wikipediaParaphraseRule(pages)));
678
678
  }
679
679
  // Link rules — use the global link graph
680
680
  if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
681
- findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
681
+ pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
682
682
  }
683
683
  if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
684
- findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
684
+ pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
685
685
  }
686
686
  if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
687
687
  if (rootUrl) {
688
- findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
688
+ pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
689
689
  }
690
690
  }
691
691
  if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
692
- findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
692
+ pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
693
693
  }
694
694
  if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
695
- findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
695
+ pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
696
696
  }
697
697
  // Tech rules
698
698
  if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
699
- findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
699
+ pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
700
700
  }
701
701
  if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
702
- findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
702
+ pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
703
703
  }
704
704
  if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
705
- findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
705
+ pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
706
706
  }
707
707
  if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
708
- findings.push(...tag(redirectChainRule(pages)));
708
+ pushAll(findings, tag(redirectChainRule(pages)));
709
709
  }
710
710
  if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
711
- findings.push(...tag(soft404Rule(pages)));
711
+ pushAll(findings, tag(soft404Rule(pages)));
712
712
  }
713
713
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
714
714
  // hreflang declarations on noindex'd pages are still bugs when they're
715
715
  // inconsistent — see auditor.test.ts "emits technical SEO findings".
716
- findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
716
+ pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
717
717
  }
718
718
  // 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
719
719
  // the v0.4.x README without ever shipping. Now it does.
720
720
  if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
721
- findings.push(...tag(ogCompletenessRule(pages)));
721
+ pushAll(findings, tag(ogCompletenessRule(pages)));
722
722
  }
723
723
  // Schema rules
724
724
  if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
725
- findings.push(...tag(jsonLdValidRule(pages)));
725
+ pushAll(findings, tag(jsonLdValidRule(pages)));
726
726
  }
727
727
  if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
728
- findings.push(...tag(requiredFieldsRule(pages)));
728
+ pushAll(findings, tag(requiredFieldsRule(pages)));
729
729
  }
730
730
  if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
731
- findings.push(...tag(schemaConsistencyRule(pages)));
731
+ pushAll(findings, tag(schemaConsistencyRule(pages)));
732
732
  }
733
733
  // AEO rules
734
734
  if (isEnabled("aeo/freshness-signals")) {
735
- findings.push(...tag(freshnessSignalsRule(pages, {
735
+ pushAll(findings, tag(freshnessSignalsRule(pages, {
736
736
  maxStaleDays: resolvedRules.freshnessMaxStaleDays,
737
737
  })));
738
738
  }
739
739
  if (isEnabled("aeo/faq-coverage")) {
740
- findings.push(...tag(faqCoverageRule(pages, {
740
+ pushAll(findings, tag(faqCoverageRule(pages, {
741
741
  minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
742
742
  })));
743
743
  }
744
744
  if (isEnabled("aeo/answer-first")) {
745
- findings.push(...tag(answerFirstRule(pages, entityPatterns, {
745
+ pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
746
746
  maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
747
747
  })));
748
748
  }
749
749
  if (isEnabled("aeo/citable-facts")) {
750
- findings.push(...tag(citableFactsRule(pages, entityPatterns, {
750
+ pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
751
751
  minFactsPerPage: resolvedRules.citableFactsMin,
752
752
  targetFactsPerPage: resolvedRules.citableFactsTarget,
753
753
  })));
754
754
  }
755
755
  if (isEnabled("aeo/content-modularity")) {
756
- findings.push(...tag(contentModularityRule(pages, {
756
+ pushAll(findings, tag(contentModularityRule(pages, {
757
757
  maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
758
758
  minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
759
759
  })));
760
760
  }
761
761
  if (isEnabled("aeo/summary-bait")) {
762
- findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
762
+ pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
763
763
  }
764
764
  // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
765
765
  // keyword-collision dropped due to high false-positive rates; see
766
766
  // 2026-04-29 v0.4 redesign spec §4.3).
767
767
  if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
768
- findings.push(...tag(urlPatternRule(pages)));
768
+ pushAll(findings, tag(urlPatternRule(pages)));
769
769
  }
770
770
  return findings;
771
771
  }
@@ -1003,6 +1003,20 @@ function withDocsUrls(findings) {
1003
1003
  }
1004
1004
  return findings;
1005
1005
  }
1006
+ /**
1007
+ * Append every item of `items` to `target` in place. Use this instead of
1008
+ * `target.push(...items)` whenever `items` can be large. The spread form passes
1009
+ * each element as a separate call argument, and V8 caps argument count
1010
+ * (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
1011
+ * exceeded` on large inputs. A dense site makes the pairwise rules
1012
+ * (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
1013
+ * rule-aggregation push *before* enrichment was even reached. The loop has no
1014
+ * such limit. See tests/integration/large-corpus-no-overflow.test.ts.
1015
+ */
1016
+ function pushAll(target, items) {
1017
+ for (const item of items)
1018
+ target.push(item);
1019
+ }
1006
1020
  async function collectHtmlFiles(directory) {
1007
1021
  const entries = await readdir(directory, { withFileTypes: true });
1008
1022
  const files = await Promise.all(entries.map(async (entry) => {
@@ -1038,7 +1052,11 @@ function composeSignals(...signals) {
1038
1052
  }
1039
1053
  return ac.signal;
1040
1054
  }
1041
- async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
1055
+ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
1056
+ // Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
1057
+ // Guards against a hostile/misconfigured sitemap eating the whole byte budget
1058
+ // or memory. 0 / undefined = no cap.
1059
+ maxBytes) {
1042
1060
  try {
1043
1061
  stats.total += 1;
1044
1062
  const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
@@ -1048,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
1048
1066
  }
1049
1067
  if (r.status < 200 || r.status >= 300)
1050
1068
  return null;
1069
+ if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
1070
+ // eslint-disable-next-line no-console
1071
+ console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
1072
+ return null;
1073
+ }
1051
1074
  return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
1052
1075
  }
1053
1076
  catch (err) {
@@ -1205,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
1205
1228
  }
1206
1229
  return arr.slice(arr.length - n);
1207
1230
  }
1208
- async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
1231
+ /** sitemaps.org caps an uncompressed sitemap at 50 MB. */
1232
+ const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
1233
+ /**
1234
+ * Max `<sitemapindex>` nesting depth we recurse through. The protocol only
1235
+ * defines a single level of nesting, but some sites nest deeper; 5 is generous
1236
+ * while still bounding work (and stack) on a hostile/misconfigured index that a
1237
+ * `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
1238
+ */
1239
+ const SITEMAP_MAX_DEPTH = 5;
1240
+ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
1209
1241
  visited.add(sitemapUrl);
1210
1242
  const entries = parseSitemapUrlsWithLastmod(sitemapText);
1211
1243
  if (!isSitemapIndex(sitemapText)) {
@@ -1217,33 +1249,50 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1217
1249
  lastmodByUrl.set(entry.url, entry.lastmod);
1218
1250
  }
1219
1251
  }
1220
- return { urls, lastmodByUrl };
1252
+ return { urls, lastmodByUrl, childTotal: 0, childFailed: 0 };
1253
+ }
1254
+ // It's a sitemap index. Past the depth cap we stop recursing — but the
1255
+ // children we DON'T walk are unreached coverage, so report them as failed.
1256
+ if (depth >= maxDepth) {
1257
+ // eslint-disable-next-line no-console
1258
+ console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
1259
+ return { urls: [], lastmodByUrl: new Map(), childTotal: entries.length, childFailed: entries.length };
1221
1260
  }
1222
1261
  const allUrls = [];
1223
1262
  const allLastmodByUrl = new Map();
1263
+ let childTotal = 0;
1264
+ let childFailed = 0;
1224
1265
  for (const entry of entries) {
1225
1266
  const childUrl = entry.url;
1226
1267
  if (signal?.aborted)
1227
1268
  throw signal.reason ?? new Error("aborted");
1269
+ childTotal += 1;
1228
1270
  if (visited.has(childUrl))
1271
+ continue; // already walked (cyclic index) — not a failure
1272
+ const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1273
+ if (!child) {
1274
+ childFailed += 1;
1229
1275
  continue;
1230
- const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
1231
- if (!child)
1232
- continue;
1276
+ }
1233
1277
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
1234
- if (!childLike)
1278
+ if (!childLike) {
1279
+ childFailed += 1;
1235
1280
  continue;
1236
- const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
1237
- allUrls.push(...childUrls);
1281
+ }
1282
+ const { urls: childUrls, lastmodByUrl: childLastmodByUrl, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
1283
+ pushAll(allUrls, childUrls);
1238
1284
  for (const [u, lm] of childLastmodByUrl) {
1239
1285
  allLastmodByUrl.set(u, lm);
1240
1286
  }
1287
+ // Accumulate nested index structure (a child that is itself an index).
1288
+ childTotal += ct;
1289
+ childFailed += cf;
1241
1290
  }
1242
- return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
1291
+ return { urls: allUrls, lastmodByUrl: allLastmodByUrl, childTotal, childFailed };
1243
1292
  }
1244
1293
  async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
1245
1294
  if (!origin)
1246
- return { disallow: [], crawlDelaySec: 0 };
1295
+ return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
1247
1296
  try {
1248
1297
  const robotsUrl = `${origin}/robots.txt`;
1249
1298
  const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
@@ -1253,10 +1302,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
1253
1302
  return {
1254
1303
  disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
1255
1304
  crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
1305
+ // `Sitemap:` directives are origin-agnostic (they carry absolute URLs) and
1306
+ // there can be several. Surfaced so discovery can read the site's declared
1307
+ // sitemaps instead of guessing.
1308
+ sitemaps: parseSitemapDirectives(fetched.text),
1256
1309
  };
1257
1310
  }
1258
1311
  catch {
1259
- return { disallow: [], crawlDelaySec: 0 };
1312
+ return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
1260
1313
  }
1261
1314
  }
1262
1315
  function sleep(ms) {
@@ -1272,7 +1325,12 @@ function isDisallowedByRobots(urlPath, patterns) {
1272
1325
  function budgetExceeded(b) {
1273
1326
  return b.cap > 0 && b.used >= b.cap;
1274
1327
  }
1275
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
1328
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
1329
+ // Backpressure salvage: when provided, every page body that comes back is
1330
+ // pushed into this caller-owned array as it's fetched. If the watchdog aborts
1331
+ // mid-crawl and this function throws, the caller still holds the partial set
1332
+ // (the local `pages` array would otherwise be lost with the stack frame).
1333
+ pageSink) {
1276
1334
  // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
1277
1335
  // audit (source, sitemap entries, redirects, discovered links) goes through
1278
1336
  // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
@@ -1332,7 +1390,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1332
1390
  const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
1333
1391
  if (isXml) {
1334
1392
  const visited = new Set();
1335
- const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
1393
+ const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl, childTotal: sitemapChildTotal, childFailed: sitemapChildFailed } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
1336
1394
  // If we have a budget, sample from sitemap URLs before fetching
1337
1395
  const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
1338
1396
  ? fisherYatesSample(allSitemapUrls, discoveryBudget)
@@ -1359,7 +1417,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1359
1417
  else {
1360
1418
  urlsToFetch = sampledUrls;
1361
1419
  }
1362
- const pages = [];
1420
+ // Reuse the caller's salvage sink as the live page accumulator so a
1421
+ // mid-crawl watchdog abort leaves the already-fetched pages visible to
1422
+ // the caller. Falls back to a private array when no sink is passed.
1423
+ const pages = pageSink ?? [];
1363
1424
  // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
1364
1425
  const sourceOrigin = (() => { try {
1365
1426
  return new URL(source).origin;
@@ -1465,11 +1526,14 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1465
1526
  });
1466
1527
  }
1467
1528
  }
1468
- return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
1529
+ return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, declaredSitemapUrlCount: allSitemapUrls.length, sitemapChildTotal, sitemapChildFailed, scrapePlan };
1469
1530
  }
1470
1531
  if (contentType.includes("html") || looksLikeHtml(text)) {
1471
1532
  const initialPage = { url: source, html: text };
1472
- const pages = [initialPage];
1533
+ // See note above: reuse the caller's salvage sink so a watchdog abort
1534
+ // during link-discovery crawling preserves the pages fetched so far.
1535
+ const pages = pageSink ?? [];
1536
+ pages.push(initialPage);
1473
1537
  if (crawlDiscovery) {
1474
1538
  let sourceOrigin;
1475
1539
  try {
@@ -1481,6 +1545,106 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1481
1545
  const knownCrawled = new Set([source]);
1482
1546
  const allDiscoveredUrls = new Set([source]);
1483
1547
  const maxDepth = 3;
1548
+ // Total URLs the discovered sitemap(s) declare — the basis for the
1549
+ // caller's coverage guardrail. Undefined when no sitemap URLs are found.
1550
+ let declaredSitemapUrlCount;
1551
+ // Child-sitemap reachability for the guardrail: how many child sitemaps
1552
+ // an index referenced vs how many we could not fetch/parse. childFailed>0
1553
+ // means the declared URL list is itself incomplete.
1554
+ let sitemapChildTotal = 0;
1555
+ let sitemapChildFailed = 0;
1556
+ // Sitemap-first discovery (like Google). Before link-crawling, read the
1557
+ // sitemap(s) the site declares — link-crawl only reaches *linked* pages,
1558
+ // but a pSEO site's whole point is thousands of programmatic URLs that
1559
+ // may be sparsely linked (or behind a build-frozen, under-linked nav).
1560
+ // Sources of truth, in order:
1561
+ // 1. `Sitemap:` directives in robots.txt (there can be several)
1562
+ // 2. failing that, probe /sitemap.xml then /sitemap_index.xml
1563
+ // Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
1564
+ // link-crawl below then fills any remaining budget and dedups against
1565
+ // them. When no sitemap exists, this is a no-op and we crawl as before.
1566
+ if (sourceOrigin) {
1567
+ const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
1568
+ const probing = robotsForDiscovery.sitemaps.length === 0;
1569
+ const sitemapCandidates = probing
1570
+ ? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
1571
+ : robotsForDiscovery.sitemaps;
1572
+ const visitedSitemaps = new Set();
1573
+ const sitemapListedUrls = [];
1574
+ for (const candidate of sitemapCandidates) {
1575
+ if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
1576
+ break;
1577
+ if (visitedSitemaps.has(candidate))
1578
+ continue;
1579
+ let smText;
1580
+ let smType;
1581
+ try {
1582
+ if (validateHop)
1583
+ await validateHop(candidate);
1584
+ const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1585
+ if (!fetched)
1586
+ continue;
1587
+ smText = fetched.text;
1588
+ smType = fetched.contentType;
1589
+ }
1590
+ catch {
1591
+ continue; // SSRF refusal, network error, etc. — skip this candidate
1592
+ }
1593
+ if (!(smType.includes("xml") || looksLikeSitemap(smText)))
1594
+ continue;
1595
+ const { urls: discoveredSmUrls, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
1596
+ sitemapChildTotal += ct;
1597
+ sitemapChildFailed += cf;
1598
+ pushAll(sitemapListedUrls, discoveredSmUrls);
1599
+ // When probing the conventional paths, stop at the first that hits.
1600
+ if (probing && discoveredSmUrls.length > 0)
1601
+ break;
1602
+ }
1603
+ // Same-origin + robots-aware filter (applied below), deduped against what we have.
1604
+ // Record what the sitemap(s) declared (deduped) before same-origin /
1605
+ // robots filtering — the operator's site has this many URLs.
1606
+ if (sitemapListedUrls.length > 0)
1607
+ declaredSitemapUrlCount = new Set(sitemapListedUrls).size;
1608
+ const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
1609
+ if (knownCrawled.has(u))
1610
+ return false;
1611
+ try {
1612
+ const parsed = new URL(u);
1613
+ if (parsed.origin !== sourceOrigin)
1614
+ return false;
1615
+ if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
1616
+ skippedByRobots.push(u);
1617
+ return false;
1618
+ }
1619
+ return true;
1620
+ }
1621
+ catch {
1622
+ return false;
1623
+ }
1624
+ });
1625
+ for (const u of seedUrls)
1626
+ allDiscoveredUrls.add(u);
1627
+ // Cap the seed fetch. With a sampling budget, fit under it; without one
1628
+ // (the default "audit everything" path) bound by maxCrawlDiscovered, the
1629
+ // same ceiling the link-crawl honors — otherwise a homepage audit of a
1630
+ // site with a 50k-URL sitemap would try to fetch all of them (the link
1631
+ // crawl never could, so this would be an unbounded-egress regression).
1632
+ const seedToFetch = discoveryBudget > 0
1633
+ ? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
1634
+ : seedUrls.slice(0, maxCrawlDiscovered);
1635
+ if (seedToFetch.length > 0) {
1636
+ await runWithConcurrency(seedToFetch, concurrency, async (url) => {
1637
+ if (budgetExceeded(byteBudget))
1638
+ return;
1639
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
1640
+ knownCrawled.add(url);
1641
+ if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
1642
+ byteBudget.used += result.html.length;
1643
+ pages.push(result);
1644
+ }
1645
+ });
1646
+ }
1647
+ }
1484
1648
  for (let depth = 0; depth < maxDepth; depth += 1) {
1485
1649
  // Stop if we've hit the discovery budget
1486
1650
  if (discoveryBudget > 0 && pages.length >= discoveryBudget)
@@ -1542,11 +1706,11 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1542
1706
  knownCrawled.add(url);
1543
1707
  }
1544
1708
  });
1545
- pages.push(...newPages);
1709
+ pushAll(pages, newPages);
1546
1710
  if (newPages.length === 0)
1547
1711
  break;
1548
1712
  }
1549
- return { pages, discoveredUrlCount: allDiscoveredUrls.size };
1713
+ return { pages, discoveredUrlCount: allDiscoveredUrls.size, declaredSitemapUrlCount, sitemapChildTotal, sitemapChildFailed };
1550
1714
  }
1551
1715
  return { pages };
1552
1716
  }
@@ -1633,6 +1797,14 @@ export async function auditSource(source, options) {
1633
1797
  const backpressureEnabled = options?.backpressure !== false;
1634
1798
  const backpressureAbort = new AbortController();
1635
1799
  let backpressureError = null;
1800
+ // Set once we've decided to salvage a partial report after a watchdog abort.
1801
+ // From that point `throwIfAborted` must NOT re-throw the backpressure error —
1802
+ // the watchdog already did its job (stopped fetching); the rest of the
1803
+ // pipeline runs over the pages collected so far and the truncation is
1804
+ // surfaced on the summary instead.
1805
+ let truncated = false;
1806
+ let truncatedReason;
1807
+ let truncatedKind;
1636
1808
  const signal = composeSignals(externalSignal, backpressureAbort.signal);
1637
1809
  const observer = new FetchObserver();
1638
1810
  // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
@@ -1674,12 +1846,33 @@ export async function auditSource(source, options) {
1674
1846
  backpressureAbort.abort(backpressureError);
1675
1847
  }
1676
1848
  };
1849
+ // Flip the run into salvage mode after a watchdog abort: record the reason so
1850
+ // assembly sets summary.truncated, and from here `throwIfAborted` will no
1851
+ // longer re-throw the backpressure error. Idempotent. Returns true when a
1852
+ // backpressure abort was present to salvage.
1853
+ function salvageBackpressure() {
1854
+ if (!backpressureError)
1855
+ return false;
1856
+ truncated = true;
1857
+ truncatedReason = backpressureError.message;
1858
+ truncatedKind = "backpressure";
1859
+ return true;
1860
+ }
1677
1861
  function throwIfAborted() {
1678
- if (backpressureError)
1679
- throw backpressureError;
1862
+ // An EXTERNAL abort (ctrl-C, parent timeout) is always fatal: the caller
1863
+ // asked to stop, not to degrade. Check it first so it wins over salvage.
1680
1864
  if (externalSignal?.aborted) {
1681
1865
  throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
1682
1866
  }
1867
+ // A backpressure abort is salvageable. Once we've committed to a partial
1868
+ // report (`truncated`), swallow it and let the pipeline finish over the
1869
+ // pages collected so far. Before that commit, the loader-boundary catch
1870
+ // handles it; this guard only fires on the rare path where the loader
1871
+ // returned normally (e.g. a fetch mock that ignores the abort signal) yet
1872
+ // the watchdog still voted to abort — salvage rather than crash.
1873
+ if (backpressureError && !truncated) {
1874
+ salvageBackpressure();
1875
+ }
1683
1876
  }
1684
1877
  const resolvedRules = {
1685
1878
  nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
@@ -1785,6 +1978,9 @@ export async function auditSource(source, options) {
1785
1978
  let sitemapUrlSet;
1786
1979
  let sitemapLastmodByUrl;
1787
1980
  let discoveredUrlCount;
1981
+ let declaredSitemapUrlCount;
1982
+ let sitemapChildTotal;
1983
+ let sitemapChildFailed;
1788
1984
  let scrapePlan;
1789
1985
  if (hasPinnedUrlsEarly) {
1790
1986
  const pinned = options.pinnedUrls;
@@ -1834,13 +2030,26 @@ export async function auditSource(source, options) {
1834
2030
  }
1835
2031
  : undefined;
1836
2032
  const pinnedPages = [];
1837
- await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
1838
- const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
1839
- if (result) {
1840
- fetchByteBudget.used += result.html.length;
1841
- pinnedPages.push(result);
2033
+ try {
2034
+ await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
2035
+ const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
2036
+ if (result) {
2037
+ fetchByteBudget.used += result.html.length;
2038
+ pinnedPages.push(result);
2039
+ }
2040
+ });
2041
+ }
2042
+ catch (err) {
2043
+ // Same salvage contract as the sitemap/crawl path: a watchdog abort
2044
+ // mid-fetch keeps the pages already collected in `pinnedPages`. Any other
2045
+ // error (external abort, SSRF rejection) is fatal — re-throw it.
2046
+ if (err instanceof OriginDegradedError) {
2047
+ salvageBackpressure();
1842
2048
  }
1843
- });
2049
+ else {
2050
+ throw err;
2051
+ }
2052
+ }
1844
2053
  loadedPagesRaw = pinnedPages;
1845
2054
  // No sitemap context in pinned mode
1846
2055
  sitemapUrlSet = undefined;
@@ -1849,13 +2058,60 @@ export async function auditSource(source, options) {
1849
2058
  scrapePlan = undefined;
1850
2059
  }
1851
2060
  else {
1852
- const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
1853
- loadedPagesRaw = loaded.pages;
1854
- sitemapUrlSet = loaded.sitemapUrls;
1855
- sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
1856
- discoveredUrlCount = loaded.discoveredUrlCount;
1857
- scrapePlan = loaded.scrapePlan;
2061
+ // Salvage sink: loadPagesFromSource fills this incrementally as pages come
2062
+ // back. If the backpressure watchdog aborts mid-crawl the call throws an
2063
+ // OriginDegradedError and the function's own return value is lost — but the
2064
+ // already-fetched pages survive here, so we recover them and continue the
2065
+ // pipeline with a `truncated` flag instead of throwing the whole run away.
2066
+ const pageSink = [];
2067
+ try {
2068
+ const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
2069
+ loadedPagesRaw = loaded.pages;
2070
+ sitemapUrlSet = loaded.sitemapUrls;
2071
+ sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
2072
+ discoveredUrlCount = loaded.discoveredUrlCount;
2073
+ declaredSitemapUrlCount = loaded.declaredSitemapUrlCount;
2074
+ sitemapChildTotal = loaded.sitemapChildTotal;
2075
+ sitemapChildFailed = loaded.sitemapChildFailed;
2076
+ scrapePlan = loaded.scrapePlan;
2077
+ }
2078
+ catch (err) {
2079
+ // Only the watchdog abort is salvageable. An external abort (ctrl-C /
2080
+ // parent timeout) or any other error is fatal — re-throw it untouched so
2081
+ // --no-backpressure and ctrl-C behaviour are unchanged.
2082
+ if (err instanceof OriginDegradedError) {
2083
+ // Prefer the canonical backpressureError message (same object the
2084
+ // monitor raised); fall back to the caught error if somehow distinct.
2085
+ if (!salvageBackpressure()) {
2086
+ truncated = true;
2087
+ truncatedReason = err.message;
2088
+ truncatedKind = "backpressure";
2089
+ }
2090
+ // Recover whatever was fetched before the abort. The sink is the same
2091
+ // array loadPagesFromSource was pushing into, so it holds the partial
2092
+ // page set even though the function never reached its `return`.
2093
+ loadedPagesRaw = pageSink;
2094
+ // No sitemap/discovery context survives a mid-sitemap abort; the
2095
+ // downstream classifier falls back to the loaded page URLs.
2096
+ sitemapUrlSet = undefined;
2097
+ sitemapLastmodByUrl = undefined;
2098
+ discoveredUrlCount = undefined;
2099
+ declaredSitemapUrlCount = undefined;
2100
+ sitemapChildTotal = undefined;
2101
+ sitemapChildFailed = undefined;
2102
+ scrapePlan = undefined;
2103
+ }
2104
+ else {
2105
+ throw err;
2106
+ }
2107
+ }
1858
2108
  }
2109
+ // Pages we successfully FETCHED (HTTP 2xx) from discovery — before content-type
2110
+ // and policy filtering, and before sampling. This is the right denominator for
2111
+ // the coverage guardrail: noindex / non-HTML pages were still *reached* (they
2112
+ // count), intentional sampling happens later (doesn't count against us), and
2113
+ // only genuinely-unreachable URLs (4xx/5xx) are missing from it.
2114
+ const fetchedCount = loadedPagesRaw.length;
1859
2115
  // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
1860
2116
  // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
1861
2117
  // (kept for back-compat with --since consumers); T7 will carry their prior
@@ -1889,7 +2145,10 @@ export async function auditSource(source, options) {
1889
2145
  skippedByContentType.push(p.url);
1890
2146
  }
1891
2147
  }
1892
- loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
2148
+ // Replace contents in place without `splice(0, n, ...big)` — that spread hits
2149
+ // the V8 argument-count cap on large corpora (the same class of problem as the `push(...big)` calls that pushAll replaces).
2150
+ loadedPages.length = 0;
2151
+ pushAll(loadedPages, htmlOnlyPages);
1893
2152
  if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
1894
2153
  console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
1895
2154
  }
@@ -2047,9 +2306,21 @@ export async function auditSource(source, options) {
2047
2306
  const guardedClassification = applyDegenerationGuard(computedClassification, corpusStatsFromPages(parsedPages));
2048
2307
  // `--strict` (or AuditOptions.strict) keeps the classification but forces
2049
2308
  // every rule to run regardless of detected site type.
2309
+ //
2310
+ // A backpressure abort BEFORE classification salvages only a fragment of the
2311
+ // crawl (`truncated` is already set here; the coverage guardrail runs later).
2312
+ // Classifying that fragment — e.g. the 1 page left after the watchdog aborts a
2313
+ // cold-start origin — as `small-marketing` and suppressing the pSEO rules off
2314
+ // it is exactly what produced the confident false "READY" on a 5,600-page
2315
+ // site. When the run was truncated pre-classification we genuinely could not
2316
+ // determine the site type: force `unclear` (confidence 0, no suppression,
2317
+ // neutral scoring) so nothing masks the incompleteness.
2318
+ const classificationUnreliable = truncated;
2050
2319
  const siteClassification = options?.strict
2051
2320
  ? { ...guardedClassification, suppressedRules: [] }
2052
- : guardedClassification;
2321
+ : classificationUnreliable
2322
+ ? { ...guardedClassification, type: "unclear", confidence: 0, suppressedRules: [] }
2323
+ : guardedClassification;
2053
2324
  const suppressedRuleSet = new Set(siteClassification.suppressedRules);
2054
2325
  // Classify pages into groups and run only enabled rules per group
2055
2326
  const classified = classifyPages(parsedPages, options?.pageGroups);
@@ -2076,29 +2347,29 @@ export async function auditSource(source, options) {
2076
2347
  // Site-wide rules (run once, outside group loop)
2077
2348
  if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
2078
2349
  const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
2079
- allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2350
+ pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2080
2351
  if (robotsTxtContent) {
2081
2352
  const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
2082
- allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2353
+ pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2083
2354
  }
2084
2355
  }
2085
2356
  // AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
2086
2357
  // and robots-compliance); page-group rule lists govern per-page AEO rules only.
2087
2358
  const llmsFindings = await llmsTxtRule(source, { timeoutMs });
2088
- allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2359
+ pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2089
2360
  if (robotsTxtContent) {
2090
2361
  const crawlerFindings = crawlerAccessRule(robotsTxtContent);
2091
- allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2362
+ pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2092
2363
  }
2093
2364
  // Data source comparison rules
2094
2365
  if (options?.dataSource?.records && options.dataSource.records.length > 0) {
2095
2366
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
2096
2367
  const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
2097
- allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2368
+ pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2098
2369
  }
2099
2370
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
2100
2371
  const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
2101
- allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2372
+ pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2102
2373
  }
2103
2374
  }
2104
2375
  for (const [groupName, groupPages] of classified) {
@@ -2118,7 +2389,7 @@ export async function auditSource(source, options) {
2118
2389
  // because the nav paths between locale-specific currency-converter URLs
2119
2390
  // were not in the pinned set).
2120
2391
  isSampledAudit || hasPinnedUrlsEarly);
2121
- allFindings.push(...findings);
2392
+ pushAll(allFindings, findings);
2122
2393
  groupPageCounts[groupName] = groupPages.length;
2123
2394
  // v0.4.3: per-group scoring uses the same site-classification profile so
2124
2395
  // group-level risk numbers reflect the same severity / confidence remaps
@@ -2136,7 +2407,7 @@ export async function auditSource(source, options) {
2136
2407
  (auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
2137
2408
  if (isValueAddEnabled) {
2138
2409
  const valueAddFindings = valueAddRule(parsedPages, allFindings);
2139
- allFindings.push(...valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2410
+ pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2140
2411
  }
2141
2412
  }
2142
2413
  // Enrich findings: cluster pairwise, detect templates, assign effort
@@ -2260,6 +2531,71 @@ export async function auditSource(source, options) {
2260
2531
  ? [...parsedPages.map((p) => p.url)].sort()
2261
2532
  : undefined,
2262
2533
  };
2534
+ // Partial-report flag (applied in the `if (truncated)` block below): the
2535
+ // backpressure watchdog aborted mid-crawl and we salvaged whatever pages had
2536
+ // been fetched. Consumers MUST treat coverage as a lower bound (counts/verdict
2537
+ // are partial). Only set when actually truncated so complete runs keep `truncated` absent.
2538
+ // ── Coverage guardrails (#4) ─────────────────────────────────────────────
2539
+ // A sitemap was found at discovery, so we know roughly how large the site is.
2540
+ // Two independent under-coverage signals, each reusing the `truncated`
2541
+ // partial-coverage surface (CLI/Action/MCP/web already flag it) tagged
2542
+ // `truncatedKind: "coverage"` so consumers can tell it apart from a
2543
+ // backpressure abort. Backpressure (set during the crawl) takes precedence.
2544
+ if (!truncated && sitemapChildFailed && sitemapChildFailed > 0) {
2545
+ // (A) Extraction-side: a sitemap INDEX referenced child sitemaps we could
2546
+ // not fetch/parse (404, non-sitemap, or beyond the depth cap). The declared
2547
+ // URL list is itself incomplete — the "unreachable child sitemaps" case a
2548
+ // urls-only count can never see (and the original false-negative class).
2549
+ truncated = true;
2550
+ truncatedKind = "coverage";
2551
+ truncatedReason =
2552
+ `${sitemapChildFailed} of ${sitemapChildTotal} child sitemaps referenced by the sitemap index could not be ` +
2553
+ `fetched or parsed — both the declared URL count and this audit are incomplete, so the verdict is not ` +
2554
+ `representative of the full site. Check that every child sitemap is reachable (HTTP 200, valid XML).`;
2555
+ // eslint-disable-next-line no-console
2556
+ console.error(`pseolint: ${truncatedReason}`);
2557
+ }
2558
+ if (!truncated && declaredSitemapUrlCount && declaredSitemapUrlCount >= 20) {
2559
+ // (B) Audit-side: the sitemap declared N URLs but we FETCHED far fewer than
2560
+ // we intended to. Compare against `fetchedCount` (pages actually fetched,
2561
+ // pre-filter/pre-sample) so legitimately-skipped pages (noindex, non-HTML)
2562
+ // and intentional sampling do NOT register as a shortfall. `intended` is
2563
+ // bounded by every deliberate limit — an explicit sample, the crawl cap, and
2564
+ // the declared total — so none of them false-fire.
2565
+ const sampleCap = sampleSize > 0 ? sampleSize : Number.POSITIVE_INFINITY;
2566
+ const crawlCap = maxCrawlDiscovered > 0 ? maxCrawlDiscovered : Number.POSITIVE_INFINITY;
2567
+ const intended = Math.min(sampleCap, crawlCap, declaredSitemapUrlCount);
2568
+ const floor = Math.max(20, Math.floor(intended * 0.05));
2569
+ // `intended >= 20`: only judge representativeness when we actually meant to
2570
+ // audit a substantial slice. A deliberately tiny sample/crawl cap (intended
2571
+ // < 20) is the operator's choice, not under-discovery — don't flag it (and
2572
+ // it would otherwise trip the absolute floor of 20).
2573
+ if (intended >= 20 && fetchedCount < floor) {
2574
+ const unreached = Math.max(0, declaredSitemapUrlCount - fetchedCount);
2575
+ const ratio = fetchedCount / declaredSitemapUrlCount;
2576
+ const pct = (ratio * 100).toFixed(ratio < 0.01 ? 2 : 1);
2577
+ truncated = true;
2578
+ truncatedKind = "coverage";
2579
+ truncatedReason =
2580
+ `Fetched ${fetchedCount} of ~${declaredSitemapUrlCount} sitemap-declared URLs (~${pct}% coverage); ` +
2581
+ `~${unreached} could not be retrieved (4xx/5xx, redirects, or robots-blocked). The verdict covers only the ` +
2582
+ `pages reached and is not representative — check for a stale sitemap or unreachable pages, or raise crawl limits.`;
2583
+ // eslint-disable-next-line no-console
2584
+ console.error(`pseolint: ${truncatedReason}`);
2585
+ }
2586
+ }
2587
+ if (truncated) {
2588
+ summary.truncated = true;
2589
+ summary.truncatedReason = truncatedReason;
2590
+ if (truncatedKind)
2591
+ summary.truncatedKind = truncatedKind;
2592
+ // A truncated run is incomplete — never present it as a clean green. Floor
2593
+ // the verdict to at least "caution" so the headline matches the partial-
2594
+ // coverage banner instead of the false "READY ✓" over a salvaged fragment.
2595
+ // ("ready" is the only rung below "caution"; every other verdict is already at or above it.)
2596
+ if (summary.verdict === "ready")
2597
+ summary.verdict = "caution";
2598
+ }
2263
2599
  if (cacheConfig) {
2264
2600
  summary.cacheStats = cacheStats;
2265
2601
  }