npm - @pseolint/core - Versions diffs - 0.6.4 → 0.6.6 - Mend

@pseolint/core 0.6.4 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

package/README.md +64 -0
package/dist/ai/tools/fetch-sitemap.js +2 -1
package/dist/ai/tools/fetch-sitemap.js.map +1 -1
package/dist/auditor.d.ts.map +1 -1
package/dist/auditor.js +421 -85
package/dist/auditor.js.map +1 -1
package/dist/enrich-findings.d.ts.map +1 -1
package/dist/enrich-findings.js +27 -5
package/dist/enrich-findings.js.map +1 -1
package/dist/parser.d.ts.map +1 -1
package/dist/parser.js +17 -1
package/dist/parser.js.map +1 -1
package/dist/rules/content/title-uniqueness.d.ts.map +1 -1
package/dist/rules/content/title-uniqueness.js +13 -0
package/dist/rules/content/title-uniqueness.js.map +1 -1
package/dist/rules/content/unique-value.d.ts.map +1 -1
package/dist/rules/content/unique-value.js +29 -4
package/dist/rules/content/unique-value.js.map +1 -1
package/dist/site-classifier.d.ts.map +1 -1
package/dist/site-classifier.js +7 -1
package/dist/site-classifier.js.map +1 -1
package/dist/stratified-sample.js +2 -1
package/dist/stratified-sample.js.map +1 -1
package/dist/types.d.ts +48 -3
package/dist/types.d.ts.map +1 -1
package/dist/types.js +14 -2
package/dist/types.js.map +1 -1
package/package.json +5 -3
package/schemas/audit-summary.schema.json +300 -0

package/dist/auditor.js CHANGED Viewed

@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
 import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
 import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
 import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
-import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
+import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
 import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
 import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
 import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
@@ -615,157 +615,157 @@ sampled = false) {
     // Spam rules — always compute cross-page data, only push findings if enabled
     const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
     if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
-        findings.push(...tag(nearDuplicate.findings));
+        pushAll(findings, tag(nearDuplicate.findings));
     }
     const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
     if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
-        findings.push(...tag(entitySwap.findings));
+        pushAll(findings, tag(entitySwap.findings));
     }
     const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
     if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
-        findings.push(...tag(thinContent.findings));
+        pushAll(findings, tag(thinContent.findings));
     }
     if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
-        findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
+        pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
     }
     if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
-        findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
+        pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
     }
     if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
-        findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
+        pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
     }
     if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
-        findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
+        pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
     }
     if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
-        findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
+        pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
     }
     // Content rules
     if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
-        findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
+        pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
     }
     if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
-        findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
+        pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
     }
     if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
-        findings.push(...tag(missingAuthorRule(pages)));
+        pushAll(findings, tag(missingAuthorRule(pages)));
     }
     if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
-        findings.push(...tag(eeatSignalsRule(pages)));
+        pushAll(findings, tag(eeatSignalsRule(pages)));
     }
     // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
     // structure + image alt-text were tier-1 gaps in the blind-spot audit.
     if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
-        findings.push(...tag(titleUniquenessRule(pages)));
+        pushAll(findings, tag(titleUniquenessRule(pages)));
     }
     if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
-        findings.push(...tag(headingStructureRule(pages)));
+        pushAll(findings, tag(headingStructureRule(pages)));
     }
     if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
-        findings.push(...tag(imageAltTextRule(pages)));
+        pushAll(findings, tag(imageAltTextRule(pages)));
     }
     if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
-        findings.push(...tag(translationNoOpRule(pages)));
+        pushAll(findings, tag(translationNoOpRule(pages)));
     }
     if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
-        findings.push(...tag(regurgitatedContentRule(pages)));
+        pushAll(findings, tag(regurgitatedContentRule(pages)));
     }
     if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
-        findings.push(...tag(commonPhraseReuseRule(pages)));
+        pushAll(findings, tag(commonPhraseReuseRule(pages)));
     }
     if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
-        findings.push(...tag(wikipediaParaphraseRule(pages)));
+        pushAll(findings, tag(wikipediaParaphraseRule(pages)));
     }
     // Link rules — use the global link graph
     if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
-        findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
+        pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
     }
     if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
-        findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
+        pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
     }
     if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
         if (rootUrl) {
-            findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
+            pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
         }
     }
     if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
-        findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
+        pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
     }
     if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
-        findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
+        pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
     }
     // Tech rules
     if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
-        findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
+        pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
     }
     if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
-        findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
+        pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
     }
     if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
-        findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
+        pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
     }
     if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
-        findings.push(...tag(redirectChainRule(pages)));
+        pushAll(findings, tag(redirectChainRule(pages)));
     }
     if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
-        findings.push(...tag(soft404Rule(pages)));
+        pushAll(findings, tag(soft404Rule(pages)));
     }
     if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
         // hreflang declarations on noindex'd pages are still bugs when they're
         // inconsistent — see auditor.test.ts "emits technical SEO findings".
-        findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
+        pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
     }
     // 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
     // the v0.4.x README without ever shipping. Now it does.
     if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
-        findings.push(...tag(ogCompletenessRule(pages)));
+        pushAll(findings, tag(ogCompletenessRule(pages)));
     }
     // Schema rules
     if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
-        findings.push(...tag(jsonLdValidRule(pages)));
+        pushAll(findings, tag(jsonLdValidRule(pages)));
     }
     if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
-        findings.push(...tag(requiredFieldsRule(pages)));
+        pushAll(findings, tag(requiredFieldsRule(pages)));
     }
     if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
-        findings.push(...tag(schemaConsistencyRule(pages)));
+        pushAll(findings, tag(schemaConsistencyRule(pages)));
     }
     // AEO rules
     if (isEnabled("aeo/freshness-signals")) {
-        findings.push(...tag(freshnessSignalsRule(pages, {
+        pushAll(findings, tag(freshnessSignalsRule(pages, {
             maxStaleDays: resolvedRules.freshnessMaxStaleDays,
         })));
     }
     if (isEnabled("aeo/faq-coverage")) {
-        findings.push(...tag(faqCoverageRule(pages, {
+        pushAll(findings, tag(faqCoverageRule(pages, {
             minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
         })));
     }
     if (isEnabled("aeo/answer-first")) {
-        findings.push(...tag(answerFirstRule(pages, entityPatterns, {
+        pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
             maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
         })));
     }
     if (isEnabled("aeo/citable-facts")) {
-        findings.push(...tag(citableFactsRule(pages, entityPatterns, {
+        pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
             minFactsPerPage: resolvedRules.citableFactsMin,
             targetFactsPerPage: resolvedRules.citableFactsTarget,
         })));
     }
     if (isEnabled("aeo/content-modularity")) {
-        findings.push(...tag(contentModularityRule(pages, {
+        pushAll(findings, tag(contentModularityRule(pages, {
             maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
             minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
         })));
     }
     if (isEnabled("aeo/summary-bait")) {
-        findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
+        pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
     }
     // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
     // keyword-collision dropped due to high false-positive rates; see
     // 2026-04-29 v0.4 redesign spec §4.3).
     if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
-        findings.push(...tag(urlPatternRule(pages)));
+        pushAll(findings, tag(urlPatternRule(pages)));
     }
     return findings;
 }
@@ -1003,6 +1003,20 @@ function withDocsUrls(findings) {
     }
     return findings;
 }
+/**
+ * Append every item of `items` to `target` in place. Use this instead of
+ * `target.push(...items)` whenever `items` can be large. The spread form passes
+ * each element as a separate call argument, and V8 caps argument count
+ * (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
+ * exceeded` on large inputs. A dense site makes the pairwise rules
+ * (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
+ * rule-aggregation push *before* enrichment was even reached. The loop has no
+ * such limit. See tests/integration/large-corpus-no-overflow.test.ts.
+ */
+function pushAll(target, items) {
+    for (const item of items)
+        target.push(item);
+}
 async function collectHtmlFiles(directory) {
     const entries = await readdir(directory, { withFileTypes: true });
     const files = await Promise.all(entries.map(async (entry) => {
@@ -1038,7 +1052,11 @@ function composeSignals(...signals) {
     }
     return ac.signal;
 }
-async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
+async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
+// Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
+// Guards against a hostile/misconfigured sitemap eating the whole byte budget
+// or memory. 0 / undefined = no cap.
+maxBytes) {
     try {
         stats.total += 1;
         const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
@@ -1048,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
         }
         if (r.status < 200 || r.status >= 300)
             return null;
+        if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
+            // eslint-disable-next-line no-console
+            console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
+            return null;
+        }
         return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
     }
     catch (err) {
@@ -1205,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
     }
     return arr.slice(arr.length - n);
 }
-async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
+/** sitemaps.org caps an uncompressed sitemap at 50 MB. */
+const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
+/**
+ * Max `<sitemapindex>` nesting depth we recurse through. The protocol only
+ * defines a single level of nesting, but some sites nest deeper; 5 is generous
+ * while still bounding work (and stack) on a hostile/misconfigured index that a
+ * `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
+ */
+const SITEMAP_MAX_DEPTH = 5;
+async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
     visited.add(sitemapUrl);
     const entries = parseSitemapUrlsWithLastmod(sitemapText);
     if (!isSitemapIndex(sitemapText)) {
@@ -1217,33 +1249,50 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
                 lastmodByUrl.set(entry.url, entry.lastmod);
             }
         }
-        return { urls, lastmodByUrl };
+        return { urls, lastmodByUrl, childTotal: 0, childFailed: 0 };
+    }
+    // It's a sitemap index. Past the depth cap we stop recursing — but the
+    // children we DON'T walk are unreached coverage, so report them as failed.
+    if (depth >= maxDepth) {
+        // eslint-disable-next-line no-console
+        console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
+        return { urls: [], lastmodByUrl: new Map(), childTotal: entries.length, childFailed: entries.length };
     }
     const allUrls = [];
     const allLastmodByUrl = new Map();
+    let childTotal = 0;
+    let childFailed = 0;
     for (const entry of entries) {
         const childUrl = entry.url;
         if (signal?.aborted)
             throw signal.reason ?? new Error("aborted");
+        childTotal += 1;
         if (visited.has(childUrl))
+            continue; // already walked (cyclic index) — not a failure
+        const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
+        if (!child) {
+            childFailed += 1;
             continue;
-        const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
-        if (!child)
-            continue;
+        }
         const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
-        if (!childLike)
+        if (!childLike) {
+            childFailed += 1;
             continue;
-        const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
-        allUrls.push(...childUrls);
+        }
+        const { urls: childUrls, lastmodByUrl: childLastmodByUrl, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
+        pushAll(allUrls, childUrls);
         for (const [u, lm] of childLastmodByUrl) {
             allLastmodByUrl.set(u, lm);
         }
+        // Accumulate nested index structure (a child that is itself an index).
+        childTotal += ct;
+        childFailed += cf;
     }
-    return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
+    return { urls: allUrls, lastmodByUrl: allLastmodByUrl, childTotal, childFailed };
 }
 async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
     if (!origin)
-        return { disallow: [], crawlDelaySec: 0 };
+        return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
     try {
         const robotsUrl = `${origin}/robots.txt`;
         const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
@@ -1253,10 +1302,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
         return {
             disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
             crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
+            // `Sitemap:` directives carry absolute URLs (not origin-relative paths) and
+            // there can be several. Surfaced so discovery can read the site's declared
+            // sitemaps instead of guessing.
+            sitemaps: parseSitemapDirectives(fetched.text),
         };
     }
     catch {
-        return { disallow: [], crawlDelaySec: 0 };
+        return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
     }
 }
 function sleep(ms) {
@@ -1272,7 +1325,12 @@ function isDisallowedByRobots(urlPath, patterns) {
 function budgetExceeded(b) {
     return b.cap > 0 && b.used >= b.cap;
 }
-async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
+async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
+// Backpressure salvage: when provided, every page body that comes back is
+// pushed into this caller-owned array as it's fetched. If the watchdog aborts
+// mid-crawl and this function throws, the caller still holds the partial set
+// (the local `pages` array would otherwise be lost with the stack frame).
+pageSink) {
     // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
     // audit (source, sitemap entries, redirects, discovered links) goes through
     // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
@@ -1332,7 +1390,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
         const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
         if (isXml) {
             const visited = new Set();
-            const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
+            const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl, childTotal: sitemapChildTotal, childFailed: sitemapChildFailed } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
             // If we have a budget, sample from sitemap URLs before fetching
             const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
                 ? fisherYatesSample(allSitemapUrls, discoveryBudget)
@@ -1359,7 +1417,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
             else {
                 urlsToFetch = sampledUrls;
             }
-            const pages = [];
+            // Reuse the caller's salvage sink as the live page accumulator so a
+            // mid-crawl watchdog abort leaves the already-fetched pages visible to
+            // the caller. Falls back to a private array when no sink is passed.
+            const pages = pageSink ?? [];
             // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
             const sourceOrigin = (() => { try {
                 return new URL(source).origin;
@@ -1465,11 +1526,14 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                     });
                 }
             }
-            return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
+            return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, declaredSitemapUrlCount: allSitemapUrls.length, sitemapChildTotal, sitemapChildFailed, scrapePlan };
         }
         if (contentType.includes("html") || looksLikeHtml(text)) {
             const initialPage = { url: source, html: text };
-            const pages = [initialPage];
+            // See note above: reuse the caller's salvage sink so a watchdog abort
+            // during link-discovery crawling preserves the pages fetched so far.
+            const pages = pageSink ?? [];
+            pages.push(initialPage);
             if (crawlDiscovery) {
                 let sourceOrigin;
                 try {
@@ -1481,6 +1545,106 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                 const knownCrawled = new Set([source]);
                 const allDiscoveredUrls = new Set([source]);
                 const maxDepth = 3;
+                // Total URLs the discovered sitemap(s) declare — the basis for the
+                // caller's coverage guardrail. Undefined when no sitemap is found.
+                let declaredSitemapUrlCount;
+                // Child-sitemap reachability for the guardrail: how many child sitemaps
+                // an index referenced vs how many we could not fetch/parse. childFailed>0
+                // means the declared URL list is itself incomplete.
+                let sitemapChildTotal = 0;
+                let sitemapChildFailed = 0;
+                // Sitemap-first discovery (like Google). Before link-crawling, read the
+                // sitemap(s) the site declares — link-crawl only reaches *linked* pages,
+                // but a pSEO site's whole point is thousands of programmatic URLs that
+                // may be sparsely linked (or behind a build-frozen, under-linked nav).
+                // Sources of truth, in order:
+                //   1. `Sitemap:` directives in robots.txt (there can be several)
+                //   2. failing that, probe /sitemap.xml then /sitemap_index.xml
+                // Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
+                // link-crawl below then fills any remaining budget and dedups against
+                // them. When no sitemap exists, this is a no-op and we crawl as before.
+                if (sourceOrigin) {
+                    const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
+                    const probing = robotsForDiscovery.sitemaps.length === 0;
+                    const sitemapCandidates = probing
+                        ? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
+                        : robotsForDiscovery.sitemaps;
+                    const visitedSitemaps = new Set();
+                    const sitemapListedUrls = [];
+                    for (const candidate of sitemapCandidates) {
+                        if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
+                            break;
+                        if (visitedSitemaps.has(candidate))
+                            continue;
+                        let smText;
+                        let smType;
+                        try {
+                            if (validateHop)
+                                await validateHop(candidate);
+                            const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
+                            if (!fetched)
+                                continue;
+                            smText = fetched.text;
+                            smType = fetched.contentType;
+                        }
+                        catch {
+                            continue; // SSRF refusal, network error, etc. — skip this candidate
+                        }
+                        if (!(smType.includes("xml") || looksLikeSitemap(smText)))
+                            continue;
+                        const { urls: discoveredSmUrls, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
+                        sitemapChildTotal += ct;
+                        sitemapChildFailed += cf;
+                        pushAll(sitemapListedUrls, discoveredSmUrls);
+                        // When probing the conventional paths, stop at the first that hits.
+                        if (probing && discoveredSmUrls.length > 0)
+                            break;
+                    }
+                    // Record what the sitemap(s) declared (deduped) before same-origin /
+                    // robots filtering — the operator's site has this many URLs. Then apply
+                    // the same-origin + robots-aware filter, deduped against what we have.
+                    if (sitemapListedUrls.length > 0)
+                        declaredSitemapUrlCount = new Set(sitemapListedUrls).size;
+                    const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
+                        if (knownCrawled.has(u))
+                            return false;
+                        try {
+                            const parsed = new URL(u);
+                            if (parsed.origin !== sourceOrigin)
+                                return false;
+                            if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
+                                skippedByRobots.push(u);
+                                return false;
+                            }
+                            return true;
+                        }
+                        catch {
+                            return false;
+                        }
+                    });
+                    for (const u of seedUrls)
+                        allDiscoveredUrls.add(u);
+                    // Cap the seed fetch. With a sampling budget, fit under it; without one
+                    // (the default "audit everything" path) bound by maxCrawlDiscovered, the
+                    // same ceiling the link-crawl honors — otherwise a homepage audit of a
+                    // site with a 50k-URL sitemap would try to fetch all of them (the link
+                    // crawl never could, so this would be an unbounded-egress regression).
+                    const seedToFetch = discoveryBudget > 0
+                        ? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
+                        : seedUrls.slice(0, maxCrawlDiscovered);
+                    if (seedToFetch.length > 0) {
+                        await runWithConcurrency(seedToFetch, concurrency, async (url) => {
+                            if (budgetExceeded(byteBudget))
+                                return;
+                            const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
+                            knownCrawled.add(url);
+                            if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
+                                byteBudget.used += result.html.length;
+                                pages.push(result);
+                            }
+                        });
+                    }
+                }
                 for (let depth = 0; depth < maxDepth; depth += 1) {
                     // Stop if we've hit the discovery budget
                     if (discoveryBudget > 0 && pages.length >= discoveryBudget)
@@ -1542,11 +1706,11 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                             knownCrawled.add(url);
                         }
                     });
-                    pages.push(...newPages);
+                    pushAll(pages, newPages);
                     if (newPages.length === 0)
                         break;
                 }
-                return { pages, discoveredUrlCount: allDiscoveredUrls.size };
+                return { pages, discoveredUrlCount: allDiscoveredUrls.size, declaredSitemapUrlCount, sitemapChildTotal, sitemapChildFailed };
             }
             return { pages };
         }
@@ -1633,6 +1797,14 @@ export async function auditSource(source, options) {
     const backpressureEnabled = options?.backpressure !== false;
     const backpressureAbort = new AbortController();
     let backpressureError = null;
+    // Set once we've decided to salvage a partial report after a watchdog abort.
+    // From that point `throwIfAborted` must NOT re-throw the backpressure error —
+    // the watchdog already did its job (stopped fetching); the rest of the
+    // pipeline runs over the pages collected so far and the truncation is
+    // surfaced on the summary instead.
+    let truncated = false;
+    let truncatedReason;
+    let truncatedKind;
     const signal = composeSignals(externalSignal, backpressureAbort.signal);
     const observer = new FetchObserver();
     // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
@@ -1674,12 +1846,33 @@ export async function auditSource(source, options) {
             backpressureAbort.abort(backpressureError);
         }
     };
+    // Flip the run into salvage mode after a watchdog abort: record the reason so
+    // assembly sets summary.truncated, and from here `throwIfAborted` will no
+    // longer re-throw the backpressure error. Idempotent. Returns true when a
+    // backpressure abort was present to salvage.
+    function salvageBackpressure() {
+        if (!backpressureError) // no watchdog abort recorded — nothing to salvage
+            return false;
+        truncated = true; // also makes throwIfAborted skip its own salvage branch
+        truncatedReason = backpressureError.message; // later copied to summary.truncatedReason
+        truncatedKind = "backpressure"; // the coverage guardrails use "coverage" instead
+        return true;
+    }
     function throwIfAborted() {
-        if (backpressureError)
-            throw backpressureError;
+        // An EXTERNAL abort (ctrl-C, parent timeout) is always fatal: the caller
+        // asked to stop, not to degrade. Check it first so it wins over salvage.
         if (externalSignal?.aborted) {
             throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
         }
+        // A backpressure abort is salvageable, so despite its name this function never
+        // throws for one. Once we've committed to a partial report (`truncated`), ignore
+        // it and let the pipeline finish over the pages collected so far. Before that
+        // commit, the loader-boundary catch handles it; this guard only fires on the rare
+        // path where the loader returned normally (e.g. a fetch mock that ignores the
+        // abort signal) yet the watchdog still voted to abort — salvage rather than crash.
+        if (backpressureError && !truncated) {
+            salvageBackpressure();
+        }
     }
     const resolvedRules = {
         nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
@@ -1785,6 +1978,9 @@ export async function auditSource(source, options) {
     let sitemapUrlSet;
     let sitemapLastmodByUrl;
     let discoveredUrlCount;
+    let declaredSitemapUrlCount;
+    let sitemapChildTotal;
+    let sitemapChildFailed;
     let scrapePlan;
     if (hasPinnedUrlsEarly) {
         const pinned = options.pinnedUrls;
@@ -1834,13 +2030,26 @@ export async function auditSource(source, options) {
             }
             : undefined;
         const pinnedPages = [];
-        await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
-            const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
-            if (result) {
-                fetchByteBudget.used += result.html.length;
-                pinnedPages.push(result);
+        try {
+            await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
+                const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
+                if (result) {
+                    fetchByteBudget.used += result.html.length;
+                    pinnedPages.push(result);
+                }
+            });
+        }
+        catch (err) {
+            // Same salvage contract as the sitemap/crawl path: a watchdog abort
+            // mid-fetch keeps the pages already collected in `pinnedPages`. Any other
+            // error (external abort, SSRF rejection) is fatal — re-throw it.
+            if (err instanceof OriginDegradedError) {
+                salvageBackpressure();
             }
-        });
+            else {
+                throw err;
+            }
+        }
         loadedPagesRaw = pinnedPages;
         // No sitemap context in pinned mode
         sitemapUrlSet = undefined;
@@ -1849,13 +2058,60 @@ export async function auditSource(source, options) {
         scrapePlan = undefined;
     }
     else {
-        const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
-        loadedPagesRaw = loaded.pages;
-        sitemapUrlSet = loaded.sitemapUrls;
-        sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
-        discoveredUrlCount = loaded.discoveredUrlCount;
-        scrapePlan = loaded.scrapePlan;
+        // Salvage sink: loadPagesFromSource fills this incrementally as pages come
+        // back. If the backpressure watchdog aborts mid-crawl the call throws an
+        // OriginDegradedError and the function's own return value is lost — but the
+        // already-fetched pages survive here, so we recover them and continue the
+        // pipeline with a `truncated` flag instead of throwing the whole run away.
+        const pageSink = [];
+        try {
+            const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
+            loadedPagesRaw = loaded.pages;
+            sitemapUrlSet = loaded.sitemapUrls;
+            sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
+            discoveredUrlCount = loaded.discoveredUrlCount;
+            declaredSitemapUrlCount = loaded.declaredSitemapUrlCount;
+            sitemapChildTotal = loaded.sitemapChildTotal;
+            sitemapChildFailed = loaded.sitemapChildFailed;
+            scrapePlan = loaded.scrapePlan;
+        }
+        catch (err) {
+            // Only the watchdog abort is salvageable. An external abort (ctrl-C /
+            // parent timeout) or any other error is fatal — re-throw it untouched so
+            // --no-backpressure and ctrl-C behaviour are unchanged.
+            if (err instanceof OriginDegradedError) {
+                // Prefer the canonical backpressureError message (same object the
+                // monitor raised); fall back to the caught error if somehow distinct.
+                if (!salvageBackpressure()) {
+                    truncated = true;
+                    truncatedReason = err.message;
+                    truncatedKind = "backpressure";
+                }
+                // Recover whatever was fetched before the abort. The sink is the same
+                // array loadPagesFromSource was pushing into, so it holds the partial
+                // page set even though the function never reached its `return`.
+                loadedPagesRaw = pageSink;
+                // No sitemap/discovery context survives a mid-sitemap abort; the
+                // downstream classifier falls back to the loaded page URLs.
+                sitemapUrlSet = undefined;
+                sitemapLastmodByUrl = undefined;
+                discoveredUrlCount = undefined;
+                declaredSitemapUrlCount = undefined;
+                sitemapChildTotal = undefined;
+                sitemapChildFailed = undefined;
+                scrapePlan = undefined;
+            }
+            else {
+                throw err;
+            }
+        }
     }
+    // Pages we successfully FETCHED (HTTP 2xx) from discovery — before content-type
+    // and policy filtering, and before sampling. This is the right denominator for
+    // the coverage guardrail: noindex / non-HTML pages were still *reached* (they
+    // count), intentional sampling happens later (doesn't count against us), and
+    // only genuinely-unreachable URLs (4xx/5xx) are missing from it.
+    const fetchedCount = loadedPagesRaw.length;
     // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
     // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
     // (kept for back-compat with --since consumers); T7 will carry their prior
@@ -1889,7 +2145,10 @@ export async function auditSource(source, options) {
             skippedByContentType.push(p.url);
         }
     }
-    loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
+    // Replace contents in place without `splice(0, n, ...big)` — that spread hits
+    // the V8 argument-count cap on large corpora (the same problem `pushAll` sidesteps).
+    loadedPages.length = 0;
+    pushAll(loadedPages, htmlOnlyPages);
     if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
         console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
     }
@@ -2047,9 +2306,21 @@ export async function auditSource(source, options) {
     const guardedClassification = applyDegenerationGuard(computedClassification, corpusStatsFromPages(parsedPages));
     // `--strict` (or AuditOptions.strict) keeps the classification but forces
     // every rule to run regardless of detected site type.
+    //
+    // A backpressure abort BEFORE classification salvages only a fragment of the
+    // crawl (`truncated` is already set here; the coverage guardrail runs later).
+    // Classifying that fragment — e.g. the 1 page left after the watchdog aborts a
+    // cold-start origin — as `small-marketing` and suppressing the pSEO rules off
+    // it is exactly what produced the confident false "READY" on a 5,600-page
+    // site. When the run was truncated pre-classification we genuinely could not
+    // determine the site type: force `unclear` (confidence 0, no suppression,
+    // neutral scoring) so nothing masks the incompleteness.
+    const classificationUnreliable = truncated;
     const siteClassification = options?.strict
         ? { ...guardedClassification, suppressedRules: [] }
-        : guardedClassification;
+        : classificationUnreliable
+            ? { ...guardedClassification, type: "unclear", confidence: 0, suppressedRules: [] }
+            : guardedClassification;
     const suppressedRuleSet = new Set(siteClassification.suppressedRules);
     // Classify pages into groups and run only enabled rules per group
     const classified = classifyPages(parsedPages, options?.pageGroups);
@@ -2076,29 +2347,29 @@ export async function auditSource(source, options) {
     // Site-wide rules (run once, outside group loop)
     if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
         const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
-        allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+        pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         if (robotsTxtContent) {
             const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
-            allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
     }
     // AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
     // and robots-compliance); page-group rule lists govern per-page AEO rules only.
     const llmsFindings = await llmsTxtRule(source, { timeoutMs });
-    allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+    pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
     if (robotsTxtContent) {
         const crawlerFindings = crawlerAccessRule(robotsTxtContent);
-        allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+        pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
     }
     // Data source comparison rules
     if (options?.dataSource?.records && options.dataSource.records.length > 0) {
         if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
             const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
-            allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
         if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
             const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
-            allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
     }
     for (const [groupName, groupPages] of classified) {
@@ -2118,7 +2389,7 @@ export async function auditSource(source, options) {
         // because the nav paths between locale-specific currency-converter URLs
         // were not in the pinned set).
         isSampledAudit || hasPinnedUrlsEarly);
-        allFindings.push(...findings);
+        pushAll(allFindings, findings);
         groupPageCounts[groupName] = groupPages.length;
         // v0.4.3: per-group scoring uses the same site-classification profile so
         // group-level risk numbers reflect the same severity / confidence remaps
@@ -2136,7 +2407,7 @@ export async function auditSource(source, options) {
             (auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
         if (isValueAddEnabled) {
             const valueAddFindings = valueAddRule(parsedPages, allFindings);
-            allFindings.push(...valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
     }
     // Enrich findings: cluster pairwise, detect templates, assign effort
@@ -2260,6 +2531,71 @@ export async function auditSource(source, options) {
             ? [...parsedPages.map((p) => p.url)].sort()
             : undefined,
     };
+    // Partial-report flag (`summary.truncated`, assigned after the guardrails below):
+    // either the backpressure watchdog aborted mid-crawl and we salvaged the pages
+    // fetched so far, or a coverage guardrail fired. Consumers MUST treat coverage as a
+    // lower bound (counts/verdict are partial). Complete runs keep `truncated` absent.
+    // ── Coverage guardrails (#4) ─────────────────────────────────────────────
+    // A sitemap was found at discovery, so we know roughly how large the site is.
+    // Two independent under-coverage signals, each reusing the `truncated`
+    // partial-coverage surface (CLI/Action/MCP/web already flag it) tagged
+    // `truncatedKind: "coverage"` so consumers can tell it apart from a
+    // backpressure abort. Backpressure (set during the crawl) takes precedence.
+    if (!truncated && sitemapChildFailed && sitemapChildFailed > 0) {
+        // (A) Extraction-side: a sitemap INDEX referenced child sitemaps we could
+        // not fetch/parse (404, non-sitemap, or beyond the depth cap). The declared
+        // URL list is itself incomplete — the "unreachable child sitemaps" case a
+        // urls-only count can never see (and the original false-negative class).
+        truncated = true;
+        truncatedKind = "coverage";
+        truncatedReason =
+            `${sitemapChildFailed} of ${sitemapChildTotal} child sitemaps referenced by the sitemap index could not be ` +
+                `fetched or parsed — both the declared URL count and this audit are incomplete, so the verdict is not ` +
+                `representative of the full site. Check that every child sitemap is reachable (HTTP 200, valid XML).`;
+        // eslint-disable-next-line no-console
+        console.error(`pseolint: ${truncatedReason}`);
+    }
+    if (!truncated && declaredSitemapUrlCount && declaredSitemapUrlCount >= 20) {
+        // (B) Audit-side: the sitemap declared N URLs but we FETCHED far fewer than
+        // we intended to. Compare against `fetchedCount` (pages actually fetched,
+        // pre-filter/pre-sample) so legitimately-skipped pages (noindex, non-HTML)
+        // and intentional sampling do NOT register as a shortfall. `intended` is
+        // bounded by every deliberate limit — an explicit sample, the crawl cap, and
+        // the declared total — so none of them false-fire.
+        const sampleCap = sampleSize > 0 ? sampleSize : Number.POSITIVE_INFINITY;
+        const crawlCap = maxCrawlDiscovered > 0 ? maxCrawlDiscovered : Number.POSITIVE_INFINITY;
+        const intended = Math.min(sampleCap, crawlCap, declaredSitemapUrlCount);
+        const floor = Math.max(20, Math.floor(intended * 0.05));
+        // `intended >= 20`: only judge representativeness when we actually meant to
+        // audit a substantial slice. A deliberately tiny sample/crawl cap (intended
+        // < 20) is the operator's choice, not under-discovery — don't flag it (and
+        // it would otherwise trip the absolute floor of 20).
+        if (intended >= 20 && fetchedCount < floor) {
+            const unreached = Math.max(0, declaredSitemapUrlCount - fetchedCount);
+            const ratio = fetchedCount / declaredSitemapUrlCount;
+            const pct = (ratio * 100).toFixed(ratio < 0.01 ? 2 : 1);
+            truncated = true;
+            truncatedKind = "coverage";
+            truncatedReason =
+                `Fetched ${fetchedCount} of ~${declaredSitemapUrlCount} sitemap-declared URLs (~${pct}% coverage); ` +
+                    `~${unreached} could not be retrieved (4xx/5xx, redirects, or robots-blocked). The verdict covers only the ` +
+                    `pages reached and is not representative — check for a stale sitemap or unreachable pages, or raise crawl limits.`;
+            // eslint-disable-next-line no-console
+            console.error(`pseolint: ${truncatedReason}`);
+        }
+    }
+    if (truncated) {
+        summary.truncated = true;
+        summary.truncatedReason = truncatedReason;
+        if (truncatedKind)
+            summary.truncatedKind = truncatedKind;
+        // A truncated run is incomplete — never present it as a clean green. Floor
+        // the verdict to at least "caution" so the headline matches the partial-
+        // coverage banner instead of the false "READY ✓" over a salvaged fragment.
+        // ("ready" is the only rung below "caution"; every other verdict is already at or above it.)
+        if (summary.verdict === "ready")
+            summary.verdict = "caution";
+    }
     if (cacheConfig) {
         summary.cacheStats = cacheStats;
     }