npm - @pseolint/core - Versions diffs - 0.6.3 → 0.6.5 - Mend

@pseolint/core 0.6.3 → 0.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/README.md +64 -0
package/dist/ai/tools/fetch-sitemap.js +2 -1
package/dist/ai/tools/fetch-sitemap.js.map +1 -1
package/dist/auditor.d.ts +2 -1
package/dist/auditor.d.ts.map +1 -1
package/dist/auditor.js +324 -79
package/dist/auditor.js.map +1 -1
package/dist/enrich-findings.d.ts.map +1 -1
package/dist/enrich-findings.js +27 -5
package/dist/enrich-findings.js.map +1 -1
package/dist/parser.d.ts.map +1 -1
package/dist/parser.js +17 -1
package/dist/parser.js.map +1 -1
package/dist/rules/content/title-uniqueness.d.ts.map +1 -1
package/dist/rules/content/title-uniqueness.js +13 -0
package/dist/rules/content/title-uniqueness.js.map +1 -1
package/dist/site-classifier.d.ts.map +1 -1
package/dist/site-classifier.js +7 -1
package/dist/site-classifier.js.map +1 -1
package/dist/stratified-sample.js +2 -1
package/dist/stratified-sample.js.map +1 -1
package/dist/types.d.ts +47 -3
package/dist/types.d.ts.map +1 -1
package/dist/types.js +22 -2
package/dist/types.js.map +1 -1
package/package.json +5 -3
package/schemas/audit-summary.schema.json +295 -0

package/dist/auditor.js CHANGED

@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
 import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
 import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
 import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
-import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
+import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
 import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
 import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
 import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
@@ -107,6 +107,24 @@ const CATEGORY_MAP = {
     data: "data",
     audit: "audit",
 };
+/**
+ * Per-rule category overrides — take precedence over the namespace-level
+ * CATEGORY_MAP. A rule lands here when its namespace (chosen for code
+ * organisation) doesn't match the scoring bucket its *signal* belongs to.
+ *
+ * `links/host-section-divergence` lives in the links namespace because it reads
+ * the internal-link graph, but semantically it detects a spam-policy violation
+ * (Google's May 2024 site-reputation-abuse policy) — an INTEGRITY signal, not a
+ * discoverability one. Without this override it scored in the discoverability
+ * bucket (0.15 weight on programmatic-directory), so a confirmed parasite
+ * section moved the risk score by ~2pts despite registering as a blocker.
+ */
+const RULE_CATEGORY_OVERRIDES = {
+    "links/host-section-divergence": "integrity",
+};
+export function categoryForRule(ruleId) {
+    return RULE_CATEGORY_OVERRIDES[ruleId] ?? CATEGORY_MAP[ruleId.split("/")[0]];
+}
 const SCORING_PROFILES = {
     "small-marketing": {
         categoryWeights: { integrity: 0.30, discoverability: 0.40, citation: 0.20, data: 0.05, audit: 0 },
@@ -427,6 +445,13 @@ const RULE_IMPACTS = {
     "links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
     "links/cluster-connectivity": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
     "links/link-depth": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
+    // host-section-divergence is a reputation/integrity-grade signal that happens
+    // to live in the links namespace (it reads the link graph). It escalates to
+    // `error` and maps to manual-action risk, so it gets an explicit weight rather
+    // than inheriting DEFAULT_RULE_IMPACT (5/25), and is routed to the `integrity`
+    // bucket via RULE_CATEGORY_OVERRIDES so the score reflects the spam-policy
+    // severity rather than diluting into discoverability (0.15 weight).
+    "links/host-section-divergence": { baseImpact: 15, perInstance: 5, maxImpact: 45 },
     // AEO — much lower baselines than spam (AEO is opt-in optimization)
     "aeo/citable-facts": { baseImpact: 2, perInstance: 1, maxImpact: 25 },
     "aeo/answer-first": { baseImpact: 3, perInstance: 1, maxImpact: 25 },
@@ -590,157 +615,157 @@ sampled = false) {
     // Spam rules — always compute cross-page data, only push findings if enabled
     const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
     if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
-        findings.push(...tag(nearDuplicate.findings));
+        pushAll(findings, tag(nearDuplicate.findings));
     }
     const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
     if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
-        findings.push(...tag(entitySwap.findings));
+        pushAll(findings, tag(entitySwap.findings));
     }
     const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
     if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
-        findings.push(...tag(thinContent.findings));
+        pushAll(findings, tag(thinContent.findings));
     }
     if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
-        findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
+        pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
     }
     if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
-        findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
+        pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
     }
     if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
-        findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
+        pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
     }
     if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
-        findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
+        pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
     }
     if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
-        findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
+        pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
     }
     // Content rules
     if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
-        findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
+        pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
     }
     if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
-        findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
+        pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
     }
     if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
-        findings.push(...tag(missingAuthorRule(pages)));
+        pushAll(findings, tag(missingAuthorRule(pages)));
     }
     if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
-        findings.push(...tag(eeatSignalsRule(pages)));
+        pushAll(findings, tag(eeatSignalsRule(pages)));
     }
     // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
     // structure + image alt-text were tier-1 gaps in the blind-spot audit.
     if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
-        findings.push(...tag(titleUniquenessRule(pages)));
+        pushAll(findings, tag(titleUniquenessRule(pages)));
     }
     if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
-        findings.push(...tag(headingStructureRule(pages)));
+        pushAll(findings, tag(headingStructureRule(pages)));
     }
     if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
-        findings.push(...tag(imageAltTextRule(pages)));
+        pushAll(findings, tag(imageAltTextRule(pages)));
     }
     if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
-        findings.push(...tag(translationNoOpRule(pages)));
+        pushAll(findings, tag(translationNoOpRule(pages)));
     }
     if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
-        findings.push(...tag(regurgitatedContentRule(pages)));
+        pushAll(findings, tag(regurgitatedContentRule(pages)));
     }
     if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
-        findings.push(...tag(commonPhraseReuseRule(pages)));
+        pushAll(findings, tag(commonPhraseReuseRule(pages)));
     }
     if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
-        findings.push(...tag(wikipediaParaphraseRule(pages)));
+        pushAll(findings, tag(wikipediaParaphraseRule(pages)));
     }
     // Link rules — use the global link graph
     if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
-        findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
+        pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
     }
     if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
-        findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
+        pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
     }
     if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
         if (rootUrl) {
-            findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
+            pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
         }
     }
     if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
-        findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
+        pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
     }
     if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
-        findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
+        pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
     }
     // Tech rules
     if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
-        findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
+        pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
     }
     if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
-        findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
+        pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
     }
     if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
-        findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
+        pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
     }
     if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
-        findings.push(...tag(redirectChainRule(pages)));
+        pushAll(findings, tag(redirectChainRule(pages)));
     }
     if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
-        findings.push(...tag(soft404Rule(pages)));
+        pushAll(findings, tag(soft404Rule(pages)));
     }
     if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
         // hreflang declarations on noindex'd pages are still bugs when they're
         // inconsistent — see auditor.test.ts "emits technical SEO findings".
-        findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
+        pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
     }
     // 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
     // the v0.4.x README without ever shipping. Now it does.
     if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
-        findings.push(...tag(ogCompletenessRule(pages)));
+        pushAll(findings, tag(ogCompletenessRule(pages)));
     }
     // Schema rules
     if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
-        findings.push(...tag(jsonLdValidRule(pages)));
+        pushAll(findings, tag(jsonLdValidRule(pages)));
     }
     if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
-        findings.push(...tag(requiredFieldsRule(pages)));
+        pushAll(findings, tag(requiredFieldsRule(pages)));
     }
     if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
-        findings.push(...tag(schemaConsistencyRule(pages)));
+        pushAll(findings, tag(schemaConsistencyRule(pages)));
     }
     // AEO rules
     if (isEnabled("aeo/freshness-signals")) {
-        findings.push(...tag(freshnessSignalsRule(pages, {
+        pushAll(findings, tag(freshnessSignalsRule(pages, {
             maxStaleDays: resolvedRules.freshnessMaxStaleDays,
         })));
     }
     if (isEnabled("aeo/faq-coverage")) {
-        findings.push(...tag(faqCoverageRule(pages, {
+        pushAll(findings, tag(faqCoverageRule(pages, {
             minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
         })));
     }
     if (isEnabled("aeo/answer-first")) {
-        findings.push(...tag(answerFirstRule(pages, entityPatterns, {
+        pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
             maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
         })));
     }
     if (isEnabled("aeo/citable-facts")) {
-        findings.push(...tag(citableFactsRule(pages, entityPatterns, {
+        pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
             minFactsPerPage: resolvedRules.citableFactsMin,
             targetFactsPerPage: resolvedRules.citableFactsTarget,
         })));
     }
     if (isEnabled("aeo/content-modularity")) {
-        findings.push(...tag(contentModularityRule(pages, {
+        pushAll(findings, tag(contentModularityRule(pages, {
             maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
             minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
         })));
     }
     if (isEnabled("aeo/summary-bait")) {
-        findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
+        pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
     }
     // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
     // keyword-collision dropped due to high false-positive rates; see
     // 2026-04-29 v0.4 redesign spec §4.3).
     if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
-        findings.push(...tag(urlPatternRule(pages)));
+        pushAll(findings, tag(urlPatternRule(pages)));
     }
     return findings;
 }
@@ -830,8 +855,7 @@ function scoreFromFindings(findings, classification, pageCount = 0) {
     // Each group's weighted impact lands in its category bucket.
     const groups = new Map();
     for (const finding of findings) {
-        const namespace = finding.ruleId.split("/")[0];
-        const bucket = CATEGORY_MAP[namespace];
+        const bucket = categoryForRule(finding.ruleId);
         if (!bucket)
             continue;
         if (bucket !== "audit")
@@ -867,8 +891,7 @@ function scoreFromFindings(findings, classification, pageCount = 0) {
         integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
     };
     for (const [ruleId, group] of groups) {
-        const namespace = ruleId.split("/")[0];
-        const bucket = CATEGORY_MAP[namespace];
+        const bucket = categoryForRule(ruleId);
         if (!bucket || bucket === "audit")
             continue;
         const impactSpec = RULE_IMPACTS[ruleId] ?? DEFAULT_RULE_IMPACT;
@@ -980,6 +1003,20 @@ function withDocsUrls(findings) {
     }
     return findings;
 }
+/**
+ * Append every item of `items` to `target` in place. Use this instead of
+ * `target.push(...items)` whenever `items` can be large. The spread form passes
+ * each element as a separate call argument, and V8 caps argument count
+ * (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
+ * exceeded` on large inputs. A dense site makes the pairwise rules
+ * (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
+ * rule-aggregation push *before* enrichment was even reached. The loop has no
+ * such limit. See tests/integration/large-corpus-no-overflow.test.ts.
+ */
+function pushAll(target, items) {
+    for (const item of items)
+        target.push(item);
+}
 async function collectHtmlFiles(directory) {
     const entries = await readdir(directory, { withFileTypes: true });
     const files = await Promise.all(entries.map(async (entry) => {
@@ -1015,7 +1052,11 @@ function composeSignals(...signals) {
     }
     return ac.signal;
 }
-async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
+async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
+// Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
+// Guards against a hostile/misconfigured sitemap eating the whole byte budget
+// or memory. 0 / undefined = no cap.
+maxBytes) {
     try {
         stats.total += 1;
         const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
@@ -1025,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
         }
         if (r.status < 200 || r.status >= 300)
             return null;
+        if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
+            // eslint-disable-next-line no-console
+            console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
+            return null;
+        }
         return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
     }
     catch (err) {
@@ -1182,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
     }
     return arr.slice(arr.length - n);
 }
-async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
+/** sitemaps.org caps an uncompressed sitemap at 50 MB. */
+const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
+/**
+ * Max `<sitemapindex>` nesting depth we recurse through. The protocol only
+ * defines a single level of nesting, but some sites nest deeper; 5 is generous
+ * while still bounding work (and stack) on a hostile/misconfigured index that a
+ * `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
+ */
+const SITEMAP_MAX_DEPTH = 5;
+async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
     visited.add(sitemapUrl);
     const entries = parseSitemapUrlsWithLastmod(sitemapText);
     if (!isSitemapIndex(sitemapText)) {
@@ -1196,6 +1251,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
         }
         return { urls, lastmodByUrl };
     }
+    // It's a sitemap index. Stop recursing past the depth cap (the index itself
+    // carries no page URLs, only child-sitemap refs, so returning empty is safe).
+    if (depth >= maxDepth) {
+        // eslint-disable-next-line no-console
+        console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
+        return { urls: [], lastmodByUrl: new Map() };
+    }
     const allUrls = [];
     const allLastmodByUrl = new Map();
     for (const entry of entries) {
@@ -1204,14 +1266,14 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
             throw signal.reason ?? new Error("aborted");
         if (visited.has(childUrl))
             continue;
-        const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
+        const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
         if (!child)
             continue;
         const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
         if (!childLike)
             continue;
-        const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
-        allUrls.push(...childUrls);
+        const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
+        pushAll(allUrls, childUrls);
         for (const [u, lm] of childLastmodByUrl) {
             allLastmodByUrl.set(u, lm);
         }
@@ -1220,7 +1282,7 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
 }
 async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
     if (!origin)
-        return { disallow: [], crawlDelaySec: 0 };
+        return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
     try {
         const robotsUrl = `${origin}/robots.txt`;
         const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
@@ -1230,10 +1292,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
         return {
             disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
             crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
+            // `Sitemap:` directives carry absolute URLs (not origin-relative paths) and
+            // there can be several. Surfaced so discovery can read the site's declared
+            // sitemaps instead of guessing.
+            sitemaps: parseSitemapDirectives(fetched.text),
         };
     }
     catch {
-        return { disallow: [], crawlDelaySec: 0 };
+        return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
     }
 }
 function sleep(ms) {
@@ -1249,7 +1315,12 @@ function isDisallowedByRobots(urlPath, patterns) {
 function budgetExceeded(b) {
     return b.cap > 0 && b.used >= b.cap;
 }
-async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
+async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
+// Backpressure salvage: when provided, every page body that comes back is
+// pushed into this caller-owned array as it's fetched. If the watchdog aborts
+// mid-crawl and this function throws, the caller still holds the partial set
+// (the local `pages` array would otherwise be lost with the stack frame).
+pageSink) {
     // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
     // audit (source, sitemap entries, redirects, discovered links) goes through
     // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
@@ -1336,7 +1407,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
             else {
                 urlsToFetch = sampledUrls;
             }
-            const pages = [];
+            // Reuse the caller's salvage sink as the live page accumulator so a
+            // mid-crawl watchdog abort leaves the already-fetched pages visible to
+            // the caller. Falls back to a private array when no sink is passed.
+            const pages = pageSink ?? [];
             // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
             const sourceOrigin = (() => { try {
                 return new URL(source).origin;
@@ -1446,7 +1520,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
         }
         if (contentType.includes("html") || looksLikeHtml(text)) {
             const initialPage = { url: source, html: text };
-            const pages = [initialPage];
+            // See note above: reuse the caller's salvage sink so a watchdog abort
+            // during link-discovery crawling preserves the pages fetched so far.
+            const pages = pageSink ?? [];
+            pages.push(initialPage);
             if (crawlDiscovery) {
                 let sourceOrigin;
                 try {
@@ -1458,6 +1535,92 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                 const knownCrawled = new Set([source]);
                 const allDiscoveredUrls = new Set([source]);
                 const maxDepth = 3;
+                // Sitemap-first discovery (like Google). Before link-crawling, read the
+                // sitemap(s) the site declares — link-crawl only reaches *linked* pages,
+                // but a pSEO site's whole point is thousands of programmatic URLs that
+                // may be sparsely linked (or behind a build-frozen, under-linked nav).
+                // Sources of truth, in order:
+                //   1. `Sitemap:` directives in robots.txt (there can be several)
+                //   2. failing that, probe /sitemap.xml then /sitemap_index.xml
+                // Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
+                // link-crawl below then fills any remaining budget and dedups against
+                // them. When no sitemap exists, this is a no-op and we crawl as before.
+                if (sourceOrigin) {
+                    const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
+                    const probing = robotsForDiscovery.sitemaps.length === 0;
+                    const sitemapCandidates = probing
+                        ? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
+                        : robotsForDiscovery.sitemaps;
+                    const visitedSitemaps = new Set();
+                    const sitemapListedUrls = [];
+                    for (const candidate of sitemapCandidates) {
+                        if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
+                            break;
+                        if (visitedSitemaps.has(candidate))
+                            continue;
+                        let smText;
+                        let smType;
+                        try {
+                            if (validateHop)
+                                await validateHop(candidate);
+                            const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
+                            if (!fetched)
+                                continue;
+                            smText = fetched.text;
+                            smType = fetched.contentType;
+                        }
+                        catch {
+                            continue; // SSRF refusal, network error, etc. — skip this candidate
+                        }
+                        if (!(smType.includes("xml") || looksLikeSitemap(smText)))
+                            continue;
+                        const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
+                        pushAll(sitemapListedUrls, discoveredSmUrls);
+                        // When probing the conventional paths, stop at the first that hits.
+                        if (probing && discoveredSmUrls.length > 0)
+                            break;
+                    }
+                    // Same-origin + robots-aware filter, deduped against what we have.
+                    const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
+                        if (knownCrawled.has(u))
+                            return false;
+                        try {
+                            const parsed = new URL(u);
+                            if (parsed.origin !== sourceOrigin)
+                                return false;
+                            if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
+                                skippedByRobots.push(u);
+                                return false;
+                            }
+                            return true;
+                        }
+                        catch {
+                            return false;
+                        }
+                    });
+                    for (const u of seedUrls)
+                        allDiscoveredUrls.add(u);
+                    // Cap the seed fetch. With a sampling budget, fit under it; without one
+                    // (the default "audit everything" path) bound by maxCrawlDiscovered, the
+                    // same ceiling the link-crawl honors — otherwise a homepage audit of a
+                    // site with a 50k-URL sitemap would try to fetch all of them (the link
+                    // crawl never could, so this would be an unbounded-egress regression).
+                    const seedToFetch = discoveryBudget > 0
+                        ? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
+                        : seedUrls.slice(0, maxCrawlDiscovered);
+                    if (seedToFetch.length > 0) {
+                        await runWithConcurrency(seedToFetch, concurrency, async (url) => {
+                            if (budgetExceeded(byteBudget))
+                                return;
+                            const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
+                            knownCrawled.add(url);
+                            if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
+                                byteBudget.used += result.html.length;
+                                pages.push(result);
+                            }
+                        });
+                    }
+                }
                 for (let depth = 0; depth < maxDepth; depth += 1) {
                     // Stop if we've hit the discovery budget
                     if (discoveryBudget > 0 && pages.length >= discoveryBudget)
@@ -1519,7 +1682,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                             knownCrawled.add(url);
                         }
                     });
-                    pages.push(...newPages);
+                    pushAll(pages, newPages);
                     if (newPages.length === 0)
                         break;
                 }
@@ -1610,6 +1773,13 @@ export async function auditSource(source, options) {
     const backpressureEnabled = options?.backpressure !== false;
     const backpressureAbort = new AbortController();
     let backpressureError = null;
+    // Set once we've decided to salvage a partial report after a watchdog abort.
+    // From that point `throwIfAborted` must NOT re-throw the backpressure error —
+    // the watchdog already did its job (stopped fetching); the rest of the
+    // pipeline runs over the pages collected so far and the truncation is
+    // surfaced on the summary instead.
+    let truncated = false;
+    let truncatedReason;
     const signal = composeSignals(externalSignal, backpressureAbort.signal);
     const observer = new FetchObserver();
     // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
@@ -1651,12 +1821,32 @@ export async function auditSource(source, options) {
             backpressureAbort.abort(backpressureError);
         }
     };
+    // Commits the run to a partial ("salvaged") report after a watchdog abort:
+    // stores the abort message for summary.truncatedReason and marks the run
+    // truncated so `throwIfAborted` stops reacting to the backpressure error.
+    // Idempotent; returns false when no backpressure abort has been recorded.
+    function salvageBackpressure() {
+        if (backpressureError) {
+            truncated = true;
+            truncatedReason = backpressureError.message;
+        }
+        return Boolean(backpressureError);
+    }
     function throwIfAborted() {
-        if (backpressureError)
-            throw backpressureError;
+        // Abort checkpoint. Only an EXTERNAL abort (ctrl-C, parent timeout) throws:
+        // the caller asked to stop, not to degrade, so it is checked before salvage.
         if (externalSignal?.aborted) {
             throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
         }
+        // A backpressure abort never throws from here; it is salvaged. Once the
+        // run is committed to a partial report (`truncated`), this is a no-op and
+        // the pipeline finishes over the pages collected so far. Before that
+        // commit, the loader-boundary catch normally handles it; this guard covers
+        // the rare path where the loader returned normally (e.g. a fetch mock that
+        // ignores the abort signal) yet the watchdog still voted to abort.
+        if (backpressureError && !truncated) {
+            salvageBackpressure();
+        }
     }
     const resolvedRules = {
         nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
@@ -1811,13 +2001,26 @@ export async function auditSource(source, options) {
             }
             : undefined;
         const pinnedPages = [];
-        await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
-            const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
-            if (result) {
-                fetchByteBudget.used += result.html.length;
-                pinnedPages.push(result);
+        // Fetch every pinned URL with bounded concurrency; each truthy result is
+        // charged against the byte budget and kept in `pinnedPages`.
+        try {
+            await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
+                const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
+                if (result) {
+                    fetchByteBudget.used += result.html.length;
+                    pinnedPages.push(result);
+                }
+            });
+        }
+        catch (err) {
+            // Same salvage contract as the sitemap/crawl path: a watchdog abort
+            // mid-fetch keeps the pages already collected in `pinnedPages`. Any other
+            // error (external abort, SSRF rejection) is fatal — re-throw it.
+            if (err instanceof OriginDegradedError) {
+                // Prefer the recorded backpressureError message. If none was
+                // recorded, still flag the run as truncated from the caught error,
+                // so a partial pinned audit is never reported as a complete one.
+                if (!salvageBackpressure()) {
+                    truncated = true;
+                    truncatedReason = err.message;
+                }
             }
-        });
+            else {
+                throw err;
+            }
+        }
         loadedPagesRaw = pinnedPages;
         // No sitemap context in pinned mode
         sitemapUrlSet = undefined;
@@ -1826,12 +2029,46 @@ export async function auditSource(source, options) {
         scrapePlan = undefined;
     }
     else {
-        const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
-        loadedPagesRaw = loaded.pages;
-        sitemapUrlSet = loaded.sitemapUrls;
-        sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
-        discoveredUrlCount = loaded.discoveredUrlCount;
-        scrapePlan = loaded.scrapePlan;
+        // Salvage sink: loadPagesFromSource fills this incrementally as pages come
+        // back. If the backpressure watchdog aborts mid-crawl the call throws an
+        // OriginDegradedError and the function's own return value is lost — but the
+        // already-fetched pages survive here, so we recover them and continue the
+        // pipeline with a `truncated` flag instead of throwing the whole run away.
+        const pageSink = [];
+        try {
+            const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
+            loadedPagesRaw = loaded.pages;
+            sitemapUrlSet = loaded.sitemapUrls;
+            sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
+            discoveredUrlCount = loaded.discoveredUrlCount;
+            scrapePlan = loaded.scrapePlan;
+        }
+        catch (err) {
+            // Only the watchdog abort is salvageable. An external abort (ctrl-C /
+            // parent timeout) or any other error is fatal — re-throw it untouched so
+            // --no-backpressure and ctrl-C behaviour are unchanged.
+            if (err instanceof OriginDegradedError) {
+                // Prefer the canonical backpressureError message (same object the
+                // monitor raised); fall back to the caught error if somehow distinct.
+                if (!salvageBackpressure()) {
+                    truncated = true;
+                    truncatedReason = err.message;
+                }
+                // Recover whatever was fetched before the abort. The sink is the same
+                // array loadPagesFromSource was pushing into, so it holds the partial
+                // page set even though the function never reached its `return`.
+                loadedPagesRaw = pageSink;
+                // The loader threw before returning, so its sitemap/discovery context
+                // is lost; the downstream classifier falls back to the loaded page URLs.
+                sitemapUrlSet = undefined;
+                sitemapLastmodByUrl = undefined;
+                discoveredUrlCount = undefined;
+                scrapePlan = undefined;
+            }
+            else {
+                throw err;
+            }
+        }
     }
     // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
     // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
@@ -2053,29 +2290,29 @@ export async function auditSource(source, options) {
     // Site-wide rules (run once, outside group loop)
     if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
         const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
-        allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+        pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         if (robotsTxtContent) {
             const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
-            allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
     }
     // AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
     // and robots-compliance); page-group rule lists govern per-page AEO rules only.
     const llmsFindings = await llmsTxtRule(source, { timeoutMs });
-    allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+    pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
     if (robotsTxtContent) {
         const crawlerFindings = crawlerAccessRule(robotsTxtContent);
-        allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+        pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
     }
     // Data source comparison rules
     if (options?.dataSource?.records && options.dataSource.records.length > 0) {
         if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
             const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
-            allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
         if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
             const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
-            allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
     }
     for (const [groupName, groupPages] of classified) {
@@ -2095,7 +2332,7 @@ export async function auditSource(source, options) {
         // because the nav paths between locale-specific currency-converter URLs
         // were not in the pinned set).
         isSampledAudit || hasPinnedUrlsEarly);
-        allFindings.push(...findings);
+        pushAll(allFindings, findings);
         groupPageCounts[groupName] = groupPages.length;
         // v0.4.3: per-group scoring uses the same site-classification profile so
         // group-level risk numbers reflect the same severity / confidence remaps
@@ -2113,7 +2350,7 @@ export async function auditSource(source, options) {
             (auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
         if (isValueAddEnabled) {
             const valueAddFindings = valueAddRule(parsedPages, allFindings);
-            allFindings.push(...valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
+            pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
         }
     }
     // Enrich findings: cluster pairwise, detect templates, assign effort
@@ -2237,6 +2474,14 @@ export async function auditSource(source, options) {
             ? [...parsedPages.map((p) => p.url)].sort()
             : undefined,
     };
+    // Partial-report flag: the backpressure watchdog aborted mid-crawl and we
+    // salvaged whatever pages had been fetched. Consumers MUST treat coverage as
+    // a lower bound (counts/verdict are partial). Only set when actually
+    // truncated so complete runs keep `truncated` absent.
+    if (truncated) {
+        summary.truncated = true;
+        summary.truncatedReason = truncatedReason;
+    }
     if (cacheConfig) {
         summary.cacheStats = cacheStats;
     }