npm - @pseolint/core - Versions diffs - 0.3.2 → 0.4.1 - Mend

@pseolint/core 0.3.2 → 0.4.1

This diff compares the contents of two publicly available versions of the package, as released to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in that registry.

Files changed (80)

package/README.md +49 -1
package/dist/ai/triage.d.ts.map +1 -1
package/dist/ai/triage.js +8 -1
package/dist/ai/triage.js.map +1 -1
package/dist/auditor.d.ts.map +1 -1
package/dist/auditor.js +566 -136
package/dist/auditor.js.map +1 -1
package/dist/backpressure.d.ts +68 -0
package/dist/backpressure.d.ts.map +1 -0
package/dist/backpressure.js +81 -0
package/dist/backpressure.js.map +1 -0
package/dist/cache.d.ts +73 -0
package/dist/cache.d.ts.map +1 -1
package/dist/cache.js +258 -19
package/dist/cache.js.map +1 -1
package/dist/enrich-findings.d.ts.map +1 -1
package/dist/enrich-findings.js +1 -14
package/dist/enrich-findings.js.map +1 -1
package/dist/fetch-observer.d.ts +97 -0
package/dist/fetch-observer.d.ts.map +1 -0
package/dist/fetch-observer.js +124 -0
package/dist/fetch-observer.js.map +1 -0
package/dist/formatters/console.d.ts +7 -9
package/dist/formatters/console.d.ts.map +1 -1
package/dist/formatters/console.js +218 -254
package/dist/formatters/console.js.map +1 -1
package/dist/formatters/html.d.ts +5 -1
package/dist/formatters/html.d.ts.map +1 -1
package/dist/formatters/html.js +352 -570
package/dist/formatters/html.js.map +1 -1
package/dist/formatters/index.d.ts +4 -1
package/dist/formatters/index.d.ts.map +1 -1
package/dist/formatters/index.js +1 -1
package/dist/formatters/index.js.map +1 -1
package/dist/formatters/json.d.ts +11 -1
package/dist/formatters/json.d.ts.map +1 -1
package/dist/formatters/json.js +5 -1
package/dist/formatters/json.js.map +1 -1
package/dist/formatters/markdown.d.ts +7 -1
package/dist/formatters/markdown.d.ts.map +1 -1
package/dist/formatters/markdown.js +77 -70
package/dist/formatters/markdown.js.map +1 -1
package/dist/index.d.ts +13 -8
package/dist/index.d.ts.map +1 -1
package/dist/index.js +6 -7
package/dist/index.js.map +1 -1
package/dist/page-filter.d.ts +50 -0
package/dist/page-filter.d.ts.map +1 -0
package/dist/page-filter.js +86 -0
package/dist/page-filter.js.map +1 -0
package/dist/rule-references.d.ts.map +1 -1
package/dist/rule-references.js +0 -6
package/dist/rule-references.js.map +1 -1
package/dist/rules/content/unique-value.d.ts.map +1 -1
package/dist/rules/content/unique-value.js +1 -0
package/dist/rules/content/unique-value.js.map +1 -1
package/dist/rules/scope.d.ts.map +1 -1
package/dist/rules/scope.js +6 -14
package/dist/rules/scope.js.map +1 -1
package/dist/rules/tech/robots-sitemap-presence.d.ts +9 -1
package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
package/dist/rules/tech/robots-sitemap-presence.js +14 -5
package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
package/dist/safe-mode-preset.d.ts +27 -0
package/dist/safe-mode-preset.d.ts.map +1 -0
package/dist/safe-mode-preset.js +54 -0
package/dist/safe-mode-preset.js.map +1 -0
package/dist/site-classifier.d.ts +83 -0
package/dist/site-classifier.d.ts.map +1 -0
package/dist/site-classifier.js +205 -0
package/dist/site-classifier.js.map +1 -0
package/dist/ssrf-guard.d.ts +96 -0
package/dist/ssrf-guard.d.ts.map +1 -0
package/dist/ssrf-guard.js +268 -0
package/dist/ssrf-guard.js.map +1 -0
package/dist/types.d.ts +202 -19
package/dist/types.d.ts.map +1 -1
package/dist/types.js +2 -1
package/dist/types.js.map +1 -1
package/package.json +2 -2

package/dist/auditor.js (changed)

@@ -2,9 +2,9 @@ import { createHash } from "node:crypto";
 import { readdir, readFile, stat } from "node:fs/promises";
 import { extname, join, resolve } from "node:path";
 import { parseHtmlPage } from "./parser.js";
+import { pageSkipReason } from "./page-filter.js";
 import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
 import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
-import { headingUniquenessRule } from "./rules/content/heading-uniqueness.js";
 import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
 import { missingAuthorRule } from "./rules/content/missing-author.js";
 import { uniqueValueRule } from "./rules/content/unique-value.js";
@@ -18,12 +18,10 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
 import { deadEndsRule } from "./rules/links/dead-ends.js";
 import { linkDepthRule } from "./rules/links/link-depth.js";
 import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
-import { hubPagesRule } from "./rules/links/hub-pages.js";
 import { orphanPagesRule } from "./rules/links/orphan-pages.js";
 import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
 import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
 import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
-import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
 import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
 import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
 import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
@@ -33,7 +31,6 @@ import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
 import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
 import { answerFirstRule } from "./rules/aeo/answer-first.js";
 import { citableFactsRule } from "./rules/aeo/citable-facts.js";
-import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
 import { contentModularityRule } from "./rules/aeo/content-modularity.js";
 import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
 import { redirectChainRule } from "./rules/tech/redirect-chain.js";
@@ -41,8 +38,6 @@ import { soft404Rule } from "./rules/tech/soft-404.js";
 import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
 import { requiredFieldsRule } from "./rules/schema/required-fields.js";
 import { schemaConsistencyRule } from "./rules/schema/consistency.js";
-import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
-import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
 import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
 import { templateCoverageRule } from "./rules/spam/template-coverage.js";
 import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
@@ -54,8 +49,14 @@ import { triageFindings } from "./ai/triage.js";
 import { createLanguageModel } from "./ai/adapters/index.js";
 import { promptTriageFeedback } from "./ai/feedback-prompt.js";
 import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
-import { cachedFetch } from "./cache.js";
+import { SCHEMA_VERSION } from "./types.js";
+import { cachedFetch, pruneCache } from "./cache.js";
+import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
+import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
+import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
+import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
 import { stratifiedSample } from "./stratified-sample.js";
+import { classifySite } from "./site-classifier.js";
 import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
 const DEFAULTS = {
     nearDuplicateThreshold: 0.85,
@@ -67,10 +68,6 @@ const DEFAULTS = {
     uniqueValueMinWords: 100,
     metaUniquenessMinJaccard: 0.9,
     linkDepthMaxClicks: 3,
-    hubPagesMinSiblings: 4,
-    hubPagesMaxSiblings: 50,
-    titleOverlapThreshold: 0.8,
-    keywordCollisionMinShared: 6,
     templateCoverageMinPages: 5,
     answerFirstMaxWords: 100,
     citableFactsMin: 3,
@@ -80,18 +77,82 @@ const DEFAULTS = {
     modularityMinSelfContainedRatio: 0.7,
     faqMinQuestionHeadings: 2
 };
+/**
+ * v0.4 four-category weights. Audit is diagnostic-only (weight 0).
+ * See 2026-04-29 v0.4 redesign spec §4.2.
+ */
 const CATEGORY_WEIGHTS = {
-    spam: 0.33,
-    content: 0.19,
-    aeo: 0.14,
-    links: 0.11,
-    tech: 0.07,
-    data: 0.06,
-    schema: 0.05,
-    cannibal: 0.05,
-    /** Dedup / crawl hygiene; does not affect composite score. */
-    audit: 0
+    integrity: 0.50, // spam + content + cannibal
+    discoverability: 0.20, // links + tech
+    citation: 0.25, // aeo + schema
+    data: 0.05, // data
+    audit: 0, // diagnostics, never weighted
 };
+/**
+ * Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
+ * Used by `scoreFromFindings` to bucket findings without changing rule IDs.
+ */
+const CATEGORY_MAP = {
+    spam: "integrity",
+    content: "integrity",
+    cannibal: "integrity",
+    links: "discoverability",
+    tech: "discoverability",
+    aeo: "citation",
+    schema: "citation",
+    data: "data",
+    audit: "audit",
+};
+/** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
+const RULE_DOCS_SLUG = {
+// intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
+};
+function docsUrlFor(ruleId) {
+    const slug = RULE_DOCS_SLUG[ruleId] ?? ruleId.split("/").pop() ?? ruleId;
+    return `https://pseolint.dev/rules/${slug}`;
+}
+/** Verdict ladder thresholds — see spec §4.4. */
+function verdictForRisk(risk) {
+    if (risk <= 20)
+        return "ready";
+    if (risk <= 40)
+        return "caution";
+    if (risk <= 60)
+        return "concerning";
+    return "critical";
+}
+function gradeForPenalty(penalty) {
+    if (penalty <= 20)
+        return "A";
+    if (penalty <= 40)
+        return "B";
+    if (penalty <= 60)
+        return "C";
+    if (penalty <= 80)
+        return "D";
+    return "F";
+}
+/** True for `text/html` and `application/xhtml+xml` only (treat as audit-eligible content). */
+function isHtmlContentType(contentType) {
+    if (!contentType)
+        return true; // Local files / unknown — assume HTML.
+    const lower = contentType.toLowerCase();
+    return lower.includes("text/html") || lower.includes("application/xhtml+xml");
+}
+/** Glob match against a URL pathname only (not the full URL). v0.4 spec §4.5. */
+function globMatchPathname(pattern, urlOrPath) {
+    let pathname;
+    try {
+        pathname = new URL(urlOrPath).pathname;
+    }
+    catch {
+        // Not a URL — treat as already-a-path. Force a leading slash for consistency.
+        pathname = urlOrPath.startsWith("/") ? urlOrPath : `/${urlOrPath}`;
+    }
+    // Allow patterns that don't begin with "/" by normalising both sides.
+    const normPattern = pattern.startsWith("/") || pattern.startsWith("*") ? pattern : `/${pattern}`;
+    return matchGlob(normPattern, pathname) || matchGlob(pattern, pathname);
+}
 const DEFAULT_ENTITY_PATTERNS = [
     {
         placeholder: "[STATE]",
@@ -112,7 +173,16 @@ function resolveGroupRules(baseRules, overrides) {
     }
     return result;
 }
-function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
+function runRulesOnPages(pages,
+/**
+ * Full set of parsed pages including those filtered out by `respectNoindex`
+ * / `skipDetectedAuth`. Defaults to `pages` for backwards compat. The two
+ * noindex-conflict rules (`tech/canonical-noindex-conflict`,
+ * `tech/robots-noindex-conflict`) read this list specifically — without it,
+ * `respectNoindex: true` would hide noindex'd pages from the very rules
+ * designed to flag accidental noindex'ing.
+ */
+noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
     const findings = [];
     const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
     const tag = (results) => results.map((r) => {
@@ -156,9 +226,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
     if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
         findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
     }
-    if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
-        findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
-    }
     if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
         findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
     }
@@ -183,18 +250,15 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
     if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
         findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
     }
-    if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
-        findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
-    }
     // Tech rules
     if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
         findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
     }
     if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
-        findings.push(...tag(canonicalNoindexConflictRule(pages, normalizeUrlOptions)));
+        findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
     }
     if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
-        findings.push(...tag(robotsNoindexConflictRule(pages, inbound)));
+        findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
     }
     if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
         findings.push(...tag(redirectChainRule(pages)));
@@ -202,11 +266,10 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
     if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
         findings.push(...tag(soft404Rule(pages)));
     }
-    if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
-        findings.push(...tag(ogCompletenessRule(pages)));
-    }
     if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
-        findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
+        // hreflang declarations on noindex'd pages are still bugs when they're
+        // inconsistent — see auditor.test.ts "emits technical SEO findings".
+        findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
     }
     // Schema rules
     if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
@@ -240,9 +303,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
             targetFactsPerPage: resolvedRules.citableFactsTarget,
         })));
     }
-    if (isEnabled("aeo/non-replicable-value")) {
-        findings.push(...tag(nonReplicableValueRule(pages)));
-    }
     if (isEnabled("aeo/content-modularity")) {
         findings.push(...tag(contentModularityRule(pages, {
             maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
@@ -252,13 +312,9 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
     if (isEnabled("aeo/summary-bait")) {
         findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
     }
-    // Cannibal rules
-    if (isEnabled("cannibal/title-overlap") && modeOk("cannibal/title-overlap")) {
-        findings.push(...tag(titleOverlapRule(pages, entityPatterns, resolvedRules.titleOverlapThreshold)));
-    }
-    if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
-        findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
-    }
+    // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
+    // keyword-collision dropped due to high false-positive rates; see
+    // 2026-04-29 v0.4 redesign spec §4.3).
     if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
         findings.push(...tag(urlPatternRule(pages)));
     }
@@ -267,54 +323,110 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
 function hashHtml(html) {
     return createHash("sha256").update(html, "utf8").digest("hex");
 }
+const SEVERITY_WEIGHTS = {
+    critical: 40,
+    error: 25,
+    warning: 12,
+    info: 5,
+};
 function scoreFromFindings(findings) {
-    const severityWeights = {
-        critical: 40,
-        error: 25,
-        warning: 12,
-        info: 5
+    // v0.4 four-bucket raw penalties.
+    const bucketRaw = {
+        integrity: 0,
+        discoverability: 0,
+        citation: 0,
+        data: 0,
+        audit: 0,
     };
-    const raw = {
-        spam: 0,
-        content: 0,
-        aeo: 0,
-        links: 0,
-        tech: 0,
+    const bucketIssues = {
+        integrity: 0,
+        discoverability: 0,
+        citation: 0,
         data: 0,
-        schema: 0,
-        cannibal: 0,
-        audit: 0
+        audit: 0,
     };
+    let blockers = 0;
+    let shouldFix = 0;
+    let informational = 0;
     for (const finding of findings) {
-        const category = finding.ruleId.split("/")[0];
-        if (!(category in raw)) {
+        const namespace = finding.ruleId.split("/")[0];
+        const bucket = CATEGORY_MAP[namespace];
+        if (!bucket)
             continue;
+        const weight = SEVERITY_WEIGHTS[finding.severity];
+        // v0.4 buckets.
+        bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weight);
+        if (bucket !== "audit") {
+            bucketIssues[bucket] += 1;
         }
-        raw[category] = Math.min(100, raw[category] + severityWeights[finding.severity]);
-    }
-    const weighted = raw.spam * CATEGORY_WEIGHTS.spam +
-        raw.content * CATEGORY_WEIGHTS.content +
-        raw.aeo * CATEGORY_WEIGHTS.aeo +
-        raw.links * CATEGORY_WEIGHTS.links +
-        raw.tech * CATEGORY_WEIGHTS.tech +
-        raw.data * CATEGORY_WEIGHTS.data +
-        raw.schema * CATEGORY_WEIGHTS.schema +
-        raw.cannibal * CATEGORY_WEIGHTS.cannibal +
-        raw.audit * CATEGORY_WEIGHTS.audit;
+        // Issue-bucket counts (audit/* findings are diagnostic-only and excluded).
+        if (bucket === "audit")
+            continue;
+        if (finding.severity === "critical" || finding.severity === "error")
+            blockers += 1;
+        else if (finding.severity === "warning")
+            shouldFix += 1;
+        else
+            informational += 1;
+    }
+    const weighted = bucketRaw.integrity * CATEGORY_WEIGHTS.integrity +
+        bucketRaw.discoverability * CATEGORY_WEIGHTS.discoverability +
+        bucketRaw.citation * CATEGORY_WEIGHTS.citation +
+        bucketRaw.data * CATEGORY_WEIGHTS.data;
+    const risk = Math.round(Math.min(100, weighted));
+    const categories = {
+        integrity: { grade: gradeForPenalty(bucketRaw.integrity), issues: bucketIssues.integrity },
+        discoverability: { grade: gradeForPenalty(bucketRaw.discoverability), issues: bucketIssues.discoverability },
+        citation: { grade: gradeForPenalty(bucketRaw.citation), issues: bucketIssues.citation },
+        data: { grade: gradeForPenalty(bucketRaw.data), issues: bucketIssues.data },
+        audit: { grade: "A", issues: 0 },
+    };
     return {
-        score: Math.round(Math.min(100, weighted)),
-        categoryScores: {
-            spam: raw.spam,
-            content: raw.content,
-            aeo: raw.aeo,
-            links: raw.links,
-            tech: raw.tech,
-            data: raw.data,
-            schema: raw.schema,
-            cannibal: raw.cannibal
-        }
+        risk,
+        categories,
+        bucketCounts: { blockers, shouldFix, informational },
     };
 }
+function bucketIssues(findings) {
+    const blockers = [];
+    const shouldFix = [];
+    const informational = [];
+    for (const f of findings) {
+        // audit/* findings are diagnostics and never appear in issue buckets.
+        if (f.ruleId.startsWith("audit/"))
+            continue;
+        if (f.severity === "critical" || f.severity === "error")
+            blockers.push(f);
+        else if (f.severity === "warning")
+            shouldFix.push(f);
+        else
+            informational.push(f);
+    }
+    return { blockers, shouldFix, informational };
+}
+function buildHeadline(counts) {
+    const parts = [];
+    if (counts.blockers > 0) {
+        parts.push(`${counts.blockers} ship-blocker${counts.blockers === 1 ? "" : "s"}`);
+    }
+    if (counts.shouldFix > 0) {
+        parts.push(`${counts.shouldFix} should-fix`);
+    }
+    if (counts.informational > 0 && parts.length < 2) {
+        parts.push(`${counts.informational} informational`);
+    }
+    if (parts.length === 0)
+        return "No issues detected.";
+    return parts.join(", ");
+}
+/** Populate `docsUrl` on every finding that doesn't already have one. */
+function withDocsUrls(findings) {
+    for (const f of findings) {
+        if (!f.docsUrl)
+            f.docsUrl = docsUrlFor(f.ruleId);
+    }
+    return findings;
+}
 async function collectHtmlFiles(directory) {
     const entries = await readdir(directory, { withFileTypes: true });
     const files = await Promise.all(entries.map(async (entry) => {
@@ -330,10 +442,30 @@ async function collectHtmlFiles(directory) {
     }));
     return files.flat();
 }
-async function fetchWithRetry(url, timeoutMs, cache, stats) {
+/**
+ * Combine up to N AbortSignals into one. The returned signal aborts as soon
+ * as any input aborts. Avoids the node-only `AbortSignal.any` for wider
+ * compatibility and keeps listeners weak-ish (one per input, no unbounded
+ * listener growth).
+ */
+function composeSignals(...signals) {
+    const actual = signals.filter((s) => Boolean(s));
+    if (actual.length === 0)
+        return new AbortController().signal;
+    const ac = new AbortController();
+    for (const s of actual) {
+        if (s.aborted) {
+            ac.abort(s.reason);
+            return ac.signal;
+        }
+        s.addEventListener("abort", () => ac.abort(s.reason), { once: true });
+    }
+    return ac.signal;
+}
+async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
     try {
         stats.total += 1;
-        const r = await cachedFetch(url, { timeoutMs, cache });
+        const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
         if (r.fromCache) {
             stats.hits += 1;
             stats.bytesSavedEstimate += r.body.length;
@@ -342,14 +474,16 @@ async function fetchWithRetry(url, timeoutMs, cache, stats) {
             return null;
         return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
     }
-    catch {
+    catch (err) {
+        if (signal?.aborted)
+            throw err; // propagate cancellation
         return null;
     }
 }
-async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
+async function fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects = true) {
     try {
         stats.total += 1;
-        const r = await cachedFetch(url, { timeoutMs, cache });
+        const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, followRedirects, onObservation: stats.onObservation });
         if (r.fromCache) {
             stats.hits += 1;
             stats.bytesSavedEstimate += r.body.length;
@@ -366,13 +500,15 @@ async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
             },
         };
     }
-    catch {
+    catch (err) {
+        if (signal?.aborted)
+            throw err;
         return null;
     }
 }
-async function fetchTextStrict(url, timeoutMs, cache, stats) {
+async function fetchTextStrict(url, timeoutMs, cache, stats, signal, validateHop) {
     stats.total += 1;
-    const r = await cachedFetch(url, { timeoutMs, cache });
+    const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
     if (r.fromCache) {
         stats.hits += 1;
         stats.bytesSavedEstimate += r.body.length;
@@ -455,8 +591,13 @@ function matchGlob(pattern, value) {
 function shouldIgnore(url, patterns) {
     if (patterns.length === 0)
         return false;
+    // v0.4 §4.5: globs match against the URL pathname only, NOT the full URL.
+    // Operator intuition: `ignore: ["dashboard/**"]` should match
+    // `https://example.com/dashboard/...` even though the full URL contains the
+    // host. Previously globs matched the full URL and silently failed for users
+    // who didn't think to write `**/dashboard/**`.
     for (const pattern of patterns) {
-        if (matchGlob(pattern, url))
+        if (globMatchPathname(pattern, url))
             return true;
     }
     return false;
@@ -469,7 +610,7 @@ function fisherYatesSample(items, n) {
     }
     return arr.slice(arr.length - n);
 }
-async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
+async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
     visited.add(sitemapUrl);
     const locs = parseSitemapUrls(sitemapText);
     if (!isSitemapIndex(sitemapText)) {
@@ -477,27 +618,32 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
     }
     const allUrls = [];
     for (const childUrl of locs) {
+        if (signal?.aborted)
+            throw signal.reason ?? new Error("aborted");
         if (visited.has(childUrl))
             continue;
-        const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
+        const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
         if (!child)
             continue;
         const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
         if (!childLike)
             continue;
-        const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
+        const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
         allUrls.push(...childUrls);
     }
     return allUrls;
 }
-async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
+async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
     if (!origin)
         return { disallow: [], crawlDelaySec: 0 };
     try {
         const robotsUrl = `${origin}/robots.txt`;
-        const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
+        const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
+        // Honor both the wildcard block AND any block specifically targeting us.
+        // A malicious target can't bypass our crawler by adding a targeted
+        // `User-agent: pseolint / Disallow: /` without a wildcard.
         return {
-            disallow: parseDisallowPatterns(fetched.text),
+            disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
             crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
         };
     }
@@ -518,13 +664,42 @@ function isDisallowedByRobots(urlPath, patterns) {
 function budgetExceeded(b) {
     return b.cap > 0 && b.used >= b.cap;
 }
-async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
+async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
+    // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
+    // audit (source, sitemap entries, redirects, discovered links) goes through
+    // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
+    // one origin does 1 DNS lookup, not 4k.
+    const ssrfCache = new Map();
+    const validateHop = guardSsrf
+        ? async (u) => {
+            let host;
+            try {
+                host = new URL(u).hostname;
+            }
+            catch {
+                throw new Error(`Refusing to fetch invalid URL: ${u}`);
+            }
+            let pending = ssrfCache.get(host);
+            if (!pending) {
+                pending = validateTargetHost(host).catch((err) => {
+                    if (err instanceof SSRFError) {
+                        throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
+                    }
+                    throw err;
+                });
+                ssrfCache.set(host, pending);
+            }
+            await pending;
+        }
+        : undefined;
     if (/^https?:\/\//i.test(source)) {
+        if (validateHop)
+            await validateHop(source);
         let text;
         let contentType;
         let sourceStatus = 200;
         try {
-            const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
+            const fetched = await fetchTextStrict(source, timeoutMs, cache, stats, signal, validateHop);
             text = fetched.text;
             contentType = fetched.contentType;
         }
@@ -533,7 +708,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
             if (source.includes("sitemap")) {
                 try {
                     const origin = new URL(source).origin;
-                    const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
+                    const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats, signal, validateHop);
                     text = fallback.text;
                     contentType = fallback.contentType;
                     sourceStatus = -1; // flag that we fell back
@@ -549,7 +724,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
         const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
         if (isXml) {
             const visited = new Set();
-            const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
+            const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
             // If we have a budget, sample from sitemap URLs before fetching
             const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
                 ? fisherYatesSample(allSitemapUrls, discoveryBudget)
@@ -562,13 +737,29 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
             catch {
                 return "";
             } })();
-            const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
+            const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
             const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
             const delayMs = robots.crawlDelaySec * 1000;
             await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
                 if (budgetExceeded(byteBudget))
                     return;
-                const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
+                // Honor robots.txt for our own crawl when respectRobotsTxt is on (default).
+                // The existing robotsComplianceRule flags sitemap-vs-robots conflicts as
+                // findings; this actually refuses to fetch the disallowed URL. Keeps us
+                // legally defensible (we are a bot, our UA `pseolint` is public, and we
+                // respect Disallow directives) and removes the "crawler-for-hire" abuse
+                // vector when the library is invoked from a hosted service.
+                if (respectRobotsTxt) {
+                    try {
+                        const p = new URL(url).pathname;
+                        if (isDisallowedByRobots(p, robots.disallow)) {
+                            skippedByRobots.push(url);
+                            return;
+                        }
+                    }
+                    catch { /* URL parse failed — fall through, fetch will fail naturally */ }
+                }
+                const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
                 if (result) {
                     byteBudget.used += result.html.length;
                     pages.push(result);
@@ -587,9 +778,16 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                 const discoveredUrls = new Set();
                 // robots already fetched above; reuse its Disallow patterns here.
                 const disallowPatterns = robots.disallow;
-                for (const page of pages) {
+                let discoveryCeilingReached = false;
+                outer: for (const page of pages) {
                     const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
                     for (const match of linkMatches) {
+                        if (discoveredUrls.size >= maxCrawlDiscovered) {
+                            // Hard ceiling — don't let a malicious site with many self-links
+                            // extend crawl discovery up to the byte budget.
+                            discoveryCeilingReached = true;
+                            break outer;
+                        }
                         const href = match[1];
                         if (!href || href.startsWith("#") || /^mailto:|^tel:|^javascript:|^data:/i.test(href))
                             continue;
@@ -614,6 +812,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                         }
                     }
                 }
+                if (discoveryCeilingReached) {
+                    // eslint-disable-next-line no-console
+                    console.error(`pseolint: crawl discovery hit maxCrawlDiscovered=${maxCrawlDiscovered} ceiling; sampling from the first ${discoveredUrls.size} URLs.`);
+                }
                 if (discoveredUrls.size > 0) {
                     const candidates = Array.from(discoveredUrls);
                     // Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
@@ -623,7 +825,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                     await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
                         if (budgetExceeded(byteBudget))
                             return;
-                        const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
+                        const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
                         if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
                             byteBudget.used += result.html.length;
                             pages.push(result);
@@ -700,7 +902,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
                     }
                     const newPages = [];
                     await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
-                        const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
+                        const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
                         if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
                             newPages.push(result);
                             knownCrawled.add(url);
@@ -744,10 +946,65 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
 export async function auditSource(source, options) {
     const runId = generateRunId();
     const runStartedAt = Date.now();
-    const concurrency = options?.concurrency ?? 5;
+    // Apply safeMode preset first, then let explicit options override it. Using
+    // `??` preserves the "not set" vs "explicitly false" distinction — a user
+    // who picks safeMode="saas" but passes `guardSsrf: false` gets the explicit
+    // override. Localhost sources auto-promote to the `dev` preset unless the
+    // caller explicitly set `safeMode` or passed `autoDevPreset: false`.
+    const presetKey = resolveSafeModeKey(source, options);
+    const preset = SAFE_MODE_PRESETS[presetKey];
+    const concurrency = options?.concurrency ?? preset.concurrency ?? 5;
     const timeoutMs = options?.timeout ?? 30000;
     const ignorePatterns = options?.ignore ?? [];
-    const sampleSize = options?.sampleSize ?? 0;
+    const respectNoindex = options?.respectNoindex ?? true;
+    const skipDetectedAuth = options?.skipDetectedAuth ?? false;
+    const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
+    const externalSignal = options?.signal;
+    const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
+    const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
+    const followRedirects = options?.followRedirects ?? preset.followRedirects ?? true;
+    const maxCrawlDiscovered = options?.maxCrawlDiscovered ?? preset.maxCrawlDiscovered ?? 5000;
+    const skippedByRobots = [];
+    // Backpressure: watch TTFB + 5xx rate during the crawl and abort if the
+    // origin looks degraded. The audit signal is a composite of the caller's
+    // signal (ctrl-C, parent timeout) and the monitor's abort controller.
+    const backpressureEnabled = options?.backpressure !== false;
+    const backpressureAbort = new AbortController();
+    let backpressureError = null;
+    const signal = composeSignals(externalSignal, backpressureAbort.signal);
+    const observer = new FetchObserver();
+    const monitor = backpressureEnabled
+        ? new BackpressureMonitor({
+            warmupSize: 10,
+            absoluteP95Ms: 3000,
+            baselineMultiplier: 2,
+            errorRatioThreshold: 0.1,
+        })
+        : null;
+    // v0.4: framework gets set on the first observation that carries headers
+    // (the source URL fetch). Backpressure thresholds and computeReadiness use
+    // it to soften limits when auditing a dev server.
+    let detectedFramework = null;
+    const onObservation = (obs) => {
+        if (detectedFramework === null && obs.headers) {
+            detectedFramework = detectDevServer(obs.headers);
+        }
+        observer.record(obs);
+        if (!monitor)
+            return;
+        const decision = monitor.record(obs);
+        if (decision.shouldAbort && !backpressureError && decision.snapshot) {
+            backpressureError = new OriginDegradedError(decision.reason ?? "", decision.snapshot);
+            backpressureAbort.abort(backpressureError);
+        }
+    };
+    function throwIfAborted() {
+        if (backpressureError)
+            throw backpressureError;
+        if (externalSignal?.aborted) {
+            throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
+        }
+    }
     const resolvedRules = {
         nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
         entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
@@ -758,10 +1015,6 @@ export async function auditSource(source, options) {
         uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
         metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
         linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
-        hubPagesMinSiblings: options?.rules?.hubPagesMinSiblings ?? DEFAULTS.hubPagesMinSiblings,
-        hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
-        titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
-        keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
         templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
         answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
         citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
@@ -783,18 +1036,47 @@ export async function auditSource(source, options) {
     const discoveryBudget = options?.sampleSize && options.sampleSize > 0
         ? Math.max(50, options.sampleSize * 2)
         : 0;
-    const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
+    const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0, onObservation };
     const cacheConfig = options?.cache
         ? {
             dir: options.cache.dir ?? ".pseolint/cache",
             ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
         }
         : null;
+    // Size cap (post-audit eviction). Default 200 MB keeps pSEO-scale sites in check;
+    // a single full crawl of a 5k-page site averages ~250 KB per body = ~1.25 GB uncapped.
+    const cacheMaxBytes = options?.cache?.maxBytes ?? 209_715_200;
     const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
-    const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
+    const maxFetchBytes = options?.maxFetchBytes ?? preset.maxFetchBytes ?? 52_428_800;
     const fetchByteBudget = { used: 0, cap: maxFetchBytes };
-    const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget);
+    // v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
+    // of the normal source URL fetch. No separate probe needed.
+    const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered);
+    throwIfAborted();
     const loadedPages = [...loadedPagesRaw];
+    // v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
+    // response Content-Type is not HTML (text/html or application/xhtml+xml).
+    // Binary routes like /apple-icon, /opengraph-image, /icon get pushed to
+    // crawlStats.skipped instead of being parsed as thin-content pages.
+    const skippedByContentType = [];
+    const htmlOnlyPages = [];
+    for (const p of loadedPages) {
+        // httpMeta is set on URL fetches; locally-loaded files have no httpMeta
+        // and are always HTML by definition (collectHtmlFiles only picks .html).
+        // We don't have content-type on the LoadedPage object. Heuristic: if html
+        // body doesn't contain any HTML markers, treat as non-HTML.
+        if (!p.httpMeta) {
+            htmlOnlyPages.push(p);
+            continue;
+        }
+        if (looksLikeHtml(p.html)) {
+            htmlOnlyPages.push(p);
+        }
+        else {
+            skippedByContentType.push(p.url);
+        }
+    }
+    loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
     if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
         console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
     }
@@ -830,7 +1112,7 @@ export async function auditSource(source, options) {
     if (/^https?:\/\//i.test(source)) {
         try {
             const origin = new URL(source).origin;
-            const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
+            const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats, signal);
             if (result)
                 robotsTxtContent = result.text;
         }
@@ -872,13 +1154,27 @@ export async function auditSource(source, options) {
             })()
             : fisherYatesSample(filtered, sampleSize))
         : filtered;
-    const parsedPages = sampled.map((page) => {
+    const parsedPagesAll = sampled.map((page) => {
         const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
         if (page.httpMeta) {
             parsed.httpMeta = page.httpMeta;
         }
         return parsed;
     });
+    // v0.4.1 §page-filter: drop noindex'd pages and (when enabled) heuristically
+    // detected auth pages BEFORE rule evaluation. The site owner's noindex is a
+    // hard signal — they already opted out of SEO indexing, so auditing those
+    // URLs produces only noise. Auth detection is opt-in via skipDetectedAuth
+    // (off for the CLI by default; on for the hosted web form).
+    const skippedByPolicy = [];
+    const parsedPages = parsedPagesAll.filter((p) => {
+        const reason = pageSkipReason(p, { respectNoindex, skipDetectedAuth });
+        if (reason) {
+            skippedByPolicy.push({ url: p.url, reason });
+            return false;
+        }
+        return true;
+    });
     const knownUrls = new Set(parsedPages.map((p) => p.url));
     const rootUrl = parsedPages.find((p) => /(^|[\\/])index\.html?$/i.test(p.url))?.url ?? parsedPages[0]?.url ?? "";
     const adjacency = new Map();
@@ -918,11 +1214,50 @@ export async function auditSource(source, options) {
             }),
         ]
         : DEFAULT_ENTITY_PATTERNS;
+    // v0.4 §4.11 — pre-flight site classification. We compute this BEFORE the
+    // rule pipeline so the dispatcher can skip pSEO-only rules on small
+    // marketing sites / blogs. Classification is computed off the FULL
+    // discovered URL set (sitemap when available, else loaded URLs). This
+    // matters: a sampled crawl of a 5000-page directory must still classify
+    // as `programmatic-directory`, not `unclear`.
+    const classifierUrls = (() => {
+        if (sitemapUrlSet && sitemapUrlSet.size > 0) {
+            return Array.from(sitemapUrlSet);
+        }
+        return loadedPagesRaw.map((p) => p.url);
+    })();
+    const classifierFramework = detectedFramework ?? "unknown";
+    const computedClassification = classifySite({
+        urls: classifierUrls,
+        framework: classifierFramework,
+    });
+    // `--strict` (or AuditOptions.strict) keeps the classification but forces
+    // every rule to run regardless of detected site type.
+    const siteClassification = options?.strict
+        ? { ...computedClassification, suppressedRules: [] }
+        : computedClassification;
+    const suppressedRuleSet = new Set(siteClassification.suppressedRules);
     // Classify pages into groups and run only enabled rules per group
     const classified = classifyPages(parsedPages, options?.pageGroups);
     const allFindings = [...duplicateUrlFindings];
     const groupScores = {};
     const groupPageCounts = {};
+    // Surface robots-skipped URLs so users don't silently get a smaller audit
+    // than expected. One rollup finding (not per-URL) to avoid flooding the
+    // output on large sites. Also included on summary.skippedUrls below.
+    if (skippedByRobots.length > 0) {
+        allFindings.push({
+            ruleId: "audit/skipped-by-robots",
+            severity: "info",
+            message: `Skipped ${skippedByRobots.length} sitemap URL${skippedByRobots.length === 1 ? "" : "s"} because the target's robots.txt Disallow'd them: ${skippedByRobots.slice(0, 5).join(", ")}${skippedByRobots.length > 5 ? ", …" : ""}.`,
+            fix: "If you own this site and want to audit these URLs anyway, pass `respectRobotsTxt: false` (or remove the Disallow directive).",
+            relatedUrls: skippedByRobots,
+        });
+    }
+    // v0.4 §4.4: origin readiness is now diagnostic-only. The previous
+    // `audit/origin-readiness` finding emission was retired — the structured
+    // ReadinessReport in `summary.diagnostics.originReadiness` is the canonical
+    // signal now (no double-counting in the issue buckets).
     const auditMode = options?.mode ?? "full";
     // Site-wide rules (run once, outside group loop)
     if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
@@ -959,39 +1294,120 @@ export async function auditSource(source, options) {
         if (groupConfig?.rules !== undefined && groupConfig.rules.length === 0)
             continue;
         const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
-        const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
-        const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
+        const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
+        const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
         allFindings.push(...findings);
         groupPageCounts[groupName] = groupPages.length;
-        const { score } = scoreFromFindings(findings);
-        groupScores[groupName] = score;
+        const { risk: groupRisk } = scoreFromFindings(findings);
+        groupScores[groupName] = groupRisk;
     }
+    throwIfAborted();
     // Enrich findings: cluster pairwise, detect templates, assign effort
     const enriched = enrichFindings(allFindings, parsedPages, {
         templateGenerated: options?.templateGenerated,
     });
-    const { score, categoryScores } = scoreFromFindings(enriched.findings);
+    // Populate docsUrl on every finding before they leave the engine.
+    withDocsUrls(enriched.findings);
+    const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings);
     const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
+    const issues = bucketIssues(enriched.findings);
+    const verdict = verdictForRisk(risk);
+    const headline = buildHeadline(bucketCounts);
+    // audit/* findings are diagnostic-only and never appear in summary.issues.
+    // Surface them under diagnostics so consumers (telemetry, debug UIs) can
+    // still see what was deduped or skipped.
+    const auditFindings = enriched.findings.filter((f) => f.ruleId.startsWith("audit/"));
+    const readinessReport = computeReadiness(observer.getAll(), { detectedFramework });
+    const crawlStats = {
+        discovered: discoveredUrlCount ?? loadedPagesRaw.length,
+        fetched: parsedPages.length,
+        skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
+    };
     const summary = {
-        score,
-        categoryScores,
+        schemaVersion: SCHEMA_VERSION,
+        verdict,
+        risk,
+        headline,
+        categories,
+        issues,
+        siteClassification,
+        diagnostics: {
+            originReadiness: readinessReport,
+            crawlStats,
+            auditFindings,
+        },
         groupScores: options?.pageGroups ? groupScores : undefined,
         groupPageCounts: options?.pageGroups ? groupPageCounts : undefined,
         pageCount: auditedPageCount || parsedPages.length,
-        findings: enriched.findings,
         templateDetected: enriched.templateDetected,
         rawFindingCount: enriched.rawFindingCount,
     };
     if (cacheConfig) {
         summary.cacheStats = cacheStats;
     }
-    if (skippedUrls.length > 0) {
-        summary.skippedUrls = skippedUrls;
+    // v0.4 §4.5 / v0.4.1: warn when ignore patterns matched zero discovered URLs.
+    //   - Per-pattern warning fires only when `warnUnmatchedIgnore` is true
+    //     (set by the CLI when `--ignore` was passed explicitly). Quiet by
+    //     default for config-loaded patterns where broad safety lists like
+    //     `**/dashboard/**` legitimately don't match small marketing sites.
+    //   - When ALL patterns matched zero (strongest typo signal, e.g. user
+    //     wrote `*.json` instead of `**/*.json`), emit a single consolidated
+    //     warning regardless of source.
+    if (ignorePatterns.length > 0) {
+        const unmatched = ignorePatterns.filter((pattern) => !deduped.some((p) => globMatchPathname(pattern, p.url)));
+        if (unmatched.length === ignorePatterns.length) {
+            // eslint-disable-next-line no-console
+            console.warn(`[pseolint] none of the ${ignorePatterns.length} ignore pattern${ignorePatterns.length === 1 ? "" : "s"} matched any URLs — check config or --ignore for typos`);
+        }
+        else if (options?.warnUnmatchedIgnore === true) {
+            for (const pattern of unmatched) {
+                // eslint-disable-next-line no-console
+                console.warn(`[pseolint] ignore pattern '${pattern}' matched 0 URLs — likely typo`);
+            }
+        }
     }
+    // Merge state-skipped (unchanged since last run), robots-skipped (target
+    // robots.txt Disallow'd), and policy-skipped (noindex / detected-auth) URLs
+    // so callers have a single audit-skipped surface.
+    const allSkipped = [
+        ...skippedUrls,
+        ...skippedByRobots,
+        ...skippedByPolicy.map((s) => s.url),
+    ];
+    if (allSkipped.length > 0) {
+        summary.skippedUrls = allSkipped;
+    }
+    // v0.4.1: surface noindex / auth skips as a discoverable diagnostic so the
+    // user sees what the engine excluded. Catches the accidental-noindex bug:
+    // pages silently dropped from indexing show up as a visible skip line
+    // instead of being absent without explanation.
+    if (skippedByPolicy.length > 0) {
+        const noindexCount = skippedByPolicy.filter((s) => s.reason === "noindex").length;
+        const authCount = skippedByPolicy.filter((s) => s.reason === "auth-detected").length;
+        const sample = skippedByPolicy.slice(0, 5).map((s) => `${s.url} (${s.reason})`).join(", ");
+        const more = skippedByPolicy.length > 5 ? `, +${skippedByPolicy.length - 5} more` : "";
+        const parts = [];
+        if (noindexCount > 0)
+            parts.push(`${noindexCount} marked noindex`);
+        if (authCount > 0)
+            parts.push(`${authCount} detected as auth (login/register/etc)`);
+        auditFindings.push({
+            ruleId: "audit/skipped-by-policy",
+            severity: "info",
+            message: `Skipped ${skippedByPolicy.length} page${skippedByPolicy.length === 1 ? "" : "s"} from rule evaluation — ${parts.join(", ")}. First few: ${sample}${more}.`,
+            relatedUrls: skippedByPolicy.map((s) => s.url),
+        });
+    }
+    // Local flat view of every finding the engine produced, used internally for
+    // state persistence, regression detection, AI triage input, and telemetry
+    // counts. NOT exposed on the AuditSummary — consumers must use
+    // `summary.issues.{blockers,shouldFix,informational}` and
+    // `summary.diagnostics.auditFindings`.
+    const enrichedFindings = enriched.findings;
     if (priorState && options?.state?.exitOnRegression) {
         let hasRegression = false;
         const currentFindings = new Map();
-        for (const f of summary.findings) {
+        for (const f of enrichedFindings) {
             if (!f.pageUrl)
                 continue;
             const set = currentFindings.get(f.pageUrl) ?? new Set();
@@ -1019,7 +1435,7 @@ export async function auditSource(source, options) {
         const renderMode = options.render ? "rendered" : "static";
         const urls = {};
         const findingsByUrl = new Map();
-        for (const f of summary.findings) {
+        for (const f of enrichedFindings) {
             if (!f.pageUrl)
                 continue;
             const list = findingsByUrl.get(f.pageUrl) ?? [];
@@ -1051,9 +1467,10 @@ export async function auditSource(source, options) {
             renderMode,
             urls,
             summary: {
-                score: summary.score,
-                totalFindings: summary.findings.length,
-                byCategory: Object.fromEntries(Object.entries(summary.categoryScores).map(([k, v]) => [k, v])),
+                score: summary.risk,
+                totalFindings: enrichedFindings.length,
+                byCategory: Object.fromEntries(Object.entries(summary.categories)
+                    .map(([k, v]) => [k, v.issues])),
             },
         };
         await writeState(statePath, newState);
@@ -1089,7 +1506,8 @@ export async function auditSource(source, options) {
                     spentTodayUsd = 0;
                 }
             }
-            const outcome = await triageFindings(summary.findings, summary.pageCount, {
+            throwIfAborted();
+            const outcome = await triageFindings(enrichedFindings, summary.pageCount, {
                 enabled: true,
                 model: resolved.model,
                 providerId: resolved.providerId,
@@ -1124,9 +1542,9 @@ export async function auditSource(source, options) {
             runId,
             timestamp: new Date().toISOString(),
             durationMs: Date.now() - runStartedAt,
-            score: summary.score,
+            score: summary.risk,
             pageCount: summary.pageCount,
-            findingCount: summary.findings.length,
+            findingCount: enrichedFindings.length,
             ...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
             ...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
             ...(summary.cacheStats && { cacheStats: summary.cacheStats }),
@@ -1181,7 +1599,19 @@ export async function auditSource(source, options) {
     }
     const aiHintEnabled = options?.ai?.suggest !== false;
     if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
-        console.error(`💡 AI triage available — re-run with --ai to prioritize ${summary.findings.length} findings into a fix list.`);
+        console.error(`💡 AI triage available — re-run with --ai to prioritize ${enrichedFindings.length} findings into a fix list.`);
+    }
+    if (cacheConfig && cacheMaxBytes > 0) {
+        try {
+            const pruneResult = await pruneCache(cacheConfig.dir, cacheMaxBytes);
+            if (pruneResult.removedEntries > 0 || pruneResult.removedTmpFiles > 0) {
+                const freedMb = ((pruneResult.before.bytes - pruneResult.after.bytes) / 1024 / 1024).toFixed(1);
+                console.error(`pseolint: cache prune freed ${freedMb} MB (${pruneResult.removedEntries} entries, ${pruneResult.removedTmpFiles} .tmp files); size=${(pruneResult.after.bytes / 1024 / 1024).toFixed(1)}MB / cap=${(cacheMaxBytes / 1024 / 1024).toFixed(0)}MB`);
+            }
+        }
+        catch {
+            // Non-fatal: eviction failure must not break the audit.
+        }
     }
     return summary;
 }