npm - @pseolint/core - Versions diffs - 0.7.2 → 0.7.3 - Mend

@pseolint/core 0.7.2 → 0.7.3

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (72)

package/LICENSE +21 -21
package/README.md +27 -4
package/dist/algorithms/content-effort/cache.d.ts +5 -0
package/dist/algorithms/content-effort/cache.d.ts.map +1 -0
package/dist/algorithms/content-effort/cache.js +23 -0
package/dist/algorithms/content-effort/cache.js.map +1 -0
package/dist/algorithms/content-effort/index.d.ts +4 -0
package/dist/algorithms/content-effort/index.d.ts.map +1 -0
package/dist/algorithms/content-effort/index.js +4 -0
package/dist/algorithms/content-effort/index.js.map +1 -0
package/dist/algorithms/content-effort/judge.d.ts +36 -0
package/dist/algorithms/content-effort/judge.d.ts.map +1 -0
package/dist/algorithms/content-effort/judge.js +69 -0
package/dist/algorithms/content-effort/judge.js.map +1 -0
package/dist/algorithms/content-effort/schema.d.ts +13 -0
package/dist/algorithms/content-effort/schema.d.ts.map +1 -0
package/dist/algorithms/content-effort/schema.js +20 -0
package/dist/algorithms/content-effort/schema.js.map +1 -0
package/dist/auditor.d.ts +18 -1
package/dist/auditor.d.ts.map +1 -1
package/dist/auditor.js +155 -16
package/dist/auditor.js.map +1 -1
package/dist/cache.d.ts.map +1 -1
package/dist/cache.js +18 -3
package/dist/cache.js.map +1 -1
package/dist/formatters/template-cards.js +32 -32
package/dist/framework-detect.d.ts +6 -0
package/dist/framework-detect.d.ts.map +1 -0
package/dist/framework-detect.js +22 -0
package/dist/framework-detect.js.map +1 -0
package/dist/rule-references.d.ts.map +1 -1
package/dist/rule-references.js +1 -0
package/dist/rule-references.js.map +1 -1
package/dist/rules/content/unique-value.d.ts +2 -2
package/dist/rules/content/unique-value.d.ts.map +1 -1
package/dist/rules/content/unique-value.js +8 -2
package/dist/rules/content/unique-value.js.map +1 -1
package/dist/rules/scope.d.ts.map +1 -1
package/dist/rules/scope.js +1 -0
package/dist/rules/scope.js.map +1 -1
package/dist/rules/tech/csr-bailout.d.ts +8 -0
package/dist/rules/tech/csr-bailout.d.ts.map +1 -0
package/dist/rules/tech/csr-bailout.js +48 -0
package/dist/rules/tech/csr-bailout.js.map +1 -0
package/dist/rules/tech/soft-404.d.ts +6 -0
package/dist/rules/tech/soft-404.d.ts.map +1 -1
package/dist/rules/tech/soft-404.js +23 -0
package/dist/rules/tech/soft-404.js.map +1 -1
package/dist/types.d.ts +25 -0
package/dist/types.d.ts.map +1 -1
package/package.json +1 -1
package/schemas/audit-summary.schema.json +300 -300
package/dist/rules/aeo/non-replicable-value.d.ts +0 -9
package/dist/rules/aeo/non-replicable-value.d.ts.map +0 -1
package/dist/rules/aeo/non-replicable-value.js +0 -95
package/dist/rules/aeo/non-replicable-value.js.map +0 -1
package/dist/rules/cannibal/keyword-collision.d.ts +0 -3
package/dist/rules/cannibal/keyword-collision.d.ts.map +0 -1
package/dist/rules/cannibal/keyword-collision.js +0 -25
package/dist/rules/cannibal/keyword-collision.js.map +0 -1
package/dist/rules/cannibal/title-overlap.d.ts +0 -3
package/dist/rules/cannibal/title-overlap.d.ts.map +0 -1
package/dist/rules/cannibal/title-overlap.js +0 -43
package/dist/rules/cannibal/title-overlap.js.map +0 -1
package/dist/rules/content/heading-uniqueness.d.ts +0 -3
package/dist/rules/content/heading-uniqueness.d.ts.map +0 -1
package/dist/rules/content/heading-uniqueness.js +0 -56
package/dist/rules/content/heading-uniqueness.js.map +0 -1
package/dist/rules/links/hub-pages.d.ts +0 -7
package/dist/rules/links/hub-pages.d.ts.map +0 -1
package/dist/rules/links/hub-pages.js +0 -73
package/dist/rules/links/hub-pages.js.map +0 -1

package/dist/auditor.js CHANGED Viewed

@@ -1,7 +1,9 @@
 import { createHash } from "node:crypto";
 import { readdir, readFile, stat } from "node:fs/promises";
+import { tmpdir } from "node:os";
 import { extname, join, resolve } from "node:path";
 import { parseHtmlPage } from "./parser.js";
+import { renderPages } from "./renderer.js";
 import { pageSkipReason } from "./page-filter.js";
 import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
 import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
@@ -45,7 +47,8 @@ import { citableFactsRule } from "./rules/aeo/citable-facts.js";
 import { contentModularityRule } from "./rules/aeo/content-modularity.js";
 import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
 import { redirectChainRule } from "./rules/tech/redirect-chain.js";
-import { soft404Rule } from "./rules/tech/soft-404.js";
+import { soft404Rule, evaluateProbe } from "./rules/tech/soft-404.js";
+import { csrBailoutRule } from "./rules/tech/csr-bailout.js";
 import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
 import { requiredFieldsRule } from "./rules/schema/required-fields.js";
 import { schemaConsistencyRule } from "./rules/schema/consistency.js";
@@ -71,7 +74,7 @@ import { classifySite, applyDegenerationGuard, corpusStatsFromPages } from "./si
 import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
 import { CORE_RULESET_VERSION } from "./ruleset-version.js";
 import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
-import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring } from "./template-detection.js";
+import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring, LONGTAIL_SIGNATURE } from "./template-detection.js";
 import { scoreTemplates, siteVerdictFromTemplates } from "./per-template-scoring.js";
 import { deriveEntityPatterns } from "./algorithms/auto-entity-mask.js";
 import { CompositeAuthorityProvider } from "./algorithms/authority/provider.js";
@@ -139,6 +142,9 @@ const SCORING_PROFILES = {
             "aeo/citable-facts": "info",
             "aeo/answer-first": "info",
             "aeo/summary-bait": "warning",
+            // CSR-bailout on a small-marketing SPA is lower-stakes (a deliberately
+            // client-only marketing widget) — keep visible but don't tank the verdict.
+            "tech/csr-bailout": "info",
             // 2026-05-03 calibration round 5: Segment integrations had 24 thin
             // pages (200-300 words is correct for a catalog record). thin-content
             // contributing capped 40 impact pushed integrity to its 100 cap → 30
@@ -523,24 +529,40 @@ function verdictForRisk(risk) {
  * concerning → critical, critical → critical.
  */
 const VERDICT_LADDER = ["ready", "caution", "concerning", "critical"];
-function shiftVerdictForAuthority(verdict, authorityScore) {
-    if (authorityScore === undefined)
-        return verdict;
-    if (!Number.isFinite(authorityScore))
-        return verdict;
-    if (authorityScore < 0 || authorityScore > 100)
+/**
+ * Shared bounded bidirectional verdict moderator. A 0-100 `score` shifts the
+ * verdict along {@link VERDICT_LADDER} by at most `cap` tiers:
+ *   - `score >= lenientAt` → soften (toward "ready"), clamped at index 0.
+ *   - `score <= strictAt`  → escalate (toward "critical"), clamped at the top.
+ *   - in between (or absent) → no shift.
+ * Absent evidence is a no-op: `undefined`/`null`/non-finite/out-of-[0,100]
+ * `score` returns the verdict unchanged (so a null content-effort or an
+ * unavailable authority provider never moves the verdict). Authority and
+ * content-effort are both callers (see {@link shiftVerdictForAuthority}).
+ */
+export function shiftVerdict(verdict, o) {
+    if (o.score === undefined || o.score === null || !Number.isFinite(o.score) || o.score < 0 || o.score > 100) {
         return verdict;
+    }
     const idx = VERDICT_LADDER.indexOf(verdict);
     if (idx < 0)
         return verdict;
-    if (authorityScore >= 80) {
-        return VERDICT_LADDER[Math.max(0, idx - 1)];
-    }
-    if (authorityScore <= 30) {
-        return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + 1)];
-    }
+    if (o.score >= o.lenientAt)
+        return VERDICT_LADDER[Math.max(0, idx - o.cap)];
+    if (o.score <= o.strictAt)
+        return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + o.cap)];
     return verdict;
 }
+/** Authority keeps its exact ±1 / 80 / 30 behavior via the shared moderator. */
+function shiftVerdictForAuthority(verdict, authorityScore) {
+    return shiftVerdict(verdict, { score: authorityScore, lenientAt: 80, strictAt: 30, cap: 1 });
+}
+// content-effort moderation band — STARTING values; Task 7 tunes against the
+// ratchet. Derived from the gate data: reputable median effort ≈ 8.5, addressable
+// farms cluster ≤7, proprietary-data winners (numbeo/airbyte) ≈28.
+const EFFORT_STRICT_AT = 5; // very-low effort → escalate (farm cluster)
+const EFFORT_LENIENT_AT = 25; // high effort → soften (rescues proprietary-data winners e.g. numbeo)
+const EFFORT_CAP = 1;
 function gradeForPenalty(penalty) {
     if (penalty <= 20)
         return "A";
@@ -652,7 +674,7 @@ sampled = false) {
     }
     // Content rules
     if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
-        pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueDensity)));
+        pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueDensity, entityPatterns)));
     }
     if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
         pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
@@ -727,6 +749,10 @@ sampled = false) {
     if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
         pushAll(findings, tag(soft404Rule(pages)));
     }
+    if (isEnabled("tech/csr-bailout") && modeOk("tech/csr-bailout")) {
+        // No-op unless --render populated page.renderedHtml (the rule guards internally).
+        pushAll(findings, tag(csrBailoutRule(pages)));
+    }
     if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
         // hreflang declarations on noindex'd pages are still bugs when they're
         // inconsistent — see auditor.test.ts "emits technical SEO findings".
@@ -2238,6 +2264,31 @@ export async function auditSource(source, options) {
         }
         return parsed;
     });
+    // --render: execute each page in a headless browser and attach the
+    // post-hydration DOM so tech/csr-bailout can diff raw vs rendered. Opt-in,
+    // Node-only (fails under bun). Degrades gracefully when no browser is available.
+    if (options?.render) {
+        try {
+            const rendered = await renderPages(parsedPagesAll.map((p) => ({ url: p.url })), null, {
+                browserWsEndpoint: options.render.browserWsEndpoint,
+                concurrency,
+                timeoutMs: 30000,
+                analyticsMode: options.render.analyticsMode,
+                extraBlockedHosts: options.render.extraBlockedHosts,
+            });
+            const renderedByUrl = new Map(rendered.map((r) => [r.url, r.html]));
+            for (const p of parsedPagesAll) {
+                const html = renderedByUrl.get(p.url);
+                if (html)
+                    p.renderedHtml = html;
+            }
+        }
+        catch (err) {
+            // eslint-disable-next-line no-console
+            console.error(`pseolint: --render failed (${err instanceof Error ? err.message : String(err)}). ` +
+                `Continuing without rendered DOM; tech/csr-bailout will be skipped.`);
+        }
+    }
     // v0.4.1 §page-filter: drop noindex'd pages and (when enabled) heuristically
     // detected auth pages BEFORE rule evaluation. The site owner's noindex is a
     // hard signal — they already opted out of SEO indexing, so auditing those
@@ -2384,6 +2435,41 @@ export async function auditSource(source, options) {
         const crawlerFindings = crawlerAccessRule(robotsTxtContent);
         pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
     }
+    // tech/soft-404 synthetic probe: a URL we deliberately invent to be
+    // nonexistent should return 404/410. A 200 means the site soft-404s, letting
+    // crawlers index unlimited junk. Default-on for programmatic directories
+    // (clusters are well-formed there); one probe per cluster, capped, fail-open.
+    if (auditMode !== "diff" &&
+        siteClassification.type === "programmatic-directory" &&
+        isRuleEnabled("tech/soft-404", undefined)) {
+        const PROBE_CAP = 25;
+        const probeClusters = detectTemplates(parsedPages.map((p) => p.url))
+            .filter((c) => c.signature !== LONGTAIL_SIGNATURE && c.urls.length > 0)
+            .slice(0, PROBE_CAP);
+        for (const cluster of probeClusters) {
+            try {
+                const rep = new URL(cluster.urls[0]);
+                const token = Math.abs([...cluster.signature].reduce((h, ch) => ((h << 5) - h + ch.charCodeAt(0)) | 0, 0));
+                const originalPath = rep.pathname;
+                rep.pathname = originalPath.replace(/\/[^/]+\/?$/, `/pseolint-404-probe-${token}`);
+                if (rep.pathname === originalPath)
+                    continue; // couldn't synthesize an invalid URL
+                const probeUrl = rep.toString();
+                const res = await cachedFetch(probeUrl, {
+                    timeoutMs,
+                    cache: cacheConfig,
+                    signal,
+                    onObservation: cacheStats.onObservation,
+                });
+                const finding = evaluateProbe(probeUrl, res.status, res.body ?? "");
+                if (finding)
+                    allFindings.push({ ...finding, ref: RULE_REFERENCES[finding.ruleId] });
+            }
+            catch {
+                // fail-open: a probe network error never produces a finding.
+            }
+        }
+    }
     // Data source comparison rules
     if (options?.dataSource?.records && options.dataSource.records.length > 0) {
         if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
@@ -2536,7 +2622,57 @@ export async function auditSource(source, options) {
     // The `risk` score is intentionally unchanged — §15.1 governs verdict only.
     const legacyVerdict = shiftVerdictForAuthority(verdictForRisk(risk), resolvedAuthorityScore);
     const templateVerdict = siteVerdictFromTemplates(siteTemplates);
-    const verdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
+    const baseVerdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
+    // 2026-06-17 SP1 — opt-in content-effort moderation. Like authority, this
+    // shifts only the user-facing verdict (never `risk`), one tier in either
+    // direction, and is a strict no-op when the signal is absent (null/undefined).
+    // Resolution: an injected `contentEffortScore` (calibration/tests, offline)
+    // wins; otherwise the LLM judge runs only when `contentEffort.enabled`. Any
+    // failure (no key, network, abort) fails safe to `undefined` → no shift.
+    let resolvedEffort = options?.contentEffortScore;
+    if (resolvedEffort === undefined && options?.contentEffort?.enabled) {
+        try {
+            const { judgeContentEffort, makeLlmGenerate } = await import("./algorithms/content-effort/judge.js");
+            const { model, modelId } = await createLanguageModel({
+                model: options.contentEffort.model ?? "claude-sonnet-4-6",
+            });
+            // Reuse the audit's own parsed pages + template clustering: map each
+            // template's audited URLs back to their parsed contentText. When no
+            // template qualified (small/unclear sites), fall back to ONE synthetic
+            // site-wide template over every audited page — mirrors the validation
+            // runner's buildTemplates (scripts/content-effort-validate.ts).
+            const contentByUrl = new Map(parsedPages.map((p) => [p.url, p.contentText ?? ""]));
+            const toSamples = (urls) => urls
+                .filter((u) => contentByUrl.has(u))
+                .map((u) => ({ url: u, contentText: contentByUrl.get(u) ?? "" }));
+            let effortSamples = siteTemplates.length > 0
+                ? siteTemplates
+                    .map((t) => ({ signature: t.signature, samplePages: toSamples(t.auditedUrls) }))
+                    .filter((t) => t.samplePages.length > 0)
+                : [];
+            if (effortSamples.length === 0) {
+                const all = parsedPages.map((p) => ({ url: p.url, contentText: p.contentText ?? "" }));
+                effortSamples = all.length > 0 ? [{ signature: "site", samplePages: all }] : [];
+            }
+            const cacheDir = options.contentEffort.cacheDir ?? join(tmpdir(), "pseolint-content-effort");
+            const judged = await judgeContentEffort(effortSamples, {
+                modelId,
+                cacheDir,
+                generate: makeLlmGenerate(model, options.signal),
+                signal: options.signal,
+            });
+            resolvedEffort = judged.siteEffort;
+        }
+        catch {
+            resolvedEffort = undefined; // fail-safe: model/key unavailable → no moderation
+        }
+    }
+    const verdict = shiftVerdict(baseVerdict, {
+        score: resolvedEffort,
+        lenientAt: EFFORT_LENIENT_AT,
+        strictAt: EFFORT_STRICT_AT,
+        cap: EFFORT_CAP,
+    });
     const headline = buildHeadline(bucketCounts);
     // audit/* findings are diagnostic-only and never appear in summary.issues.
     // Surface them under diagnostics so consumers (telemetry, debug UIs) can
@@ -2576,6 +2712,9 @@ export async function auditSource(source, options) {
         ...(resolvedAuthorityScore !== undefined
             ? { authority: { score: resolvedAuthorityScore, domain: resolvedAuthorityDomain ?? "" } }
             : {}),
+        ...(resolvedEffort !== undefined && resolvedEffort !== null
+            ? { contentEffort: { score: resolvedEffort } }
+            : {}),
     };
     // Partial-report flag: the backpressure watchdog aborted mid-crawl and we
     // salvaged whatever pages had been fetched. Consumers MUST treat coverage as