@pseolint/core 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +27 -4
- package/dist/algorithms/content-effort/cache.d.ts +5 -0
- package/dist/algorithms/content-effort/cache.d.ts.map +1 -0
- package/dist/algorithms/content-effort/cache.js +23 -0
- package/dist/algorithms/content-effort/cache.js.map +1 -0
- package/dist/algorithms/content-effort/index.d.ts +4 -0
- package/dist/algorithms/content-effort/index.d.ts.map +1 -0
- package/dist/algorithms/content-effort/index.js +4 -0
- package/dist/algorithms/content-effort/index.js.map +1 -0
- package/dist/algorithms/content-effort/judge.d.ts +36 -0
- package/dist/algorithms/content-effort/judge.d.ts.map +1 -0
- package/dist/algorithms/content-effort/judge.js +69 -0
- package/dist/algorithms/content-effort/judge.js.map +1 -0
- package/dist/algorithms/content-effort/schema.d.ts +13 -0
- package/dist/algorithms/content-effort/schema.d.ts.map +1 -0
- package/dist/algorithms/content-effort/schema.js +20 -0
- package/dist/algorithms/content-effort/schema.js.map +1 -0
- package/dist/auditor.d.ts +18 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +155 -16
- package/dist/auditor.js.map +1 -1
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +18 -3
- package/dist/cache.js.map +1 -1
- package/dist/formatters/template-cards.js +32 -32
- package/dist/framework-detect.d.ts +6 -0
- package/dist/framework-detect.d.ts.map +1 -0
- package/dist/framework-detect.js +22 -0
- package/dist/framework-detect.js.map +1 -0
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +1 -0
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/content/unique-value.d.ts +2 -2
- package/dist/rules/content/unique-value.d.ts.map +1 -1
- package/dist/rules/content/unique-value.js +8 -2
- package/dist/rules/content/unique-value.js.map +1 -1
- package/dist/rules/scope.d.ts.map +1 -1
- package/dist/rules/scope.js +1 -0
- package/dist/rules/scope.js.map +1 -1
- package/dist/rules/tech/csr-bailout.d.ts +8 -0
- package/dist/rules/tech/csr-bailout.d.ts.map +1 -0
- package/dist/rules/tech/csr-bailout.js +48 -0
- package/dist/rules/tech/csr-bailout.js.map +1 -0
- package/dist/rules/tech/soft-404.d.ts +6 -0
- package/dist/rules/tech/soft-404.d.ts.map +1 -1
- package/dist/rules/tech/soft-404.js +23 -0
- package/dist/rules/tech/soft-404.js.map +1 -1
- package/dist/types.d.ts +25 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/schemas/audit-summary.schema.json +300 -300
- package/dist/rules/aeo/non-replicable-value.d.ts +0 -9
- package/dist/rules/aeo/non-replicable-value.d.ts.map +0 -1
- package/dist/rules/aeo/non-replicable-value.js +0 -95
- package/dist/rules/aeo/non-replicable-value.js.map +0 -1
- package/dist/rules/cannibal/keyword-collision.d.ts +0 -3
- package/dist/rules/cannibal/keyword-collision.d.ts.map +0 -1
- package/dist/rules/cannibal/keyword-collision.js +0 -25
- package/dist/rules/cannibal/keyword-collision.js.map +0 -1
- package/dist/rules/cannibal/title-overlap.d.ts +0 -3
- package/dist/rules/cannibal/title-overlap.d.ts.map +0 -1
- package/dist/rules/cannibal/title-overlap.js +0 -43
- package/dist/rules/cannibal/title-overlap.js.map +0 -1
- package/dist/rules/content/heading-uniqueness.d.ts +0 -3
- package/dist/rules/content/heading-uniqueness.d.ts.map +0 -1
- package/dist/rules/content/heading-uniqueness.js +0 -56
- package/dist/rules/content/heading-uniqueness.js.map +0 -1
- package/dist/rules/links/hub-pages.d.ts +0 -7
- package/dist/rules/links/hub-pages.d.ts.map +0 -1
- package/dist/rules/links/hub-pages.js +0 -73
- package/dist/rules/links/hub-pages.js.map +0 -1
package/dist/auditor.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
2
|
import { readdir, readFile, stat } from "node:fs/promises";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
3
4
|
import { extname, join, resolve } from "node:path";
|
|
4
5
|
import { parseHtmlPage } from "./parser.js";
|
|
6
|
+
import { renderPages } from "./renderer.js";
|
|
5
7
|
import { pageSkipReason } from "./page-filter.js";
|
|
6
8
|
import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
|
|
7
9
|
import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
|
|
@@ -45,7 +47,8 @@ import { citableFactsRule } from "./rules/aeo/citable-facts.js";
|
|
|
45
47
|
import { contentModularityRule } from "./rules/aeo/content-modularity.js";
|
|
46
48
|
import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
|
|
47
49
|
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
48
|
-
import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
50
|
+
import { soft404Rule, evaluateProbe } from "./rules/tech/soft-404.js";
|
|
51
|
+
import { csrBailoutRule } from "./rules/tech/csr-bailout.js";
|
|
49
52
|
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
50
53
|
import { requiredFieldsRule } from "./rules/schema/required-fields.js";
|
|
51
54
|
import { schemaConsistencyRule } from "./rules/schema/consistency.js";
|
|
@@ -71,7 +74,7 @@ import { classifySite, applyDegenerationGuard, corpusStatsFromPages } from "./si
|
|
|
71
74
|
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
72
75
|
import { CORE_RULESET_VERSION } from "./ruleset-version.js";
|
|
73
76
|
import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
|
|
74
|
-
import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring } from "./template-detection.js";
|
|
77
|
+
import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring, LONGTAIL_SIGNATURE } from "./template-detection.js";
|
|
75
78
|
import { scoreTemplates, siteVerdictFromTemplates } from "./per-template-scoring.js";
|
|
76
79
|
import { deriveEntityPatterns } from "./algorithms/auto-entity-mask.js";
|
|
77
80
|
import { CompositeAuthorityProvider } from "./algorithms/authority/provider.js";
|
|
@@ -139,6 +142,9 @@ const SCORING_PROFILES = {
|
|
|
139
142
|
"aeo/citable-facts": "info",
|
|
140
143
|
"aeo/answer-first": "info",
|
|
141
144
|
"aeo/summary-bait": "warning",
|
|
145
|
+
// CSR-bailout on a small-marketing SPA is lower-stakes (a deliberately
|
|
146
|
+
// client-only marketing widget) — keep visible but don't tank the verdict.
|
|
147
|
+
"tech/csr-bailout": "info",
|
|
142
148
|
// 2026-05-03 calibration round 5: Segment integrations had 24 thin
|
|
143
149
|
// pages (200-300 words is correct for a catalog record). thin-content
|
|
144
150
|
// contributing capped 40 impact pushed integrity to its 100 cap → 30
|
|
@@ -523,24 +529,40 @@ function verdictForRisk(risk) {
|
|
|
523
529
|
* concerning → critical, critical → critical.
|
|
524
530
|
*/
|
|
525
531
|
const VERDICT_LADDER = ["ready", "caution", "concerning", "critical"];
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
+
/**
|
|
533
|
+
* Shared bounded bidirectional verdict moderator. A 0-100 `score` shifts the
|
|
534
|
+
* verdict along {@link VERDICT_LADDER} by at most `cap` tiers:
|
|
535
|
+
* - `score >= lenientAt` → soften (toward "ready"), clamped at index 0.
|
|
536
|
+
* - `score <= strictAt` → escalate (toward "critical"), clamped at the top.
|
|
537
|
+
* - in between (or absent) → no shift.
|
|
538
|
+
* Absent evidence is a no-op: `undefined`/`null`/non-finite/out-of-[0,100]
|
|
539
|
+
* `score` returns the verdict unchanged (so a null content-effort or an
|
|
540
|
+
* unavailable authority provider never moves the verdict). Authority and
|
|
541
|
+
* content-effort are both callers (see {@link shiftVerdictForAuthority}).
|
|
542
|
+
*/
|
|
543
|
+
export function shiftVerdict(verdict, o) {
|
|
544
|
+
if (o.score === undefined || o.score === null || !Number.isFinite(o.score) || o.score < 0 || o.score > 100) {
|
|
532
545
|
return verdict;
|
|
546
|
+
}
|
|
533
547
|
const idx = VERDICT_LADDER.indexOf(verdict);
|
|
534
548
|
if (idx < 0)
|
|
535
549
|
return verdict;
|
|
536
|
-
if (
|
|
537
|
-
return VERDICT_LADDER[Math.max(0, idx -
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + 1)];
|
|
541
|
-
}
|
|
550
|
+
if (o.score >= o.lenientAt)
|
|
551
|
+
return VERDICT_LADDER[Math.max(0, idx - o.cap)];
|
|
552
|
+
if (o.score <= o.strictAt)
|
|
553
|
+
return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + o.cap)];
|
|
542
554
|
return verdict;
|
|
543
555
|
}
|
|
556
|
+
/** Authority keeps its exact ±1 / 80 / 30 behavior via the shared moderator. */
|
|
557
|
+
function shiftVerdictForAuthority(verdict, authorityScore) {
|
|
558
|
+
return shiftVerdict(verdict, { score: authorityScore, lenientAt: 80, strictAt: 30, cap: 1 });
|
|
559
|
+
}
|
|
560
|
+
// content-effort moderation band — STARTING values; Task 7 tunes against the
|
|
561
|
+
// ratchet. Derived from the gate data: reputable median effort ≈ 8.5, addressable
|
|
562
|
+
// farms cluster ≤7, proprietary-data winners (numbeo/airbyte) ≈28.
|
|
563
|
+
const EFFORT_STRICT_AT = 5; // very-low effort → escalate (farm cluster)
|
|
564
|
+
const EFFORT_LENIENT_AT = 25; // high effort → soften (rescues proprietary-data winners e.g. numbeo)
|
|
565
|
+
const EFFORT_CAP = 1;
|
|
544
566
|
function gradeForPenalty(penalty) {
|
|
545
567
|
if (penalty <= 20)
|
|
546
568
|
return "A";
|
|
@@ -652,7 +674,7 @@ sampled = false) {
|
|
|
652
674
|
}
|
|
653
675
|
// Content rules
|
|
654
676
|
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
655
|
-
pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueDensity)));
|
|
677
|
+
pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueDensity, entityPatterns)));
|
|
656
678
|
}
|
|
657
679
|
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
658
680
|
pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
@@ -727,6 +749,10 @@ sampled = false) {
|
|
|
727
749
|
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
728
750
|
pushAll(findings, tag(soft404Rule(pages)));
|
|
729
751
|
}
|
|
752
|
+
if (isEnabled("tech/csr-bailout") && modeOk("tech/csr-bailout")) {
|
|
753
|
+
// No-op unless --render populated page.renderedHtml (the rule guards internally).
|
|
754
|
+
pushAll(findings, tag(csrBailoutRule(pages)));
|
|
755
|
+
}
|
|
730
756
|
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
731
757
|
// hreflang declarations on noindex'd pages are still bugs when they're
|
|
732
758
|
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
@@ -2238,6 +2264,31 @@ export async function auditSource(source, options) {
|
|
|
2238
2264
|
}
|
|
2239
2265
|
return parsed;
|
|
2240
2266
|
});
|
|
2267
|
+
// --render: execute each page in a headless browser and attach the
|
|
2268
|
+
// post-hydration DOM so tech/csr-bailout can diff raw vs rendered. Opt-in,
|
|
2269
|
+
// Node-only (fails under bun). Degrades gracefully when no browser is available.
|
|
2270
|
+
if (options?.render) {
|
|
2271
|
+
try {
|
|
2272
|
+
const rendered = await renderPages(parsedPagesAll.map((p) => ({ url: p.url })), null, {
|
|
2273
|
+
browserWsEndpoint: options.render.browserWsEndpoint,
|
|
2274
|
+
concurrency,
|
|
2275
|
+
timeoutMs: 30000,
|
|
2276
|
+
analyticsMode: options.render.analyticsMode,
|
|
2277
|
+
extraBlockedHosts: options.render.extraBlockedHosts,
|
|
2278
|
+
});
|
|
2279
|
+
const renderedByUrl = new Map(rendered.map((r) => [r.url, r.html]));
|
|
2280
|
+
for (const p of parsedPagesAll) {
|
|
2281
|
+
const html = renderedByUrl.get(p.url);
|
|
2282
|
+
if (html)
|
|
2283
|
+
p.renderedHtml = html;
|
|
2284
|
+
}
|
|
2285
|
+
}
|
|
2286
|
+
catch (err) {
|
|
2287
|
+
// eslint-disable-next-line no-console
|
|
2288
|
+
console.error(`pseolint: --render failed (${err instanceof Error ? err.message : String(err)}). ` +
|
|
2289
|
+
`Continuing without rendered DOM; tech/csr-bailout will be skipped.`);
|
|
2290
|
+
}
|
|
2291
|
+
}
|
|
2241
2292
|
// v0.4.1 §page-filter: drop noindex'd pages and (when enabled) heuristically
|
|
2242
2293
|
// detected auth pages BEFORE rule evaluation. The site owner's noindex is a
|
|
2243
2294
|
// hard signal — they already opted out of SEO indexing, so auditing those
|
|
@@ -2384,6 +2435,41 @@ export async function auditSource(source, options) {
|
|
|
2384
2435
|
const crawlerFindings = crawlerAccessRule(robotsTxtContent);
|
|
2385
2436
|
pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2386
2437
|
}
|
|
2438
|
+
// tech/soft-404 synthetic probe: a URL we deliberately invent to be
|
|
2439
|
+
// nonexistent should return 404/410. A 200 means the site soft-404s, letting
|
|
2440
|
+
// crawlers index unlimited junk. Default-on for programmatic directories
|
|
2441
|
+
// (clusters are well-formed there); one probe per cluster, capped, fail-open.
|
|
2442
|
+
if (auditMode !== "diff" &&
|
|
2443
|
+
siteClassification.type === "programmatic-directory" &&
|
|
2444
|
+
isRuleEnabled("tech/soft-404", undefined)) {
|
|
2445
|
+
const PROBE_CAP = 25;
|
|
2446
|
+
const probeClusters = detectTemplates(parsedPages.map((p) => p.url))
|
|
2447
|
+
.filter((c) => c.signature !== LONGTAIL_SIGNATURE && c.urls.length > 0)
|
|
2448
|
+
.slice(0, PROBE_CAP);
|
|
2449
|
+
for (const cluster of probeClusters) {
|
|
2450
|
+
try {
|
|
2451
|
+
const rep = new URL(cluster.urls[0]);
|
|
2452
|
+
const token = Math.abs([...cluster.signature].reduce((h, ch) => ((h << 5) - h + ch.charCodeAt(0)) | 0, 0));
|
|
2453
|
+
const originalPath = rep.pathname;
|
|
2454
|
+
rep.pathname = originalPath.replace(/\/[^/]+\/?$/, `/pseolint-404-probe-${token}`);
|
|
2455
|
+
if (rep.pathname === originalPath)
|
|
2456
|
+
continue; // couldn't synthesize an invalid URL
|
|
2457
|
+
const probeUrl = rep.toString();
|
|
2458
|
+
const res = await cachedFetch(probeUrl, {
|
|
2459
|
+
timeoutMs,
|
|
2460
|
+
cache: cacheConfig,
|
|
2461
|
+
signal,
|
|
2462
|
+
onObservation: cacheStats.onObservation,
|
|
2463
|
+
});
|
|
2464
|
+
const finding = evaluateProbe(probeUrl, res.status, res.body ?? "");
|
|
2465
|
+
if (finding)
|
|
2466
|
+
allFindings.push({ ...finding, ref: RULE_REFERENCES[finding.ruleId] });
|
|
2467
|
+
}
|
|
2468
|
+
catch {
|
|
2469
|
+
// fail-open: a probe network error never produces a finding.
|
|
2470
|
+
}
|
|
2471
|
+
}
|
|
2472
|
+
}
|
|
2387
2473
|
// Data source comparison rules
|
|
2388
2474
|
if (options?.dataSource?.records && options.dataSource.records.length > 0) {
|
|
2389
2475
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
|
|
@@ -2536,7 +2622,57 @@ export async function auditSource(source, options) {
|
|
|
2536
2622
|
// The `risk` score is intentionally unchanged — §15.1 governs verdict only.
|
|
2537
2623
|
const legacyVerdict = shiftVerdictForAuthority(verdictForRisk(risk), resolvedAuthorityScore);
|
|
2538
2624
|
const templateVerdict = siteVerdictFromTemplates(siteTemplates);
|
|
2539
|
-
const
|
|
2625
|
+
const baseVerdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
|
|
2626
|
+
// 2026-06-17 SP1 — opt-in content-effort moderation. Like authority, this
|
|
2627
|
+
// shifts only the user-facing verdict (never `risk`), one tier in either
|
|
2628
|
+
// direction, and is a strict no-op when the signal is absent (null/undefined).
|
|
2629
|
+
// Resolution: an injected `contentEffortScore` (calibration/tests, offline)
|
|
2630
|
+
// wins; otherwise the LLM judge runs only when `contentEffort.enabled`. Any
|
|
2631
|
+
// failure (no key, network, abort) fails safe to `undefined` → no shift.
|
|
2632
|
+
let resolvedEffort = options?.contentEffortScore;
|
|
2633
|
+
if (resolvedEffort === undefined && options?.contentEffort?.enabled) {
|
|
2634
|
+
try {
|
|
2635
|
+
const { judgeContentEffort, makeLlmGenerate } = await import("./algorithms/content-effort/judge.js");
|
|
2636
|
+
const { model, modelId } = await createLanguageModel({
|
|
2637
|
+
model: options.contentEffort.model ?? "claude-sonnet-4-6",
|
|
2638
|
+
});
|
|
2639
|
+
// Reuse the audit's own parsed pages + template clustering: map each
|
|
2640
|
+
// template's audited URLs back to their parsed contentText. When no
|
|
2641
|
+
// template qualified (small/unclear sites), fall back to ONE synthetic
|
|
2642
|
+
// site-wide template over every audited page — mirrors the validation
|
|
2643
|
+
// runner's buildTemplates (scripts/content-effort-validate.ts).
|
|
2644
|
+
const contentByUrl = new Map(parsedPages.map((p) => [p.url, p.contentText ?? ""]));
|
|
2645
|
+
const toSamples = (urls) => urls
|
|
2646
|
+
.filter((u) => contentByUrl.has(u))
|
|
2647
|
+
.map((u) => ({ url: u, contentText: contentByUrl.get(u) ?? "" }));
|
|
2648
|
+
let effortSamples = siteTemplates.length > 0
|
|
2649
|
+
? siteTemplates
|
|
2650
|
+
.map((t) => ({ signature: t.signature, samplePages: toSamples(t.auditedUrls) }))
|
|
2651
|
+
.filter((t) => t.samplePages.length > 0)
|
|
2652
|
+
: [];
|
|
2653
|
+
if (effortSamples.length === 0) {
|
|
2654
|
+
const all = parsedPages.map((p) => ({ url: p.url, contentText: p.contentText ?? "" }));
|
|
2655
|
+
effortSamples = all.length > 0 ? [{ signature: "site", samplePages: all }] : [];
|
|
2656
|
+
}
|
|
2657
|
+
const cacheDir = options.contentEffort.cacheDir ?? join(tmpdir(), "pseolint-content-effort");
|
|
2658
|
+
const judged = await judgeContentEffort(effortSamples, {
|
|
2659
|
+
modelId,
|
|
2660
|
+
cacheDir,
|
|
2661
|
+
generate: makeLlmGenerate(model, options.signal),
|
|
2662
|
+
signal: options.signal,
|
|
2663
|
+
});
|
|
2664
|
+
resolvedEffort = judged.siteEffort;
|
|
2665
|
+
}
|
|
2666
|
+
catch {
|
|
2667
|
+
resolvedEffort = undefined; // fail-safe: model/key unavailable → no moderation
|
|
2668
|
+
}
|
|
2669
|
+
}
|
|
2670
|
+
const verdict = shiftVerdict(baseVerdict, {
|
|
2671
|
+
score: resolvedEffort,
|
|
2672
|
+
lenientAt: EFFORT_LENIENT_AT,
|
|
2673
|
+
strictAt: EFFORT_STRICT_AT,
|
|
2674
|
+
cap: EFFORT_CAP,
|
|
2675
|
+
});
|
|
2540
2676
|
const headline = buildHeadline(bucketCounts);
|
|
2541
2677
|
// audit/* findings are diagnostic-only and never appear in summary.issues.
|
|
2542
2678
|
// Surface them under diagnostics so consumers (telemetry, debug UIs) can
|
|
@@ -2576,6 +2712,9 @@ export async function auditSource(source, options) {
|
|
|
2576
2712
|
...(resolvedAuthorityScore !== undefined
|
|
2577
2713
|
? { authority: { score: resolvedAuthorityScore, domain: resolvedAuthorityDomain ?? "" } }
|
|
2578
2714
|
: {}),
|
|
2715
|
+
...(resolvedEffort !== undefined && resolvedEffort !== null
|
|
2716
|
+
? { contentEffort: { score: resolvedEffort } }
|
|
2717
|
+
: {}),
|
|
2579
2718
|
};
|
|
2580
2719
|
// Partial-report flag: the backpressure watchdog aborted mid-crawl and we
|
|
2581
2720
|
// salvaged whatever pages had been fetched. Consumers MUST treat coverage as
|