@pseolint/core 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/LICENSE +21 -21
  2. package/README.md +27 -4
  3. package/dist/algorithms/content-effort/cache.d.ts +5 -0
  4. package/dist/algorithms/content-effort/cache.d.ts.map +1 -0
  5. package/dist/algorithms/content-effort/cache.js +23 -0
  6. package/dist/algorithms/content-effort/cache.js.map +1 -0
  7. package/dist/algorithms/content-effort/index.d.ts +4 -0
  8. package/dist/algorithms/content-effort/index.d.ts.map +1 -0
  9. package/dist/algorithms/content-effort/index.js +4 -0
  10. package/dist/algorithms/content-effort/index.js.map +1 -0
  11. package/dist/algorithms/content-effort/judge.d.ts +36 -0
  12. package/dist/algorithms/content-effort/judge.d.ts.map +1 -0
  13. package/dist/algorithms/content-effort/judge.js +69 -0
  14. package/dist/algorithms/content-effort/judge.js.map +1 -0
  15. package/dist/algorithms/content-effort/schema.d.ts +13 -0
  16. package/dist/algorithms/content-effort/schema.d.ts.map +1 -0
  17. package/dist/algorithms/content-effort/schema.js +20 -0
  18. package/dist/algorithms/content-effort/schema.js.map +1 -0
  19. package/dist/auditor.d.ts +18 -1
  20. package/dist/auditor.d.ts.map +1 -1
  21. package/dist/auditor.js +155 -16
  22. package/dist/auditor.js.map +1 -1
  23. package/dist/cache.d.ts.map +1 -1
  24. package/dist/cache.js +18 -3
  25. package/dist/cache.js.map +1 -1
  26. package/dist/formatters/template-cards.js +32 -32
  27. package/dist/framework-detect.d.ts +6 -0
  28. package/dist/framework-detect.d.ts.map +1 -0
  29. package/dist/framework-detect.js +22 -0
  30. package/dist/framework-detect.js.map +1 -0
  31. package/dist/rule-references.d.ts.map +1 -1
  32. package/dist/rule-references.js +1 -0
  33. package/dist/rule-references.js.map +1 -1
  34. package/dist/rules/content/unique-value.d.ts +2 -2
  35. package/dist/rules/content/unique-value.d.ts.map +1 -1
  36. package/dist/rules/content/unique-value.js +8 -2
  37. package/dist/rules/content/unique-value.js.map +1 -1
  38. package/dist/rules/scope.d.ts.map +1 -1
  39. package/dist/rules/scope.js +1 -0
  40. package/dist/rules/scope.js.map +1 -1
  41. package/dist/rules/tech/csr-bailout.d.ts +8 -0
  42. package/dist/rules/tech/csr-bailout.d.ts.map +1 -0
  43. package/dist/rules/tech/csr-bailout.js +48 -0
  44. package/dist/rules/tech/csr-bailout.js.map +1 -0
  45. package/dist/rules/tech/soft-404.d.ts +6 -0
  46. package/dist/rules/tech/soft-404.d.ts.map +1 -1
  47. package/dist/rules/tech/soft-404.js +23 -0
  48. package/dist/rules/tech/soft-404.js.map +1 -1
  49. package/dist/types.d.ts +25 -0
  50. package/dist/types.d.ts.map +1 -1
  51. package/package.json +1 -1
  52. package/schemas/audit-summary.schema.json +300 -300
  53. package/dist/rules/aeo/non-replicable-value.d.ts +0 -9
  54. package/dist/rules/aeo/non-replicable-value.d.ts.map +0 -1
  55. package/dist/rules/aeo/non-replicable-value.js +0 -95
  56. package/dist/rules/aeo/non-replicable-value.js.map +0 -1
  57. package/dist/rules/cannibal/keyword-collision.d.ts +0 -3
  58. package/dist/rules/cannibal/keyword-collision.d.ts.map +0 -1
  59. package/dist/rules/cannibal/keyword-collision.js +0 -25
  60. package/dist/rules/cannibal/keyword-collision.js.map +0 -1
  61. package/dist/rules/cannibal/title-overlap.d.ts +0 -3
  62. package/dist/rules/cannibal/title-overlap.d.ts.map +0 -1
  63. package/dist/rules/cannibal/title-overlap.js +0 -43
  64. package/dist/rules/cannibal/title-overlap.js.map +0 -1
  65. package/dist/rules/content/heading-uniqueness.d.ts +0 -3
  66. package/dist/rules/content/heading-uniqueness.d.ts.map +0 -1
  67. package/dist/rules/content/heading-uniqueness.js +0 -56
  68. package/dist/rules/content/heading-uniqueness.js.map +0 -1
  69. package/dist/rules/links/hub-pages.d.ts +0 -7
  70. package/dist/rules/links/hub-pages.d.ts.map +0 -1
  71. package/dist/rules/links/hub-pages.js +0 -73
  72. package/dist/rules/links/hub-pages.js.map +0 -1
package/dist/auditor.js CHANGED
@@ -1,7 +1,9 @@
1
1
  import { createHash } from "node:crypto";
2
2
  import { readdir, readFile, stat } from "node:fs/promises";
3
+ import { tmpdir } from "node:os";
3
4
  import { extname, join, resolve } from "node:path";
4
5
  import { parseHtmlPage } from "./parser.js";
6
+ import { renderPages } from "./renderer.js";
5
7
  import { pageSkipReason } from "./page-filter.js";
6
8
  import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
7
9
  import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
@@ -45,7 +47,8 @@ import { citableFactsRule } from "./rules/aeo/citable-facts.js";
45
47
  import { contentModularityRule } from "./rules/aeo/content-modularity.js";
46
48
  import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
47
49
  import { redirectChainRule } from "./rules/tech/redirect-chain.js";
48
- import { soft404Rule } from "./rules/tech/soft-404.js";
50
+ import { soft404Rule, evaluateProbe } from "./rules/tech/soft-404.js";
51
+ import { csrBailoutRule } from "./rules/tech/csr-bailout.js";
49
52
  import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
50
53
  import { requiredFieldsRule } from "./rules/schema/required-fields.js";
51
54
  import { schemaConsistencyRule } from "./rules/schema/consistency.js";
@@ -71,7 +74,7 @@ import { classifySite, applyDegenerationGuard, corpusStatsFromPages } from "./si
71
74
  import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
72
75
  import { CORE_RULESET_VERSION } from "./ruleset-version.js";
73
76
  import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
74
- import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring } from "./template-detection.js";
77
+ import { detectTemplates, buildUrlToTemplateMap, shouldActivateTemplateScoring, LONGTAIL_SIGNATURE } from "./template-detection.js";
75
78
  import { scoreTemplates, siteVerdictFromTemplates } from "./per-template-scoring.js";
76
79
  import { deriveEntityPatterns } from "./algorithms/auto-entity-mask.js";
77
80
  import { CompositeAuthorityProvider } from "./algorithms/authority/provider.js";
@@ -139,6 +142,9 @@ const SCORING_PROFILES = {
139
142
  "aeo/citable-facts": "info",
140
143
  "aeo/answer-first": "info",
141
144
  "aeo/summary-bait": "warning",
145
+ // CSR-bailout on a small-marketing SPA is lower-stakes (a deliberately
146
+ // client-only marketing widget) — keep visible but don't tank the verdict.
147
+ "tech/csr-bailout": "info",
142
148
  // 2026-05-03 calibration round 5: Segment integrations had 24 thin
143
149
  // pages (200-300 words is correct for a catalog record). thin-content
144
150
  // contributing capped 40 impact pushed integrity to its 100 cap → 30
@@ -523,24 +529,40 @@ function verdictForRisk(risk) {
523
529
  * concerning → critical, critical → critical.
524
530
  */
525
531
  const VERDICT_LADDER = ["ready", "caution", "concerning", "critical"];
526
- function shiftVerdictForAuthority(verdict, authorityScore) {
527
- if (authorityScore === undefined)
528
- return verdict;
529
- if (!Number.isFinite(authorityScore))
530
- return verdict;
531
- if (authorityScore < 0 || authorityScore > 100)
532
+ /**
533
+ * Shared bounded bidirectional verdict moderator. A 0-100 `score` shifts the
534
+ * verdict along {@link VERDICT_LADDER} by at most `cap` tiers:
535
+ * - `score >= lenientAt` → soften (toward "ready"), clamped at index 0.
536
+ * - `score <= strictAt` → escalate (toward "critical"), clamped at the top.
537
+ * - in between (or absent) no shift.
538
+ * Absent evidence is a no-op: `undefined`/`null`/non-finite/out-of-[0,100]
539
+ * `score` returns the verdict unchanged (so a null content-effort or an
540
+ * unavailable authority provider never moves the verdict). Authority and
541
+ * content-effort are both callers (see {@link shiftVerdictForAuthority}).
542
+ */
543
+ export function shiftVerdict(verdict, o) {
544
+ if (o.score === undefined || o.score === null || !Number.isFinite(o.score) || o.score < 0 || o.score > 100) {
532
545
  return verdict;
546
+ }
533
547
  const idx = VERDICT_LADDER.indexOf(verdict);
534
548
  if (idx < 0)
535
549
  return verdict;
536
- if (authorityScore >= 80) {
537
- return VERDICT_LADDER[Math.max(0, idx - 1)];
538
- }
539
- if (authorityScore <= 30) {
540
- return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + 1)];
541
- }
550
+ if (o.score >= o.lenientAt)
551
+ return VERDICT_LADDER[Math.max(0, idx - o.cap)];
552
+ if (o.score <= o.strictAt)
553
+ return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + o.cap)];
542
554
  return verdict;
543
555
  }
556
+ /** Authority keeps its exact ±1 / 80 / 30 behavior via the shared moderator. */
557
+ function shiftVerdictForAuthority(verdict, authorityScore) {
558
+ return shiftVerdict(verdict, { score: authorityScore, lenientAt: 80, strictAt: 30, cap: 1 });
559
+ }
560
+ // content-effort moderation band — STARTING values; Task 7 tunes against the
561
+ // ratchet. Derived from the gate data: reputable median effort ≈ 8.5, addressable
562
+ // farms cluster ≤7, proprietary-data winners (numbeo/airbyte) ≈28.
563
+ const EFFORT_STRICT_AT = 5; // very-low effort → escalate (farm cluster)
564
+ const EFFORT_LENIENT_AT = 25; // high effort → soften (rescues proprietary-data winners e.g. numbeo)
565
+ const EFFORT_CAP = 1;
544
566
  function gradeForPenalty(penalty) {
545
567
  if (penalty <= 20)
546
568
  return "A";
@@ -652,7 +674,7 @@ sampled = false) {
652
674
  }
653
675
  // Content rules
654
676
  if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
655
- pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueDensity)));
677
+ pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueDensity, entityPatterns)));
656
678
  }
657
679
  if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
658
680
  pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
@@ -727,6 +749,10 @@ sampled = false) {
727
749
  if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
728
750
  pushAll(findings, tag(soft404Rule(pages)));
729
751
  }
752
+ if (isEnabled("tech/csr-bailout") && modeOk("tech/csr-bailout")) {
753
+ // No-op unless --render populated page.renderedHtml (the rule guards internally).
754
+ pushAll(findings, tag(csrBailoutRule(pages)));
755
+ }
730
756
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
731
757
  // hreflang declarations on noindex'd pages are still bugs when they're
732
758
  // inconsistent — see auditor.test.ts "emits technical SEO findings".
@@ -2238,6 +2264,31 @@ export async function auditSource(source, options) {
2238
2264
  }
2239
2265
  return parsed;
2240
2266
  });
2267
+ // --render: execute each page in a headless browser and attach the
2268
+ // post-hydration DOM so tech/csr-bailout can diff raw vs rendered. Opt-in,
2269
+ // Node-only (fails under bun). Degrades gracefully when no browser is available.
2270
+ if (options?.render) {
2271
+ try {
2272
+ const rendered = await renderPages(parsedPagesAll.map((p) => ({ url: p.url })), null, {
2273
+ browserWsEndpoint: options.render.browserWsEndpoint,
2274
+ concurrency,
2275
+ timeoutMs: 30000,
2276
+ analyticsMode: options.render.analyticsMode,
2277
+ extraBlockedHosts: options.render.extraBlockedHosts,
2278
+ });
2279
+ const renderedByUrl = new Map(rendered.map((r) => [r.url, r.html]));
2280
+ for (const p of parsedPagesAll) {
2281
+ const html = renderedByUrl.get(p.url);
2282
+ if (html)
2283
+ p.renderedHtml = html;
2284
+ }
2285
+ }
2286
+ catch (err) {
2287
+ // eslint-disable-next-line no-console
2288
+ console.error(`pseolint: --render failed (${err instanceof Error ? err.message : String(err)}). ` +
2289
+ `Continuing without rendered DOM; tech/csr-bailout will be skipped.`);
2290
+ }
2291
+ }
2241
2292
  // v0.4.1 §page-filter: drop noindex'd pages and (when enabled) heuristically
2242
2293
  // detected auth pages BEFORE rule evaluation. The site owner's noindex is a
2243
2294
  // hard signal — they already opted out of SEO indexing, so auditing those
@@ -2384,6 +2435,41 @@ export async function auditSource(source, options) {
2384
2435
  const crawlerFindings = crawlerAccessRule(robotsTxtContent);
2385
2436
  pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2386
2437
  }
2438
+ // tech/soft-404 synthetic probe: a URL we deliberately invent to be
2439
+ // nonexistent should return 404/410. A 200 means the site soft-404s, letting
2440
+ // crawlers index unlimited junk. Default-on for programmatic directories
2441
+ // (clusters are well-formed there); one probe per cluster, capped, fail-open.
2442
+ if (auditMode !== "diff" &&
2443
+ siteClassification.type === "programmatic-directory" &&
2444
+ isRuleEnabled("tech/soft-404", undefined)) {
2445
+ const PROBE_CAP = 25;
2446
+ const probeClusters = detectTemplates(parsedPages.map((p) => p.url))
2447
+ .filter((c) => c.signature !== LONGTAIL_SIGNATURE && c.urls.length > 0)
2448
+ .slice(0, PROBE_CAP);
2449
+ for (const cluster of probeClusters) {
2450
+ try {
2451
+ const rep = new URL(cluster.urls[0]);
2452
+ const token = Math.abs([...cluster.signature].reduce((h, ch) => ((h << 5) - h + ch.charCodeAt(0)) | 0, 0));
2453
+ const originalPath = rep.pathname;
2454
+ rep.pathname = originalPath.replace(/\/[^/]+\/?$/, `/pseolint-404-probe-${token}`);
2455
+ if (rep.pathname === originalPath)
2456
+ continue; // couldn't synthesize an invalid URL
2457
+ const probeUrl = rep.toString();
2458
+ const res = await cachedFetch(probeUrl, {
2459
+ timeoutMs,
2460
+ cache: cacheConfig,
2461
+ signal,
2462
+ onObservation: cacheStats.onObservation,
2463
+ });
2464
+ const finding = evaluateProbe(probeUrl, res.status, res.body ?? "");
2465
+ if (finding)
2466
+ allFindings.push({ ...finding, ref: RULE_REFERENCES[finding.ruleId] });
2467
+ }
2468
+ catch {
2469
+ // fail-open: a probe network error never produces a finding.
2470
+ }
2471
+ }
2472
+ }
2387
2473
  // Data source comparison rules
2388
2474
  if (options?.dataSource?.records && options.dataSource.records.length > 0) {
2389
2475
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
@@ -2536,7 +2622,57 @@ export async function auditSource(source, options) {
2536
2622
  // The `risk` score is intentionally unchanged — §15.1 governs verdict only.
2537
2623
  const legacyVerdict = shiftVerdictForAuthority(verdictForRisk(risk), resolvedAuthorityScore);
2538
2624
  const templateVerdict = siteVerdictFromTemplates(siteTemplates);
2539
- const verdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
2625
+ const baseVerdict = templateVerdict !== null ? templateVerdict : legacyVerdict;
2626
+ // 2026-06-17 SP1 — opt-in content-effort moderation. Like authority, this
2627
+ // shifts only the user-facing verdict (never `risk`), one tier in either
2628
+ // direction, and is a strict no-op when the signal is absent (null/undefined).
2629
+ // Resolution: an injected `contentEffortScore` (calibration/tests, offline)
2630
+ // wins; otherwise the LLM judge runs only when `contentEffort.enabled`. Any
2631
+ // failure (no key, network, abort) fails safe to `undefined` → no shift.
2632
+ let resolvedEffort = options?.contentEffortScore;
2633
+ if (resolvedEffort === undefined && options?.contentEffort?.enabled) {
2634
+ try {
2635
+ const { judgeContentEffort, makeLlmGenerate } = await import("./algorithms/content-effort/judge.js");
2636
+ const { model, modelId } = await createLanguageModel({
2637
+ model: options.contentEffort.model ?? "claude-sonnet-4-6",
2638
+ });
2639
+ // Reuse the audit's own parsed pages + template clustering: map each
2640
+ // template's audited URLs back to their parsed contentText. When no
2641
+ // template qualified (small/unclear sites), fall back to ONE synthetic
2642
+ // site-wide template over every audited page — mirrors the validation
2643
+ // runner's buildTemplates (scripts/content-effort-validate.ts).
2644
+ const contentByUrl = new Map(parsedPages.map((p) => [p.url, p.contentText ?? ""]));
2645
+ const toSamples = (urls) => urls
2646
+ .filter((u) => contentByUrl.has(u))
2647
+ .map((u) => ({ url: u, contentText: contentByUrl.get(u) ?? "" }));
2648
+ let effortSamples = siteTemplates.length > 0
2649
+ ? siteTemplates
2650
+ .map((t) => ({ signature: t.signature, samplePages: toSamples(t.auditedUrls) }))
2651
+ .filter((t) => t.samplePages.length > 0)
2652
+ : [];
2653
+ if (effortSamples.length === 0) {
2654
+ const all = parsedPages.map((p) => ({ url: p.url, contentText: p.contentText ?? "" }));
2655
+ effortSamples = all.length > 0 ? [{ signature: "site", samplePages: all }] : [];
2656
+ }
2657
+ const cacheDir = options.contentEffort.cacheDir ?? join(tmpdir(), "pseolint-content-effort");
2658
+ const judged = await judgeContentEffort(effortSamples, {
2659
+ modelId,
2660
+ cacheDir,
2661
+ generate: makeLlmGenerate(model, options.signal),
2662
+ signal: options.signal,
2663
+ });
2664
+ resolvedEffort = judged.siteEffort;
2665
+ }
2666
+ catch {
2667
+ resolvedEffort = undefined; // fail-safe: model/key unavailable → no moderation
2668
+ }
2669
+ }
2670
+ const verdict = shiftVerdict(baseVerdict, {
2671
+ score: resolvedEffort,
2672
+ lenientAt: EFFORT_LENIENT_AT,
2673
+ strictAt: EFFORT_STRICT_AT,
2674
+ cap: EFFORT_CAP,
2675
+ });
2540
2676
  const headline = buildHeadline(bucketCounts);
2541
2677
  // audit/* findings are diagnostic-only and never appear in summary.issues.
2542
2678
  // Surface them under diagnostics so consumers (telemetry, debug UIs) can
@@ -2576,6 +2712,9 @@ export async function auditSource(source, options) {
2576
2712
  ...(resolvedAuthorityScore !== undefined
2577
2713
  ? { authority: { score: resolvedAuthorityScore, domain: resolvedAuthorityDomain ?? "" } }
2578
2714
  : {}),
2715
+ ...(resolvedEffort !== undefined && resolvedEffort !== null
2716
+ ? { contentEffort: { score: resolvedEffort } }
2717
+ : {}),
2579
2718
  };
2580
2719
  // Partial-report flag: the backpressure watchdog aborted mid-crawl and we
2581
2720
  // salvaged whatever pages had been fetched. Consumers MUST treat coverage as