@pseolint/core 0.4.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +264 -169
- package/dist/ai/manifest/diff.d.ts +78 -0
- package/dist/ai/manifest/diff.d.ts.map +1 -0
- package/dist/ai/manifest/diff.js +139 -0
- package/dist/ai/manifest/diff.js.map +1 -0
- package/dist/ai/manifest/index.d.ts +18 -0
- package/dist/ai/manifest/index.d.ts.map +1 -0
- package/dist/ai/manifest/index.js +15 -0
- package/dist/ai/manifest/index.js.map +1 -0
- package/dist/ai/manifest/validate-manifest.d.ts +37 -0
- package/dist/ai/manifest/validate-manifest.d.ts.map +1 -0
- package/dist/ai/manifest/validate-manifest.js +67 -0
- package/dist/ai/manifest/validate-manifest.js.map +1 -0
- package/dist/ai/manifest/validators/domain-patches.d.ts +15 -0
- package/dist/ai/manifest/validators/domain-patches.d.ts.map +1 -0
- package/dist/ai/manifest/validators/domain-patches.js +110 -0
- package/dist/ai/manifest/validators/domain-patches.js.map +1 -0
- package/dist/ai/manifest/validators/index.d.ts +5 -0
- package/dist/ai/manifest/validators/index.d.ts.map +1 -0
- package/dist/ai/manifest/validators/index.js +4 -0
- package/dist/ai/manifest/validators/index.js.map +1 -0
- package/dist/ai/manifest/validators/page-changes.d.ts +36 -0
- package/dist/ai/manifest/validators/page-changes.d.ts.map +1 -0
- package/dist/ai/manifest/validators/page-changes.js +221 -0
- package/dist/ai/manifest/validators/page-changes.js.map +1 -0
- package/dist/ai/manifest/validators/types.d.ts +17 -0
- package/dist/ai/manifest/validators/types.d.ts.map +1 -0
- package/dist/ai/manifest/validators/types.js +5 -0
- package/dist/ai/manifest/validators/types.js.map +1 -0
- package/dist/ai/orchestrate.d.ts +74 -0
- package/dist/ai/orchestrate.d.ts.map +1 -0
- package/dist/ai/orchestrate.js +54 -0
- package/dist/ai/orchestrate.js.map +1 -0
- package/dist/ai/orchestrator/budget.d.ts +57 -0
- package/dist/ai/orchestrator/budget.d.ts.map +1 -0
- package/dist/ai/orchestrator/budget.js +114 -0
- package/dist/ai/orchestrator/budget.js.map +1 -0
- package/dist/ai/orchestrator/finish-tool.d.ts +568 -0
- package/dist/ai/orchestrator/finish-tool.d.ts.map +1 -0
- package/dist/ai/orchestrator/finish-tool.js +114 -0
- package/dist/ai/orchestrator/finish-tool.js.map +1 -0
- package/dist/ai/orchestrator/index.d.ts +25 -0
- package/dist/ai/orchestrator/index.d.ts.map +1 -0
- package/dist/ai/orchestrator/index.js +21 -0
- package/dist/ai/orchestrator/index.js.map +1 -0
- package/dist/ai/orchestrator/log.d.ts +24 -0
- package/dist/ai/orchestrator/log.d.ts.map +1 -0
- package/dist/ai/orchestrator/log.js +48 -0
- package/dist/ai/orchestrator/log.js.map +1 -0
- package/dist/ai/orchestrator/page-cache.d.ts +64 -0
- package/dist/ai/orchestrator/page-cache.d.ts.map +1 -0
- package/dist/ai/orchestrator/page-cache.js +127 -0
- package/dist/ai/orchestrator/page-cache.js.map +1 -0
- package/dist/ai/orchestrator/prompt.d.ts +16 -0
- package/dist/ai/orchestrator/prompt.d.ts.map +1 -0
- package/dist/ai/orchestrator/prompt.js +52 -0
- package/dist/ai/orchestrator/prompt.js.map +1 -0
- package/dist/ai/orchestrator/runner.d.ts +65 -0
- package/dist/ai/orchestrator/runner.d.ts.map +1 -0
- package/dist/ai/orchestrator/runner.js +223 -0
- package/dist/ai/orchestrator/runner.js.map +1 -0
- package/dist/ai/orchestrator/session.d.ts +44 -0
- package/dist/ai/orchestrator/session.d.ts.map +1 -0
- package/dist/ai/orchestrator/session.js +64 -0
- package/dist/ai/orchestrator/session.js.map +1 -0
- package/dist/ai/orchestrator/types.d.ts +99 -0
- package/dist/ai/orchestrator/types.d.ts.map +1 -0
- package/dist/ai/orchestrator/types.js +8 -0
- package/dist/ai/orchestrator/types.js.map +1 -0
- package/dist/ai/probes/cache.d.ts +12 -0
- package/dist/ai/probes/cache.d.ts.map +1 -0
- package/dist/ai/probes/cache.js +46 -0
- package/dist/ai/probes/cache.js.map +1 -0
- package/dist/ai/tools/ask-ai-engine.d.ts +77 -0
- package/dist/ai/tools/ask-ai-engine.d.ts.map +1 -0
- package/dist/ai/tools/ask-ai-engine.js +253 -0
- package/dist/ai/tools/ask-ai-engine.js.map +1 -0
- package/dist/ai/tools/check-domain-crawler-access.d.ts +71 -0
- package/dist/ai/tools/check-domain-crawler-access.d.ts.map +1 -0
- package/dist/ai/tools/check-domain-crawler-access.js +76 -0
- package/dist/ai/tools/check-domain-crawler-access.js.map +1 -0
- package/dist/ai/tools/check-domain-llms-txt.d.ts +70 -0
- package/dist/ai/tools/check-domain-llms-txt.d.ts.map +1 -0
- package/dist/ai/tools/check-domain-llms-txt.js +75 -0
- package/dist/ai/tools/check-domain-llms-txt.js.map +1 -0
- package/dist/ai/tools/check-indexability.d.ts +58 -0
- package/dist/ai/tools/check-indexability.d.ts.map +1 -0
- package/dist/ai/tools/check-indexability.js +64 -0
- package/dist/ai/tools/check-indexability.js.map +1 -0
- package/dist/ai/tools/check-robots.d.ts +68 -0
- package/dist/ai/tools/check-robots.d.ts.map +1 -0
- package/dist/ai/tools/check-robots.js +90 -0
- package/dist/ai/tools/check-robots.js.map +1 -0
- package/dist/ai/tools/check-rule-answer-first.d.ts +54 -0
- package/dist/ai/tools/check-rule-answer-first.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-answer-first.js +50 -0
- package/dist/ai/tools/check-rule-answer-first.js.map +1 -0
- package/dist/ai/tools/check-rule-canonical-consistency.d.ts +66 -0
- package/dist/ai/tools/check-rule-canonical-consistency.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-canonical-consistency.js +51 -0
- package/dist/ai/tools/check-rule-canonical-consistency.js.map +1 -0
- package/dist/ai/tools/check-rule-citable-facts.d.ts +58 -0
- package/dist/ai/tools/check-rule-citable-facts.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-citable-facts.js +41 -0
- package/dist/ai/tools/check-rule-citable-facts.js.map +1 -0
- package/dist/ai/tools/check-rule-content-modularity.d.ts +58 -0
- package/dist/ai/tools/check-rule-content-modularity.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-content-modularity.js +45 -0
- package/dist/ai/tools/check-rule-content-modularity.js.map +1 -0
- package/dist/ai/tools/check-rule-faq-coverage.d.ts +54 -0
- package/dist/ai/tools/check-rule-faq-coverage.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-faq-coverage.js +39 -0
- package/dist/ai/tools/check-rule-faq-coverage.js.map +1 -0
- package/dist/ai/tools/check-rule-freshness-signals.d.ts +54 -0
- package/dist/ai/tools/check-rule-freshness-signals.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-freshness-signals.js +45 -0
- package/dist/ai/tools/check-rule-freshness-signals.js.map +1 -0
- package/dist/ai/tools/check-rule-json-ld-valid.d.ts +54 -0
- package/dist/ai/tools/check-rule-json-ld-valid.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-json-ld-valid.js +44 -0
- package/dist/ai/tools/check-rule-json-ld-valid.js.map +1 -0
- package/dist/ai/tools/check-rule-missing-author.d.ts +54 -0
- package/dist/ai/tools/check-rule-missing-author.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-missing-author.js +45 -0
- package/dist/ai/tools/check-rule-missing-author.js.map +1 -0
- package/dist/ai/tools/check-rule-near-duplicate.d.ts +82 -0
- package/dist/ai/tools/check-rule-near-duplicate.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-near-duplicate.js +63 -0
- package/dist/ai/tools/check-rule-near-duplicate.js.map +1 -0
- package/dist/ai/tools/check-rule-required-fields.d.ts +50 -0
- package/dist/ai/tools/check-rule-required-fields.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-required-fields.js +38 -0
- package/dist/ai/tools/check-rule-required-fields.js.map +1 -0
- package/dist/ai/tools/check-rule-schema-consistency.d.ts +54 -0
- package/dist/ai/tools/check-rule-schema-consistency.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-schema-consistency.js +44 -0
- package/dist/ai/tools/check-rule-schema-consistency.js.map +1 -0
- package/dist/ai/tools/check-rule-summary-bait.d.ts +54 -0
- package/dist/ai/tools/check-rule-summary-bait.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-summary-bait.js +39 -0
- package/dist/ai/tools/check-rule-summary-bait.js.map +1 -0
- package/dist/ai/tools/check-rule-thin-content.d.ts +66 -0
- package/dist/ai/tools/check-rule-thin-content.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-thin-content.js +58 -0
- package/dist/ai/tools/check-rule-thin-content.js.map +1 -0
- package/dist/ai/tools/detect-templates.d.ts +60 -0
- package/dist/ai/tools/detect-templates.d.ts.map +1 -0
- package/dist/ai/tools/detect-templates.js +43 -0
- package/dist/ai/tools/detect-templates.js.map +1 -0
- package/dist/ai/tools/fetch-page.d.ts +70 -0
- package/dist/ai/tools/fetch-page.d.ts.map +1 -0
- package/dist/ai/tools/fetch-page.js +93 -0
- package/dist/ai/tools/fetch-page.js.map +1 -0
- package/dist/ai/tools/fetch-sitemap.d.ts +60 -0
- package/dist/ai/tools/fetch-sitemap.d.ts.map +1 -0
- package/dist/ai/tools/fetch-sitemap.js +116 -0
- package/dist/ai/tools/fetch-sitemap.js.map +1 -0
- package/dist/ai/tools/index.d.ts +1555 -0
- package/dist/ai/tools/index.d.ts.map +1 -0
- package/dist/ai/tools/index.js +119 -0
- package/dist/ai/tools/index.js.map +1 -0
- package/dist/ai/tools/parse-page.d.ts +94 -0
- package/dist/ai/tools/parse-page.d.ts.map +1 -0
- package/dist/ai/tools/parse-page.js +108 -0
- package/dist/ai/tools/parse-page.js.map +1 -0
- package/dist/ai/tools/query-serp.d.ts +113 -0
- package/dist/ai/tools/query-serp.d.ts.map +1 -0
- package/dist/ai/tools/query-serp.js +131 -0
- package/dist/ai/tools/query-serp.js.map +1 -0
- package/dist/ai/tools/sample-template.d.ts +67 -0
- package/dist/ai/tools/sample-template.d.ts.map +1 -0
- package/dist/ai/tools/sample-template.js +75 -0
- package/dist/ai/tools/sample-template.js.map +1 -0
- package/dist/ai/tools/types.d.ts +73 -0
- package/dist/ai/tools/types.d.ts.map +1 -0
- package/dist/ai/tools/types.js +64 -0
- package/dist/ai/tools/types.js.map +1 -0
- package/dist/ai/tools/validate-jsonld.d.ts +62 -0
- package/dist/ai/tools/validate-jsonld.d.ts.map +1 -0
- package/dist/ai/tools/validate-jsonld.js +84 -0
- package/dist/ai/tools/validate-jsonld.js.map +1 -0
- package/dist/auditor.d.ts +4 -0
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +629 -64
- package/dist/auditor.js.map +1 -1
- package/dist/backpressure.d.ts.map +1 -1
- package/dist/backpressure.js +10 -3
- package/dist/backpressure.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +15 -1
- package/dist/enrich-findings.js.map +1 -1
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +13 -0
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/markdown.d.ts.map +1 -1
- package/dist/formatters/markdown.js +20 -2
- package/dist/formatters/markdown.js.map +1 -1
- package/dist/index.d.ts +12 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +5 -0
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/content/heading-structure.d.ts +21 -0
- package/dist/rules/content/heading-structure.d.ts.map +1 -0
- package/dist/rules/content/heading-structure.js +56 -0
- package/dist/rules/content/heading-structure.js.map +1 -0
- package/dist/rules/content/image-alt-text.d.ts +18 -0
- package/dist/rules/content/image-alt-text.d.ts.map +1 -0
- package/dist/rules/content/image-alt-text.js +77 -0
- package/dist/rules/content/image-alt-text.js.map +1 -0
- package/dist/rules/content/title-uniqueness.d.ts +18 -0
- package/dist/rules/content/title-uniqueness.d.ts.map +1 -0
- package/dist/rules/content/title-uniqueness.js +70 -0
- package/dist/rules/content/title-uniqueness.js.map +1 -0
- package/dist/rules/links/host-section-divergence.d.ts +3 -0
- package/dist/rules/links/host-section-divergence.d.ts.map +1 -0
- package/dist/rules/links/host-section-divergence.js +158 -0
- package/dist/rules/links/host-section-divergence.js.map +1 -0
- package/dist/rules/links/link-depth.d.ts +12 -1
- package/dist/rules/links/link-depth.d.ts.map +1 -1
- package/dist/rules/links/link-depth.js +25 -12
- package/dist/rules/links/link-depth.js.map +1 -1
- package/dist/rules/scope.d.ts.map +1 -1
- package/dist/rules/scope.js +5 -0
- package/dist/rules/scope.js.map +1 -1
- package/dist/rules/spam/doorway-pattern.d.ts.map +1 -1
- package/dist/rules/spam/doorway-pattern.js +27 -4
- package/dist/rules/spam/doorway-pattern.js.map +1 -1
- package/dist/rules/spam/publication-velocity.d.ts +1 -1
- package/dist/rules/spam/publication-velocity.d.ts.map +1 -1
- package/dist/rules/spam/publication-velocity.js +9 -4
- package/dist/rules/spam/publication-velocity.js.map +1 -1
- package/dist/rules/spam/template-coverage.js +1 -1
- package/dist/rules/spam/template-coverage.js.map +1 -1
- package/dist/rules/spam/template-diversity.js +1 -1
- package/dist/rules/spam/template-diversity.js.map +1 -1
- package/dist/rules/tech/hreflang-consistency.d.ts.map +1 -1
- package/dist/rules/tech/hreflang-consistency.js +33 -4
- package/dist/rules/tech/hreflang-consistency.js.map +1 -1
- package/dist/rules/tech/og-completeness.d.ts +11 -0
- package/dist/rules/tech/og-completeness.d.ts.map +1 -1
- package/dist/rules/tech/og-completeness.js +22 -23
- package/dist/rules/tech/og-completeness.js.map +1 -1
- package/dist/ruleset-version.d.ts +8 -0
- package/dist/ruleset-version.d.ts.map +1 -0
- package/dist/ruleset-version.js +8 -0
- package/dist/ruleset-version.js.map +1 -0
- package/dist/scrape-strategy.d.ts +42 -0
- package/dist/scrape-strategy.d.ts.map +1 -0
- package/dist/scrape-strategy.js +101 -0
- package/dist/scrape-strategy.js.map +1 -0
- package/dist/site-classifier.d.ts.map +1 -1
- package/dist/site-classifier.js +1 -0
- package/dist/site-classifier.js.map +1 -1
- package/dist/state.d.ts +36 -1
- package/dist/state.d.ts.map +1 -1
- package/dist/state.js +3 -1
- package/dist/state.js.map +1 -1
- package/dist/stratified-sample.d.ts +9 -1
- package/dist/stratified-sample.d.ts.map +1 -1
- package/dist/stratified-sample.js +23 -6
- package/dist/stratified-sample.js.map +1 -1
- package/dist/types.d.ts +135 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/url-normalize.d.ts.map +1 -1
- package/dist/url-normalize.js +13 -1
- package/dist/url-normalize.js.map +1 -1
- package/package.json +90 -90
package/dist/auditor.js
CHANGED
|
@@ -18,7 +18,12 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
|
|
|
18
18
|
import { deadEndsRule } from "./rules/links/dead-ends.js";
|
|
19
19
|
import { linkDepthRule } from "./rules/links/link-depth.js";
|
|
20
20
|
import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
|
|
21
|
+
import { hostSectionDivergenceRule } from "./rules/links/host-section-divergence.js";
|
|
21
22
|
import { orphanPagesRule } from "./rules/links/orphan-pages.js";
|
|
23
|
+
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
24
|
+
import { titleUniquenessRule } from "./rules/content/title-uniqueness.js";
|
|
25
|
+
import { headingStructureRule } from "./rules/content/heading-structure.js";
|
|
26
|
+
import { imageAltTextRule } from "./rules/content/image-alt-text.js";
|
|
22
27
|
import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
|
|
23
28
|
import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
|
|
24
29
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
@@ -55,14 +60,17 @@ import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
|
|
|
55
60
|
import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
|
|
56
61
|
import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
|
|
57
62
|
import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
|
|
58
|
-
import { stratifiedSample } from "./stratified-sample.js";
|
|
63
|
+
import { stratifiedSample, mulberry32 } from "./stratified-sample.js";
|
|
59
64
|
import { classifySite } from "./site-classifier.js";
|
|
60
65
|
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
66
|
+
import { CORE_RULESET_VERSION } from "./ruleset-version.js";
|
|
67
|
+
import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
|
|
61
68
|
const DEFAULTS = {
|
|
62
69
|
nearDuplicateThreshold: 0.85,
|
|
63
70
|
entitySwapThreshold: 0.95,
|
|
64
71
|
thinContentMinWords: 300,
|
|
65
72
|
publicationVelocityMaxPerDay: 100,
|
|
73
|
+
publicationVelocityMaxPerDayCorpusFraction: 0.10,
|
|
66
74
|
boilerplateMaxRatio: 0.7,
|
|
67
75
|
templateDiversityMinUniqueRatio: 0.35,
|
|
68
76
|
uniqueValueMinWords: 100,
|
|
@@ -99,13 +107,60 @@ const SCORING_PROFILES = {
|
|
|
99
107
|
"aeo/citable-facts": "info",
|
|
100
108
|
"aeo/answer-first": "info",
|
|
101
109
|
"aeo/summary-bait": "warning",
|
|
102
|
-
|
|
110
|
+
// 2026-05-03 calibration round 5: Segment integrations had 24 thin
|
|
111
|
+
// pages (200-300 words is correct for a catalog record). thin-content
|
|
112
|
+
// contributing its capped impact of 40 pushed integrity to its 100 cap → 30
|
|
113
|
+
// contribution at small-marketing weight, which alone tripped
|
|
114
|
+
// 'concerning'. Demoting to info keeps the signal visible without
|
|
115
|
+
// tanking the verdict on catalog-shape sites mis-classified as
|
|
116
|
+
// small-marketing. Real marketing sites (linear.app etc) don't
|
|
117
|
+
// normally have many sub-300-word pages so this won't hide quality
|
|
118
|
+
// issues there.
|
|
119
|
+
"spam/thin-content": "info",
|
|
120
|
+
"aeo/freshness-signals": "info",
|
|
121
|
+
"content/missing-author": "info",
|
|
122
|
+
// 2026-05-03 calibration round 3: Segment integrations classified as
|
|
123
|
+
// small-marketing@0.88 and tripped doorway-pattern 300× critical
|
|
124
|
+
// (catalog records are thin + entity-swap by design — not actually a
|
|
125
|
+
// doorway funnel). The classifier mistakes catalog directories as
|
|
126
|
+
// small-marketing; this demotion absorbs that mis-classification
|
|
127
|
+
// without weakening detection on actual small-marketing sites
|
|
128
|
+
// (linear.app, supabase.com — none of which produce entity-swap pairs).
|
|
129
|
+
"spam/doorway-pattern": "warning",
|
|
130
|
+
// 2026-05-03 calibration round 4: spam/boilerplate-ratio fired ERROR
|
|
131
|
+
// on Segment's integration directory (24 pages, 60%+ shared template
|
|
132
|
+
// chrome). On a marketing-template site the rule is correct — repeated
|
|
133
|
+
// "About us" / "Pricing" copy across pages IS a quality issue. On a
|
|
134
|
+
// catalog mis-classified to small-marketing, the shared chrome IS the
|
|
135
|
+
// template — by design. Demote to warning here; real marketing sites
|
|
136
|
+
// (linear.app, supabase.com) won't trip it because their corpus is
|
|
137
|
+
// page-diverse, but catalog-shape pages classified as small-marketing
|
|
138
|
+
// (Segment, Wise) won't tank the verdict.
|
|
139
|
+
"spam/boilerplate-ratio": "warning",
|
|
140
|
+
// 2026-05-03 v0.5.2 round 10: og-completeness, heading-structure,
|
|
141
|
+
// image-alt-text were added as new rules and tipped Segment from
|
|
142
|
+
// concerning → critical because catalog/template-driven sites
|
|
143
|
+
// commonly have shared OG defaults, weird H1 patterns (multiple H1s
|
|
144
|
+
// for repeated nav cards), and unlabelled logo grids. These are
|
|
145
|
+
// real findings on isolated sites but typical for catalog shape;
|
|
146
|
+
// demote to info here so the signal stays visible without driving
|
|
147
|
+
// the verdict.
|
|
148
|
+
"tech/og-completeness": "info",
|
|
149
|
+
"content/heading-structure": "info",
|
|
150
|
+
"content/image-alt-text": "info",
|
|
103
151
|
},
|
|
104
152
|
confidenceOverrides: {
|
|
105
153
|
"aeo/citable-facts": "low",
|
|
106
154
|
"aeo/answer-first": "low",
|
|
107
155
|
"aeo/summary-bait": "medium",
|
|
108
|
-
"spam/thin-content": "
|
|
156
|
+
"spam/thin-content": "low",
|
|
157
|
+
"aeo/freshness-signals": "low",
|
|
158
|
+
"content/missing-author": "low",
|
|
159
|
+
"spam/doorway-pattern": "medium",
|
|
160
|
+
"spam/boilerplate-ratio": "medium",
|
|
161
|
+
"tech/og-completeness": "low",
|
|
162
|
+
"content/heading-structure": "low",
|
|
163
|
+
"content/image-alt-text": "low",
|
|
109
164
|
},
|
|
110
165
|
},
|
|
111
166
|
"blog": {
|
|
@@ -118,8 +173,82 @@ const SCORING_PROFILES = {
|
|
|
118
173
|
},
|
|
119
174
|
"programmatic-directory": {
|
|
120
175
|
categoryWeights: { integrity: 0.55, discoverability: 0.15, citation: 0.20, data: 0.10, audit: 0 },
|
|
121
|
-
|
|
122
|
-
|
|
176
|
+
// Symmetry argument: every other profile has severity overrides for the
|
|
177
|
+
// rules that mis-fit its shape (`docs` demotes AEO + author rules,
|
|
178
|
+
// `ecommerce` demotes `aeo/citable-facts`, `small-marketing` demotes several
|
|
179
|
+
// rules). `programmatic-directory` is the site type *most* structurally
|
|
180
|
+
// different from the "page = article" assumptions the AEO and EEAT rules
|
|
181
|
+
// are calibrated against — yet was the only profile with no overrides.
|
|
182
|
+
//
|
|
183
|
+
// Pre-calibration adjustment: demote (never escalate) the rules that
|
|
184
|
+
// first-principles analysis predicts will false-positive on catalog-
|
|
185
|
+
// shaped sites (Zapier integrations, G2 categories, Wise currency pairs,
|
|
186
|
+
// etc.). A reputable-pSEO calibration corpus + runner has been added
|
|
187
|
+
// (scripts/calibration-reputable-pseo.ts); these overrides will be
|
|
188
|
+
// tightened or loosened based on actual fire-rates measured against
|
|
189
|
+
// sites that demonstrably win in production. See
|
|
190
|
+
// docs/superpowers/specs/2026-05-03-calibration-against-reputable-pseo.md.
|
|
191
|
+
severityOverrides: {
|
|
192
|
+
// Catalog pages are tables, not prose. AEO rules calibrated on
|
|
193
|
+
// editorial content over-fire here.
|
|
194
|
+
"aeo/citable-facts": "info",
|
|
195
|
+
"aeo/answer-first": "info",
|
|
196
|
+
"aeo/content-modularity": "info",
|
|
197
|
+
// 2026-05-03 calibration: freshness-signals fired on every page of
|
|
198
|
+
// every reputable pSEO site. Catalog freshness is expressed via the
|
|
199
|
+
// data (live currency rates, current job listings, current pricing),
|
|
200
|
+
// not via visible "last updated" stamps. Demote.
|
|
201
|
+
"aeo/freshness-signals": "info",
|
|
202
|
+
// Authorship lives at the platform level (operator's about page),
|
|
203
|
+
// not on every catalog record. Following the rule's "add a byline"
|
|
204
|
+
// fix on a Zillow listing would actively make the page worse.
|
|
205
|
+
"content/missing-author": "info",
|
|
206
|
+
"content/eeat-signals": "info",
|
|
207
|
+
// Template uniformity is correct for catalogs by design. Keep the
|
|
208
|
+
// signal but cap at warning — never error.
|
|
209
|
+
"spam/template-diversity": "warning",
|
|
210
|
+
// 2026-05-03 v0.5.2 round 10: same catalog logic as small-marketing.
|
|
211
|
+
"tech/og-completeness": "info",
|
|
212
|
+
"content/heading-structure": "info",
|
|
213
|
+
"content/image-alt-text": "info",
|
|
214
|
+
// 2026-05-03 calibration round 2: catalogs are near-duplicate by
|
|
215
|
+
// design. spam/near-duplicate fires CRITICAL on every catalog pair.
|
|
216
|
+
// Demote to warning — keeps the signal visible without dominating
|
|
217
|
+
// the score.
|
|
218
|
+
"spam/near-duplicate": "warning",
|
|
219
|
+
// 2026-05-03 calibration round 5: catalog records are by-design
|
|
220
|
+
// shorter than the 300-word default. Demote to info on programmatic-
|
|
221
|
+
// directory; the data IS the content.
|
|
222
|
+
"spam/thin-content": "info",
|
|
223
|
+
// 2026-05-03 calibration round 2: doorway-pattern fires CRITICAL on
|
|
224
|
+
// every (thin + entity-swap) pair. On Segment integrations, integration
|
|
225
|
+
// pages are thin (200-300 words is the right amount for a directory
|
|
226
|
+
// record) and entity-swap (slack/google-sheets, slack/airtable, …) by
|
|
227
|
+
// design. The composite signal is genuinely true but the *intent*
|
|
228
|
+
// (doorway funnel) doesn't match the reality (catalog record).
|
|
229
|
+
// Demoting to warning preserves the signal without tanking the score.
|
|
230
|
+
"spam/doorway-pattern": "warning",
|
|
231
|
+
// 2026-05-03 calibration round 4: catalog pages share template chrome
|
|
232
|
+
// by design — same as `spam/template-diversity`, this signal is
|
|
233
|
+
// structurally true on programmatic-directories.
|
|
234
|
+
"spam/boilerplate-ratio": "warning",
|
|
235
|
+
},
|
|
236
|
+
confidenceOverrides: {
|
|
237
|
+
"aeo/citable-facts": "low",
|
|
238
|
+
"aeo/answer-first": "low",
|
|
239
|
+
"aeo/content-modularity": "low",
|
|
240
|
+
"aeo/freshness-signals": "low",
|
|
241
|
+
"content/missing-author": "low",
|
|
242
|
+
"content/eeat-signals": "low",
|
|
243
|
+
"spam/template-diversity": "medium",
|
|
244
|
+
"spam/near-duplicate": "medium",
|
|
245
|
+
"spam/doorway-pattern": "medium",
|
|
246
|
+
"spam/boilerplate-ratio": "medium",
|
|
247
|
+
"spam/thin-content": "low",
|
|
248
|
+
"tech/og-completeness": "low",
|
|
249
|
+
"content/heading-structure": "low",
|
|
250
|
+
"content/image-alt-text": "low",
|
|
251
|
+
},
|
|
123
252
|
},
|
|
124
253
|
"ecommerce": {
|
|
125
254
|
categoryWeights: { integrity: 0.20, discoverability: 0.40, citation: 0.15, data: 0.25, audit: 0 },
|
|
@@ -146,8 +275,75 @@ const SCORING_PROFILES = {
|
|
|
146
275
|
},
|
|
147
276
|
"unclear": {
|
|
148
277
|
categoryWeights: { integrity: 0.50, discoverability: 0.20, citation: 0.25, data: 0.05, audit: 0 },
|
|
149
|
-
|
|
150
|
-
|
|
278
|
+
// 2026-05-03 calibration round 2: the original "stay strict when unsure"
|
|
279
|
+
// intent meant that 4 of 5 reputable pSEO sites that classified as
|
|
280
|
+
// unclear (Zapier integrations, Typeform templates, Jasper templates,
|
|
281
|
+
// Numbeo cost-of-living) failed their verdict ceiling. The dominant
|
|
282
|
+
// driver was always `aeo/citable-facts` at full error severity — but
|
|
283
|
+
// catalog/template-gallery pages don't have prose, so the rule fires
|
|
284
|
+
// for a STRUCTURAL reason (page is a table, not a paragraph), not a
|
|
285
|
+
// QUALITY reason. Demoting the structurally-incompatible rules to
|
|
286
|
+
// info on `unclear` is conservative:
|
|
287
|
+
// - if site is genuinely editorial and got mis-classified, signals
|
|
288
|
+
// still surface (just info, not error) — author can act on them.
|
|
289
|
+
// - if site is catalog and got mis-classified to unclear, verdict
|
|
290
|
+
// no longer falsely tanks.
|
|
291
|
+
// As of round 2, real spam signals (near-dup, doorway, thin) kept their severity; calibration rounds 3-5 (below) later demoted them on `unclear` as well.
|
|
292
|
+
severityOverrides: {
|
|
293
|
+
"aeo/citable-facts": "info",
|
|
294
|
+
"aeo/answer-first": "info",
|
|
295
|
+
"aeo/content-modularity": "info",
|
|
296
|
+
"aeo/freshness-signals": "info",
|
|
297
|
+
"content/missing-author": "info",
|
|
298
|
+
"content/eeat-signals": "info",
|
|
299
|
+
// 2026-05-03 calibration round 3: Airbyte classified as unclear@0.5
|
|
300
|
+
// and scored concerning despite all info-severity findings in the
|
|
301
|
+
// top 5. The 8 critical "blockers" came from spam/near-duplicate,
|
|
302
|
+
// spam/entity-swap, spam/doorway-pattern firing 1-2× each on its
|
|
303
|
+
// connectors directory — invisible per-rule but cumulatively pushing
|
|
304
|
+
// the score over 'caution'. On unclear sites we cannot tell whether
|
|
305
|
+
// these triple-fires represent a real doorway or a catalog; the
|
|
306
|
+
// calibration corpus shows reputable catalogs hitting them more
|
|
307
|
+
// often than real doorways do. Demote to warning — keeps the signal
|
|
308
|
+
// visible (it appears in shouldFix bucket, with full message) without
|
|
309
|
+
// tanking the verdict on a structurally-ambiguous site.
|
|
310
|
+
"spam/near-duplicate": "warning",
|
|
311
|
+
"spam/entity-swap": "warning",
|
|
312
|
+
"spam/doorway-pattern": "warning",
|
|
313
|
+
// 2026-05-03 calibration round 4: same boilerplate logic on unclear —
|
|
314
|
+
// we can't tell whether the site is a marketing site (boilerplate IS
|
|
315
|
+
// a quality issue) or a catalog (it isn't), so demote conservatively.
|
|
316
|
+
"spam/boilerplate-ratio": "warning",
|
|
317
|
+
// 2026-05-03 calibration round 5: same thin-content logic on unclear.
|
|
318
|
+
// Catalog-shape sites that classify as unclear (Zapier, Typeform,
|
|
319
|
+
// Jasper) had thin-content firing at error on the 5-15% of pages
|
|
320
|
+
// shorter than the 300-word default. Demote to info — surfaces the
|
|
321
|
+
// signal without driving the verdict on a structurally-ambiguous site.
|
|
322
|
+
"spam/thin-content": "info",
|
|
323
|
+
// 2026-05-03 v0.5.2 round 10: same demotions as programmatic-
|
|
324
|
+
// directory profile — these tipped Webflow/Zapier/Numbeo/Airbyte
|
|
325
|
+
// back into concerning territory because they classify as unclear
|
|
326
|
+
// and the new rules aren't yet calibrated for catalog shape.
|
|
327
|
+
"tech/og-completeness": "info",
|
|
328
|
+
"content/heading-structure": "info",
|
|
329
|
+
"content/image-alt-text": "info",
|
|
330
|
+
},
|
|
331
|
+
confidenceOverrides: {
|
|
332
|
+
"aeo/citable-facts": "low",
|
|
333
|
+
"aeo/answer-first": "low",
|
|
334
|
+
"aeo/content-modularity": "low",
|
|
335
|
+
"aeo/freshness-signals": "low",
|
|
336
|
+
"content/missing-author": "low",
|
|
337
|
+
"content/eeat-signals": "low",
|
|
338
|
+
"spam/near-duplicate": "medium",
|
|
339
|
+
"spam/entity-swap": "medium",
|
|
340
|
+
"spam/doorway-pattern": "medium",
|
|
341
|
+
"spam/boilerplate-ratio": "medium",
|
|
342
|
+
"spam/thin-content": "low",
|
|
343
|
+
"tech/og-completeness": "low",
|
|
344
|
+
"content/heading-structure": "low",
|
|
345
|
+
"content/image-alt-text": "low",
|
|
346
|
+
},
|
|
151
347
|
},
|
|
152
348
|
};
|
|
153
349
|
/**
|
|
@@ -175,6 +371,10 @@ const RULE_IMPACTS = {
|
|
|
175
371
|
"content/meta-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 40 },
|
|
176
372
|
"content/missing-author": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
|
|
177
373
|
"content/eeat-signals": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
|
|
374
|
+
// 2026-05-03 v0.5.2 blind-spot fixes
|
|
375
|
+
"content/title-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 25 }, // 2026-05-03 round 11: title is high-impact but the original 50-cap was disproportionate to other content rules and tipped Typeform into critical on a 6-finding cluster. Keep the rule at native error severity (duplicate titles ARE real bugs); just don't let one rule dominate the integrity bucket.
|
|
376
|
+
"content/heading-structure": { baseImpact: 5, perInstance: 1, maxImpact: 20 },
|
|
377
|
+
"content/image-alt-text": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
178
378
|
// Tech — softened in v0.4.3-rc2 after dogfood showed nextjs.org regressing
|
|
179
379
|
// from ready→caution on tech/canonical-consistency × 4 (legit cross-domain
|
|
180
380
|
// canonicals on a CDN). Per-instance now 1 (was 3).
|
|
@@ -191,6 +391,7 @@ const RULE_IMPACTS = {
|
|
|
191
391
|
// stripe.com from a single missing reciprocal pair — that should not be
|
|
192
392
|
// treated as 350× the impact.
|
|
193
393
|
"tech/hreflang-consistency": { baseImpact: 5, perInstance: 0, maxImpact: 5 },
|
|
394
|
+
"tech/og-completeness": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
|
|
194
395
|
// Links
|
|
195
396
|
"links/orphan-pages": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
|
|
196
397
|
"links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
@@ -242,6 +443,39 @@ function verdictForRisk(risk) {
|
|
|
242
443
|
return "concerning";
|
|
243
444
|
return "critical";
|
|
244
445
|
}
|
|
446
|
+
/**
 * Verdict tiers ordered from most lenient (index 0) to most severe.
 * `shiftVerdictForAuthority` moves along this list by at most one position.
 */
const VERDICT_LADDER = ["ready", "caution", "concerning", "critical"];
/**
 * Apply the bring-your-own-authority shift (v0.5.2) to a user-facing verdict.
 * Only the verdict label moves; this function never sees or changes the raw
 * risk number.
 *
 *   authorityScore >= 80  → one tier more lenient
 *                           (critical → concerning → caution → ready; ready stays ready)
 *   authorityScore <= 30  → one tier stricter
 *                           (ready → caution → concerning → critical; critical stays critical)
 *   anything in between   → unchanged
 *
 * The verdict is also returned unchanged when `authorityScore` is not a
 * finite number (undefined, null, NaN, strings, ...), falls outside 0..100,
 * or when `verdict` is not one of the ladder entries.
 *
 * @param {string} verdict - Current verdict label.
 * @param {number} [authorityScore] - Authority score on a 0..100 scale.
 * @returns {string} The shifted verdict, or `verdict` itself when no shift applies.
 */
function shiftVerdictForAuthority(verdict, authorityScore) {
    // Number.isFinite is false for every non-number (including undefined),
    // so a single check covers both "missing" and "garbage" scores.
    const scoreUsable = Number.isFinite(authorityScore) &&
        authorityScore >= 0 &&
        authorityScore <= 100;
    if (!scoreUsable) {
        return verdict;
    }
    const rung = VERDICT_LADDER.indexOf(verdict);
    if (rung === -1) {
        return verdict;
    }
    let step = 0;
    if (authorityScore >= 80) {
        step = -1;
    }
    else if (authorityScore <= 30) {
        step = 1;
    }
    // Clamp so the top and bottom tiers stay put instead of falling off the ladder.
    const lastRung = VERDICT_LADDER.length - 1;
    const target = Math.min(lastRung, Math.max(0, rung + step));
    return VERDICT_LADDER[target];
}
|
|
245
479
|
function gradeForPenalty(penalty) {
|
|
246
480
|
if (penalty <= 20)
|
|
247
481
|
return "A";
|
|
@@ -303,7 +537,15 @@ function runRulesOnPages(pages,
|
|
|
303
537
|
* `respectNoindex: true` would hide noindex'd pages from the very rules
|
|
304
538
|
* designed to flag accidental noindex'ing.
|
|
305
539
|
*/
|
|
306
|
-
noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full"
|
|
540
|
+
noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full",
|
|
541
|
+
/**
|
|
542
|
+
* 2026-05-03 calibration credibility fix: signals that the audit is
|
|
543
|
+
* running on a sampled subset of the discovered URLs. Rules whose
|
|
544
|
+
* outputs depend on a complete link graph (`links/unreachable-from-
|
|
545
|
+
* root`) skip their checks when this is true to avoid sampling-
|
|
546
|
+
* artifact false positives.
|
|
547
|
+
*/
|
|
548
|
+
sampled = false) {
|
|
307
549
|
const findings = [];
|
|
308
550
|
const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
|
|
309
551
|
const tag = (results) => results.map((r) => {
|
|
@@ -332,7 +574,7 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
332
574
|
findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
|
|
333
575
|
}
|
|
334
576
|
if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
|
|
335
|
-
findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay)));
|
|
577
|
+
findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
|
|
336
578
|
}
|
|
337
579
|
if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
|
|
338
580
|
findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
|
|
@@ -356,6 +598,17 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
356
598
|
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
357
599
|
findings.push(...tag(eeatSignalsRule(pages)));
|
|
358
600
|
}
|
|
601
|
+
// 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
|
|
602
|
+
// structure + image alt-text were tier-1 gaps in the blind-spot audit.
|
|
603
|
+
if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
|
|
604
|
+
findings.push(...tag(titleUniquenessRule(pages)));
|
|
605
|
+
}
|
|
606
|
+
if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
|
|
607
|
+
findings.push(...tag(headingStructureRule(pages)));
|
|
608
|
+
}
|
|
609
|
+
if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
|
|
610
|
+
findings.push(...tag(imageAltTextRule(pages)));
|
|
611
|
+
}
|
|
359
612
|
// Link rules — use the global link graph
|
|
360
613
|
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
361
614
|
findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
@@ -365,12 +618,15 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
365
618
|
}
|
|
366
619
|
if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
|
|
367
620
|
if (rootUrl) {
|
|
368
|
-
findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound)));
|
|
621
|
+
findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
|
|
369
622
|
}
|
|
370
623
|
}
|
|
371
624
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
372
625
|
findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
|
|
373
626
|
}
|
|
627
|
+
if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
|
|
628
|
+
findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
|
|
629
|
+
}
|
|
374
630
|
// Tech rules
|
|
375
631
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
376
632
|
findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
@@ -392,6 +648,11 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
392
648
|
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
393
649
|
findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
|
|
394
650
|
}
|
|
651
|
+
// 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
|
|
652
|
+
// the v0.4.x README without ever shipping. Now it does.
|
|
653
|
+
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
654
|
+
findings.push(...tag(ogCompletenessRule(pages)));
|
|
655
|
+
}
|
|
395
656
|
// Schema rules
|
|
396
657
|
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
397
658
|
findings.push(...tag(jsonLdValidRule(pages)));
|
|
@@ -471,6 +732,26 @@ export function applyScoringProfileOverrides(findings, classification) {
|
|
|
471
732
|
};
|
|
472
733
|
});
|
|
473
734
|
}
|
|
735
|
+
/**
 * 2026-05-03 credibility: report which of the profile's severity overrides
 * are relevant to this audit's findings.
 *
 * `profile.severityOverrides` is the static per-profile table; this returns
 * only the rule IDs from that table that also appear on at least one finding.
 * The check is by rule ID alone. It does not compare a finding's severity
 * against the override value. Per the original note the result is surfaced
 * as `summary.appliedSeverityDemotions` so formatters can tell the user which
 * rules the engine demoted for the detected site type.
 *
 * @param {Array<{ruleId: string}>} findings - Findings produced by the audit.
 * @param {*} classification - Site classification, passed through to `profileFor`.
 * @returns {string[]} De-duplicated rule IDs in default sort order; empty
 *   when the profile defines no overrides.
 */
function computeAppliedDemotions(findings, classification) {
    const overrides = profileFor(classification).severityOverrides;
    if (Object.keys(overrides).length === 0) {
        return [];
    }
    const demotedRuleIds = new Set();
    for (const { ruleId } of findings) {
        if (overrides[ruleId] === undefined) {
            continue;
        }
        demotedRuleIds.add(ruleId);
    }
    return [...demotedRuleIds].sort();
}
|
|
474
755
|
/**
|
|
475
756
|
* v0.4.3 — confidence-and-count-aware scoring. Replaces the v0.4 model that
|
|
476
757
|
* counted only severity. Each rule has a `baseImpact + (count - 1) *
|
|
@@ -525,6 +806,24 @@ function scoreFromFindings(findings, classification) {
|
|
|
525
806
|
arr.push(finding);
|
|
526
807
|
groups.set(finding.ruleId, arr);
|
|
527
808
|
}
|
|
809
|
+
// 2026-05-03 calibration credibility fix: track info-severity vs
|
|
810
|
+
// non-info contributions to each bucket separately so a flood of info
|
|
811
|
+
// findings can't fill the bucket cap and tank the verdict on its own.
|
|
812
|
+
// Round 7 surfaced this on Airbyte and round 8 on Zapier — both had
|
|
813
|
+
// ALL info-severity findings in their top drivers yet scored
|
|
814
|
+
// `concerning` because cumulative info impact filled the citation
|
|
815
|
+
// bucket past its 100 cap. Now: info contribution per bucket caps at
|
|
816
|
+
// 50; warning+ contribution caps at 100; final bucket = sum, capped
|
|
817
|
+
// at 100. A site with no real warning/error findings can score at
|
|
818
|
+
// most ~12.5 risk from info accumulation at typical 0.25 citation
|
|
819
|
+
// weight — which keeps verdict aligned with the visible severity in
|
|
820
|
+
// the report.
|
|
821
|
+
const bucketInfoOnly = {
|
|
822
|
+
integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
|
|
823
|
+
};
|
|
824
|
+
const bucketNonInfo = {
|
|
825
|
+
integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
|
|
826
|
+
};
|
|
528
827
|
for (const [ruleId, group] of groups) {
|
|
529
828
|
const namespace = ruleId.split("/")[0];
|
|
530
829
|
const bucket = CATEGORY_MAP[namespace];
|
|
@@ -548,7 +847,22 @@ function scoreFromFindings(findings, classification) {
|
|
|
548
847
|
if (bestMultiplier === 0)
|
|
549
848
|
bestMultiplier = CONFIDENCE_MULTIPLIER.high;
|
|
550
849
|
const weighted = cappedImpact * bestMultiplier;
|
|
551
|
-
|
|
850
|
+
// Bucket the rule's contribution by the highest severity in the group.
|
|
851
|
+
// Mixed-severity groups (e.g. error + info) count toward non-info — once
|
|
852
|
+
// a rule has any non-info finding, its count contribution is treated as
|
|
853
|
+
// a real-issue signal, not info accumulation.
|
|
854
|
+
const isInfoOnly = group.every((f) => f.severity === "info");
|
|
855
|
+
if (isInfoOnly) {
|
|
856
|
+
bucketInfoOnly[bucket] += weighted;
|
|
857
|
+
}
|
|
858
|
+
else {
|
|
859
|
+
bucketNonInfo[bucket] += weighted;
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
for (const key of ["integrity", "discoverability", "citation", "data"]) {
|
|
863
|
+
const info = Math.min(50, bucketInfoOnly[key]);
|
|
864
|
+
const nonInfo = Math.min(100, bucketNonInfo[key]);
|
|
865
|
+
bucketRaw[key] = Math.min(100, info + nonInfo);
|
|
552
866
|
}
|
|
553
867
|
const cw = profile.categoryWeights;
|
|
554
868
|
const weighted = bucketRaw.integrity * cw.integrity +
|
|
@@ -716,6 +1030,25 @@ function parseSitemapUrls(xml) {
|
|
|
716
1030
|
const matches = Array.from(xml.matchAll(/<loc>\s*([^<\s]+)\s*<\/loc>/gi));
|
|
717
1031
|
return matches.map((match) => match[1]).filter(Boolean);
|
|
718
1032
|
}
|
|
1033
|
+
/** The five predefined XML entities, keyed by name. */
const SITEMAP_XML_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
/**
 * Turn the raw text content of a sitemap element into its actual value:
 * trims whitespace, unwraps a single `<![CDATA[...]]>` section, and otherwise
 * decodes predefined and numeric XML character references in ONE pass (so
 * `&amp;amp;` becomes `&amp;`, not `&`). Unknown or invalid references are
 * left untouched.
 *
 * @param {string} raw - Text captured between an element's open/close tags.
 * @returns {string} Decoded, trimmed value.
 */
function decodeSitemapText(raw) {
    const trimmed = raw.trim();
    const cdata = trimmed.match(/^<!\[CDATA\[([\s\S]*?)\]\]>$/);
    if (cdata) {
        // CDATA content is literal — no entity decoding inside it.
        return cdata[1].trim();
    }
    return trimmed.replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, ref) => {
        if (ref[0] !== "#") {
            return SITEMAP_XML_ENTITIES[ref];
        }
        const isHex = ref[1] === "x" || ref[1] === "X";
        const codePoint = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws outside the Unicode range; keep the raw text instead.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
}
/**
 * Extract `{ url, lastmod }` entries from sitemap XML.
 *
 * Handles both `<url>` blocks (inside `<urlset>`) and `<sitemap>` blocks
 * (inside `<sitemapindex>`); each carries a `<loc>` and an optional
 * `<lastmod>`. Blocks with no `<loc>` or an empty one are skipped.
 *
 * The sitemap protocol requires values to be XML entity-escaped, so `<loc>`
 * and `<lastmod>` text is unescaped (and CDATA-unwrapped) before being
 * returned — e.g. `?a=1&amp;b=2` yields the fetchable `?a=1&b=2`.
 *
 * @param {string} xml - Raw sitemap or sitemap-index document.
 * @returns {Array<{url: string, lastmod: string | undefined}>} Entries in
 *   document order; `lastmod` is `undefined` when the element is absent.
 */
export function parseSitemapUrlsWithLastmod(xml) {
    const out = [];
    // `\b` keeps `<url` from matching `<urlset` and `<sitemap` from matching `<sitemapindex`.
    const blocks = xml.matchAll(/<(url|sitemap)\b[^>]*>([\s\S]*?)<\/\1>/gi);
    for (const block of blocks) {
        const inner = block[2] ?? "";
        const locMatch = inner.match(/<loc\b[^>]*>([\s\S]*?)<\/loc>/i);
        if (!locMatch)
            continue;
        const url = decodeSitemapText(locMatch[1]);
        if (!url)
            continue;
        const lastmodMatch = inner.match(/<lastmod\b[^>]*>([\s\S]*?)<\/lastmod>/i);
        const lastmod = lastmodMatch ? decodeSitemapText(lastmodMatch[1]) : undefined;
        out.push({ url, lastmod });
    }
    return out;
}
|
|
719
1052
|
function looksLikeSitemap(text) {
|
|
720
1053
|
const lowered = text.toLowerCase();
|
|
721
1054
|
return lowered.includes("<urlset") || lowered.includes("<sitemapindex");
|
|
@@ -784,22 +1117,32 @@ function shouldIgnore(url, patterns) {
|
|
|
784
1117
|
}
|
|
785
1118
|
return false;
|
|
786
1119
|
}
|
|
787
|
-
/**
 * Draw up to `n` items uniformly at random, without replacement, using a
 * partial Fisher–Yates shuffle: only the last `n` slots of a copy are
 * shuffled, then that tail is returned. The input is never mutated.
 *
 * `n` is clamped to `[0, items.length]`. Without the clamp an oversized `n`
 * made `arr.length - n` negative, and `slice` with a negative start counts
 * from the END — so asking for 5 of 3 items returned only 2.
 *
 * @param {Iterable<*>} items - Population to sample from.
 * @param {number} n - Requested sample size.
 * @param {() => number} [random=Math.random] - Source of floats in [0, 1);
 *   pass a seeded PRNG for reproducible samples.
 * @returns {Array<*>} A new array with `min(n, items.length)` elements.
 */
function fisherYatesSample(items, n, random = Math.random) {
    const arr = [...items];
    const count = Math.max(0, Math.min(n, arr.length));
    // Walk from the end; stop once `count` tail slots have been filled.
    for (let i = arr.length - 1; i > 0 && arr.length - i <= count; i -= 1) {
        const j = Math.floor(random() * (i + 1));
        [arr[i], arr[j]] = [arr[j], arr[i]];
    }
    return arr.slice(arr.length - count);
}
|
|
795
1128
|
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
|
|
796
1129
|
visited.add(sitemapUrl);
|
|
797
|
-
const
|
|
1130
|
+
const entries = parseSitemapUrlsWithLastmod(sitemapText);
|
|
798
1131
|
if (!isSitemapIndex(sitemapText)) {
|
|
799
|
-
|
|
1132
|
+
const urls = [];
|
|
1133
|
+
const lastmodByUrl = new Map();
|
|
1134
|
+
for (const entry of entries) {
|
|
1135
|
+
urls.push(entry.url);
|
|
1136
|
+
if (entry.lastmod !== undefined) {
|
|
1137
|
+
lastmodByUrl.set(entry.url, entry.lastmod);
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
return { urls, lastmodByUrl };
|
|
800
1141
|
}
|
|
801
1142
|
const allUrls = [];
|
|
802
|
-
|
|
1143
|
+
const allLastmodByUrl = new Map();
|
|
1144
|
+
for (const entry of entries) {
|
|
1145
|
+
const childUrl = entry.url;
|
|
803
1146
|
if (signal?.aborted)
|
|
804
1147
|
throw signal.reason ?? new Error("aborted");
|
|
805
1148
|
if (visited.has(childUrl))
|
|
@@ -810,10 +1153,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
810
1153
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
811
1154
|
if (!childLike)
|
|
812
1155
|
continue;
|
|
813
|
-
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1156
|
+
const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
814
1157
|
allUrls.push(...childUrls);
|
|
1158
|
+
for (const [u, lm] of childLastmodByUrl) {
|
|
1159
|
+
allLastmodByUrl.set(u, lm);
|
|
1160
|
+
}
|
|
815
1161
|
}
|
|
816
|
-
return allUrls;
|
|
1162
|
+
return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
|
|
817
1163
|
}
|
|
818
1164
|
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
819
1165
|
if (!origin)
|
|
@@ -846,7 +1192,7 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
846
1192
|
function budgetExceeded(b) {
|
|
847
1193
|
return b.cap > 0 && b.used >= b.cap;
|
|
848
1194
|
}
|
|
849
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
|
|
1195
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
|
|
850
1196
|
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
851
1197
|
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
852
1198
|
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
@@ -906,11 +1252,33 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
906
1252
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
907
1253
|
if (isXml) {
|
|
908
1254
|
const visited = new Set();
|
|
909
|
-
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1255
|
+
const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
910
1256
|
// If we have a budget, sample from sitemap URLs before fetching
|
|
911
|
-
const
|
|
1257
|
+
const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
912
1258
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
913
1259
|
: allSitemapUrls;
|
|
1260
|
+
// v0.5: change-driven monitoring. Apply the decision matrix BEFORE
|
|
1261
|
+
// fetching bodies. URLs in plan.skip are not network-touched at all —
|
|
1262
|
+
// their findings will be carried forward from prior state by the caller.
|
|
1263
|
+
// This is the whole point of monitoring mode: rule eval is microseconds,
|
|
1264
|
+
// the fetch is seconds; move the skip decision upstream of the fetch.
|
|
1265
|
+
let scrapePlan;
|
|
1266
|
+
let urlsToFetch;
|
|
1267
|
+
if (monitoringContext) {
|
|
1268
|
+
scrapePlan = planScrapeStrategy({
|
|
1269
|
+
candidateUrls: sampledUrls,
|
|
1270
|
+
priorState: monitoringContext.priorState,
|
|
1271
|
+
sitemapLastmodByUrl,
|
|
1272
|
+
currentRulesetVersion: monitoringContext.currentRulesetVersion,
|
|
1273
|
+
ageFloorDays: monitoringContext.ageFloorDays,
|
|
1274
|
+
now: monitoringContext.now,
|
|
1275
|
+
forceRefetchUrls: monitoringContext.forceRefetchUrls,
|
|
1276
|
+
});
|
|
1277
|
+
urlsToFetch = Array.from(scrapePlan.refetch.keys());
|
|
1278
|
+
}
|
|
1279
|
+
else {
|
|
1280
|
+
urlsToFetch = sampledUrls;
|
|
1281
|
+
}
|
|
914
1282
|
const pages = [];
|
|
915
1283
|
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
916
1284
|
const sourceOrigin = (() => { try {
|
|
@@ -1017,7 +1385,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1017
1385
|
});
|
|
1018
1386
|
}
|
|
1019
1387
|
}
|
|
1020
|
-
return { pages, sitemapUrls: new Set(allSitemapUrls), discoveredUrlCount: allSitemapUrls.length };
|
|
1388
|
+
return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
|
|
1021
1389
|
}
|
|
1022
1390
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
1023
1391
|
const initialPage = { url: source, html: text };
|
|
@@ -1158,12 +1526,26 @@ export async function auditSource(source, options) {
|
|
|
1158
1526
|
let backpressureError = null;
|
|
1159
1527
|
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
1160
1528
|
const observer = new FetchObserver();
|
|
1529
|
+
// 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
|
|
1530
|
+
// gate aborted 4 of 12 reputable-pSEO audits on what was normal load
|
|
1531
|
+
// variance — Zapier at p95=576ms (2.4× a 236ms baseline), Webflow at
|
|
1532
|
+
// p95=1808ms (2.2× 833ms), Airbyte at p95=1288ms (3.4× 380ms). For real
|
|
1533
|
+
// production CDNs these spikes are noise, not degradation. Raise the
|
|
1534
|
+
// gate so it still catches truly broken origins (sustained 4× slowdown
|
|
1535
|
+
// OR p95 above 8s) without tripping on normal audit-induced load.
|
|
1161
1536
|
const monitor = backpressureEnabled
|
|
1162
1537
|
? new BackpressureMonitor({
|
|
1163
1538
|
warmupSize: 10,
|
|
1164
|
-
absoluteP95Ms:
|
|
1165
|
-
baselineMultiplier:
|
|
1166
|
-
|
|
1539
|
+
absoluteP95Ms: 8000,
|
|
1540
|
+
baselineMultiplier: 4,
|
|
1541
|
+
// 2026-05-03 production fix: 0.1 (10%) was tripping pseolint.dev
|
|
1542
|
+
// audits on real production sites that legitimately return ~10% 5xx
|
|
1543
|
+
// (transient errors, async page renderers warming up, sites in
|
|
1544
|
+
// canary). Combined with the `>=` comparison bug (also fixed),
|
|
1545
|
+
// this aborted every web-app audit. 0.15 keeps the gate honest —
|
|
1546
|
+
// a sustained 15%+ 5xx rate is a real problem, not noise — while
|
|
1547
|
+
// letting transient errors not bring down the whole audit.
|
|
1548
|
+
errorRatioThreshold: 0.15,
|
|
1167
1549
|
})
|
|
1168
1550
|
: null;
|
|
1169
1551
|
// v0.4: framework gets set on the first observation that carries headers
|
|
@@ -1195,6 +1577,8 @@ export async function auditSource(source, options) {
|
|
|
1195
1577
|
entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
|
|
1196
1578
|
thinContentMinWords: options?.rules?.thinContentMinWords ?? DEFAULTS.thinContentMinWords,
|
|
1197
1579
|
publicationVelocityMaxPerDay: options?.rules?.publicationVelocityMaxPerDay ?? DEFAULTS.publicationVelocityMaxPerDay,
|
|
1580
|
+
publicationVelocityMaxPerDayCorpusFraction: options?.rules?.publicationVelocityMaxPerDayCorpusFraction
|
|
1581
|
+
?? DEFAULTS.publicationVelocityMaxPerDayCorpusFraction,
|
|
1198
1582
|
boilerplateMaxRatio: options?.rules?.boilerplateMaxRatio ?? DEFAULTS.boilerplateMaxRatio,
|
|
1199
1583
|
templateDiversityMinUniqueRatio: options?.rules?.templateDiversityMinUniqueRatio ?? DEFAULTS.templateDiversityMinUniqueRatio,
|
|
1200
1584
|
uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
|
|
@@ -1236,7 +1620,65 @@ export async function auditSource(source, options) {
|
|
|
1236
1620
|
const fetchByteBudget = { used: 0, cap: maxFetchBytes };
|
|
1237
1621
|
// v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
|
|
1238
1622
|
// of the normal source URL fetch. No separate probe needed.
|
|
1239
|
-
|
|
1623
|
+
// v0.5: read prior state BEFORE loadPagesFromSource so the change-driven
|
|
1624
|
+
// monitoring decision matrix can run pre-fetch and tell loadPagesFromSource
|
|
1625
|
+
// which URLs to actually fetch. Reading state is cheap; doing it here also
|
|
1626
|
+
// means we know `priorState` once for both the monitoring path and the
|
|
1627
|
+
// post-audit state-write path further down.
|
|
1628
|
+
let priorState = null;
|
|
1629
|
+
const skippedUrls = [];
|
|
1630
|
+
const currentRenderMode = options?.render ? "rendered" : "static";
|
|
1631
|
+
if (options?.state?.path || options?.state?.since || options?.state?.exitOnRegression || options?.state?.mode) {
|
|
1632
|
+
const statePath = options.state?.path ?? ".pseolint/state.json";
|
|
1633
|
+
priorState = await readState(statePath);
|
|
1634
|
+
if (priorState && priorState.renderMode !== currentRenderMode) {
|
|
1635
|
+
console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
|
|
1636
|
+
priorState = null;
|
|
1637
|
+
}
|
|
1638
|
+
}
|
|
1639
|
+
// Effective monitoring mode:
|
|
1640
|
+
// - explicit `state.mode` wins ("monitoring" or "fresh")
|
|
1641
|
+
// - else if `--since` is passed and prior state exists → "monitoring" (back-compat alias)
|
|
1642
|
+
// - else if prior state exists → "monitoring" (auto, v0.5 default)
|
|
1643
|
+
// - else → "fresh" (no prior state available)
|
|
1644
|
+
const explicitMode = options?.state?.mode;
|
|
1645
|
+
const effectiveMode = explicitMode ??
|
|
1646
|
+
(priorState ? "monitoring" : "fresh");
|
|
1647
|
+
// Build the monitoring context only for HTTP sources in monitoring mode with
|
|
1648
|
+
// prior state. Single-page HTML and filesystem sources skip this — they are
|
|
1649
|
+
// exempted from the strategy (a single-page audit has nothing to plan; local
|
|
1650
|
+
// reads are cheap so re-reading every file beats branch complexity).
|
|
1651
|
+
const isHttpSource = /^https?:\/\//i.test(source);
|
|
1652
|
+
// If the user asked for monitoring against a filesystem source, surface that
|
|
1653
|
+
// we're ignoring the request. Silent bypass leads to "why is my state file
|
|
1654
|
+
// not being used?" debugging. Only log when the user actively chose
|
|
1655
|
+
// monitoring (explicit --mode or --since) — auto-monitoring on prior state
|
|
1656
|
+
// existence is implicit and shouldn't warn.
|
|
1657
|
+
if (!isHttpSource && effectiveMode === "monitoring" && (options?.state?.mode === "monitoring" || options?.state?.since)) {
|
|
1658
|
+
console.error("warning: monitoring mode requested but source is a local file/directory; reading every HTML file (the matrix only applies to HTTP sources).");
|
|
1659
|
+
}
|
|
1660
|
+
const monitoringContext = effectiveMode === "monitoring" && priorState && isHttpSource
|
|
1661
|
+
? {
|
|
1662
|
+
priorState,
|
|
1663
|
+
currentRulesetVersion: CORE_RULESET_VERSION,
|
|
1664
|
+
ageFloorDays: options?.state?.ageFloorDays ?? DEFAULT_AGE_FLOOR_DAYS,
|
|
1665
|
+
now: new Date(),
|
|
1666
|
+
forceRefetchUrls: options?.force?.urls,
|
|
1667
|
+
}
|
|
1668
|
+
: null;
|
|
1669
|
+
if (!priorState && options?.state?.since) {
|
|
1670
|
+
console.error("no prior state found — performing full baseline audit");
|
|
1671
|
+
}
|
|
1672
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, sitemapLastmodByUrl, discoveredUrlCount, scrapePlan } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
|
|
1673
|
+
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
1674
|
+
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
1675
|
+
// (kept for back-compat with --since consumers); T7 will carry their prior
|
|
1676
|
+
// findings forward and T8 will surface the full plan in summary.scrapePlan.
|
|
1677
|
+
if (scrapePlan) {
|
|
1678
|
+
for (const url of scrapePlan.skip.keys()) {
|
|
1679
|
+
skippedUrls.push(url);
|
|
1680
|
+
}
|
|
1681
|
+
}
|
|
1240
1682
|
throwIfAborted();
|
|
1241
1683
|
const loadedPages = [...loadedPagesRaw];
|
|
1242
1684
|
// v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
|
|
@@ -1265,34 +1707,11 @@ export async function auditSource(source, options) {
|
|
|
1265
1707
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
1266
1708
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
1267
1709
|
}
|
|
1268
|
-
//
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
priorState = await readState(statePath);
|
|
1274
|
-
const currentRenderMode = options.render ? "rendered" : "static";
|
|
1275
|
-
if (priorState && priorState.renderMode !== currentRenderMode) {
|
|
1276
|
-
console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
|
|
1277
|
-
priorState = null;
|
|
1278
|
-
}
|
|
1279
|
-
if (priorState && options.state.since) {
|
|
1280
|
-
const kept = [];
|
|
1281
|
-
for (const p of loadedPages) {
|
|
1282
|
-
const prior = priorState.urls[p.url];
|
|
1283
|
-
if (prior && prior.contentHash === computeContentHash(p.html)) {
|
|
1284
|
-
skippedUrls.push(p.url);
|
|
1285
|
-
}
|
|
1286
|
-
else {
|
|
1287
|
-
kept.push(p);
|
|
1288
|
-
}
|
|
1289
|
-
}
|
|
1290
|
-
loadedPages.splice(0, loadedPages.length, ...kept);
|
|
1291
|
-
}
|
|
1292
|
-
else if (!priorState && options.state.since) {
|
|
1293
|
-
console.error("no prior state found — performing full baseline audit");
|
|
1294
|
-
}
|
|
1295
|
-
}
|
|
1710
|
+
// v0.5: prior state was loaded BEFORE loadPagesFromSource so the change-
|
|
1711
|
+
// driven monitoring decision matrix could run pre-fetch. URLs the matrix
|
|
1712
|
+
// marked as "skip" were never fetched and are recorded in skippedUrls
|
|
1713
|
+
// above. The old post-fetch contentHash skip is gone — the decision now
|
|
1714
|
+
// happens upstream of the network round-trip.
|
|
1296
1715
|
let robotsTxtContent = "";
|
|
1297
1716
|
if (/^https?:\/\//i.test(source)) {
|
|
1298
1717
|
try {
|
|
@@ -1330,14 +1749,22 @@ export async function auditSource(source, options) {
|
|
|
1330
1749
|
? deduped.filter((page) => !shouldIgnore(page.url, ignorePatterns))
|
|
1331
1750
|
: deduped;
|
|
1332
1751
|
const strategy = options?.samplingStrategy ?? "stratified";
|
|
1333
|
-
|
|
1752
|
+
// 2026-05-03 calibration credibility fix: when sampleSeed is set, use a
|
|
1753
|
+
// deterministic PRNG so repeated audits pick the same pages and the
|
|
1754
|
+
// verdict is reproducible. Without a seed, fall back to Math.random
|
|
1755
|
+
// (legacy behavior, kept for backward compatibility).
|
|
1756
|
+
const samplingRandom = options?.sampleSeed !== undefined
|
|
1757
|
+
? mulberry32(options.sampleSeed)
|
|
1758
|
+
: Math.random;
|
|
1759
|
+
const isSampledAudit = sampleSize > 0 && sampleSize < filtered.length;
|
|
1760
|
+
const sampled = isSampledAudit
|
|
1334
1761
|
? (strategy === "stratified"
|
|
1335
1762
|
? (() => {
|
|
1336
1763
|
const urlsMap = new Map(filtered.map(p => [p.url, p]));
|
|
1337
|
-
const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize);
|
|
1764
|
+
const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize, samplingRandom);
|
|
1338
1765
|
return sampledUrls.map(u => urlsMap.get(u));
|
|
1339
1766
|
})()
|
|
1340
|
-
: fisherYatesSample(filtered, sampleSize))
|
|
1767
|
+
: fisherYatesSample(filtered, sampleSize, samplingRandom))
|
|
1341
1768
|
: filtered;
|
|
1342
1769
|
const parsedPagesAll = sampled.map((page) => {
|
|
1343
1770
|
const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
|
|
@@ -1486,7 +1913,7 @@ export async function auditSource(source, options) {
|
|
|
1486
1913
|
continue;
|
|
1487
1914
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
1488
1915
|
const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
|
|
1489
|
-
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
|
|
1916
|
+
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full", isSampledAudit);
|
|
1490
1917
|
allFindings.push(...findings);
|
|
1491
1918
|
groupPageCounts[groupName] = groupPages.length;
|
|
1492
1919
|
// v0.4.3: per-group scoring uses the same site-classification profile so
|
|
@@ -1508,10 +1935,55 @@ export async function auditSource(source, options) {
|
|
|
1508
1935
|
// the enrichment output so every downstream consumer (summary.issues, AI
|
|
1509
1936
|
// triage input, telemetry, formatters) sees the corrected severity.
|
|
1510
1937
|
enriched.findings = applyScoringProfileOverrides(enriched.findings, siteClassification);
|
|
1938
|
+
// v0.5: change-driven monitoring carry-forward. URLs that the pre-fetch
|
|
1939
|
+
// strategy marked as "skip" were never fetched this run, so no rule produced
|
|
1940
|
+
// findings for them. Restore their findings from prior state, marked with
|
|
1941
|
+
// `carriedForward: true` and `lastVerifiedAt` so consumers can reason about
|
|
1942
|
+
// staleness. Inject after enrichment + overrides — these findings already
|
|
1943
|
+
// went through both in their original run; re-running enrichment would
|
|
1944
|
+
// strip their template / cluster assignments because parsedPages doesn't
|
|
1945
|
+
// contain the skipped pages.
|
|
1946
|
+
if (priorState && skippedUrls.length > 0) {
|
|
1947
|
+
for (const url of skippedUrls) {
|
|
1948
|
+
const prior = priorState.urls[url];
|
|
1949
|
+
if (!prior || prior.findings.length === 0)
|
|
1950
|
+
continue;
|
|
1951
|
+
for (const f of prior.findings) {
|
|
1952
|
+
const carried = {
|
|
1953
|
+
ruleId: f.ruleId,
|
|
1954
|
+
severity: f.severity,
|
|
1955
|
+
message: f.message,
|
|
1956
|
+
confidence: f.confidence,
|
|
1957
|
+
carriedForward: true,
|
|
1958
|
+
lastVerifiedAt: prior.fetchedAt,
|
|
1959
|
+
// State stores `url` but the engine type uses `pageUrl` — map back.
|
|
1960
|
+
pageUrl: typeof f.url === "string" ? f.url : url,
|
|
1961
|
+
};
|
|
1962
|
+
// Optional fields are preserved opportunistically when present in state.
|
|
1963
|
+
if (typeof f.fix === "string")
|
|
1964
|
+
carried.fix = f.fix;
|
|
1965
|
+
if (typeof f.ref === "string")
|
|
1966
|
+
carried.ref = f.ref;
|
|
1967
|
+
if (typeof f.docsUrl === "string")
|
|
1968
|
+
carried.docsUrl = f.docsUrl;
|
|
1969
|
+
if (Array.isArray(f.relatedUrls))
|
|
1970
|
+
carried.relatedUrls = f.relatedUrls;
|
|
1971
|
+
if (typeof f.group === "string")
|
|
1972
|
+
carried.group = f.group;
|
|
1973
|
+
if (typeof f.similarity === "number")
|
|
1974
|
+
carried.similarity = f.similarity;
|
|
1975
|
+
if (f.context !== undefined)
|
|
1976
|
+
carried.context = f.context;
|
|
1977
|
+
if (f.effort !== undefined)
|
|
1978
|
+
carried.effort = f.effort;
|
|
1979
|
+
enriched.findings.push(carried);
|
|
1980
|
+
}
|
|
1981
|
+
}
|
|
1982
|
+
}
|
|
1511
1983
|
const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings, siteClassification);
|
|
1512
1984
|
const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
|
|
1513
1985
|
const issues = bucketIssues(enriched.findings);
|
|
1514
|
-
const verdict = verdictForRisk(risk);
|
|
1986
|
+
const verdict = shiftVerdictForAuthority(verdictForRisk(risk), options?.authorityScore);
|
|
1515
1987
|
const headline = buildHeadline(bucketCounts);
|
|
1516
1988
|
// audit/* findings are diagnostic-only and never appear in summary.issues.
|
|
1517
1989
|
// Surface them under diagnostics so consumers (telemetry, debug UIs) can
|
|
@@ -1523,6 +1995,7 @@ export async function auditSource(source, options) {
|
|
|
1523
1995
|
fetched: parsedPages.length,
|
|
1524
1996
|
skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
|
|
1525
1997
|
};
|
|
1998
|
+
const appliedSeverityDemotions = computeAppliedDemotions(enriched.findings, siteClassification);
|
|
1526
1999
|
const summary = {
|
|
1527
2000
|
schemaVersion: SCHEMA_VERSION,
|
|
1528
2001
|
verdict,
|
|
@@ -1531,6 +2004,7 @@ export async function auditSource(source, options) {
|
|
|
1531
2004
|
categories,
|
|
1532
2005
|
issues,
|
|
1533
2006
|
siteClassification,
|
|
2007
|
+
appliedSeverityDemotions: appliedSeverityDemotions.length > 0 ? appliedSeverityDemotions : undefined,
|
|
1534
2008
|
diagnostics: {
|
|
1535
2009
|
originReadiness: readinessReport,
|
|
1536
2010
|
crawlStats,
|
|
@@ -1577,6 +2051,31 @@ export async function auditSource(source, options) {
|
|
|
1577
2051
|
if (allSkipped.length > 0) {
|
|
1578
2052
|
summary.skippedUrls = allSkipped;
|
|
1579
2053
|
}
|
|
2054
|
+
// v0.5+: surface the change-driven monitoring summary when this run was a
|
|
2055
|
+
// monitoring run (had prior state and didn't force --mode=fresh). Filesystem
|
|
2056
|
+
// sources don't get a scrapePlan because they bypass the matrix.
|
|
2057
|
+
if (effectiveMode === "monitoring" && priorState && scrapePlan) {
|
|
2058
|
+
const reasonCounts = {};
|
|
2059
|
+
for (const reason of scrapePlan.refetch.values()) {
|
|
2060
|
+
reasonCounts[reason] = (reasonCounts[reason] ?? 0) + 1;
|
|
2061
|
+
}
|
|
2062
|
+
for (const reason of scrapePlan.skip.values()) {
|
|
2063
|
+
reasonCounts[reason] = (reasonCounts[reason] ?? 0) + 1;
|
|
2064
|
+
}
|
|
2065
|
+
// `fetched` is the number of URLs whose bodies actually came back —
|
|
2066
|
+
// robots-disallowed, byte-budget-exceeded, content-type-filtered, and 4xx
|
|
2067
|
+
// URLs the matrix INTENDED to refetch may have dropped out before we got
|
|
2068
|
+
// here. `intended` (= scrapePlan.refetch.size) is exposed too so callers
|
|
2069
|
+
// can spot the gap (e.g. "intended 200, fetched 187, 13 URLs dropped").
|
|
2070
|
+
summary.scrapePlan = {
|
|
2071
|
+
fetched: loadedPages.length,
|
|
2072
|
+
intended: scrapePlan.refetch.size,
|
|
2073
|
+
carriedForward: scrapePlan.skip.size,
|
|
2074
|
+
reasonCounts,
|
|
2075
|
+
rulesetVersion: CORE_RULESET_VERSION,
|
|
2076
|
+
lastFullAuditAt: priorState.lastFullAuditAt ?? priorState.lastRun ?? null,
|
|
2077
|
+
};
|
|
2078
|
+
}
|
|
1580
2079
|
// v0.4.1: surface noindex / auth skips as a discoverable diagnostic so the
|
|
1581
2080
|
// user sees what the engine excluded. Catches the accidental-noindex bug:
|
|
1582
2081
|
// pages silently dropped from indexing show up as a visible skip line
|
|
@@ -1619,6 +2118,13 @@ export async function auditSource(source, options) {
|
|
|
1619
2118
|
for (const f of enrichedFindings) {
|
|
1620
2119
|
if (!f.pageUrl)
|
|
1621
2120
|
continue;
|
|
2121
|
+
// Carried-forward findings are not "current" — we did not re-verify them
|
|
2122
|
+
// this run. Including them would mask a genuine regression on a skipped
|
|
2123
|
+
// URL: prior set has rule X carried-forward, current set also has X
|
|
2124
|
+
// (carried-forward), comparison says "no new rule", we miss the case
|
|
2125
|
+
// where the page actually started failing rule Y too.
|
|
2126
|
+
if (f.carriedForward)
|
|
2127
|
+
continue;
|
|
1622
2128
|
const set = currentFindings.get(f.pageUrl) ?? new Set();
|
|
1623
2129
|
set.add(f.ruleId);
|
|
1624
2130
|
currentFindings.set(f.pageUrl, set);
|
|
@@ -1644,6 +2150,12 @@ export async function auditSource(source, options) {
|
|
|
1644
2150
|
const renderMode = options.render ? "rendered" : "static";
|
|
1645
2151
|
const urls = {};
|
|
1646
2152
|
const findingsByUrl = new Map();
|
|
2153
|
+
// v0.5+: persist full finding records per URL so future monitoring runs
|
|
2154
|
+
// can carry them forward when the URL is skipped pre-fetch. Carried-
|
|
2155
|
+
// forward findings (carriedForward=true) are NOT re-persisted under the
|
|
2156
|
+
// fetched URL — they belong to the prior entry that's preserved verbatim
|
|
2157
|
+
// for skipped URLs below.
|
|
2158
|
+
const fullFindingsByUrl = new Map();
|
|
1647
2159
|
for (const f of enrichedFindings) {
|
|
1648
2160
|
if (!f.pageUrl)
|
|
1649
2161
|
continue;
|
|
@@ -1651,9 +2163,16 @@ export async function auditSource(source, options) {
|
|
|
1651
2163
|
if (!list.includes(f.ruleId))
|
|
1652
2164
|
list.push(f.ruleId);
|
|
1653
2165
|
findingsByUrl.set(f.pageUrl, list);
|
|
2166
|
+
if (!f.carriedForward) {
|
|
2167
|
+
const records = fullFindingsByUrl.get(f.pageUrl) ?? [];
|
|
2168
|
+
records.push(f);
|
|
2169
|
+
fullFindingsByUrl.set(f.pageUrl, records);
|
|
2170
|
+
}
|
|
1654
2171
|
}
|
|
1655
|
-
// Preserve prior entries for URLs
|
|
1656
|
-
//
|
|
2172
|
+
// Preserve prior entries for URLs the monitoring matrix skipped (we never
|
|
2173
|
+
// fetched them this run; their fetchedAt MUST NOT advance or the age floor
|
|
2174
|
+
// never trips). Skipped URLs include those in scrapePlan.skip plus any
|
|
2175
|
+
// robots-skipped URLs from prior runs that are still in priorState.
|
|
1657
2176
|
if (priorState && skippedUrls.length > 0) {
|
|
1658
2177
|
for (const url of skippedUrls) {
|
|
1659
2178
|
const prior = priorState.urls[url];
|
|
@@ -1661,19 +2180,65 @@ export async function auditSource(source, options) {
|
|
|
1661
2180
|
urls[url] = prior;
|
|
1662
2181
|
}
|
|
1663
2182
|
}
|
|
2183
|
+
const nowIso = new Date().toISOString();
|
|
1664
2184
|
for (const p of loadedPages) {
|
|
1665
|
-
urls[p.url]
|
|
2185
|
+
const priorEntry = priorState?.urls[p.url];
|
|
2186
|
+
const responseHeaders = p.httpMeta?.headers;
|
|
2187
|
+
const lastModifiedHeader = responseHeaders?.["last-modified"];
|
|
2188
|
+
const etagHeader = responseHeaders?.["etag"];
|
|
2189
|
+
const sitemapLastmodForUrl = sitemapLastmodByUrl?.get(p.url);
|
|
2190
|
+
const entry = {
|
|
1666
2191
|
contentHash: computeContentHash(p.html),
|
|
1667
|
-
fetchedAt:
|
|
2192
|
+
fetchedAt: nowIso,
|
|
1668
2193
|
status: p.httpMeta?.statusCode ?? 200,
|
|
1669
2194
|
findingIds: findingsByUrl.get(p.url) ?? [],
|
|
2195
|
+
findings: (fullFindingsByUrl.get(p.url) ?? []).map((f) => ({
|
|
2196
|
+
id: `${f.ruleId}::${p.url}`,
|
|
2197
|
+
ruleId: f.ruleId,
|
|
2198
|
+
severity: f.severity,
|
|
2199
|
+
confidence: f.confidence ?? "high",
|
|
2200
|
+
message: f.message,
|
|
2201
|
+
...(f.fix !== undefined ? { fix: f.fix } : {}),
|
|
2202
|
+
...(f.ref !== undefined ? { ref: f.ref } : {}),
|
|
2203
|
+
...(f.docsUrl !== undefined ? { docsUrl: f.docsUrl } : {}),
|
|
2204
|
+
...(f.pageUrl !== undefined ? { url: f.pageUrl } : {}),
|
|
2205
|
+
...(f.relatedUrls !== undefined ? { relatedUrls: f.relatedUrls } : {}),
|
|
2206
|
+
...(f.group !== undefined ? { group: f.group } : {}),
|
|
2207
|
+
...(f.similarity !== undefined ? { similarity: f.similarity } : {}),
|
|
2208
|
+
...(f.context !== undefined ? { context: f.context } : {}),
|
|
2209
|
+
...(f.effort !== undefined ? { effort: f.effort } : {}),
|
|
2210
|
+
})),
|
|
2211
|
+
rulesetVersion: CORE_RULESET_VERSION,
|
|
1670
2212
|
};
|
|
2213
|
+
if (lastModifiedHeader)
|
|
2214
|
+
entry.lastModified = lastModifiedHeader;
|
|
2215
|
+
else if (priorEntry?.lastModified)
|
|
2216
|
+
entry.lastModified = priorEntry.lastModified;
|
|
2217
|
+
if (etagHeader)
|
|
2218
|
+
entry.etag = etagHeader;
|
|
2219
|
+
else if (priorEntry?.etag)
|
|
2220
|
+
entry.etag = priorEntry.etag;
|
|
2221
|
+
if (sitemapLastmodForUrl)
|
|
2222
|
+
entry.sitemapLastmodAtAudit = sitemapLastmodForUrl;
|
|
2223
|
+
else if (priorEntry?.sitemapLastmodAtAudit)
|
|
2224
|
+
entry.sitemapLastmodAtAudit = priorEntry.sitemapLastmodAtAudit;
|
|
2225
|
+
urls[p.url] = entry;
|
|
1671
2226
|
}
|
|
2227
|
+
// `lastFullAuditAt` advances only when this run actually re-fetched every
|
|
2228
|
+
// candidate URL. In monitoring mode (matrix skipped some URLs), preserve
|
|
2229
|
+
// the prior baseline timestamp so callers can reason about staleness.
|
|
2230
|
+
// In fresh mode (every candidate URL was fetched), bump to now.
|
|
2231
|
+
const isMonitoringRun = effectiveMode === "monitoring" && priorState !== null;
|
|
2232
|
+
const lastFullAuditAt = isMonitoringRun
|
|
2233
|
+
? (priorState?.lastFullAuditAt ?? priorState?.lastRun ?? nowIso)
|
|
2234
|
+
: nowIso;
|
|
1672
2235
|
const newState = {
|
|
1673
2236
|
version: STATE_SCHEMA_VERSION,
|
|
1674
|
-
lastRun:
|
|
2237
|
+
lastRun: nowIso,
|
|
2238
|
+
lastFullAuditAt,
|
|
1675
2239
|
source,
|
|
1676
2240
|
renderMode,
|
|
2241
|
+
rulesetVersion: CORE_RULESET_VERSION,
|
|
1677
2242
|
urls,
|
|
1678
2243
|
summary: {
|
|
1679
2244
|
score: summary.risk,
|