@pseolint/core 0.4.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +264 -169
- package/dist/ai/manifest/diff.d.ts +78 -0
- package/dist/ai/manifest/diff.d.ts.map +1 -0
- package/dist/ai/manifest/diff.js +139 -0
- package/dist/ai/manifest/diff.js.map +1 -0
- package/dist/ai/manifest/index.d.ts +18 -0
- package/dist/ai/manifest/index.d.ts.map +1 -0
- package/dist/ai/manifest/index.js +15 -0
- package/dist/ai/manifest/index.js.map +1 -0
- package/dist/ai/manifest/validate-manifest.d.ts +37 -0
- package/dist/ai/manifest/validate-manifest.d.ts.map +1 -0
- package/dist/ai/manifest/validate-manifest.js +67 -0
- package/dist/ai/manifest/validate-manifest.js.map +1 -0
- package/dist/ai/manifest/validators/domain-patches.d.ts +15 -0
- package/dist/ai/manifest/validators/domain-patches.d.ts.map +1 -0
- package/dist/ai/manifest/validators/domain-patches.js +110 -0
- package/dist/ai/manifest/validators/domain-patches.js.map +1 -0
- package/dist/ai/manifest/validators/index.d.ts +5 -0
- package/dist/ai/manifest/validators/index.d.ts.map +1 -0
- package/dist/ai/manifest/validators/index.js +4 -0
- package/dist/ai/manifest/validators/index.js.map +1 -0
- package/dist/ai/manifest/validators/page-changes.d.ts +36 -0
- package/dist/ai/manifest/validators/page-changes.d.ts.map +1 -0
- package/dist/ai/manifest/validators/page-changes.js +221 -0
- package/dist/ai/manifest/validators/page-changes.js.map +1 -0
- package/dist/ai/manifest/validators/types.d.ts +17 -0
- package/dist/ai/manifest/validators/types.d.ts.map +1 -0
- package/dist/ai/manifest/validators/types.js +5 -0
- package/dist/ai/manifest/validators/types.js.map +1 -0
- package/dist/ai/orchestrate.d.ts +74 -0
- package/dist/ai/orchestrate.d.ts.map +1 -0
- package/dist/ai/orchestrate.js +54 -0
- package/dist/ai/orchestrate.js.map +1 -0
- package/dist/ai/orchestrator/budget.d.ts +57 -0
- package/dist/ai/orchestrator/budget.d.ts.map +1 -0
- package/dist/ai/orchestrator/budget.js +114 -0
- package/dist/ai/orchestrator/budget.js.map +1 -0
- package/dist/ai/orchestrator/finish-tool.d.ts +568 -0
- package/dist/ai/orchestrator/finish-tool.d.ts.map +1 -0
- package/dist/ai/orchestrator/finish-tool.js +114 -0
- package/dist/ai/orchestrator/finish-tool.js.map +1 -0
- package/dist/ai/orchestrator/index.d.ts +25 -0
- package/dist/ai/orchestrator/index.d.ts.map +1 -0
- package/dist/ai/orchestrator/index.js +21 -0
- package/dist/ai/orchestrator/index.js.map +1 -0
- package/dist/ai/orchestrator/log.d.ts +24 -0
- package/dist/ai/orchestrator/log.d.ts.map +1 -0
- package/dist/ai/orchestrator/log.js +48 -0
- package/dist/ai/orchestrator/log.js.map +1 -0
- package/dist/ai/orchestrator/page-cache.d.ts +64 -0
- package/dist/ai/orchestrator/page-cache.d.ts.map +1 -0
- package/dist/ai/orchestrator/page-cache.js +127 -0
- package/dist/ai/orchestrator/page-cache.js.map +1 -0
- package/dist/ai/orchestrator/prompt.d.ts +16 -0
- package/dist/ai/orchestrator/prompt.d.ts.map +1 -0
- package/dist/ai/orchestrator/prompt.js +52 -0
- package/dist/ai/orchestrator/prompt.js.map +1 -0
- package/dist/ai/orchestrator/runner.d.ts +65 -0
- package/dist/ai/orchestrator/runner.d.ts.map +1 -0
- package/dist/ai/orchestrator/runner.js +223 -0
- package/dist/ai/orchestrator/runner.js.map +1 -0
- package/dist/ai/orchestrator/session.d.ts +44 -0
- package/dist/ai/orchestrator/session.d.ts.map +1 -0
- package/dist/ai/orchestrator/session.js +64 -0
- package/dist/ai/orchestrator/session.js.map +1 -0
- package/dist/ai/orchestrator/types.d.ts +99 -0
- package/dist/ai/orchestrator/types.d.ts.map +1 -0
- package/dist/ai/orchestrator/types.js +8 -0
- package/dist/ai/orchestrator/types.js.map +1 -0
- package/dist/ai/probes/cache.d.ts +12 -0
- package/dist/ai/probes/cache.d.ts.map +1 -0
- package/dist/ai/probes/cache.js +46 -0
- package/dist/ai/probes/cache.js.map +1 -0
- package/dist/ai/tools/ask-ai-engine.d.ts +77 -0
- package/dist/ai/tools/ask-ai-engine.d.ts.map +1 -0
- package/dist/ai/tools/ask-ai-engine.js +253 -0
- package/dist/ai/tools/ask-ai-engine.js.map +1 -0
- package/dist/ai/tools/check-domain-crawler-access.d.ts +71 -0
- package/dist/ai/tools/check-domain-crawler-access.d.ts.map +1 -0
- package/dist/ai/tools/check-domain-crawler-access.js +76 -0
- package/dist/ai/tools/check-domain-crawler-access.js.map +1 -0
- package/dist/ai/tools/check-domain-llms-txt.d.ts +70 -0
- package/dist/ai/tools/check-domain-llms-txt.d.ts.map +1 -0
- package/dist/ai/tools/check-domain-llms-txt.js +75 -0
- package/dist/ai/tools/check-domain-llms-txt.js.map +1 -0
- package/dist/ai/tools/check-indexability.d.ts +58 -0
- package/dist/ai/tools/check-indexability.d.ts.map +1 -0
- package/dist/ai/tools/check-indexability.js +64 -0
- package/dist/ai/tools/check-indexability.js.map +1 -0
- package/dist/ai/tools/check-robots.d.ts +68 -0
- package/dist/ai/tools/check-robots.d.ts.map +1 -0
- package/dist/ai/tools/check-robots.js +90 -0
- package/dist/ai/tools/check-robots.js.map +1 -0
- package/dist/ai/tools/check-rule-answer-first.d.ts +54 -0
- package/dist/ai/tools/check-rule-answer-first.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-answer-first.js +50 -0
- package/dist/ai/tools/check-rule-answer-first.js.map +1 -0
- package/dist/ai/tools/check-rule-canonical-consistency.d.ts +66 -0
- package/dist/ai/tools/check-rule-canonical-consistency.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-canonical-consistency.js +51 -0
- package/dist/ai/tools/check-rule-canonical-consistency.js.map +1 -0
- package/dist/ai/tools/check-rule-citable-facts.d.ts +58 -0
- package/dist/ai/tools/check-rule-citable-facts.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-citable-facts.js +41 -0
- package/dist/ai/tools/check-rule-citable-facts.js.map +1 -0
- package/dist/ai/tools/check-rule-content-modularity.d.ts +58 -0
- package/dist/ai/tools/check-rule-content-modularity.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-content-modularity.js +45 -0
- package/dist/ai/tools/check-rule-content-modularity.js.map +1 -0
- package/dist/ai/tools/check-rule-faq-coverage.d.ts +54 -0
- package/dist/ai/tools/check-rule-faq-coverage.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-faq-coverage.js +39 -0
- package/dist/ai/tools/check-rule-faq-coverage.js.map +1 -0
- package/dist/ai/tools/check-rule-freshness-signals.d.ts +54 -0
- package/dist/ai/tools/check-rule-freshness-signals.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-freshness-signals.js +45 -0
- package/dist/ai/tools/check-rule-freshness-signals.js.map +1 -0
- package/dist/ai/tools/check-rule-json-ld-valid.d.ts +54 -0
- package/dist/ai/tools/check-rule-json-ld-valid.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-json-ld-valid.js +44 -0
- package/dist/ai/tools/check-rule-json-ld-valid.js.map +1 -0
- package/dist/ai/tools/check-rule-missing-author.d.ts +54 -0
- package/dist/ai/tools/check-rule-missing-author.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-missing-author.js +45 -0
- package/dist/ai/tools/check-rule-missing-author.js.map +1 -0
- package/dist/ai/tools/check-rule-near-duplicate.d.ts +82 -0
- package/dist/ai/tools/check-rule-near-duplicate.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-near-duplicate.js +63 -0
- package/dist/ai/tools/check-rule-near-duplicate.js.map +1 -0
- package/dist/ai/tools/check-rule-required-fields.d.ts +50 -0
- package/dist/ai/tools/check-rule-required-fields.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-required-fields.js +38 -0
- package/dist/ai/tools/check-rule-required-fields.js.map +1 -0
- package/dist/ai/tools/check-rule-schema-consistency.d.ts +54 -0
- package/dist/ai/tools/check-rule-schema-consistency.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-schema-consistency.js +44 -0
- package/dist/ai/tools/check-rule-schema-consistency.js.map +1 -0
- package/dist/ai/tools/check-rule-summary-bait.d.ts +54 -0
- package/dist/ai/tools/check-rule-summary-bait.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-summary-bait.js +39 -0
- package/dist/ai/tools/check-rule-summary-bait.js.map +1 -0
- package/dist/ai/tools/check-rule-thin-content.d.ts +66 -0
- package/dist/ai/tools/check-rule-thin-content.d.ts.map +1 -0
- package/dist/ai/tools/check-rule-thin-content.js +58 -0
- package/dist/ai/tools/check-rule-thin-content.js.map +1 -0
- package/dist/ai/tools/detect-templates.d.ts +60 -0
- package/dist/ai/tools/detect-templates.d.ts.map +1 -0
- package/dist/ai/tools/detect-templates.js +43 -0
- package/dist/ai/tools/detect-templates.js.map +1 -0
- package/dist/ai/tools/fetch-page.d.ts +70 -0
- package/dist/ai/tools/fetch-page.d.ts.map +1 -0
- package/dist/ai/tools/fetch-page.js +93 -0
- package/dist/ai/tools/fetch-page.js.map +1 -0
- package/dist/ai/tools/fetch-sitemap.d.ts +60 -0
- package/dist/ai/tools/fetch-sitemap.d.ts.map +1 -0
- package/dist/ai/tools/fetch-sitemap.js +116 -0
- package/dist/ai/tools/fetch-sitemap.js.map +1 -0
- package/dist/ai/tools/index.d.ts +1555 -0
- package/dist/ai/tools/index.d.ts.map +1 -0
- package/dist/ai/tools/index.js +119 -0
- package/dist/ai/tools/index.js.map +1 -0
- package/dist/ai/tools/parse-page.d.ts +94 -0
- package/dist/ai/tools/parse-page.d.ts.map +1 -0
- package/dist/ai/tools/parse-page.js +108 -0
- package/dist/ai/tools/parse-page.js.map +1 -0
- package/dist/ai/tools/query-serp.d.ts +113 -0
- package/dist/ai/tools/query-serp.d.ts.map +1 -0
- package/dist/ai/tools/query-serp.js +131 -0
- package/dist/ai/tools/query-serp.js.map +1 -0
- package/dist/ai/tools/sample-template.d.ts +67 -0
- package/dist/ai/tools/sample-template.d.ts.map +1 -0
- package/dist/ai/tools/sample-template.js +75 -0
- package/dist/ai/tools/sample-template.js.map +1 -0
- package/dist/ai/tools/types.d.ts +73 -0
- package/dist/ai/tools/types.d.ts.map +1 -0
- package/dist/ai/tools/types.js +64 -0
- package/dist/ai/tools/types.js.map +1 -0
- package/dist/ai/tools/validate-jsonld.d.ts +62 -0
- package/dist/ai/tools/validate-jsonld.d.ts.map +1 -0
- package/dist/ai/tools/validate-jsonld.js +84 -0
- package/dist/ai/tools/validate-jsonld.js.map +1 -0
- package/dist/auditor.d.ts +16 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +862 -88
- package/dist/auditor.js.map +1 -1
- package/dist/backpressure.d.ts.map +1 -1
- package/dist/backpressure.js +10 -3
- package/dist/backpressure.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +15 -1
- package/dist/enrich-findings.js.map +1 -1
- package/dist/formatters/bucket-findings.d.ts +43 -0
- package/dist/formatters/bucket-findings.d.ts.map +1 -0
- package/dist/formatters/bucket-findings.js +110 -0
- package/dist/formatters/bucket-findings.js.map +1 -0
- package/dist/formatters/console.d.ts.map +1 -1
- package/dist/formatters/console.js +116 -34
- package/dist/formatters/console.js.map +1 -1
- package/dist/formatters/fixplan.d.ts +13 -0
- package/dist/formatters/fixplan.d.ts.map +1 -0
- package/dist/formatters/fixplan.js +328 -0
- package/dist/formatters/fixplan.js.map +1 -0
- package/dist/formatters/html.d.ts.map +1 -1
- package/dist/formatters/html.js +27 -0
- package/dist/formatters/html.js.map +1 -1
- package/dist/formatters/index.d.ts +2 -0
- package/dist/formatters/index.d.ts.map +1 -1
- package/dist/formatters/index.js +1 -0
- package/dist/formatters/index.js.map +1 -1
- package/dist/formatters/markdown.d.ts.map +1 -1
- package/dist/formatters/markdown.js +97 -9
- package/dist/formatters/markdown.js.map +1 -1
- package/dist/index.d.ts +12 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/dist/page-filter.d.ts +64 -6
- package/dist/page-filter.d.ts.map +1 -1
- package/dist/page-filter.js +124 -3
- package/dist/page-filter.js.map +1 -1
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +5 -0
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/aeo/answer-first.d.ts.map +1 -1
- package/dist/rules/aeo/answer-first.js +17 -3
- package/dist/rules/aeo/answer-first.js.map +1 -1
- package/dist/rules/aeo/citable-facts.d.ts.map +1 -1
- package/dist/rules/aeo/citable-facts.js +12 -1
- package/dist/rules/aeo/citable-facts.js.map +1 -1
- package/dist/rules/aeo/content-modularity.d.ts.map +1 -1
- package/dist/rules/aeo/content-modularity.js +3 -0
- package/dist/rules/aeo/content-modularity.js.map +1 -1
- package/dist/rules/aeo/crawler-access.d.ts.map +1 -1
- package/dist/rules/aeo/crawler-access.js +6 -0
- package/dist/rules/aeo/crawler-access.js.map +1 -1
- package/dist/rules/aeo/faq-coverage.d.ts.map +1 -1
- package/dist/rules/aeo/faq-coverage.js +4 -0
- package/dist/rules/aeo/faq-coverage.js.map +1 -1
- package/dist/rules/aeo/freshness-signals.d.ts.map +1 -1
- package/dist/rules/aeo/freshness-signals.js +9 -2
- package/dist/rules/aeo/freshness-signals.js.map +1 -1
- package/dist/rules/aeo/llms-txt.d.ts.map +1 -1
- package/dist/rules/aeo/llms-txt.js +6 -1
- package/dist/rules/aeo/llms-txt.js.map +1 -1
- package/dist/rules/aeo/summary-bait.d.ts.map +1 -1
- package/dist/rules/aeo/summary-bait.js +5 -2
- package/dist/rules/aeo/summary-bait.js.map +1 -1
- package/dist/rules/content/heading-structure.d.ts +21 -0
- package/dist/rules/content/heading-structure.d.ts.map +1 -0
- package/dist/rules/content/heading-structure.js +56 -0
- package/dist/rules/content/heading-structure.js.map +1 -0
- package/dist/rules/content/image-alt-text.d.ts +18 -0
- package/dist/rules/content/image-alt-text.d.ts.map +1 -0
- package/dist/rules/content/image-alt-text.js +77 -0
- package/dist/rules/content/image-alt-text.js.map +1 -0
- package/dist/rules/content/missing-author.d.ts.map +1 -1
- package/dist/rules/content/missing-author.js +10 -2
- package/dist/rules/content/missing-author.js.map +1 -1
- package/dist/rules/content/title-uniqueness.d.ts +18 -0
- package/dist/rules/content/title-uniqueness.d.ts.map +1 -0
- package/dist/rules/content/title-uniqueness.js +70 -0
- package/dist/rules/content/title-uniqueness.js.map +1 -0
- package/dist/rules/links/host-section-divergence.d.ts +3 -0
- package/dist/rules/links/host-section-divergence.d.ts.map +1 -0
- package/dist/rules/links/host-section-divergence.js +158 -0
- package/dist/rules/links/host-section-divergence.js.map +1 -0
- package/dist/rules/links/link-depth.d.ts +12 -1
- package/dist/rules/links/link-depth.d.ts.map +1 -1
- package/dist/rules/links/link-depth.js +25 -12
- package/dist/rules/links/link-depth.js.map +1 -1
- package/dist/rules/scope.d.ts.map +1 -1
- package/dist/rules/scope.js +5 -0
- package/dist/rules/scope.js.map +1 -1
- package/dist/rules/spam/doorway-pattern.d.ts.map +1 -1
- package/dist/rules/spam/doorway-pattern.js +27 -4
- package/dist/rules/spam/doorway-pattern.js.map +1 -1
- package/dist/rules/spam/publication-velocity.d.ts +1 -1
- package/dist/rules/spam/publication-velocity.d.ts.map +1 -1
- package/dist/rules/spam/publication-velocity.js +9 -4
- package/dist/rules/spam/publication-velocity.js.map +1 -1
- package/dist/rules/spam/template-coverage.js +1 -1
- package/dist/rules/spam/template-coverage.js.map +1 -1
- package/dist/rules/spam/template-diversity.js +1 -1
- package/dist/rules/spam/template-diversity.js.map +1 -1
- package/dist/rules/spam/thin-content.d.ts.map +1 -1
- package/dist/rules/spam/thin-content.js +9 -1
- package/dist/rules/spam/thin-content.js.map +1 -1
- package/dist/rules/tech/hreflang-consistency.d.ts.map +1 -1
- package/dist/rules/tech/hreflang-consistency.js +33 -4
- package/dist/rules/tech/hreflang-consistency.js.map +1 -1
- package/dist/rules/tech/og-completeness.d.ts +11 -0
- package/dist/rules/tech/og-completeness.d.ts.map +1 -1
- package/dist/rules/tech/og-completeness.js +22 -23
- package/dist/rules/tech/og-completeness.js.map +1 -1
- package/dist/ruleset-version.d.ts +8 -0
- package/dist/ruleset-version.d.ts.map +1 -0
- package/dist/ruleset-version.js +8 -0
- package/dist/ruleset-version.js.map +1 -0
- package/dist/scrape-strategy.d.ts +42 -0
- package/dist/scrape-strategy.d.ts.map +1 -0
- package/dist/scrape-strategy.js +101 -0
- package/dist/scrape-strategy.js.map +1 -0
- package/dist/site-classifier.d.ts +1 -1
- package/dist/site-classifier.d.ts.map +1 -1
- package/dist/site-classifier.js +217 -0
- package/dist/site-classifier.js.map +1 -1
- package/dist/state.d.ts +36 -1
- package/dist/state.d.ts.map +1 -1
- package/dist/state.js +3 -1
- package/dist/state.js.map +1 -1
- package/dist/stratified-sample.d.ts +9 -1
- package/dist/stratified-sample.d.ts.map +1 -1
- package/dist/stratified-sample.js +23 -6
- package/dist/stratified-sample.js.map +1 -1
- package/dist/types.d.ts +179 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/dist/url-normalize.d.ts.map +1 -1
- package/dist/url-normalize.js +13 -1
- package/dist/url-normalize.js.map +1 -1
- package/package.json +90 -90
package/dist/auditor.js
CHANGED
|
@@ -18,7 +18,12 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
|
|
|
18
18
|
import { deadEndsRule } from "./rules/links/dead-ends.js";
|
|
19
19
|
import { linkDepthRule } from "./rules/links/link-depth.js";
|
|
20
20
|
import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
|
|
21
|
+
import { hostSectionDivergenceRule } from "./rules/links/host-section-divergence.js";
|
|
21
22
|
import { orphanPagesRule } from "./rules/links/orphan-pages.js";
|
|
23
|
+
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
24
|
+
import { titleUniquenessRule } from "./rules/content/title-uniqueness.js";
|
|
25
|
+
import { headingStructureRule } from "./rules/content/heading-structure.js";
|
|
26
|
+
import { imageAltTextRule } from "./rules/content/image-alt-text.js";
|
|
22
27
|
import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
|
|
23
28
|
import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
|
|
24
29
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
@@ -55,14 +60,17 @@ import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
|
|
|
55
60
|
import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
|
|
56
61
|
import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
|
|
57
62
|
import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
|
|
58
|
-
import { stratifiedSample } from "./stratified-sample.js";
|
|
63
|
+
import { stratifiedSample, mulberry32 } from "./stratified-sample.js";
|
|
59
64
|
import { classifySite } from "./site-classifier.js";
|
|
60
65
|
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
66
|
+
import { CORE_RULESET_VERSION } from "./ruleset-version.js";
|
|
67
|
+
import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
|
|
61
68
|
const DEFAULTS = {
|
|
62
69
|
nearDuplicateThreshold: 0.85,
|
|
63
70
|
entitySwapThreshold: 0.95,
|
|
64
71
|
thinContentMinWords: 300,
|
|
65
72
|
publicationVelocityMaxPerDay: 100,
|
|
73
|
+
publicationVelocityMaxPerDayCorpusFraction: 0.10,
|
|
66
74
|
boilerplateMaxRatio: 0.7,
|
|
67
75
|
templateDiversityMinUniqueRatio: 0.35,
|
|
68
76
|
uniqueValueMinWords: 100,
|
|
@@ -77,17 +85,6 @@ const DEFAULTS = {
|
|
|
77
85
|
modularityMinSelfContainedRatio: 0.7,
|
|
78
86
|
faqMinQuestionHeadings: 2
|
|
79
87
|
};
|
|
80
|
-
/**
|
|
81
|
-
* v0.4 four-category weights. Audit is diagnostic-only (weight 0).
|
|
82
|
-
* See 2026-04-29 v0.4 redesign spec §4.2.
|
|
83
|
-
*/
|
|
84
|
-
const CATEGORY_WEIGHTS = {
|
|
85
|
-
integrity: 0.50, // spam + content + cannibal
|
|
86
|
-
discoverability: 0.20, // links + tech
|
|
87
|
-
citation: 0.25, // aeo + schema
|
|
88
|
-
data: 0.05, // data
|
|
89
|
-
audit: 0, // diagnostics, never weighted
|
|
90
|
-
};
|
|
91
88
|
/**
|
|
92
89
|
* Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
|
|
93
90
|
* Used by `scoreFromFindings` to bucket findings without changing rule IDs.
|
|
@@ -103,6 +100,331 @@ const CATEGORY_MAP = {
|
|
|
103
100
|
data: "data",
|
|
104
101
|
audit: "audit",
|
|
105
102
|
};
|
|
103
|
+
const SCORING_PROFILES = {
|
|
104
|
+
"small-marketing": {
|
|
105
|
+
categoryWeights: { integrity: 0.30, discoverability: 0.40, citation: 0.20, data: 0.05, audit: 0 },
|
|
106
|
+
severityOverrides: {
|
|
107
|
+
"aeo/citable-facts": "info",
|
|
108
|
+
"aeo/answer-first": "info",
|
|
109
|
+
"aeo/summary-bait": "warning",
|
|
110
|
+
// 2026-05-03 calibration round 5: Segment integrations had 24 thin
|
|
111
|
+
// pages (200-300 words is correct for a catalog record). thin-content
|
|
112
|
+
// contributing capped 40 impact pushed integrity to its 100 cap → 30
|
|
113
|
+
// contribution at small-marketing weight, which alone tripped
|
|
114
|
+
// 'concerning'. Demoting to info keeps the signal visible without
|
|
115
|
+
// tanking the verdict on catalog-shape sites mis-classified as
|
|
116
|
+
// small-marketing. Real marketing sites (linear.app etc) don't
|
|
117
|
+
// normally have many sub-300-word pages so this won't hide quality
|
|
118
|
+
// issues there.
|
|
119
|
+
"spam/thin-content": "info",
|
|
120
|
+
"aeo/freshness-signals": "info",
|
|
121
|
+
"content/missing-author": "info",
|
|
122
|
+
// 2026-05-03 calibration round 3: Segment integrations classified as
|
|
123
|
+
// small-marketing@0.88 and tripped doorway-pattern 300× critical
|
|
124
|
+
// (catalog records are thin + entity-swap by design — not actually a
|
|
125
|
+
// doorway funnel). The classifier mistakes catalog directories as
|
|
126
|
+
// small-marketing; this demotion absorbs that mis-classification
|
|
127
|
+
// without weakening detection on actual small-marketing sites
|
|
128
|
+
// (linear.app, supabase.com — none of which produce entity-swap pairs).
|
|
129
|
+
"spam/doorway-pattern": "warning",
|
|
130
|
+
// 2026-05-03 calibration round 4: spam/boilerplate-ratio fired ERROR
|
|
131
|
+
// on Segment's integration directory (24 pages, 60%+ shared template
|
|
132
|
+
// chrome). On a marketing-template site the rule is correct — repeated
|
|
133
|
+
// "About us" / "Pricing" copy across pages IS a quality issue. On a
|
|
134
|
+
// catalog mis-classified to small-marketing, the shared chrome IS the
|
|
135
|
+
// template — by design. Demote to warning here; real marketing sites
|
|
136
|
+
// (linear.app, supabase.com) won't trip it because their corpus is
|
|
137
|
+
// page-diverse, but catalog-shape pages classified as small-marketing
|
|
138
|
+
// (Segment, Wise) won't tank the verdict.
|
|
139
|
+
"spam/boilerplate-ratio": "warning",
|
|
140
|
+
// 2026-05-03 v0.5.2 round 10: og-completeness, heading-structure,
|
|
141
|
+
// image-alt-text were added as new rules and tipped Segment from
|
|
142
|
+
// concerning → critical because catalog/template-driven sites
|
|
143
|
+
// commonly have shared OG defaults, weird H1 patterns (multiple H1s
|
|
144
|
+
// for repeated nav cards), and unlabelled logo grids. These are
|
|
145
|
+
// real findings on isolated sites but typical for catalog shape;
|
|
146
|
+
// demote to info here so the signal stays visible without driving
|
|
147
|
+
// the verdict.
|
|
148
|
+
"tech/og-completeness": "info",
|
|
149
|
+
"content/heading-structure": "info",
|
|
150
|
+
"content/image-alt-text": "info",
|
|
151
|
+
},
|
|
152
|
+
confidenceOverrides: {
|
|
153
|
+
"aeo/citable-facts": "low",
|
|
154
|
+
"aeo/answer-first": "low",
|
|
155
|
+
"aeo/summary-bait": "medium",
|
|
156
|
+
"spam/thin-content": "low",
|
|
157
|
+
"aeo/freshness-signals": "low",
|
|
158
|
+
"content/missing-author": "low",
|
|
159
|
+
"spam/doorway-pattern": "medium",
|
|
160
|
+
"spam/boilerplate-ratio": "medium",
|
|
161
|
+
"tech/og-completeness": "low",
|
|
162
|
+
"content/heading-structure": "low",
|
|
163
|
+
"content/image-alt-text": "low",
|
|
164
|
+
},
|
|
165
|
+
},
|
|
166
|
+
"blog": {
|
|
167
|
+
categoryWeights: { integrity: 0.40, discoverability: 0.25, citation: 0.30, data: 0.05, audit: 0 },
|
|
168
|
+
severityOverrides: {
|
|
169
|
+
"content/missing-author": "error",
|
|
170
|
+
"spam/thin-content": "error",
|
|
171
|
+
},
|
|
172
|
+
confidenceOverrides: {},
|
|
173
|
+
},
|
|
174
|
+
"programmatic-directory": {
|
|
175
|
+
categoryWeights: { integrity: 0.55, discoverability: 0.15, citation: 0.20, data: 0.10, audit: 0 },
|
|
176
|
+
// Symmetry argument: every other profile has severity overrides for the
|
|
177
|
+
// rules that mis-fit its shape (`docs` demotes AEO + author rules,
|
|
178
|
+
// `ecommerce` demotes `aeo/citable-facts`, `small-marketing` overrides 11
|
|
179
|
+
// rules). `programmatic-directory` is the site type *most* structurally
|
|
180
|
+
// different from the "page = article" assumptions the AEO and EEAT rules
|
|
181
|
+
// are calibrated against — yet was the only profile with no overrides.
|
|
182
|
+
//
|
|
183
|
+
// Pre-calibration adjustment: demote (never escalate) the rules that
|
|
184
|
+
// first-principles analysis predicts will false-positive on catalog-
|
|
185
|
+
// shaped sites (Zapier integrations, G2 categories, Wise currency pairs,
|
|
186
|
+
// etc.). A reputable-pSEO calibration corpus + runner has been added
|
|
187
|
+
// (scripts/calibration-reputable-pseo.ts); these overrides will be
|
|
188
|
+
// tightened or loosened based on actual fire-rates measured against
|
|
189
|
+
// sites that demonstrably win in production. See
|
|
190
|
+
// docs/superpowers/specs/2026-05-03-calibration-against-reputable-pseo.md.
|
|
191
|
+
severityOverrides: {
|
|
192
|
+
// Catalog pages are tables, not prose. AEO rules calibrated on
|
|
193
|
+
// editorial content over-fire here.
|
|
194
|
+
"aeo/citable-facts": "info",
|
|
195
|
+
"aeo/answer-first": "info",
|
|
196
|
+
"aeo/content-modularity": "info",
|
|
197
|
+
// 2026-05-03 calibration: freshness-signals fired on every page of
|
|
198
|
+
// every reputable pSEO site. Catalog freshness is expressed via the
|
|
199
|
+
// data (live currency rates, current job listings, current pricing),
|
|
200
|
+
// not via visible "last updated" stamps. Demote.
|
|
201
|
+
"aeo/freshness-signals": "info",
|
|
202
|
+
// Authorship lives at the platform level (operator's about page),
|
|
203
|
+
// not on every catalog record. Following the rule's "add a byline"
|
|
204
|
+
// fix on a Zillow listing would actively make the page worse.
|
|
205
|
+
"content/missing-author": "info",
|
|
206
|
+
"content/eeat-signals": "info",
|
|
207
|
+
// Template uniformity is correct for catalogs by design. Keep the
|
|
208
|
+
// signal but cap at warning — never error.
|
|
209
|
+
"spam/template-diversity": "warning",
|
|
210
|
+
// 2026-05-03 v0.5.2 round 10: same catalog logic as small-marketing.
|
|
211
|
+
"tech/og-completeness": "info",
|
|
212
|
+
"content/heading-structure": "info",
|
|
213
|
+
"content/image-alt-text": "info",
|
|
214
|
+
// 2026-05-03 calibration round 2: catalogs are near-duplicate by
|
|
215
|
+
// design. spam/near-duplicate fires CRITICAL on every catalog pair.
|
|
216
|
+
// Demote to warning — keeps the signal visible without dominating
|
|
217
|
+
// the score.
|
|
218
|
+
"spam/near-duplicate": "warning",
|
|
219
|
+
// 2026-05-03 calibration round 5: catalog records are by-design
|
|
220
|
+
// shorter than the 300-word default. Demote to info on programmatic-
|
|
221
|
+
// directory; the data IS the content.
|
|
222
|
+
"spam/thin-content": "info",
|
|
223
|
+
// 2026-05-03 calibration round 2: doorway-pattern fires CRITICAL on
|
|
224
|
+
// every (thin + entity-swap) pair. On Segment integrations, integration
|
|
225
|
+
// pages are thin (200-300 words is the right amount for a directory
|
|
226
|
+
// record) and entity-swap (slack/google-sheets, slack/airtable, …) by
|
|
227
|
+
// design. The composite signal is genuinely true but the *intent*
|
|
228
|
+
// (doorway funnel) doesn't match the reality (catalog record).
|
|
229
|
+
// Demoting to warning preserves the signal without tanking the score.
|
|
230
|
+
"spam/doorway-pattern": "warning",
|
|
231
|
+
// 2026-05-03 calibration round 4: catalog pages share template chrome
|
|
232
|
+
// by design — same as `spam/template-diversity`, this signal is
|
|
233
|
+
// structurally true on programmatic-directories.
|
|
234
|
+
"spam/boilerplate-ratio": "warning",
|
|
235
|
+
},
|
|
236
|
+
confidenceOverrides: {
|
|
237
|
+
"aeo/citable-facts": "low",
|
|
238
|
+
"aeo/answer-first": "low",
|
|
239
|
+
"aeo/content-modularity": "low",
|
|
240
|
+
"aeo/freshness-signals": "low",
|
|
241
|
+
"content/missing-author": "low",
|
|
242
|
+
"content/eeat-signals": "low",
|
|
243
|
+
"spam/template-diversity": "medium",
|
|
244
|
+
"spam/near-duplicate": "medium",
|
|
245
|
+
"spam/doorway-pattern": "medium",
|
|
246
|
+
"spam/boilerplate-ratio": "medium",
|
|
247
|
+
"spam/thin-content": "low",
|
|
248
|
+
"tech/og-completeness": "low",
|
|
249
|
+
"content/heading-structure": "low",
|
|
250
|
+
"content/image-alt-text": "low",
|
|
251
|
+
},
|
|
252
|
+
},
|
|
253
|
+
"ecommerce": {
|
|
254
|
+
categoryWeights: { integrity: 0.20, discoverability: 0.40, citation: 0.15, data: 0.25, audit: 0 },
|
|
255
|
+
severityOverrides: {
|
|
256
|
+
"aeo/citable-facts": "info",
|
|
257
|
+
"schema/required-fields": "error",
|
|
258
|
+
},
|
|
259
|
+
confidenceOverrides: {
|
|
260
|
+
"aeo/citable-facts": "low",
|
|
261
|
+
},
|
|
262
|
+
},
|
|
263
|
+
"docs": {
|
|
264
|
+
categoryWeights: { integrity: 0.30, discoverability: 0.30, citation: 0.30, data: 0.10, audit: 0 },
|
|
265
|
+
severityOverrides: {
|
|
266
|
+
"aeo/citable-facts": "info",
|
|
267
|
+
"aeo/answer-first": "warning",
|
|
268
|
+
"content/missing-author": "info",
|
|
269
|
+
},
|
|
270
|
+
confidenceOverrides: {
|
|
271
|
+
"aeo/citable-facts": "low",
|
|
272
|
+
"aeo/answer-first": "low",
|
|
273
|
+
"content/missing-author": "low",
|
|
274
|
+
},
|
|
275
|
+
},
|
|
276
|
+
"unclear": {
|
|
277
|
+
categoryWeights: { integrity: 0.50, discoverability: 0.20, citation: 0.25, data: 0.05, audit: 0 },
|
|
278
|
+
// 2026-05-03 calibration round 2: the original "stay strict when unsure"
|
|
279
|
+
// intent meant that 4 of 5 reputable pSEO sites that classified as
|
|
280
|
+
// unclear (Zapier integrations, Typeform templates, Jasper templates,
|
|
281
|
+
// Numbeo cost-of-living) failed their verdict ceiling. The dominant
|
|
282
|
+
// driver was always `aeo/citable-facts` at full error severity — but
|
|
283
|
+
// catalog/template-gallery pages don't have prose, so the rule fires
|
|
284
|
+
// for a STRUCTURAL reason (page is a table, not a paragraph), not a
|
|
285
|
+
// QUALITY reason. Demoting the structurally-incompatible rules to
|
|
286
|
+
// info on `unclear` is conservative:
|
|
287
|
+
// - if site is genuinely editorial and got mis-classified, signals
|
|
288
|
+
// still surface (just info, not error) — author can act on them.
|
|
289
|
+
// - if site is catalog and got mis-classified to unclear, verdict
|
|
290
|
+
// no longer falsely tanks.
|
|
291
|
+
// Real spam signals (near-dup, doorway, thin) originally kept their severity; calibration rounds 3-5 below now demote them on `unclear` too.
|
|
292
|
+
severityOverrides: {
|
|
293
|
+
"aeo/citable-facts": "info",
|
|
294
|
+
"aeo/answer-first": "info",
|
|
295
|
+
"aeo/content-modularity": "info",
|
|
296
|
+
"aeo/freshness-signals": "info",
|
|
297
|
+
"content/missing-author": "info",
|
|
298
|
+
"content/eeat-signals": "info",
|
|
299
|
+
// 2026-05-03 calibration round 3: Airbyte classified as unclear@0.5
|
|
300
|
+
// and scored concerning despite all info-severity findings in the
|
|
301
|
+
// top 5. The 8 critical "blockers" came from spam/near-duplicate,
|
|
302
|
+
// spam/entity-swap, spam/doorway-pattern firing 1-2× each on its
|
|
303
|
+
// connectors directory — invisible per-rule but cumulatively pushing
|
|
304
|
+
// the score over 'caution'. On unclear sites we cannot tell whether
|
|
305
|
+
// these triple-fires represent a real doorway or a catalog; the
|
|
306
|
+
// calibration corpus shows reputable catalogs hitting them more
|
|
307
|
+
// often than real doorways do. Demote to warning — keeps the signal
|
|
308
|
+
// visible (it appears in shouldFix bucket, with full message) without
|
|
309
|
+
// tanking the verdict on a structurally-ambiguous site.
|
|
310
|
+
"spam/near-duplicate": "warning",
|
|
311
|
+
"spam/entity-swap": "warning",
|
|
312
|
+
"spam/doorway-pattern": "warning",
|
|
313
|
+
// 2026-05-03 calibration round 4: same boilerplate logic on unclear —
|
|
314
|
+
// we can't tell whether the site is a marketing site (boilerplate IS
|
|
315
|
+
// a quality issue) or a catalog (it isn't), so demote conservatively.
|
|
316
|
+
"spam/boilerplate-ratio": "warning",
|
|
317
|
+
// 2026-05-03 calibration round 5: same thin-content logic on unclear.
|
|
318
|
+
// Catalog-shape sites that classify as unclear (Zapier, Typeform,
|
|
319
|
+
// Jasper) had thin-content firing at error on the 5-15% of pages
|
|
320
|
+
// shorter than the 300-word default. Demote to info — surfaces the
|
|
321
|
+
// signal without driving the verdict on a structurally-ambiguous site.
|
|
322
|
+
"spam/thin-content": "info",
|
|
323
|
+
// 2026-05-03 v0.5.2 round 10: same demotions as programmatic-
|
|
324
|
+
// directory profile — these tipped Webflow/Zapier/Numbeo/Airbyte
|
|
325
|
+
// back into concerning territory because they classify as unclear
|
|
326
|
+
// and the new rules aren't yet calibrated for catalog shape.
|
|
327
|
+
"tech/og-completeness": "info",
|
|
328
|
+
"content/heading-structure": "info",
|
|
329
|
+
"content/image-alt-text": "info",
|
|
330
|
+
},
|
|
331
|
+
confidenceOverrides: {
|
|
332
|
+
"aeo/citable-facts": "low",
|
|
333
|
+
"aeo/answer-first": "low",
|
|
334
|
+
"aeo/content-modularity": "low",
|
|
335
|
+
"aeo/freshness-signals": "low",
|
|
336
|
+
"content/missing-author": "low",
|
|
337
|
+
"content/eeat-signals": "low",
|
|
338
|
+
"spam/near-duplicate": "medium",
|
|
339
|
+
"spam/entity-swap": "medium",
|
|
340
|
+
"spam/doorway-pattern": "medium",
|
|
341
|
+
"spam/boilerplate-ratio": "medium",
|
|
342
|
+
"spam/thin-content": "low",
|
|
343
|
+
"tech/og-completeness": "low",
|
|
344
|
+
"content/heading-structure": "low",
|
|
345
|
+
"content/image-alt-text": "low",
|
|
346
|
+
},
|
|
347
|
+
},
|
|
348
|
+
};
|
|
349
|
+
/**
 * Resolve the scoring profile to use for a site classification.
 *
 * `SCORING_PROFILES.unclear` is returned when:
 *   - no classification was supplied,
 *   - `classification.confidence` is below 0.7, or
 *   - no profile is registered under `classification.type`.
 *
 * Note: a missing or NaN confidence is NOT treated as low — the `< 0.7`
 * comparison is false for those values, so the type lookup still happens.
 *
 * @param {{ type: string, confidence: number } | null | undefined} classification
 * @returns {object} The matching entry of `SCORING_PROFILES`.
 */
function profileFor(classification) {
    const fallback = SCORING_PROFILES.unclear;
    const lowConfidence = !classification || classification.confidence < 0.7;
    return lowConfidence
        ? fallback
        : (SCORING_PROFILES[classification.type] ?? fallback);
}
|
|
358
|
+
const RULE_IMPACTS = {
|
|
359
|
+
// SpamBrain — high baseline, count amplifies (cluster matters)
|
|
360
|
+
"spam/near-duplicate": { baseImpact: 25, perInstance: 5, maxImpact: 80 },
|
|
361
|
+
"spam/entity-swap": { baseImpact: 25, perInstance: 5, maxImpact: 80 },
|
|
362
|
+
"spam/doorway-pattern": { baseImpact: 30, perInstance: 0, maxImpact: 30 },
|
|
363
|
+
"spam/template-coverage": { baseImpact: 15, perInstance: 3, maxImpact: 60 },
|
|
364
|
+
"spam/template-diversity": { baseImpact: 12, perInstance: 3, maxImpact: 50 },
|
|
365
|
+
"spam/boilerplate-ratio": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
|
|
366
|
+
"spam/thin-content": { baseImpact: 8, perInstance: 2, maxImpact: 40 },
|
|
367
|
+
"spam/publication-velocity": { baseImpact: 8, perInstance: 2, maxImpact: 30 },
|
|
368
|
+
"cannibal/url-pattern": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
|
|
369
|
+
// Content
|
|
370
|
+
"content/unique-value": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
|
|
371
|
+
"content/meta-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 40 },
|
|
372
|
+
"content/missing-author": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
|
|
373
|
+
"content/eeat-signals": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
|
|
374
|
+
// 2026-05-03 v0.5.2 blind-spot fixes
|
|
375
|
+
"content/title-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 25 }, // 2026-05-03 round 11: title is high-impact but the original 50-cap was disproportionate to other content rules and tipped Typeform into critical on a 6-finding cluster. Keep the rule at native error severity (duplicate titles ARE real bugs); just don't let one rule dominate the integrity bucket.
|
|
376
|
+
"content/heading-structure": { baseImpact: 5, perInstance: 1, maxImpact: 20 },
|
|
377
|
+
"content/image-alt-text": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
378
|
+
// Tech — softened in v0.4.3-rc2 after dogfood showed nextjs.org regressing
|
|
379
|
+
// from ready→caution on tech/canonical-consistency × 4 (legit cross-domain
|
|
380
|
+
// canonicals on a CDN). Per-instance now 1 (was 3).
|
|
381
|
+
"tech/canonical-consistency": { baseImpact: 8, perInstance: 1, maxImpact: 25 },
|
|
382
|
+
"tech/canonical-noindex-conflict": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
|
|
383
|
+
"tech/robots-noindex-conflict": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
|
|
384
|
+
"tech/redirect-chain": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
|
|
385
|
+
"tech/sitemap-completeness": { baseImpact: 8, perInstance: 1, maxImpact: 30 },
|
|
386
|
+
"tech/robots-sitemap-presence": { baseImpact: 8, perInstance: 0, maxImpact: 8 },
|
|
387
|
+
"tech/soft-404": { baseImpact: 6, perInstance: 1, maxImpact: 30 },
|
|
388
|
+
// hreflang — one bad declaration breaks all language pairs, so the COUNT
|
|
389
|
+
// doesn't compound. perInstance: 0 keeps it at the base impact regardless
|
|
390
|
+
// of how many language pairs are affected. Dogfood showed 350 findings on
|
|
391
|
+
// stripe.com from a single missing reciprocal pair — that should not be
|
|
392
|
+
// treated as 350× the impact.
|
|
393
|
+
"tech/hreflang-consistency": { baseImpact: 5, perInstance: 0, maxImpact: 5 },
|
|
394
|
+
"tech/og-completeness": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
|
|
395
|
+
// Links
|
|
396
|
+
"links/orphan-pages": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
|
|
397
|
+
"links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
398
|
+
"links/cluster-connectivity": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
|
|
399
|
+
"links/link-depth": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
400
|
+
// AEO — much lower baselines than spam (AEO is opt-in optimization)
|
|
401
|
+
"aeo/citable-facts": { baseImpact: 2, perInstance: 1, maxImpact: 25 },
|
|
402
|
+
"aeo/answer-first": { baseImpact: 3, perInstance: 1, maxImpact: 25 },
|
|
403
|
+
"aeo/summary-bait": { baseImpact: 4, perInstance: 1, maxImpact: 25 },
|
|
404
|
+
"aeo/crawler-access": { baseImpact: 8, perInstance: 0, maxImpact: 8 },
|
|
405
|
+
"aeo/freshness-signals": { baseImpact: 2, perInstance: 1, maxImpact: 20 },
|
|
406
|
+
"aeo/llms-txt": { baseImpact: 4, perInstance: 0, maxImpact: 4 },
|
|
407
|
+
"aeo/faq-coverage": { baseImpact: 2, perInstance: 1, maxImpact: 15 },
|
|
408
|
+
"aeo/content-modularity": { baseImpact: 2, perInstance: 1, maxImpact: 15 },
|
|
409
|
+
// Schema
|
|
410
|
+
"schema/json-ld-valid": { baseImpact: 8, perInstance: 2, maxImpact: 35 },
|
|
411
|
+
"schema/required-fields": { baseImpact: 6, perInstance: 1, maxImpact: 30 },
|
|
412
|
+
"schema/consistency": { baseImpact: 3, perInstance: 1, maxImpact: 15 },
|
|
413
|
+
// Data
|
|
414
|
+
"data/data-binding": { baseImpact: 6, perInstance: 1, maxImpact: 30 },
|
|
415
|
+
};
|
|
416
|
+
const DEFAULT_RULE_IMPACT = { baseImpact: 5, perInstance: 1, maxImpact: 25 };
|
|
417
|
+
/**
|
|
418
|
+
* v0.4.3 — confidence-based discount applied to each finding's impact.
|
|
419
|
+
* Low-confidence findings contribute less to the bucket so they don't
|
|
420
|
+
* inflate the verdict on site types where they false-positive.
|
|
421
|
+
*/
|
|
422
|
+
const CONFIDENCE_MULTIPLIER = {
|
|
423
|
+
high: 1.0,
|
|
424
|
+
medium: 0.6,
|
|
425
|
+
low: 0.3,
|
|
426
|
+
speculative: 0.1,
|
|
427
|
+
};
|
|
106
428
|
/** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
|
|
107
429
|
const RULE_DOCS_SLUG = {
|
|
108
430
|
// intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
|
|
@@ -121,6 +443,39 @@ function verdictForRisk(risk) {
|
|
|
121
443
|
return "concerning";
|
|
122
444
|
return "critical";
|
|
123
445
|
}
|
|
446
|
+
/** Verdict tiers ordered from most lenient (index 0) to most severe. */
const VERDICT_LADDER = ["ready", "caution", "concerning", "critical"];
/**
 * 2026-05-03 v0.5.2 — bring-your-own-authority adjustment of the verdict.
 * Only the user-facing verdict moves; the numeric risk is not touched here.
 *
 *   authorityScore >= 80  → one tier more lenient (e.g. critical → concerning)
 *   authorityScore <= 30  → one tier stricter     (e.g. ready → caution)
 *   31..79                → unchanged
 *
 * Movement is clamped at both ends of the ladder (`ready` cannot get more
 * lenient, `critical` cannot get stricter). The verdict is returned as-is
 * when the score is undefined, not a finite number, outside 0..100, or when
 * the verdict is not one of the ladder entries.
 *
 * @param {string} verdict One of the `VERDICT_LADDER` values.
 * @param {number} [authorityScore] Authority score on a 0..100 scale.
 * @returns {string} The (possibly shifted) verdict.
 */
function shiftVerdictForAuthority(verdict, authorityScore) {
    // Number.isFinite is false for undefined, NaN, ±Infinity and non-numbers.
    const scoreUsable = Number.isFinite(authorityScore) &&
        authorityScore >= 0 &&
        authorityScore <= 100;
    const position = VERDICT_LADDER.indexOf(verdict);
    if (!scoreUsable || position === -1) {
        return verdict;
    }
    let step = 0;
    if (authorityScore >= 80) {
        step = -1;
    }
    else if (authorityScore <= 30) {
        step = 1;
    }
    if (step === 0) {
        return verdict;
    }
    const lastIndex = VERDICT_LADDER.length - 1;
    const target = Math.min(lastIndex, Math.max(0, position + step));
    return VERDICT_LADDER[target];
}
|
|
124
479
|
function gradeForPenalty(penalty) {
|
|
125
480
|
if (penalty <= 20)
|
|
126
481
|
return "A";
|
|
@@ -182,7 +537,15 @@ function runRulesOnPages(pages,
|
|
|
182
537
|
* `respectNoindex: true` would hide noindex'd pages from the very rules
|
|
183
538
|
* designed to flag accidental noindex'ing.
|
|
184
539
|
*/
|
|
185
|
-
noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full"
|
|
540
|
+
noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full",
|
|
541
|
+
/**
|
|
542
|
+
* 2026-05-03 calibration credibility fix: signals that the audit is
|
|
543
|
+
* running on a sampled subset of the discovered URLs. Rules whose
|
|
544
|
+
* outputs depend on a complete link graph (`links/unreachable-from-
|
|
545
|
+
* root`) skip their checks when this is true to avoid sampling-
|
|
546
|
+
* artifact false positives.
|
|
547
|
+
*/
|
|
548
|
+
sampled = false) {
|
|
186
549
|
const findings = [];
|
|
187
550
|
const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
|
|
188
551
|
const tag = (results) => results.map((r) => {
|
|
@@ -211,7 +574,7 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
211
574
|
findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
|
|
212
575
|
}
|
|
213
576
|
if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
|
|
214
|
-
findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay)));
|
|
577
|
+
findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
|
|
215
578
|
}
|
|
216
579
|
if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
|
|
217
580
|
findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
|
|
@@ -235,6 +598,17 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
235
598
|
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
236
599
|
findings.push(...tag(eeatSignalsRule(pages)));
|
|
237
600
|
}
|
|
601
|
+
// 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
|
|
602
|
+
// structure + image alt-text were tier-1 gaps in the blind-spot audit.
|
|
603
|
+
if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
|
|
604
|
+
findings.push(...tag(titleUniquenessRule(pages)));
|
|
605
|
+
}
|
|
606
|
+
if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
|
|
607
|
+
findings.push(...tag(headingStructureRule(pages)));
|
|
608
|
+
}
|
|
609
|
+
if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
|
|
610
|
+
findings.push(...tag(imageAltTextRule(pages)));
|
|
611
|
+
}
|
|
238
612
|
// Link rules — use the global link graph
|
|
239
613
|
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
240
614
|
findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
@@ -244,12 +618,15 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
244
618
|
}
|
|
245
619
|
if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
|
|
246
620
|
if (rootUrl) {
|
|
247
|
-
findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound)));
|
|
621
|
+
findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
|
|
248
622
|
}
|
|
249
623
|
}
|
|
250
624
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
251
625
|
findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
|
|
252
626
|
}
|
|
627
|
+
if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
|
|
628
|
+
findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
|
|
629
|
+
}
|
|
253
630
|
// Tech rules
|
|
254
631
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
255
632
|
findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
@@ -271,6 +648,11 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
271
648
|
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
272
649
|
findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
|
|
273
650
|
}
|
|
651
|
+
// 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
|
|
652
|
+
// the v0.4.x README without ever shipping. Now it does.
|
|
653
|
+
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
654
|
+
findings.push(...tag(ogCompletenessRule(pages)));
|
|
655
|
+
}
|
|
274
656
|
// Schema rules
|
|
275
657
|
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
276
658
|
findings.push(...tag(jsonLdValidRule(pages)));
|
|
@@ -323,13 +705,67 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
|
|
|
323
705
|
function hashHtml(html) {
|
|
324
706
|
return createHash("sha256").update(html, "utf8").digest("hex");
|
|
325
707
|
}
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
708
|
+
/**
 * v0.4.3 — remap finding severity / confidence according to the scoring
 * profile selected for `classification`. Intended to run before bucketing so
 * downstream counts see the remapped values.
 *
 * The input is never mutated. When the profile defines no overrides at all
 * the input array itself is returned; otherwise a new array is returned in
 * which findings without an override are passed through by reference and
 * findings with one are shallow copies carrying the new `severity` and/or
 * `confidence`.
 *
 * @param {Array<object>} findings Findings, each with a `ruleId`.
 * @param {object} [classification] Site classification handed to `profileFor`.
 * @returns {Array<object>} Findings with profile overrides applied.
 */
export function applyScoringProfileOverrides(findings, classification) {
    const { severityOverrides, confidenceOverrides } = profileFor(classification);
    const overrideCount = Object.keys(severityOverrides).length +
        Object.keys(confidenceOverrides).length;
    if (overrideCount === 0) {
        return findings;
    }
    return findings.map((finding) => {
        const severity = severityOverrides[finding.ruleId];
        const confidence = confidenceOverrides[finding.ruleId];
        if (severity === undefined && confidence === undefined) {
            return finding;
        }
        const remapped = { ...finding };
        if (severity !== undefined) {
            remapped.severity = severity;
        }
        if (confidence !== undefined) {
            remapped.confidence = confidence;
        }
        return remapped;
    });
}
|
|
735
|
+
/**
 * 2026-05-03 credibility: report which of the active profile's severity
 * overrides were exercised by this audit, i.e. the rule IDs that appear in
 * `findings` AND have an entry in `profile.severityOverrides`.
 *
 * The check is "an override exists for this rule", not "the severity value
 * differs from the rule's native one". The result feeds
 * `summary.appliedSeverityDemotions` so formatters can explain the demotion
 * instead of hiding it.
 *
 * @param {Array<object>} findings Findings, each with a `ruleId`.
 * @param {object} [classification] Site classification handed to `profileFor`.
 * @returns {string[]} Unique rule IDs, sorted ascending.
 */
function computeAppliedDemotions(findings, classification) {
    const { severityOverrides } = profileFor(classification);
    if (Object.keys(severityOverrides).length === 0) {
        return [];
    }
    const firedRuleIds = findings
        .map((finding) => finding.ruleId)
        .filter((ruleId) => severityOverrides[ruleId] !== undefined);
    return [...new Set(firedRuleIds)].sort();
}
|
|
755
|
+
/**
|
|
756
|
+
* v0.4.3 — confidence-and-count-aware scoring. Replaces the v0.4 model that
|
|
757
|
+
* counted only severity. Each rule has a `baseImpact + (count - 1) *
|
|
758
|
+
* perInstance` contribution capped by `maxImpact`. The result is multiplied
|
|
759
|
+
* by the finding's `confidence` (default `high` → 1.0). Per-site-type
|
|
760
|
+
* profiles can remap a rule's severity / confidence; this function expects
|
|
761
|
+
* those overrides to ALREADY be applied to the input findings.
|
|
762
|
+
*
|
|
763
|
+
* Bucket math: per-rule impacts sum into the rule's `CATEGORY_MAP` bucket;
|
|
764
|
+
* each bucket is then capped at 100 and weighted by the active scoring
|
|
765
|
+
* profile's `categoryWeights`.
|
|
766
|
+
*/
|
|
767
|
+
function scoreFromFindings(findings, classification) {
|
|
768
|
+
const profile = profileFor(classification);
|
|
333
769
|
// v0.4 four-bucket raw penalties.
|
|
334
770
|
const bucketRaw = {
|
|
335
771
|
integrity: 0,
|
|
@@ -348,18 +784,16 @@ function scoreFromFindings(findings) {
|
|
|
348
784
|
let blockers = 0;
|
|
349
785
|
let shouldFix = 0;
|
|
350
786
|
let informational = 0;
|
|
787
|
+
// Group findings by ruleId so we can apply baseImpact + perInstance.
|
|
788
|
+
// Each group's weighted impact lands in its category bucket.
|
|
789
|
+
const groups = new Map();
|
|
351
790
|
for (const finding of findings) {
|
|
352
791
|
const namespace = finding.ruleId.split("/")[0];
|
|
353
792
|
const bucket = CATEGORY_MAP[namespace];
|
|
354
793
|
if (!bucket)
|
|
355
794
|
continue;
|
|
356
|
-
|
|
357
|
-
// v0.4 buckets.
|
|
358
|
-
bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weight);
|
|
359
|
-
if (bucket !== "audit") {
|
|
795
|
+
if (bucket !== "audit")
|
|
360
796
|
bucketIssues[bucket] += 1;
|
|
361
|
-
}
|
|
362
|
-
// Issue-bucket counts (audit/* findings are diagnostic-only and excluded).
|
|
363
797
|
if (bucket === "audit")
|
|
364
798
|
continue;
|
|
365
799
|
if (finding.severity === "critical" || finding.severity === "error")
|
|
@@ -368,11 +802,73 @@ function scoreFromFindings(findings) {
|
|
|
368
802
|
shouldFix += 1;
|
|
369
803
|
else
|
|
370
804
|
informational += 1;
|
|
805
|
+
const arr = groups.get(finding.ruleId) ?? [];
|
|
806
|
+
arr.push(finding);
|
|
807
|
+
groups.set(finding.ruleId, arr);
|
|
808
|
+
}
|
|
809
|
+
// 2026-05-03 calibration credibility fix: track info-severity vs
|
|
810
|
+
// non-info contributions to each bucket separately so a flood of info
|
|
811
|
+
// findings can't fill the bucket cap and tank the verdict on its own.
|
|
812
|
+
// Round 7 surfaced this on Airbyte and round 8 on Zapier — both had
|
|
813
|
+
// ALL info-severity findings in their top drivers yet scored
|
|
814
|
+
// `concerning` because cumulative info impact filled the citation
|
|
815
|
+
// bucket past its 100 cap. Now: info contribution per bucket caps at
|
|
816
|
+
// 50; warning+ contribution caps at 100; final bucket = sum, capped
|
|
817
|
+
// at 100. A site with no real warning/error findings can score at
|
|
818
|
+
// most ~12.5 risk from info accumulation at typical 0.25 citation
|
|
819
|
+
// weight — which keeps verdict aligned with the visible severity in
|
|
820
|
+
// the report.
|
|
821
|
+
const bucketInfoOnly = {
|
|
822
|
+
integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
|
|
823
|
+
};
|
|
824
|
+
const bucketNonInfo = {
|
|
825
|
+
integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
|
|
826
|
+
};
|
|
827
|
+
for (const [ruleId, group] of groups) {
|
|
828
|
+
const namespace = ruleId.split("/")[0];
|
|
829
|
+
const bucket = CATEGORY_MAP[namespace];
|
|
830
|
+
if (!bucket || bucket === "audit")
|
|
831
|
+
continue;
|
|
832
|
+
const impactSpec = RULE_IMPACTS[ruleId] ?? DEFAULT_RULE_IMPACT;
|
|
833
|
+
const count = group.length;
|
|
834
|
+
const rawImpact = impactSpec.baseImpact + Math.max(0, count - 1) * impactSpec.perInstance;
|
|
835
|
+
const cap = impactSpec.maxImpact ?? Number.POSITIVE_INFINITY;
|
|
836
|
+
const cappedImpact = Math.min(cap, rawImpact);
|
|
837
|
+
// Confidence multiplier — use the HIGHEST confidence (largest multiplier)
|
|
838
|
+
// in the group so a rule that fires repeatedly with mixed confidence is
|
|
839
|
+
// not unfairly downweighted to its lowest-confidence instance.
|
|
840
|
+
let bestMultiplier = 0;
|
|
841
|
+
for (const f of group) {
|
|
842
|
+
const conf = f.confidence ?? "high";
|
|
843
|
+
const m = CONFIDENCE_MULTIPLIER[conf];
|
|
844
|
+
if (m > bestMultiplier)
|
|
845
|
+
bestMultiplier = m;
|
|
846
|
+
}
|
|
847
|
+
if (bestMultiplier === 0)
|
|
848
|
+
bestMultiplier = CONFIDENCE_MULTIPLIER.high;
|
|
849
|
+
const weighted = cappedImpact * bestMultiplier;
|
|
850
|
+
// Bucket the rule's contribution by the highest severity in the group.
|
|
851
|
+
// Mixed-severity groups (e.g. error + info) count toward non-info — once
|
|
852
|
+
// a rule has any non-info finding, its count contribution is treated as
|
|
853
|
+
// a real-issue signal, not info accumulation.
|
|
854
|
+
const isInfoOnly = group.every((f) => f.severity === "info");
|
|
855
|
+
if (isInfoOnly) {
|
|
856
|
+
bucketInfoOnly[bucket] += weighted;
|
|
857
|
+
}
|
|
858
|
+
else {
|
|
859
|
+
bucketNonInfo[bucket] += weighted;
|
|
860
|
+
}
|
|
371
861
|
}
|
|
372
|
-
const
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
bucketRaw.
|
|
862
|
+
for (const key of ["integrity", "discoverability", "citation", "data"]) {
|
|
863
|
+
const info = Math.min(50, bucketInfoOnly[key]);
|
|
864
|
+
const nonInfo = Math.min(100, bucketNonInfo[key]);
|
|
865
|
+
bucketRaw[key] = Math.min(100, info + nonInfo);
|
|
866
|
+
}
|
|
867
|
+
const cw = profile.categoryWeights;
|
|
868
|
+
const weighted = bucketRaw.integrity * cw.integrity +
|
|
869
|
+
bucketRaw.discoverability * cw.discoverability +
|
|
870
|
+
bucketRaw.citation * cw.citation +
|
|
871
|
+
bucketRaw.data * cw.data;
|
|
376
872
|
const risk = Math.round(Math.min(100, weighted));
|
|
377
873
|
const categories = {
|
|
378
874
|
integrity: { grade: gradeForPenalty(bucketRaw.integrity), issues: bucketIssues.integrity },
|
|
@@ -534,6 +1030,25 @@ function parseSitemapUrls(xml) {
|
|
|
534
1030
|
const matches = Array.from(xml.matchAll(/<loc>\s*([^<\s]+)\s*<\/loc>/gi));
|
|
535
1031
|
return matches.map((match) => match[1]).filter(Boolean);
|
|
536
1032
|
}
|
|
1033
|
+
/** The five entities predefined by XML; sitemap URLs must be escaped with them. */
const SITEMAP_XML_ENTITIES = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
/**
 * Turn the raw text of a sitemap element into its character data: CDATA
 * sections are unwrapped verbatim, predefined and numeric character
 * references outside CDATA are decoded, and surrounding whitespace is
 * trimmed. Done in a single pass so `&amp;amp;` decodes once, to `&amp;`.
 */
function decodeSitemapXmlText(raw) {
    return raw
        .replace(/<!\[CDATA\[([\s\S]*?)\]\]>|&(#x[0-9a-f]+|#\d+|amp|lt|gt|quot|apos);/gi, (match, cdata, entity) => {
            if (cdata !== undefined) {
                return cdata;
            }
            if (entity[0] !== "#") {
                return SITEMAP_XML_ENTITIES[entity.toLowerCase()];
            }
            const isHex = entity[1] === "x" || entity[1] === "X";
            const codePoint = isHex
                ? Number.parseInt(entity.slice(2), 16)
                : Number.parseInt(entity.slice(1), 10);
            // Leave out-of-range references untouched instead of throwing.
            return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
        })
        .trim();
}
/**
 * Extract `{ url, lastmod }` pairs from sitemap XML.
 *
 * Handles both `<url>` blocks (inside `<urlset>`) and `<sitemap>` blocks
 * (inside `<sitemapindex>`); each carries a `<loc>` and an optional
 * `<lastmod>`. Blocks without a `<loc>`, or whose `<loc>` is empty, are
 * skipped.
 *
 * `<loc>` / `<lastmod>` text is XML-decoded: the sitemap protocol requires
 * URLs to be entity-escaped (`&` is written `&amp;`), and some generators
 * wrap values in CDATA. Returning the raw text would yield URLs that do
 * not match the real page address.
 *
 * @param {string} xml Sitemap or sitemap-index document text.
 * @returns {Array<{ url: string, lastmod: string | undefined }>} Entries in
 *   document order; `lastmod` is `undefined` when absent or empty.
 */
export function parseSitemapUrlsWithLastmod(xml) {
    const out = [];
    // `\b` keeps `<urlset>` / `<sitemapindex>` from matching as `<url>` / `<sitemap>`.
    const blocks = xml.matchAll(/<(url|sitemap)\b[^>]*>([\s\S]*?)<\/\1>/gi);
    for (const block of blocks) {
        const inner = block[2] ?? "";
        const locMatch = inner.match(/<loc\b[^>]*>([\s\S]*?)<\/loc>/i);
        if (!locMatch) {
            continue;
        }
        const url = decodeSitemapXmlText(locMatch[1]);
        if (!url) {
            continue;
        }
        const lastmodMatch = inner.match(/<lastmod\b[^>]*>([\s\S]*?)<\/lastmod>/i);
        const lastmodText = lastmodMatch ? decodeSitemapXmlText(lastmodMatch[1]) : "";
        // An empty <lastmod></lastmod> carries no information; report it as absent.
        const lastmod = lastmodText === "" ? undefined : lastmodText;
        out.push({ url, lastmod });
    }
    return out;
}
|
|
537
1052
|
function looksLikeSitemap(text) {
|
|
538
1053
|
const lowered = text.toLowerCase();
|
|
539
1054
|
return lowered.includes("<urlset") || lowered.includes("<sitemapindex");
|
|
@@ -602,22 +1117,32 @@ function shouldIgnore(url, patterns) {
|
|
|
602
1117
|
}
|
|
603
1118
|
return false;
|
|
604
1119
|
}
|
|
605
|
-
function fisherYatesSample(items, n) {
|
|
1120
|
+
/**
 * Draw a random sample of up to `n` items using a partial Fisher-Yates
 * shuffle: only the last `n` slots of a copy are shuffled into place, then
 * returned. The input is not modified.
 *
 * `n` is clamped to `0..items.length`. Without the clamp, an `n` larger
 * than the input made `slice()` receive a negative start index and silently
 * drop elements (3 items with n = 5 returned only 2).
 *
 * @param {Iterable<*>} items Population to sample from.
 * @param {number} n Desired sample size.
 * @param {() => number} [random=Math.random] Source of uniform numbers in
 *   [0, 1); injectable so callers can make the sample deterministic.
 * @returns {Array<*>} `min(max(n, 0), items.length)` sampled items.
 */
function fisherYatesSample(items, n, random = Math.random) {
    const pool = [...items];
    const take = Math.min(Math.max(n, 0), pool.length);
    // Slots at index >= firstKept form the returned sample.
    const firstKept = pool.length - take;
    for (let i = pool.length - 1; i > 0 && i >= firstKept; i -= 1) {
        const j = Math.floor(random() * (i + 1));
        [pool[i], pool[j]] = [pool[j], pool[i]];
    }
    return pool.slice(firstKept);
}
|
|
613
1128
|
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
|
|
614
1129
|
visited.add(sitemapUrl);
|
|
615
|
-
const
|
|
1130
|
+
const entries = parseSitemapUrlsWithLastmod(sitemapText);
|
|
616
1131
|
if (!isSitemapIndex(sitemapText)) {
|
|
617
|
-
|
|
1132
|
+
const urls = [];
|
|
1133
|
+
const lastmodByUrl = new Map();
|
|
1134
|
+
for (const entry of entries) {
|
|
1135
|
+
urls.push(entry.url);
|
|
1136
|
+
if (entry.lastmod !== undefined) {
|
|
1137
|
+
lastmodByUrl.set(entry.url, entry.lastmod);
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
return { urls, lastmodByUrl };
|
|
618
1141
|
}
|
|
619
1142
|
const allUrls = [];
|
|
620
|
-
|
|
1143
|
+
const allLastmodByUrl = new Map();
|
|
1144
|
+
for (const entry of entries) {
|
|
1145
|
+
const childUrl = entry.url;
|
|
621
1146
|
if (signal?.aborted)
|
|
622
1147
|
throw signal.reason ?? new Error("aborted");
|
|
623
1148
|
if (visited.has(childUrl))
|
|
@@ -628,10 +1153,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
628
1153
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
629
1154
|
if (!childLike)
|
|
630
1155
|
continue;
|
|
631
|
-
const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1156
|
+
const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
632
1157
|
allUrls.push(...childUrls);
|
|
1158
|
+
for (const [u, lm] of childLastmodByUrl) {
|
|
1159
|
+
allLastmodByUrl.set(u, lm);
|
|
1160
|
+
}
|
|
633
1161
|
}
|
|
634
|
-
return allUrls;
|
|
1162
|
+
return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
|
|
635
1163
|
}
|
|
636
1164
|
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
637
1165
|
if (!origin)
|
|
@@ -664,7 +1192,7 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
664
1192
|
function budgetExceeded(b) {
|
|
665
1193
|
return b.cap > 0 && b.used >= b.cap;
|
|
666
1194
|
}
|
|
667
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
|
|
1195
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
|
|
668
1196
|
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
669
1197
|
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
670
1198
|
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
@@ -724,11 +1252,33 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
724
1252
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
725
1253
|
if (isXml) {
|
|
726
1254
|
const visited = new Set();
|
|
727
|
-
const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1255
|
+
const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
728
1256
|
// If we have a budget, sample from sitemap URLs before fetching
|
|
729
|
-
const
|
|
1257
|
+
const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
730
1258
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
731
1259
|
: allSitemapUrls;
|
|
1260
|
+
// v0.5: change-driven monitoring. Apply the decision matrix BEFORE
|
|
1261
|
+
// fetching bodies. URLs in plan.skip are not network-touched at all —
|
|
1262
|
+
// their findings will be carried forward from prior state by the caller.
|
|
1263
|
+
// This is the whole point of monitoring mode: rule eval is microseconds,
|
|
1264
|
+
// the fetch is seconds; move the skip decision upstream of the fetch.
|
|
1265
|
+
let scrapePlan;
|
|
1266
|
+
let urlsToFetch;
|
|
1267
|
+
if (monitoringContext) {
|
|
1268
|
+
scrapePlan = planScrapeStrategy({
|
|
1269
|
+
candidateUrls: sampledUrls,
|
|
1270
|
+
priorState: monitoringContext.priorState,
|
|
1271
|
+
sitemapLastmodByUrl,
|
|
1272
|
+
currentRulesetVersion: monitoringContext.currentRulesetVersion,
|
|
1273
|
+
ageFloorDays: monitoringContext.ageFloorDays,
|
|
1274
|
+
now: monitoringContext.now,
|
|
1275
|
+
forceRefetchUrls: monitoringContext.forceRefetchUrls,
|
|
1276
|
+
});
|
|
1277
|
+
urlsToFetch = Array.from(scrapePlan.refetch.keys());
|
|
1278
|
+
}
|
|
1279
|
+
else {
|
|
1280
|
+
urlsToFetch = sampledUrls;
|
|
1281
|
+
}
|
|
732
1282
|
const pages = [];
|
|
733
1283
|
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
734
1284
|
const sourceOrigin = (() => { try {
|
|
@@ -835,7 +1385,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
835
1385
|
});
|
|
836
1386
|
}
|
|
837
1387
|
}
|
|
838
|
-
return { pages, sitemapUrls: new Set(allSitemapUrls), discoveredUrlCount: allSitemapUrls.length };
|
|
1388
|
+
return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
|
|
839
1389
|
}
|
|
840
1390
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
841
1391
|
const initialPage = { url: source, html: text };
|
|
@@ -958,6 +1508,9 @@ export async function auditSource(source, options) {
|
|
|
958
1508
|
const ignorePatterns = options?.ignore ?? [];
|
|
959
1509
|
const respectNoindex = options?.respectNoindex ?? true;
|
|
960
1510
|
const skipDetectedAuth = options?.skipDetectedAuth ?? false;
|
|
1511
|
+
const skipBoilerplate = options?.skipBoilerplate ?? false;
|
|
1512
|
+
const skipSearchPages = options?.skipSearchPages ?? false;
|
|
1513
|
+
const skipEmptyBody = options?.skipEmptyBody ?? false;
|
|
961
1514
|
const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
|
|
962
1515
|
const externalSignal = options?.signal;
|
|
963
1516
|
const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
|
|
@@ -973,12 +1526,26 @@ export async function auditSource(source, options) {
|
|
|
973
1526
|
let backpressureError = null;
|
|
974
1527
|
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
975
1528
|
const observer = new FetchObserver();
|
|
1529
|
+
// 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
|
|
1530
|
+
// gate aborted 4 of 12 reputable-pSEO audits on what was normal load
|
|
1531
|
+
// variance — Zapier at p95=576ms (2.4× a 236ms baseline), Webflow at
|
|
1532
|
+
// p95=1808ms (2.2× 833ms), Airbyte at p95=1288ms (3.4× 380ms). For real
|
|
1533
|
+
// production CDNs these spikes are noise, not degradation. Raise the
|
|
1534
|
+
// gate so it still catches truly broken origins (sustained 4× slowdown
|
|
1535
|
+
// OR p95 above 8s) without tripping on normal audit-induced load.
|
|
976
1536
|
const monitor = backpressureEnabled
|
|
977
1537
|
? new BackpressureMonitor({
|
|
978
1538
|
warmupSize: 10,
|
|
979
|
-
absoluteP95Ms:
|
|
980
|
-
baselineMultiplier:
|
|
981
|
-
|
|
1539
|
+
absoluteP95Ms: 8000,
|
|
1540
|
+
baselineMultiplier: 4,
|
|
1541
|
+
// 2026-05-03 production fix: 0.1 (10%) was tripping pseolint.dev
|
|
1542
|
+
// audits on real production sites that legitimately return ~10% 5xx
|
|
1543
|
+
// (transient errors, async page renderers warming up, sites in
|
|
1544
|
+
// canary). Combined with the `>=` comparison bug (also fixed),
|
|
1545
|
+
// this aborted every web-app audit. 0.15 keeps the gate honest —
|
|
1546
|
+
// a sustained 15%+ 5xx rate is a real problem, not noise — while
|
|
1547
|
+
// keeping transient errors from bringing down the whole audit.
|
|
1548
|
+
errorRatioThreshold: 0.15,
|
|
982
1549
|
})
|
|
983
1550
|
: null;
|
|
984
1551
|
// v0.4: framework gets set on the first observation that carries headers
|
|
@@ -1010,6 +1577,8 @@ export async function auditSource(source, options) {
|
|
|
1010
1577
|
entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
|
|
1011
1578
|
thinContentMinWords: options?.rules?.thinContentMinWords ?? DEFAULTS.thinContentMinWords,
|
|
1012
1579
|
publicationVelocityMaxPerDay: options?.rules?.publicationVelocityMaxPerDay ?? DEFAULTS.publicationVelocityMaxPerDay,
|
|
1580
|
+
publicationVelocityMaxPerDayCorpusFraction: options?.rules?.publicationVelocityMaxPerDayCorpusFraction
|
|
1581
|
+
?? DEFAULTS.publicationVelocityMaxPerDayCorpusFraction,
|
|
1013
1582
|
boilerplateMaxRatio: options?.rules?.boilerplateMaxRatio ?? DEFAULTS.boilerplateMaxRatio,
|
|
1014
1583
|
templateDiversityMinUniqueRatio: options?.rules?.templateDiversityMinUniqueRatio ?? DEFAULTS.templateDiversityMinUniqueRatio,
|
|
1015
1584
|
uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
|
|
@@ -1051,7 +1620,65 @@ export async function auditSource(source, options) {
|
|
|
1051
1620
|
const fetchByteBudget = { used: 0, cap: maxFetchBytes };
|
|
1052
1621
|
// v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
|
|
1053
1622
|
// of the normal source URL fetch. No separate probe needed.
|
|
1054
|
-
|
|
1623
|
+
// v0.5: read prior state BEFORE loadPagesFromSource so the change-driven
|
|
1624
|
+
// monitoring decision matrix can run pre-fetch and tell loadPagesFromSource
|
|
1625
|
+
// which URLs to actually fetch. Reading state is cheap; doing it here also
|
|
1626
|
+
// means we know `priorState` once for both the monitoring path and the
|
|
1627
|
+
// post-audit state-write path further down.
|
|
1628
|
+
let priorState = null;
|
|
1629
|
+
const skippedUrls = [];
|
|
1630
|
+
const currentRenderMode = options?.render ? "rendered" : "static";
|
|
1631
|
+
if (options?.state?.path || options?.state?.since || options?.state?.exitOnRegression || options?.state?.mode) {
|
|
1632
|
+
const statePath = options.state?.path ?? ".pseolint/state.json";
|
|
1633
|
+
priorState = await readState(statePath);
|
|
1634
|
+
if (priorState && priorState.renderMode !== currentRenderMode) {
|
|
1635
|
+
console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
|
|
1636
|
+
priorState = null;
|
|
1637
|
+
}
|
|
1638
|
+
}
|
|
1639
|
+
// Effective monitoring mode:
|
|
1640
|
+
// - explicit `state.mode` wins ("monitoring" or "fresh")
|
|
1641
|
+
// - else if `--since` is passed and prior state exists → "monitoring" (back-compat alias)
|
|
1642
|
+
// - else if prior state exists → "monitoring" (auto, v0.5 default)
|
|
1643
|
+
// - else → "fresh" (no prior state available)
|
|
1644
|
+
const explicitMode = options?.state?.mode;
|
|
1645
|
+
const effectiveMode = explicitMode ??
|
|
1646
|
+
(priorState ? "monitoring" : "fresh");
|
|
1647
|
+
// Build the monitoring context only for HTTP sources in monitoring mode with
|
|
1648
|
+
// prior state. Single-page HTML and filesystem sources skip this — they are
|
|
1649
|
+
// exempted from the strategy (a single-page audit has nothing to plan; local
|
|
1650
|
+
// reads are cheap so re-reading every file beats branch complexity).
|
|
1651
|
+
const isHttpSource = /^https?:\/\//i.test(source);
|
|
1652
|
+
// If the user asked for monitoring against a filesystem source, surface that
|
|
1653
|
+
// we're ignoring the request. Silent bypass leads to "why is my state file
|
|
1654
|
+
// not being used?" debugging. Only log when the user actively chose
|
|
1655
|
+
// monitoring (explicit --mode or --since) — auto-monitoring on prior state
|
|
1656
|
+
// existence is implicit and shouldn't warn.
|
|
1657
|
+
if (!isHttpSource && effectiveMode === "monitoring" && (options?.state?.mode === "monitoring" || options?.state?.since)) {
|
|
1658
|
+
console.error("warning: monitoring mode requested but source is a local file/directory; reading every HTML file (the matrix only applies to HTTP sources).");
|
|
1659
|
+
}
|
|
1660
|
+
const monitoringContext = effectiveMode === "monitoring" && priorState && isHttpSource
|
|
1661
|
+
? {
|
|
1662
|
+
priorState,
|
|
1663
|
+
currentRulesetVersion: CORE_RULESET_VERSION,
|
|
1664
|
+
ageFloorDays: options?.state?.ageFloorDays ?? DEFAULT_AGE_FLOOR_DAYS,
|
|
1665
|
+
now: new Date(),
|
|
1666
|
+
forceRefetchUrls: options?.force?.urls,
|
|
1667
|
+
}
|
|
1668
|
+
: null;
|
|
1669
|
+
if (!priorState && options?.state?.since) {
|
|
1670
|
+
console.error("no prior state found — performing full baseline audit");
|
|
1671
|
+
}
|
|
1672
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, sitemapLastmodByUrl, discoveredUrlCount, scrapePlan } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
|
|
1673
|
+
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
1674
|
+
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
1675
|
+
// (kept for back-compat with --since consumers); T7 will carry their prior
|
|
1676
|
+
// findings forward and T8 will surface the full plan in summary.scrapePlan.
|
|
1677
|
+
if (scrapePlan) {
|
|
1678
|
+
for (const url of scrapePlan.skip.keys()) {
|
|
1679
|
+
skippedUrls.push(url);
|
|
1680
|
+
}
|
|
1681
|
+
}
|
|
1055
1682
|
throwIfAborted();
|
|
1056
1683
|
const loadedPages = [...loadedPagesRaw];
|
|
1057
1684
|
// v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
|
|
@@ -1080,34 +1707,11 @@ export async function auditSource(source, options) {
|
|
|
1080
1707
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
1081
1708
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
1082
1709
|
}
|
|
1083
|
-
//
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
priorState = await readState(statePath);
|
|
1089
|
-
const currentRenderMode = options.render ? "rendered" : "static";
|
|
1090
|
-
if (priorState && priorState.renderMode !== currentRenderMode) {
|
|
1091
|
-
console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
|
|
1092
|
-
priorState = null;
|
|
1093
|
-
}
|
|
1094
|
-
if (priorState && options.state.since) {
|
|
1095
|
-
const kept = [];
|
|
1096
|
-
for (const p of loadedPages) {
|
|
1097
|
-
const prior = priorState.urls[p.url];
|
|
1098
|
-
if (prior && prior.contentHash === computeContentHash(p.html)) {
|
|
1099
|
-
skippedUrls.push(p.url);
|
|
1100
|
-
}
|
|
1101
|
-
else {
|
|
1102
|
-
kept.push(p);
|
|
1103
|
-
}
|
|
1104
|
-
}
|
|
1105
|
-
loadedPages.splice(0, loadedPages.length, ...kept);
|
|
1106
|
-
}
|
|
1107
|
-
else if (!priorState && options.state.since) {
|
|
1108
|
-
console.error("no prior state found — performing full baseline audit");
|
|
1109
|
-
}
|
|
1110
|
-
}
|
|
1710
|
+
// v0.5: prior state was loaded BEFORE loadPagesFromSource so the change-
|
|
1711
|
+
// driven monitoring decision matrix could run pre-fetch. URLs the matrix
|
|
1712
|
+
// marked as "skip" were never fetched and are recorded in skippedUrls
|
|
1713
|
+
// above. The old post-fetch contentHash skip is gone — the decision now
|
|
1714
|
+
// happens upstream of the network round-trip.
|
|
1111
1715
|
let robotsTxtContent = "";
|
|
1112
1716
|
if (/^https?:\/\//i.test(source)) {
|
|
1113
1717
|
try {
|
|
@@ -1145,14 +1749,22 @@ export async function auditSource(source, options) {
|
|
|
1145
1749
|
? deduped.filter((page) => !shouldIgnore(page.url, ignorePatterns))
|
|
1146
1750
|
: deduped;
|
|
1147
1751
|
const strategy = options?.samplingStrategy ?? "stratified";
|
|
1148
|
-
|
|
1752
|
+
// 2026-05-03 calibration credibility fix: when sampleSeed is set, use a
|
|
1753
|
+
// deterministic PRNG so repeated audits pick the same pages and the
|
|
1754
|
+
// verdict is reproducible. Without a seed, fall back to Math.random
|
|
1755
|
+
// (legacy behavior, kept for backward compatibility).
|
|
1756
|
+
const samplingRandom = options?.sampleSeed !== undefined
|
|
1757
|
+
? mulberry32(options.sampleSeed)
|
|
1758
|
+
: Math.random;
|
|
1759
|
+
const isSampledAudit = sampleSize > 0 && sampleSize < filtered.length;
|
|
1760
|
+
const sampled = isSampledAudit
|
|
1149
1761
|
? (strategy === "stratified"
|
|
1150
1762
|
? (() => {
|
|
1151
1763
|
const urlsMap = new Map(filtered.map(p => [p.url, p]));
|
|
1152
|
-
const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize);
|
|
1764
|
+
const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize, samplingRandom);
|
|
1153
1765
|
return sampledUrls.map(u => urlsMap.get(u));
|
|
1154
1766
|
})()
|
|
1155
|
-
: fisherYatesSample(filtered, sampleSize))
|
|
1767
|
+
: fisherYatesSample(filtered, sampleSize, samplingRandom))
|
|
1156
1768
|
: filtered;
|
|
1157
1769
|
const parsedPagesAll = sampled.map((page) => {
|
|
1158
1770
|
const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
|
|
@@ -1168,7 +1780,13 @@ export async function auditSource(source, options) {
|
|
|
1168
1780
|
// (off for the CLI by default; on for the hosted web form).
|
|
1169
1781
|
const skippedByPolicy = [];
|
|
1170
1782
|
const parsedPages = parsedPagesAll.filter((p) => {
|
|
1171
|
-
const reason = pageSkipReason(p, {
|
|
1783
|
+
const reason = pageSkipReason(p, {
|
|
1784
|
+
respectNoindex,
|
|
1785
|
+
skipDetectedAuth,
|
|
1786
|
+
skipBoilerplate,
|
|
1787
|
+
skipSearchPages,
|
|
1788
|
+
skipEmptyBody,
|
|
1789
|
+
});
|
|
1172
1790
|
if (reason) {
|
|
1173
1791
|
skippedByPolicy.push({ url: p.url, reason });
|
|
1174
1792
|
return false;
|
|
@@ -1295,10 +1913,13 @@ export async function auditSource(source, options) {
|
|
|
1295
1913
|
continue;
|
|
1296
1914
|
const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
|
|
1297
1915
|
const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
|
|
1298
|
-
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
|
|
1916
|
+
const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full", isSampledAudit);
|
|
1299
1917
|
allFindings.push(...findings);
|
|
1300
1918
|
groupPageCounts[groupName] = groupPages.length;
|
|
1301
|
-
|
|
1919
|
+
// v0.4.3: per-group scoring uses the same site-classification profile so
|
|
1920
|
+
// group-level risk numbers reflect the same severity / confidence remaps
|
|
1921
|
+
// as the headline verdict.
|
|
1922
|
+
const { risk: groupRisk } = scoreFromFindings(applyScoringProfileOverrides(findings, siteClassification), siteClassification);
|
|
1302
1923
|
groupScores[groupName] = groupRisk;
|
|
1303
1924
|
}
|
|
1304
1925
|
throwIfAborted();
|
|
@@ -1308,10 +1929,61 @@ export async function auditSource(source, options) {
|
|
|
1308
1929
|
});
|
|
1309
1930
|
// Populate docsUrl on every finding before they leave the engine.
|
|
1310
1931
|
withDocsUrls(enriched.findings);
|
|
1311
|
-
|
|
1932
|
+
// v0.4.3: apply site-type-aware severity + confidence overrides so blocker
|
|
1933
|
+
// counts, issue buckets, and category bucketing all reflect the user-visible
|
|
1934
|
+
// severity (not the rule's native severity). The remapped findings replace
|
|
1935
|
+
// the enrichment output so every downstream consumer (summary.issues, AI
|
|
1936
|
+
// triage input, telemetry, formatters) sees the corrected severity.
|
|
1937
|
+
enriched.findings = applyScoringProfileOverrides(enriched.findings, siteClassification);
|
|
1938
|
+
// v0.5: change-driven monitoring carry-forward. URLs that the pre-fetch
|
|
1939
|
+
// strategy marked as "skip" were never fetched this run, so no rule produced
|
|
1940
|
+
// findings for them. Restore their findings from prior state, marked with
|
|
1941
|
+
// `carriedForward: true` and `lastVerifiedAt` so consumers can reason about
|
|
1942
|
+
// staleness. Inject after enrichment + overrides — these findings already
|
|
1943
|
+
// went through both in their original run; re-running enrichment would
|
|
1944
|
+
// strip their template / cluster assignments because parsedPages doesn't
|
|
1945
|
+
// contain the skipped pages.
|
|
1946
|
+
if (priorState && skippedUrls.length > 0) {
|
|
1947
|
+
for (const url of skippedUrls) {
|
|
1948
|
+
const prior = priorState.urls[url];
|
|
1949
|
+
if (!prior || prior.findings.length === 0)
|
|
1950
|
+
continue;
|
|
1951
|
+
for (const f of prior.findings) {
|
|
1952
|
+
const carried = {
|
|
1953
|
+
ruleId: f.ruleId,
|
|
1954
|
+
severity: f.severity,
|
|
1955
|
+
message: f.message,
|
|
1956
|
+
confidence: f.confidence,
|
|
1957
|
+
carriedForward: true,
|
|
1958
|
+
lastVerifiedAt: prior.fetchedAt,
|
|
1959
|
+
// State stores `url` but the engine type uses `pageUrl` — map back.
|
|
1960
|
+
pageUrl: typeof f.url === "string" ? f.url : url,
|
|
1961
|
+
};
|
|
1962
|
+
// Optional fields are preserved opportunistically when present in state.
|
|
1963
|
+
if (typeof f.fix === "string")
|
|
1964
|
+
carried.fix = f.fix;
|
|
1965
|
+
if (typeof f.ref === "string")
|
|
1966
|
+
carried.ref = f.ref;
|
|
1967
|
+
if (typeof f.docsUrl === "string")
|
|
1968
|
+
carried.docsUrl = f.docsUrl;
|
|
1969
|
+
if (Array.isArray(f.relatedUrls))
|
|
1970
|
+
carried.relatedUrls = f.relatedUrls;
|
|
1971
|
+
if (typeof f.group === "string")
|
|
1972
|
+
carried.group = f.group;
|
|
1973
|
+
if (typeof f.similarity === "number")
|
|
1974
|
+
carried.similarity = f.similarity;
|
|
1975
|
+
if (f.context !== undefined)
|
|
1976
|
+
carried.context = f.context;
|
|
1977
|
+
if (f.effort !== undefined)
|
|
1978
|
+
carried.effort = f.effort;
|
|
1979
|
+
enriched.findings.push(carried);
|
|
1980
|
+
}
|
|
1981
|
+
}
|
|
1982
|
+
}
|
|
1983
|
+
const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings, siteClassification);
|
|
1312
1984
|
const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
|
|
1313
1985
|
const issues = bucketIssues(enriched.findings);
|
|
1314
|
-
const verdict = verdictForRisk(risk);
|
|
1986
|
+
const verdict = shiftVerdictForAuthority(verdictForRisk(risk), options?.authorityScore);
|
|
1315
1987
|
const headline = buildHeadline(bucketCounts);
|
|
1316
1988
|
// audit/* findings are diagnostic-only and never appear in summary.issues.
|
|
1317
1989
|
// Surface them under diagnostics so consumers (telemetry, debug UIs) can
|
|
@@ -1323,6 +1995,7 @@ export async function auditSource(source, options) {
|
|
|
1323
1995
|
fetched: parsedPages.length,
|
|
1324
1996
|
skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
|
|
1325
1997
|
};
|
|
1998
|
+
const appliedSeverityDemotions = computeAppliedDemotions(enriched.findings, siteClassification);
|
|
1326
1999
|
const summary = {
|
|
1327
2000
|
schemaVersion: SCHEMA_VERSION,
|
|
1328
2001
|
verdict,
|
|
@@ -1331,6 +2004,7 @@ export async function auditSource(source, options) {
|
|
|
1331
2004
|
categories,
|
|
1332
2005
|
issues,
|
|
1333
2006
|
siteClassification,
|
|
2007
|
+
appliedSeverityDemotions: appliedSeverityDemotions.length > 0 ? appliedSeverityDemotions : undefined,
|
|
1334
2008
|
diagnostics: {
|
|
1335
2009
|
originReadiness: readinessReport,
|
|
1336
2010
|
crawlStats,
|
|
@@ -1377,6 +2051,31 @@ export async function auditSource(source, options) {
|
|
|
1377
2051
|
if (allSkipped.length > 0) {
|
|
1378
2052
|
summary.skippedUrls = allSkipped;
|
|
1379
2053
|
}
|
|
2054
|
+
// v0.5+: surface the change-driven monitoring summary when this run was a
|
|
2055
|
+
// monitoring run (had prior state and didn't force --mode=fresh). Filesystem
|
|
2056
|
+
// sources don't get a scrapePlan because they bypass the matrix.
|
|
2057
|
+
if (effectiveMode === "monitoring" && priorState && scrapePlan) {
|
|
2058
|
+
const reasonCounts = {};
|
|
2059
|
+
for (const reason of scrapePlan.refetch.values()) {
|
|
2060
|
+
reasonCounts[reason] = (reasonCounts[reason] ?? 0) + 1;
|
|
2061
|
+
}
|
|
2062
|
+
for (const reason of scrapePlan.skip.values()) {
|
|
2063
|
+
reasonCounts[reason] = (reasonCounts[reason] ?? 0) + 1;
|
|
2064
|
+
}
|
|
2065
|
+
// `fetched` is the number of URLs whose bodies actually came back —
|
|
2066
|
+
// robots-disallowed, byte-budget-exceeded, content-type-filtered, and 4xx
|
|
2067
|
+
// URLs the matrix INTENDED to refetch may have dropped out before we got
|
|
2068
|
+
// here. `intended` (= scrapePlan.refetch.size) is exposed too so callers
|
|
2069
|
+
// can spot the gap (e.g. "intended 200, fetched 187, 13 URLs dropped").
|
|
2070
|
+
summary.scrapePlan = {
|
|
2071
|
+
fetched: loadedPages.length,
|
|
2072
|
+
intended: scrapePlan.refetch.size,
|
|
2073
|
+
carriedForward: scrapePlan.skip.size,
|
|
2074
|
+
reasonCounts,
|
|
2075
|
+
rulesetVersion: CORE_RULESET_VERSION,
|
|
2076
|
+
lastFullAuditAt: priorState.lastFullAuditAt ?? priorState.lastRun ?? null,
|
|
2077
|
+
};
|
|
2078
|
+
}
|
|
1380
2079
|
// v0.4.1: surface noindex / auth skips as a discoverable diagnostic so the
|
|
1381
2080
|
// user sees what the engine excluded. Catches the accidental-noindex bug:
|
|
1382
2081
|
// pages silently dropped from indexing show up as a visible skip line
|
|
@@ -1384,6 +2083,9 @@ export async function auditSource(source, options) {
|
|
|
1384
2083
|
if (skippedByPolicy.length > 0) {
|
|
1385
2084
|
const noindexCount = skippedByPolicy.filter((s) => s.reason === "noindex").length;
|
|
1386
2085
|
const authCount = skippedByPolicy.filter((s) => s.reason === "auth-detected").length;
|
|
2086
|
+
const boilerplateCount = skippedByPolicy.filter((s) => s.reason === "boilerplate").length;
|
|
2087
|
+
const searchCount = skippedByPolicy.filter((s) => s.reason === "search-result").length;
|
|
2088
|
+
const spaShellCount = skippedByPolicy.filter((s) => s.reason === "spa-shell").length;
|
|
1387
2089
|
const sample = skippedByPolicy.slice(0, 5).map((s) => `${s.url} (${s.reason})`).join(", ");
|
|
1388
2090
|
const more = skippedByPolicy.length > 5 ? `, +${skippedByPolicy.length - 5} more` : "";
|
|
1389
2091
|
const parts = [];
|
|
@@ -1391,6 +2093,12 @@ export async function auditSource(source, options) {
|
|
|
1391
2093
|
parts.push(`${noindexCount} marked noindex`);
|
|
1392
2094
|
if (authCount > 0)
|
|
1393
2095
|
parts.push(`${authCount} detected as auth (login/register/etc)`);
|
|
2096
|
+
if (boilerplateCount > 0)
|
|
2097
|
+
parts.push(`${boilerplateCount} cookie/legal/consent boilerplate`);
|
|
2098
|
+
if (searchCount > 0)
|
|
2099
|
+
parts.push(`${searchCount} search-result page${searchCount === 1 ? "" : "s"}`);
|
|
2100
|
+
if (spaShellCount > 0)
|
|
2101
|
+
parts.push(`${spaShellCount} un-hydrated SPA shell${spaShellCount === 1 ? "" : "s"}`);
|
|
1394
2102
|
auditFindings.push({
|
|
1395
2103
|
ruleId: "audit/skipped-by-policy",
|
|
1396
2104
|
severity: "info",
|
|
@@ -1410,6 +2118,13 @@ export async function auditSource(source, options) {
|
|
|
1410
2118
|
for (const f of enrichedFindings) {
|
|
1411
2119
|
if (!f.pageUrl)
|
|
1412
2120
|
continue;
|
|
2121
|
+
// Carried-forward findings are not "current" — we did not re-verify them
|
|
2122
|
+
// this run. Including them would mask a genuine regression on a skipped
|
|
2123
|
+
// URL: prior set has rule X carried-forward, current set also has X
|
|
2124
|
+
// (carried-forward), comparison says "no new rule", we miss the case
|
|
2125
|
+
// where the page actually started failing rule Y too.
|
|
2126
|
+
if (f.carriedForward)
|
|
2127
|
+
continue;
|
|
1413
2128
|
const set = currentFindings.get(f.pageUrl) ?? new Set();
|
|
1414
2129
|
set.add(f.ruleId);
|
|
1415
2130
|
currentFindings.set(f.pageUrl, set);
|
|
@@ -1435,6 +2150,12 @@ export async function auditSource(source, options) {
|
|
|
1435
2150
|
const renderMode = options.render ? "rendered" : "static";
|
|
1436
2151
|
const urls = {};
|
|
1437
2152
|
const findingsByUrl = new Map();
|
|
2153
|
+
// v0.5+: persist full finding records per URL so future monitoring runs
|
|
2154
|
+
// can carry them forward when the URL is skipped pre-fetch. Carried-
|
|
2155
|
+
// forward findings (carriedForward=true) are NOT re-persisted under the
|
|
2156
|
+
// fetched URL — they belong to the prior entry that's preserved verbatim
|
|
2157
|
+
// for skipped URLs below.
|
|
2158
|
+
const fullFindingsByUrl = new Map();
|
|
1438
2159
|
for (const f of enrichedFindings) {
|
|
1439
2160
|
if (!f.pageUrl)
|
|
1440
2161
|
continue;
|
|
@@ -1442,9 +2163,16 @@ export async function auditSource(source, options) {
|
|
|
1442
2163
|
if (!list.includes(f.ruleId))
|
|
1443
2164
|
list.push(f.ruleId);
|
|
1444
2165
|
findingsByUrl.set(f.pageUrl, list);
|
|
2166
|
+
if (!f.carriedForward) {
|
|
2167
|
+
const records = fullFindingsByUrl.get(f.pageUrl) ?? [];
|
|
2168
|
+
records.push(f);
|
|
2169
|
+
fullFindingsByUrl.set(f.pageUrl, records);
|
|
2170
|
+
}
|
|
1445
2171
|
}
|
|
1446
|
-
// Preserve prior entries for URLs
|
|
1447
|
-
//
|
|
2172
|
+
// Preserve prior entries for URLs the monitoring matrix skipped (we never
|
|
2173
|
+
// fetched them this run; their fetchedAt MUST NOT advance or the age floor
|
|
2174
|
+
// never trips). Skipped URLs include those in scrapePlan.skip plus any
|
|
2175
|
+
// robots-skipped URLs from prior runs that are still in priorState.
|
|
1448
2176
|
if (priorState && skippedUrls.length > 0) {
|
|
1449
2177
|
for (const url of skippedUrls) {
|
|
1450
2178
|
const prior = priorState.urls[url];
|
|
@@ -1452,19 +2180,65 @@ export async function auditSource(source, options) {
|
|
|
1452
2180
|
urls[url] = prior;
|
|
1453
2181
|
}
|
|
1454
2182
|
}
|
|
2183
|
+
const nowIso = new Date().toISOString();
|
|
1455
2184
|
for (const p of loadedPages) {
|
|
1456
|
-
urls[p.url]
|
|
2185
|
+
const priorEntry = priorState?.urls[p.url];
|
|
2186
|
+
const responseHeaders = p.httpMeta?.headers;
|
|
2187
|
+
const lastModifiedHeader = responseHeaders?.["last-modified"];
|
|
2188
|
+
const etagHeader = responseHeaders?.["etag"];
|
|
2189
|
+
const sitemapLastmodForUrl = sitemapLastmodByUrl?.get(p.url);
|
|
2190
|
+
const entry = {
|
|
1457
2191
|
contentHash: computeContentHash(p.html),
|
|
1458
|
-
fetchedAt:
|
|
2192
|
+
fetchedAt: nowIso,
|
|
1459
2193
|
status: p.httpMeta?.statusCode ?? 200,
|
|
1460
2194
|
findingIds: findingsByUrl.get(p.url) ?? [],
|
|
2195
|
+
findings: (fullFindingsByUrl.get(p.url) ?? []).map((f) => ({
|
|
2196
|
+
id: `${f.ruleId}::${p.url}`,
|
|
2197
|
+
ruleId: f.ruleId,
|
|
2198
|
+
severity: f.severity,
|
|
2199
|
+
confidence: f.confidence ?? "high",
|
|
2200
|
+
message: f.message,
|
|
2201
|
+
...(f.fix !== undefined ? { fix: f.fix } : {}),
|
|
2202
|
+
...(f.ref !== undefined ? { ref: f.ref } : {}),
|
|
2203
|
+
...(f.docsUrl !== undefined ? { docsUrl: f.docsUrl } : {}),
|
|
2204
|
+
...(f.pageUrl !== undefined ? { url: f.pageUrl } : {}),
|
|
2205
|
+
...(f.relatedUrls !== undefined ? { relatedUrls: f.relatedUrls } : {}),
|
|
2206
|
+
...(f.group !== undefined ? { group: f.group } : {}),
|
|
2207
|
+
...(f.similarity !== undefined ? { similarity: f.similarity } : {}),
|
|
2208
|
+
...(f.context !== undefined ? { context: f.context } : {}),
|
|
2209
|
+
...(f.effort !== undefined ? { effort: f.effort } : {}),
|
|
2210
|
+
})),
|
|
2211
|
+
rulesetVersion: CORE_RULESET_VERSION,
|
|
1461
2212
|
};
|
|
2213
|
+
if (lastModifiedHeader)
|
|
2214
|
+
entry.lastModified = lastModifiedHeader;
|
|
2215
|
+
else if (priorEntry?.lastModified)
|
|
2216
|
+
entry.lastModified = priorEntry.lastModified;
|
|
2217
|
+
if (etagHeader)
|
|
2218
|
+
entry.etag = etagHeader;
|
|
2219
|
+
else if (priorEntry?.etag)
|
|
2220
|
+
entry.etag = priorEntry.etag;
|
|
2221
|
+
if (sitemapLastmodForUrl)
|
|
2222
|
+
entry.sitemapLastmodAtAudit = sitemapLastmodForUrl;
|
|
2223
|
+
else if (priorEntry?.sitemapLastmodAtAudit)
|
|
2224
|
+
entry.sitemapLastmodAtAudit = priorEntry.sitemapLastmodAtAudit;
|
|
2225
|
+
urls[p.url] = entry;
|
|
1462
2226
|
}
|
|
2227
|
+
// `lastFullAuditAt` advances only when this run actually re-fetched every
|
|
2228
|
+
// candidate URL. In monitoring mode (matrix skipped some URLs), preserve
|
|
2229
|
+
// the prior baseline timestamp so callers can reason about staleness.
|
|
2230
|
+
// In fresh mode (every candidate URL was fetched), bump to now.
|
|
2231
|
+
const isMonitoringRun = effectiveMode === "monitoring" && priorState !== null;
|
|
2232
|
+
const lastFullAuditAt = isMonitoringRun
|
|
2233
|
+
? (priorState?.lastFullAuditAt ?? priorState?.lastRun ?? nowIso)
|
|
2234
|
+
: nowIso;
|
|
1463
2235
|
const newState = {
|
|
1464
2236
|
version: STATE_SCHEMA_VERSION,
|
|
1465
|
-
lastRun:
|
|
2237
|
+
lastRun: nowIso,
|
|
2238
|
+
lastFullAuditAt,
|
|
1466
2239
|
source,
|
|
1467
2240
|
renderMode,
|
|
2241
|
+
rulesetVersion: CORE_RULESET_VERSION,
|
|
1468
2242
|
urls,
|
|
1469
2243
|
summary: {
|
|
1470
2244
|
score: summary.risk,
|