@pseolint/core 0.4.3 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/README.md +264 -169
  2. package/dist/ai/manifest/diff.d.ts +78 -0
  3. package/dist/ai/manifest/diff.d.ts.map +1 -0
  4. package/dist/ai/manifest/diff.js +139 -0
  5. package/dist/ai/manifest/diff.js.map +1 -0
  6. package/dist/ai/manifest/index.d.ts +18 -0
  7. package/dist/ai/manifest/index.d.ts.map +1 -0
  8. package/dist/ai/manifest/index.js +15 -0
  9. package/dist/ai/manifest/index.js.map +1 -0
  10. package/dist/ai/manifest/validate-manifest.d.ts +37 -0
  11. package/dist/ai/manifest/validate-manifest.d.ts.map +1 -0
  12. package/dist/ai/manifest/validate-manifest.js +67 -0
  13. package/dist/ai/manifest/validate-manifest.js.map +1 -0
  14. package/dist/ai/manifest/validators/domain-patches.d.ts +15 -0
  15. package/dist/ai/manifest/validators/domain-patches.d.ts.map +1 -0
  16. package/dist/ai/manifest/validators/domain-patches.js +110 -0
  17. package/dist/ai/manifest/validators/domain-patches.js.map +1 -0
  18. package/dist/ai/manifest/validators/index.d.ts +5 -0
  19. package/dist/ai/manifest/validators/index.d.ts.map +1 -0
  20. package/dist/ai/manifest/validators/index.js +4 -0
  21. package/dist/ai/manifest/validators/index.js.map +1 -0
  22. package/dist/ai/manifest/validators/page-changes.d.ts +36 -0
  23. package/dist/ai/manifest/validators/page-changes.d.ts.map +1 -0
  24. package/dist/ai/manifest/validators/page-changes.js +221 -0
  25. package/dist/ai/manifest/validators/page-changes.js.map +1 -0
  26. package/dist/ai/manifest/validators/types.d.ts +17 -0
  27. package/dist/ai/manifest/validators/types.d.ts.map +1 -0
  28. package/dist/ai/manifest/validators/types.js +5 -0
  29. package/dist/ai/manifest/validators/types.js.map +1 -0
  30. package/dist/ai/orchestrate.d.ts +74 -0
  31. package/dist/ai/orchestrate.d.ts.map +1 -0
  32. package/dist/ai/orchestrate.js +54 -0
  33. package/dist/ai/orchestrate.js.map +1 -0
  34. package/dist/ai/orchestrator/budget.d.ts +57 -0
  35. package/dist/ai/orchestrator/budget.d.ts.map +1 -0
  36. package/dist/ai/orchestrator/budget.js +114 -0
  37. package/dist/ai/orchestrator/budget.js.map +1 -0
  38. package/dist/ai/orchestrator/finish-tool.d.ts +568 -0
  39. package/dist/ai/orchestrator/finish-tool.d.ts.map +1 -0
  40. package/dist/ai/orchestrator/finish-tool.js +114 -0
  41. package/dist/ai/orchestrator/finish-tool.js.map +1 -0
  42. package/dist/ai/orchestrator/index.d.ts +25 -0
  43. package/dist/ai/orchestrator/index.d.ts.map +1 -0
  44. package/dist/ai/orchestrator/index.js +21 -0
  45. package/dist/ai/orchestrator/index.js.map +1 -0
  46. package/dist/ai/orchestrator/log.d.ts +24 -0
  47. package/dist/ai/orchestrator/log.d.ts.map +1 -0
  48. package/dist/ai/orchestrator/log.js +48 -0
  49. package/dist/ai/orchestrator/log.js.map +1 -0
  50. package/dist/ai/orchestrator/page-cache.d.ts +64 -0
  51. package/dist/ai/orchestrator/page-cache.d.ts.map +1 -0
  52. package/dist/ai/orchestrator/page-cache.js +127 -0
  53. package/dist/ai/orchestrator/page-cache.js.map +1 -0
  54. package/dist/ai/orchestrator/prompt.d.ts +16 -0
  55. package/dist/ai/orchestrator/prompt.d.ts.map +1 -0
  56. package/dist/ai/orchestrator/prompt.js +52 -0
  57. package/dist/ai/orchestrator/prompt.js.map +1 -0
  58. package/dist/ai/orchestrator/runner.d.ts +65 -0
  59. package/dist/ai/orchestrator/runner.d.ts.map +1 -0
  60. package/dist/ai/orchestrator/runner.js +223 -0
  61. package/dist/ai/orchestrator/runner.js.map +1 -0
  62. package/dist/ai/orchestrator/session.d.ts +44 -0
  63. package/dist/ai/orchestrator/session.d.ts.map +1 -0
  64. package/dist/ai/orchestrator/session.js +64 -0
  65. package/dist/ai/orchestrator/session.js.map +1 -0
  66. package/dist/ai/orchestrator/types.d.ts +99 -0
  67. package/dist/ai/orchestrator/types.d.ts.map +1 -0
  68. package/dist/ai/orchestrator/types.js +8 -0
  69. package/dist/ai/orchestrator/types.js.map +1 -0
  70. package/dist/ai/probes/cache.d.ts +12 -0
  71. package/dist/ai/probes/cache.d.ts.map +1 -0
  72. package/dist/ai/probes/cache.js +46 -0
  73. package/dist/ai/probes/cache.js.map +1 -0
  74. package/dist/ai/tools/ask-ai-engine.d.ts +77 -0
  75. package/dist/ai/tools/ask-ai-engine.d.ts.map +1 -0
  76. package/dist/ai/tools/ask-ai-engine.js +253 -0
  77. package/dist/ai/tools/ask-ai-engine.js.map +1 -0
  78. package/dist/ai/tools/check-domain-crawler-access.d.ts +71 -0
  79. package/dist/ai/tools/check-domain-crawler-access.d.ts.map +1 -0
  80. package/dist/ai/tools/check-domain-crawler-access.js +76 -0
  81. package/dist/ai/tools/check-domain-crawler-access.js.map +1 -0
  82. package/dist/ai/tools/check-domain-llms-txt.d.ts +70 -0
  83. package/dist/ai/tools/check-domain-llms-txt.d.ts.map +1 -0
  84. package/dist/ai/tools/check-domain-llms-txt.js +75 -0
  85. package/dist/ai/tools/check-domain-llms-txt.js.map +1 -0
  86. package/dist/ai/tools/check-indexability.d.ts +58 -0
  87. package/dist/ai/tools/check-indexability.d.ts.map +1 -0
  88. package/dist/ai/tools/check-indexability.js +64 -0
  89. package/dist/ai/tools/check-indexability.js.map +1 -0
  90. package/dist/ai/tools/check-robots.d.ts +68 -0
  91. package/dist/ai/tools/check-robots.d.ts.map +1 -0
  92. package/dist/ai/tools/check-robots.js +90 -0
  93. package/dist/ai/tools/check-robots.js.map +1 -0
  94. package/dist/ai/tools/check-rule-answer-first.d.ts +54 -0
  95. package/dist/ai/tools/check-rule-answer-first.d.ts.map +1 -0
  96. package/dist/ai/tools/check-rule-answer-first.js +50 -0
  97. package/dist/ai/tools/check-rule-answer-first.js.map +1 -0
  98. package/dist/ai/tools/check-rule-canonical-consistency.d.ts +66 -0
  99. package/dist/ai/tools/check-rule-canonical-consistency.d.ts.map +1 -0
  100. package/dist/ai/tools/check-rule-canonical-consistency.js +51 -0
  101. package/dist/ai/tools/check-rule-canonical-consistency.js.map +1 -0
  102. package/dist/ai/tools/check-rule-citable-facts.d.ts +58 -0
  103. package/dist/ai/tools/check-rule-citable-facts.d.ts.map +1 -0
  104. package/dist/ai/tools/check-rule-citable-facts.js +41 -0
  105. package/dist/ai/tools/check-rule-citable-facts.js.map +1 -0
  106. package/dist/ai/tools/check-rule-content-modularity.d.ts +58 -0
  107. package/dist/ai/tools/check-rule-content-modularity.d.ts.map +1 -0
  108. package/dist/ai/tools/check-rule-content-modularity.js +45 -0
  109. package/dist/ai/tools/check-rule-content-modularity.js.map +1 -0
  110. package/dist/ai/tools/check-rule-faq-coverage.d.ts +54 -0
  111. package/dist/ai/tools/check-rule-faq-coverage.d.ts.map +1 -0
  112. package/dist/ai/tools/check-rule-faq-coverage.js +39 -0
  113. package/dist/ai/tools/check-rule-faq-coverage.js.map +1 -0
  114. package/dist/ai/tools/check-rule-freshness-signals.d.ts +54 -0
  115. package/dist/ai/tools/check-rule-freshness-signals.d.ts.map +1 -0
  116. package/dist/ai/tools/check-rule-freshness-signals.js +45 -0
  117. package/dist/ai/tools/check-rule-freshness-signals.js.map +1 -0
  118. package/dist/ai/tools/check-rule-json-ld-valid.d.ts +54 -0
  119. package/dist/ai/tools/check-rule-json-ld-valid.d.ts.map +1 -0
  120. package/dist/ai/tools/check-rule-json-ld-valid.js +44 -0
  121. package/dist/ai/tools/check-rule-json-ld-valid.js.map +1 -0
  122. package/dist/ai/tools/check-rule-missing-author.d.ts +54 -0
  123. package/dist/ai/tools/check-rule-missing-author.d.ts.map +1 -0
  124. package/dist/ai/tools/check-rule-missing-author.js +45 -0
  125. package/dist/ai/tools/check-rule-missing-author.js.map +1 -0
  126. package/dist/ai/tools/check-rule-near-duplicate.d.ts +82 -0
  127. package/dist/ai/tools/check-rule-near-duplicate.d.ts.map +1 -0
  128. package/dist/ai/tools/check-rule-near-duplicate.js +63 -0
  129. package/dist/ai/tools/check-rule-near-duplicate.js.map +1 -0
  130. package/dist/ai/tools/check-rule-required-fields.d.ts +50 -0
  131. package/dist/ai/tools/check-rule-required-fields.d.ts.map +1 -0
  132. package/dist/ai/tools/check-rule-required-fields.js +38 -0
  133. package/dist/ai/tools/check-rule-required-fields.js.map +1 -0
  134. package/dist/ai/tools/check-rule-schema-consistency.d.ts +54 -0
  135. package/dist/ai/tools/check-rule-schema-consistency.d.ts.map +1 -0
  136. package/dist/ai/tools/check-rule-schema-consistency.js +44 -0
  137. package/dist/ai/tools/check-rule-schema-consistency.js.map +1 -0
  138. package/dist/ai/tools/check-rule-summary-bait.d.ts +54 -0
  139. package/dist/ai/tools/check-rule-summary-bait.d.ts.map +1 -0
  140. package/dist/ai/tools/check-rule-summary-bait.js +39 -0
  141. package/dist/ai/tools/check-rule-summary-bait.js.map +1 -0
  142. package/dist/ai/tools/check-rule-thin-content.d.ts +66 -0
  143. package/dist/ai/tools/check-rule-thin-content.d.ts.map +1 -0
  144. package/dist/ai/tools/check-rule-thin-content.js +58 -0
  145. package/dist/ai/tools/check-rule-thin-content.js.map +1 -0
  146. package/dist/ai/tools/detect-templates.d.ts +60 -0
  147. package/dist/ai/tools/detect-templates.d.ts.map +1 -0
  148. package/dist/ai/tools/detect-templates.js +43 -0
  149. package/dist/ai/tools/detect-templates.js.map +1 -0
  150. package/dist/ai/tools/fetch-page.d.ts +70 -0
  151. package/dist/ai/tools/fetch-page.d.ts.map +1 -0
  152. package/dist/ai/tools/fetch-page.js +93 -0
  153. package/dist/ai/tools/fetch-page.js.map +1 -0
  154. package/dist/ai/tools/fetch-sitemap.d.ts +60 -0
  155. package/dist/ai/tools/fetch-sitemap.d.ts.map +1 -0
  156. package/dist/ai/tools/fetch-sitemap.js +116 -0
  157. package/dist/ai/tools/fetch-sitemap.js.map +1 -0
  158. package/dist/ai/tools/index.d.ts +1555 -0
  159. package/dist/ai/tools/index.d.ts.map +1 -0
  160. package/dist/ai/tools/index.js +119 -0
  161. package/dist/ai/tools/index.js.map +1 -0
  162. package/dist/ai/tools/parse-page.d.ts +94 -0
  163. package/dist/ai/tools/parse-page.d.ts.map +1 -0
  164. package/dist/ai/tools/parse-page.js +108 -0
  165. package/dist/ai/tools/parse-page.js.map +1 -0
  166. package/dist/ai/tools/query-serp.d.ts +113 -0
  167. package/dist/ai/tools/query-serp.d.ts.map +1 -0
  168. package/dist/ai/tools/query-serp.js +131 -0
  169. package/dist/ai/tools/query-serp.js.map +1 -0
  170. package/dist/ai/tools/sample-template.d.ts +67 -0
  171. package/dist/ai/tools/sample-template.d.ts.map +1 -0
  172. package/dist/ai/tools/sample-template.js +75 -0
  173. package/dist/ai/tools/sample-template.js.map +1 -0
  174. package/dist/ai/tools/types.d.ts +73 -0
  175. package/dist/ai/tools/types.d.ts.map +1 -0
  176. package/dist/ai/tools/types.js +64 -0
  177. package/dist/ai/tools/types.js.map +1 -0
  178. package/dist/ai/tools/validate-jsonld.d.ts +62 -0
  179. package/dist/ai/tools/validate-jsonld.d.ts.map +1 -0
  180. package/dist/ai/tools/validate-jsonld.js +84 -0
  181. package/dist/ai/tools/validate-jsonld.js.map +1 -0
  182. package/dist/auditor.d.ts +4 -0
  183. package/dist/auditor.d.ts.map +1 -1
  184. package/dist/auditor.js +629 -64
  185. package/dist/auditor.js.map +1 -1
  186. package/dist/backpressure.d.ts.map +1 -1
  187. package/dist/backpressure.js +10 -3
  188. package/dist/backpressure.js.map +1 -1
  189. package/dist/enrich-findings.d.ts.map +1 -1
  190. package/dist/enrich-findings.js +15 -1
  191. package/dist/enrich-findings.js.map +1 -1
  192. package/dist/formatters/console.d.ts.map +1 -1
  193. package/dist/formatters/console.js +13 -0
  194. package/dist/formatters/console.js.map +1 -1
  195. package/dist/formatters/markdown.d.ts.map +1 -1
  196. package/dist/formatters/markdown.js +20 -2
  197. package/dist/formatters/markdown.js.map +1 -1
  198. package/dist/index.d.ts +12 -1
  199. package/dist/index.d.ts.map +1 -1
  200. package/dist/index.js +8 -0
  201. package/dist/index.js.map +1 -1
  202. package/dist/rule-references.d.ts.map +1 -1
  203. package/dist/rule-references.js +5 -0
  204. package/dist/rule-references.js.map +1 -1
  205. package/dist/rules/content/heading-structure.d.ts +21 -0
  206. package/dist/rules/content/heading-structure.d.ts.map +1 -0
  207. package/dist/rules/content/heading-structure.js +56 -0
  208. package/dist/rules/content/heading-structure.js.map +1 -0
  209. package/dist/rules/content/image-alt-text.d.ts +18 -0
  210. package/dist/rules/content/image-alt-text.d.ts.map +1 -0
  211. package/dist/rules/content/image-alt-text.js +77 -0
  212. package/dist/rules/content/image-alt-text.js.map +1 -0
  213. package/dist/rules/content/title-uniqueness.d.ts +18 -0
  214. package/dist/rules/content/title-uniqueness.d.ts.map +1 -0
  215. package/dist/rules/content/title-uniqueness.js +70 -0
  216. package/dist/rules/content/title-uniqueness.js.map +1 -0
  217. package/dist/rules/links/host-section-divergence.d.ts +3 -0
  218. package/dist/rules/links/host-section-divergence.d.ts.map +1 -0
  219. package/dist/rules/links/host-section-divergence.js +158 -0
  220. package/dist/rules/links/host-section-divergence.js.map +1 -0
  221. package/dist/rules/links/link-depth.d.ts +12 -1
  222. package/dist/rules/links/link-depth.d.ts.map +1 -1
  223. package/dist/rules/links/link-depth.js +25 -12
  224. package/dist/rules/links/link-depth.js.map +1 -1
  225. package/dist/rules/scope.d.ts.map +1 -1
  226. package/dist/rules/scope.js +5 -0
  227. package/dist/rules/scope.js.map +1 -1
  228. package/dist/rules/spam/doorway-pattern.d.ts.map +1 -1
  229. package/dist/rules/spam/doorway-pattern.js +27 -4
  230. package/dist/rules/spam/doorway-pattern.js.map +1 -1
  231. package/dist/rules/spam/publication-velocity.d.ts +1 -1
  232. package/dist/rules/spam/publication-velocity.d.ts.map +1 -1
  233. package/dist/rules/spam/publication-velocity.js +9 -4
  234. package/dist/rules/spam/publication-velocity.js.map +1 -1
  235. package/dist/rules/spam/template-coverage.js +1 -1
  236. package/dist/rules/spam/template-coverage.js.map +1 -1
  237. package/dist/rules/spam/template-diversity.js +1 -1
  238. package/dist/rules/spam/template-diversity.js.map +1 -1
  239. package/dist/rules/tech/hreflang-consistency.d.ts.map +1 -1
  240. package/dist/rules/tech/hreflang-consistency.js +33 -4
  241. package/dist/rules/tech/hreflang-consistency.js.map +1 -1
  242. package/dist/rules/tech/og-completeness.d.ts +11 -0
  243. package/dist/rules/tech/og-completeness.d.ts.map +1 -1
  244. package/dist/rules/tech/og-completeness.js +22 -23
  245. package/dist/rules/tech/og-completeness.js.map +1 -1
  246. package/dist/ruleset-version.d.ts +8 -0
  247. package/dist/ruleset-version.d.ts.map +1 -0
  248. package/dist/ruleset-version.js +8 -0
  249. package/dist/ruleset-version.js.map +1 -0
  250. package/dist/scrape-strategy.d.ts +42 -0
  251. package/dist/scrape-strategy.d.ts.map +1 -0
  252. package/dist/scrape-strategy.js +101 -0
  253. package/dist/scrape-strategy.js.map +1 -0
  254. package/dist/site-classifier.d.ts.map +1 -1
  255. package/dist/site-classifier.js +1 -0
  256. package/dist/site-classifier.js.map +1 -1
  257. package/dist/state.d.ts +36 -1
  258. package/dist/state.d.ts.map +1 -1
  259. package/dist/state.js +3 -1
  260. package/dist/state.js.map +1 -1
  261. package/dist/stratified-sample.d.ts +9 -1
  262. package/dist/stratified-sample.d.ts.map +1 -1
  263. package/dist/stratified-sample.js +23 -6
  264. package/dist/stratified-sample.js.map +1 -1
  265. package/dist/types.d.ts +135 -2
  266. package/dist/types.d.ts.map +1 -1
  267. package/dist/url-normalize.d.ts.map +1 -1
  268. package/dist/url-normalize.js +13 -1
  269. package/dist/url-normalize.js.map +1 -1
  270. package/package.json +90 -90
package/dist/auditor.js CHANGED
@@ -18,7 +18,12 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
18
18
  import { deadEndsRule } from "./rules/links/dead-ends.js";
19
19
  import { linkDepthRule } from "./rules/links/link-depth.js";
20
20
  import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
21
+ import { hostSectionDivergenceRule } from "./rules/links/host-section-divergence.js";
21
22
  import { orphanPagesRule } from "./rules/links/orphan-pages.js";
23
+ import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
24
+ import { titleUniquenessRule } from "./rules/content/title-uniqueness.js";
25
+ import { headingStructureRule } from "./rules/content/heading-structure.js";
26
+ import { imageAltTextRule } from "./rules/content/image-alt-text.js";
22
27
  import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
23
28
  import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
24
29
  import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
@@ -55,14 +60,17 @@ import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
55
60
  import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
56
61
  import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
57
62
  import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
58
- import { stratifiedSample } from "./stratified-sample.js";
63
+ import { stratifiedSample, mulberry32 } from "./stratified-sample.js";
59
64
  import { classifySite } from "./site-classifier.js";
60
65
  import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
66
+ import { CORE_RULESET_VERSION } from "./ruleset-version.js";
67
+ import { planScrapeStrategy, DEFAULT_AGE_FLOOR_DAYS } from "./scrape-strategy.js";
61
68
  const DEFAULTS = {
62
69
  nearDuplicateThreshold: 0.85,
63
70
  entitySwapThreshold: 0.95,
64
71
  thinContentMinWords: 300,
65
72
  publicationVelocityMaxPerDay: 100,
73
+ publicationVelocityMaxPerDayCorpusFraction: 0.10,
66
74
  boilerplateMaxRatio: 0.7,
67
75
  templateDiversityMinUniqueRatio: 0.35,
68
76
  uniqueValueMinWords: 100,
@@ -99,13 +107,60 @@ const SCORING_PROFILES = {
99
107
  "aeo/citable-facts": "info",
100
108
  "aeo/answer-first": "info",
101
109
  "aeo/summary-bait": "warning",
102
- "spam/thin-content": "warning",
110
+ // 2026-05-03 calibration round 5: Segment integrations had 24 thin
111
+ // pages (200-300 words is correct for a catalog record). thin-content
112
+ // contributing capped 40 impact pushed integrity to its 100 cap → 30
113
+ // contribution at small-marketing weight, which alone tripped
114
+ // 'concerning'. Demoting to info keeps the signal visible without
115
+ // tanking the verdict on catalog-shape sites mis-classified as
116
+ // small-marketing. Real marketing sites (linear.app etc) don't
117
+ // normally have many sub-300-word pages so this won't hide quality
118
+ // issues there.
119
+ "spam/thin-content": "info",
120
+ "aeo/freshness-signals": "info",
121
+ "content/missing-author": "info",
122
+ // 2026-05-03 calibration round 3: Segment integrations classified as
123
+ // small-marketing@0.88 and tripped doorway-pattern 300× critical
124
+ // (catalog records are thin + entity-swap by design — not actually a
125
+ // doorway funnel). The classifier mistakes catalog directories as
126
+ // small-marketing; this demotion absorbs that mis-classification
127
+ // without weakening detection on actual small-marketing sites
128
+ // (linear.app, supabase.com — none of which produce entity-swap pairs).
129
+ "spam/doorway-pattern": "warning",
130
+ // 2026-05-03 calibration round 4: spam/boilerplate-ratio fired ERROR
131
+ // on Segment's integration directory (24 pages, 60%+ shared template
132
+ // chrome). On a marketing-template site the rule is correct — repeated
133
+ // "About us" / "Pricing" copy across pages IS a quality issue. On a
134
+ // catalog mis-classified to small-marketing, the shared chrome IS the
135
+ // template — by design. Demote to warning here; real marketing sites
136
+ // (linear.app, supabase.com) won't trip it because their corpus is
137
+ // page-diverse, but catalog-shape pages classified as small-marketing
138
+ // (Segment, Wise) won't tank the verdict.
139
+ "spam/boilerplate-ratio": "warning",
140
+ // 2026-05-03 v0.5.2 round 10: og-completeness, heading-structure,
141
+ // image-alt-text were added as new rules and tipped Segment from
142
+ // concerning → critical because catalog/template-driven sites
143
+ // commonly have shared OG defaults, weird H1 patterns (multiple H1s
144
+ // for repeated nav cards), and unlabelled logo grids. These are
145
+ // real findings on isolated sites but typical for catalog shape;
146
+ // demote to info here so the signal stays visible without driving
147
+ // the verdict.
148
+ "tech/og-completeness": "info",
149
+ "content/heading-structure": "info",
150
+ "content/image-alt-text": "info",
103
151
  },
104
152
  confidenceOverrides: {
105
153
  "aeo/citable-facts": "low",
106
154
  "aeo/answer-first": "low",
107
155
  "aeo/summary-bait": "medium",
108
- "spam/thin-content": "medium",
156
+ "spam/thin-content": "low",
157
+ "aeo/freshness-signals": "low",
158
+ "content/missing-author": "low",
159
+ "spam/doorway-pattern": "medium",
160
+ "spam/boilerplate-ratio": "medium",
161
+ "tech/og-completeness": "low",
162
+ "content/heading-structure": "low",
163
+ "content/image-alt-text": "low",
109
164
  },
110
165
  },
111
166
  "blog": {
@@ -118,8 +173,82 @@ const SCORING_PROFILES = {
118
173
  },
119
174
  "programmatic-directory": {
120
175
  categoryWeights: { integrity: 0.55, discoverability: 0.15, citation: 0.20, data: 0.10, audit: 0 },
121
- severityOverrides: {},
122
- confidenceOverrides: {},
176
+ // Symmetry argument: every other profile has severity overrides for the
177
+ // rules that mis-fit its shape (`docs` demotes AEO + author rules,
178
+ // `ecommerce` demotes `aeo/citable-facts`, `small-marketing` demotes several
179
+ // rules). `programmatic-directory` is the site type *most* structurally
180
+ // different from the "page = article" assumptions the AEO and EEAT rules
181
+ // are calibrated against — yet was the only profile with no overrides.
182
+ //
183
+ // Pre-calibration adjustment: demote (never escalate) the rules that
184
+ // first-principles analysis predicts will false-positive on catalog-
185
+ // shaped sites (Zapier integrations, G2 categories, Wise currency pairs,
186
+ // etc.). A reputable-pSEO calibration corpus + runner has been added
187
+ // (scripts/calibration-reputable-pseo.ts); these overrides will be
188
+ // tightened or loosened based on actual fire-rates measured against
189
+ // sites that demonstrably win in production. See
190
+ // docs/superpowers/specs/2026-05-03-calibration-against-reputable-pseo.md.
191
+ severityOverrides: {
192
+ // Catalog pages are tables, not prose. AEO rules calibrated on
193
+ // editorial content over-fire here.
194
+ "aeo/citable-facts": "info",
195
+ "aeo/answer-first": "info",
196
+ "aeo/content-modularity": "info",
197
+ // 2026-05-03 calibration: freshness-signals fired on every page of
198
+ // every reputable pSEO site. Catalog freshness is expressed via the
199
+ // data (live currency rates, current job listings, current pricing),
200
+ // not via visible "last updated" stamps. Demote.
201
+ "aeo/freshness-signals": "info",
202
+ // Authorship lives at the platform level (operator's about page),
203
+ // not on every catalog record. Following the rule's "add a byline"
204
+ // fix on a Zillow listing would actively make the page worse.
205
+ "content/missing-author": "info",
206
+ "content/eeat-signals": "info",
207
+ // Template uniformity is correct for catalogs by design. Keep the
208
+ // signal but cap at warning — never error.
209
+ "spam/template-diversity": "warning",
210
+ // 2026-05-03 v0.5.2 round 10: same catalog logic as small-marketing.
211
+ "tech/og-completeness": "info",
212
+ "content/heading-structure": "info",
213
+ "content/image-alt-text": "info",
214
+ // 2026-05-03 calibration round 2: catalogs are near-duplicate by
215
+ // design. spam/near-duplicate fires CRITICAL on every catalog pair.
216
+ // Demote to warning — keeps the signal visible without dominating
217
+ // the score.
218
+ "spam/near-duplicate": "warning",
219
+ // 2026-05-03 calibration round 5: catalog records are by-design
220
+ // shorter than the 300-word default. Demote to info on programmatic-
221
+ // directory; the data IS the content.
222
+ "spam/thin-content": "info",
223
+ // 2026-05-03 calibration round 2: doorway-pattern fires CRITICAL on
224
+ // every (thin + entity-swap) pair. On Segment integrations, integration
225
+ // pages are thin (200-300 words is the right amount for a directory
226
+ // record) and entity-swap (slack/google-sheets, slack/airtable, …) by
227
+ // design. The composite signal is genuinely true but the *intent*
228
+ // (doorway funnel) doesn't match the reality (catalog record).
229
+ // Demoting to warning preserves the signal without tanking the score.
230
+ "spam/doorway-pattern": "warning",
231
+ // 2026-05-03 calibration round 4: catalog pages share template chrome
232
+ // by design — same as `spam/template-diversity`, this signal is
233
+ // structurally true on programmatic-directories.
234
+ "spam/boilerplate-ratio": "warning",
235
+ },
236
+ confidenceOverrides: {
237
+ "aeo/citable-facts": "low",
238
+ "aeo/answer-first": "low",
239
+ "aeo/content-modularity": "low",
240
+ "aeo/freshness-signals": "low",
241
+ "content/missing-author": "low",
242
+ "content/eeat-signals": "low",
243
+ "spam/template-diversity": "medium",
244
+ "spam/near-duplicate": "medium",
245
+ "spam/doorway-pattern": "medium",
246
+ "spam/boilerplate-ratio": "medium",
247
+ "spam/thin-content": "low",
248
+ "tech/og-completeness": "low",
249
+ "content/heading-structure": "low",
250
+ "content/image-alt-text": "low",
251
+ },
123
252
  },
124
253
  "ecommerce": {
125
254
  categoryWeights: { integrity: 0.20, discoverability: 0.40, citation: 0.15, data: 0.25, audit: 0 },
@@ -146,8 +275,75 @@ const SCORING_PROFILES = {
146
275
  },
147
276
  "unclear": {
148
277
  categoryWeights: { integrity: 0.50, discoverability: 0.20, citation: 0.25, data: 0.05, audit: 0 },
149
- severityOverrides: {},
150
- confidenceOverrides: {},
278
+ // 2026-05-03 calibration round 2: the original "stay strict when unsure"
279
+ // intent meant that 4 of 5 reputable pSEO sites that classified as
280
+ // unclear (Zapier integrations, Typeform templates, Jasper templates,
281
+ // Numbeo cost-of-living) failed their verdict ceiling. The dominant
282
+ // driver was always `aeo/citable-facts` at full error severity — but
283
+ // catalog/template-gallery pages don't have prose, so the rule fires
284
+ // for a STRUCTURAL reason (page is a table, not a paragraph), not a
285
+ // QUALITY reason. Demoting the structurally-incompatible rules to
286
+ // info on `unclear` is conservative:
287
+ // - if site is genuinely editorial and got mis-classified, signals
288
+ // still surface (just info, not error) — author can act on them.
289
+ // - if site is catalog and got mis-classified to unclear, verdict
290
+ // no longer falsely tanks.
291
+ // Real spam signals (near-dup, doorway, thin) kept their severity in this round; calibration rounds 3-5 below later demoted them as well.
292
+ severityOverrides: {
293
+ "aeo/citable-facts": "info",
294
+ "aeo/answer-first": "info",
295
+ "aeo/content-modularity": "info",
296
+ "aeo/freshness-signals": "info",
297
+ "content/missing-author": "info",
298
+ "content/eeat-signals": "info",
299
+ // 2026-05-03 calibration round 3: Airbyte classified as unclear@0.5
300
+ // and scored concerning despite all info-severity findings in the
301
+ // top 5. The 8 critical "blockers" came from spam/near-duplicate,
302
+ // spam/entity-swap, spam/doorway-pattern firing 1-2× each on its
303
+ // connectors directory — invisible per-rule but cumulatively pushing
304
+ // the score over 'caution'. On unclear sites we cannot tell whether
305
+ // these triple-fires represent a real doorway or a catalog; the
306
+ // calibration corpus shows reputable catalogs hitting them more
307
+ // often than real doorways do. Demote to warning — keeps the signal
308
+ // visible (it appears in shouldFix bucket, with full message) without
309
+ // tanking the verdict on a structurally-ambiguous site.
310
+ "spam/near-duplicate": "warning",
311
+ "spam/entity-swap": "warning",
312
+ "spam/doorway-pattern": "warning",
313
+ // 2026-05-03 calibration round 4: same boilerplate logic on unclear —
314
+ // we can't tell whether the site is a marketing site (boilerplate IS
315
+ // a quality issue) or a catalog (it isn't), so demote conservatively.
316
+ "spam/boilerplate-ratio": "warning",
317
+ // 2026-05-03 calibration round 5: same thin-content logic on unclear.
318
+ // Catalog-shape sites that classify as unclear (Zapier, Typeform,
319
+ // Jasper) had thin-content firing at error on the 5-15% of pages
320
+ // shorter than the 300-word default. Demote to info — surfaces the
321
+ // signal without driving the verdict on a structurally-ambiguous site.
322
+ "spam/thin-content": "info",
323
+ // 2026-05-03 v0.5.2 round 10: same demotions as programmatic-
324
+ // directory profile — these tipped Webflow/Zapier/Numbeo/Airbyte
325
+ // back into concerning territory because they classify as unclear
326
+ // and the new rules aren't yet calibrated for catalog shape.
327
+ "tech/og-completeness": "info",
328
+ "content/heading-structure": "info",
329
+ "content/image-alt-text": "info",
330
+ },
331
+ confidenceOverrides: {
332
+ "aeo/citable-facts": "low",
333
+ "aeo/answer-first": "low",
334
+ "aeo/content-modularity": "low",
335
+ "aeo/freshness-signals": "low",
336
+ "content/missing-author": "low",
337
+ "content/eeat-signals": "low",
338
+ "spam/near-duplicate": "medium",
339
+ "spam/entity-swap": "medium",
340
+ "spam/doorway-pattern": "medium",
341
+ "spam/boilerplate-ratio": "medium",
342
+ "spam/thin-content": "low",
343
+ "tech/og-completeness": "low",
344
+ "content/heading-structure": "low",
345
+ "content/image-alt-text": "low",
346
+ },
151
347
  },
152
348
  };
153
349
  /**
@@ -175,6 +371,10 @@ const RULE_IMPACTS = {
175
371
  "content/meta-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 40 },
176
372
  "content/missing-author": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
177
373
  "content/eeat-signals": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
374
+ // 2026-05-03 v0.5.2 blind-spot fixes
375
+ "content/title-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 25 }, // 2026-05-03 round 11: title is high-impact but the original 50-cap was disproportionate to other content rules and tipped Typeform into critical on a 6-finding cluster. Keep the rule at native error severity (duplicate titles ARE real bugs); just don't let one rule dominate the integrity bucket.
376
+ "content/heading-structure": { baseImpact: 5, perInstance: 1, maxImpact: 20 },
377
+ "content/image-alt-text": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
178
378
  // Tech — softened in v0.4.3-rc2 after dogfood showed nextjs.org regressing
179
379
  // from ready→caution on tech/canonical-consistency × 4 (legit cross-domain
180
380
  // canonicals on a CDN). Per-instance now 1 (was 3).
@@ -191,6 +391,7 @@ const RULE_IMPACTS = {
191
391
  // stripe.com from a single missing reciprocal pair — that should not be
192
392
  // treated as 350× the impact.
193
393
  "tech/hreflang-consistency": { baseImpact: 5, perInstance: 0, maxImpact: 5 },
394
+ "tech/og-completeness": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
194
395
  // Links
195
396
  "links/orphan-pages": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
196
397
  "links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
@@ -242,6 +443,39 @@ function verdictForRisk(risk) {
242
443
  return "concerning";
243
444
  return "critical";
244
445
  }
446
/**
 * 2026-05-03 v0.5.2 — apply the bring-your-own-authority shift to the
 * verdict ladder. The raw `risk` number is unchanged; only the user-
 * facing verdict mapping shifts.
 *
 *   `authorityScore >= 80` (established brand) → shift ONE TIER LENIENT
 *   `authorityScore <= 30` (newer/lower)       → shift ONE TIER STRICT
 *   anything strictly between 30 and 80        → no shift
 *   undefined / non-number / NaN / ±Infinity   → no shift
 *   outside the 0..100 range                   → no shift
 *
 * "One tier lenient" means: critical → concerning, concerning → caution,
 * caution → ready, ready → ready (clamped). "One tier strict" is the
 * inverse direction: ready → caution, caution → concerning,
 * concerning → critical, critical → critical.
 *
 * The ladder is ordered from least to most severe and is frozen so no
 * other code in the module can reorder or extend it by accident.
 */
const VERDICT_LADDER = Object.freeze(["ready", "caution", "concerning", "critical"]);
/**
 * @param {string} verdict - Verdict to shift. Values that are not on
 *   `VERDICT_LADDER` are returned unchanged.
 * @param {number} [authorityScore] - Caller-supplied authority score,
 *   expected in 0..100.
 * @returns {string} The shifted verdict, or `verdict` itself when no shift
 *   applies.
 */
function shiftVerdictForAuthority(verdict, authorityScore) {
    // Number.isFinite is false for undefined, null, strings, NaN and
    // ±Infinity, so this single guard also covers the "no score" case.
    if (!Number.isFinite(authorityScore) || authorityScore < 0 || authorityScore > 100) {
        return verdict;
    }
    const idx = VERDICT_LADDER.indexOf(verdict);
    if (idx < 0) {
        return verdict;
    }
    if (authorityScore >= 80) {
        // Lenient: one step toward "ready", clamped at the first rung.
        return VERDICT_LADDER[Math.max(0, idx - 1)];
    }
    if (authorityScore <= 30) {
        // Strict: one step toward "critical", clamped at the last rung.
        return VERDICT_LADDER[Math.min(VERDICT_LADDER.length - 1, idx + 1)];
    }
    return verdict;
}
245
479
  function gradeForPenalty(penalty) {
246
480
  if (penalty <= 20)
247
481
  return "A";
@@ -303,7 +537,15 @@ function runRulesOnPages(pages,
303
537
  * `respectNoindex: true` would hide noindex'd pages from the very rules
304
538
  * designed to flag accidental noindex'ing.
305
539
  */
306
- noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
540
+ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full",
541
+ /**
542
+ * 2026-05-03 calibration credibility fix: signals that the audit is
543
+ * running on a sampled subset of the discovered URLs. Rules whose
544
+ * outputs depend on a complete link graph (`links/unreachable-from-
545
+ * root`) skip their checks when this is true to avoid sampling-
546
+ * artifact false positives.
547
+ */
548
+ sampled = false) {
307
549
  const findings = [];
308
550
  const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
309
551
  const tag = (results) => results.map((r) => {
@@ -332,7 +574,7 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
332
574
  findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
333
575
  }
334
576
  if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
335
- findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay)));
577
+ findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
336
578
  }
337
579
  if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
338
580
  findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
@@ -356,6 +598,17 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
356
598
  if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
357
599
  findings.push(...tag(eeatSignalsRule(pages)));
358
600
  }
601
+ // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
602
+ // structure + image alt-text were tier-1 gaps in the blind-spot audit.
603
+ if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
604
+ findings.push(...tag(titleUniquenessRule(pages)));
605
+ }
606
+ if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
607
+ findings.push(...tag(headingStructureRule(pages)));
608
+ }
609
+ if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
610
+ findings.push(...tag(imageAltTextRule(pages)));
611
+ }
359
612
  // Link rules — use the global link graph
360
613
  if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
361
614
  findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
@@ -365,12 +618,15 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
365
618
  }
366
619
  if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
367
620
  if (rootUrl) {
368
- findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound)));
621
+ findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
369
622
  }
370
623
  }
371
624
  if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
372
625
  findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
373
626
  }
627
+ if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
628
+ findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
629
+ }
374
630
  // Tech rules
375
631
  if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
376
632
  findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
@@ -392,6 +648,11 @@ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, in
392
648
  // inconsistent — see auditor.test.ts "emits technical SEO findings".
393
649
  findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
394
650
  }
651
+ // 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
652
+ // the v0.4.x README without ever shipping. Now it does.
653
+ if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
654
+ findings.push(...tag(ogCompletenessRule(pages)));
655
+ }
395
656
  // Schema rules
396
657
  if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
397
658
  findings.push(...tag(jsonLdValidRule(pages)));
@@ -471,6 +732,26 @@ export function applyScoringProfileOverrides(findings, classification) {
471
732
  };
472
733
  });
473
734
  }
735
/**
 * 2026-05-03 credibility: sorted, de-duplicated list of rule IDs that appear
 * in `findings` AND have an entry in the active profile's
 * `severityOverrides`. This is the subset of the profile's static override
 * table that actually fired on this audit. It is surfaced via
 * `summary.appliedSeverityDemotions` so formatters can tell the user which
 * rules the engine remapped instead of hiding the mechanism.
 *
 * @param {Iterable<{ruleId: string}>} findings - Findings to inspect.
 * @param {*} classification - Site classification handed to `profileFor`.
 * @returns {string[]} Rule IDs in default (lexicographic) sort order; empty
 *   when the profile defines no overrides or none of them matched.
 */
function computeAppliedDemotions(findings, classification) {
    const { severityOverrides } = profileFor(classification);
    // Nothing can have fired if the profile has no overrides at all.
    if (Object.keys(severityOverrides).length === 0) {
        return [];
    }
    const firedRuleIds = new Set();
    for (const { ruleId } of findings) {
        if (severityOverrides[ruleId] !== undefined) {
            firedRuleIds.add(ruleId);
        }
    }
    return [...firedRuleIds].sort();
}
474
755
  /**
475
756
  * v0.4.3 — confidence-and-count-aware scoring. Replaces the v0.4 model that
476
757
  * counted only severity. Each rule has a `baseImpact + (count - 1) *
@@ -525,6 +806,24 @@ function scoreFromFindings(findings, classification) {
525
806
  arr.push(finding);
526
807
  groups.set(finding.ruleId, arr);
527
808
  }
809
+ // 2026-05-03 calibration credibility fix: track info-severity vs
810
+ // non-info contributions to each bucket separately so a flood of info
811
+ // findings can't fill the bucket cap and tank the verdict on its own.
812
+ // Round 7 surfaced this on Airbyte and round 8 on Zapier — both had
813
+ // ALL info-severity findings in their top drivers yet scored
814
+ // `concerning` because cumulative info impact filled the citation
815
+ // bucket past its 100 cap. Now: info contribution per bucket caps at
816
+ // 50; warning+ contribution caps at 100; final bucket = sum, capped
817
+ // at 100. A site with no real warning/error findings can score at
818
+ // most ~12.5 risk from info accumulation at typical 0.25 citation
819
+ // weight — which keeps verdict aligned with the visible severity in
820
+ // the report.
821
+ const bucketInfoOnly = {
822
+ integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
823
+ };
824
+ const bucketNonInfo = {
825
+ integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
826
+ };
528
827
  for (const [ruleId, group] of groups) {
529
828
  const namespace = ruleId.split("/")[0];
530
829
  const bucket = CATEGORY_MAP[namespace];
@@ -548,7 +847,22 @@ function scoreFromFindings(findings, classification) {
548
847
  if (bestMultiplier === 0)
549
848
  bestMultiplier = CONFIDENCE_MULTIPLIER.high;
550
849
  const weighted = cappedImpact * bestMultiplier;
551
- bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weighted);
850
+ // Bucket the rule's contribution by the highest severity in the group.
851
+ // Mixed-severity groups (e.g. error + info) count toward non-info — once
852
+ // a rule has any non-info finding, its count contribution is treated as
853
+ // a real-issue signal, not info accumulation.
854
+ const isInfoOnly = group.every((f) => f.severity === "info");
855
+ if (isInfoOnly) {
856
+ bucketInfoOnly[bucket] += weighted;
857
+ }
858
+ else {
859
+ bucketNonInfo[bucket] += weighted;
860
+ }
861
+ }
862
+ for (const key of ["integrity", "discoverability", "citation", "data"]) {
863
+ const info = Math.min(50, bucketInfoOnly[key]);
864
+ const nonInfo = Math.min(100, bucketNonInfo[key]);
865
+ bucketRaw[key] = Math.min(100, info + nonInfo);
552
866
  }
553
867
  const cw = profile.categoryWeights;
554
868
  const weighted = bucketRaw.integrity * cw.integrity +
@@ -716,6 +1030,25 @@ function parseSitemapUrls(xml) {
716
1030
  const matches = Array.from(xml.matchAll(/<loc>\s*([^<\s]+)\s*<\/loc>/gi));
717
1031
  return matches.map((match) => match[1]).filter(Boolean);
718
1032
  }
1033
/**
 * Turn the raw text content of a sitemap element into its real value.
 *
 * - Surrounding whitespace is trimmed.
 * - A value wrapped entirely in `<![CDATA[ ... ]]>` is unwrapped and returned
 *   verbatim (entities inside CDATA are literal text).
 * - Otherwise the five predefined XML entities and numeric character
 *   references are decoded in ONE pass, so `&amp;lt;` becomes `&lt;` and is
 *   not decoded a second time.
 *
 * @param {string} raw - Text captured between an opening and closing tag.
 * @returns {string} The decoded, trimmed value.
 */
function decodeSitemapXmlText(raw) {
    const text = raw.trim();
    const cdata = text.match(/^<!\[CDATA\[([\s\S]*?)\]\]>$/);
    if (cdata) {
        return cdata[1].trim();
    }
    const named = { amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'" };
    return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, body) => {
        if (body[0] !== "#") {
            return named[body];
        }
        const codePoint = body[1] === "x"
            ? Number.parseInt(body.slice(2), 16)
            : Number.parseInt(body.slice(1), 10);
        // Leave out-of-range references untouched instead of throwing from
        // String.fromCodePoint on malformed input.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
}
/**
 * Extract `{ url, lastmod }` entries from sitemap XML.
 *
 * Matches both `<url>...</url>` blocks (in `<urlset>`) and
 * `<sitemap>...</sitemap>` blocks (in `<sitemapindex>`). Both carry `<loc>`
 * plus an optional `<lastmod>`. Blocks without a non-empty `<loc>` are
 * skipped.
 *
 * The sitemap protocol requires URLs to be entity-escaped (`&` is written
 * as `&amp;`), and some generators wrap them in CDATA. Both forms are
 * decoded so the returned `url` is the address that should be fetched,
 * not its XML-escaped spelling.
 *
 * @param {string} xml - Raw sitemap or sitemap-index document.
 * @returns {Array<{url: string, lastmod: (string|undefined)}>} Entries in
 *   document order; `lastmod` is `undefined` when the block has no
 *   `<lastmod>` element.
 */
export function parseSitemapUrlsWithLastmod(xml) {
    const out = [];
    // `\b` after the tag name keeps `<urlset>` / `<sitemapindex>` from
    // matching; `\1` makes the closing tag agree with the opening one.
    const blocks = xml.matchAll(/<(url|sitemap)\b[^>]*>([\s\S]*?)<\/\1>/gi);
    for (const block of blocks) {
        const inner = block[2] ?? "";
        const locMatch = inner.match(/<loc\b[^>]*>([\s\S]*?)<\/loc>/i);
        if (!locMatch) {
            continue;
        }
        const url = decodeSitemapXmlText(locMatch[1]);
        if (!url) {
            continue;
        }
        const lastmodMatch = inner.match(/<lastmod\b[^>]*>([\s\S]*?)<\/lastmod>/i);
        const lastmod = lastmodMatch ? decodeSitemapXmlText(lastmodMatch[1]) : undefined;
        out.push({ url, lastmod });
    }
    return out;
}
719
1052
  function looksLikeSitemap(text) {
720
1053
  const lowered = text.toLowerCase();
721
1054
  return lowered.includes("<urlset") || lowered.includes("<sitemapindex");
@@ -784,22 +1117,32 @@ function shouldIgnore(url, patterns) {
784
1117
  }
785
1118
  return false;
786
1119
  }
787
/**
 * Draw up to `n` items uniformly at random, without replacement, using a
 * partial Fisher–Yates shuffle on a copy of `items` (the input is never
 * mutated). Only the tail of the copy that will be returned is shuffled.
 *
 * @param {Iterable<*>} items - Pool to sample from.
 * @param {number} n - Requested sample size. When it exceeds the pool size
 *   the whole pool is returned (shuffled); previously the negative slice
 *   start silently dropped items in that case.
 * @param {() => number} [random=Math.random] - Source of numbers in [0, 1);
 *   pass a seeded PRNG for reproducible samples.
 * @returns {Array<*>} The sampled items.
 */
function fisherYatesSample(items, n, random = Math.random) {
    const arr = [...items];
    // Clamp so `arr.length - take` can never go negative: a negative start
    // makes `slice` count from the END and return too few items.
    const take = n > arr.length ? arr.length : n;
    for (let i = arr.length - 1; i > 0 && arr.length - i <= take; i -= 1) {
        const j = Math.floor(random() * (i + 1));
        [arr[i], arr[j]] = [arr[j], arr[i]];
    }
    return arr.slice(arr.length - take);
}
795
1128
  async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
796
1129
  visited.add(sitemapUrl);
797
- const locs = parseSitemapUrls(sitemapText);
1130
+ const entries = parseSitemapUrlsWithLastmod(sitemapText);
798
1131
  if (!isSitemapIndex(sitemapText)) {
799
- return locs;
1132
+ const urls = [];
1133
+ const lastmodByUrl = new Map();
1134
+ for (const entry of entries) {
1135
+ urls.push(entry.url);
1136
+ if (entry.lastmod !== undefined) {
1137
+ lastmodByUrl.set(entry.url, entry.lastmod);
1138
+ }
1139
+ }
1140
+ return { urls, lastmodByUrl };
800
1141
  }
801
1142
  const allUrls = [];
802
- for (const childUrl of locs) {
1143
+ const allLastmodByUrl = new Map();
1144
+ for (const entry of entries) {
1145
+ const childUrl = entry.url;
803
1146
  if (signal?.aborted)
804
1147
  throw signal.reason ?? new Error("aborted");
805
1148
  if (visited.has(childUrl))
@@ -810,10 +1153,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
810
1153
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
811
1154
  if (!childLike)
812
1155
  continue;
813
- const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
1156
+ const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
814
1157
  allUrls.push(...childUrls);
1158
+ for (const [u, lm] of childLastmodByUrl) {
1159
+ allLastmodByUrl.set(u, lm);
1160
+ }
815
1161
  }
816
- return allUrls;
1162
+ return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
817
1163
  }
818
1164
  async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
819
1165
  if (!origin)
@@ -846,7 +1192,7 @@ function isDisallowedByRobots(urlPath, patterns) {
846
1192
  function budgetExceeded(b) {
847
1193
  return b.cap > 0 && b.used >= b.cap;
848
1194
  }
849
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
1195
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
850
1196
  // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
851
1197
  // audit (source, sitemap entries, redirects, discovered links) goes through
852
1198
  // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
@@ -906,11 +1252,33 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
906
1252
  const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
907
1253
  if (isXml) {
908
1254
  const visited = new Set();
909
- const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
1255
+ const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
910
1256
  // If we have a budget, sample from sitemap URLs before fetching
911
- const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
1257
+ const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
912
1258
  ? fisherYatesSample(allSitemapUrls, discoveryBudget)
913
1259
  : allSitemapUrls;
1260
+ // v0.5: change-driven monitoring. Apply the decision matrix BEFORE
1261
+ // fetching bodies. URLs in plan.skip are not network-touched at all —
1262
+ // their findings will be carried forward from prior state by the caller.
1263
+ // This is the whole point of monitoring mode: rule eval is microseconds,
1264
+ // the fetch is seconds; move the skip decision upstream of the fetch.
1265
+ let scrapePlan;
1266
+ let urlsToFetch;
1267
+ if (monitoringContext) {
1268
+ scrapePlan = planScrapeStrategy({
1269
+ candidateUrls: sampledUrls,
1270
+ priorState: monitoringContext.priorState,
1271
+ sitemapLastmodByUrl,
1272
+ currentRulesetVersion: monitoringContext.currentRulesetVersion,
1273
+ ageFloorDays: monitoringContext.ageFloorDays,
1274
+ now: monitoringContext.now,
1275
+ forceRefetchUrls: monitoringContext.forceRefetchUrls,
1276
+ });
1277
+ urlsToFetch = Array.from(scrapePlan.refetch.keys());
1278
+ }
1279
+ else {
1280
+ urlsToFetch = sampledUrls;
1281
+ }
914
1282
  const pages = [];
915
1283
  // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
916
1284
  const sourceOrigin = (() => { try {
@@ -1017,7 +1385,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1017
1385
  });
1018
1386
  }
1019
1387
  }
1020
- return { pages, sitemapUrls: new Set(allSitemapUrls), discoveredUrlCount: allSitemapUrls.length };
1388
+ return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
1021
1389
  }
1022
1390
  if (contentType.includes("html") || looksLikeHtml(text)) {
1023
1391
  const initialPage = { url: source, html: text };
@@ -1158,12 +1526,26 @@ export async function auditSource(source, options) {
1158
1526
  let backpressureError = null;
1159
1527
  const signal = composeSignals(externalSignal, backpressureAbort.signal);
1160
1528
  const observer = new FetchObserver();
1529
+ // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
1530
+ // gate aborted 4 of 12 reputable-pSEO audits on what was normal load
1531
+ // variance — Zapier at p95=576ms (2.4× a 236ms baseline), Webflow at
1532
+ // p95=1808ms (2.2× 833ms), Airbyte at p95=1288ms (3.4× 380ms). For real
1533
+ // production CDNs these spikes are noise, not degradation. Raise the
1534
+ // gate so it still catches truly broken origins (sustained 4× slowdown
1535
+ // OR p95 above 8s) without tripping on normal audit-induced load.
1161
1536
  const monitor = backpressureEnabled
1162
1537
  ? new BackpressureMonitor({
1163
1538
  warmupSize: 10,
1164
- absoluteP95Ms: 3000,
1165
- baselineMultiplier: 2,
1166
- errorRatioThreshold: 0.1,
1539
+ absoluteP95Ms: 8000,
1540
+ baselineMultiplier: 4,
1541
+ // 2026-05-03 production fix: 0.1 (10%) was tripping pseolint.dev
1542
+ // audits on real production sites that legitimately return ~10% 5xx
1543
+ // (transient errors, async page renderers warming up, sites in
1544
+ // canary). Combined with the `>=` comparison bug (also fixed),
1545
+ // this aborted every web-app audit. 0.15 keeps the gate honest —
1546
+ // a sustained 15%+ 5xx rate is a real problem, not noise — while
1547
+ // letting transient errors not bring down the whole audit.
1548
+ errorRatioThreshold: 0.15,
1167
1549
  })
1168
1550
  : null;
1169
1551
  // v0.4: framework gets set on the first observation that carries headers
@@ -1195,6 +1577,8 @@ export async function auditSource(source, options) {
1195
1577
  entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
1196
1578
  thinContentMinWords: options?.rules?.thinContentMinWords ?? DEFAULTS.thinContentMinWords,
1197
1579
  publicationVelocityMaxPerDay: options?.rules?.publicationVelocityMaxPerDay ?? DEFAULTS.publicationVelocityMaxPerDay,
1580
+ publicationVelocityMaxPerDayCorpusFraction: options?.rules?.publicationVelocityMaxPerDayCorpusFraction
1581
+ ?? DEFAULTS.publicationVelocityMaxPerDayCorpusFraction,
1198
1582
  boilerplateMaxRatio: options?.rules?.boilerplateMaxRatio ?? DEFAULTS.boilerplateMaxRatio,
1199
1583
  templateDiversityMinUniqueRatio: options?.rules?.templateDiversityMinUniqueRatio ?? DEFAULTS.templateDiversityMinUniqueRatio,
1200
1584
  uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
@@ -1236,7 +1620,65 @@ export async function auditSource(source, options) {
1236
1620
  const fetchByteBudget = { used: 0, cap: maxFetchBytes };
1237
1621
  // v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
1238
1622
  // of the normal source URL fetch. No separate probe needed.
1239
- const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered);
1623
+ // v0.5: read prior state BEFORE loadPagesFromSource so the change-driven
1624
+ // monitoring decision matrix can run pre-fetch and tell loadPagesFromSource
1625
+ // which URLs to actually fetch. Reading state is cheap; doing it here also
1626
+ // means we know `priorState` once for both the monitoring path and the
1627
+ // post-audit state-write path further down.
1628
+ let priorState = null;
1629
+ const skippedUrls = [];
1630
+ const currentRenderMode = options?.render ? "rendered" : "static";
1631
+ if (options?.state?.path || options?.state?.since || options?.state?.exitOnRegression || options?.state?.mode) {
1632
+ const statePath = options.state?.path ?? ".pseolint/state.json";
1633
+ priorState = await readState(statePath);
1634
+ if (priorState && priorState.renderMode !== currentRenderMode) {
1635
+ console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
1636
+ priorState = null;
1637
+ }
1638
+ }
1639
+ // Effective monitoring mode:
1640
+ // - explicit `state.mode` wins ("monitoring" or "fresh")
1641
+ // - else if `--since` is passed and prior state exists → "monitoring" (back-compat alias)
1642
+ // - else if prior state exists → "monitoring" (auto, v0.5 default)
1643
+ // - else → "fresh" (no prior state available)
1644
+ const explicitMode = options?.state?.mode;
1645
+ const effectiveMode = explicitMode ??
1646
+ (priorState ? "monitoring" : "fresh");
1647
+ // Build the monitoring context only for HTTP sources in monitoring mode with
1648
+ // prior state. Single-page HTML and filesystem sources skip this — they are
1649
+ // exempted from the strategy (a single-page audit has nothing to plan; local
1650
+ // reads are cheap so re-reading every file beats branch complexity).
1651
+ const isHttpSource = /^https?:\/\//i.test(source);
1652
+ // If the user asked for monitoring against a filesystem source, surface that
1653
+ // we're ignoring the request. Silent bypass leads to "why is my state file
1654
+ // not being used?" debugging. Only log when the user actively chose
1655
+ // monitoring (explicit --mode or --since) — auto-monitoring on prior state
1656
+ // existence is implicit and shouldn't warn.
1657
+ if (!isHttpSource && effectiveMode === "monitoring" && (options?.state?.mode === "monitoring" || options?.state?.since)) {
1658
+ console.error("warning: monitoring mode requested but source is a local file/directory; reading every HTML file (the matrix only applies to HTTP sources).");
1659
+ }
1660
+ const monitoringContext = effectiveMode === "monitoring" && priorState && isHttpSource
1661
+ ? {
1662
+ priorState,
1663
+ currentRulesetVersion: CORE_RULESET_VERSION,
1664
+ ageFloorDays: options?.state?.ageFloorDays ?? DEFAULT_AGE_FLOOR_DAYS,
1665
+ now: new Date(),
1666
+ forceRefetchUrls: options?.force?.urls,
1667
+ }
1668
+ : null;
1669
+ if (!priorState && options?.state?.since) {
1670
+ console.error("no prior state found — performing full baseline audit");
1671
+ }
1672
+ const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, sitemapLastmodByUrl, discoveredUrlCount, scrapePlan } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
1673
+ // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
1674
+ // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
1675
+ // (kept for back-compat with --since consumers); T7 will carry their prior
1676
+ // findings forward and T8 will surface the full plan in summary.scrapePlan.
1677
+ if (scrapePlan) {
1678
+ for (const url of scrapePlan.skip.keys()) {
1679
+ skippedUrls.push(url);
1680
+ }
1681
+ }
1240
1682
  throwIfAborted();
1241
1683
  const loadedPages = [...loadedPagesRaw];
1242
1684
  // v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
@@ -1265,34 +1707,11 @@ export async function auditSource(source, options) {
1265
1707
  if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
1266
1708
  console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
1267
1709
  }
1268
- // State read + delta filtering
1269
- let priorState = null;
1270
- const skippedUrls = [];
1271
- if (options?.state?.since || options?.state?.exitOnRegression) {
1272
- const statePath = options.state.path ?? ".pseolint/state.json";
1273
- priorState = await readState(statePath);
1274
- const currentRenderMode = options.render ? "rendered" : "static";
1275
- if (priorState && priorState.renderMode !== currentRenderMode) {
1276
- console.error(`warning: prior state renderMode=${priorState.renderMode} differs from current ${currentRenderMode}. Performing full re-audit.`);
1277
- priorState = null;
1278
- }
1279
- if (priorState && options.state.since) {
1280
- const kept = [];
1281
- for (const p of loadedPages) {
1282
- const prior = priorState.urls[p.url];
1283
- if (prior && prior.contentHash === computeContentHash(p.html)) {
1284
- skippedUrls.push(p.url);
1285
- }
1286
- else {
1287
- kept.push(p);
1288
- }
1289
- }
1290
- loadedPages.splice(0, loadedPages.length, ...kept);
1291
- }
1292
- else if (!priorState && options.state.since) {
1293
- console.error("no prior state found — performing full baseline audit");
1294
- }
1295
- }
1710
+ // v0.5: prior state was loaded BEFORE loadPagesFromSource so the change-
1711
+ // driven monitoring decision matrix could run pre-fetch. URLs the matrix
1712
+ // marked as "skip" were never fetched and are recorded in skippedUrls
1713
+ // above. The old post-fetch contentHash skip is gone — the decision now
1714
+ // happens upstream of the network round-trip.
1296
1715
  let robotsTxtContent = "";
1297
1716
  if (/^https?:\/\//i.test(source)) {
1298
1717
  try {
@@ -1330,14 +1749,22 @@ export async function auditSource(source, options) {
1330
1749
  ? deduped.filter((page) => !shouldIgnore(page.url, ignorePatterns))
1331
1750
  : deduped;
1332
1751
  const strategy = options?.samplingStrategy ?? "stratified";
1333
- const sampled = sampleSize > 0 && sampleSize < filtered.length
1752
+ // 2026-05-03 calibration credibility fix: when sampleSeed is set, use a
1753
+ // deterministic PRNG so repeated audits pick the same pages and the
1754
+ // verdict is reproducible. Without a seed, fall back to Math.random
1755
+ // (legacy behavior, kept for backward compatibility).
1756
+ const samplingRandom = options?.sampleSeed !== undefined
1757
+ ? mulberry32(options.sampleSeed)
1758
+ : Math.random;
1759
+ const isSampledAudit = sampleSize > 0 && sampleSize < filtered.length;
1760
+ const sampled = isSampledAudit
1334
1761
  ? (strategy === "stratified"
1335
1762
  ? (() => {
1336
1763
  const urlsMap = new Map(filtered.map(p => [p.url, p]));
1337
- const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize);
1764
+ const sampledUrls = stratifiedSample(filtered.map(p => p.url), sampleSize, samplingRandom);
1338
1765
  return sampledUrls.map(u => urlsMap.get(u));
1339
1766
  })()
1340
- : fisherYatesSample(filtered, sampleSize))
1767
+ : fisherYatesSample(filtered, sampleSize, samplingRandom))
1341
1768
  : filtered;
1342
1769
  const parsedPagesAll = sampled.map((page) => {
1343
1770
  const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
@@ -1486,7 +1913,7 @@ export async function auditSource(source, options) {
1486
1913
  continue;
1487
1914
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
1488
1915
  const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
1489
- const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
1916
+ const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full", isSampledAudit);
1490
1917
  allFindings.push(...findings);
1491
1918
  groupPageCounts[groupName] = groupPages.length;
1492
1919
  // v0.4.3: per-group scoring uses the same site-classification profile so
@@ -1508,10 +1935,55 @@ export async function auditSource(source, options) {
1508
1935
  // the enrichment output so every downstream consumer (summary.issues, AI
1509
1936
  // triage input, telemetry, formatters) sees the corrected severity.
1510
1937
  enriched.findings = applyScoringProfileOverrides(enriched.findings, siteClassification);
1938
+ // v0.5: change-driven monitoring carry-forward. URLs that the pre-fetch
1939
+ // strategy marked as "skip" were never fetched this run, so no rule produced
1940
+ // findings for them. Restore their findings from prior state, marked with
1941
+ // `carriedForward: true` and `lastVerifiedAt` so consumers can reason about
1942
+ // staleness. Inject after enrichment + overrides — these findings already
1943
+ // went through both in their original run; re-running enrichment would
1944
+ // strip their template / cluster assignments because parsedPages doesn't
1945
+ // contain the skipped pages.
1946
+ if (priorState && skippedUrls.length > 0) {
1947
+ for (const url of skippedUrls) {
1948
+ const prior = priorState.urls[url];
1949
+ if (!prior || prior.findings.length === 0)
1950
+ continue;
1951
+ for (const f of prior.findings) {
1952
+ const carried = {
1953
+ ruleId: f.ruleId,
1954
+ severity: f.severity,
1955
+ message: f.message,
1956
+ confidence: f.confidence,
1957
+ carriedForward: true,
1958
+ lastVerifiedAt: prior.fetchedAt,
1959
+ // State stores `url` but the engine type uses `pageUrl` — map back.
1960
+ pageUrl: typeof f.url === "string" ? f.url : url,
1961
+ };
1962
+ // Optional fields are preserved opportunistically when present in state.
1963
+ if (typeof f.fix === "string")
1964
+ carried.fix = f.fix;
1965
+ if (typeof f.ref === "string")
1966
+ carried.ref = f.ref;
1967
+ if (typeof f.docsUrl === "string")
1968
+ carried.docsUrl = f.docsUrl;
1969
+ if (Array.isArray(f.relatedUrls))
1970
+ carried.relatedUrls = f.relatedUrls;
1971
+ if (typeof f.group === "string")
1972
+ carried.group = f.group;
1973
+ if (typeof f.similarity === "number")
1974
+ carried.similarity = f.similarity;
1975
+ if (f.context !== undefined)
1976
+ carried.context = f.context;
1977
+ if (f.effort !== undefined)
1978
+ carried.effort = f.effort;
1979
+ enriched.findings.push(carried);
1980
+ }
1981
+ }
1982
+ }
1511
1983
  const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings, siteClassification);
1512
1984
  const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
1513
1985
  const issues = bucketIssues(enriched.findings);
1514
- const verdict = verdictForRisk(risk);
1986
+ const verdict = shiftVerdictForAuthority(verdictForRisk(risk), options?.authorityScore);
1515
1987
  const headline = buildHeadline(bucketCounts);
1516
1988
  // audit/* findings are diagnostic-only and never appear in summary.issues.
1517
1989
  // Surface them under diagnostics so consumers (telemetry, debug UIs) can
@@ -1523,6 +1995,7 @@ export async function auditSource(source, options) {
1523
1995
  fetched: parsedPages.length,
1524
1996
  skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
1525
1997
  };
1998
+ const appliedSeverityDemotions = computeAppliedDemotions(enriched.findings, siteClassification);
1526
1999
  const summary = {
1527
2000
  schemaVersion: SCHEMA_VERSION,
1528
2001
  verdict,
@@ -1531,6 +2004,7 @@ export async function auditSource(source, options) {
1531
2004
  categories,
1532
2005
  issues,
1533
2006
  siteClassification,
2007
+ appliedSeverityDemotions: appliedSeverityDemotions.length > 0 ? appliedSeverityDemotions : undefined,
1534
2008
  diagnostics: {
1535
2009
  originReadiness: readinessReport,
1536
2010
  crawlStats,
@@ -1577,6 +2051,31 @@ export async function auditSource(source, options) {
1577
2051
  if (allSkipped.length > 0) {
1578
2052
  summary.skippedUrls = allSkipped;
1579
2053
  }
2054
+ // v0.5+: surface the change-driven monitoring summary when this run was a
2055
+ // monitoring run (had prior state and didn't force --mode=fresh). Filesystem
2056
+ // sources don't get a scrapePlan because they bypass the matrix.
2057
+ if (effectiveMode === "monitoring" && priorState && scrapePlan) {
2058
+ const reasonCounts = {};
2059
+ for (const reason of scrapePlan.refetch.values()) {
2060
+ reasonCounts[reason] = (reasonCounts[reason] ?? 0) + 1;
2061
+ }
2062
+ for (const reason of scrapePlan.skip.values()) {
2063
+ reasonCounts[reason] = (reasonCounts[reason] ?? 0) + 1;
2064
+ }
2065
+ // `fetched` is the number of URLs whose bodies actually came back —
2066
+ // robots-disallowed, byte-budget-exceeded, content-type-filtered, and 4xx
2067
+ // URLs the matrix INTENDED to refetch may have dropped out before we got
2068
+ // here. `intended` (= scrapePlan.refetch.size) is exposed too so callers
2069
+ // can spot the gap (e.g. "intended 200, fetched 187, 13 URLs dropped").
2070
+ summary.scrapePlan = {
2071
+ fetched: loadedPages.length,
2072
+ intended: scrapePlan.refetch.size,
2073
+ carriedForward: scrapePlan.skip.size,
2074
+ reasonCounts,
2075
+ rulesetVersion: CORE_RULESET_VERSION,
2076
+ lastFullAuditAt: priorState.lastFullAuditAt ?? priorState.lastRun ?? null,
2077
+ };
2078
+ }
1580
2079
  // v0.4.1: surface noindex / auth skips as a discoverable diagnostic so the
1581
2080
  // user sees what the engine excluded. Catches the accidental-noindex bug:
1582
2081
  // pages silently dropped from indexing show up as a visible skip line
@@ -1619,6 +2118,13 @@ export async function auditSource(source, options) {
1619
2118
  for (const f of enrichedFindings) {
1620
2119
  if (!f.pageUrl)
1621
2120
  continue;
2121
+ // Carried-forward findings are not "current" — we did not re-verify them
2122
+ // this run. Including them would mask a genuine regression on a skipped
2123
+ // URL: prior set has rule X carried-forward, current set also has X
2124
+ // (carried-forward), comparison says "no new rule", we miss the case
2125
+ // where the page actually started failing rule Y too.
2126
+ if (f.carriedForward)
2127
+ continue;
1622
2128
  const set = currentFindings.get(f.pageUrl) ?? new Set();
1623
2129
  set.add(f.ruleId);
1624
2130
  currentFindings.set(f.pageUrl, set);
@@ -1644,6 +2150,12 @@ export async function auditSource(source, options) {
1644
2150
  const renderMode = options.render ? "rendered" : "static";
1645
2151
  const urls = {};
1646
2152
  const findingsByUrl = new Map();
2153
+ // v0.5+: persist full finding records per URL so future monitoring runs
2154
+ // can carry them forward when the URL is skipped pre-fetch. Carried-
2155
+ // forward findings (carriedForward=true) are NOT re-persisted under the
2156
+ // fetched URL — they belong to the prior entry that's preserved verbatim
2157
+ // for skipped URLs below.
2158
+ const fullFindingsByUrl = new Map();
1647
2159
  for (const f of enrichedFindings) {
1648
2160
  if (!f.pageUrl)
1649
2161
  continue;
@@ -1651,9 +2163,16 @@ export async function auditSource(source, options) {
1651
2163
  if (!list.includes(f.ruleId))
1652
2164
  list.push(f.ruleId);
1653
2165
  findingsByUrl.set(f.pageUrl, list);
2166
+ if (!f.carriedForward) {
2167
+ const records = fullFindingsByUrl.get(f.pageUrl) ?? [];
2168
+ records.push(f);
2169
+ fullFindingsByUrl.set(f.pageUrl, records);
2170
+ }
1654
2171
  }
1655
- // Preserve prior entries for URLs skipped by --since (they didn't change).
1656
- // Without this, delta runs would lose state for unchanged URLs.
2172
+ // Preserve prior entries for URLs the monitoring matrix skipped (we never
2173
+ // fetched them this run; their fetchedAt MUST NOT advance or the age floor
2174
+ // never trips). Skipped URLs include those in scrapePlan.skip plus any
2175
+ // robots-skipped URLs from prior runs that are still in priorState.
1657
2176
  if (priorState && skippedUrls.length > 0) {
1658
2177
  for (const url of skippedUrls) {
1659
2178
  const prior = priorState.urls[url];
@@ -1661,19 +2180,65 @@ export async function auditSource(source, options) {
1661
2180
  urls[url] = prior;
1662
2181
  }
1663
2182
  }
2183
+ const nowIso = new Date().toISOString();
1664
2184
  for (const p of loadedPages) {
1665
- urls[p.url] = {
2185
+ const priorEntry = priorState?.urls[p.url];
2186
+ const responseHeaders = p.httpMeta?.headers;
2187
+ const lastModifiedHeader = responseHeaders?.["last-modified"];
2188
+ const etagHeader = responseHeaders?.["etag"];
2189
+ const sitemapLastmodForUrl = sitemapLastmodByUrl?.get(p.url);
2190
+ const entry = {
1666
2191
  contentHash: computeContentHash(p.html),
1667
- fetchedAt: new Date().toISOString(),
2192
+ fetchedAt: nowIso,
1668
2193
  status: p.httpMeta?.statusCode ?? 200,
1669
2194
  findingIds: findingsByUrl.get(p.url) ?? [],
2195
+ findings: (fullFindingsByUrl.get(p.url) ?? []).map((f) => ({
2196
+ id: `${f.ruleId}::${p.url}`,
2197
+ ruleId: f.ruleId,
2198
+ severity: f.severity,
2199
+ confidence: f.confidence ?? "high",
2200
+ message: f.message,
2201
+ ...(f.fix !== undefined ? { fix: f.fix } : {}),
2202
+ ...(f.ref !== undefined ? { ref: f.ref } : {}),
2203
+ ...(f.docsUrl !== undefined ? { docsUrl: f.docsUrl } : {}),
2204
+ ...(f.pageUrl !== undefined ? { url: f.pageUrl } : {}),
2205
+ ...(f.relatedUrls !== undefined ? { relatedUrls: f.relatedUrls } : {}),
2206
+ ...(f.group !== undefined ? { group: f.group } : {}),
2207
+ ...(f.similarity !== undefined ? { similarity: f.similarity } : {}),
2208
+ ...(f.context !== undefined ? { context: f.context } : {}),
2209
+ ...(f.effort !== undefined ? { effort: f.effort } : {}),
2210
+ })),
2211
+ rulesetVersion: CORE_RULESET_VERSION,
1670
2212
  };
2213
+ if (lastModifiedHeader)
2214
+ entry.lastModified = lastModifiedHeader;
2215
+ else if (priorEntry?.lastModified)
2216
+ entry.lastModified = priorEntry.lastModified;
2217
+ if (etagHeader)
2218
+ entry.etag = etagHeader;
2219
+ else if (priorEntry?.etag)
2220
+ entry.etag = priorEntry.etag;
2221
+ if (sitemapLastmodForUrl)
2222
+ entry.sitemapLastmodAtAudit = sitemapLastmodForUrl;
2223
+ else if (priorEntry?.sitemapLastmodAtAudit)
2224
+ entry.sitemapLastmodAtAudit = priorEntry.sitemapLastmodAtAudit;
2225
+ urls[p.url] = entry;
1671
2226
  }
2227
+ // `lastFullAuditAt` advances only when this run actually re-fetched every
2228
+ // candidate URL. In monitoring mode (matrix skipped some URLs), preserve
2229
+ // the prior baseline timestamp so callers can reason about staleness.
2230
+ // In fresh mode (every candidate URL was fetched), bump to now.
2231
+ const isMonitoringRun = effectiveMode === "monitoring" && priorState !== null;
2232
+ const lastFullAuditAt = isMonitoringRun
2233
+ ? (priorState?.lastFullAuditAt ?? priorState?.lastRun ?? nowIso)
2234
+ : nowIso;
1672
2235
  const newState = {
1673
2236
  version: STATE_SCHEMA_VERSION,
1674
- lastRun: new Date().toISOString(),
2237
+ lastRun: nowIso,
2238
+ lastFullAuditAt,
1675
2239
  source,
1676
2240
  renderMode,
2241
+ rulesetVersion: CORE_RULESET_VERSION,
1677
2242
  urls,
1678
2243
  summary: {
1679
2244
  score: summary.risk,