@pseolint/core 0.4.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/auditor.d.ts +12 -1
  2. package/dist/auditor.d.ts.map +1 -1
  3. package/dist/auditor.js +317 -43
  4. package/dist/auditor.js.map +1 -1
  5. package/dist/formatters/bucket-findings.d.ts +43 -0
  6. package/dist/formatters/bucket-findings.d.ts.map +1 -0
  7. package/dist/formatters/bucket-findings.js +110 -0
  8. package/dist/formatters/bucket-findings.js.map +1 -0
  9. package/dist/formatters/console.d.ts.map +1 -1
  10. package/dist/formatters/console.js +103 -34
  11. package/dist/formatters/console.js.map +1 -1
  12. package/dist/formatters/fixplan.d.ts +13 -0
  13. package/dist/formatters/fixplan.d.ts.map +1 -0
  14. package/dist/formatters/fixplan.js +328 -0
  15. package/dist/formatters/fixplan.js.map +1 -0
  16. package/dist/formatters/html.d.ts.map +1 -1
  17. package/dist/formatters/html.js +27 -0
  18. package/dist/formatters/html.js.map +1 -1
  19. package/dist/formatters/index.d.ts +2 -0
  20. package/dist/formatters/index.d.ts.map +1 -1
  21. package/dist/formatters/index.js +1 -0
  22. package/dist/formatters/index.js.map +1 -1
  23. package/dist/formatters/markdown.d.ts.map +1 -1
  24. package/dist/formatters/markdown.js +77 -7
  25. package/dist/formatters/markdown.js.map +1 -1
  26. package/dist/page-filter.d.ts +108 -0
  27. package/dist/page-filter.d.ts.map +1 -0
  28. package/dist/page-filter.js +207 -0
  29. package/dist/page-filter.js.map +1 -0
  30. package/dist/rules/aeo/answer-first.d.ts.map +1 -1
  31. package/dist/rules/aeo/answer-first.js +17 -3
  32. package/dist/rules/aeo/answer-first.js.map +1 -1
  33. package/dist/rules/aeo/citable-facts.d.ts.map +1 -1
  34. package/dist/rules/aeo/citable-facts.js +12 -1
  35. package/dist/rules/aeo/citable-facts.js.map +1 -1
  36. package/dist/rules/aeo/content-modularity.d.ts.map +1 -1
  37. package/dist/rules/aeo/content-modularity.js +3 -0
  38. package/dist/rules/aeo/content-modularity.js.map +1 -1
  39. package/dist/rules/aeo/crawler-access.d.ts.map +1 -1
  40. package/dist/rules/aeo/crawler-access.js +6 -0
  41. package/dist/rules/aeo/crawler-access.js.map +1 -1
  42. package/dist/rules/aeo/faq-coverage.d.ts.map +1 -1
  43. package/dist/rules/aeo/faq-coverage.js +4 -0
  44. package/dist/rules/aeo/faq-coverage.js.map +1 -1
  45. package/dist/rules/aeo/freshness-signals.d.ts.map +1 -1
  46. package/dist/rules/aeo/freshness-signals.js +9 -2
  47. package/dist/rules/aeo/freshness-signals.js.map +1 -1
  48. package/dist/rules/aeo/llms-txt.d.ts.map +1 -1
  49. package/dist/rules/aeo/llms-txt.js +6 -1
  50. package/dist/rules/aeo/llms-txt.js.map +1 -1
  51. package/dist/rules/aeo/summary-bait.d.ts.map +1 -1
  52. package/dist/rules/aeo/summary-bait.js +5 -2
  53. package/dist/rules/aeo/summary-bait.js.map +1 -1
  54. package/dist/rules/content/missing-author.d.ts.map +1 -1
  55. package/dist/rules/content/missing-author.js +10 -2
  56. package/dist/rules/content/missing-author.js.map +1 -1
  57. package/dist/rules/spam/thin-content.d.ts.map +1 -1
  58. package/dist/rules/spam/thin-content.js +9 -1
  59. package/dist/rules/spam/thin-content.js.map +1 -1
  60. package/dist/site-classifier.d.ts +1 -1
  61. package/dist/site-classifier.d.ts.map +1 -1
  62. package/dist/site-classifier.js +216 -0
  63. package/dist/site-classifier.js.map +1 -1
  64. package/dist/types.d.ts +77 -2
  65. package/dist/types.d.ts.map +1 -1
  66. package/dist/types.js.map +1 -1
  67. package/package.json +1 -1
package/dist/auditor.d.ts CHANGED
@@ -1,3 +1,14 @@
1
- import type { AuditOptions, AuditSummary } from "./types.js";
1
+ import type { AuditOptions, AuditSummary, RuleResult } from "./types.js";
2
+ import { type SiteClassification } from "./site-classifier.js";
3
+ /**
4
+ * v0.4.3 — apply per-site-type severity + confidence overrides BEFORE any
5
+ * bucketing happens, so blocker/shouldFix counts and category buckets all
6
+ * reflect the user-visible severity, not the rule's native severity.
7
+ *
8
+ * Never mutates the input. When the active profile has no overrides the input
9
+ * array itself is returned; otherwise a new array is built in which findings with
10
+ * an override are shallow copies with `severity` / `confidence` remapped.
11
+ */
12
+ export declare function applyScoringProfileOverrides(findings: RuleResult[], classification: SiteClassification | undefined): RuleResult[];
2
13
  export declare function auditSource(source: string, options?: AuditOptions): Promise<AuditSummary>;
3
14
  //# sourceMappingURL=auditor.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAwDA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAab,MAAM,YAAY,CAAC;AAoiCpB,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAmrB/F"}
1
+ {"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAyDA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAWZ,UAAU,EAGX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAAgB,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AAwhB5F;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AAquBD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CA0wB/F"}
package/dist/auditor.js CHANGED
@@ -2,6 +2,7 @@ import { createHash } from "node:crypto";
2
2
  import { readdir, readFile, stat } from "node:fs/promises";
3
3
  import { extname, join, resolve } from "node:path";
4
4
  import { parseHtmlPage } from "./parser.js";
5
+ import { pageSkipReason } from "./page-filter.js";
5
6
  import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
6
7
  import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
7
8
  import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
@@ -76,17 +77,6 @@ const DEFAULTS = {
76
77
  modularityMinSelfContainedRatio: 0.7,
77
78
  faqMinQuestionHeadings: 2
78
79
  };
79
- /**
80
- * v0.4 four-category weights. Audit is diagnostic-only (weight 0).
81
- * See 2026-04-29 v0.4 redesign spec §4.2.
82
- */
83
- const CATEGORY_WEIGHTS = {
84
- integrity: 0.50, // spam + content + cannibal
85
- discoverability: 0.20, // links + tech
86
- citation: 0.25, // aeo + schema
87
- data: 0.05, // data
88
- audit: 0, // diagnostics, never weighted
89
- };
90
80
  /**
91
81
  * Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
92
82
  * Used by `scoreFromFindings` to bucket findings without changing rule IDs.
@@ -102,6 +92,138 @@ const CATEGORY_MAP = {
102
92
  data: "data",
103
93
  audit: "audit",
104
94
  };
95
+ const SCORING_PROFILES = {
96
+ "small-marketing": {
97
+ categoryWeights: { integrity: 0.30, discoverability: 0.40, citation: 0.20, data: 0.05, audit: 0 },
98
+ severityOverrides: {
99
+ "aeo/citable-facts": "info",
100
+ "aeo/answer-first": "info",
101
+ "aeo/summary-bait": "warning",
102
+ "spam/thin-content": "warning",
103
+ },
104
+ confidenceOverrides: {
105
+ "aeo/citable-facts": "low",
106
+ "aeo/answer-first": "low",
107
+ "aeo/summary-bait": "medium",
108
+ "spam/thin-content": "medium",
109
+ },
110
+ },
111
+ "blog": {
112
+ categoryWeights: { integrity: 0.40, discoverability: 0.25, citation: 0.30, data: 0.05, audit: 0 },
113
+ severityOverrides: {
114
+ "content/missing-author": "error",
115
+ "spam/thin-content": "error",
116
+ },
117
+ confidenceOverrides: {},
118
+ },
119
+ "programmatic-directory": {
120
+ categoryWeights: { integrity: 0.55, discoverability: 0.15, citation: 0.20, data: 0.10, audit: 0 },
121
+ severityOverrides: {},
122
+ confidenceOverrides: {},
123
+ },
124
+ "ecommerce": {
125
+ categoryWeights: { integrity: 0.20, discoverability: 0.40, citation: 0.15, data: 0.25, audit: 0 },
126
+ severityOverrides: {
127
+ "aeo/citable-facts": "info",
128
+ "schema/required-fields": "error",
129
+ },
130
+ confidenceOverrides: {
131
+ "aeo/citable-facts": "low",
132
+ },
133
+ },
134
+ "docs": {
135
+ categoryWeights: { integrity: 0.30, discoverability: 0.30, citation: 0.30, data: 0.10, audit: 0 },
136
+ severityOverrides: {
137
+ "aeo/citable-facts": "info",
138
+ "aeo/answer-first": "warning",
139
+ "content/missing-author": "info",
140
+ },
141
+ confidenceOverrides: {
142
+ "aeo/citable-facts": "low",
143
+ "aeo/answer-first": "low",
144
+ "content/missing-author": "low",
145
+ },
146
+ },
147
+ "unclear": {
148
+ categoryWeights: { integrity: 0.50, discoverability: 0.20, citation: 0.25, data: 0.05, audit: 0 },
149
+ severityOverrides: {},
150
+ confidenceOverrides: {},
151
+ },
152
+ };
153
+ /**
154
+ * Pick the scoring profile for `classification`. Falls back to the `unclear` default
155
+ * when it is missing, its confidence is below 0.7, or its type has no profile.
156
+ */
157
+ function profileFor(classification) {
158
+ if (!classification || classification.confidence < 0.7)
159
+ return SCORING_PROFILES.unclear;
160
+ return SCORING_PROFILES[classification.type] ?? SCORING_PROFILES.unclear;
161
+ }
162
+ const RULE_IMPACTS = {
163
+ // SpamBrain — high baseline, count amplifies (cluster matters)
164
+ "spam/near-duplicate": { baseImpact: 25, perInstance: 5, maxImpact: 80 },
165
+ "spam/entity-swap": { baseImpact: 25, perInstance: 5, maxImpact: 80 },
166
+ "spam/doorway-pattern": { baseImpact: 30, perInstance: 0, maxImpact: 30 },
167
+ "spam/template-coverage": { baseImpact: 15, perInstance: 3, maxImpact: 60 },
168
+ "spam/template-diversity": { baseImpact: 12, perInstance: 3, maxImpact: 50 },
169
+ "spam/boilerplate-ratio": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
170
+ "spam/thin-content": { baseImpact: 8, perInstance: 2, maxImpact: 40 },
171
+ "spam/publication-velocity": { baseImpact: 8, perInstance: 2, maxImpact: 30 },
172
+ "cannibal/url-pattern": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
173
+ // Content
174
+ "content/unique-value": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
175
+ "content/meta-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 40 },
176
+ "content/missing-author": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
177
+ "content/eeat-signals": { baseImpact: 4, perInstance: 1, maxImpact: 20 },
178
+ // Tech — softened in v0.4.3-rc2 after dogfood showed nextjs.org regressing
179
+ // from ready→caution on tech/canonical-consistency × 4 (legit cross-domain
180
+ // canonicals on a CDN). Per-instance now 1 (was 3).
181
+ "tech/canonical-consistency": { baseImpact: 8, perInstance: 1, maxImpact: 25 },
182
+ "tech/canonical-noindex-conflict": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
183
+ "tech/robots-noindex-conflict": { baseImpact: 10, perInstance: 2, maxImpact: 40 },
184
+ "tech/redirect-chain": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
185
+ "tech/sitemap-completeness": { baseImpact: 8, perInstance: 1, maxImpact: 30 },
186
+ "tech/robots-sitemap-presence": { baseImpact: 8, perInstance: 0, maxImpact: 8 },
187
+ "tech/soft-404": { baseImpact: 6, perInstance: 1, maxImpact: 30 },
188
+ // hreflang — one bad declaration breaks all language pairs, so the COUNT
189
+ // doesn't compound. perInstance: 0 keeps it at the base impact regardless
190
+ // of how many language pairs are affected. Dogfood showed 350 findings on
191
+ // stripe.com from a single missing reciprocal pair — that should not be
192
+ // treated as 350× the impact.
193
+ "tech/hreflang-consistency": { baseImpact: 5, perInstance: 0, maxImpact: 5 },
194
+ // Links
195
+ "links/orphan-pages": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
196
+ "links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
197
+ "links/cluster-connectivity": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
198
+ "links/link-depth": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
199
+ // AEO — much lower baselines than spam (AEO is opt-in optimization)
200
+ "aeo/citable-facts": { baseImpact: 2, perInstance: 1, maxImpact: 25 },
201
+ "aeo/answer-first": { baseImpact: 3, perInstance: 1, maxImpact: 25 },
202
+ "aeo/summary-bait": { baseImpact: 4, perInstance: 1, maxImpact: 25 },
203
+ "aeo/crawler-access": { baseImpact: 8, perInstance: 0, maxImpact: 8 },
204
+ "aeo/freshness-signals": { baseImpact: 2, perInstance: 1, maxImpact: 20 },
205
+ "aeo/llms-txt": { baseImpact: 4, perInstance: 0, maxImpact: 4 },
206
+ "aeo/faq-coverage": { baseImpact: 2, perInstance: 1, maxImpact: 15 },
207
+ "aeo/content-modularity": { baseImpact: 2, perInstance: 1, maxImpact: 15 },
208
+ // Schema
209
+ "schema/json-ld-valid": { baseImpact: 8, perInstance: 2, maxImpact: 35 },
210
+ "schema/required-fields": { baseImpact: 6, perInstance: 1, maxImpact: 30 },
211
+ "schema/consistency": { baseImpact: 3, perInstance: 1, maxImpact: 15 },
212
+ // Data
213
+ "data/data-binding": { baseImpact: 6, perInstance: 1, maxImpact: 30 },
214
+ };
215
+ const DEFAULT_RULE_IMPACT = { baseImpact: 5, perInstance: 1, maxImpact: 25 };
216
+ /**
217
+ * v0.4.3 — confidence-based discount applied to each finding's impact.
218
+ * Low-confidence findings contribute less to the bucket so they don't
219
+ * inflate the verdict on site types where they false-positive.
220
+ */
221
+ const CONFIDENCE_MULTIPLIER = {
222
+ high: 1.0,
223
+ medium: 0.6,
224
+ low: 0.3,
225
+ speculative: 0.1,
226
+ };
105
227
  /** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
106
228
  const RULE_DOCS_SLUG = {
107
229
  // intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
@@ -172,7 +294,16 @@ function resolveGroupRules(baseRules, overrides) {
172
294
  }
173
295
  return result;
174
296
  }
175
- function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
297
+ function runRulesOnPages(pages,
298
+ /**
299
+ * Full set of parsed pages including those filtered out by `respectNoindex`
300
+ * / `skipDetectedAuth`. Defaults to `pages` for backwards compat. The two
301
+ * noindex-conflict rules (`tech/canonical-noindex-conflict`,
302
+ * `tech/robots-noindex-conflict`) read this list specifically — without it,
303
+ * `respectNoindex: true` would hide noindex'd pages from the very rules
304
+ * designed to flag accidental noindex'ing.
305
+ */
306
+ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
176
307
  const findings = [];
177
308
  const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
178
309
  const tag = (results) => results.map((r) => {
@@ -245,10 +376,10 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
245
376
  findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
246
377
  }
247
378
  if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
248
- findings.push(...tag(canonicalNoindexConflictRule(pages, normalizeUrlOptions)));
379
+ findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
249
380
  }
250
381
  if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
251
- findings.push(...tag(robotsNoindexConflictRule(pages, inbound)));
382
+ findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
252
383
  }
253
384
  if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
254
385
  findings.push(...tag(redirectChainRule(pages)));
@@ -257,7 +388,9 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
257
388
  findings.push(...tag(soft404Rule(pages)));
258
389
  }
259
390
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
260
- findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
391
+ // hreflang declarations on noindex'd pages are still bugs when they're
392
+ // inconsistent — see auditor.test.ts "emits technical SEO findings".
393
+ findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
261
394
  }
262
395
  // Schema rules
263
396
  if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
@@ -311,13 +444,47 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
311
444
  function hashHtml(html) {
312
445
  return createHash("sha256").update(html, "utf8").digest("hex");
313
446
  }
314
- const SEVERITY_WEIGHTS = {
315
- critical: 40,
316
- error: 25,
317
- warning: 12,
318
- info: 5,
319
- };
320
- function scoreFromFindings(findings) {
447
+ /**
448
+ * v0.4.3 — apply per-site-type severity + confidence overrides BEFORE any
449
+ * bucketing happens, so blocker/shouldFix counts and category buckets all
450
+ * reflect the user-visible severity, not the rule's native severity.
451
+ *
452
+ * Never mutates the input. When the active profile has no overrides the input
453
+ * array itself is returned; otherwise a new array is built in which findings with
454
+ * an override are shallow copies with `severity` / `confidence` remapped.
455
+ */
456
+ export function applyScoringProfileOverrides(findings, classification) {
457
+ const profile = profileFor(classification);
458
+ const sevHas = Object.keys(profile.severityOverrides).length > 0;
459
+ const confHas = Object.keys(profile.confidenceOverrides).length > 0;
460
+ if (!sevHas && !confHas)
461
+ return findings;
462
+ return findings.map((f) => {
463
+ const newSev = profile.severityOverrides[f.ruleId];
464
+ const newConf = profile.confidenceOverrides[f.ruleId];
465
+ if (newSev === undefined && newConf === undefined)
466
+ return f;
467
+ return {
468
+ ...f,
469
+ ...(newSev !== undefined ? { severity: newSev } : {}),
470
+ ...(newConf !== undefined ? { confidence: newConf } : {}),
471
+ };
472
+ });
473
+ }
474
+ /**
475
+ * v0.4.3 — confidence-and-count-aware scoring. Replaces the v0.4 model that
476
+ * counted only severity. Each rule has a `baseImpact + (count - 1) *
477
+ * perInstance` contribution capped by `maxImpact`. The result is multiplied
478
+ * by the finding's `confidence` (default `high` → 1.0). Per-site-type
479
+ * profiles can remap a rule's severity / confidence; this function expects
480
+ * those overrides to ALREADY be applied to the input findings.
481
+ *
482
+ * Bucket math: per-rule impacts sum into the rule's `CATEGORY_MAP` bucket;
483
+ * each bucket is then capped at 100 and weighted by the active scoring
484
+ * profile's `categoryWeights`.
485
+ */
486
+ function scoreFromFindings(findings, classification) {
487
+ const profile = profileFor(classification);
321
488
  // v0.4 four-bucket raw penalties.
322
489
  const bucketRaw = {
323
490
  integrity: 0,
@@ -336,18 +503,16 @@ function scoreFromFindings(findings) {
336
503
  let blockers = 0;
337
504
  let shouldFix = 0;
338
505
  let informational = 0;
506
+ // Group findings by ruleId so we can apply baseImpact + perInstance.
507
+ // Each group's weighted impact lands in its category bucket.
508
+ const groups = new Map();
339
509
  for (const finding of findings) {
340
510
  const namespace = finding.ruleId.split("/")[0];
341
511
  const bucket = CATEGORY_MAP[namespace];
342
512
  if (!bucket)
343
513
  continue;
344
- const weight = SEVERITY_WEIGHTS[finding.severity];
345
- // v0.4 buckets.
346
- bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weight);
347
- if (bucket !== "audit") {
514
+ if (bucket !== "audit")
348
515
  bucketIssues[bucket] += 1;
349
- }
350
- // Issue-bucket counts (audit/* findings are diagnostic-only and excluded).
351
516
  if (bucket === "audit")
352
517
  continue;
353
518
  if (finding.severity === "critical" || finding.severity === "error")
@@ -356,11 +521,40 @@ function scoreFromFindings(findings) {
356
521
  shouldFix += 1;
357
522
  else
358
523
  informational += 1;
524
+ const arr = groups.get(finding.ruleId) ?? [];
525
+ arr.push(finding);
526
+ groups.set(finding.ruleId, arr);
359
527
  }
360
- const weighted = bucketRaw.integrity * CATEGORY_WEIGHTS.integrity +
361
- bucketRaw.discoverability * CATEGORY_WEIGHTS.discoverability +
362
- bucketRaw.citation * CATEGORY_WEIGHTS.citation +
363
- bucketRaw.data * CATEGORY_WEIGHTS.data;
528
+ for (const [ruleId, group] of groups) {
529
+ const namespace = ruleId.split("/")[0];
530
+ const bucket = CATEGORY_MAP[namespace];
531
+ if (!bucket || bucket === "audit")
532
+ continue;
533
+ const impactSpec = RULE_IMPACTS[ruleId] ?? DEFAULT_RULE_IMPACT;
534
+ const count = group.length;
535
+ const rawImpact = impactSpec.baseImpact + Math.max(0, count - 1) * impactSpec.perInstance;
536
+ const cap = impactSpec.maxImpact ?? Number.POSITIVE_INFINITY;
537
+ const cappedImpact = Math.min(cap, rawImpact);
538
+ // Confidence multiplier — use the HIGHEST confidence (largest multiplier)
539
+ // in the group so a rule that fires repeatedly with mixed confidence is
540
+ // not unfairly downweighted to its lowest-confidence instance.
541
+ let bestMultiplier = 0;
542
+ for (const f of group) {
543
+ const conf = f.confidence ?? "high";
544
+ const m = CONFIDENCE_MULTIPLIER[conf];
545
+ if (m > bestMultiplier)
546
+ bestMultiplier = m;
547
+ }
548
+ if (bestMultiplier === 0)
549
+ bestMultiplier = CONFIDENCE_MULTIPLIER.high;
550
+ const weighted = cappedImpact * bestMultiplier;
551
+ bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weighted);
552
+ }
553
+ const cw = profile.categoryWeights;
554
+ const weighted = bucketRaw.integrity * cw.integrity +
555
+ bucketRaw.discoverability * cw.discoverability +
556
+ bucketRaw.citation * cw.citation +
557
+ bucketRaw.data * cw.data;
364
558
  const risk = Math.round(Math.min(100, weighted));
365
559
  const categories = {
366
560
  integrity: { grade: gradeForPenalty(bucketRaw.integrity), issues: bucketIssues.integrity },
@@ -944,6 +1138,11 @@ export async function auditSource(source, options) {
944
1138
  const concurrency = options?.concurrency ?? preset.concurrency ?? 5;
945
1139
  const timeoutMs = options?.timeout ?? 30000;
946
1140
  const ignorePatterns = options?.ignore ?? [];
1141
+ const respectNoindex = options?.respectNoindex ?? true;
1142
+ const skipDetectedAuth = options?.skipDetectedAuth ?? false;
1143
+ const skipBoilerplate = options?.skipBoilerplate ?? false;
1144
+ const skipSearchPages = options?.skipSearchPages ?? false;
1145
+ const skipEmptyBody = options?.skipEmptyBody ?? false;
947
1146
  const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
948
1147
  const externalSignal = options?.signal;
949
1148
  const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
@@ -1140,13 +1339,33 @@ export async function auditSource(source, options) {
1140
1339
  })()
1141
1340
  : fisherYatesSample(filtered, sampleSize))
1142
1341
  : filtered;
1143
- const parsedPages = sampled.map((page) => {
1342
+ const parsedPagesAll = sampled.map((page) => {
1144
1343
  const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
1145
1344
  if (page.httpMeta) {
1146
1345
  parsed.httpMeta = page.httpMeta;
1147
1346
  }
1148
1347
  return parsed;
1149
1348
  });
1349
+ // v0.4.1 §page-filter: drop noindex'd pages and (when enabled) detected auth,
1350
+ // boilerplate, search-result, and empty-body pages BEFORE rule evaluation. The
1351
+ // site owner's noindex is a hard signal — they already opted out of SEO
1352
+ // indexing, so auditing those URLs produces only noise. The other filters are
1353
+ // opt-in (default off); skipDetectedAuth is on for the hosted web form.
1354
+ const skippedByPolicy = [];
1355
+ const parsedPages = parsedPagesAll.filter((p) => {
1356
+ const reason = pageSkipReason(p, {
1357
+ respectNoindex,
1358
+ skipDetectedAuth,
1359
+ skipBoilerplate,
1360
+ skipSearchPages,
1361
+ skipEmptyBody,
1362
+ });
1363
+ if (reason) {
1364
+ skippedByPolicy.push({ url: p.url, reason });
1365
+ return false;
1366
+ }
1367
+ return true;
1368
+ });
1150
1369
  const knownUrls = new Set(parsedPages.map((p) => p.url));
1151
1370
  const rootUrl = parsedPages.find((p) => /(^|[\\/])index\.html?$/i.test(p.url))?.url ?? parsedPages[0]?.url ?? "";
1152
1371
  const adjacency = new Map();
@@ -1267,10 +1486,13 @@ export async function auditSource(source, options) {
1267
1486
  continue;
1268
1487
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
1269
1488
  const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
1270
- const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
1489
+ const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
1271
1490
  allFindings.push(...findings);
1272
1491
  groupPageCounts[groupName] = groupPages.length;
1273
- const { risk: groupRisk } = scoreFromFindings(findings);
1492
+ // v0.4.3: per-group scoring uses the same site-classification profile so
1493
+ // group-level risk numbers reflect the same severity / confidence remaps
1494
+ // as the headline verdict.
1495
+ const { risk: groupRisk } = scoreFromFindings(applyScoringProfileOverrides(findings, siteClassification), siteClassification);
1274
1496
  groupScores[groupName] = groupRisk;
1275
1497
  }
1276
1498
  throwIfAborted();
@@ -1280,7 +1502,13 @@ export async function auditSource(source, options) {
1280
1502
  });
1281
1503
  // Populate docsUrl on every finding before they leave the engine.
1282
1504
  withDocsUrls(enriched.findings);
1283
- const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings);
1505
+ // v0.4.3: apply site-type-aware severity + confidence overrides so blocker
1506
+ // counts, issue buckets, and category bucketing all reflect the user-visible
1507
+ // severity (not the rule's native severity). The remapped findings replace
1508
+ // the enrichment output so every downstream consumer (summary.issues, AI
1509
+ // triage input, telemetry, formatters) sees the corrected severity.
1510
+ enriched.findings = applyScoringProfileOverrides(enriched.findings, siteClassification);
1511
+ const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings, siteClassification);
1284
1512
  const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
1285
1513
  const issues = bucketIssues(enriched.findings);
1286
1514
  const verdict = verdictForRisk(risk);
@@ -1317,22 +1545,68 @@ export async function auditSource(source, options) {
1317
1545
  if (cacheConfig) {
1318
1546
  summary.cacheStats = cacheStats;
1319
1547
  }
1320
- // v0.4 §4.5: warn when an `ignore` pattern matched zero discovered URLs.
1548
+ // v0.4 §4.5 / v0.4.1: warn when ignore patterns matched zero discovered URLs.
1549
+ // - Per-pattern warning fires only when `warnUnmatchedIgnore` is true
1550
+ // (set by the CLI when `--ignore` was passed explicitly). Quiet by
1551
+ // default for config-loaded patterns where broad safety lists like
1552
+ // `**/dashboard/**` legitimately don't match small marketing sites.
1553
+ // - When ALL patterns matched zero (strongest typo signal, e.g. user
1554
+ // wrote `*.json` instead of `**/*.json`), emit a single consolidated
1555
+ // warning regardless of source.
1321
1556
  if (ignorePatterns.length > 0) {
1322
- for (const pattern of ignorePatterns) {
1323
- const matched = deduped.some((p) => globMatchPathname(pattern, p.url));
1324
- if (!matched) {
1557
+ const unmatched = ignorePatterns.filter((pattern) => !deduped.some((p) => globMatchPathname(pattern, p.url)));
1558
+ if (unmatched.length === ignorePatterns.length) {
1559
+ // eslint-disable-next-line no-console
1560
+ console.warn(`[pseolint] none of the ${ignorePatterns.length} ignore pattern${ignorePatterns.length === 1 ? "" : "s"} matched any URLs — check config or --ignore for typos`);
1561
+ }
1562
+ else if (options?.warnUnmatchedIgnore === true) {
1563
+ for (const pattern of unmatched) {
1325
1564
  // eslint-disable-next-line no-console
1326
1565
  console.warn(`[pseolint] ignore pattern '${pattern}' matched 0 URLs — likely typo`);
1327
1566
  }
1328
1567
  }
1329
1568
  }
1330
- // Merge state-skipped (unchanged since last run) and robots-skipped (target
1331
- // robots.txt Disallow'd) URLs so callers have a single audit-skipped surface.
1332
- const allSkipped = [...skippedUrls, ...skippedByRobots];
1569
+ // Merge state-skipped (unchanged since last run), robots-skipped (target
1570
+ // robots.txt Disallow'd), and policy-skipped (page-filter: noindex, auth, etc.) URLs
1571
+ // so callers have a single audit-skipped surface.
1572
+ const allSkipped = [
1573
+ ...skippedUrls,
1574
+ ...skippedByRobots,
1575
+ ...skippedByPolicy.map((s) => s.url),
1576
+ ];
1333
1577
  if (allSkipped.length > 0) {
1334
1578
  summary.skippedUrls = allSkipped;
1335
1579
  }
1580
+ // v0.4.1: surface all policy skips as a discoverable diagnostic so the
1581
+ // user sees what the engine excluded. Catches the accidental-noindex bug:
1582
+ // pages silently dropped from indexing show up as a visible skip line
1583
+ // instead of being absent without explanation.
1584
+ if (skippedByPolicy.length > 0) {
1585
+ const noindexCount = skippedByPolicy.filter((s) => s.reason === "noindex").length;
1586
+ const authCount = skippedByPolicy.filter((s) => s.reason === "auth-detected").length;
1587
+ const boilerplateCount = skippedByPolicy.filter((s) => s.reason === "boilerplate").length;
1588
+ const searchCount = skippedByPolicy.filter((s) => s.reason === "search-result").length;
1589
+ const spaShellCount = skippedByPolicy.filter((s) => s.reason === "spa-shell").length;
1590
+ const sample = skippedByPolicy.slice(0, 5).map((s) => `${s.url} (${s.reason})`).join(", ");
1591
+ const more = skippedByPolicy.length > 5 ? `, +${skippedByPolicy.length - 5} more` : "";
1592
+ const parts = [];
1593
+ if (noindexCount > 0)
1594
+ parts.push(`${noindexCount} marked noindex`);
1595
+ if (authCount > 0)
1596
+ parts.push(`${authCount} detected as auth (login/register/etc)`);
1597
+ if (boilerplateCount > 0)
1598
+ parts.push(`${boilerplateCount} cookie/legal/consent boilerplate`);
1599
+ if (searchCount > 0)
1600
+ parts.push(`${searchCount} search-result page${searchCount === 1 ? "" : "s"}`);
1601
+ if (spaShellCount > 0)
1602
+ parts.push(`${spaShellCount} un-hydrated SPA shell${spaShellCount === 1 ? "" : "s"}`);
1603
+ auditFindings.push({
1604
+ ruleId: "audit/skipped-by-policy",
1605
+ severity: "info",
1606
+ message: `Skipped ${skippedByPolicy.length} page${skippedByPolicy.length === 1 ? "" : "s"} from rule evaluation — ${parts.join(", ")}. First few: ${sample}${more}.`,
1607
+ relatedUrls: skippedByPolicy.map((s) => s.url),
1608
+ });
1609
+ }
1336
1610
  // Local flat view of every finding the engine produced, used internally for
1337
1611
  // state persistence, regression detection, AI triage input, and telemetry
1338
1612
  // counts. NOT exposed on the AuditSummary — consumers must use