@pseolint/core 0.3.2 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +49 -1
  2. package/dist/ai/triage.d.ts.map +1 -1
  3. package/dist/ai/triage.js +8 -1
  4. package/dist/ai/triage.js.map +1 -1
  5. package/dist/auditor.d.ts.map +1 -1
  6. package/dist/auditor.js +566 -136
  7. package/dist/auditor.js.map +1 -1
  8. package/dist/backpressure.d.ts +68 -0
  9. package/dist/backpressure.d.ts.map +1 -0
  10. package/dist/backpressure.js +81 -0
  11. package/dist/backpressure.js.map +1 -0
  12. package/dist/cache.d.ts +73 -0
  13. package/dist/cache.d.ts.map +1 -1
  14. package/dist/cache.js +258 -19
  15. package/dist/cache.js.map +1 -1
  16. package/dist/enrich-findings.d.ts.map +1 -1
  17. package/dist/enrich-findings.js +1 -14
  18. package/dist/enrich-findings.js.map +1 -1
  19. package/dist/fetch-observer.d.ts +97 -0
  20. package/dist/fetch-observer.d.ts.map +1 -0
  21. package/dist/fetch-observer.js +124 -0
  22. package/dist/fetch-observer.js.map +1 -0
  23. package/dist/formatters/console.d.ts +7 -9
  24. package/dist/formatters/console.d.ts.map +1 -1
  25. package/dist/formatters/console.js +218 -254
  26. package/dist/formatters/console.js.map +1 -1
  27. package/dist/formatters/html.d.ts +5 -1
  28. package/dist/formatters/html.d.ts.map +1 -1
  29. package/dist/formatters/html.js +352 -570
  30. package/dist/formatters/html.js.map +1 -1
  31. package/dist/formatters/index.d.ts +4 -1
  32. package/dist/formatters/index.d.ts.map +1 -1
  33. package/dist/formatters/index.js +1 -1
  34. package/dist/formatters/index.js.map +1 -1
  35. package/dist/formatters/json.d.ts +11 -1
  36. package/dist/formatters/json.d.ts.map +1 -1
  37. package/dist/formatters/json.js +5 -1
  38. package/dist/formatters/json.js.map +1 -1
  39. package/dist/formatters/markdown.d.ts +7 -1
  40. package/dist/formatters/markdown.d.ts.map +1 -1
  41. package/dist/formatters/markdown.js +77 -70
  42. package/dist/formatters/markdown.js.map +1 -1
  43. package/dist/index.d.ts +13 -8
  44. package/dist/index.d.ts.map +1 -1
  45. package/dist/index.js +6 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/page-filter.d.ts +50 -0
  48. package/dist/page-filter.d.ts.map +1 -0
  49. package/dist/page-filter.js +86 -0
  50. package/dist/page-filter.js.map +1 -0
  51. package/dist/rule-references.d.ts.map +1 -1
  52. package/dist/rule-references.js +0 -6
  53. package/dist/rule-references.js.map +1 -1
  54. package/dist/rules/content/unique-value.d.ts.map +1 -1
  55. package/dist/rules/content/unique-value.js +1 -0
  56. package/dist/rules/content/unique-value.js.map +1 -1
  57. package/dist/rules/scope.d.ts.map +1 -1
  58. package/dist/rules/scope.js +6 -14
  59. package/dist/rules/scope.js.map +1 -1
  60. package/dist/rules/tech/robots-sitemap-presence.d.ts +9 -1
  61. package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
  62. package/dist/rules/tech/robots-sitemap-presence.js +14 -5
  63. package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
  64. package/dist/safe-mode-preset.d.ts +27 -0
  65. package/dist/safe-mode-preset.d.ts.map +1 -0
  66. package/dist/safe-mode-preset.js +54 -0
  67. package/dist/safe-mode-preset.js.map +1 -0
  68. package/dist/site-classifier.d.ts +83 -0
  69. package/dist/site-classifier.d.ts.map +1 -0
  70. package/dist/site-classifier.js +205 -0
  71. package/dist/site-classifier.js.map +1 -0
  72. package/dist/ssrf-guard.d.ts +96 -0
  73. package/dist/ssrf-guard.d.ts.map +1 -0
  74. package/dist/ssrf-guard.js +268 -0
  75. package/dist/ssrf-guard.js.map +1 -0
  76. package/dist/types.d.ts +202 -19
  77. package/dist/types.d.ts.map +1 -1
  78. package/dist/types.js +2 -1
  79. package/dist/types.js.map +1 -1
  80. package/package.json +2 -2
package/dist/auditor.js CHANGED
@@ -2,9 +2,9 @@ import { createHash } from "node:crypto";
2
2
  import { readdir, readFile, stat } from "node:fs/promises";
3
3
  import { extname, join, resolve } from "node:path";
4
4
  import { parseHtmlPage } from "./parser.js";
5
+ import { pageSkipReason } from "./page-filter.js";
5
6
  import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
6
7
  import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
7
- import { headingUniquenessRule } from "./rules/content/heading-uniqueness.js";
8
8
  import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
9
9
  import { missingAuthorRule } from "./rules/content/missing-author.js";
10
10
  import { uniqueValueRule } from "./rules/content/unique-value.js";
@@ -18,12 +18,10 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
18
18
  import { deadEndsRule } from "./rules/links/dead-ends.js";
19
19
  import { linkDepthRule } from "./rules/links/link-depth.js";
20
20
  import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
21
- import { hubPagesRule } from "./rules/links/hub-pages.js";
22
21
  import { orphanPagesRule } from "./rules/links/orphan-pages.js";
23
22
  import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
24
23
  import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
25
24
  import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
26
- import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
27
25
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
28
26
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
29
27
  import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
@@ -33,7 +31,6 @@ import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
33
31
  import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
34
32
  import { answerFirstRule } from "./rules/aeo/answer-first.js";
35
33
  import { citableFactsRule } from "./rules/aeo/citable-facts.js";
36
- import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
37
34
  import { contentModularityRule } from "./rules/aeo/content-modularity.js";
38
35
  import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
39
36
  import { redirectChainRule } from "./rules/tech/redirect-chain.js";
@@ -41,8 +38,6 @@ import { soft404Rule } from "./rules/tech/soft-404.js";
41
38
  import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
42
39
  import { requiredFieldsRule } from "./rules/schema/required-fields.js";
43
40
  import { schemaConsistencyRule } from "./rules/schema/consistency.js";
44
- import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
45
- import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
46
41
  import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
47
42
  import { templateCoverageRule } from "./rules/spam/template-coverage.js";
48
43
  import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
@@ -54,8 +49,14 @@ import { triageFindings } from "./ai/triage.js";
54
49
  import { createLanguageModel } from "./ai/adapters/index.js";
55
50
  import { promptTriageFeedback } from "./ai/feedback-prompt.js";
56
51
  import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
57
- import { cachedFetch } from "./cache.js";
52
+ import { SCHEMA_VERSION } from "./types.js";
53
+ import { cachedFetch, pruneCache } from "./cache.js";
54
+ import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
55
+ import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
56
+ import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
57
+ import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
58
58
  import { stratifiedSample } from "./stratified-sample.js";
59
+ import { classifySite } from "./site-classifier.js";
59
60
  import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
60
61
  const DEFAULTS = {
61
62
  nearDuplicateThreshold: 0.85,
@@ -67,10 +68,6 @@ const DEFAULTS = {
67
68
  uniqueValueMinWords: 100,
68
69
  metaUniquenessMinJaccard: 0.9,
69
70
  linkDepthMaxClicks: 3,
70
- hubPagesMinSiblings: 4,
71
- hubPagesMaxSiblings: 50,
72
- titleOverlapThreshold: 0.8,
73
- keywordCollisionMinShared: 6,
74
71
  templateCoverageMinPages: 5,
75
72
  answerFirstMaxWords: 100,
76
73
  citableFactsMin: 3,
@@ -80,18 +77,82 @@ const DEFAULTS = {
80
77
  modularityMinSelfContainedRatio: 0.7,
81
78
  faqMinQuestionHeadings: 2
82
79
  };
80
+ /**
81
+ * v0.4 four-category weights. Audit is diagnostic-only (weight 0).
82
+ * See 2026-04-29 v0.4 redesign spec §4.2.
83
+ */
83
84
  const CATEGORY_WEIGHTS = {
84
- spam: 0.33,
85
- content: 0.19,
86
- aeo: 0.14,
87
- links: 0.11,
88
- tech: 0.07,
89
- data: 0.06,
90
- schema: 0.05,
91
- cannibal: 0.05,
92
- /** Dedup / crawl hygiene; does not affect composite score. */
93
- audit: 0
85
+ integrity: 0.50, // spam + content + cannibal
86
+ discoverability: 0.20, // links + tech
87
+ citation: 0.25, // aeo + schema
88
+ data: 0.05, // data
89
+ audit: 0, // diagnostics, never weighted
94
90
  };
91
+ /**
92
+ * Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
93
+ * Used by `scoreFromFindings` to bucket findings without changing rule IDs.
94
+ */
95
+ const CATEGORY_MAP = {
96
+ spam: "integrity",
97
+ content: "integrity",
98
+ cannibal: "integrity",
99
+ links: "discoverability",
100
+ tech: "discoverability",
101
+ aeo: "citation",
102
+ schema: "citation",
103
+ data: "data",
104
+ audit: "audit",
105
+ };
106
+ /** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
107
+ const RULE_DOCS_SLUG = {
108
+ // intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
109
+ };
110
+ function docsUrlFor(ruleId) {
111
+ const slug = RULE_DOCS_SLUG[ruleId] ?? ruleId.split("/").pop() ?? ruleId;
112
+ return `https://pseolint.dev/rules/${slug}`;
113
+ }
114
+ /** Verdict ladder thresholds — see spec §4.4. */
115
+ function verdictForRisk(risk) {
116
+ if (risk <= 20)
117
+ return "ready";
118
+ if (risk <= 40)
119
+ return "caution";
120
+ if (risk <= 60)
121
+ return "concerning";
122
+ return "critical";
123
+ }
124
+ function gradeForPenalty(penalty) {
125
+ if (penalty <= 20)
126
+ return "A";
127
+ if (penalty <= 40)
128
+ return "B";
129
+ if (penalty <= 60)
130
+ return "C";
131
+ if (penalty <= 80)
132
+ return "D";
133
+ return "F";
134
+ }
135
+ /** True for `text/html` and `application/xhtml+xml` only (treat as audit-eligible content). */
136
+ function isHtmlContentType(contentType) {
137
+ if (!contentType)
138
+ return true; // Local files / unknown — assume HTML.
139
+ const lower = contentType.toLowerCase();
140
+ return lower.includes("text/html") || lower.includes("application/xhtml+xml");
141
+ }
142
+ /** Glob match against a URL pathname only (not the full URL). v0.4 spec §4.5. */
143
+ function globMatchPathname(pattern, urlOrPath) {
144
+ let pathname;
145
+ try {
146
+ pathname = new URL(urlOrPath).pathname;
147
+ }
148
+ catch {
149
+ // Not a URL — treat as already-a-path. Force a leading slash for consistency.
150
+ pathname = urlOrPath.startsWith("/") ? urlOrPath : `/${urlOrPath}`;
151
+ }
152
+ // Allow patterns that don't begin with "/" by normalising both sides.
153
+ const normPattern = pattern.startsWith("/") || pattern.startsWith("*") ? pattern : `/${pattern}`;
154
+ return matchGlob(normPattern, pathname) || matchGlob(pattern, pathname);
155
+ }
95
156
  const DEFAULT_ENTITY_PATTERNS = [
96
157
  {
97
158
  placeholder: "[STATE]",
@@ -112,7 +173,16 @@ function resolveGroupRules(baseRules, overrides) {
112
173
  }
113
174
  return result;
114
175
  }
115
- function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
176
+ function runRulesOnPages(pages,
177
+ /**
178
+ * Full set of parsed pages including those filtered out by `respectNoindex`
179
+ * / `skipDetectedAuth`. Defaults to `pages` for backwards compat. The two
180
+ * noindex-conflict rules (`tech/canonical-noindex-conflict`,
181
+ * `tech/robots-noindex-conflict`) read this list specifically — without it,
182
+ * `respectNoindex: true` would hide noindex'd pages from the very rules
183
+ * designed to flag accidental noindex'ing.
184
+ */
185
+ noindexAwarePages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
116
186
  const findings = [];
117
187
  const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
118
188
  const tag = (results) => results.map((r) => {
@@ -156,9 +226,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
156
226
  if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
157
227
  findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
158
228
  }
159
- if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
160
- findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
161
- }
162
229
  if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
163
230
  findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
164
231
  }
@@ -183,18 +250,15 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
183
250
  if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
184
251
  findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
185
252
  }
186
- if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
187
- findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
188
- }
189
253
  // Tech rules
190
254
  if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
191
255
  findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
192
256
  }
193
257
  if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
194
- findings.push(...tag(canonicalNoindexConflictRule(pages, normalizeUrlOptions)));
258
+ findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
195
259
  }
196
260
  if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
197
- findings.push(...tag(robotsNoindexConflictRule(pages, inbound)));
261
+ findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
198
262
  }
199
263
  if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
200
264
  findings.push(...tag(redirectChainRule(pages)));
@@ -202,11 +266,10 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
202
266
  if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
203
267
  findings.push(...tag(soft404Rule(pages)));
204
268
  }
205
- if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
206
- findings.push(...tag(ogCompletenessRule(pages)));
207
- }
208
269
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
209
- findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
270
+ // hreflang declarations on noindex'd pages are still bugs when they're
271
+ // inconsistent — see auditor.test.ts "emits technical SEO findings".
272
+ findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
210
273
  }
211
274
  // Schema rules
212
275
  if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
@@ -240,9 +303,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
240
303
  targetFactsPerPage: resolvedRules.citableFactsTarget,
241
304
  })));
242
305
  }
243
- if (isEnabled("aeo/non-replicable-value")) {
244
- findings.push(...tag(nonReplicableValueRule(pages)));
245
- }
246
306
  if (isEnabled("aeo/content-modularity")) {
247
307
  findings.push(...tag(contentModularityRule(pages, {
248
308
  maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
@@ -252,13 +312,9 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
252
312
  if (isEnabled("aeo/summary-bait")) {
253
313
  findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
254
314
  }
255
- // Cannibal rules
256
- if (isEnabled("cannibal/title-overlap") && modeOk("cannibal/title-overlap")) {
257
- findings.push(...tag(titleOverlapRule(pages, entityPatterns, resolvedRules.titleOverlapThreshold)));
258
- }
259
- if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
260
- findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
261
- }
315
+ // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
316
+ // keyword-collision dropped due to high false-positive rates; see
317
+ // 2026-04-29 v0.4 redesign spec §4.3).
262
318
  if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
263
319
  findings.push(...tag(urlPatternRule(pages)));
264
320
  }
@@ -267,54 +323,110 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
267
323
  function hashHtml(html) {
268
324
  return createHash("sha256").update(html, "utf8").digest("hex");
269
325
  }
326
+ const SEVERITY_WEIGHTS = {
327
+ critical: 40,
328
+ error: 25,
329
+ warning: 12,
330
+ info: 5,
331
+ };
270
332
  function scoreFromFindings(findings) {
271
- const severityWeights = {
272
- critical: 40,
273
- error: 25,
274
- warning: 12,
275
- info: 5
333
+ // v0.4 four-bucket raw penalties.
334
+ const bucketRaw = {
335
+ integrity: 0,
336
+ discoverability: 0,
337
+ citation: 0,
338
+ data: 0,
339
+ audit: 0,
276
340
  };
277
- const raw = {
278
- spam: 0,
279
- content: 0,
280
- aeo: 0,
281
- links: 0,
282
- tech: 0,
341
+ const bucketIssues = {
342
+ integrity: 0,
343
+ discoverability: 0,
344
+ citation: 0,
283
345
  data: 0,
284
- schema: 0,
285
- cannibal: 0,
286
- audit: 0
346
+ audit: 0,
287
347
  };
348
+ let blockers = 0;
349
+ let shouldFix = 0;
350
+ let informational = 0;
288
351
  for (const finding of findings) {
289
- const category = finding.ruleId.split("/")[0];
290
- if (!(category in raw)) {
352
+ const namespace = finding.ruleId.split("/")[0];
353
+ const bucket = CATEGORY_MAP[namespace];
354
+ if (!bucket)
291
355
  continue;
356
+ const weight = SEVERITY_WEIGHTS[finding.severity];
357
+ // v0.4 buckets.
358
+ bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weight);
359
+ if (bucket !== "audit") {
360
+ bucketIssues[bucket] += 1;
292
361
  }
293
- raw[category] = Math.min(100, raw[category] + severityWeights[finding.severity]);
294
- }
295
- const weighted = raw.spam * CATEGORY_WEIGHTS.spam +
296
- raw.content * CATEGORY_WEIGHTS.content +
297
- raw.aeo * CATEGORY_WEIGHTS.aeo +
298
- raw.links * CATEGORY_WEIGHTS.links +
299
- raw.tech * CATEGORY_WEIGHTS.tech +
300
- raw.data * CATEGORY_WEIGHTS.data +
301
- raw.schema * CATEGORY_WEIGHTS.schema +
302
- raw.cannibal * CATEGORY_WEIGHTS.cannibal +
303
- raw.audit * CATEGORY_WEIGHTS.audit;
362
+ // Issue-bucket counts (audit/* findings are diagnostic-only and excluded).
363
+ if (bucket === "audit")
364
+ continue;
365
+ if (finding.severity === "critical" || finding.severity === "error")
366
+ blockers += 1;
367
+ else if (finding.severity === "warning")
368
+ shouldFix += 1;
369
+ else
370
+ informational += 1;
371
+ }
372
+ const weighted = bucketRaw.integrity * CATEGORY_WEIGHTS.integrity +
373
+ bucketRaw.discoverability * CATEGORY_WEIGHTS.discoverability +
374
+ bucketRaw.citation * CATEGORY_WEIGHTS.citation +
375
+ bucketRaw.data * CATEGORY_WEIGHTS.data;
376
+ const risk = Math.round(Math.min(100, weighted));
377
+ const categories = {
378
+ integrity: { grade: gradeForPenalty(bucketRaw.integrity), issues: bucketIssues.integrity },
379
+ discoverability: { grade: gradeForPenalty(bucketRaw.discoverability), issues: bucketIssues.discoverability },
380
+ citation: { grade: gradeForPenalty(bucketRaw.citation), issues: bucketIssues.citation },
381
+ data: { grade: gradeForPenalty(bucketRaw.data), issues: bucketIssues.data },
382
+ audit: { grade: "A", issues: 0 },
383
+ };
304
384
  return {
305
- score: Math.round(Math.min(100, weighted)),
306
- categoryScores: {
307
- spam: raw.spam,
308
- content: raw.content,
309
- aeo: raw.aeo,
310
- links: raw.links,
311
- tech: raw.tech,
312
- data: raw.data,
313
- schema: raw.schema,
314
- cannibal: raw.cannibal
315
- }
385
+ risk,
386
+ categories,
387
+ bucketCounts: { blockers, shouldFix, informational },
316
388
  };
317
389
  }
390
+ function bucketIssues(findings) {
391
+ const blockers = [];
392
+ const shouldFix = [];
393
+ const informational = [];
394
+ for (const f of findings) {
395
+ // audit/* findings are diagnostics and never appear in issue buckets.
396
+ if (f.ruleId.startsWith("audit/"))
397
+ continue;
398
+ if (f.severity === "critical" || f.severity === "error")
399
+ blockers.push(f);
400
+ else if (f.severity === "warning")
401
+ shouldFix.push(f);
402
+ else
403
+ informational.push(f);
404
+ }
405
+ return { blockers, shouldFix, informational };
406
+ }
407
+ function buildHeadline(counts) {
408
+ const parts = [];
409
+ if (counts.blockers > 0) {
410
+ parts.push(`${counts.blockers} ship-blocker${counts.blockers === 1 ? "" : "s"}`);
411
+ }
412
+ if (counts.shouldFix > 0) {
413
+ parts.push(`${counts.shouldFix} should-fix`);
414
+ }
415
+ if (counts.informational > 0 && parts.length < 2) {
416
+ parts.push(`${counts.informational} informational`);
417
+ }
418
+ if (parts.length === 0)
419
+ return "No issues detected.";
420
+ return parts.join(", ");
421
+ }
422
+ /** Populate `docsUrl` on every finding that doesn't already have one. */
423
+ function withDocsUrls(findings) {
424
+ for (const f of findings) {
425
+ if (!f.docsUrl)
426
+ f.docsUrl = docsUrlFor(f.ruleId);
427
+ }
428
+ return findings;
429
+ }
318
430
  async function collectHtmlFiles(directory) {
319
431
  const entries = await readdir(directory, { withFileTypes: true });
320
432
  const files = await Promise.all(entries.map(async (entry) => {
@@ -330,10 +442,30 @@ async function collectHtmlFiles(directory) {
330
442
  }));
331
443
  return files.flat();
332
444
  }
333
- async function fetchWithRetry(url, timeoutMs, cache, stats) {
445
+ /**
446
+ * Combine up to N AbortSignals into one. The returned signal aborts as soon
447
+ * as any input aborts. Avoids the node-only `AbortSignal.any` for wider
448
+ * compatibility and keeps listeners weak-ish (one per input, no unbounded
449
+ * listener growth).
450
+ */
451
+ function composeSignals(...signals) {
452
+ const actual = signals.filter((s) => Boolean(s));
453
+ if (actual.length === 0)
454
+ return new AbortController().signal;
455
+ const ac = new AbortController();
456
+ for (const s of actual) {
457
+ if (s.aborted) {
458
+ ac.abort(s.reason);
459
+ return ac.signal;
460
+ }
461
+ s.addEventListener("abort", () => ac.abort(s.reason), { once: true });
462
+ }
463
+ return ac.signal;
464
+ }
465
+ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
334
466
  try {
335
467
  stats.total += 1;
336
- const r = await cachedFetch(url, { timeoutMs, cache });
468
+ const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
337
469
  if (r.fromCache) {
338
470
  stats.hits += 1;
339
471
  stats.bytesSavedEstimate += r.body.length;
@@ -342,14 +474,16 @@ async function fetchWithRetry(url, timeoutMs, cache, stats) {
342
474
  return null;
343
475
  return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
344
476
  }
345
- catch {
477
+ catch (err) {
478
+ if (signal?.aborted)
479
+ throw err; // propagate cancellation
346
480
  return null;
347
481
  }
348
482
  }
349
- async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
483
+ async function fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects = true) {
350
484
  try {
351
485
  stats.total += 1;
352
- const r = await cachedFetch(url, { timeoutMs, cache });
486
+ const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, followRedirects, onObservation: stats.onObservation });
353
487
  if (r.fromCache) {
354
488
  stats.hits += 1;
355
489
  stats.bytesSavedEstimate += r.body.length;
@@ -366,13 +500,15 @@ async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
366
500
  },
367
501
  };
368
502
  }
369
- catch {
503
+ catch (err) {
504
+ if (signal?.aborted)
505
+ throw err;
370
506
  return null;
371
507
  }
372
508
  }
373
- async function fetchTextStrict(url, timeoutMs, cache, stats) {
509
+ async function fetchTextStrict(url, timeoutMs, cache, stats, signal, validateHop) {
374
510
  stats.total += 1;
375
- const r = await cachedFetch(url, { timeoutMs, cache });
511
+ const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
376
512
  if (r.fromCache) {
377
513
  stats.hits += 1;
378
514
  stats.bytesSavedEstimate += r.body.length;
@@ -455,8 +591,13 @@ function matchGlob(pattern, value) {
455
591
  function shouldIgnore(url, patterns) {
456
592
  if (patterns.length === 0)
457
593
  return false;
594
+ // v0.4 §4.5: globs match against the URL pathname only, NOT the full URL.
595
+ // Operator intuition: `ignore: ["dashboard/**"]` should match
596
+ // `https://example.com/dashboard/...` even though the full URL contains the
597
+ // host. Previously globs matched the full URL and silently failed for users
598
+ // who didn't think to write `**/dashboard/**`.
458
599
  for (const pattern of patterns) {
459
- if (matchGlob(pattern, url))
600
+ if (globMatchPathname(pattern, url))
460
601
  return true;
461
602
  }
462
603
  return false;
@@ -469,7 +610,7 @@ function fisherYatesSample(items, n) {
469
610
  }
470
611
  return arr.slice(arr.length - n);
471
612
  }
472
- async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
613
+ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
473
614
  visited.add(sitemapUrl);
474
615
  const locs = parseSitemapUrls(sitemapText);
475
616
  if (!isSitemapIndex(sitemapText)) {
@@ -477,27 +618,32 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
477
618
  }
478
619
  const allUrls = [];
479
620
  for (const childUrl of locs) {
621
+ if (signal?.aborted)
622
+ throw signal.reason ?? new Error("aborted");
480
623
  if (visited.has(childUrl))
481
624
  continue;
482
- const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
625
+ const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
483
626
  if (!child)
484
627
  continue;
485
628
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
486
629
  if (!childLike)
487
630
  continue;
488
- const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
631
+ const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
489
632
  allUrls.push(...childUrls);
490
633
  }
491
634
  return allUrls;
492
635
  }
493
- async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
636
+ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
494
637
  if (!origin)
495
638
  return { disallow: [], crawlDelaySec: 0 };
496
639
  try {
497
640
  const robotsUrl = `${origin}/robots.txt`;
498
- const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
641
+ const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
642
+ // Honor both the wildcard block AND any block specifically targeting us.
643
+ // A malicious target can't bypass our crawler by adding a targeted
644
+ // `User-agent: pseolint / Disallow: /` without a wildcard.
499
645
  return {
500
- disallow: parseDisallowPatterns(fetched.text),
646
+ disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
501
647
  crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
502
648
  };
503
649
  }
@@ -518,13 +664,42 @@ function isDisallowedByRobots(urlPath, patterns) {
518
664
  function budgetExceeded(b) {
519
665
  return b.cap > 0 && b.used >= b.cap;
520
666
  }
521
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
667
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
668
+ // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
669
+ // audit (source, sitemap entries, redirects, discovered links) goes through
670
+ // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
671
+ // one origin does 1 DNS lookup, not 4k.
672
+ const ssrfCache = new Map();
673
+ const validateHop = guardSsrf
674
+ ? async (u) => {
675
+ let host;
676
+ try {
677
+ host = new URL(u).hostname;
678
+ }
679
+ catch {
680
+ throw new Error(`Refusing to fetch invalid URL: ${u}`);
681
+ }
682
+ let pending = ssrfCache.get(host);
683
+ if (!pending) {
684
+ pending = validateTargetHost(host).catch((err) => {
685
+ if (err instanceof SSRFError) {
686
+ throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
687
+ }
688
+ throw err;
689
+ });
690
+ ssrfCache.set(host, pending);
691
+ }
692
+ await pending;
693
+ }
694
+ : undefined;
522
695
  if (/^https?:\/\//i.test(source)) {
696
+ if (validateHop)
697
+ await validateHop(source);
523
698
  let text;
524
699
  let contentType;
525
700
  let sourceStatus = 200;
526
701
  try {
527
- const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
702
+ const fetched = await fetchTextStrict(source, timeoutMs, cache, stats, signal, validateHop);
528
703
  text = fetched.text;
529
704
  contentType = fetched.contentType;
530
705
  }
@@ -533,7 +708,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
533
708
  if (source.includes("sitemap")) {
534
709
  try {
535
710
  const origin = new URL(source).origin;
536
- const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
711
+ const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats, signal, validateHop);
537
712
  text = fallback.text;
538
713
  contentType = fallback.contentType;
539
714
  sourceStatus = -1; // flag that we fell back
@@ -549,7 +724,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
549
724
  const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
550
725
  if (isXml) {
551
726
  const visited = new Set();
552
- const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
727
+ const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
553
728
  // If we have a budget, sample from sitemap URLs before fetching
554
729
  const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
555
730
  ? fisherYatesSample(allSitemapUrls, discoveryBudget)
@@ -562,13 +737,29 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
562
737
  catch {
563
738
  return "";
564
739
  } })();
565
- const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
740
+ const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
566
741
  const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
567
742
  const delayMs = robots.crawlDelaySec * 1000;
568
743
  await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
569
744
  if (budgetExceeded(byteBudget))
570
745
  return;
571
- const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
746
+ // Honor robots.txt for our own crawl when respectRobotsTxt is on (default).
747
+ // The existing robotsComplianceRule flags sitemap-vs-robots conflicts as
748
+ // findings; this actually refuses to fetch the disallowed URL. Keeps us
749
+ // legally defensible (we are a bot, our UA `pseolint` is public, and we
750
+ // respect Disallow directives) and removes the "crawler-for-hire" abuse
751
+ // vector when the library is invoked from a hosted service.
752
+ if (respectRobotsTxt) {
753
+ try {
754
+ const p = new URL(url).pathname;
755
+ if (isDisallowedByRobots(p, robots.disallow)) {
756
+ skippedByRobots.push(url);
757
+ return;
758
+ }
759
+ }
760
+ catch { /* URL parse failed — fall through, fetch will fail naturally */ }
761
+ }
762
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
572
763
  if (result) {
573
764
  byteBudget.used += result.html.length;
574
765
  pages.push(result);
@@ -587,9 +778,16 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
587
778
  const discoveredUrls = new Set();
588
779
  // robots already fetched above; reuse its Disallow patterns here.
589
780
  const disallowPatterns = robots.disallow;
590
- for (const page of pages) {
781
+ let discoveryCeilingReached = false;
782
+ outer: for (const page of pages) {
591
783
  const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
592
784
  for (const match of linkMatches) {
785
+ if (discoveredUrls.size >= maxCrawlDiscovered) {
786
+ // Hard ceiling — don't let a malicious site with many self-links
787
+ // extend crawl discovery up to the byte budget.
788
+ discoveryCeilingReached = true;
789
+ break outer;
790
+ }
593
791
  const href = match[1];
594
792
  if (!href || href.startsWith("#") || /^mailto:|^tel:|^javascript:|^data:/i.test(href))
595
793
  continue;
@@ -614,6 +812,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
614
812
  }
615
813
  }
616
814
  }
815
+ if (discoveryCeilingReached) {
816
+ // eslint-disable-next-line no-console
817
+ console.error(`pseolint: crawl discovery hit maxCrawlDiscovered=${maxCrawlDiscovered} ceiling; sampling from the first ${discoveredUrls.size} URLs.`);
818
+ }
617
819
  if (discoveredUrls.size > 0) {
618
820
  const candidates = Array.from(discoveredUrls);
619
821
  // Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
@@ -623,7 +825,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
623
825
  await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
624
826
  if (budgetExceeded(byteBudget))
625
827
  return;
626
- const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
828
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
627
829
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
628
830
  byteBudget.used += result.html.length;
629
831
  pages.push(result);
@@ -700,7 +902,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
700
902
  }
701
903
  const newPages = [];
702
904
  await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
703
- const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
905
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
704
906
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
705
907
  newPages.push(result);
706
908
  knownCrawled.add(url);
@@ -744,10 +946,65 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
744
946
  export async function auditSource(source, options) {
745
947
  const runId = generateRunId();
746
948
  const runStartedAt = Date.now();
747
- const concurrency = options?.concurrency ?? 5;
949
+ // Apply safeMode preset first, then let explicit options override it. Using
950
+ // `??` preserves the "not set" vs "explicitly false" distinction — a user
951
+ // who picks safeMode="saas" but passes `guardSsrf: false` gets the explicit
952
+ // override. Localhost sources auto-promote to the `dev` preset unless the
953
+ // caller explicitly set `safeMode` or passed `autoDevPreset: false`.
954
+ const presetKey = resolveSafeModeKey(source, options);
955
+ const preset = SAFE_MODE_PRESETS[presetKey];
956
+ const concurrency = options?.concurrency ?? preset.concurrency ?? 5;
748
957
  const timeoutMs = options?.timeout ?? 30000;
749
958
  const ignorePatterns = options?.ignore ?? [];
750
- const sampleSize = options?.sampleSize ?? 0;
959
+ const respectNoindex = options?.respectNoindex ?? true;
960
+ const skipDetectedAuth = options?.skipDetectedAuth ?? false;
961
+ const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
962
+ const externalSignal = options?.signal;
963
+ const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
964
+ const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
965
+ const followRedirects = options?.followRedirects ?? preset.followRedirects ?? true;
966
+ const maxCrawlDiscovered = options?.maxCrawlDiscovered ?? preset.maxCrawlDiscovered ?? 5000;
967
+ const skippedByRobots = [];
968
+ // Backpressure: watch TTFB + 5xx rate during the crawl and abort if the
969
+ // origin looks degraded. The audit signal is a composite of the caller's
970
+ // signal (ctrl-C, parent timeout) and the monitor's abort controller.
971
+ const backpressureEnabled = options?.backpressure !== false;
972
+ const backpressureAbort = new AbortController();
973
+ let backpressureError = null;
974
+ const signal = composeSignals(externalSignal, backpressureAbort.signal);
975
+ const observer = new FetchObserver();
976
+ const monitor = backpressureEnabled
977
+ ? new BackpressureMonitor({
978
+ warmupSize: 10,
979
+ absoluteP95Ms: 3000,
980
+ baselineMultiplier: 2,
981
+ errorRatioThreshold: 0.1,
982
+ })
983
+ : null;
984
+ // v0.4: framework gets set on the first observation that carries headers
985
+ // (the source URL fetch). Backpressure thresholds and computeReadiness use
986
+ // it to soften limits when auditing a dev server.
987
+ let detectedFramework = null;
988
+ const onObservation = (obs) => {
989
+ if (detectedFramework === null && obs.headers) {
990
+ detectedFramework = detectDevServer(obs.headers);
991
+ }
992
+ observer.record(obs);
993
+ if (!monitor)
994
+ return;
995
+ const decision = monitor.record(obs);
996
+ if (decision.shouldAbort && !backpressureError && decision.snapshot) {
997
+ backpressureError = new OriginDegradedError(decision.reason ?? "", decision.snapshot);
998
+ backpressureAbort.abort(backpressureError);
999
+ }
1000
+ };
1001
+ function throwIfAborted() {
1002
+ if (backpressureError)
1003
+ throw backpressureError;
1004
+ if (externalSignal?.aborted) {
1005
+ throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
1006
+ }
1007
+ }
751
1008
  const resolvedRules = {
752
1009
  nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
753
1010
  entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
@@ -758,10 +1015,6 @@ export async function auditSource(source, options) {
758
1015
  uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
759
1016
  metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
760
1017
  linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
761
- hubPagesMinSiblings: options?.rules?.hubPagesMinSiblings ?? DEFAULTS.hubPagesMinSiblings,
762
- hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
763
- titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
764
- keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
765
1018
  templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
766
1019
  answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
767
1020
  citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
@@ -783,18 +1036,47 @@ export async function auditSource(source, options) {
783
1036
  const discoveryBudget = options?.sampleSize && options.sampleSize > 0
784
1037
  ? Math.max(50, options.sampleSize * 2)
785
1038
  : 0;
786
- const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
1039
+ const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0, onObservation };
787
1040
  const cacheConfig = options?.cache
788
1041
  ? {
789
1042
  dir: options.cache.dir ?? ".pseolint/cache",
790
1043
  ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
791
1044
  }
792
1045
  : null;
1046
+ // Size cap (post-audit eviction). Default 200 MB keeps pSEO-scale sites in check;
1047
+ // a single full crawl of a 5k-page site averages ~250 KB per body = ~1.25 GB uncapped.
1048
+ const cacheMaxBytes = options?.cache?.maxBytes ?? 209_715_200;
793
1049
  const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
794
- const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
1050
+ const maxFetchBytes = options?.maxFetchBytes ?? preset.maxFetchBytes ?? 52_428_800;
795
1051
  const fetchByteBudget = { used: 0, cap: maxFetchBytes };
796
- const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget);
1052
+ // v0.4 §4.7: detectedFramework is set in onObservation above, side-effect
1053
+ // of the normal source URL fetch. No separate probe needed.
1054
+ const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered);
1055
+ throwIfAborted();
797
1056
  const loadedPages = [...loadedPagesRaw];
1057
+ // v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
1058
+ // body does not look like HTML (a stand-in for a non-HTML Content-Type).
1059
+ // Binary routes like /apple-icon, /opengraph-image, /icon are counted in
1060
+ // crawlStats.skipped instead of being parsed as thin-content pages.
1061
+ const skippedByContentType = [];
1062
+ const htmlOnlyPages = [];
1063
+ for (const p of loadedPages) {
1064
+ // httpMeta is set on URL fetches; locally-loaded files have no httpMeta
1065
+ // and are always HTML by definition (collectHtmlFiles only picks .html).
1066
+ // We don't have content-type on the LoadedPage object. Heuristic: if html
1067
+ // body doesn't contain any HTML markers, treat as non-HTML.
1068
+ if (!p.httpMeta) {
1069
+ htmlOnlyPages.push(p);
1070
+ continue;
1071
+ }
1072
+ if (looksLikeHtml(p.html)) {
1073
+ htmlOnlyPages.push(p);
1074
+ }
1075
+ else {
1076
+ skippedByContentType.push(p.url);
1077
+ }
1078
+ }
1079
+ loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
798
1080
  if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
799
1081
  console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
800
1082
  }
@@ -830,7 +1112,7 @@ export async function auditSource(source, options) {
830
1112
  if (/^https?:\/\//i.test(source)) {
831
1113
  try {
832
1114
  const origin = new URL(source).origin;
833
- const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
1115
+ const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats, signal);
834
1116
  if (result)
835
1117
  robotsTxtContent = result.text;
836
1118
  }
@@ -872,13 +1154,27 @@ export async function auditSource(source, options) {
872
1154
  })()
873
1155
  : fisherYatesSample(filtered, sampleSize))
874
1156
  : filtered;
875
- const parsedPages = sampled.map((page) => {
1157
+ const parsedPagesAll = sampled.map((page) => {
876
1158
  const parsed = parseHtmlPage(page.html, page.url, { normalizeUrl: normalizeUrlOptions });
877
1159
  if (page.httpMeta) {
878
1160
  parsed.httpMeta = page.httpMeta;
879
1161
  }
880
1162
  return parsed;
881
1163
  });
1164
+ // v0.4.1 §page-filter: drop noindex'd pages and (when enabled) heuristically
1165
+ // detected auth pages BEFORE rule evaluation. The site owner's noindex is a
1166
+ // hard signal — they already opted out of SEO indexing, so auditing those
1167
+ // URLs produces only noise. Auth detection is opt-in via skipDetectedAuth
1168
+ // (off for the CLI by default; on for the hosted web form).
1169
+ const skippedByPolicy = [];
1170
+ const parsedPages = parsedPagesAll.filter((p) => {
1171
+ const reason = pageSkipReason(p, { respectNoindex, skipDetectedAuth });
1172
+ if (reason) {
1173
+ skippedByPolicy.push({ url: p.url, reason });
1174
+ return false;
1175
+ }
1176
+ return true;
1177
+ });
882
1178
  const knownUrls = new Set(parsedPages.map((p) => p.url));
883
1179
  const rootUrl = parsedPages.find((p) => /(^|[\\/])index\.html?$/i.test(p.url))?.url ?? parsedPages[0]?.url ?? "";
884
1180
  const adjacency = new Map();
@@ -918,11 +1214,50 @@ export async function auditSource(source, options) {
918
1214
  }),
919
1215
  ]
920
1216
  : DEFAULT_ENTITY_PATTERNS;
1217
+ // v0.4 §4.11 — pre-flight site classification. We compute this BEFORE the
1218
+ // rule pipeline so the dispatcher can skip pSEO-only rules on small
1219
+ // marketing sites / blogs. Classification is computed off the FULL
1220
+ // discovered URL set (sitemap when available, else loaded URLs). This
1221
+ // matters: a sampled crawl of a 5000-page directory must still classify
1222
+ // as `programmatic-directory`, not `unclear`.
1223
+ const classifierUrls = (() => {
1224
+ if (sitemapUrlSet && sitemapUrlSet.size > 0) {
1225
+ return Array.from(sitemapUrlSet);
1226
+ }
1227
+ return loadedPagesRaw.map((p) => p.url);
1228
+ })();
1229
+ const classifierFramework = detectedFramework ?? "unknown";
1230
+ const computedClassification = classifySite({
1231
+ urls: classifierUrls,
1232
+ framework: classifierFramework,
1233
+ });
1234
+ // `--strict` (or AuditOptions.strict) keeps the classification but forces
1235
+ // every rule to run regardless of detected site type.
1236
+ const siteClassification = options?.strict
1237
+ ? { ...computedClassification, suppressedRules: [] }
1238
+ : computedClassification;
1239
+ const suppressedRuleSet = new Set(siteClassification.suppressedRules);
921
1240
  // Classify pages into groups and run only enabled rules per group
922
1241
  const classified = classifyPages(parsedPages, options?.pageGroups);
923
1242
  const allFindings = [...duplicateUrlFindings];
924
1243
  const groupScores = {};
925
1244
  const groupPageCounts = {};
1245
+ // Surface robots-skipped URLs so users don't silently get a smaller audit
1246
+ // than expected. One rollup finding (not per-URL) to avoid flooding the
1247
+ // output on large sites. Also included on summary.skippedUrls below.
1248
+ if (skippedByRobots.length > 0) {
1249
+ allFindings.push({
1250
+ ruleId: "audit/skipped-by-robots",
1251
+ severity: "info",
1252
+ message: `Skipped ${skippedByRobots.length} sitemap URL${skippedByRobots.length === 1 ? "" : "s"} because the target's robots.txt Disallow'd them: ${skippedByRobots.slice(0, 5).join(", ")}${skippedByRobots.length > 5 ? ", …" : ""}.`,
1253
+ fix: "If you own this site and want to audit these URLs anyway, pass `respectRobotsTxt: false` (or remove the Disallow directive).",
1254
+ relatedUrls: skippedByRobots,
1255
+ });
1256
+ }
1257
+ // v0.4 §4.4: origin readiness is now diagnostic-only. The previous
1258
+ // `audit/origin-readiness` finding emission was retired — the structured
1259
+ // ReadinessReport in `summary.diagnostics.originReadiness` is the canonical
1260
+ // signal now (no double-counting in the issue buckets).
926
1261
  const auditMode = options?.mode ?? "full";
927
1262
  // Site-wide rules (run once, outside group loop)
928
1263
  if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
@@ -959,39 +1294,120 @@ export async function auditSource(source, options) {
959
1294
  if (groupConfig?.rules !== undefined && groupConfig.rules.length === 0)
960
1295
  continue;
961
1296
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
962
- const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
963
- const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
1297
+ const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
1298
+ const findings = runRulesOnPages(groupPages, parsedPagesAll, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
964
1299
  allFindings.push(...findings);
965
1300
  groupPageCounts[groupName] = groupPages.length;
966
- const { score } = scoreFromFindings(findings);
967
- groupScores[groupName] = score;
1301
+ const { risk: groupRisk } = scoreFromFindings(findings);
1302
+ groupScores[groupName] = groupRisk;
968
1303
  }
1304
+ throwIfAborted();
969
1305
  // Enrich findings: cluster pairwise, detect templates, assign effort
970
1306
  const enriched = enrichFindings(allFindings, parsedPages, {
971
1307
  templateGenerated: options?.templateGenerated,
972
1308
  });
973
- const { score, categoryScores } = scoreFromFindings(enriched.findings);
1309
+ // Populate docsUrl on every finding before they leave the engine.
1310
+ withDocsUrls(enriched.findings);
1311
+ const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings);
974
1312
  const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
1313
+ const issues = bucketIssues(enriched.findings);
1314
+ const verdict = verdictForRisk(risk);
1315
+ const headline = buildHeadline(bucketCounts);
1316
+ // audit/* findings are diagnostic-only and never appear in summary.issues.
1317
+ // Surface them under diagnostics so consumers (telemetry, debug UIs) can
1318
+ // still see what was deduped or skipped.
1319
+ const auditFindings = enriched.findings.filter((f) => f.ruleId.startsWith("audit/"));
1320
+ const readinessReport = computeReadiness(observer.getAll(), { detectedFramework });
1321
+ const crawlStats = {
1322
+ discovered: discoveredUrlCount ?? loadedPagesRaw.length,
1323
+ fetched: parsedPages.length,
1324
+ skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
1325
+ };
975
1326
  const summary = {
976
- score,
977
- categoryScores,
1327
+ schemaVersion: SCHEMA_VERSION,
1328
+ verdict,
1329
+ risk,
1330
+ headline,
1331
+ categories,
1332
+ issues,
1333
+ siteClassification,
1334
+ diagnostics: {
1335
+ originReadiness: readinessReport,
1336
+ crawlStats,
1337
+ auditFindings,
1338
+ },
978
1339
  groupScores: options?.pageGroups ? groupScores : undefined,
979
1340
  groupPageCounts: options?.pageGroups ? groupPageCounts : undefined,
980
1341
  pageCount: auditedPageCount || parsedPages.length,
981
- findings: enriched.findings,
982
1342
  templateDetected: enriched.templateDetected,
983
1343
  rawFindingCount: enriched.rawFindingCount,
984
1344
  };
985
1345
  if (cacheConfig) {
986
1346
  summary.cacheStats = cacheStats;
987
1347
  }
988
- if (skippedUrls.length > 0) {
989
- summary.skippedUrls = skippedUrls;
1348
+ // v0.4 §4.5 / v0.4.1: warn when ignore patterns matched zero discovered URLs.
1349
+ // - Per-pattern warning fires only when `warnUnmatchedIgnore` is true
1350
+ // (set by the CLI when `--ignore` was passed explicitly). Quiet by
1351
+ // default for config-loaded patterns where broad safety lists like
1352
+ // `**/dashboard/**` legitimately don't match small marketing sites.
1353
+ // - When ALL patterns matched zero (strongest typo signal, e.g. user
1354
+ // wrote `*.json` instead of `**/*.json`), emit a single consolidated
1355
+ // warning regardless of source.
1356
+ if (ignorePatterns.length > 0) {
1357
+ const unmatched = ignorePatterns.filter((pattern) => !deduped.some((p) => globMatchPathname(pattern, p.url)));
1358
+ if (unmatched.length === ignorePatterns.length) {
1359
+ // eslint-disable-next-line no-console
1360
+ console.warn(`[pseolint] none of the ${ignorePatterns.length} ignore pattern${ignorePatterns.length === 1 ? "" : "s"} matched any URLs — check config or --ignore for typos`);
1361
+ }
1362
+ else if (options?.warnUnmatchedIgnore === true) {
1363
+ for (const pattern of unmatched) {
1364
+ // eslint-disable-next-line no-console
1365
+ console.warn(`[pseolint] ignore pattern '${pattern}' matched 0 URLs — likely typo`);
1366
+ }
1367
+ }
990
1368
  }
1369
+ // Merge state-skipped (unchanged since last run), robots-skipped (target
1370
+ // robots.txt Disallow'd), and policy-skipped (noindex / detected-auth) URLs
1371
+ // so callers have a single audit-skipped surface.
1372
+ const allSkipped = [
1373
+ ...skippedUrls,
1374
+ ...skippedByRobots,
1375
+ ...skippedByPolicy.map((s) => s.url),
1376
+ ];
1377
+ if (allSkipped.length > 0) {
1378
+ summary.skippedUrls = allSkipped;
1379
+ }
1380
+ // v0.4.1: surface noindex / auth skips as a discoverable diagnostic so the
1381
+ // user sees what the engine excluded. Catches the accidental-noindex bug:
1382
+ // pages silently dropped from indexing show up as a visible skip line
1383
+ // instead of being absent without explanation.
1384
+ if (skippedByPolicy.length > 0) {
1385
+ const noindexCount = skippedByPolicy.filter((s) => s.reason === "noindex").length;
1386
+ const authCount = skippedByPolicy.filter((s) => s.reason === "auth-detected").length;
1387
+ const sample = skippedByPolicy.slice(0, 5).map((s) => `${s.url} (${s.reason})`).join(", ");
1388
+ const more = skippedByPolicy.length > 5 ? `, +${skippedByPolicy.length - 5} more` : "";
1389
+ const parts = [];
1390
+ if (noindexCount > 0)
1391
+ parts.push(`${noindexCount} marked noindex`);
1392
+ if (authCount > 0)
1393
+ parts.push(`${authCount} detected as auth (login/register/etc)`);
1394
+ auditFindings.push({
1395
+ ruleId: "audit/skipped-by-policy",
1396
+ severity: "info",
1397
+ message: `Skipped ${skippedByPolicy.length} page${skippedByPolicy.length === 1 ? "" : "s"} from rule evaluation — ${parts.join(", ")}. First few: ${sample}${more}.`,
1398
+ relatedUrls: skippedByPolicy.map((s) => s.url),
1399
+ });
1400
+ }
1401
+ // Local flat view of every finding the engine produced, used internally for
1402
+ // state persistence, regression detection, AI triage input, and telemetry
1403
+ // counts. NOT exposed on the AuditSummary — consumers must use
1404
+ // `summary.issues.{blockers,shouldFix,informational}` and
1405
+ // `summary.diagnostics.auditFindings`.
1406
+ const enrichedFindings = enriched.findings;
991
1407
  if (priorState && options?.state?.exitOnRegression) {
992
1408
  let hasRegression = false;
993
1409
  const currentFindings = new Map();
994
- for (const f of summary.findings) {
1410
+ for (const f of enrichedFindings) {
995
1411
  if (!f.pageUrl)
996
1412
  continue;
997
1413
  const set = currentFindings.get(f.pageUrl) ?? new Set();
@@ -1019,7 +1435,7 @@ export async function auditSource(source, options) {
1019
1435
  const renderMode = options.render ? "rendered" : "static";
1020
1436
  const urls = {};
1021
1437
  const findingsByUrl = new Map();
1022
- for (const f of summary.findings) {
1438
+ for (const f of enrichedFindings) {
1023
1439
  if (!f.pageUrl)
1024
1440
  continue;
1025
1441
  const list = findingsByUrl.get(f.pageUrl) ?? [];
@@ -1051,9 +1467,10 @@ export async function auditSource(source, options) {
1051
1467
  renderMode,
1052
1468
  urls,
1053
1469
  summary: {
1054
- score: summary.score,
1055
- totalFindings: summary.findings.length,
1056
- byCategory: Object.fromEntries(Object.entries(summary.categoryScores).map(([k, v]) => [k, v])),
1470
+ score: summary.risk,
1471
+ totalFindings: enrichedFindings.length,
1472
+ byCategory: Object.fromEntries(Object.entries(summary.categories)
1473
+ .map(([k, v]) => [k, v.issues])),
1057
1474
  },
1058
1475
  };
1059
1476
  await writeState(statePath, newState);
@@ -1089,7 +1506,8 @@ export async function auditSource(source, options) {
1089
1506
  spentTodayUsd = 0;
1090
1507
  }
1091
1508
  }
1092
- const outcome = await triageFindings(summary.findings, summary.pageCount, {
1509
+ throwIfAborted();
1510
+ const outcome = await triageFindings(enrichedFindings, summary.pageCount, {
1093
1511
  enabled: true,
1094
1512
  model: resolved.model,
1095
1513
  providerId: resolved.providerId,
@@ -1124,9 +1542,9 @@ export async function auditSource(source, options) {
1124
1542
  runId,
1125
1543
  timestamp: new Date().toISOString(),
1126
1544
  durationMs: Date.now() - runStartedAt,
1127
- score: summary.score,
1545
+ score: summary.risk,
1128
1546
  pageCount: summary.pageCount,
1129
- findingCount: summary.findings.length,
1547
+ findingCount: enrichedFindings.length,
1130
1548
  ...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
1131
1549
  ...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
1132
1550
  ...(summary.cacheStats && { cacheStats: summary.cacheStats }),
@@ -1181,7 +1599,19 @@ export async function auditSource(source, options) {
1181
1599
  }
1182
1600
  const aiHintEnabled = options?.ai?.suggest !== false;
1183
1601
  if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
1184
- console.error(`💡 AI triage available — re-run with --ai to prioritize ${summary.findings.length} findings into a fix list.`);
1602
+ console.error(`💡 AI triage available — re-run with --ai to prioritize ${enrichedFindings.length} findings into a fix list.`);
1603
+ }
1604
+ if (cacheConfig && cacheMaxBytes > 0) {
1605
+ try {
1606
+ const pruneResult = await pruneCache(cacheConfig.dir, cacheMaxBytes);
1607
+ if (pruneResult.removedEntries > 0 || pruneResult.removedTmpFiles > 0) {
1608
+ const freedMb = ((pruneResult.before.bytes - pruneResult.after.bytes) / 1024 / 1024).toFixed(1);
1609
+ console.error(`pseolint: cache prune freed ${freedMb} MB (${pruneResult.removedEntries} entries, ${pruneResult.removedTmpFiles} .tmp files); size=${(pruneResult.after.bytes / 1024 / 1024).toFixed(1)}MB / cap=${(cacheMaxBytes / 1024 / 1024).toFixed(0)}MB`);
1610
+ }
1611
+ }
1612
+ catch {
1613
+ // Non-fatal: eviction failure must not break the audit.
1614
+ }
1185
1615
  }
1186
1616
  return summary;
1187
1617
  }