@pseolint/core 0.3.2 → 0.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (76)
  1. package/README.md +49 -1
  2. package/dist/ai/triage.d.ts.map +1 -1
  3. package/dist/ai/triage.js +8 -1
  4. package/dist/ai/triage.js.map +1 -1
  5. package/dist/auditor.d.ts.map +1 -1
  6. package/dist/auditor.js +495 -130
  7. package/dist/auditor.js.map +1 -1
  8. package/dist/backpressure.d.ts +68 -0
  9. package/dist/backpressure.d.ts.map +1 -0
  10. package/dist/backpressure.js +81 -0
  11. package/dist/backpressure.js.map +1 -0
  12. package/dist/cache.d.ts +73 -0
  13. package/dist/cache.d.ts.map +1 -1
  14. package/dist/cache.js +258 -19
  15. package/dist/cache.js.map +1 -1
  16. package/dist/enrich-findings.d.ts.map +1 -1
  17. package/dist/enrich-findings.js +1 -14
  18. package/dist/enrich-findings.js.map +1 -1
  19. package/dist/fetch-observer.d.ts +97 -0
  20. package/dist/fetch-observer.d.ts.map +1 -0
  21. package/dist/fetch-observer.js +124 -0
  22. package/dist/fetch-observer.js.map +1 -0
  23. package/dist/formatters/console.d.ts +7 -9
  24. package/dist/formatters/console.d.ts.map +1 -1
  25. package/dist/formatters/console.js +218 -254
  26. package/dist/formatters/console.js.map +1 -1
  27. package/dist/formatters/html.d.ts +5 -1
  28. package/dist/formatters/html.d.ts.map +1 -1
  29. package/dist/formatters/html.js +352 -570
  30. package/dist/formatters/html.js.map +1 -1
  31. package/dist/formatters/index.d.ts +4 -1
  32. package/dist/formatters/index.d.ts.map +1 -1
  33. package/dist/formatters/index.js +1 -1
  34. package/dist/formatters/index.js.map +1 -1
  35. package/dist/formatters/json.d.ts +11 -1
  36. package/dist/formatters/json.d.ts.map +1 -1
  37. package/dist/formatters/json.js +5 -1
  38. package/dist/formatters/json.js.map +1 -1
  39. package/dist/formatters/markdown.d.ts +7 -1
  40. package/dist/formatters/markdown.d.ts.map +1 -1
  41. package/dist/formatters/markdown.js +77 -70
  42. package/dist/formatters/markdown.js.map +1 -1
  43. package/dist/index.d.ts +13 -8
  44. package/dist/index.d.ts.map +1 -1
  45. package/dist/index.js +6 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/rule-references.d.ts.map +1 -1
  48. package/dist/rule-references.js +0 -6
  49. package/dist/rule-references.js.map +1 -1
  50. package/dist/rules/content/unique-value.d.ts.map +1 -1
  51. package/dist/rules/content/unique-value.js +1 -0
  52. package/dist/rules/content/unique-value.js.map +1 -1
  53. package/dist/rules/scope.d.ts.map +1 -1
  54. package/dist/rules/scope.js +6 -14
  55. package/dist/rules/scope.js.map +1 -1
  56. package/dist/rules/tech/robots-sitemap-presence.d.ts +9 -1
  57. package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
  58. package/dist/rules/tech/robots-sitemap-presence.js +14 -5
  59. package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
  60. package/dist/safe-mode-preset.d.ts +27 -0
  61. package/dist/safe-mode-preset.d.ts.map +1 -0
  62. package/dist/safe-mode-preset.js +54 -0
  63. package/dist/safe-mode-preset.js.map +1 -0
  64. package/dist/site-classifier.d.ts +83 -0
  65. package/dist/site-classifier.d.ts.map +1 -0
  66. package/dist/site-classifier.js +205 -0
  67. package/dist/site-classifier.js.map +1 -0
  68. package/dist/ssrf-guard.d.ts +96 -0
  69. package/dist/ssrf-guard.d.ts.map +1 -0
  70. package/dist/ssrf-guard.js +268 -0
  71. package/dist/ssrf-guard.js.map +1 -0
  72. package/dist/types.d.ts +171 -19
  73. package/dist/types.d.ts.map +1 -1
  74. package/dist/types.js +2 -1
  75. package/dist/types.js.map +1 -1
  76. package/package.json +2 -2
package/dist/auditor.js CHANGED
@@ -4,7 +4,6 @@ import { extname, join, resolve } from "node:path";
4
4
  import { parseHtmlPage } from "./parser.js";
5
5
  import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
6
6
  import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
7
- import { headingUniquenessRule } from "./rules/content/heading-uniqueness.js";
8
7
  import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
9
8
  import { missingAuthorRule } from "./rules/content/missing-author.js";
10
9
  import { uniqueValueRule } from "./rules/content/unique-value.js";
@@ -18,12 +17,10 @@ import { thinContentRule } from "./rules/spam/thin-content.js";
18
17
  import { deadEndsRule } from "./rules/links/dead-ends.js";
19
18
  import { linkDepthRule } from "./rules/links/link-depth.js";
20
19
  import { clusterConnectivityRule } from "./rules/links/cluster-connectivity.js";
21
- import { hubPagesRule } from "./rules/links/hub-pages.js";
22
20
  import { orphanPagesRule } from "./rules/links/orphan-pages.js";
23
21
  import { canonicalConsistencyRule } from "./rules/tech/canonical-consistency.js";
24
22
  import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-conflict.js";
25
23
  import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
26
- import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
27
24
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
28
25
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
29
26
  import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
@@ -33,7 +30,6 @@ import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
33
30
  import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
34
31
  import { answerFirstRule } from "./rules/aeo/answer-first.js";
35
32
  import { citableFactsRule } from "./rules/aeo/citable-facts.js";
36
- import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
37
33
  import { contentModularityRule } from "./rules/aeo/content-modularity.js";
38
34
  import { summaryBaitRule } from "./rules/aeo/summary-bait.js";
39
35
  import { redirectChainRule } from "./rules/tech/redirect-chain.js";
@@ -41,8 +37,6 @@ import { soft404Rule } from "./rules/tech/soft-404.js";
41
37
  import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
42
38
  import { requiredFieldsRule } from "./rules/schema/required-fields.js";
43
39
  import { schemaConsistencyRule } from "./rules/schema/consistency.js";
44
- import { titleOverlapRule } from "./rules/cannibal/title-overlap.js";
45
- import { keywordCollisionRule } from "./rules/cannibal/keyword-collision.js";
46
40
  import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
47
41
  import { templateCoverageRule } from "./rules/spam/template-coverage.js";
48
42
  import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
@@ -54,8 +48,14 @@ import { triageFindings } from "./ai/triage.js";
54
48
  import { createLanguageModel } from "./ai/adapters/index.js";
55
49
  import { promptTriageFeedback } from "./ai/feedback-prompt.js";
56
50
  import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
57
- import { cachedFetch } from "./cache.js";
51
+ import { SCHEMA_VERSION } from "./types.js";
52
+ import { cachedFetch, pruneCache } from "./cache.js";
53
+ import { SSRFError, validateTargetHost } from "./ssrf-guard.js";
54
+ import { SAFE_MODE_PRESETS, resolveSafeModeKey } from "./safe-mode-preset.js";
55
+ import { FetchObserver, computeReadiness, detectDevServer } from "./fetch-observer.js";
56
+ import { BackpressureMonitor, OriginDegradedError } from "./backpressure.js";
58
57
  import { stratifiedSample } from "./stratified-sample.js";
58
+ import { classifySite } from "./site-classifier.js";
59
59
  import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
60
60
  const DEFAULTS = {
61
61
  nearDuplicateThreshold: 0.85,
@@ -67,10 +67,6 @@ const DEFAULTS = {
67
67
  uniqueValueMinWords: 100,
68
68
  metaUniquenessMinJaccard: 0.9,
69
69
  linkDepthMaxClicks: 3,
70
- hubPagesMinSiblings: 4,
71
- hubPagesMaxSiblings: 50,
72
- titleOverlapThreshold: 0.8,
73
- keywordCollisionMinShared: 6,
74
70
  templateCoverageMinPages: 5,
75
71
  answerFirstMaxWords: 100,
76
72
  citableFactsMin: 3,
@@ -80,18 +76,82 @@ const DEFAULTS = {
80
76
  modularityMinSelfContainedRatio: 0.7,
81
77
  faqMinQuestionHeadings: 2
82
78
  };
79
+ /**
80
+ * v0.4 four-category weights. Audit is diagnostic-only (weight 0).
81
+ * See 2026-04-29 v0.4 redesign spec §4.2.
82
+ */
83
83
  const CATEGORY_WEIGHTS = {
84
- spam: 0.33,
85
- content: 0.19,
86
- aeo: 0.14,
87
- links: 0.11,
88
- tech: 0.07,
89
- data: 0.06,
90
- schema: 0.05,
91
- cannibal: 0.05,
92
- /** Dedup / crawl hygiene; does not affect composite score. */
93
- audit: 0
84
+ integrity: 0.50, // spam + content + cannibal
85
+ discoverability: 0.20, // links + tech
86
+ citation: 0.25, // aeo + schema
87
+ data: 0.05, // data
88
+ audit: 0, // diagnostics, never weighted
94
89
  };
90
+ /**
91
+ * Maps the v0.3 ruleId namespace prefix to the v0.4 four-bucket category.
92
+ * Used by `scoreFromFindings` to bucket findings without changing rule IDs.
93
+ */
94
+ const CATEGORY_MAP = {
95
+ spam: "integrity",
96
+ content: "integrity",
97
+ cannibal: "integrity",
98
+ links: "discoverability",
99
+ tech: "discoverability",
100
+ aeo: "citation",
101
+ schema: "citation",
102
+ data: "data",
103
+ audit: "audit",
104
+ };
105
+ /** Slug map for `RuleResult.docsUrl`. Defaults to the rule-id segment after the `/`. */
106
+ const RULE_DOCS_SLUG = {
107
+ // intentionally empty for v0.4 — slug = ruleId.split("/").pop() works for every shipped rule
108
+ };
109
+ function docsUrlFor(ruleId) {
110
+ const slug = RULE_DOCS_SLUG[ruleId] ?? ruleId.split("/").pop() ?? ruleId;
111
+ return `https://pseolint.dev/rules/${slug}`;
112
+ }
113
+ /** Verdict ladder thresholds — see spec §4.4. */
114
+ function verdictForRisk(risk) {
115
+ if (risk <= 20)
116
+ return "ready";
117
+ if (risk <= 40)
118
+ return "caution";
119
+ if (risk <= 60)
120
+ return "concerning";
121
+ return "critical";
122
+ }
123
+ function gradeForPenalty(penalty) {
124
+ if (penalty <= 20)
125
+ return "A";
126
+ if (penalty <= 40)
127
+ return "B";
128
+ if (penalty <= 60)
129
+ return "C";
130
+ if (penalty <= 80)
131
+ return "D";
132
+ return "F";
133
+ }
134
+ /** True for `text/html` and `application/xhtml+xml` only (treat as audit-eligible content). */
135
+ function isHtmlContentType(contentType) {
136
+ if (!contentType)
137
+ return true; // Local files / unknown — assume HTML.
138
+ const lower = contentType.toLowerCase();
139
+ return lower.includes("text/html") || lower.includes("application/xhtml+xml");
140
+ }
141
+ /** Glob match against a URL pathname only (not the full URL). v0.4 spec §4.5. */
142
+ function globMatchPathname(pattern, urlOrPath) {
143
+ let pathname;
144
+ try {
145
+ pathname = new URL(urlOrPath).pathname;
146
+ }
147
+ catch {
148
+ // Not a URL — treat as already-a-path. Force a leading slash for consistency.
149
+ pathname = urlOrPath.startsWith("/") ? urlOrPath : `/${urlOrPath}`;
150
+ }
151
+ // Allow patterns that don't begin with "/" by normalising both sides.
152
+ const normPattern = pattern.startsWith("/") || pattern.startsWith("*") ? pattern : `/${pattern}`;
153
+ return matchGlob(normPattern, pathname) || matchGlob(pattern, pathname);
154
+ }
95
155
  const DEFAULT_ENTITY_PATTERNS = [
96
156
  {
97
157
  placeholder: "[STATE]",
@@ -156,9 +216,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
156
216
  if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
157
217
  findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
158
218
  }
159
- if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
160
- findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
161
- }
162
219
  if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
163
220
  findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
164
221
  }
@@ -183,9 +240,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
183
240
  if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
184
241
  findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
185
242
  }
186
- if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
187
- findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
188
- }
189
243
  // Tech rules
190
244
  if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
191
245
  findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
@@ -202,9 +256,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
202
256
  if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
203
257
  findings.push(...tag(soft404Rule(pages)));
204
258
  }
205
- if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
206
- findings.push(...tag(ogCompletenessRule(pages)));
207
- }
208
259
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
209
260
  findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
210
261
  }
@@ -240,9 +291,6 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
240
291
  targetFactsPerPage: resolvedRules.citableFactsTarget,
241
292
  })));
242
293
  }
243
- if (isEnabled("aeo/non-replicable-value")) {
244
- findings.push(...tag(nonReplicableValueRule(pages)));
245
- }
246
294
  if (isEnabled("aeo/content-modularity")) {
247
295
  findings.push(...tag(contentModularityRule(pages, {
248
296
  maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
@@ -252,13 +300,9 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
252
300
  if (isEnabled("aeo/summary-bait")) {
253
301
  findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
254
302
  }
255
- // Cannibal rules
256
- if (isEnabled("cannibal/title-overlap") && modeOk("cannibal/title-overlap")) {
257
- findings.push(...tag(titleOverlapRule(pages, entityPatterns, resolvedRules.titleOverlapThreshold)));
258
- }
259
- if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
260
- findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
261
- }
303
+ // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
304
+ // keyword-collision dropped due to high false-positive rates; see
305
+ // 2026-04-29 v0.4 redesign spec §4.3).
262
306
  if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
263
307
  findings.push(...tag(urlPatternRule(pages)));
264
308
  }
@@ -267,54 +311,110 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
267
311
  function hashHtml(html) {
268
312
  return createHash("sha256").update(html, "utf8").digest("hex");
269
313
  }
314
+ const SEVERITY_WEIGHTS = {
315
+ critical: 40,
316
+ error: 25,
317
+ warning: 12,
318
+ info: 5,
319
+ };
270
320
  function scoreFromFindings(findings) {
271
- const severityWeights = {
272
- critical: 40,
273
- error: 25,
274
- warning: 12,
275
- info: 5
321
+ // v0.4 four-bucket raw penalties.
322
+ const bucketRaw = {
323
+ integrity: 0,
324
+ discoverability: 0,
325
+ citation: 0,
326
+ data: 0,
327
+ audit: 0,
276
328
  };
277
- const raw = {
278
- spam: 0,
279
- content: 0,
280
- aeo: 0,
281
- links: 0,
282
- tech: 0,
329
+ const bucketIssues = {
330
+ integrity: 0,
331
+ discoverability: 0,
332
+ citation: 0,
283
333
  data: 0,
284
- schema: 0,
285
- cannibal: 0,
286
- audit: 0
334
+ audit: 0,
287
335
  };
336
+ let blockers = 0;
337
+ let shouldFix = 0;
338
+ let informational = 0;
288
339
  for (const finding of findings) {
289
- const category = finding.ruleId.split("/")[0];
290
- if (!(category in raw)) {
340
+ const namespace = finding.ruleId.split("/")[0];
341
+ const bucket = CATEGORY_MAP[namespace];
342
+ if (!bucket)
291
343
  continue;
344
+ const weight = SEVERITY_WEIGHTS[finding.severity];
345
+ // v0.4 buckets.
346
+ bucketRaw[bucket] = Math.min(100, bucketRaw[bucket] + weight);
347
+ if (bucket !== "audit") {
348
+ bucketIssues[bucket] += 1;
292
349
  }
293
- raw[category] = Math.min(100, raw[category] + severityWeights[finding.severity]);
294
- }
295
- const weighted = raw.spam * CATEGORY_WEIGHTS.spam +
296
- raw.content * CATEGORY_WEIGHTS.content +
297
- raw.aeo * CATEGORY_WEIGHTS.aeo +
298
- raw.links * CATEGORY_WEIGHTS.links +
299
- raw.tech * CATEGORY_WEIGHTS.tech +
300
- raw.data * CATEGORY_WEIGHTS.data +
301
- raw.schema * CATEGORY_WEIGHTS.schema +
302
- raw.cannibal * CATEGORY_WEIGHTS.cannibal +
303
- raw.audit * CATEGORY_WEIGHTS.audit;
350
+ // Issue-bucket counts (audit/* findings are diagnostic-only and excluded).
351
+ if (bucket === "audit")
352
+ continue;
353
+ if (finding.severity === "critical" || finding.severity === "error")
354
+ blockers += 1;
355
+ else if (finding.severity === "warning")
356
+ shouldFix += 1;
357
+ else
358
+ informational += 1;
359
+ }
360
+ const weighted = bucketRaw.integrity * CATEGORY_WEIGHTS.integrity +
361
+ bucketRaw.discoverability * CATEGORY_WEIGHTS.discoverability +
362
+ bucketRaw.citation * CATEGORY_WEIGHTS.citation +
363
+ bucketRaw.data * CATEGORY_WEIGHTS.data;
364
+ const risk = Math.round(Math.min(100, weighted));
365
+ const categories = {
366
+ integrity: { grade: gradeForPenalty(bucketRaw.integrity), issues: bucketIssues.integrity },
367
+ discoverability: { grade: gradeForPenalty(bucketRaw.discoverability), issues: bucketIssues.discoverability },
368
+ citation: { grade: gradeForPenalty(bucketRaw.citation), issues: bucketIssues.citation },
369
+ data: { grade: gradeForPenalty(bucketRaw.data), issues: bucketIssues.data },
370
+ audit: { grade: "A", issues: 0 },
371
+ };
304
372
  return {
305
- score: Math.round(Math.min(100, weighted)),
306
- categoryScores: {
307
- spam: raw.spam,
308
- content: raw.content,
309
- aeo: raw.aeo,
310
- links: raw.links,
311
- tech: raw.tech,
312
- data: raw.data,
313
- schema: raw.schema,
314
- cannibal: raw.cannibal
315
- }
373
+ risk,
374
+ categories,
375
+ bucketCounts: { blockers, shouldFix, informational },
316
376
  };
317
377
  }
378
+ function bucketIssues(findings) {
379
+ const blockers = [];
380
+ const shouldFix = [];
381
+ const informational = [];
382
+ for (const f of findings) {
383
+ // audit/* findings are diagnostics and never appear in issue buckets.
384
+ if (f.ruleId.startsWith("audit/"))
385
+ continue;
386
+ if (f.severity === "critical" || f.severity === "error")
387
+ blockers.push(f);
388
+ else if (f.severity === "warning")
389
+ shouldFix.push(f);
390
+ else
391
+ informational.push(f);
392
+ }
393
+ return { blockers, shouldFix, informational };
394
+ }
395
+ function buildHeadline(counts) {
396
+ const parts = [];
397
+ if (counts.blockers > 0) {
398
+ parts.push(`${counts.blockers} ship-blocker${counts.blockers === 1 ? "" : "s"}`);
399
+ }
400
+ if (counts.shouldFix > 0) {
401
+ parts.push(`${counts.shouldFix} should-fix`);
402
+ }
403
+ if (counts.informational > 0 && parts.length < 2) {
404
+ parts.push(`${counts.informational} informational`);
405
+ }
406
+ if (parts.length === 0)
407
+ return "No issues detected.";
408
+ return parts.join(", ");
409
+ }
410
+ /** Populate `docsUrl` on every finding that doesn't already have one. */
411
+ function withDocsUrls(findings) {
412
+ for (const f of findings) {
413
+ if (!f.docsUrl)
414
+ f.docsUrl = docsUrlFor(f.ruleId);
415
+ }
416
+ return findings;
417
+ }
318
418
  async function collectHtmlFiles(directory) {
319
419
  const entries = await readdir(directory, { withFileTypes: true });
320
420
  const files = await Promise.all(entries.map(async (entry) => {
@@ -330,10 +430,30 @@ async function collectHtmlFiles(directory) {
330
430
  }));
331
431
  return files.flat();
332
432
  }
333
- async function fetchWithRetry(url, timeoutMs, cache, stats) {
433
+ /**
434
+ * Combine up to N AbortSignals into one. The returned signal aborts as soon
435
+ * as any input aborts. Avoids the node-only `AbortSignal.any` for wider
436
+ * compatibility and keeps listeners weak-ish (one per input, no unbounded
437
+ * listener growth).
438
+ */
439
+ function composeSignals(...signals) {
440
+ const actual = signals.filter((s) => Boolean(s));
441
+ if (actual.length === 0)
442
+ return new AbortController().signal;
443
+ const ac = new AbortController();
444
+ for (const s of actual) {
445
+ if (s.aborted) {
446
+ ac.abort(s.reason);
447
+ return ac.signal;
448
+ }
449
+ s.addEventListener("abort", () => ac.abort(s.reason), { once: true });
450
+ }
451
+ return ac.signal;
452
+ }
453
+ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
334
454
  try {
335
455
  stats.total += 1;
336
- const r = await cachedFetch(url, { timeoutMs, cache });
456
+ const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
337
457
  if (r.fromCache) {
338
458
  stats.hits += 1;
339
459
  stats.bytesSavedEstimate += r.body.length;
@@ -342,14 +462,16 @@ async function fetchWithRetry(url, timeoutMs, cache, stats) {
342
462
  return null;
343
463
  return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
344
464
  }
345
- catch {
465
+ catch (err) {
466
+ if (signal?.aborted)
467
+ throw err; // propagate cancellation
346
468
  return null;
347
469
  }
348
470
  }
349
- async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
471
+ async function fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects = true) {
350
472
  try {
351
473
  stats.total += 1;
352
- const r = await cachedFetch(url, { timeoutMs, cache });
474
+ const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, followRedirects, onObservation: stats.onObservation });
353
475
  if (r.fromCache) {
354
476
  stats.hits += 1;
355
477
  stats.bytesSavedEstimate += r.body.length;
@@ -366,13 +488,15 @@ async function fetchPageWithMeta(url, timeoutMs, cache, stats) {
366
488
  },
367
489
  };
368
490
  }
369
- catch {
491
+ catch (err) {
492
+ if (signal?.aborted)
493
+ throw err;
370
494
  return null;
371
495
  }
372
496
  }
373
- async function fetchTextStrict(url, timeoutMs, cache, stats) {
497
+ async function fetchTextStrict(url, timeoutMs, cache, stats, signal, validateHop) {
374
498
  stats.total += 1;
375
- const r = await cachedFetch(url, { timeoutMs, cache });
499
+ const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
376
500
  if (r.fromCache) {
377
501
  stats.hits += 1;
378
502
  stats.bytesSavedEstimate += r.body.length;
@@ -455,8 +579,13 @@ function matchGlob(pattern, value) {
455
579
  function shouldIgnore(url, patterns) {
456
580
  if (patterns.length === 0)
457
581
  return false;
582
+ // v0.4 §4.5: globs match against the URL pathname only, NOT the full URL.
583
+ // Operator intuition: `ignore: ["dashboard/**"]` should match
584
+ // `https://example.com/dashboard/...` even though the full URL contains the
585
+ // host. Previously globs matched the full URL and silently failed for users
586
+ // who didn't think to write `**/dashboard/**`.
458
587
  for (const pattern of patterns) {
459
- if (matchGlob(pattern, url))
588
+ if (globMatchPathname(pattern, url))
460
589
  return true;
461
590
  }
462
591
  return false;
@@ -469,7 +598,7 @@ function fisherYatesSample(items, n) {
469
598
  }
470
599
  return arr.slice(arr.length - n);
471
600
  }
472
- async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats) {
601
+ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
473
602
  visited.add(sitemapUrl);
474
603
  const locs = parseSitemapUrls(sitemapText);
475
604
  if (!isSitemapIndex(sitemapText)) {
@@ -477,27 +606,32 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
477
606
  }
478
607
  const allUrls = [];
479
608
  for (const childUrl of locs) {
609
+ if (signal?.aborted)
610
+ throw signal.reason ?? new Error("aborted");
480
611
  if (visited.has(childUrl))
481
612
  continue;
482
- const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats);
613
+ const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
483
614
  if (!child)
484
615
  continue;
485
616
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
486
617
  if (!childLike)
487
618
  continue;
488
- const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats);
619
+ const childUrls = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
489
620
  allUrls.push(...childUrls);
490
621
  }
491
622
  return allUrls;
492
623
  }
493
- async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
624
+ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
494
625
  if (!origin)
495
626
  return { disallow: [], crawlDelaySec: 0 };
496
627
  try {
497
628
  const robotsUrl = `${origin}/robots.txt`;
498
- const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
629
+ const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
630
+ // Honor both the wildcard block AND any block specifically targeting us.
631
+ // A malicious target can't bypass our crawler by adding a targeted
632
+ // `User-agent: pseolint / Disallow: /` without a wildcard.
499
633
  return {
500
- disallow: parseDisallowPatterns(fetched.text),
634
+ disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
501
635
  crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
502
636
  };
503
637
  }
@@ -518,13 +652,42 @@ function isDisallowedByRobots(urlPath, patterns) {
518
652
  function budgetExceeded(b) {
519
653
  return b.cap > 0 && b.used >= b.cap;
520
654
  }
521
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
655
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000) {
656
+ // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
657
+ // audit (source, sitemap entries, redirects, discovered links) goes through
658
+ // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
659
+ // one origin does 1 DNS lookup, not 4k.
660
+ const ssrfCache = new Map();
661
+ const validateHop = guardSsrf
662
+ ? async (u) => {
663
+ let host;
664
+ try {
665
+ host = new URL(u).hostname;
666
+ }
667
+ catch {
668
+ throw new Error(`Refusing to fetch invalid URL: ${u}`);
669
+ }
670
+ let pending = ssrfCache.get(host);
671
+ if (!pending) {
672
+ pending = validateTargetHost(host).catch((err) => {
673
+ if (err instanceof SSRFError) {
674
+ throw new Error(`Refusing to fetch ${u}: ${err.reason}`);
675
+ }
676
+ throw err;
677
+ });
678
+ ssrfCache.set(host, pending);
679
+ }
680
+ await pending;
681
+ }
682
+ : undefined;
522
683
  if (/^https?:\/\//i.test(source)) {
684
+ if (validateHop)
685
+ await validateHop(source);
523
686
  let text;
524
687
  let contentType;
525
688
  let sourceStatus = 200;
526
689
  try {
527
- const fetched = await fetchTextStrict(source, timeoutMs, cache, stats);
690
+ const fetched = await fetchTextStrict(source, timeoutMs, cache, stats, signal, validateHop);
528
691
  text = fetched.text;
529
692
  contentType = fetched.contentType;
530
693
  }
@@ -533,7 +696,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
533
696
  if (source.includes("sitemap")) {
534
697
  try {
535
698
  const origin = new URL(source).origin;
536
- const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats);
699
+ const fallback = await fetchTextStrict(origin, timeoutMs, cache, stats, signal, validateHop);
537
700
  text = fallback.text;
538
701
  contentType = fallback.contentType;
539
702
  sourceStatus = -1; // flag that we fell back
@@ -549,7 +712,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
549
712
  const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
550
713
  if (isXml) {
551
714
  const visited = new Set();
552
- const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats);
715
+ const allSitemapUrls = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
553
716
  // If we have a budget, sample from sitemap URLs before fetching
554
717
  const urlsToFetch = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
555
718
  ? fisherYatesSample(allSitemapUrls, discoveryBudget)
@@ -562,13 +725,29 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
562
725
  catch {
563
726
  return "";
564
727
  } })();
565
- const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
728
+ const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
566
729
  const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
567
730
  const delayMs = robots.crawlDelaySec * 1000;
568
731
  await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
569
732
  if (budgetExceeded(byteBudget))
570
733
  return;
571
- const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
734
+ // Honor robots.txt for our own crawl when respectRobotsTxt is on (default).
735
+ // The existing robotsComplianceRule flags sitemap-vs-robots conflicts as
736
+ // findings; this actually refuses to fetch the disallowed URL. Keeps us
737
+ // legally defensible (we are a bot, our UA `pseolint` is public, and we
738
+ // respect Disallow directives) and removes the "crawler-for-hire" abuse
739
+ // vector when the library is invoked from a hosted service.
740
+ if (respectRobotsTxt) {
741
+ try {
742
+ const p = new URL(url).pathname;
743
+ if (isDisallowedByRobots(p, robots.disallow)) {
744
+ skippedByRobots.push(url);
745
+ return;
746
+ }
747
+ }
748
+ catch { /* URL parse failed — fall through, fetch will fail naturally */ }
749
+ }
750
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
572
751
  if (result) {
573
752
  byteBudget.used += result.html.length;
574
753
  pages.push(result);
@@ -587,9 +766,16 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
587
766
  const discoveredUrls = new Set();
588
767
  // robots already fetched above; reuse its Disallow patterns here.
589
768
  const disallowPatterns = robots.disallow;
590
- for (const page of pages) {
769
+ let discoveryCeilingReached = false;
770
+ outer: for (const page of pages) {
591
771
  const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
592
772
  for (const match of linkMatches) {
773
+ if (discoveredUrls.size >= maxCrawlDiscovered) {
774
+ // Hard ceiling — don't let a malicious site with many self-links
775
+ // extend crawl discovery up to the byte budget.
776
+ discoveryCeilingReached = true;
777
+ break outer;
778
+ }
593
779
  const href = match[1];
594
780
  if (!href || href.startsWith("#") || /^mailto:|^tel:|^javascript:|^data:/i.test(href))
595
781
  continue;
@@ -614,6 +800,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
614
800
  }
615
801
  }
616
802
  }
803
+ if (discoveryCeilingReached) {
804
+ // eslint-disable-next-line no-console
805
+ console.error(`pseolint: crawl discovery hit maxCrawlDiscovered=${maxCrawlDiscovered} ceiling; sampling from the first ${discoveredUrls.size} URLs.`);
806
+ }
617
807
  if (discoveredUrls.size > 0) {
618
808
  const candidates = Array.from(discoveredUrls);
619
809
  // Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
@@ -623,7 +813,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
623
813
  await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
624
814
  if (budgetExceeded(byteBudget))
625
815
  return;
626
- const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
816
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
627
817
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
628
818
  byteBudget.used += result.html.length;
629
819
  pages.push(result);
@@ -700,7 +890,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
700
890
  }
701
891
  const newPages = [];
702
892
  await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
703
- const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
893
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
704
894
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
705
895
  newPages.push(result);
706
896
  knownCrawled.add(url);
@@ -744,10 +934,63 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
744
934
  export async function auditSource(source, options) {
745
935
  const runId = generateRunId();
746
936
  const runStartedAt = Date.now();
747
- const concurrency = options?.concurrency ?? 5;
937
+ // Apply safeMode preset first, then let explicit options override it. Using
938
+ // `??` preserves the "not set" vs "explicitly false" distinction — a user
939
+ // who picks safeMode="saas" but passes `guardSsrf: false` gets the explicit
940
+ // override. Localhost sources auto-promote to the `dev` preset unless the
941
+ // caller explicitly set `safeMode` or passed `autoDevPreset: false`.
942
+ const presetKey = resolveSafeModeKey(source, options);
943
+ const preset = SAFE_MODE_PRESETS[presetKey];
944
+ const concurrency = options?.concurrency ?? preset.concurrency ?? 5;
748
945
  const timeoutMs = options?.timeout ?? 30000;
749
946
  const ignorePatterns = options?.ignore ?? [];
750
- const sampleSize = options?.sampleSize ?? 0;
947
+ const sampleSize = options?.sampleSize ?? preset.sampleSize ?? 0;
948
+ const externalSignal = options?.signal;
949
+ const guardSsrf = options?.guardSsrf ?? preset.guardSsrf ?? false;
950
+ const respectRobotsTxt = options?.respectRobotsTxt ?? preset.respectRobotsTxt ?? true;
951
+ const followRedirects = options?.followRedirects ?? preset.followRedirects ?? true;
952
+ const maxCrawlDiscovered = options?.maxCrawlDiscovered ?? preset.maxCrawlDiscovered ?? 5000;
953
+ const skippedByRobots = [];
954
+ // Backpressure: watch TTFB + 5xx rate during the crawl and abort if the
955
+ // origin looks degraded. The audit signal is a composite of the caller's
956
+ // signal (ctrl-C, parent timeout) and the monitor's abort controller.
957
+ const backpressureEnabled = options?.backpressure !== false;
958
+ const backpressureAbort = new AbortController();
959
+ let backpressureError = null;
960
+ const signal = composeSignals(externalSignal, backpressureAbort.signal);
961
+ const observer = new FetchObserver();
962
+ const monitor = backpressureEnabled
963
+ ? new BackpressureMonitor({
964
+ warmupSize: 10,
965
+ absoluteP95Ms: 3000,
966
+ baselineMultiplier: 2,
967
+ errorRatioThreshold: 0.1,
968
+ })
969
+ : null;
970
+ // v0.4: framework gets set on the first observation that carries headers
971
+ // (the source URL fetch). Backpressure thresholds and computeReadiness use
972
+ // it to soften limits when auditing a dev server.
973
+ let detectedFramework = null;
974
+ const onObservation = (obs) => {
975
+ if (detectedFramework === null && obs.headers) {
976
+ detectedFramework = detectDevServer(obs.headers);
977
+ }
978
+ observer.record(obs);
979
+ if (!monitor)
980
+ return;
981
+ const decision = monitor.record(obs);
982
+ if (decision.shouldAbort && !backpressureError && decision.snapshot) {
983
+ backpressureError = new OriginDegradedError(decision.reason ?? "", decision.snapshot);
984
+ backpressureAbort.abort(backpressureError);
985
+ }
986
+ };
987
+ function throwIfAborted() {
988
+ if (backpressureError)
989
+ throw backpressureError;
990
+ if (externalSignal?.aborted) {
991
+ throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
992
+ }
993
+ }
751
994
  const resolvedRules = {
752
995
  nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
753
996
  entitySwapThreshold: options?.rules?.entitySwapThreshold ?? DEFAULTS.entitySwapThreshold,
@@ -758,10 +1001,6 @@ export async function auditSource(source, options) {
758
1001
  uniqueValueMinWords: options?.rules?.uniqueValueMinWords ?? DEFAULTS.uniqueValueMinWords,
759
1002
  metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
760
1003
  linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
761
- hubPagesMinSiblings: options?.rules?.hubPagesMinSiblings ?? DEFAULTS.hubPagesMinSiblings,
762
- hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
763
- titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
764
- keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
765
1004
  templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
766
1005
  answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
767
1006
  citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
@@ -783,18 +1022,47 @@ export async function auditSource(source, options) {
783
1022
  const discoveryBudget = options?.sampleSize && options.sampleSize > 0
784
1023
  ? Math.max(50, options.sampleSize * 2)
785
1024
  : 0;
786
- const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0 };
1025
+ const cacheStats = { hits: 0, total: 0, bytesSavedEstimate: 0, onObservation };
787
1026
  const cacheConfig = options?.cache
788
1027
  ? {
789
1028
  dir: options.cache.dir ?? ".pseolint/cache",
790
1029
  ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
791
1030
  }
792
1031
  : null;
1032
+ // Size cap (post-audit eviction). Default 200 MB keeps pSEO-scale sites in check;
1033
+ // a single full crawl of a 5k-page site averages ~250 KB per body = ~1.25 GB uncapped.
1034
+ const cacheMaxBytes = options?.cache?.maxBytes ?? 209_715_200;
793
1035
  const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
794
- const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
1036
+ const maxFetchBytes = options?.maxFetchBytes ?? preset.maxFetchBytes ?? 52_428_800;
795
1037
  const fetchByteBudget = { used: 0, cap: maxFetchBytes };
796
- const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget);
1038
+ // v0.4 §4.7: detectedFramework is set in onObservation above, as a side effect
1039
+ // of the normal source URL fetch. No separate probe needed.
1040
+ const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered);
1041
+ throwIfAborted();
797
1042
  const loadedPages = [...loadedPagesRaw];
1043
+ // v0.4 §4.7: content-type-aware crawling. Filter out fetched URLs whose
1044
+ // body does not look like HTML (a stand-in for checking Content-Type: text/html or application/xhtml+xml).
1045
+ // Binary routes like /apple-icon, /opengraph-image, /icon get pushed to
1046
+ // crawlStats.skipped instead of being parsed as thin-content pages.
1047
+ const skippedByContentType = [];
1048
+ const htmlOnlyPages = [];
1049
+ for (const p of loadedPages) {
1050
+ // httpMeta is set on URL fetches; locally-loaded files have no httpMeta
1051
+ // and are always HTML by definition (collectHtmlFiles only picks .html).
1052
+ // We don't have content-type on the LoadedPage object. Heuristic: if html
1053
+ // body doesn't contain any HTML markers, treat as non-HTML.
1054
+ if (!p.httpMeta) {
1055
+ htmlOnlyPages.push(p);
1056
+ continue;
1057
+ }
1058
+ if (looksLikeHtml(p.html)) {
1059
+ htmlOnlyPages.push(p);
1060
+ }
1061
+ else {
1062
+ skippedByContentType.push(p.url);
1063
+ }
1064
+ }
1065
+ loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
798
1066
  if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
799
1067
  console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
800
1068
  }
@@ -830,7 +1098,7 @@ export async function auditSource(source, options) {
830
1098
  if (/^https?:\/\//i.test(source)) {
831
1099
  try {
832
1100
  const origin = new URL(source).origin;
833
- const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats);
1101
+ const result = await fetchWithRetry(`${origin}/robots.txt`, timeoutMs, cacheConfig, cacheStats, signal);
834
1102
  if (result)
835
1103
  robotsTxtContent = result.text;
836
1104
  }
@@ -918,11 +1186,50 @@ export async function auditSource(source, options) {
918
1186
  }),
919
1187
  ]
920
1188
  : DEFAULT_ENTITY_PATTERNS;
1189
+ // v0.4 §4.11 — pre-flight site classification. We compute this BEFORE the
1190
+ // rule pipeline so the dispatcher can skip pSEO-only rules on small
1191
+ // marketing sites / blogs. Classification is computed off the FULL
1192
+ // discovered URL set (sitemap when available, else loaded URLs). This
1193
+ // matters: a sampled crawl of a 5000-page directory must still classify
1194
+ // as `programmatic-directory`, not `unclear`.
1195
+ const classifierUrls = (() => {
1196
+ if (sitemapUrlSet && sitemapUrlSet.size > 0) {
1197
+ return Array.from(sitemapUrlSet);
1198
+ }
1199
+ return loadedPagesRaw.map((p) => p.url);
1200
+ })();
1201
+ const classifierFramework = detectedFramework ?? "unknown";
1202
+ const computedClassification = classifySite({
1203
+ urls: classifierUrls,
1204
+ framework: classifierFramework,
1205
+ });
1206
+ // `--strict` (or AuditOptions.strict) keeps the classification but forces
1207
+ // every rule to run regardless of detected site type.
1208
+ const siteClassification = options?.strict
1209
+ ? { ...computedClassification, suppressedRules: [] }
1210
+ : computedClassification;
1211
+ const suppressedRuleSet = new Set(siteClassification.suppressedRules);
921
1212
  // Classify pages into groups and run only enabled rules per group
922
1213
  const classified = classifyPages(parsedPages, options?.pageGroups);
923
1214
  const allFindings = [...duplicateUrlFindings];
924
1215
  const groupScores = {};
925
1216
  const groupPageCounts = {};
1217
+ // Surface robots-skipped URLs so users don't silently get a smaller audit
1218
+ // than expected. One rollup finding (not per-URL) to avoid flooding the
1219
+ // output on large sites. Also included on summary.skippedUrls below.
1220
+ if (skippedByRobots.length > 0) {
1221
+ allFindings.push({
1222
+ ruleId: "audit/skipped-by-robots",
1223
+ severity: "info",
1224
+ message: `Skipped ${skippedByRobots.length} sitemap URL${skippedByRobots.length === 1 ? "" : "s"} because the target's robots.txt Disallow'd them: ${skippedByRobots.slice(0, 5).join(", ")}${skippedByRobots.length > 5 ? ", …" : ""}.`,
1225
+ fix: "If you own this site and want to audit these URLs anyway, pass `respectRobotsTxt: false` (or remove the Disallow directive).",
1226
+ relatedUrls: skippedByRobots,
1227
+ });
1228
+ }
1229
+ // v0.4 §4.4: origin readiness is now diagnostic-only. The previous
1230
+ // `audit/origin-readiness` finding emission was retired — the structured
1231
+ // ReadinessReport in `summary.diagnostics.originReadiness` is the canonical
1232
+ // signal now (no double-counting in the issue buckets).
926
1233
  const auditMode = options?.mode ?? "full";
927
1234
  // Site-wide rules (run once, outside group loop)
928
1235
  if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
@@ -959,39 +1266,83 @@ export async function auditSource(source, options) {
959
1266
  if (groupConfig?.rules !== undefined && groupConfig.rules.length === 0)
960
1267
  continue;
961
1268
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
962
- const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
1269
+ const enabledCheck = (ruleId) => !suppressedRuleSet.has(ruleId) && isRuleEnabled(ruleId, groupConfig?.rules);
963
1270
  const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
964
1271
  allFindings.push(...findings);
965
1272
  groupPageCounts[groupName] = groupPages.length;
966
- const { score } = scoreFromFindings(findings);
967
- groupScores[groupName] = score;
1273
+ const { risk: groupRisk } = scoreFromFindings(findings);
1274
+ groupScores[groupName] = groupRisk;
968
1275
  }
1276
+ throwIfAborted();
969
1277
  // Enrich findings: cluster pairwise, detect templates, assign effort
970
1278
  const enriched = enrichFindings(allFindings, parsedPages, {
971
1279
  templateGenerated: options?.templateGenerated,
972
1280
  });
973
- const { score, categoryScores } = scoreFromFindings(enriched.findings);
1281
+ // Populate docsUrl on every finding before they leave the engine.
1282
+ withDocsUrls(enriched.findings);
1283
+ const { risk, categories, bucketCounts } = scoreFromFindings(enriched.findings);
974
1284
  const auditedPageCount = Object.values(groupPageCounts).reduce((a, b) => a + b, 0);
1285
+ const issues = bucketIssues(enriched.findings);
1286
+ const verdict = verdictForRisk(risk);
1287
+ const headline = buildHeadline(bucketCounts);
1288
+ // audit/* findings are diagnostic-only and never appear in summary.issues.
1289
+ // Surface them under diagnostics so consumers (telemetry, debug UIs) can
1290
+ // still see what was deduped or skipped.
1291
+ const auditFindings = enriched.findings.filter((f) => f.ruleId.startsWith("audit/"));
1292
+ const readinessReport = computeReadiness(observer.getAll(), { detectedFramework });
1293
+ const crawlStats = {
1294
+ discovered: discoveredUrlCount ?? loadedPagesRaw.length,
1295
+ fetched: parsedPages.length,
1296
+ skipped: skippedByContentType.length + skippedByRobots.length + skippedUrls.length,
1297
+ };
975
1298
  const summary = {
976
- score,
977
- categoryScores,
1299
+ schemaVersion: SCHEMA_VERSION,
1300
+ verdict,
1301
+ risk,
1302
+ headline,
1303
+ categories,
1304
+ issues,
1305
+ siteClassification,
1306
+ diagnostics: {
1307
+ originReadiness: readinessReport,
1308
+ crawlStats,
1309
+ auditFindings,
1310
+ },
978
1311
  groupScores: options?.pageGroups ? groupScores : undefined,
979
1312
  groupPageCounts: options?.pageGroups ? groupPageCounts : undefined,
980
1313
  pageCount: auditedPageCount || parsedPages.length,
981
- findings: enriched.findings,
982
1314
  templateDetected: enriched.templateDetected,
983
1315
  rawFindingCount: enriched.rawFindingCount,
984
1316
  };
985
1317
  if (cacheConfig) {
986
1318
  summary.cacheStats = cacheStats;
987
1319
  }
988
- if (skippedUrls.length > 0) {
989
- summary.skippedUrls = skippedUrls;
1320
+ // v0.4 §4.5: warn when an `ignore` pattern matched zero discovered URLs.
1321
+ if (ignorePatterns.length > 0) {
1322
+ for (const pattern of ignorePatterns) {
1323
+ const matched = deduped.some((p) => globMatchPathname(pattern, p.url));
1324
+ if (!matched) {
1325
+ // eslint-disable-next-line no-console
1326
+ console.warn(`[pseolint] ignore pattern '${pattern}' matched 0 URLs — likely typo`);
1327
+ }
1328
+ }
990
1329
  }
1330
+ // Merge state-skipped (unchanged since last run) and robots-skipped (target
1331
+ // robots.txt Disallow'd) URLs so callers have a single audit-skipped surface.
1332
+ const allSkipped = [...skippedUrls, ...skippedByRobots];
1333
+ if (allSkipped.length > 0) {
1334
+ summary.skippedUrls = allSkipped;
1335
+ }
1336
+ // Local flat view of every finding the engine produced, used internally for
1337
+ // state persistence, regression detection, AI triage input, and telemetry
1338
+ // counts. NOT exposed on the AuditSummary — consumers must use
1339
+ // `summary.issues.{blockers,shouldFix,informational}` and
1340
+ // `summary.diagnostics.auditFindings`.
1341
+ const enrichedFindings = enriched.findings;
991
1342
  if (priorState && options?.state?.exitOnRegression) {
992
1343
  let hasRegression = false;
993
1344
  const currentFindings = new Map();
994
- for (const f of summary.findings) {
1345
+ for (const f of enrichedFindings) {
995
1346
  if (!f.pageUrl)
996
1347
  continue;
997
1348
  const set = currentFindings.get(f.pageUrl) ?? new Set();
@@ -1019,7 +1370,7 @@ export async function auditSource(source, options) {
1019
1370
  const renderMode = options.render ? "rendered" : "static";
1020
1371
  const urls = {};
1021
1372
  const findingsByUrl = new Map();
1022
- for (const f of summary.findings) {
1373
+ for (const f of enrichedFindings) {
1023
1374
  if (!f.pageUrl)
1024
1375
  continue;
1025
1376
  const list = findingsByUrl.get(f.pageUrl) ?? [];
@@ -1051,9 +1402,10 @@ export async function auditSource(source, options) {
1051
1402
  renderMode,
1052
1403
  urls,
1053
1404
  summary: {
1054
- score: summary.score,
1055
- totalFindings: summary.findings.length,
1056
- byCategory: Object.fromEntries(Object.entries(summary.categoryScores).map(([k, v]) => [k, v])),
1405
+ score: summary.risk,
1406
+ totalFindings: enrichedFindings.length,
1407
+ byCategory: Object.fromEntries(Object.entries(summary.categories)
1408
+ .map(([k, v]) => [k, v.issues])),
1057
1409
  },
1058
1410
  };
1059
1411
  await writeState(statePath, newState);
@@ -1089,7 +1441,8 @@ export async function auditSource(source, options) {
1089
1441
  spentTodayUsd = 0;
1090
1442
  }
1091
1443
  }
1092
- const outcome = await triageFindings(summary.findings, summary.pageCount, {
1444
+ throwIfAborted();
1445
+ const outcome = await triageFindings(enrichedFindings, summary.pageCount, {
1093
1446
  enabled: true,
1094
1447
  model: resolved.model,
1095
1448
  providerId: resolved.providerId,
@@ -1124,9 +1477,9 @@ export async function auditSource(source, options) {
1124
1477
  runId,
1125
1478
  timestamp: new Date().toISOString(),
1126
1479
  durationMs: Date.now() - runStartedAt,
1127
- score: summary.score,
1480
+ score: summary.risk,
1128
1481
  pageCount: summary.pageCount,
1129
- findingCount: summary.findings.length,
1482
+ findingCount: enrichedFindings.length,
1130
1483
  ...(summary.rawFindingCount !== undefined && { rawFindingCount: summary.rawFindingCount }),
1131
1484
  ...(summary.templateDetected !== undefined && { templateDetected: summary.templateDetected }),
1132
1485
  ...(summary.cacheStats && { cacheStats: summary.cacheStats }),
@@ -1181,7 +1534,19 @@ export async function auditSource(source, options) {
1181
1534
  }
1182
1535
  const aiHintEnabled = options?.ai?.suggest !== false;
1183
1536
  if (aiHintEnabled && !options?.ai?.enabled && process.env.ANTHROPIC_API_KEY) {
1184
- console.error(`💡 AI triage available — re-run with --ai to prioritize ${summary.findings.length} findings into a fix list.`);
1537
+ console.error(`💡 AI triage available — re-run with --ai to prioritize ${enrichedFindings.length} findings into a fix list.`);
1538
+ }
1539
+ if (cacheConfig && cacheMaxBytes > 0) {
1540
+ try {
1541
+ const pruneResult = await pruneCache(cacheConfig.dir, cacheMaxBytes);
1542
+ if (pruneResult.removedEntries > 0 || pruneResult.removedTmpFiles > 0) {
1543
+ const freedMb = ((pruneResult.before.bytes - pruneResult.after.bytes) / 1024 / 1024).toFixed(1);
1544
+ console.error(`pseolint: cache prune freed ${freedMb} MB (${pruneResult.removedEntries} entries, ${pruneResult.removedTmpFiles} .tmp files); size=${(pruneResult.after.bytes / 1024 / 1024).toFixed(1)}MB / cap=${(cacheMaxBytes / 1024 / 1024).toFixed(0)}MB`);
1545
+ }
1546
+ }
1547
+ catch {
1548
+ // Non-fatal: eviction failure must not break the audit.
1549
+ }
1185
1550
  }
1186
1551
  return summary;
1187
1552
  }