@pseolint/core 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +84 -15
  2. package/dist/ai/prompt.d.ts +1 -1
  3. package/dist/ai/prompt.d.ts.map +1 -1
  4. package/dist/ai/prompt.js +13 -1
  5. package/dist/ai/prompt.js.map +1 -1
  6. package/dist/auditor.d.ts.map +1 -1
  7. package/dist/auditor.js +197 -63
  8. package/dist/auditor.js.map +1 -1
  9. package/dist/cache.d.ts.map +1 -1
  10. package/dist/cache.js +38 -2
  11. package/dist/cache.js.map +1 -1
  12. package/dist/formatters/console.d.ts +9 -0
  13. package/dist/formatters/console.d.ts.map +1 -1
  14. package/dist/formatters/console.js +53 -0
  15. package/dist/formatters/console.js.map +1 -1
  16. package/dist/formatters/html.d.ts.map +1 -1
  17. package/dist/formatters/html.js +363 -135
  18. package/dist/formatters/html.js.map +1 -1
  19. package/dist/index.d.ts +10 -0
  20. package/dist/index.d.ts.map +1 -1
  21. package/dist/index.js +9 -0
  22. package/dist/index.js.map +1 -1
  23. package/dist/rule-references.d.ts.map +1 -1
  24. package/dist/rule-references.js +8 -0
  25. package/dist/rule-references.js.map +1 -1
  26. package/dist/rules/aeo/answer-first.d.ts +18 -0
  27. package/dist/rules/aeo/answer-first.d.ts.map +1 -0
  28. package/dist/rules/aeo/answer-first.js +191 -0
  29. package/dist/rules/aeo/answer-first.js.map +1 -0
  30. package/dist/rules/aeo/citable-facts.d.ts +9 -0
  31. package/dist/rules/aeo/citable-facts.d.ts.map +1 -0
  32. package/dist/rules/aeo/citable-facts.js +90 -0
  33. package/dist/rules/aeo/citable-facts.js.map +1 -0
  34. package/dist/rules/aeo/content-modularity.d.ts +11 -0
  35. package/dist/rules/aeo/content-modularity.d.ts.map +1 -0
  36. package/dist/rules/aeo/content-modularity.js +107 -0
  37. package/dist/rules/aeo/content-modularity.js.map +1 -0
  38. package/dist/rules/aeo/crawler-access.d.ts +25 -0
  39. package/dist/rules/aeo/crawler-access.d.ts.map +1 -0
  40. package/dist/rules/aeo/crawler-access.js +116 -0
  41. package/dist/rules/aeo/crawler-access.js.map +1 -0
  42. package/dist/rules/aeo/faq-coverage.d.ts +9 -0
  43. package/dist/rules/aeo/faq-coverage.d.ts.map +1 -0
  44. package/dist/rules/aeo/faq-coverage.js +71 -0
  45. package/dist/rules/aeo/faq-coverage.js.map +1 -0
  46. package/dist/rules/aeo/freshness-signals.d.ts +9 -0
  47. package/dist/rules/aeo/freshness-signals.d.ts.map +1 -0
  48. package/dist/rules/aeo/freshness-signals.js +109 -0
  49. package/dist/rules/aeo/freshness-signals.js.map +1 -0
  50. package/dist/rules/aeo/llms-txt.d.ts +24 -0
  51. package/dist/rules/aeo/llms-txt.d.ts.map +1 -0
  52. package/dist/rules/aeo/llms-txt.js +93 -0
  53. package/dist/rules/aeo/llms-txt.js.map +1 -0
  54. package/dist/rules/aeo/non-replicable-value.d.ts +9 -0
  55. package/dist/rules/aeo/non-replicable-value.d.ts.map +1 -0
  56. package/dist/rules/aeo/non-replicable-value.js +95 -0
  57. package/dist/rules/aeo/non-replicable-value.js.map +1 -0
  58. package/dist/rules/scope.d.ts +12 -0
  59. package/dist/rules/scope.d.ts.map +1 -0
  60. package/dist/rules/scope.js +66 -0
  61. package/dist/rules/scope.js.map +1 -0
  62. package/dist/rules/tech/robots-sitemap-presence.d.ts +16 -0
  63. package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
  64. package/dist/rules/tech/robots-sitemap-presence.js +26 -2
  65. package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
  66. package/dist/types.d.ts +29 -0
  67. package/dist/types.d.ts.map +1 -1
  68. package/package.json +91 -66
package/dist/auditor.js CHANGED
@@ -26,7 +26,15 @@ import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
26
26
  import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
27
27
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
28
28
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
29
- import { robotsComplianceRule } from "./rules/tech/robots-sitemap-presence.js";
29
+ import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
30
+ import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
31
+ import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
32
+ import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
33
+ import { faqCoverageRule } from "./rules/aeo/faq-coverage.js";
34
+ import { answerFirstRule } from "./rules/aeo/answer-first.js";
35
+ import { citableFactsRule } from "./rules/aeo/citable-facts.js";
36
+ import { nonReplicableValueRule } from "./rules/aeo/non-replicable-value.js";
37
+ import { contentModularityRule } from "./rules/aeo/content-modularity.js";
30
38
  import { redirectChainRule } from "./rules/tech/redirect-chain.js";
31
39
  import { soft404Rule } from "./rules/tech/soft-404.js";
32
40
  import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
@@ -38,6 +46,7 @@ import { urlPatternRule } from "./rules/cannibal/url-pattern.js";
38
46
  import { templateCoverageRule } from "./rules/spam/template-coverage.js";
39
47
  import { dataBindingRule, dataIdenticalRule } from "./rules/data/data-binding.js";
40
48
  import { classifyPages, isRuleEnabled } from "./page-classifier.js";
49
+ import { isRuleAllowedInDiff } from "./rules/scope.js";
41
50
  import { RULE_REFERENCES } from "./rule-references.js";
42
51
  import { enrichFindings } from "./enrich-findings.js";
43
52
  import { triageFindings } from "./ai/triage.js";
@@ -61,13 +70,21 @@ const DEFAULTS = {
61
70
  hubPagesMaxSiblings: 50,
62
71
  titleOverlapThreshold: 0.8,
63
72
  keywordCollisionMinShared: 6,
64
- templateCoverageMinPages: 5
73
+ templateCoverageMinPages: 5,
74
+ answerFirstMaxWords: 100,
75
+ citableFactsMin: 3,
76
+ citableFactsTarget: 8,
77
+ freshnessMaxStaleDays: 180,
78
+ modularityMaxParagraphWords: 200,
79
+ modularityMinSelfContainedRatio: 0.7,
80
+ faqMinQuestionHeadings: 2
65
81
  };
66
82
  const CATEGORY_WEIGHTS = {
67
- spam: 0.4,
68
- content: 0.25,
69
- links: 0.15,
70
- tech: 0.1,
83
+ spam: 0.35,
84
+ content: 0.2,
85
+ aeo: 0.15,
86
+ links: 0.12,
87
+ tech: 0.08,
71
88
  schema: 0.05,
72
89
  cannibal: 0.05,
73
90
  /** Dedup / crawl hygiene; does not affect composite score. */
@@ -93,8 +110,9 @@ function resolveGroupRules(baseRules, overrides) {
93
110
  }
94
111
  return result;
95
112
  }
96
- function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides) {
113
+ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, entityPatterns, overrides, mode = "full") {
97
114
  const findings = [];
115
+ const modeOk = (ruleId) => mode !== "diff" || isRuleAllowedInDiff(ruleId);
98
116
  const tag = (results) => results.map((r) => {
99
117
  const override = overrides?.[r.ruleId];
100
118
  return {
@@ -106,106 +124,137 @@ function runRulesOnPages(pages, resolvedRules, isEnabled, groupName, knownUrls,
106
124
  });
107
125
  // Spam rules — always compute cross-page data, only push findings if enabled
108
126
  const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
109
- if (isEnabled("spam/near-duplicate")) {
127
+ if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
110
128
  findings.push(...tag(nearDuplicate.findings));
111
129
  }
112
130
  const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
113
- if (isEnabled("spam/entity-swap")) {
131
+ if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
114
132
  findings.push(...tag(entitySwap.findings));
115
133
  }
116
134
  const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
117
- if (isEnabled("spam/thin-content")) {
135
+ if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
118
136
  findings.push(...tag(thinContent.findings));
119
137
  }
120
- if (isEnabled("spam/doorway-pattern")) {
138
+ if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
121
139
  findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
122
140
  }
123
- if (isEnabled("spam/publication-velocity")) {
141
+ if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
124
142
  findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay)));
125
143
  }
126
- if (isEnabled("spam/boilerplate-ratio")) {
144
+ if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
127
145
  findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
128
146
  }
129
- if (isEnabled("spam/template-diversity")) {
147
+ if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
130
148
  findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
131
149
  }
132
- if (isEnabled("spam/template-coverage")) {
150
+ if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
133
151
  findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
134
152
  }
135
153
  // Content rules
136
- if (isEnabled("content/unique-value")) {
154
+ if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
137
155
  findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
138
156
  }
139
- if (isEnabled("content/heading-uniqueness")) {
157
+ if (isEnabled("content/heading-uniqueness") && modeOk("content/heading-uniqueness")) {
140
158
  findings.push(...tag(headingUniquenessRule(pages, entityPatterns)));
141
159
  }
142
- if (isEnabled("content/meta-uniqueness")) {
160
+ if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
143
161
  findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
144
162
  }
145
- if (isEnabled("content/missing-author")) {
163
+ if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
146
164
  findings.push(...tag(missingAuthorRule(pages)));
147
165
  }
148
- if (isEnabled("content/eeat-signals")) {
166
+ if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
149
167
  findings.push(...tag(eeatSignalsRule(pages)));
150
168
  }
151
169
  // Link rules — use the global link graph
152
- if (isEnabled("links/orphan-pages")) {
170
+ if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
153
171
  findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
154
172
  }
155
- if (isEnabled("links/dead-ends")) {
173
+ if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
156
174
  findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
157
175
  }
158
- if (isEnabled("links/link-depth")) {
176
+ if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
159
177
  if (rootUrl) {
160
178
  findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound)));
161
179
  }
162
180
  }
163
- if (isEnabled("links/cluster-connectivity")) {
181
+ if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
164
182
  findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
165
183
  }
166
- if (isEnabled("links/hub-pages")) {
184
+ if (isEnabled("links/hub-pages") && modeOk("links/hub-pages")) {
167
185
  findings.push(...tag(hubPagesRule(pages, knownUrls, resolvedRules.hubPagesMinSiblings, resolvedRules.hubPagesMaxSiblings)));
168
186
  }
169
187
  // Tech rules
170
- if (isEnabled("tech/canonical-consistency")) {
188
+ if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
171
189
  findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
172
190
  }
173
- if (isEnabled("tech/canonical-noindex-conflict")) {
191
+ if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
174
192
  findings.push(...tag(canonicalNoindexConflictRule(pages, normalizeUrlOptions)));
175
193
  }
176
- if (isEnabled("tech/robots-noindex-conflict")) {
194
+ if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
177
195
  findings.push(...tag(robotsNoindexConflictRule(pages, inbound)));
178
196
  }
179
- if (isEnabled("tech/redirect-chain")) {
197
+ if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
180
198
  findings.push(...tag(redirectChainRule(pages)));
181
199
  }
182
- if (isEnabled("tech/soft-404")) {
200
+ if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
183
201
  findings.push(...tag(soft404Rule(pages)));
184
202
  }
185
- if (isEnabled("tech/og-completeness")) {
203
+ if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
186
204
  findings.push(...tag(ogCompletenessRule(pages)));
187
205
  }
188
- if (isEnabled("tech/hreflang-consistency")) {
206
+ if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
189
207
  findings.push(...tag(hreflangConsistencyRule(pages, normalizeUrlOptions)));
190
208
  }
191
209
  // Schema rules
192
- if (isEnabled("schema/json-ld-valid")) {
210
+ if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
193
211
  findings.push(...tag(jsonLdValidRule(pages)));
194
212
  }
195
- if (isEnabled("schema/required-fields")) {
213
+ if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
196
214
  findings.push(...tag(requiredFieldsRule(pages)));
197
215
  }
198
- if (isEnabled("schema/consistency")) {
216
+ if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
199
217
  findings.push(...tag(schemaConsistencyRule(pages)));
200
218
  }
219
+ // AEO rules
220
+ if (isEnabled("aeo/freshness-signals")) {
221
+ findings.push(...tag(freshnessSignalsRule(pages, {
222
+ maxStaleDays: resolvedRules.freshnessMaxStaleDays,
223
+ })));
224
+ }
225
+ if (isEnabled("aeo/faq-coverage")) {
226
+ findings.push(...tag(faqCoverageRule(pages, {
227
+ minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
228
+ })));
229
+ }
230
+ if (isEnabled("aeo/answer-first")) {
231
+ findings.push(...tag(answerFirstRule(pages, entityPatterns, {
232
+ maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
233
+ })));
234
+ }
235
+ if (isEnabled("aeo/citable-facts")) {
236
+ findings.push(...tag(citableFactsRule(pages, entityPatterns, {
237
+ minFactsPerPage: resolvedRules.citableFactsMin,
238
+ targetFactsPerPage: resolvedRules.citableFactsTarget,
239
+ })));
240
+ }
241
+ if (isEnabled("aeo/non-replicable-value")) {
242
+ findings.push(...tag(nonReplicableValueRule(pages)));
243
+ }
244
+ if (isEnabled("aeo/content-modularity")) {
245
+ findings.push(...tag(contentModularityRule(pages, {
246
+ maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
247
+ minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
248
+ })));
249
+ }
201
250
  // Cannibal rules
202
- if (isEnabled("cannibal/title-overlap")) {
251
+ if (isEnabled("cannibal/title-overlap") && modeOk("cannibal/title-overlap")) {
203
252
  findings.push(...tag(titleOverlapRule(pages, entityPatterns, resolvedRules.titleOverlapThreshold)));
204
253
  }
205
- if (isEnabled("cannibal/keyword-collision")) {
254
+ if (isEnabled("cannibal/keyword-collision") && modeOk("cannibal/keyword-collision")) {
206
255
  findings.push(...tag(keywordCollisionRule(pages, resolvedRules.keywordCollisionMinShared)));
207
256
  }
208
- if (isEnabled("cannibal/url-pattern")) {
257
+ if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
209
258
  findings.push(...tag(urlPatternRule(pages)));
210
259
  }
211
260
  return findings;
@@ -223,6 +272,7 @@ function scoreFromFindings(findings) {
223
272
  const raw = {
224
273
  spam: 0,
225
274
  content: 0,
275
+ aeo: 0,
226
276
  links: 0,
227
277
  tech: 0,
228
278
  schema: 0,
@@ -238,6 +288,7 @@ function scoreFromFindings(findings) {
238
288
  }
239
289
  const weighted = raw.spam * CATEGORY_WEIGHTS.spam +
240
290
  raw.content * CATEGORY_WEIGHTS.content +
291
+ raw.aeo * CATEGORY_WEIGHTS.aeo +
241
292
  raw.links * CATEGORY_WEIGHTS.links +
242
293
  raw.tech * CATEGORY_WEIGHTS.tech +
243
294
  raw.schema * CATEGORY_WEIGHTS.schema +
@@ -248,6 +299,7 @@ function scoreFromFindings(findings) {
248
299
  categoryScores: {
249
300
  spam: raw.spam,
250
301
  content: raw.content,
302
+ aeo: raw.aeo,
251
303
  links: raw.links,
252
304
  tech: raw.tech,
253
305
  schema: raw.schema,
@@ -430,7 +482,35 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
430
482
  }
431
483
  return allUrls;
432
484
  }
433
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats) {
485
+ async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
486
+ if (!origin)
487
+ return { disallow: [], crawlDelaySec: 0 };
488
+ try {
489
+ const robotsUrl = `${origin}/robots.txt`;
490
+ const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
491
+ return {
492
+ disallow: parseDisallowPatterns(fetched.text),
493
+ crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
494
+ };
495
+ }
496
+ catch {
497
+ return { disallow: [], crawlDelaySec: 0 };
498
+ }
499
+ }
500
+ function sleep(ms) {
501
+ return new Promise((resolve) => setTimeout(resolve, ms));
502
+ }
503
+ function isDisallowedByRobots(urlPath, patterns) {
504
+ for (const pat of patterns) {
505
+ if (isBlockedByPattern(urlPath, pat))
506
+ return true;
507
+ }
508
+ return false;
509
+ }
510
+ function budgetExceeded(b) {
511
+ return b.cap > 0 && b.used >= b.cap;
512
+ }
513
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
434
514
  if (/^https?:\/\//i.test(source)) {
435
515
  let text;
436
516
  let contentType;
@@ -467,23 +547,38 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
467
547
  ? fisherYatesSample(allSitemapUrls, discoveryBudget)
468
548
  : allSitemapUrls;
469
549
  const pages = [];
470
- await runWithConcurrency(urlsToFetch, concurrency, async (url) => {
550
+ // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
551
+ const sourceOrigin = (() => { try {
552
+ return new URL(source).origin;
553
+ }
554
+ catch {
555
+ return "";
556
+ } })();
557
+ const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
558
+ const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
559
+ const delayMs = robots.crawlDelaySec * 1000;
560
+ await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
561
+ if (budgetExceeded(byteBudget))
562
+ return;
471
563
  const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
472
564
  if (result) {
565
+ byteBudget.used += result.html.length;
473
566
  pages.push(result);
474
567
  }
568
+ if (delayMs > 0)
569
+ await sleep(delayMs);
475
570
  });
476
- // Skip additional crawl discovery when budget is active — sitemap is authoritative
477
- if (crawlDiscovery && discoveryBudget === 0) {
571
+ // Link discovery fills the sample.
572
+ // Legacy behavior: no budget set + crawlDiscovery true → fill from links (unchanged).
573
+ // New behavior: budget set + crawlDiscovery true + opt-in flag → top up to budget.
574
+ const budgetUnderfilled = discoveryBudget > 0 && pages.length < discoveryBudget;
575
+ const legacyBudgetless = discoveryBudget === 0;
576
+ const shouldFill = crawlDiscovery && (legacyBudgetless || (budgetUnderfilled && fillBudgetViaLinkDiscovery));
577
+ if (shouldFill) {
478
578
  const sitemapUrlSet = new Set(allSitemapUrls);
479
579
  const discoveredUrls = new Set();
480
- let sourceOrigin;
481
- try {
482
- sourceOrigin = new URL(source).origin;
483
- }
484
- catch {
485
- sourceOrigin = "";
486
- }
580
+ // robots already fetched above; reuse its Disallow patterns here.
581
+ const disallowPatterns = robots.disallow;
487
582
  for (const page of pages) {
488
583
  const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
489
584
  for (const match of linkMatches) {
@@ -500,9 +595,11 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
500
595
  resolvedUrl.search = "";
501
596
  resolvedUrl.hash = "";
502
597
  const normalized = resolvedUrl.href;
503
- if (!sitemapUrlSet.has(normalized) && !discoveredUrls.has(normalized)) {
504
- discoveredUrls.add(normalized);
505
- }
598
+ if (sitemapUrlSet.has(normalized) || discoveredUrls.has(normalized))
599
+ continue;
600
+ if (isDisallowedByRobots(resolvedUrl.pathname, disallowPatterns))
601
+ continue;
602
+ discoveredUrls.add(normalized);
506
603
  }
507
604
  catch {
508
605
  continue;
@@ -510,11 +607,21 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
510
607
  }
511
608
  }
512
609
  if (discoveredUrls.size > 0) {
513
- await runWithConcurrency(Array.from(discoveredUrls), concurrency, async (url) => {
610
+ const candidates = Array.from(discoveredUrls);
611
+ // Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
612
+ const shuffled = fisherYatesSample(candidates, candidates.length);
613
+ const remaining = discoveryBudget === 0 ? Infinity : discoveryBudget - pages.length;
614
+ const toFetch = remaining === Infinity ? shuffled : shuffled.slice(0, remaining);
615
+ await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
616
+ if (budgetExceeded(byteBudget))
617
+ return;
514
618
  const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
515
619
  if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
620
+ byteBudget.used += result.html.length;
516
621
  pages.push(result);
517
622
  }
623
+ if (delayMs > 0)
624
+ await sleep(delayMs);
518
625
  });
519
626
  }
520
627
  }
@@ -647,7 +754,14 @@ export async function auditSource(source, options) {
647
754
  hubPagesMaxSiblings: options?.rules?.hubPagesMaxSiblings ?? DEFAULTS.hubPagesMaxSiblings,
648
755
  titleOverlapThreshold: options?.rules?.titleOverlapThreshold ?? DEFAULTS.titleOverlapThreshold,
649
756
  keywordCollisionMinShared: options?.rules?.keywordCollisionMinShared ?? DEFAULTS.keywordCollisionMinShared,
650
- templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages
757
+ templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
758
+ answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
759
+ citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
760
+ citableFactsTarget: options?.rules?.citableFactsTarget ?? DEFAULTS.citableFactsTarget,
761
+ freshnessMaxStaleDays: options?.rules?.freshnessMaxStaleDays ?? DEFAULTS.freshnessMaxStaleDays,
762
+ modularityMaxParagraphWords: options?.rules?.modularityMaxParagraphWords ?? DEFAULTS.modularityMaxParagraphWords,
763
+ modularityMinSelfContainedRatio: options?.rules?.modularityMinSelfContainedRatio ?? DEFAULTS.modularityMinSelfContainedRatio,
764
+ faqMinQuestionHeadings: options?.rules?.faqMinQuestionHeadings ?? DEFAULTS.faqMinQuestionHeadings
651
765
  };
652
766
  const normalizeUrlOptions = mergeNormalizeUrlOptions({
653
767
  stripQuery: options?.rules?.stripUrlQuery ?? true,
@@ -668,7 +782,10 @@ export async function auditSource(source, options) {
668
782
  ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
669
783
  }
670
784
  : null;
671
- const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats);
785
+ const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
786
+ const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
787
+ const fetchByteBudget = { used: 0, cap: maxFetchBytes };
788
+ const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget);
672
789
  const loadedPages = [...loadedPagesRaw];
673
790
  if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
674
791
  console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
@@ -778,9 +895,14 @@ export async function auditSource(source, options) {
778
895
  throw new Error(`Invalid regex flags "${rawFlags}" in entityPatterns for placeholder "${p.placeholder}". ` +
779
896
  `Only the flags g, i, m, s, u, y are permitted.`);
780
897
  }
898
+ // Entity patterns are used with String.replace to mask every occurrence, which
899
+ // requires the `g` flag. Add it if the user forgot — a silently broken "only first
900
+ // match masked" regex would make template-detection rules (answer-first,
901
+ // citable-facts) miss shared openers.
902
+ const normalizedFlags = rawFlags.includes("g") ? rawFlags : `${rawFlags}g`;
781
903
  try {
782
904
  // Flags validated against SAFE_FLAGS_RE above; pattern is from trusted local config, not HTTP input.
783
- return { placeholder: p.placeholder, pattern: new RegExp(p.pattern, rawFlags) }; // nosemgrep
905
+ return { placeholder: p.placeholder, pattern: new RegExp(p.pattern, normalizedFlags) }; // nosemgrep
784
906
  }
785
907
  catch (err) {
786
908
  throw new Error(`Invalid regex pattern for placeholder "${p.placeholder}": ${err.message}`);
@@ -793,8 +915,9 @@ export async function auditSource(source, options) {
793
915
  const allFindings = [...duplicateUrlFindings];
794
916
  const groupScores = {};
795
917
  const groupPageCounts = {};
918
+ const auditMode = options?.mode ?? "full";
796
919
  // Site-wide rules (run once, outside group loop)
797
- if (sitemapUrlSet && sitemapUrlSet.size > 0) {
920
+ if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
798
921
  const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
799
922
  allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
800
923
  if (robotsTxtContent) {
@@ -802,13 +925,24 @@ export async function auditSource(source, options) {
802
925
  allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
803
926
  }
804
927
  }
928
+ // AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
929
+ // and robots-compliance); page-group rule lists govern per-page AEO rules only.
930
+ const llmsFindings = await llmsTxtRule(source, { timeoutMs });
931
+ allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
932
+ if (robotsTxtContent) {
933
+ const crawlerFindings = crawlerAccessRule(robotsTxtContent);
934
+ allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
935
+ }
805
936
  // Data source comparison rules
806
937
  if (options?.dataSource?.records && options.dataSource.records.length > 0) {
807
- const dataFindings = [
808
- ...dataBindingRule(parsedPages, options.dataSource.records),
809
- ...dataIdenticalRule(parsedPages, options.dataSource.records),
810
- ];
811
- allFindings.push(...dataFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
938
+ if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
939
+ const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
940
+ allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
941
+ }
942
+ if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
943
+ const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
944
+ allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
945
+ }
812
946
  }
813
947
  for (const [groupName, groupPages] of classified) {
814
948
  if (groupPages.length === 0)
@@ -818,7 +952,7 @@ export async function auditSource(source, options) {
818
952
  continue;
819
953
  const groupRules = resolveGroupRules(resolvedRules, groupConfig?.overrides);
820
954
  const enabledCheck = (ruleId) => isRuleEnabled(ruleId, groupConfig?.rules);
821
- const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides);
955
+ const findings = runRulesOnPages(groupPages, groupRules, enabledCheck, groupName, knownUrls, adjacency, inbound, rootUrl, normalizeUrlOptions, source, DEFAULT_ENTITY_PATTERNS, groupConfig?.overrides, options?.mode ?? "full");
822
956
  allFindings.push(...findings);
823
957
  groupPageCounts[groupName] = groupPages.length;
824
958
  const { score } = scoreFromFindings(findings);