@pseolint/core 0.6.3 → 0.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/auditor.js CHANGED
@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
34
34
  import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
35
35
  import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
36
36
  import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
37
- import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
37
+ import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
38
38
  import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
39
39
  import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
40
40
  import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
@@ -107,6 +107,24 @@ const CATEGORY_MAP = {
107
107
  data: "data",
108
108
  audit: "audit",
109
109
  };
110
+ /**
111
+ * Per-rule category overrides — take precedence over the namespace-level
112
+ * CATEGORY_MAP. A rule lands here when its namespace (chosen for code
113
+ * organisation) doesn't match the scoring bucket its *signal* belongs to.
114
+ *
115
+ * `links/host-section-divergence` lives in the links namespace because it reads
116
+ * the internal-link graph, but semantically it detects a spam-policy violation
117
+ * (Google's May 2024 site-reputation-abuse policy) — an INTEGRITY signal, not a
118
+ * discoverability one. Without this override it scored in the discoverability
119
+ * bucket (0.15 weight on programmatic-directory), so a confirmed parasite
120
+ * section moved the risk score by ~2pts despite registering as a blocker.
121
+ */
122
+ const RULE_CATEGORY_OVERRIDES = {
123
+ "links/host-section-divergence": "integrity",
124
+ };
125
+ export function categoryForRule(ruleId) {
126
+ return RULE_CATEGORY_OVERRIDES[ruleId] ?? CATEGORY_MAP[ruleId.split("/")[0]];
127
+ }
110
128
  const SCORING_PROFILES = {
111
129
  "small-marketing": {
112
130
  categoryWeights: { integrity: 0.30, discoverability: 0.40, citation: 0.20, data: 0.05, audit: 0 },
@@ -427,6 +445,13 @@ const RULE_IMPACTS = {
427
445
  "links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
428
446
  "links/cluster-connectivity": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
429
447
  "links/link-depth": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
448
+ // host-section-divergence is a reputation/integrity-grade signal that happens
449
+ // to live in the links namespace (it reads the link graph). It escalates to
450
+ // `error` and maps to manual-action risk, so it gets an explicit weight rather
451
+ // than inheriting DEFAULT_RULE_IMPACT (5/25), and is routed to the `integrity`
452
+ // bucket via RULE_CATEGORY_OVERRIDES so the score reflects the spam-policy
453
+ // severity rather than diluting into discoverability (0.15 weight).
454
+ "links/host-section-divergence": { baseImpact: 15, perInstance: 5, maxImpact: 45 },
430
455
  // AEO — much lower baselines than spam (AEO is opt-in optimization)
431
456
  "aeo/citable-facts": { baseImpact: 2, perInstance: 1, maxImpact: 25 },
432
457
  "aeo/answer-first": { baseImpact: 3, perInstance: 1, maxImpact: 25 },
@@ -590,157 +615,157 @@ sampled = false) {
590
615
  // Spam rules — always compute cross-page data, only push findings if enabled
591
616
  const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
592
617
  if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
593
- findings.push(...tag(nearDuplicate.findings));
618
+ pushAll(findings, tag(nearDuplicate.findings));
594
619
  }
595
620
  const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
596
621
  if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
597
- findings.push(...tag(entitySwap.findings));
622
+ pushAll(findings, tag(entitySwap.findings));
598
623
  }
599
624
  const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
600
625
  if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
601
- findings.push(...tag(thinContent.findings));
626
+ pushAll(findings, tag(thinContent.findings));
602
627
  }
603
628
  if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
604
- findings.push(...tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
629
+ pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
605
630
  }
606
631
  if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
607
- findings.push(...tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
632
+ pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
608
633
  }
609
634
  if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
610
- findings.push(...tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
635
+ pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
611
636
  }
612
637
  if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
613
- findings.push(...tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
638
+ pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
614
639
  }
615
640
  if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
616
- findings.push(...tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
641
+ pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
617
642
  }
618
643
  // Content rules
619
644
  if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
620
- findings.push(...tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
645
+ pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
621
646
  }
622
647
  if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
623
- findings.push(...tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
648
+ pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
624
649
  }
625
650
  if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
626
- findings.push(...tag(missingAuthorRule(pages)));
651
+ pushAll(findings, tag(missingAuthorRule(pages)));
627
652
  }
628
653
  if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
629
- findings.push(...tag(eeatSignalsRule(pages)));
654
+ pushAll(findings, tag(eeatSignalsRule(pages)));
630
655
  }
631
656
  // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
632
657
  // structure + image alt-text were tier-1 gaps in the blind-spot audit.
633
658
  if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
634
- findings.push(...tag(titleUniquenessRule(pages)));
659
+ pushAll(findings, tag(titleUniquenessRule(pages)));
635
660
  }
636
661
  if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
637
- findings.push(...tag(headingStructureRule(pages)));
662
+ pushAll(findings, tag(headingStructureRule(pages)));
638
663
  }
639
664
  if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
640
- findings.push(...tag(imageAltTextRule(pages)));
665
+ pushAll(findings, tag(imageAltTextRule(pages)));
641
666
  }
642
667
  if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
643
- findings.push(...tag(translationNoOpRule(pages)));
668
+ pushAll(findings, tag(translationNoOpRule(pages)));
644
669
  }
645
670
  if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
646
- findings.push(...tag(regurgitatedContentRule(pages)));
671
+ pushAll(findings, tag(regurgitatedContentRule(pages)));
647
672
  }
648
673
  if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
649
- findings.push(...tag(commonPhraseReuseRule(pages)));
674
+ pushAll(findings, tag(commonPhraseReuseRule(pages)));
650
675
  }
651
676
  if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
652
- findings.push(...tag(wikipediaParaphraseRule(pages)));
677
+ pushAll(findings, tag(wikipediaParaphraseRule(pages)));
653
678
  }
654
679
  // Link rules — use the global link graph
655
680
  if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
656
- findings.push(...tag(orphanPagesRule(pages, inbound, rootUrl)));
681
+ pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
657
682
  }
658
683
  if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
659
- findings.push(...tag(deadEndsRule(pages, knownUrls, rootUrl)));
684
+ pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
660
685
  }
661
686
  if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
662
687
  if (rootUrl) {
663
- findings.push(...tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
688
+ pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
664
689
  }
665
690
  }
666
691
  if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
667
- findings.push(...tag(clusterConnectivityRule(pages, knownUrls)));
692
+ pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
668
693
  }
669
694
  if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
670
- findings.push(...tag(hostSectionDivergenceRule(pages, adjacency)));
695
+ pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
671
696
  }
672
697
  // Tech rules
673
698
  if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
674
- findings.push(...tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
699
+ pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
675
700
  }
676
701
  if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
677
- findings.push(...tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
702
+ pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
678
703
  }
679
704
  if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
680
- findings.push(...tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
705
+ pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
681
706
  }
682
707
  if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
683
- findings.push(...tag(redirectChainRule(pages)));
708
+ pushAll(findings, tag(redirectChainRule(pages)));
684
709
  }
685
710
  if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
686
- findings.push(...tag(soft404Rule(pages)));
711
+ pushAll(findings, tag(soft404Rule(pages)));
687
712
  }
688
713
  if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
689
714
  // hreflang declarations on noindex'd pages are still bugs when they're
690
715
  // inconsistent — see auditor.test.ts "emits technical SEO findings".
691
- findings.push(...tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
716
+ pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
692
717
  }
693
718
  // 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
694
719
  // the v0.4.x README without ever shipping. Now it does.
695
720
  if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
696
- findings.push(...tag(ogCompletenessRule(pages)));
721
+ pushAll(findings, tag(ogCompletenessRule(pages)));
697
722
  }
698
723
  // Schema rules
699
724
  if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
700
- findings.push(...tag(jsonLdValidRule(pages)));
725
+ pushAll(findings, tag(jsonLdValidRule(pages)));
701
726
  }
702
727
  if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
703
- findings.push(...tag(requiredFieldsRule(pages)));
728
+ pushAll(findings, tag(requiredFieldsRule(pages)));
704
729
  }
705
730
  if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
706
- findings.push(...tag(schemaConsistencyRule(pages)));
731
+ pushAll(findings, tag(schemaConsistencyRule(pages)));
707
732
  }
708
733
  // AEO rules
709
734
  if (isEnabled("aeo/freshness-signals")) {
710
- findings.push(...tag(freshnessSignalsRule(pages, {
735
+ pushAll(findings, tag(freshnessSignalsRule(pages, {
711
736
  maxStaleDays: resolvedRules.freshnessMaxStaleDays,
712
737
  })));
713
738
  }
714
739
  if (isEnabled("aeo/faq-coverage")) {
715
- findings.push(...tag(faqCoverageRule(pages, {
740
+ pushAll(findings, tag(faqCoverageRule(pages, {
716
741
  minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
717
742
  })));
718
743
  }
719
744
  if (isEnabled("aeo/answer-first")) {
720
- findings.push(...tag(answerFirstRule(pages, entityPatterns, {
745
+ pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
721
746
  maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
722
747
  })));
723
748
  }
724
749
  if (isEnabled("aeo/citable-facts")) {
725
- findings.push(...tag(citableFactsRule(pages, entityPatterns, {
750
+ pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
726
751
  minFactsPerPage: resolvedRules.citableFactsMin,
727
752
  targetFactsPerPage: resolvedRules.citableFactsTarget,
728
753
  })));
729
754
  }
730
755
  if (isEnabled("aeo/content-modularity")) {
731
- findings.push(...tag(contentModularityRule(pages, {
756
+ pushAll(findings, tag(contentModularityRule(pages, {
732
757
  maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
733
758
  minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
734
759
  })));
735
760
  }
736
761
  if (isEnabled("aeo/summary-bait")) {
737
- findings.push(...tag(summaryBaitRule(pages, entityPatterns)));
762
+ pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
738
763
  }
739
764
  // Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
740
765
  // keyword-collision dropped due to high false-positive rates; see
741
766
  // 2026-04-29 v0.4 redesign spec §4.3).
742
767
  if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
743
- findings.push(...tag(urlPatternRule(pages)));
768
+ pushAll(findings, tag(urlPatternRule(pages)));
744
769
  }
745
770
  return findings;
746
771
  }
@@ -830,8 +855,7 @@ function scoreFromFindings(findings, classification, pageCount = 0) {
830
855
  // Each group's weighted impact lands in its category bucket.
831
856
  const groups = new Map();
832
857
  for (const finding of findings) {
833
- const namespace = finding.ruleId.split("/")[0];
834
- const bucket = CATEGORY_MAP[namespace];
858
+ const bucket = categoryForRule(finding.ruleId);
835
859
  if (!bucket)
836
860
  continue;
837
861
  if (bucket !== "audit")
@@ -867,8 +891,7 @@ function scoreFromFindings(findings, classification, pageCount = 0) {
867
891
  integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
868
892
  };
869
893
  for (const [ruleId, group] of groups) {
870
- const namespace = ruleId.split("/")[0];
871
- const bucket = CATEGORY_MAP[namespace];
894
+ const bucket = categoryForRule(ruleId);
872
895
  if (!bucket || bucket === "audit")
873
896
  continue;
874
897
  const impactSpec = RULE_IMPACTS[ruleId] ?? DEFAULT_RULE_IMPACT;
@@ -980,6 +1003,20 @@ function withDocsUrls(findings) {
980
1003
  }
981
1004
  return findings;
982
1005
  }
1006
+ /**
1007
+ * Append every item of `items` to `target` in place. Use this instead of
1008
+ * `target.push(...items)` whenever `items` can be large. The spread form passes
1009
+ * each element as a separate call argument, and V8 caps argument count
1010
+ * (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
1011
+ * exceeded` on large inputs. A dense site makes the pairwise rules
1012
+ * (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
1013
+ * rule-aggregation push *before* enrichment was even reached. The loop has no
1014
+ * such limit. See tests/integration/large-corpus-no-overflow.test.ts.
1015
+ */
1016
+ function pushAll(target, items) {
1017
+ for (const item of items)
1018
+ target.push(item);
1019
+ }
983
1020
  async function collectHtmlFiles(directory) {
984
1021
  const entries = await readdir(directory, { withFileTypes: true });
985
1022
  const files = await Promise.all(entries.map(async (entry) => {
@@ -1015,7 +1052,11 @@ function composeSignals(...signals) {
1015
1052
  }
1016
1053
  return ac.signal;
1017
1054
  }
1018
- async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop) {
1055
+ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
1056
+ // Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
1057
+ // Guards against a hostile/misconfigured sitemap eating the whole byte budget
1058
+ // or memory. 0 / undefined = no cap.
1059
+ maxBytes) {
1019
1060
  try {
1020
1061
  stats.total += 1;
1021
1062
  const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
@@ -1025,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
1025
1066
  }
1026
1067
  if (r.status < 200 || r.status >= 300)
1027
1068
  return null;
1069
+ if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
1070
+ // eslint-disable-next-line no-console
1071
+ console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
1072
+ return null;
1073
+ }
1028
1074
  return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
1029
1075
  }
1030
1076
  catch (err) {
@@ -1182,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
1182
1228
  }
1183
1229
  return arr.slice(arr.length - n);
1184
1230
  }
1185
- async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop) {
1231
+ /** sitemaps.org caps an uncompressed sitemap at 50 MB. */
1232
+ const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
1233
+ /**
1234
+ * Max `<sitemapindex>` nesting depth we recurse through. The protocol only
1235
+ * defines a single level of nesting, but some sites nest deeper; 5 is generous
1236
+ * while still bounding work (and stack) on a hostile/misconfigured index that a
1237
+ * `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
1238
+ */
1239
+ const SITEMAP_MAX_DEPTH = 5;
1240
+ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
1186
1241
  visited.add(sitemapUrl);
1187
1242
  const entries = parseSitemapUrlsWithLastmod(sitemapText);
1188
1243
  if (!isSitemapIndex(sitemapText)) {
@@ -1196,6 +1251,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1196
1251
  }
1197
1252
  return { urls, lastmodByUrl };
1198
1253
  }
1254
+ // It's a sitemap index. Stop recursing past the depth cap (the index itself
1255
+ // carries no page URLs, only child-sitemap refs, so returning empty is safe).
1256
+ if (depth >= maxDepth) {
1257
+ // eslint-disable-next-line no-console
1258
+ console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
1259
+ return { urls: [], lastmodByUrl: new Map() };
1260
+ }
1199
1261
  const allUrls = [];
1200
1262
  const allLastmodByUrl = new Map();
1201
1263
  for (const entry of entries) {
@@ -1204,14 +1266,14 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1204
1266
  throw signal.reason ?? new Error("aborted");
1205
1267
  if (visited.has(childUrl))
1206
1268
  continue;
1207
- const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
1269
+ const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1208
1270
  if (!child)
1209
1271
  continue;
1210
1272
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
1211
1273
  if (!childLike)
1212
1274
  continue;
1213
- const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
1214
- allUrls.push(...childUrls);
1275
+ const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
1276
+ pushAll(allUrls, childUrls);
1215
1277
  for (const [u, lm] of childLastmodByUrl) {
1216
1278
  allLastmodByUrl.set(u, lm);
1217
1279
  }
@@ -1220,7 +1282,7 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1220
1282
  }
1221
1283
  async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
1222
1284
  if (!origin)
1223
- return { disallow: [], crawlDelaySec: 0 };
1285
+ return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
1224
1286
  try {
1225
1287
  const robotsUrl = `${origin}/robots.txt`;
1226
1288
  const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
@@ -1230,10 +1292,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
1230
1292
  return {
1231
1293
  disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
1232
1294
  crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
1295
+ // `Sitemap:` directives hold absolute URLs (not origin-relative paths) and
1296
+ // there can be several. Surfaced so discovery can read the site's declared
1297
+ // sitemaps instead of guessing.
1298
+ sitemaps: parseSitemapDirectives(fetched.text),
1233
1299
  };
1234
1300
  }
1235
1301
  catch {
1236
- return { disallow: [], crawlDelaySec: 0 };
1302
+ return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
1237
1303
  }
1238
1304
  }
1239
1305
  function sleep(ms) {
@@ -1249,7 +1315,12 @@ function isDisallowedByRobots(urlPath, patterns) {
1249
1315
  function budgetExceeded(b) {
1250
1316
  return b.cap > 0 && b.used >= b.cap;
1251
1317
  }
1252
- async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null) {
1318
+ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
1319
+ // Backpressure salvage: when provided, every page body that comes back is
1320
+ // pushed into this caller-owned array as it's fetched. If the watchdog aborts
1321
+ // mid-crawl and this function throws, the caller still holds the partial set
1322
+ // (the local `pages` array would otherwise be lost with the stack frame).
1323
+ pageSink) {
1253
1324
  // Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
1254
1325
  // audit (source, sitemap entries, redirects, discovered links) goes through
1255
1326
  // this. DNS is hit once per unique hostname per audit — a 4k-page audit on
@@ -1336,7 +1407,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1336
1407
  else {
1337
1408
  urlsToFetch = sampledUrls;
1338
1409
  }
1339
- const pages = [];
1410
+ // Reuse the caller's salvage sink as the live page accumulator so a
1411
+ // mid-crawl watchdog abort leaves the already-fetched pages visible to
1412
+ // the caller. Falls back to a private array when no sink is passed.
1413
+ const pages = pageSink ?? [];
1340
1414
  // Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
1341
1415
  const sourceOrigin = (() => { try {
1342
1416
  return new URL(source).origin;
@@ -1446,7 +1520,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1446
1520
  }
1447
1521
  if (contentType.includes("html") || looksLikeHtml(text)) {
1448
1522
  const initialPage = { url: source, html: text };
1449
- const pages = [initialPage];
1523
+ // See note above: reuse the caller's salvage sink so a watchdog abort
1524
+ // during link-discovery crawling preserves the pages fetched so far.
1525
+ const pages = pageSink ?? [];
1526
+ pages.push(initialPage);
1450
1527
  if (crawlDiscovery) {
1451
1528
  let sourceOrigin;
1452
1529
  try {
@@ -1458,6 +1535,92 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1458
1535
  const knownCrawled = new Set([source]);
1459
1536
  const allDiscoveredUrls = new Set([source]);
1460
1537
  const maxDepth = 3;
1538
+ // Sitemap-first discovery (like Google). Before link-crawling, read the
1539
+ // sitemap(s) the site declares — link-crawl only reaches *linked* pages,
1540
+ // but a pSEO site's whole point is thousands of programmatic URLs that
1541
+ // may be sparsely linked (or behind a build-frozen, under-linked nav).
1542
+ // Sources of truth, in order:
1543
+ // 1. `Sitemap:` directives in robots.txt (there can be several)
1544
+ // 2. failing that, probe /sitemap.xml then /sitemap_index.xml
1545
+ // Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
1546
+ // link-crawl below then fills any remaining budget and dedups against
1547
+ // them. When no sitemap exists, this is a no-op and we crawl as before.
1548
+ if (sourceOrigin) {
1549
+ const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
1550
+ const probing = robotsForDiscovery.sitemaps.length === 0;
1551
+ const sitemapCandidates = probing
1552
+ ? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
1553
+ : robotsForDiscovery.sitemaps;
1554
+ const visitedSitemaps = new Set();
1555
+ const sitemapListedUrls = [];
1556
+ for (const candidate of sitemapCandidates) {
1557
+ if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
1558
+ break;
1559
+ if (visitedSitemaps.has(candidate))
1560
+ continue;
1561
+ let smText;
1562
+ let smType;
1563
+ try {
1564
+ if (validateHop)
1565
+ await validateHop(candidate);
1566
+ const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1567
+ if (!fetched)
1568
+ continue;
1569
+ smText = fetched.text;
1570
+ smType = fetched.contentType;
1571
+ }
1572
+ catch {
1573
+ continue; // SSRF refusal, network error, etc. — skip this candidate
1574
+ }
1575
+ if (!(smType.includes("xml") || looksLikeSitemap(smText)))
1576
+ continue;
1577
+ const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
1578
+ pushAll(sitemapListedUrls, discoveredSmUrls);
1579
+ // When probing the conventional paths, stop at the first that hits.
1580
+ if (probing && discoveredSmUrls.length > 0)
1581
+ break;
1582
+ }
1583
+ // Same-origin + robots-aware filter, deduped against what we have.
1584
+ const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
1585
+ if (knownCrawled.has(u))
1586
+ return false;
1587
+ try {
1588
+ const parsed = new URL(u);
1589
+ if (parsed.origin !== sourceOrigin)
1590
+ return false;
1591
+ if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
1592
+ skippedByRobots.push(u);
1593
+ return false;
1594
+ }
1595
+ return true;
1596
+ }
1597
+ catch {
1598
+ return false;
1599
+ }
1600
+ });
1601
+ for (const u of seedUrls)
1602
+ allDiscoveredUrls.add(u);
1603
+ // Cap the seed fetch. With a sampling budget, fit under it; without one
1604
+ // (the default "audit everything" path) bound by maxCrawlDiscovered, the
1605
+ // same ceiling the link-crawl honors — otherwise a homepage audit of a
1606
+ // site with a 50k-URL sitemap would try to fetch all of them (the link
1607
+ // crawl never could, so this would be an unbounded-egress regression).
1608
+ const seedToFetch = discoveryBudget > 0
1609
+ ? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
1610
+ : seedUrls.slice(0, maxCrawlDiscovered);
1611
+ if (seedToFetch.length > 0) {
1612
+ await runWithConcurrency(seedToFetch, concurrency, async (url) => {
1613
+ if (budgetExceeded(byteBudget))
1614
+ return;
1615
+ const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
1616
+ knownCrawled.add(url);
1617
+ if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
1618
+ byteBudget.used += result.html.length;
1619
+ pages.push(result);
1620
+ }
1621
+ });
1622
+ }
1623
+ }
1461
1624
  for (let depth = 0; depth < maxDepth; depth += 1) {
1462
1625
  // Stop if we've hit the discovery budget
1463
1626
  if (discoveryBudget > 0 && pages.length >= discoveryBudget)
@@ -1519,7 +1682,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
1519
1682
  knownCrawled.add(url);
1520
1683
  }
1521
1684
  });
1522
- pages.push(...newPages);
1685
+ pushAll(pages, newPages);
1523
1686
  if (newPages.length === 0)
1524
1687
  break;
1525
1688
  }
@@ -1610,6 +1773,13 @@ export async function auditSource(source, options) {
1610
1773
  const backpressureEnabled = options?.backpressure !== false;
1611
1774
  const backpressureAbort = new AbortController();
1612
1775
  let backpressureError = null;
1776
+ // Set once we've decided to salvage a partial report after a watchdog abort.
1777
+ // From that point `throwIfAborted` must NOT re-throw the backpressure error —
1778
+ // the watchdog already did its job (stopped fetching); the rest of the
1779
+ // pipeline runs over the pages collected so far and the truncation is
1780
+ // surfaced on the summary instead.
1781
+ let truncated = false;
1782
+ let truncatedReason;
1613
1783
  const signal = composeSignals(externalSignal, backpressureAbort.signal);
1614
1784
  const observer = new FetchObserver();
1615
1785
  // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
@@ -1651,12 +1821,32 @@ export async function auditSource(source, options) {
1651
1821
  backpressureAbort.abort(backpressureError);
1652
1822
  }
1653
1823
  };
1824
+ // Flip the run into salvage mode after a watchdog abort: record the reason so
1825
+ // assembly sets summary.truncated, and from here `throwIfAborted` will no
1826
+ // longer re-throw the backpressure error. Idempotent. Returns true when a
1827
+ // backpressure abort was present to salvage.
1828
+ function salvageBackpressure() {
1829
+ if (!backpressureError)
1830
+ return false;
1831
+ truncated = true;
1832
+ truncatedReason = backpressureError.message;
1833
+ return true;
1834
+ }
1654
1835
  function throwIfAborted() {
1655
- if (backpressureError)
1656
- throw backpressureError;
1836
+ // An EXTERNAL abort (ctrl-C, parent timeout) is always fatal: the caller
1837
+ // asked to stop, not to degrade. Check it first so it wins over salvage.
1657
1838
  if (externalSignal?.aborted) {
1658
1839
  throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
1659
1840
  }
1841
+ // A backpressure abort is salvageable. Once we've committed to a partial
1842
+ // report (`truncated`), swallow it and let the pipeline finish over the
1843
+ // pages collected so far. Before that commit, the loader-boundary catch
1844
+ // handles it; this guard only fires on the rare path where the loader
1845
+ // returned normally (e.g. a fetch mock that ignores the abort signal) yet
1846
+ // the watchdog still voted to abort — salvage rather than crash.
1847
+ if (backpressureError && !truncated) {
1848
+ salvageBackpressure();
1849
+ }
1660
1850
  }
1661
1851
  const resolvedRules = {
1662
1852
  nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
@@ -1811,13 +2001,26 @@ export async function auditSource(source, options) {
1811
2001
  }
1812
2002
  : undefined;
1813
2003
  const pinnedPages = [];
1814
- await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
1815
- const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
1816
- if (result) {
1817
- fetchByteBudget.used += result.html.length;
1818
- pinnedPages.push(result);
2004
+ try {
2005
+ await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
2006
+ const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
2007
+ if (result) {
2008
+ fetchByteBudget.used += result.html.length;
2009
+ pinnedPages.push(result);
2010
+ }
2011
+ });
2012
+ }
2013
+ catch (err) {
2014
+ // Same salvage contract as the sitemap/crawl path: a watchdog abort
2015
+ // mid-fetch keeps the pages already collected in `pinnedPages`. Any other
2016
+ // error (external abort, SSRF rejection) is fatal — re-throw it.
2017
+ if (err instanceof OriginDegradedError) {
2018
+ salvageBackpressure();
1819
2019
  }
1820
- });
2020
+ else {
2021
+ throw err;
2022
+ }
2023
+ }
1821
2024
  loadedPagesRaw = pinnedPages;
1822
2025
  // No sitemap context in pinned mode
1823
2026
  sitemapUrlSet = undefined;
@@ -1826,12 +2029,46 @@ export async function auditSource(source, options) {
1826
2029
  scrapePlan = undefined;
1827
2030
  }
1828
2031
  else {
1829
- const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext);
1830
- loadedPagesRaw = loaded.pages;
1831
- sitemapUrlSet = loaded.sitemapUrls;
1832
- sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
1833
- discoveredUrlCount = loaded.discoveredUrlCount;
1834
- scrapePlan = loaded.scrapePlan;
2032
+ // Salvage sink: loadPagesFromSource fills this incrementally as pages come
2033
+ // back. If the backpressure watchdog aborts mid-crawl the call throws an
2034
+ // OriginDegradedError and the function's own return value is lost — but the
2035
+ // already-fetched pages survive here, so we recover them and continue the
2036
+ // pipeline with a `truncated` flag instead of throwing the whole run away.
2037
+ const pageSink = [];
2038
+ try {
2039
+ const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
2040
+ loadedPagesRaw = loaded.pages;
2041
+ sitemapUrlSet = loaded.sitemapUrls;
2042
+ sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
2043
+ discoveredUrlCount = loaded.discoveredUrlCount;
2044
+ scrapePlan = loaded.scrapePlan;
2045
+ }
2046
+ catch (err) {
2047
+ // Only the watchdog abort is salvageable. An external abort (ctrl-C /
2048
+ // parent timeout) or any other error is fatal — re-throw it untouched so
2049
+ // --no-backpressure and ctrl-C behaviour are unchanged.
2050
+ if (err instanceof OriginDegradedError) {
2051
+ // Prefer the canonical backpressureError message (same object the
2052
+ // monitor raised); fall back to the caught error's message when no backpressureError was recorded.
2053
+ if (!salvageBackpressure()) {
2054
+ truncated = true;
2055
+ truncatedReason = err.message;
2056
+ }
2057
+ // Recover whatever was fetched before the abort. The sink is the same
2058
+ // array loadPagesFromSource was pushing into, so it holds the partial
2059
+ // page set even though the function never reached its `return`.
2060
+ loadedPagesRaw = pageSink;
2061
+ // No sitemap/discovery context survives a mid-crawl abort; the
2062
+ // downstream classifier falls back to the loaded page URLs.
2063
+ sitemapUrlSet = undefined;
2064
+ sitemapLastmodByUrl = undefined;
2065
+ discoveredUrlCount = undefined;
2066
+ scrapePlan = undefined;
2067
+ }
2068
+ else {
2069
+ throw err;
2070
+ }
2071
+ }
1835
2072
  }
1836
2073
  // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
1837
2074
  // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
@@ -2053,29 +2290,29 @@ export async function auditSource(source, options) {
2053
2290
  // Site-wide rules (run once, outside group loop)
2054
2291
  if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
2055
2292
  const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
2056
- allFindings.push(...sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2293
+ pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2057
2294
  if (robotsTxtContent) {
2058
2295
  const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
2059
- allFindings.push(...robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2296
+ pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2060
2297
  }
2061
2298
  }
2062
2299
  // AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
2063
2300
  // and robots-compliance); page-group rule lists govern per-page AEO rules only.
2064
2301
  const llmsFindings = await llmsTxtRule(source, { timeoutMs });
2065
- allFindings.push(...llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2302
+ pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2066
2303
  if (robotsTxtContent) {
2067
2304
  const crawlerFindings = crawlerAccessRule(robotsTxtContent);
2068
- allFindings.push(...crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2305
+ pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2069
2306
  }
2070
2307
  // Data source comparison rules
2071
2308
  if (options?.dataSource?.records && options.dataSource.records.length > 0) {
2072
2309
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
2073
2310
  const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
2074
- allFindings.push(...dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2311
+ pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2075
2312
  }
2076
2313
  if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
2077
2314
  const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
2078
- allFindings.push(...dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2315
+ pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2079
2316
  }
2080
2317
  }
2081
2318
  for (const [groupName, groupPages] of classified) {
@@ -2095,7 +2332,7 @@ export async function auditSource(source, options) {
2095
2332
  // because the nav paths between locale-specific currency-converter URLs
2096
2333
  // were not in the pinned set).
2097
2334
  isSampledAudit || hasPinnedUrlsEarly);
2098
- allFindings.push(...findings);
2335
+ pushAll(allFindings, findings);
2099
2336
  groupPageCounts[groupName] = groupPages.length;
2100
2337
  // v0.4.3: per-group scoring uses the same site-classification profile so
2101
2338
  // group-level risk numbers reflect the same severity / confidence remaps
@@ -2113,7 +2350,7 @@ export async function auditSource(source, options) {
2113
2350
  (auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
2114
2351
  if (isValueAddEnabled) {
2115
2352
  const valueAddFindings = valueAddRule(parsedPages, allFindings);
2116
- allFindings.push(...valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2353
+ pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
2117
2354
  }
2118
2355
  }
2119
2356
  // Enrich findings: cluster pairwise, detect templates, assign effort
@@ -2237,6 +2474,14 @@ export async function auditSource(source, options) {
2237
2474
  ? [...parsedPages.map((p) => p.url)].sort()
2238
2475
  : undefined,
2239
2476
  };
2477
+ // Partial-report flag: the backpressure watchdog aborted mid-crawl and we
2478
+ // salvaged whatever pages had been fetched. Consumers MUST treat coverage as
2479
+ // a lower bound (counts/verdict are partial). Only set when actually
2480
+ // truncated so complete runs keep `truncated` absent.
2481
+ if (truncated) {
2482
+ summary.truncated = true;
2483
+ summary.truncatedReason = truncatedReason;
2484
+ }
2240
2485
  if (cacheConfig) {
2241
2486
  summary.cacheStats = cacheStats;
2242
2487
  }