@pseolint/core 0.6.3 → 0.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/dist/ai/tools/fetch-sitemap.js +2 -1
- package/dist/ai/tools/fetch-sitemap.js.map +1 -1
- package/dist/auditor.d.ts +2 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +324 -79
- package/dist/auditor.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +27 -5
- package/dist/enrich-findings.js.map +1 -1
- package/dist/parser.d.ts.map +1 -1
- package/dist/parser.js +17 -1
- package/dist/parser.js.map +1 -1
- package/dist/rules/content/title-uniqueness.d.ts.map +1 -1
- package/dist/rules/content/title-uniqueness.js +13 -0
- package/dist/rules/content/title-uniqueness.js.map +1 -1
- package/dist/site-classifier.d.ts.map +1 -1
- package/dist/site-classifier.js +7 -1
- package/dist/site-classifier.js.map +1 -1
- package/dist/stratified-sample.js +2 -1
- package/dist/stratified-sample.js.map +1 -1
- package/dist/types.d.ts +47 -3
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +22 -2
- package/dist/types.js.map +1 -1
- package/package.json +5 -3
- package/schemas/audit-summary.schema.json +295 -0
package/dist/auditor.js
CHANGED
|
@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
|
|
|
34
34
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
35
35
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
36
36
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
37
|
-
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
37
|
+
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
|
|
38
38
|
import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
|
|
39
39
|
import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
|
|
40
40
|
import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
|
|
@@ -107,6 +107,24 @@ const CATEGORY_MAP = {
|
|
|
107
107
|
data: "data",
|
|
108
108
|
audit: "audit",
|
|
109
109
|
};
|
|
110
|
+
/**
|
|
111
|
+
* Per-rule category overrides — take precedence over the namespace-level
|
|
112
|
+
* CATEGORY_MAP. A rule lands here when its namespace (chosen for code
|
|
113
|
+
* organisation) doesn't match the scoring bucket its *signal* belongs to.
|
|
114
|
+
*
|
|
115
|
+
* `links/host-section-divergence` lives in the links namespace because it reads
|
|
116
|
+
* the internal-link graph, but semantically it detects a spam-policy violation
|
|
117
|
+
* (Google's May 2024 site-reputation-abuse policy) — an INTEGRITY signal, not a
|
|
118
|
+
* discoverability one. Without this override it scored in the discoverability
|
|
119
|
+
* bucket (0.15 weight on programmatic-directory), so a confirmed parasite
|
|
120
|
+
* section moved the risk score by ~2pts despite registering as a blocker.
|
|
121
|
+
*/
|
|
122
|
+
const RULE_CATEGORY_OVERRIDES = {
|
|
123
|
+
"links/host-section-divergence": "integrity",
|
|
124
|
+
};
|
|
125
|
+
export function categoryForRule(ruleId) {
|
|
126
|
+
return RULE_CATEGORY_OVERRIDES[ruleId] ?? CATEGORY_MAP[ruleId.split("/")[0]];
|
|
127
|
+
}
|
|
110
128
|
const SCORING_PROFILES = {
|
|
111
129
|
"small-marketing": {
|
|
112
130
|
categoryWeights: { integrity: 0.30, discoverability: 0.40, citation: 0.20, data: 0.05, audit: 0 },
|
|
@@ -427,6 +445,13 @@ const RULE_IMPACTS = {
|
|
|
427
445
|
"links/dead-ends": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
428
446
|
"links/cluster-connectivity": { baseImpact: 5, perInstance: 1, maxImpact: 25 },
|
|
429
447
|
"links/link-depth": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
448
|
+
// host-section-divergence is a reputation/integrity-grade signal that happens
|
|
449
|
+
// to live in the links namespace (it reads the link graph). It escalates to
|
|
450
|
+
// `error` and maps to manual-action risk, so it gets an explicit weight rather
|
|
451
|
+
// than inheriting DEFAULT_RULE_IMPACT (5/25), and is routed to the `integrity`
|
|
452
|
+
// bucket via RULE_CATEGORY_OVERRIDES so the score reflects the spam-policy
|
|
453
|
+
// severity rather than diluting into discoverability (0.15 weight).
|
|
454
|
+
"links/host-section-divergence": { baseImpact: 15, perInstance: 5, maxImpact: 45 },
|
|
430
455
|
// AEO — much lower baselines than spam (AEO is opt-in optimization)
|
|
431
456
|
"aeo/citable-facts": { baseImpact: 2, perInstance: 1, maxImpact: 25 },
|
|
432
457
|
"aeo/answer-first": { baseImpact: 3, perInstance: 1, maxImpact: 25 },
|
|
@@ -590,157 +615,157 @@ sampled = false) {
|
|
|
590
615
|
// Spam rules — always compute cross-page data, only push findings if enabled
|
|
591
616
|
const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
|
|
592
617
|
if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
|
|
593
|
-
findings
|
|
618
|
+
pushAll(findings, tag(nearDuplicate.findings));
|
|
594
619
|
}
|
|
595
620
|
const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
|
|
596
621
|
if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
|
|
597
|
-
findings
|
|
622
|
+
pushAll(findings, tag(entitySwap.findings));
|
|
598
623
|
}
|
|
599
624
|
const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
|
|
600
625
|
if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
|
|
601
|
-
findings
|
|
626
|
+
pushAll(findings, tag(thinContent.findings));
|
|
602
627
|
}
|
|
603
628
|
if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
|
|
604
|
-
findings
|
|
629
|
+
pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
|
|
605
630
|
}
|
|
606
631
|
if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
|
|
607
|
-
findings
|
|
632
|
+
pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
|
|
608
633
|
}
|
|
609
634
|
if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
|
|
610
|
-
findings
|
|
635
|
+
pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
|
|
611
636
|
}
|
|
612
637
|
if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
|
|
613
|
-
findings
|
|
638
|
+
pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
|
|
614
639
|
}
|
|
615
640
|
if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
|
|
616
|
-
findings
|
|
641
|
+
pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
|
|
617
642
|
}
|
|
618
643
|
// Content rules
|
|
619
644
|
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
620
|
-
findings
|
|
645
|
+
pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
|
|
621
646
|
}
|
|
622
647
|
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
623
|
-
findings
|
|
648
|
+
pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
624
649
|
}
|
|
625
650
|
if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
|
|
626
|
-
findings
|
|
651
|
+
pushAll(findings, tag(missingAuthorRule(pages)));
|
|
627
652
|
}
|
|
628
653
|
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
629
|
-
findings
|
|
654
|
+
pushAll(findings, tag(eeatSignalsRule(pages)));
|
|
630
655
|
}
|
|
631
656
|
// 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
|
|
632
657
|
// structure + image alt-text were tier-1 gaps in the blind-spot audit.
|
|
633
658
|
if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
|
|
634
|
-
findings
|
|
659
|
+
pushAll(findings, tag(titleUniquenessRule(pages)));
|
|
635
660
|
}
|
|
636
661
|
if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
|
|
637
|
-
findings
|
|
662
|
+
pushAll(findings, tag(headingStructureRule(pages)));
|
|
638
663
|
}
|
|
639
664
|
if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
|
|
640
|
-
findings
|
|
665
|
+
pushAll(findings, tag(imageAltTextRule(pages)));
|
|
641
666
|
}
|
|
642
667
|
if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
|
|
643
|
-
findings
|
|
668
|
+
pushAll(findings, tag(translationNoOpRule(pages)));
|
|
644
669
|
}
|
|
645
670
|
if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
|
|
646
|
-
findings
|
|
671
|
+
pushAll(findings, tag(regurgitatedContentRule(pages)));
|
|
647
672
|
}
|
|
648
673
|
if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
|
|
649
|
-
findings
|
|
674
|
+
pushAll(findings, tag(commonPhraseReuseRule(pages)));
|
|
650
675
|
}
|
|
651
676
|
if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
|
|
652
|
-
findings
|
|
677
|
+
pushAll(findings, tag(wikipediaParaphraseRule(pages)));
|
|
653
678
|
}
|
|
654
679
|
// Link rules — use the global link graph
|
|
655
680
|
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
656
|
-
findings
|
|
681
|
+
pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
657
682
|
}
|
|
658
683
|
if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
|
|
659
|
-
findings
|
|
684
|
+
pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
|
|
660
685
|
}
|
|
661
686
|
if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
|
|
662
687
|
if (rootUrl) {
|
|
663
|
-
findings
|
|
688
|
+
pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
|
|
664
689
|
}
|
|
665
690
|
}
|
|
666
691
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
667
|
-
findings
|
|
692
|
+
pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
|
|
668
693
|
}
|
|
669
694
|
if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
|
|
670
|
-
findings
|
|
695
|
+
pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
|
|
671
696
|
}
|
|
672
697
|
// Tech rules
|
|
673
698
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
674
|
-
findings
|
|
699
|
+
pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
675
700
|
}
|
|
676
701
|
if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
|
|
677
|
-
findings
|
|
702
|
+
pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
|
|
678
703
|
}
|
|
679
704
|
if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
|
|
680
|
-
findings
|
|
705
|
+
pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
|
|
681
706
|
}
|
|
682
707
|
if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
|
|
683
|
-
findings
|
|
708
|
+
pushAll(findings, tag(redirectChainRule(pages)));
|
|
684
709
|
}
|
|
685
710
|
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
686
|
-
findings
|
|
711
|
+
pushAll(findings, tag(soft404Rule(pages)));
|
|
687
712
|
}
|
|
688
713
|
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
689
714
|
// hreflang declarations on noindex'd pages are still bugs when they're
|
|
690
715
|
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
691
|
-
findings
|
|
716
|
+
pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
|
|
692
717
|
}
|
|
693
718
|
// 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
|
|
694
719
|
// the v0.4.x README without ever shipping. Now it does.
|
|
695
720
|
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
696
|
-
findings
|
|
721
|
+
pushAll(findings, tag(ogCompletenessRule(pages)));
|
|
697
722
|
}
|
|
698
723
|
// Schema rules
|
|
699
724
|
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
700
|
-
findings
|
|
725
|
+
pushAll(findings, tag(jsonLdValidRule(pages)));
|
|
701
726
|
}
|
|
702
727
|
if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
|
|
703
|
-
findings
|
|
728
|
+
pushAll(findings, tag(requiredFieldsRule(pages)));
|
|
704
729
|
}
|
|
705
730
|
if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
|
|
706
|
-
findings
|
|
731
|
+
pushAll(findings, tag(schemaConsistencyRule(pages)));
|
|
707
732
|
}
|
|
708
733
|
// AEO rules
|
|
709
734
|
if (isEnabled("aeo/freshness-signals")) {
|
|
710
|
-
findings
|
|
735
|
+
pushAll(findings, tag(freshnessSignalsRule(pages, {
|
|
711
736
|
maxStaleDays: resolvedRules.freshnessMaxStaleDays,
|
|
712
737
|
})));
|
|
713
738
|
}
|
|
714
739
|
if (isEnabled("aeo/faq-coverage")) {
|
|
715
|
-
findings
|
|
740
|
+
pushAll(findings, tag(faqCoverageRule(pages, {
|
|
716
741
|
minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
|
|
717
742
|
})));
|
|
718
743
|
}
|
|
719
744
|
if (isEnabled("aeo/answer-first")) {
|
|
720
|
-
findings
|
|
745
|
+
pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
|
|
721
746
|
maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
|
|
722
747
|
})));
|
|
723
748
|
}
|
|
724
749
|
if (isEnabled("aeo/citable-facts")) {
|
|
725
|
-
findings
|
|
750
|
+
pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
|
|
726
751
|
minFactsPerPage: resolvedRules.citableFactsMin,
|
|
727
752
|
targetFactsPerPage: resolvedRules.citableFactsTarget,
|
|
728
753
|
})));
|
|
729
754
|
}
|
|
730
755
|
if (isEnabled("aeo/content-modularity")) {
|
|
731
|
-
findings
|
|
756
|
+
pushAll(findings, tag(contentModularityRule(pages, {
|
|
732
757
|
maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
|
|
733
758
|
minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
|
|
734
759
|
})));
|
|
735
760
|
}
|
|
736
761
|
if (isEnabled("aeo/summary-bait")) {
|
|
737
|
-
findings
|
|
762
|
+
pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
|
|
738
763
|
}
|
|
739
764
|
// Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
|
|
740
765
|
// keyword-collision dropped due to high false-positive rates; see
|
|
741
766
|
// 2026-04-29 v0.4 redesign spec §4.3).
|
|
742
767
|
if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
|
|
743
|
-
findings
|
|
768
|
+
pushAll(findings, tag(urlPatternRule(pages)));
|
|
744
769
|
}
|
|
745
770
|
return findings;
|
|
746
771
|
}
|
|
@@ -830,8 +855,7 @@ function scoreFromFindings(findings, classification, pageCount = 0) {
|
|
|
830
855
|
// Each group's weighted impact lands in its category bucket.
|
|
831
856
|
const groups = new Map();
|
|
832
857
|
for (const finding of findings) {
|
|
833
|
-
const
|
|
834
|
-
const bucket = CATEGORY_MAP[namespace];
|
|
858
|
+
const bucket = categoryForRule(finding.ruleId);
|
|
835
859
|
if (!bucket)
|
|
836
860
|
continue;
|
|
837
861
|
if (bucket !== "audit")
|
|
@@ -867,8 +891,7 @@ function scoreFromFindings(findings, classification, pageCount = 0) {
|
|
|
867
891
|
integrity: 0, discoverability: 0, citation: 0, data: 0, audit: 0,
|
|
868
892
|
};
|
|
869
893
|
for (const [ruleId, group] of groups) {
|
|
870
|
-
const
|
|
871
|
-
const bucket = CATEGORY_MAP[namespace];
|
|
894
|
+
const bucket = categoryForRule(ruleId);
|
|
872
895
|
if (!bucket || bucket === "audit")
|
|
873
896
|
continue;
|
|
874
897
|
const impactSpec = RULE_IMPACTS[ruleId] ?? DEFAULT_RULE_IMPACT;
|
|
@@ -980,6 +1003,20 @@ function withDocsUrls(findings) {
|
|
|
980
1003
|
}
|
|
981
1004
|
return findings;
|
|
982
1005
|
}
|
|
1006
|
+
/**
|
|
1007
|
+
* Append every item of `items` to `target` in place. Use this instead of
|
|
1008
|
+
* `target.push(...items)` whenever `items` can be large. The spread form passes
|
|
1009
|
+
* each element as a separate call argument, and V8 caps argument count
|
|
1010
|
+
* (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
|
|
1011
|
+
* exceeded` on large inputs. A dense site makes the pairwise rules
|
|
1012
|
+
* (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
|
|
1013
|
+
* rule-aggregation push *before* enrichment was even reached. The loop has no
|
|
1014
|
+
* such limit. See tests/integration/large-corpus-no-overflow.test.ts.
|
|
1015
|
+
*/
|
|
1016
|
+
function pushAll(target, items) {
|
|
1017
|
+
for (const item of items)
|
|
1018
|
+
target.push(item);
|
|
1019
|
+
}
|
|
983
1020
|
async function collectHtmlFiles(directory) {
|
|
984
1021
|
const entries = await readdir(directory, { withFileTypes: true });
|
|
985
1022
|
const files = await Promise.all(entries.map(async (entry) => {
|
|
@@ -1015,7 +1052,11 @@ function composeSignals(...signals) {
|
|
|
1015
1052
|
}
|
|
1016
1053
|
return ac.signal;
|
|
1017
1054
|
}
|
|
1018
|
-
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop
|
|
1055
|
+
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
|
|
1056
|
+
// Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
|
|
1057
|
+
// Guards against a hostile/misconfigured sitemap eating the whole byte budget
|
|
1058
|
+
// or memory. 0 / undefined = no cap.
|
|
1059
|
+
maxBytes) {
|
|
1019
1060
|
try {
|
|
1020
1061
|
stats.total += 1;
|
|
1021
1062
|
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
@@ -1025,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
|
|
|
1025
1066
|
}
|
|
1026
1067
|
if (r.status < 200 || r.status >= 300)
|
|
1027
1068
|
return null;
|
|
1069
|
+
if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
|
|
1070
|
+
// eslint-disable-next-line no-console
|
|
1071
|
+
console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
|
|
1072
|
+
return null;
|
|
1073
|
+
}
|
|
1028
1074
|
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
1029
1075
|
}
|
|
1030
1076
|
catch (err) {
|
|
@@ -1182,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
|
|
|
1182
1228
|
}
|
|
1183
1229
|
return arr.slice(arr.length - n);
|
|
1184
1230
|
}
|
|
1185
|
-
|
|
1231
|
+
/** sitemaps.org caps an uncompressed sitemap at 50 MB. */
|
|
1232
|
+
const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
|
|
1233
|
+
/**
|
|
1234
|
+
* Max `<sitemapindex>` nesting depth we recurse through. The protocol only
|
|
1235
|
+
* defines a single level of nesting, but some sites nest deeper; 5 is generous
|
|
1236
|
+
* while still bounding work (and stack) on a hostile/misconfigured index that a
|
|
1237
|
+
* `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
|
|
1238
|
+
*/
|
|
1239
|
+
const SITEMAP_MAX_DEPTH = 5;
|
|
1240
|
+
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
|
|
1186
1241
|
visited.add(sitemapUrl);
|
|
1187
1242
|
const entries = parseSitemapUrlsWithLastmod(sitemapText);
|
|
1188
1243
|
if (!isSitemapIndex(sitemapText)) {
|
|
@@ -1196,6 +1251,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1196
1251
|
}
|
|
1197
1252
|
return { urls, lastmodByUrl };
|
|
1198
1253
|
}
|
|
1254
|
+
// It's a sitemap index. Stop recursing past the depth cap (the index itself
|
|
1255
|
+
// carries no page URLs, only child-sitemap refs, so returning empty is safe).
|
|
1256
|
+
if (depth >= maxDepth) {
|
|
1257
|
+
// eslint-disable-next-line no-console
|
|
1258
|
+
console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
|
|
1259
|
+
return { urls: [], lastmodByUrl: new Map() };
|
|
1260
|
+
}
|
|
1199
1261
|
const allUrls = [];
|
|
1200
1262
|
const allLastmodByUrl = new Map();
|
|
1201
1263
|
for (const entry of entries) {
|
|
@@ -1204,14 +1266,14 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1204
1266
|
throw signal.reason ?? new Error("aborted");
|
|
1205
1267
|
if (visited.has(childUrl))
|
|
1206
1268
|
continue;
|
|
1207
|
-
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
1269
|
+
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1208
1270
|
if (!child)
|
|
1209
1271
|
continue;
|
|
1210
1272
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
1211
1273
|
if (!childLike)
|
|
1212
1274
|
continue;
|
|
1213
|
-
const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1214
|
-
allUrls
|
|
1275
|
+
const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
|
|
1276
|
+
pushAll(allUrls, childUrls);
|
|
1215
1277
|
for (const [u, lm] of childLastmodByUrl) {
|
|
1216
1278
|
allLastmodByUrl.set(u, lm);
|
|
1217
1279
|
}
|
|
@@ -1220,7 +1282,7 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1220
1282
|
}
|
|
1221
1283
|
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
1222
1284
|
if (!origin)
|
|
1223
|
-
return { disallow: [], crawlDelaySec: 0 };
|
|
1285
|
+
return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
|
|
1224
1286
|
try {
|
|
1225
1287
|
const robotsUrl = `${origin}/robots.txt`;
|
|
1226
1288
|
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
@@ -1230,10 +1292,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
|
|
|
1230
1292
|
return {
|
|
1231
1293
|
disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
|
|
1232
1294
|
crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
|
|
1295
|
+
// `Sitemap:` directives hold absolute URLs (not origin-relative paths) and
|
|
1296
|
+
// there can be several. Surfaced so discovery can read the site's declared
|
|
1297
|
+
// sitemaps instead of guessing.
|
|
1298
|
+
sitemaps: parseSitemapDirectives(fetched.text),
|
|
1233
1299
|
};
|
|
1234
1300
|
}
|
|
1235
1301
|
catch {
|
|
1236
|
-
return { disallow: [], crawlDelaySec: 0 };
|
|
1302
|
+
return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
|
|
1237
1303
|
}
|
|
1238
1304
|
}
|
|
1239
1305
|
function sleep(ms) {
|
|
@@ -1249,7 +1315,12 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
1249
1315
|
function budgetExceeded(b) {
|
|
1250
1316
|
return b.cap > 0 && b.used >= b.cap;
|
|
1251
1317
|
}
|
|
1252
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null
|
|
1318
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
|
|
1319
|
+
// Backpressure salvage: when provided, every page body that comes back is
|
|
1320
|
+
// pushed into this caller-owned array as it's fetched. If the watchdog aborts
|
|
1321
|
+
// mid-crawl and this function throws, the caller still holds the partial set
|
|
1322
|
+
// (the local `pages` array would otherwise be lost with the stack frame).
|
|
1323
|
+
pageSink) {
|
|
1253
1324
|
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
1254
1325
|
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
1255
1326
|
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
@@ -1336,7 +1407,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1336
1407
|
else {
|
|
1337
1408
|
urlsToFetch = sampledUrls;
|
|
1338
1409
|
}
|
|
1339
|
-
|
|
1410
|
+
// Reuse the caller's salvage sink as the live page accumulator so a
|
|
1411
|
+
// mid-crawl watchdog abort leaves the already-fetched pages visible to
|
|
1412
|
+
// the caller. Falls back to a private array when no sink is passed.
|
|
1413
|
+
const pages = pageSink ?? [];
|
|
1340
1414
|
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
1341
1415
|
const sourceOrigin = (() => { try {
|
|
1342
1416
|
return new URL(source).origin;
|
|
@@ -1446,7 +1520,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1446
1520
|
}
|
|
1447
1521
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
1448
1522
|
const initialPage = { url: source, html: text };
|
|
1449
|
-
|
|
1523
|
+
// See note above: reuse the caller's salvage sink so a watchdog abort
|
|
1524
|
+
// during link-discovery crawling preserves the pages fetched so far.
|
|
1525
|
+
const pages = pageSink ?? [];
|
|
1526
|
+
pages.push(initialPage);
|
|
1450
1527
|
if (crawlDiscovery) {
|
|
1451
1528
|
let sourceOrigin;
|
|
1452
1529
|
try {
|
|
@@ -1458,6 +1535,92 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1458
1535
|
const knownCrawled = new Set([source]);
|
|
1459
1536
|
const allDiscoveredUrls = new Set([source]);
|
|
1460
1537
|
const maxDepth = 3;
|
|
1538
|
+
// Sitemap-first discovery (like Google). Before link-crawling, read the
|
|
1539
|
+
// sitemap(s) the site declares — link-crawl only reaches *linked* pages,
|
|
1540
|
+
// but a pSEO site's whole point is thousands of programmatic URLs that
|
|
1541
|
+
// may be sparsely linked (or behind a build-frozen, under-linked nav).
|
|
1542
|
+
// Sources of truth, in order:
|
|
1543
|
+
// 1. `Sitemap:` directives in robots.txt (there can be several)
|
|
1544
|
+
// 2. failing that, probe /sitemap.xml then /sitemap_index.xml
|
|
1545
|
+
// Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
|
|
1546
|
+
// link-crawl below then fills any remaining budget and dedups against
|
|
1547
|
+
// them. When no sitemap exists, this is a no-op and we crawl as before.
|
|
1548
|
+
if (sourceOrigin) {
|
|
1549
|
+
const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
|
|
1550
|
+
const probing = robotsForDiscovery.sitemaps.length === 0;
|
|
1551
|
+
const sitemapCandidates = probing
|
|
1552
|
+
? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
|
|
1553
|
+
: robotsForDiscovery.sitemaps;
|
|
1554
|
+
const visitedSitemaps = new Set();
|
|
1555
|
+
const sitemapListedUrls = [];
|
|
1556
|
+
for (const candidate of sitemapCandidates) {
|
|
1557
|
+
if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
|
|
1558
|
+
break;
|
|
1559
|
+
if (visitedSitemaps.has(candidate))
|
|
1560
|
+
continue;
|
|
1561
|
+
let smText;
|
|
1562
|
+
let smType;
|
|
1563
|
+
try {
|
|
1564
|
+
if (validateHop)
|
|
1565
|
+
await validateHop(candidate);
|
|
1566
|
+
const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1567
|
+
if (!fetched)
|
|
1568
|
+
continue;
|
|
1569
|
+
smText = fetched.text;
|
|
1570
|
+
smType = fetched.contentType;
|
|
1571
|
+
}
|
|
1572
|
+
catch {
|
|
1573
|
+
continue; // SSRF refusal, network error, etc. — skip this candidate
|
|
1574
|
+
}
|
|
1575
|
+
if (!(smType.includes("xml") || looksLikeSitemap(smText)))
|
|
1576
|
+
continue;
|
|
1577
|
+
const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
|
|
1578
|
+
pushAll(sitemapListedUrls, discoveredSmUrls);
|
|
1579
|
+
// When probing the conventional paths, stop at the first that hits.
|
|
1580
|
+
if (probing && discoveredSmUrls.length > 0)
|
|
1581
|
+
break;
|
|
1582
|
+
}
|
|
1583
|
+
// Same-origin + robots-aware filter, deduped against what we have.
|
|
1584
|
+
const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
|
|
1585
|
+
if (knownCrawled.has(u))
|
|
1586
|
+
return false;
|
|
1587
|
+
try {
|
|
1588
|
+
const parsed = new URL(u);
|
|
1589
|
+
if (parsed.origin !== sourceOrigin)
|
|
1590
|
+
return false;
|
|
1591
|
+
if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
|
|
1592
|
+
skippedByRobots.push(u);
|
|
1593
|
+
return false;
|
|
1594
|
+
}
|
|
1595
|
+
return true;
|
|
1596
|
+
}
|
|
1597
|
+
catch {
|
|
1598
|
+
return false;
|
|
1599
|
+
}
|
|
1600
|
+
});
|
|
1601
|
+
for (const u of seedUrls)
|
|
1602
|
+
allDiscoveredUrls.add(u);
|
|
1603
|
+
// Cap the seed fetch. With a sampling budget, fit under it; without one
|
|
1604
|
+
// (the default "audit everything" path) bound by maxCrawlDiscovered, the
|
|
1605
|
+
// same ceiling the link-crawl honors — otherwise a homepage audit of a
|
|
1606
|
+
// site with a 50k-URL sitemap would try to fetch all of them (the link
|
|
1607
|
+
// crawl never could, so this would be an unbounded-egress regression).
|
|
1608
|
+
const seedToFetch = discoveryBudget > 0
|
|
1609
|
+
? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
|
|
1610
|
+
: seedUrls.slice(0, maxCrawlDiscovered);
|
|
1611
|
+
if (seedToFetch.length > 0) {
|
|
1612
|
+
await runWithConcurrency(seedToFetch, concurrency, async (url) => {
|
|
1613
|
+
if (budgetExceeded(byteBudget))
|
|
1614
|
+
return;
|
|
1615
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
1616
|
+
knownCrawled.add(url);
|
|
1617
|
+
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
1618
|
+
byteBudget.used += result.html.length;
|
|
1619
|
+
pages.push(result);
|
|
1620
|
+
}
|
|
1621
|
+
});
|
|
1622
|
+
}
|
|
1623
|
+
}
|
|
1461
1624
|
for (let depth = 0; depth < maxDepth; depth += 1) {
|
|
1462
1625
|
// Stop if we've hit the discovery budget
|
|
1463
1626
|
if (discoveryBudget > 0 && pages.length >= discoveryBudget)
|
|
@@ -1519,7 +1682,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1519
1682
|
knownCrawled.add(url);
|
|
1520
1683
|
}
|
|
1521
1684
|
});
|
|
1522
|
-
pages
|
|
1685
|
+
pushAll(pages, newPages);
|
|
1523
1686
|
if (newPages.length === 0)
|
|
1524
1687
|
break;
|
|
1525
1688
|
}
|
|
@@ -1610,6 +1773,13 @@ export async function auditSource(source, options) {
|
|
|
1610
1773
|
const backpressureEnabled = options?.backpressure !== false;
|
|
1611
1774
|
const backpressureAbort = new AbortController();
|
|
1612
1775
|
let backpressureError = null;
|
|
1776
|
+
// Set once we've decided to salvage a partial report after a watchdog abort.
|
|
1777
|
+
// From that point `throwIfAborted` must NOT re-throw the backpressure error —
|
|
1778
|
+
// the watchdog already did its job (stopped fetching); the rest of the
|
|
1779
|
+
// pipeline runs over the pages collected so far and the truncation is
|
|
1780
|
+
// surfaced on the summary instead.
|
|
1781
|
+
let truncated = false;
|
|
1782
|
+
let truncatedReason;
|
|
1613
1783
|
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
1614
1784
|
const observer = new FetchObserver();
|
|
1615
1785
|
// 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
|
|
@@ -1651,12 +1821,32 @@ export async function auditSource(source, options) {
|
|
|
1651
1821
|
backpressureAbort.abort(backpressureError);
|
|
1652
1822
|
}
|
|
1653
1823
|
};
|
|
1824
|
+
// Flip the run into salvage mode after a watchdog abort: record the reason so
|
|
1825
|
+
// assembly sets summary.truncated, and from here `throwIfAborted` will no
|
|
1826
|
+
// longer re-throw the backpressure error. Idempotent. Returns true when a
|
|
1827
|
+
// backpressure abort was present to salvage.
|
|
1828
|
+
function salvageBackpressure() {
|
|
1829
|
+
if (!backpressureError)
|
|
1830
|
+
return false;
|
|
1831
|
+
truncated = true;
|
|
1832
|
+
truncatedReason = backpressureError.message;
|
|
1833
|
+
return true;
|
|
1834
|
+
}
|
|
1654
1835
|
function throwIfAborted() {
|
|
1655
|
-
|
|
1656
|
-
|
|
1836
|
+
// An EXTERNAL abort (ctrl-C, parent timeout) is always fatal: the caller
|
|
1837
|
+
// asked to stop, not to degrade. Check it first so it wins over salvage.
|
|
1657
1838
|
if (externalSignal?.aborted) {
|
|
1658
1839
|
throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
|
|
1659
1840
|
}
|
|
1841
|
+
// A backpressure abort is salvageable. Once we've committed to a partial
|
|
1842
|
+
// report (`truncated`), swallow it and let the pipeline finish over the
|
|
1843
|
+
// pages collected so far. Before that commit, the loader-boundary catch
|
|
1844
|
+
// handles it; this guard only fires on the rare path where the loader
|
|
1845
|
+
// returned normally (e.g. a fetch mock that ignores the abort signal) yet
|
|
1846
|
+
// the watchdog still voted to abort — salvage rather than crash.
|
|
1847
|
+
if (backpressureError && !truncated) {
|
|
1848
|
+
salvageBackpressure();
|
|
1849
|
+
}
|
|
1660
1850
|
}
|
|
1661
1851
|
const resolvedRules = {
|
|
1662
1852
|
nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
|
|
@@ -1811,13 +2001,26 @@ export async function auditSource(source, options) {
|
|
|
1811
2001
|
}
|
|
1812
2002
|
: undefined;
|
|
1813
2003
|
const pinnedPages = [];
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
2004
|
+
try {
|
|
2005
|
+
await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
|
|
2006
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
|
|
2007
|
+
if (result) {
|
|
2008
|
+
fetchByteBudget.used += result.html.length;
|
|
2009
|
+
pinnedPages.push(result);
|
|
2010
|
+
}
|
|
2011
|
+
});
|
|
2012
|
+
}
|
|
2013
|
+
catch (err) {
|
|
2014
|
+
// Same salvage contract as the sitemap/crawl path: a watchdog abort
|
|
2015
|
+
// mid-fetch keeps the pages already collected in `pinnedPages`. Any other
|
|
2016
|
+
// error (external abort, SSRF rejection) is fatal — re-throw it.
|
|
2017
|
+
if (err instanceof OriginDegradedError) {
|
|
2018
|
+
salvageBackpressure();
|
|
1819
2019
|
}
|
|
1820
|
-
|
|
2020
|
+
else {
|
|
2021
|
+
throw err;
|
|
2022
|
+
}
|
|
2023
|
+
}
|
|
1821
2024
|
loadedPagesRaw = pinnedPages;
|
|
1822
2025
|
// No sitemap context in pinned mode
|
|
1823
2026
|
sitemapUrlSet = undefined;
|
|
@@ -1826,12 +2029,46 @@ export async function auditSource(source, options) {
|
|
|
1826
2029
|
scrapePlan = undefined;
|
|
1827
2030
|
}
|
|
1828
2031
|
else {
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
2032
|
+
// Salvage sink: loadPagesFromSource fills this incrementally as pages come
|
|
2033
|
+
// back. If the backpressure watchdog aborts mid-crawl the call throws an
|
|
2034
|
+
// OriginDegradedError and the function's own return value is lost — but the
|
|
2035
|
+
// already-fetched pages survive here, so we recover them and continue the
|
|
2036
|
+
// pipeline with a `truncated` flag instead of throwing the whole run away.
|
|
2037
|
+
const pageSink = [];
|
|
2038
|
+
try {
|
|
2039
|
+
const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
|
|
2040
|
+
loadedPagesRaw = loaded.pages;
|
|
2041
|
+
sitemapUrlSet = loaded.sitemapUrls;
|
|
2042
|
+
sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
|
|
2043
|
+
discoveredUrlCount = loaded.discoveredUrlCount;
|
|
2044
|
+
scrapePlan = loaded.scrapePlan;
|
|
2045
|
+
}
|
|
2046
|
+
catch (err) {
|
|
2047
|
+
// Only the watchdog abort is salvageable. An external abort (ctrl-C /
|
|
2048
|
+
// parent timeout) or any other error is fatal — re-throw it untouched so
|
|
2049
|
+
// --no-backpressure and ctrl-C behaviour are unchanged.
|
|
2050
|
+
if (err instanceof OriginDegradedError) {
|
|
2051
|
+
// Prefer the canonical backpressureError message (same object the
|
|
2052
|
+
// monitor raised); fall back to the caught error if somehow distinct.
|
|
2053
|
+
if (!salvageBackpressure()) {
|
|
2054
|
+
truncated = true;
|
|
2055
|
+
truncatedReason = err.message;
|
|
2056
|
+
}
|
|
2057
|
+
// Recover whatever was fetched before the abort. The sink is the same
|
|
2058
|
+
// array loadPagesFromSource was pushing into, so it holds the partial
|
|
2059
|
+
// page set even though the function never reached its `return`.
|
|
2060
|
+
loadedPagesRaw = pageSink;
|
|
2061
|
+
// No sitemap/discovery context survives a mid-sitemap abort; the
|
|
2062
|
+
// downstream classifier falls back to the loaded page URLs.
|
|
2063
|
+
sitemapUrlSet = undefined;
|
|
2064
|
+
sitemapLastmodByUrl = undefined;
|
|
2065
|
+
discoveredUrlCount = undefined;
|
|
2066
|
+
scrapePlan = undefined;
|
|
2067
|
+
}
|
|
2068
|
+
else {
|
|
2069
|
+
throw err;
|
|
2070
|
+
}
|
|
2071
|
+
}
|
|
1835
2072
|
}
|
|
1836
2073
|
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
1837
2074
|
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
@@ -2053,29 +2290,29 @@ export async function auditSource(source, options) {
|
|
|
2053
2290
|
// Site-wide rules (run once, outside group loop)
|
|
2054
2291
|
if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
|
|
2055
2292
|
const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
|
|
2056
|
-
allFindings
|
|
2293
|
+
pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2057
2294
|
if (robotsTxtContent) {
|
|
2058
2295
|
const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
|
|
2059
|
-
allFindings
|
|
2296
|
+
pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2060
2297
|
}
|
|
2061
2298
|
}
|
|
2062
2299
|
// AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
|
|
2063
2300
|
// and robots-compliance); page-group rule lists govern per-page AEO rules only.
|
|
2064
2301
|
const llmsFindings = await llmsTxtRule(source, { timeoutMs });
|
|
2065
|
-
allFindings
|
|
2302
|
+
pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2066
2303
|
if (robotsTxtContent) {
|
|
2067
2304
|
const crawlerFindings = crawlerAccessRule(robotsTxtContent);
|
|
2068
|
-
allFindings
|
|
2305
|
+
pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2069
2306
|
}
|
|
2070
2307
|
// Data source comparison rules
|
|
2071
2308
|
if (options?.dataSource?.records && options.dataSource.records.length > 0) {
|
|
2072
2309
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
|
|
2073
2310
|
const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
|
|
2074
|
-
allFindings
|
|
2311
|
+
pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2075
2312
|
}
|
|
2076
2313
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
|
|
2077
2314
|
const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
|
|
2078
|
-
allFindings
|
|
2315
|
+
pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2079
2316
|
}
|
|
2080
2317
|
}
|
|
2081
2318
|
for (const [groupName, groupPages] of classified) {
|
|
@@ -2095,7 +2332,7 @@ export async function auditSource(source, options) {
|
|
|
2095
2332
|
// because the nav paths between locale-specific currency-converter URLs
|
|
2096
2333
|
// were not in the pinned set).
|
|
2097
2334
|
isSampledAudit || hasPinnedUrlsEarly);
|
|
2098
|
-
allFindings
|
|
2335
|
+
pushAll(allFindings, findings);
|
|
2099
2336
|
groupPageCounts[groupName] = groupPages.length;
|
|
2100
2337
|
// v0.4.3: per-group scoring uses the same site-classification profile so
|
|
2101
2338
|
// group-level risk numbers reflect the same severity / confidence remaps
|
|
@@ -2113,7 +2350,7 @@ export async function auditSource(source, options) {
|
|
|
2113
2350
|
(auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
|
|
2114
2351
|
if (isValueAddEnabled) {
|
|
2115
2352
|
const valueAddFindings = valueAddRule(parsedPages, allFindings);
|
|
2116
|
-
allFindings
|
|
2353
|
+
pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2117
2354
|
}
|
|
2118
2355
|
}
|
|
2119
2356
|
// Enrich findings: cluster pairwise, detect templates, assign effort
|
|
@@ -2237,6 +2474,14 @@ export async function auditSource(source, options) {
|
|
|
2237
2474
|
? [...parsedPages.map((p) => p.url)].sort()
|
|
2238
2475
|
: undefined,
|
|
2239
2476
|
};
|
|
2477
|
+
// Partial-report flag: the backpressure watchdog aborted mid-crawl and we
|
|
2478
|
+
// salvaged whatever pages had been fetched. Consumers MUST treat coverage as
|
|
2479
|
+
// a lower bound (counts/verdict are partial). Only set when actually
|
|
2480
|
+
// truncated so complete runs keep `truncated` absent.
|
|
2481
|
+
if (truncated) {
|
|
2482
|
+
summary.truncated = true;
|
|
2483
|
+
summary.truncatedReason = truncatedReason;
|
|
2484
|
+
}
|
|
2240
2485
|
if (cacheConfig) {
|
|
2241
2486
|
summary.cacheStats = cacheStats;
|
|
2242
2487
|
}
|