@pseolint/core 0.6.4 → 0.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/dist/ai/tools/fetch-sitemap.js +2 -1
- package/dist/ai/tools/fetch-sitemap.js.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +421 -85
- package/dist/auditor.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +27 -5
- package/dist/enrich-findings.js.map +1 -1
- package/dist/parser.d.ts.map +1 -1
- package/dist/parser.js +17 -1
- package/dist/parser.js.map +1 -1
- package/dist/rules/content/title-uniqueness.d.ts.map +1 -1
- package/dist/rules/content/title-uniqueness.js +13 -0
- package/dist/rules/content/title-uniqueness.js.map +1 -1
- package/dist/rules/content/unique-value.d.ts.map +1 -1
- package/dist/rules/content/unique-value.js +29 -4
- package/dist/rules/content/unique-value.js.map +1 -1
- package/dist/site-classifier.d.ts.map +1 -1
- package/dist/site-classifier.js +7 -1
- package/dist/site-classifier.js.map +1 -1
- package/dist/stratified-sample.js +2 -1
- package/dist/stratified-sample.js.map +1 -1
- package/dist/types.d.ts +48 -3
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +14 -2
- package/dist/types.js.map +1 -1
- package/package.json +5 -3
- package/schemas/audit-summary.schema.json +300 -0
package/dist/auditor.js
CHANGED
|
@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
|
|
|
34
34
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
35
35
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
36
36
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
37
|
-
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
37
|
+
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
|
|
38
38
|
import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
|
|
39
39
|
import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
|
|
40
40
|
import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
|
|
@@ -615,157 +615,157 @@ sampled = false) {
|
|
|
615
615
|
// Spam rules — always compute cross-page data, only push findings if enabled
|
|
616
616
|
const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
|
|
617
617
|
if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
|
|
618
|
-
findings
|
|
618
|
+
pushAll(findings, tag(nearDuplicate.findings));
|
|
619
619
|
}
|
|
620
620
|
const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
|
|
621
621
|
if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
|
|
622
|
-
findings
|
|
622
|
+
pushAll(findings, tag(entitySwap.findings));
|
|
623
623
|
}
|
|
624
624
|
const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
|
|
625
625
|
if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
|
|
626
|
-
findings
|
|
626
|
+
pushAll(findings, tag(thinContent.findings));
|
|
627
627
|
}
|
|
628
628
|
if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
|
|
629
|
-
findings
|
|
629
|
+
pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
|
|
630
630
|
}
|
|
631
631
|
if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
|
|
632
|
-
findings
|
|
632
|
+
pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
|
|
633
633
|
}
|
|
634
634
|
if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
|
|
635
|
-
findings
|
|
635
|
+
pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
|
|
636
636
|
}
|
|
637
637
|
if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
|
|
638
|
-
findings
|
|
638
|
+
pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
|
|
639
639
|
}
|
|
640
640
|
if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
|
|
641
|
-
findings
|
|
641
|
+
pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
|
|
642
642
|
}
|
|
643
643
|
// Content rules
|
|
644
644
|
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
645
|
-
findings
|
|
645
|
+
pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
|
|
646
646
|
}
|
|
647
647
|
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
648
|
-
findings
|
|
648
|
+
pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
649
649
|
}
|
|
650
650
|
if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
|
|
651
|
-
findings
|
|
651
|
+
pushAll(findings, tag(missingAuthorRule(pages)));
|
|
652
652
|
}
|
|
653
653
|
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
654
|
-
findings
|
|
654
|
+
pushAll(findings, tag(eeatSignalsRule(pages)));
|
|
655
655
|
}
|
|
656
656
|
// 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
|
|
657
657
|
// structure + image alt-text were tier-1 gaps in the blind-spot audit.
|
|
658
658
|
if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
|
|
659
|
-
findings
|
|
659
|
+
pushAll(findings, tag(titleUniquenessRule(pages)));
|
|
660
660
|
}
|
|
661
661
|
if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
|
|
662
|
-
findings
|
|
662
|
+
pushAll(findings, tag(headingStructureRule(pages)));
|
|
663
663
|
}
|
|
664
664
|
if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
|
|
665
|
-
findings
|
|
665
|
+
pushAll(findings, tag(imageAltTextRule(pages)));
|
|
666
666
|
}
|
|
667
667
|
if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
|
|
668
|
-
findings
|
|
668
|
+
pushAll(findings, tag(translationNoOpRule(pages)));
|
|
669
669
|
}
|
|
670
670
|
if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
|
|
671
|
-
findings
|
|
671
|
+
pushAll(findings, tag(regurgitatedContentRule(pages)));
|
|
672
672
|
}
|
|
673
673
|
if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
|
|
674
|
-
findings
|
|
674
|
+
pushAll(findings, tag(commonPhraseReuseRule(pages)));
|
|
675
675
|
}
|
|
676
676
|
if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
|
|
677
|
-
findings
|
|
677
|
+
pushAll(findings, tag(wikipediaParaphraseRule(pages)));
|
|
678
678
|
}
|
|
679
679
|
// Link rules — use the global link graph
|
|
680
680
|
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
681
|
-
findings
|
|
681
|
+
pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
682
682
|
}
|
|
683
683
|
if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
|
|
684
|
-
findings
|
|
684
|
+
pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
|
|
685
685
|
}
|
|
686
686
|
if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
|
|
687
687
|
if (rootUrl) {
|
|
688
|
-
findings
|
|
688
|
+
pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
|
|
689
689
|
}
|
|
690
690
|
}
|
|
691
691
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
692
|
-
findings
|
|
692
|
+
pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
|
|
693
693
|
}
|
|
694
694
|
if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
|
|
695
|
-
findings
|
|
695
|
+
pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
|
|
696
696
|
}
|
|
697
697
|
// Tech rules
|
|
698
698
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
699
|
-
findings
|
|
699
|
+
pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
700
700
|
}
|
|
701
701
|
if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
|
|
702
|
-
findings
|
|
702
|
+
pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
|
|
703
703
|
}
|
|
704
704
|
if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
|
|
705
|
-
findings
|
|
705
|
+
pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
|
|
706
706
|
}
|
|
707
707
|
if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
|
|
708
|
-
findings
|
|
708
|
+
pushAll(findings, tag(redirectChainRule(pages)));
|
|
709
709
|
}
|
|
710
710
|
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
711
|
-
findings
|
|
711
|
+
pushAll(findings, tag(soft404Rule(pages)));
|
|
712
712
|
}
|
|
713
713
|
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
714
714
|
// hreflang declarations on noindex'd pages are still bugs when they're
|
|
715
715
|
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
716
|
-
findings
|
|
716
|
+
pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
|
|
717
717
|
}
|
|
718
718
|
// 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
|
|
719
719
|
// the v0.4.x README without ever shipping. Now it does.
|
|
720
720
|
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
721
|
-
findings
|
|
721
|
+
pushAll(findings, tag(ogCompletenessRule(pages)));
|
|
722
722
|
}
|
|
723
723
|
// Schema rules
|
|
724
724
|
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
725
|
-
findings
|
|
725
|
+
pushAll(findings, tag(jsonLdValidRule(pages)));
|
|
726
726
|
}
|
|
727
727
|
if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
|
|
728
|
-
findings
|
|
728
|
+
pushAll(findings, tag(requiredFieldsRule(pages)));
|
|
729
729
|
}
|
|
730
730
|
if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
|
|
731
|
-
findings
|
|
731
|
+
pushAll(findings, tag(schemaConsistencyRule(pages)));
|
|
732
732
|
}
|
|
733
733
|
// AEO rules
|
|
734
734
|
if (isEnabled("aeo/freshness-signals")) {
|
|
735
|
-
findings
|
|
735
|
+
pushAll(findings, tag(freshnessSignalsRule(pages, {
|
|
736
736
|
maxStaleDays: resolvedRules.freshnessMaxStaleDays,
|
|
737
737
|
})));
|
|
738
738
|
}
|
|
739
739
|
if (isEnabled("aeo/faq-coverage")) {
|
|
740
|
-
findings
|
|
740
|
+
pushAll(findings, tag(faqCoverageRule(pages, {
|
|
741
741
|
minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
|
|
742
742
|
})));
|
|
743
743
|
}
|
|
744
744
|
if (isEnabled("aeo/answer-first")) {
|
|
745
|
-
findings
|
|
745
|
+
pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
|
|
746
746
|
maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
|
|
747
747
|
})));
|
|
748
748
|
}
|
|
749
749
|
if (isEnabled("aeo/citable-facts")) {
|
|
750
|
-
findings
|
|
750
|
+
pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
|
|
751
751
|
minFactsPerPage: resolvedRules.citableFactsMin,
|
|
752
752
|
targetFactsPerPage: resolvedRules.citableFactsTarget,
|
|
753
753
|
})));
|
|
754
754
|
}
|
|
755
755
|
if (isEnabled("aeo/content-modularity")) {
|
|
756
|
-
findings
|
|
756
|
+
pushAll(findings, tag(contentModularityRule(pages, {
|
|
757
757
|
maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
|
|
758
758
|
minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
|
|
759
759
|
})));
|
|
760
760
|
}
|
|
761
761
|
if (isEnabled("aeo/summary-bait")) {
|
|
762
|
-
findings
|
|
762
|
+
pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
|
|
763
763
|
}
|
|
764
764
|
// Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
|
|
765
765
|
// keyword-collision dropped due to high false-positive rates; see
|
|
766
766
|
// 2026-04-29 v0.4 redesign spec §4.3).
|
|
767
767
|
if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
|
|
768
|
-
findings
|
|
768
|
+
pushAll(findings, tag(urlPatternRule(pages)));
|
|
769
769
|
}
|
|
770
770
|
return findings;
|
|
771
771
|
}
|
|
@@ -1003,6 +1003,20 @@ function withDocsUrls(findings) {
|
|
|
1003
1003
|
}
|
|
1004
1004
|
return findings;
|
|
1005
1005
|
}
|
|
1006
|
+
/**
|
|
1007
|
+
* Append every item of `items` to `target` in place. Use this instead of
|
|
1008
|
+
* `target.push(...items)` whenever `items` can be large. The spread form passes
|
|
1009
|
+
* each element as a separate call argument, and V8 caps argument count
|
|
1010
|
+
* (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
|
|
1011
|
+
* exceeded` on large inputs. A dense site makes the pairwise rules
|
|
1012
|
+
* (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
|
|
1013
|
+
* rule-aggregation push *before* enrichment was even reached. The loop has no
|
|
1014
|
+
* such limit. See tests/integration/large-corpus-no-overflow.test.ts.
|
|
1015
|
+
*/
|
|
1016
|
+
function pushAll(target, items) {
|
|
1017
|
+
for (const item of items)
|
|
1018
|
+
target.push(item);
|
|
1019
|
+
}
|
|
1006
1020
|
async function collectHtmlFiles(directory) {
|
|
1007
1021
|
const entries = await readdir(directory, { withFileTypes: true });
|
|
1008
1022
|
const files = await Promise.all(entries.map(async (entry) => {
|
|
@@ -1038,7 +1052,11 @@ function composeSignals(...signals) {
|
|
|
1038
1052
|
}
|
|
1039
1053
|
return ac.signal;
|
|
1040
1054
|
}
|
|
1041
|
-
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop
|
|
1055
|
+
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
|
|
1056
|
+
// Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
|
|
1057
|
+
// Guards against a hostile/misconfigured sitemap eating the whole byte budget
|
|
1058
|
+
// or memory. 0 / undefined = no cap.
|
|
1059
|
+
maxBytes) {
|
|
1042
1060
|
try {
|
|
1043
1061
|
stats.total += 1;
|
|
1044
1062
|
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
@@ -1048,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
|
|
|
1048
1066
|
}
|
|
1049
1067
|
if (r.status < 200 || r.status >= 300)
|
|
1050
1068
|
return null;
|
|
1069
|
+
if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
|
|
1070
|
+
// eslint-disable-next-line no-console
|
|
1071
|
+
console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
|
|
1072
|
+
return null;
|
|
1073
|
+
}
|
|
1051
1074
|
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
1052
1075
|
}
|
|
1053
1076
|
catch (err) {
|
|
@@ -1205,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
|
|
|
1205
1228
|
}
|
|
1206
1229
|
return arr.slice(arr.length - n);
|
|
1207
1230
|
}
|
|
1208
|
-
|
|
1231
|
+
/** sitemaps.org caps an uncompressed sitemap at 50 MB. */
|
|
1232
|
+
const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
|
|
1233
|
+
/**
|
|
1234
|
+
* Max `<sitemapindex>` nesting depth we recurse through. The protocol only
|
|
1235
|
+
* defines a single level of nesting, but some sites nest deeper; 5 is generous
|
|
1236
|
+
* while still bounding work (and stack) on a hostile/misconfigured index that a
|
|
1237
|
+
* `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
|
|
1238
|
+
*/
|
|
1239
|
+
const SITEMAP_MAX_DEPTH = 5;
|
|
1240
|
+
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
|
|
1209
1241
|
visited.add(sitemapUrl);
|
|
1210
1242
|
const entries = parseSitemapUrlsWithLastmod(sitemapText);
|
|
1211
1243
|
if (!isSitemapIndex(sitemapText)) {
|
|
@@ -1217,33 +1249,50 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1217
1249
|
lastmodByUrl.set(entry.url, entry.lastmod);
|
|
1218
1250
|
}
|
|
1219
1251
|
}
|
|
1220
|
-
return { urls, lastmodByUrl };
|
|
1252
|
+
return { urls, lastmodByUrl, childTotal: 0, childFailed: 0 };
|
|
1253
|
+
}
|
|
1254
|
+
// It's a sitemap index. Past the depth cap we stop recursing — but the
|
|
1255
|
+
// children we DON'T walk are unreached coverage, so report them as failed.
|
|
1256
|
+
if (depth >= maxDepth) {
|
|
1257
|
+
// eslint-disable-next-line no-console
|
|
1258
|
+
console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
|
|
1259
|
+
return { urls: [], lastmodByUrl: new Map(), childTotal: entries.length, childFailed: entries.length };
|
|
1221
1260
|
}
|
|
1222
1261
|
const allUrls = [];
|
|
1223
1262
|
const allLastmodByUrl = new Map();
|
|
1263
|
+
let childTotal = 0;
|
|
1264
|
+
let childFailed = 0;
|
|
1224
1265
|
for (const entry of entries) {
|
|
1225
1266
|
const childUrl = entry.url;
|
|
1226
1267
|
if (signal?.aborted)
|
|
1227
1268
|
throw signal.reason ?? new Error("aborted");
|
|
1269
|
+
childTotal += 1;
|
|
1228
1270
|
if (visited.has(childUrl))
|
|
1271
|
+
continue; // already walked (cyclic index) — not a failure
|
|
1272
|
+
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1273
|
+
if (!child) {
|
|
1274
|
+
childFailed += 1;
|
|
1229
1275
|
continue;
|
|
1230
|
-
|
|
1231
|
-
if (!child)
|
|
1232
|
-
continue;
|
|
1276
|
+
}
|
|
1233
1277
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
1234
|
-
if (!childLike)
|
|
1278
|
+
if (!childLike) {
|
|
1279
|
+
childFailed += 1;
|
|
1235
1280
|
continue;
|
|
1236
|
-
|
|
1237
|
-
|
|
1281
|
+
}
|
|
1282
|
+
const { urls: childUrls, lastmodByUrl: childLastmodByUrl, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
|
|
1283
|
+
pushAll(allUrls, childUrls);
|
|
1238
1284
|
for (const [u, lm] of childLastmodByUrl) {
|
|
1239
1285
|
allLastmodByUrl.set(u, lm);
|
|
1240
1286
|
}
|
|
1287
|
+
// Accumulate nested index structure (a child that is itself an index).
|
|
1288
|
+
childTotal += ct;
|
|
1289
|
+
childFailed += cf;
|
|
1241
1290
|
}
|
|
1242
|
-
return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
|
|
1291
|
+
return { urls: allUrls, lastmodByUrl: allLastmodByUrl, childTotal, childFailed };
|
|
1243
1292
|
}
|
|
1244
1293
|
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
1245
1294
|
if (!origin)
|
|
1246
|
-
return { disallow: [], crawlDelaySec: 0 };
|
|
1295
|
+
return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
|
|
1247
1296
|
try {
|
|
1248
1297
|
const robotsUrl = `${origin}/robots.txt`;
|
|
1249
1298
|
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
@@ -1253,10 +1302,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
|
|
|
1253
1302
|
return {
|
|
1254
1303
|
disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
|
|
1255
1304
|
crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
|
|
1305
|
+
// `Sitemap:` directives are absolute URLs (not origin-relative) and
|
|
1306
|
+
// there can be several. Surfaced so discovery can read the site's declared
|
|
1307
|
+
// sitemaps instead of guessing.
|
|
1308
|
+
sitemaps: parseSitemapDirectives(fetched.text),
|
|
1256
1309
|
};
|
|
1257
1310
|
}
|
|
1258
1311
|
catch {
|
|
1259
|
-
return { disallow: [], crawlDelaySec: 0 };
|
|
1312
|
+
return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
|
|
1260
1313
|
}
|
|
1261
1314
|
}
|
|
1262
1315
|
function sleep(ms) {
|
|
@@ -1272,7 +1325,12 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
1272
1325
|
function budgetExceeded(b) {
|
|
1273
1326
|
return b.cap > 0 && b.used >= b.cap;
|
|
1274
1327
|
}
|
|
1275
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null
|
|
1328
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
|
|
1329
|
+
// Backpressure salvage: when provided, every page body that comes back is
|
|
1330
|
+
// pushed into this caller-owned array as it's fetched. If the watchdog aborts
|
|
1331
|
+
// mid-crawl and this function throws, the caller still holds the partial set
|
|
1332
|
+
// (the local `pages` array would otherwise be lost with the stack frame).
|
|
1333
|
+
pageSink) {
|
|
1276
1334
|
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
1277
1335
|
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
1278
1336
|
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
@@ -1332,7 +1390,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1332
1390
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
1333
1391
|
if (isXml) {
|
|
1334
1392
|
const visited = new Set();
|
|
1335
|
-
const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1393
|
+
const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl, childTotal: sitemapChildTotal, childFailed: sitemapChildFailed } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1336
1394
|
// If we have a budget, sample from sitemap URLs before fetching
|
|
1337
1395
|
const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
1338
1396
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
@@ -1359,7 +1417,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1359
1417
|
else {
|
|
1360
1418
|
urlsToFetch = sampledUrls;
|
|
1361
1419
|
}
|
|
1362
|
-
|
|
1420
|
+
// Reuse the caller's salvage sink as the live page accumulator so a
|
|
1421
|
+
// mid-crawl watchdog abort leaves the already-fetched pages visible to
|
|
1422
|
+
// the caller. Falls back to a private array when no sink is passed.
|
|
1423
|
+
const pages = pageSink ?? [];
|
|
1363
1424
|
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
1364
1425
|
const sourceOrigin = (() => { try {
|
|
1365
1426
|
return new URL(source).origin;
|
|
@@ -1465,11 +1526,14 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1465
1526
|
});
|
|
1466
1527
|
}
|
|
1467
1528
|
}
|
|
1468
|
-
return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
|
|
1529
|
+
return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, declaredSitemapUrlCount: allSitemapUrls.length, sitemapChildTotal, sitemapChildFailed, scrapePlan };
|
|
1469
1530
|
}
|
|
1470
1531
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
1471
1532
|
const initialPage = { url: source, html: text };
|
|
1472
|
-
|
|
1533
|
+
// See note above: reuse the caller's salvage sink so a watchdog abort
|
|
1534
|
+
// during link-discovery crawling preserves the pages fetched so far.
|
|
1535
|
+
const pages = pageSink ?? [];
|
|
1536
|
+
pages.push(initialPage);
|
|
1473
1537
|
if (crawlDiscovery) {
|
|
1474
1538
|
let sourceOrigin;
|
|
1475
1539
|
try {
|
|
@@ -1481,6 +1545,106 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1481
1545
|
const knownCrawled = new Set([source]);
|
|
1482
1546
|
const allDiscoveredUrls = new Set([source]);
|
|
1483
1547
|
const maxDepth = 3;
|
|
1548
|
+
// Total URLs the discovered sitemap(s) declare — the basis for the
|
|
1549
|
+
// caller's coverage guardrail. Undefined when no sitemap is found.
|
|
1550
|
+
let declaredSitemapUrlCount;
|
|
1551
|
+
// Child-sitemap reachability for the guardrail: how many child sitemaps
|
|
1552
|
+
// an index referenced vs how many we could not fetch/parse. childFailed>0
|
|
1553
|
+
// means the declared URL list is itself incomplete.
|
|
1554
|
+
let sitemapChildTotal = 0;
|
|
1555
|
+
let sitemapChildFailed = 0;
|
|
1556
|
+
// Sitemap-first discovery (like Google). Before link-crawling, read the
|
|
1557
|
+
// sitemap(s) the site declares — link-crawl only reaches *linked* pages,
|
|
1558
|
+
// but a pSEO site's whole point is thousands of programmatic URLs that
|
|
1559
|
+
// may be sparsely linked (or behind a build-frozen, under-linked nav).
|
|
1560
|
+
// Sources of truth, in order:
|
|
1561
|
+
// 1. `Sitemap:` directives in robots.txt (there can be several)
|
|
1562
|
+
// 2. failing that, probe /sitemap.xml then /sitemap_index.xml
|
|
1563
|
+
// Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
|
|
1564
|
+
// link-crawl below then fills any remaining budget and dedups against
|
|
1565
|
+
// them. When no sitemap exists, this is a no-op and we crawl as before.
|
|
1566
|
+
if (sourceOrigin) {
|
|
1567
|
+
const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
|
|
1568
|
+
const probing = robotsForDiscovery.sitemaps.length === 0;
|
|
1569
|
+
const sitemapCandidates = probing
|
|
1570
|
+
? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
|
|
1571
|
+
: robotsForDiscovery.sitemaps;
|
|
1572
|
+
const visitedSitemaps = new Set();
|
|
1573
|
+
const sitemapListedUrls = [];
|
|
1574
|
+
for (const candidate of sitemapCandidates) {
|
|
1575
|
+
if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
|
|
1576
|
+
break;
|
|
1577
|
+
if (visitedSitemaps.has(candidate))
|
|
1578
|
+
continue;
|
|
1579
|
+
let smText;
|
|
1580
|
+
let smType;
|
|
1581
|
+
try {
|
|
1582
|
+
if (validateHop)
|
|
1583
|
+
await validateHop(candidate);
|
|
1584
|
+
const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1585
|
+
if (!fetched)
|
|
1586
|
+
continue;
|
|
1587
|
+
smText = fetched.text;
|
|
1588
|
+
smType = fetched.contentType;
|
|
1589
|
+
}
|
|
1590
|
+
catch {
|
|
1591
|
+
continue; // SSRF refusal, network error, etc. — skip this candidate
|
|
1592
|
+
}
|
|
1593
|
+
if (!(smType.includes("xml") || looksLikeSitemap(smText)))
|
|
1594
|
+
continue;
|
|
1595
|
+
const { urls: discoveredSmUrls, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
|
|
1596
|
+
sitemapChildTotal += ct;
|
|
1597
|
+
sitemapChildFailed += cf;
|
|
1598
|
+
pushAll(sitemapListedUrls, discoveredSmUrls);
|
|
1599
|
+
// When probing the conventional paths, stop at the first that hits.
|
|
1600
|
+
if (probing && discoveredSmUrls.length > 0)
|
|
1601
|
+
break;
|
|
1602
|
+
}
|
|
1603
|
+
// Seed filter (applied below): same-origin + robots-aware, deduped against what we have.
|
|
1604
|
+
// Record what the sitemap(s) declared (deduped) before same-origin /
|
|
1605
|
+
// robots filtering — the operator's site has this many URLs.
|
|
1606
|
+
if (sitemapListedUrls.length > 0)
|
|
1607
|
+
declaredSitemapUrlCount = new Set(sitemapListedUrls).size;
|
|
1608
|
+
const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
|
|
1609
|
+
if (knownCrawled.has(u))
|
|
1610
|
+
return false;
|
|
1611
|
+
try {
|
|
1612
|
+
const parsed = new URL(u);
|
|
1613
|
+
if (parsed.origin !== sourceOrigin)
|
|
1614
|
+
return false;
|
|
1615
|
+
if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
|
|
1616
|
+
skippedByRobots.push(u);
|
|
1617
|
+
return false;
|
|
1618
|
+
}
|
|
1619
|
+
return true;
|
|
1620
|
+
}
|
|
1621
|
+
catch {
|
|
1622
|
+
return false;
|
|
1623
|
+
}
|
|
1624
|
+
});
|
|
1625
|
+
for (const u of seedUrls)
|
|
1626
|
+
allDiscoveredUrls.add(u);
|
|
1627
|
+
// Cap the seed fetch. With a sampling budget, fit under it; without one
|
|
1628
|
+
// (the default "audit everything" path) bound by maxCrawlDiscovered, the
|
|
1629
|
+
// same ceiling the link-crawl honors — otherwise a homepage audit of a
|
|
1630
|
+
// site with a 50k-URL sitemap would try to fetch all of them (the link
|
|
1631
|
+
// crawl never could, so this would be an unbounded-egress regression).
|
|
1632
|
+
const seedToFetch = discoveryBudget > 0
|
|
1633
|
+
? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
|
|
1634
|
+
: seedUrls.slice(0, maxCrawlDiscovered);
|
|
1635
|
+
if (seedToFetch.length > 0) {
|
|
1636
|
+
await runWithConcurrency(seedToFetch, concurrency, async (url) => {
|
|
1637
|
+
if (budgetExceeded(byteBudget))
|
|
1638
|
+
return;
|
|
1639
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
1640
|
+
knownCrawled.add(url);
|
|
1641
|
+
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
1642
|
+
byteBudget.used += result.html.length;
|
|
1643
|
+
pages.push(result);
|
|
1644
|
+
}
|
|
1645
|
+
});
|
|
1646
|
+
}
|
|
1647
|
+
}
|
|
1484
1648
|
for (let depth = 0; depth < maxDepth; depth += 1) {
|
|
1485
1649
|
// Stop if we've hit the discovery budget
|
|
1486
1650
|
if (discoveryBudget > 0 && pages.length >= discoveryBudget)
|
|
@@ -1542,11 +1706,11 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1542
1706
|
knownCrawled.add(url);
|
|
1543
1707
|
}
|
|
1544
1708
|
});
|
|
1545
|
-
pages
|
|
1709
|
+
pushAll(pages, newPages);
|
|
1546
1710
|
if (newPages.length === 0)
|
|
1547
1711
|
break;
|
|
1548
1712
|
}
|
|
1549
|
-
return { pages, discoveredUrlCount: allDiscoveredUrls.size };
|
|
1713
|
+
return { pages, discoveredUrlCount: allDiscoveredUrls.size, declaredSitemapUrlCount, sitemapChildTotal, sitemapChildFailed };
|
|
1550
1714
|
}
|
|
1551
1715
|
return { pages };
|
|
1552
1716
|
}
|
|
@@ -1633,6 +1797,14 @@ export async function auditSource(source, options) {
|
|
|
1633
1797
|
const backpressureEnabled = options?.backpressure !== false;
|
|
1634
1798
|
const backpressureAbort = new AbortController();
|
|
1635
1799
|
let backpressureError = null;
|
|
1800
|
+
// Set once we've decided to salvage a partial report after a watchdog abort.
|
|
1801
|
+
// From that point `throwIfAborted` must NOT re-throw the backpressure error —
|
|
1802
|
+
// the watchdog already did its job (stopped fetching); the rest of the
|
|
1803
|
+
// pipeline runs over the pages collected so far and the truncation is
|
|
1804
|
+
// surfaced on the summary instead.
|
|
1805
|
+
let truncated = false;
|
|
1806
|
+
let truncatedReason;
|
|
1807
|
+
let truncatedKind;
|
|
1636
1808
|
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
1637
1809
|
const observer = new FetchObserver();
|
|
1638
1810
|
// 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
|
|
@@ -1674,12 +1846,33 @@ export async function auditSource(source, options) {
|
|
|
1674
1846
|
backpressureAbort.abort(backpressureError);
|
|
1675
1847
|
}
|
|
1676
1848
|
};
|
|
1849
|
+
// Puts the run into salvage mode when the backpressure watchdog has recorded
// an abort. The recorded error's message becomes the truncation reason and the
// truncation kind is tagged "backpressure"; `throwIfAborted` checks the
// `truncated` flag set here so it does not salvage a second time.
// Safe to call repeatedly: every call writes the same values.
// Returns true when a backpressure error was present to salvage, false
// otherwise (in which case no state is touched).
function salvageBackpressure() {
    const watchdogFired = Boolean(backpressureError);
    if (watchdogFired) {
        // Same assignment order as before: flag first, then reason, then kind.
        truncated = true;
        truncatedReason = backpressureError.message;
        truncatedKind = "backpressure";
    }
    return watchdogFired;
}
|
|
1677
1861
|
function throwIfAborted() {
    // The external signal is checked first, so an externally requested stop
    // always throws, whether or not the run is already in salvage mode. The
    // signal's own reason is thrown when it has one, otherwise a fresh
    // AbortError.
    const externallyAborted = externalSignal?.aborted;
    if (externallyAborted) {
        const reason = externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
        throw reason;
    }
    // A backpressure abort never throws from here. If the run is already
    // marked truncated, or no backpressure error was recorded, there is
    // nothing left to do.
    if (truncated || !backpressureError) {
        return;
    }
    // A backpressure error exists but the run has not been marked truncated
    // yet: switch to salvage mode now instead of throwing.
    salvageBackpressure();
}
|
|
1684
1877
|
const resolvedRules = {
|
|
1685
1878
|
nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
|
|
@@ -1785,6 +1978,9 @@ export async function auditSource(source, options) {
|
|
|
1785
1978
|
let sitemapUrlSet;
|
|
1786
1979
|
let sitemapLastmodByUrl;
|
|
1787
1980
|
let discoveredUrlCount;
|
|
1981
|
+
let declaredSitemapUrlCount;
|
|
1982
|
+
let sitemapChildTotal;
|
|
1983
|
+
let sitemapChildFailed;
|
|
1788
1984
|
let scrapePlan;
|
|
1789
1985
|
if (hasPinnedUrlsEarly) {
|
|
1790
1986
|
const pinned = options.pinnedUrls;
|
|
@@ -1834,13 +2030,26 @@ export async function auditSource(source, options) {
|
|
|
1834
2030
|
}
|
|
1835
2031
|
: undefined;
|
|
1836
2032
|
const pinnedPages = [];
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
2033
|
+
try {
|
|
2034
|
+
await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
|
|
2035
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
|
|
2036
|
+
if (result) {
|
|
2037
|
+
fetchByteBudget.used += result.html.length;
|
|
2038
|
+
pinnedPages.push(result);
|
|
2039
|
+
}
|
|
2040
|
+
});
|
|
2041
|
+
}
|
|
2042
|
+
catch (err) {
|
|
2043
|
+
// Same salvage contract as the sitemap/crawl path: a watchdog abort
|
|
2044
|
+
// mid-fetch keeps the pages already collected in `pinnedPages`. Any other
|
|
2045
|
+
// error (external abort, SSRF rejection) is fatal — re-throw it.
|
|
2046
|
+
if (err instanceof OriginDegradedError) {
|
|
2047
|
+
salvageBackpressure();
|
|
1842
2048
|
}
|
|
1843
|
-
|
|
2049
|
+
else {
|
|
2050
|
+
throw err;
|
|
2051
|
+
}
|
|
2052
|
+
}
|
|
1844
2053
|
loadedPagesRaw = pinnedPages;
|
|
1845
2054
|
// No sitemap context in pinned mode
|
|
1846
2055
|
sitemapUrlSet = undefined;
|
|
@@ -1849,13 +2058,60 @@ export async function auditSource(source, options) {
|
|
|
1849
2058
|
scrapePlan = undefined;
|
|
1850
2059
|
}
|
|
1851
2060
|
else {
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
2061
|
+
// Salvage sink: loadPagesFromSource fills this incrementally as pages come
|
|
2062
|
+
// back. If the backpressure watchdog aborts mid-crawl the call throws an
|
|
2063
|
+
// OriginDegradedError and the function's own return value is lost — but the
|
|
2064
|
+
// already-fetched pages survive here, so we recover them and continue the
|
|
2065
|
+
// pipeline with a `truncated` flag instead of throwing the whole run away.
|
|
2066
|
+
const pageSink = [];
|
|
2067
|
+
try {
|
|
2068
|
+
const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
|
|
2069
|
+
loadedPagesRaw = loaded.pages;
|
|
2070
|
+
sitemapUrlSet = loaded.sitemapUrls;
|
|
2071
|
+
sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
|
|
2072
|
+
discoveredUrlCount = loaded.discoveredUrlCount;
|
|
2073
|
+
declaredSitemapUrlCount = loaded.declaredSitemapUrlCount;
|
|
2074
|
+
sitemapChildTotal = loaded.sitemapChildTotal;
|
|
2075
|
+
sitemapChildFailed = loaded.sitemapChildFailed;
|
|
2076
|
+
scrapePlan = loaded.scrapePlan;
|
|
2077
|
+
}
|
|
2078
|
+
catch (err) {
|
|
2079
|
+
// Only the watchdog abort is salvageable. An external abort (ctrl-C /
|
|
2080
|
+
// parent timeout) or any other error is fatal — re-throw it untouched so
|
|
2081
|
+
// --no-backpressure and ctrl-C behaviour are unchanged.
|
|
2082
|
+
if (err instanceof OriginDegradedError) {
|
|
2083
|
+
// Prefer the canonical backpressureError message (same object the
|
|
2084
|
+
// monitor raised); fall back to the caught error if somehow distinct.
|
|
2085
|
+
if (!salvageBackpressure()) {
|
|
2086
|
+
truncated = true;
|
|
2087
|
+
truncatedReason = err.message;
|
|
2088
|
+
truncatedKind = "backpressure";
|
|
2089
|
+
}
|
|
2090
|
+
// Recover whatever was fetched before the abort. The sink is the same
|
|
2091
|
+
// array loadPagesFromSource was pushing into, so it holds the partial
|
|
2092
|
+
// page set even though the function never reached its `return`.
|
|
2093
|
+
loadedPagesRaw = pageSink;
|
|
2094
|
+
// No sitemap/discovery context survives a mid-sitemap abort; the
|
|
2095
|
+
// downstream classifier falls back to the loaded page URLs.
|
|
2096
|
+
sitemapUrlSet = undefined;
|
|
2097
|
+
sitemapLastmodByUrl = undefined;
|
|
2098
|
+
discoveredUrlCount = undefined;
|
|
2099
|
+
declaredSitemapUrlCount = undefined;
|
|
2100
|
+
sitemapChildTotal = undefined;
|
|
2101
|
+
sitemapChildFailed = undefined;
|
|
2102
|
+
scrapePlan = undefined;
|
|
2103
|
+
}
|
|
2104
|
+
else {
|
|
2105
|
+
throw err;
|
|
2106
|
+
}
|
|
2107
|
+
}
|
|
1858
2108
|
}
|
|
2109
|
+
// Pages we successfully FETCHED (HTTP 2xx) from discovery — before content-type
|
|
2110
|
+
// and policy filtering, and before sampling. This is the right denominator for
|
|
2111
|
+
// the coverage guardrail: noindex / non-HTML pages were still *reached* (they
|
|
2112
|
+
// count), intentional sampling happens later (doesn't count against us), and
|
|
2113
|
+
// only genuinely-unreachable URLs (4xx/5xx) are missing from it.
|
|
2114
|
+
const fetchedCount = loadedPagesRaw.length;
|
|
1859
2115
|
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
1860
2116
|
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
1861
2117
|
// (kept for back-compat with --since consumers); T7 will carry their prior
|
|
@@ -1889,7 +2145,10 @@ export async function auditSource(source, options) {
|
|
|
1889
2145
|
skippedByContentType.push(p.url);
|
|
1890
2146
|
}
|
|
1891
2147
|
}
|
|
1892
|
-
|
|
2148
|
+
// Replace contents in place without `splice(0, n, ...big)` — that spread hits
|
|
2149
|
+
// the V8 argument-count cap on large corpora (same class as pushAll).
|
|
2150
|
+
loadedPages.length = 0;
|
|
2151
|
+
pushAll(loadedPages, htmlOnlyPages);
|
|
1893
2152
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
1894
2153
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
1895
2154
|
}
|
|
@@ -2047,9 +2306,21 @@ export async function auditSource(source, options) {
|
|
|
2047
2306
|
const guardedClassification = applyDegenerationGuard(computedClassification, corpusStatsFromPages(parsedPages));
|
|
2048
2307
|
// `--strict` (or AuditOptions.strict) keeps the classification but forces
|
|
2049
2308
|
// every rule to run regardless of detected site type.
|
|
2309
|
+
//
|
|
2310
|
+
// A backpressure abort BEFORE classification salvages only a fragment of the
|
|
2311
|
+
// crawl (`truncated` is already set here; the coverage guardrail runs later).
|
|
2312
|
+
// Classifying that fragment — e.g. the 1 page left after the watchdog aborts a
|
|
2313
|
+
// cold-start origin — as `small-marketing` and suppressing the pSEO rules off
|
|
2314
|
+
// it is exactly what produced the confident false "READY" on a 5,600-page
|
|
2315
|
+
// site. When the run was truncated pre-classification we genuinely could not
|
|
2316
|
+
// determine the site type: force `unclear` (confidence 0, no suppression,
|
|
2317
|
+
// neutral scoring) so nothing masks the incompleteness.
|
|
2318
|
+
const classificationUnreliable = truncated;
|
|
2050
2319
|
const siteClassification = options?.strict
|
|
2051
2320
|
? { ...guardedClassification, suppressedRules: [] }
|
|
2052
|
-
:
|
|
2321
|
+
: classificationUnreliable
|
|
2322
|
+
? { ...guardedClassification, type: "unclear", confidence: 0, suppressedRules: [] }
|
|
2323
|
+
: guardedClassification;
|
|
2053
2324
|
const suppressedRuleSet = new Set(siteClassification.suppressedRules);
|
|
2054
2325
|
// Classify pages into groups and run only enabled rules per group
|
|
2055
2326
|
const classified = classifyPages(parsedPages, options?.pageGroups);
|
|
@@ -2076,29 +2347,29 @@ export async function auditSource(source, options) {
|
|
|
2076
2347
|
// Site-wide rules (run once, outside group loop)
|
|
2077
2348
|
if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
|
|
2078
2349
|
const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
|
|
2079
|
-
allFindings
|
|
2350
|
+
pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2080
2351
|
if (robotsTxtContent) {
|
|
2081
2352
|
const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
|
|
2082
|
-
allFindings
|
|
2353
|
+
pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2083
2354
|
}
|
|
2084
2355
|
}
|
|
2085
2356
|
// AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
|
|
2086
2357
|
// and robots-compliance); page-group rule lists govern per-page AEO rules only.
|
|
2087
2358
|
const llmsFindings = await llmsTxtRule(source, { timeoutMs });
|
|
2088
|
-
allFindings
|
|
2359
|
+
pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2089
2360
|
if (robotsTxtContent) {
|
|
2090
2361
|
const crawlerFindings = crawlerAccessRule(robotsTxtContent);
|
|
2091
|
-
allFindings
|
|
2362
|
+
pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2092
2363
|
}
|
|
2093
2364
|
// Data source comparison rules
|
|
2094
2365
|
if (options?.dataSource?.records && options.dataSource.records.length > 0) {
|
|
2095
2366
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
|
|
2096
2367
|
const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
|
|
2097
|
-
allFindings
|
|
2368
|
+
pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2098
2369
|
}
|
|
2099
2370
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
|
|
2100
2371
|
const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
|
|
2101
|
-
allFindings
|
|
2372
|
+
pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2102
2373
|
}
|
|
2103
2374
|
}
|
|
2104
2375
|
for (const [groupName, groupPages] of classified) {
|
|
@@ -2118,7 +2389,7 @@ export async function auditSource(source, options) {
|
|
|
2118
2389
|
// because the nav paths between locale-specific currency-converter URLs
|
|
2119
2390
|
// were not in the pinned set).
|
|
2120
2391
|
isSampledAudit || hasPinnedUrlsEarly);
|
|
2121
|
-
allFindings
|
|
2392
|
+
pushAll(allFindings, findings);
|
|
2122
2393
|
groupPageCounts[groupName] = groupPages.length;
|
|
2123
2394
|
// v0.4.3: per-group scoring uses the same site-classification profile so
|
|
2124
2395
|
// group-level risk numbers reflect the same severity / confidence remaps
|
|
@@ -2136,7 +2407,7 @@ export async function auditSource(source, options) {
|
|
|
2136
2407
|
(auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
|
|
2137
2408
|
if (isValueAddEnabled) {
|
|
2138
2409
|
const valueAddFindings = valueAddRule(parsedPages, allFindings);
|
|
2139
|
-
allFindings
|
|
2410
|
+
pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2140
2411
|
}
|
|
2141
2412
|
}
|
|
2142
2413
|
// Enrich findings: cluster pairwise, detect templates, assign effort
|
|
@@ -2260,6 +2531,71 @@ export async function auditSource(source, options) {
|
|
|
2260
2531
|
? [...parsedPages.map((p) => p.url)].sort()
|
|
2261
2532
|
: undefined,
|
|
2262
2533
|
};
|
|
2534
|
+
// Partial-report flag: the backpressure watchdog aborted mid-crawl and we
|
|
2535
|
+
// salvaged whatever pages had been fetched. Consumers MUST treat coverage as
|
|
2536
|
+
// a lower bound (counts/verdict are partial). Only set when actually
|
|
2537
|
+
// truncated so complete runs keep `truncated` absent.
|
|
2538
|
+
// ── Coverage guardrails (#4) ─────────────────────────────────────────────
|
|
2539
|
+
// A sitemap was found at discovery, so we know roughly how large the site is.
|
|
2540
|
+
// Two independent under-coverage signals, each reusing the `truncated`
|
|
2541
|
+
// partial-coverage surface (CLI/Action/MCP/web already flag it) tagged
|
|
2542
|
+
// `truncatedKind: "coverage"` so consumers can tell it apart from a
|
|
2543
|
+
// backpressure abort. Backpressure (set during the crawl) takes precedence.
|
|
2544
|
+
if (!truncated && sitemapChildFailed && sitemapChildFailed > 0) {
|
|
2545
|
+
// (A) Extraction-side: a sitemap INDEX referenced child sitemaps we could
|
|
2546
|
+
// not fetch/parse (404, non-sitemap, or beyond the depth cap). The declared
|
|
2547
|
+
// URL list is itself incomplete — the "unreachable child sitemaps" case a
|
|
2548
|
+
// urls-only count can never see (and the original false-negative class).
|
|
2549
|
+
truncated = true;
|
|
2550
|
+
truncatedKind = "coverage";
|
|
2551
|
+
truncatedReason =
|
|
2552
|
+
`${sitemapChildFailed} of ${sitemapChildTotal} child sitemaps referenced by the sitemap index could not be ` +
|
|
2553
|
+
`fetched or parsed — both the declared URL count and this audit are incomplete, so the verdict is not ` +
|
|
2554
|
+
`representative of the full site. Check that every child sitemap is reachable (HTTP 200, valid XML).`;
|
|
2555
|
+
// eslint-disable-next-line no-console
|
|
2556
|
+
console.error(`pseolint: ${truncatedReason}`);
|
|
2557
|
+
}
|
|
2558
|
+
if (!truncated && declaredSitemapUrlCount && declaredSitemapUrlCount >= 20) {
|
|
2559
|
+
// (B) Audit-side: the sitemap declared N URLs but we FETCHED far fewer than
|
|
2560
|
+
// we intended to. Compare against `fetchedCount` (pages actually fetched,
|
|
2561
|
+
// pre-filter/pre-sample) so legitimately-skipped pages (noindex, non-HTML)
|
|
2562
|
+
// and intentional sampling do NOT register as a shortfall. `intended` is
|
|
2563
|
+
// bounded by every deliberate limit — an explicit sample, the crawl cap, and
|
|
2564
|
+
// the declared total — so none of them false-fire.
|
|
2565
|
+
const sampleCap = sampleSize > 0 ? sampleSize : Number.POSITIVE_INFINITY;
|
|
2566
|
+
const crawlCap = maxCrawlDiscovered > 0 ? maxCrawlDiscovered : Number.POSITIVE_INFINITY;
|
|
2567
|
+
const intended = Math.min(sampleCap, crawlCap, declaredSitemapUrlCount);
|
|
2568
|
+
const floor = Math.max(20, Math.floor(intended * 0.05));
|
|
2569
|
+
// `intended >= 20`: only judge representativeness when we actually meant to
|
|
2570
|
+
// audit a substantial slice. A deliberately tiny sample/crawl cap (intended
|
|
2571
|
+
// < 20) is the operator's choice, not under-discovery — don't flag it (and
|
|
2572
|
+
// it would otherwise trip the absolute floor of 20).
|
|
2573
|
+
if (intended >= 20 && fetchedCount < floor) {
|
|
2574
|
+
const unreached = Math.max(0, declaredSitemapUrlCount - fetchedCount);
|
|
2575
|
+
const ratio = fetchedCount / declaredSitemapUrlCount;
|
|
2576
|
+
const pct = (ratio * 100).toFixed(ratio < 0.01 ? 2 : 1);
|
|
2577
|
+
truncated = true;
|
|
2578
|
+
truncatedKind = "coverage";
|
|
2579
|
+
truncatedReason =
|
|
2580
|
+
`Fetched ${fetchedCount} of ~${declaredSitemapUrlCount} sitemap-declared URLs (~${pct}% coverage); ` +
|
|
2581
|
+
`~${unreached} could not be retrieved (4xx/5xx, redirects, or robots-blocked). The verdict covers only the ` +
|
|
2582
|
+
`pages reached and is not representative — check for a stale sitemap or unreachable pages, or raise crawl limits.`;
|
|
2583
|
+
// eslint-disable-next-line no-console
|
|
2584
|
+
console.error(`pseolint: ${truncatedReason}`);
|
|
2585
|
+
}
|
|
2586
|
+
}
|
|
2587
|
+
if (truncated) {
|
|
2588
|
+
summary.truncated = true;
|
|
2589
|
+
summary.truncatedReason = truncatedReason;
|
|
2590
|
+
if (truncatedKind)
|
|
2591
|
+
summary.truncatedKind = truncatedKind;
|
|
2592
|
+
// A truncated run is incomplete — never present it as a clean green. Floor
|
|
2593
|
+
// the verdict to at least "caution" so the headline matches the partial-
|
|
2594
|
+
// coverage banner instead of the false "READY ✓" over a salvaged fragment.
|
|
2595
|
+
// ("ready" is the only rung below "caution"; everything else already is.)
|
|
2596
|
+
if (summary.verdict === "ready")
|
|
2597
|
+
summary.verdict = "caution";
|
|
2598
|
+
}
|
|
2263
2599
|
if (cacheConfig) {
|
|
2264
2600
|
summary.cacheStats = cacheStats;
|
|
2265
2601
|
}
|