@pseolint/core 0.6.4 → 0.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/dist/ai/tools/fetch-sitemap.js +2 -1
- package/dist/ai/tools/fetch-sitemap.js.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +297 -75
- package/dist/auditor.js.map +1 -1
- package/dist/enrich-findings.d.ts.map +1 -1
- package/dist/enrich-findings.js +27 -5
- package/dist/enrich-findings.js.map +1 -1
- package/dist/parser.d.ts.map +1 -1
- package/dist/parser.js +17 -1
- package/dist/parser.js.map +1 -1
- package/dist/rules/content/title-uniqueness.d.ts.map +1 -1
- package/dist/rules/content/title-uniqueness.js +13 -0
- package/dist/rules/content/title-uniqueness.js.map +1 -1
- package/dist/site-classifier.d.ts.map +1 -1
- package/dist/site-classifier.js +7 -1
- package/dist/site-classifier.js.map +1 -1
- package/dist/stratified-sample.js +2 -1
- package/dist/stratified-sample.js.map +1 -1
- package/dist/types.d.ts +39 -3
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +14 -2
- package/dist/types.js.map +1 -1
- package/package.json +5 -3
- package/schemas/audit-summary.schema.json +295 -0
package/dist/auditor.js
CHANGED
|
@@ -34,7 +34,7 @@ import { canonicalNoindexConflictRule } from "./rules/tech/canonical-noindex-con
|
|
|
34
34
|
import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
35
35
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
36
36
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
37
|
-
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
37
|
+
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds, parseSitemapDirectives } from "./rules/tech/robots-sitemap-presence.js";
|
|
38
38
|
import { llmsTxtRule } from "./rules/aeo/llms-txt.js";
|
|
39
39
|
import { crawlerAccessRule } from "./rules/aeo/crawler-access.js";
|
|
40
40
|
import { freshnessSignalsRule } from "./rules/aeo/freshness-signals.js";
|
|
@@ -615,157 +615,157 @@ sampled = false) {
|
|
|
615
615
|
// Spam rules — always compute cross-page data, only push findings if enabled
|
|
616
616
|
const nearDuplicate = nearDuplicateRule(pages, resolvedRules.nearDuplicateThreshold);
|
|
617
617
|
if (isEnabled("spam/near-duplicate") && modeOk("spam/near-duplicate")) {
|
|
618
|
-
findings
|
|
618
|
+
pushAll(findings, tag(nearDuplicate.findings));
|
|
619
619
|
}
|
|
620
620
|
const entitySwap = entitySwapRule(pages, entityPatterns, resolvedRules.entitySwapThreshold);
|
|
621
621
|
if (isEnabled("spam/entity-swap") && modeOk("spam/entity-swap")) {
|
|
622
|
-
findings
|
|
622
|
+
pushAll(findings, tag(entitySwap.findings));
|
|
623
623
|
}
|
|
624
624
|
const thinContent = thinContentRule(pages, resolvedRules.thinContentMinWords);
|
|
625
625
|
if (isEnabled("spam/thin-content") && modeOk("spam/thin-content")) {
|
|
626
|
-
findings
|
|
626
|
+
pushAll(findings, tag(thinContent.findings));
|
|
627
627
|
}
|
|
628
628
|
if (isEnabled("spam/doorway-pattern") && modeOk("spam/doorway-pattern")) {
|
|
629
|
-
findings
|
|
629
|
+
pushAll(findings, tag(doorwayPatternRule(nearDuplicate.pairs, entitySwap.pairs, thinContent.thinContentUrls, pages)));
|
|
630
630
|
}
|
|
631
631
|
if (isEnabled("spam/publication-velocity") && modeOk("spam/publication-velocity")) {
|
|
632
|
-
findings
|
|
632
|
+
pushAll(findings, tag(publicationVelocityRule(pages, resolvedRules.publicationVelocityMaxPerDay, resolvedRules.publicationVelocityMaxPerDayCorpusFraction)));
|
|
633
633
|
}
|
|
634
634
|
if (isEnabled("spam/boilerplate-ratio") && modeOk("spam/boilerplate-ratio")) {
|
|
635
|
-
findings
|
|
635
|
+
pushAll(findings, tag(boilerplateRatioRule(pages, resolvedRules.boilerplateMaxRatio)));
|
|
636
636
|
}
|
|
637
637
|
if (isEnabled("spam/template-diversity") && modeOk("spam/template-diversity")) {
|
|
638
|
-
findings
|
|
638
|
+
pushAll(findings, tag(templateDiversityRule(pages, resolvedRules.templateDiversityMinUniqueRatio)));
|
|
639
639
|
}
|
|
640
640
|
if (isEnabled("spam/template-coverage") && modeOk("spam/template-coverage")) {
|
|
641
|
-
findings
|
|
641
|
+
pushAll(findings, tag(templateCoverageRule(pages, entityPatterns, resolvedRules.templateCoverageMinPages)));
|
|
642
642
|
}
|
|
643
643
|
// Content rules
|
|
644
644
|
if (isEnabled("content/unique-value") && modeOk("content/unique-value")) {
|
|
645
|
-
findings
|
|
645
|
+
pushAll(findings, tag(uniqueValueRule(pages, resolvedRules.uniqueValueMinWords)));
|
|
646
646
|
}
|
|
647
647
|
if (isEnabled("content/meta-uniqueness") && modeOk("content/meta-uniqueness")) {
|
|
648
|
-
findings
|
|
648
|
+
pushAll(findings, tag(metaUniquenessRule(pages, entityPatterns, resolvedRules.metaUniquenessMinJaccard)));
|
|
649
649
|
}
|
|
650
650
|
if (isEnabled("content/missing-author") && modeOk("content/missing-author")) {
|
|
651
|
-
findings
|
|
651
|
+
pushAll(findings, tag(missingAuthorRule(pages)));
|
|
652
652
|
}
|
|
653
653
|
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
654
|
-
findings
|
|
654
|
+
pushAll(findings, tag(eeatSignalsRule(pages)));
|
|
655
655
|
}
|
|
656
656
|
// 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
|
|
657
657
|
// structure + image alt-text were tier-1 gaps in the blind-spot audit.
|
|
658
658
|
if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
|
|
659
|
-
findings
|
|
659
|
+
pushAll(findings, tag(titleUniquenessRule(pages)));
|
|
660
660
|
}
|
|
661
661
|
if (isEnabled("content/heading-structure") && modeOk("content/heading-structure")) {
|
|
662
|
-
findings
|
|
662
|
+
pushAll(findings, tag(headingStructureRule(pages)));
|
|
663
663
|
}
|
|
664
664
|
if (isEnabled("content/image-alt-text") && modeOk("content/image-alt-text")) {
|
|
665
|
-
findings
|
|
665
|
+
pushAll(findings, tag(imageAltTextRule(pages)));
|
|
666
666
|
}
|
|
667
667
|
if (isEnabled("content/translation-no-op") && modeOk("content/translation-no-op")) {
|
|
668
|
-
findings
|
|
668
|
+
pushAll(findings, tag(translationNoOpRule(pages)));
|
|
669
669
|
}
|
|
670
670
|
if (isEnabled("content/regurgitated-content") && modeOk("content/regurgitated-content")) {
|
|
671
|
-
findings
|
|
671
|
+
pushAll(findings, tag(regurgitatedContentRule(pages)));
|
|
672
672
|
}
|
|
673
673
|
if (isEnabled("content/common-phrase-reuse") && modeOk("content/common-phrase-reuse")) {
|
|
674
|
-
findings
|
|
674
|
+
pushAll(findings, tag(commonPhraseReuseRule(pages)));
|
|
675
675
|
}
|
|
676
676
|
if (isEnabled("content/wikipedia-paraphrase") && modeOk("content/wikipedia-paraphrase")) {
|
|
677
|
-
findings
|
|
677
|
+
pushAll(findings, tag(wikipediaParaphraseRule(pages)));
|
|
678
678
|
}
|
|
679
679
|
// Link rules — use the global link graph
|
|
680
680
|
if (isEnabled("links/orphan-pages") && modeOk("links/orphan-pages")) {
|
|
681
|
-
findings
|
|
681
|
+
pushAll(findings, tag(orphanPagesRule(pages, inbound, rootUrl)));
|
|
682
682
|
}
|
|
683
683
|
if (isEnabled("links/dead-ends") && modeOk("links/dead-ends")) {
|
|
684
|
-
findings
|
|
684
|
+
pushAll(findings, tag(deadEndsRule(pages, knownUrls, rootUrl)));
|
|
685
685
|
}
|
|
686
686
|
if (isEnabled("links/link-depth") && modeOk("links/link-depth")) {
|
|
687
687
|
if (rootUrl) {
|
|
688
|
-
findings
|
|
688
|
+
pushAll(findings, tag(linkDepthRule(pages, adjacency, rootUrl, resolvedRules.linkDepthMaxClicks, inbound, sampled)));
|
|
689
689
|
}
|
|
690
690
|
}
|
|
691
691
|
if (isEnabled("links/cluster-connectivity") && modeOk("links/cluster-connectivity")) {
|
|
692
|
-
findings
|
|
692
|
+
pushAll(findings, tag(clusterConnectivityRule(pages, knownUrls)));
|
|
693
693
|
}
|
|
694
694
|
if (isEnabled("links/host-section-divergence") && modeOk("links/host-section-divergence")) {
|
|
695
|
-
findings
|
|
695
|
+
pushAll(findings, tag(hostSectionDivergenceRule(pages, adjacency)));
|
|
696
696
|
}
|
|
697
697
|
// Tech rules
|
|
698
698
|
if (isEnabled("tech/canonical-consistency") && modeOk("tech/canonical-consistency")) {
|
|
699
|
-
findings
|
|
699
|
+
pushAll(findings, tag(canonicalConsistencyRule(pages, knownUrls, normalizeUrlOptions)));
|
|
700
700
|
}
|
|
701
701
|
if (isEnabled("tech/canonical-noindex-conflict") && modeOk("tech/canonical-noindex-conflict")) {
|
|
702
|
-
findings
|
|
702
|
+
pushAll(findings, tag(canonicalNoindexConflictRule(noindexAwarePages, normalizeUrlOptions)));
|
|
703
703
|
}
|
|
704
704
|
if (isEnabled("tech/robots-noindex-conflict") && modeOk("tech/robots-noindex-conflict")) {
|
|
705
|
-
findings
|
|
705
|
+
pushAll(findings, tag(robotsNoindexConflictRule(noindexAwarePages, inbound)));
|
|
706
706
|
}
|
|
707
707
|
if (isEnabled("tech/redirect-chain") && modeOk("tech/redirect-chain")) {
|
|
708
|
-
findings
|
|
708
|
+
pushAll(findings, tag(redirectChainRule(pages)));
|
|
709
709
|
}
|
|
710
710
|
if (isEnabled("tech/soft-404") && modeOk("tech/soft-404")) {
|
|
711
|
-
findings
|
|
711
|
+
pushAll(findings, tag(soft404Rule(pages)));
|
|
712
712
|
}
|
|
713
713
|
if (isEnabled("tech/hreflang-consistency") && modeOk("tech/hreflang-consistency")) {
|
|
714
714
|
// hreflang declarations on noindex'd pages are still bugs when they're
|
|
715
715
|
// inconsistent — see auditor.test.ts "emits technical SEO findings".
|
|
716
|
-
findings
|
|
716
|
+
pushAll(findings, tag(hreflangConsistencyRule(noindexAwarePages, normalizeUrlOptions)));
|
|
717
717
|
}
|
|
718
718
|
// 2026-05-03 v0.5.2 blind-spot fix: og-completeness was referenced in
|
|
719
719
|
// the v0.4.x README without ever shipping. Now it does.
|
|
720
720
|
if (isEnabled("tech/og-completeness") && modeOk("tech/og-completeness")) {
|
|
721
|
-
findings
|
|
721
|
+
pushAll(findings, tag(ogCompletenessRule(pages)));
|
|
722
722
|
}
|
|
723
723
|
// Schema rules
|
|
724
724
|
if (isEnabled("schema/json-ld-valid") && modeOk("schema/json-ld-valid")) {
|
|
725
|
-
findings
|
|
725
|
+
pushAll(findings, tag(jsonLdValidRule(pages)));
|
|
726
726
|
}
|
|
727
727
|
if (isEnabled("schema/required-fields") && modeOk("schema/required-fields")) {
|
|
728
|
-
findings
|
|
728
|
+
pushAll(findings, tag(requiredFieldsRule(pages)));
|
|
729
729
|
}
|
|
730
730
|
if (isEnabled("schema/consistency") && modeOk("schema/consistency")) {
|
|
731
|
-
findings
|
|
731
|
+
pushAll(findings, tag(schemaConsistencyRule(pages)));
|
|
732
732
|
}
|
|
733
733
|
// AEO rules
|
|
734
734
|
if (isEnabled("aeo/freshness-signals")) {
|
|
735
|
-
findings
|
|
735
|
+
pushAll(findings, tag(freshnessSignalsRule(pages, {
|
|
736
736
|
maxStaleDays: resolvedRules.freshnessMaxStaleDays,
|
|
737
737
|
})));
|
|
738
738
|
}
|
|
739
739
|
if (isEnabled("aeo/faq-coverage")) {
|
|
740
|
-
findings
|
|
740
|
+
pushAll(findings, tag(faqCoverageRule(pages, {
|
|
741
741
|
minQuestionHeadings: resolvedRules.faqMinQuestionHeadings,
|
|
742
742
|
})));
|
|
743
743
|
}
|
|
744
744
|
if (isEnabled("aeo/answer-first")) {
|
|
745
|
-
findings
|
|
745
|
+
pushAll(findings, tag(answerFirstRule(pages, entityPatterns, {
|
|
746
746
|
maxFirstParagraphWords: resolvedRules.answerFirstMaxWords,
|
|
747
747
|
})));
|
|
748
748
|
}
|
|
749
749
|
if (isEnabled("aeo/citable-facts")) {
|
|
750
|
-
findings
|
|
750
|
+
pushAll(findings, tag(citableFactsRule(pages, entityPatterns, {
|
|
751
751
|
minFactsPerPage: resolvedRules.citableFactsMin,
|
|
752
752
|
targetFactsPerPage: resolvedRules.citableFactsTarget,
|
|
753
753
|
})));
|
|
754
754
|
}
|
|
755
755
|
if (isEnabled("aeo/content-modularity")) {
|
|
756
|
-
findings
|
|
756
|
+
pushAll(findings, tag(contentModularityRule(pages, {
|
|
757
757
|
maxParagraphWords: resolvedRules.modularityMaxParagraphWords,
|
|
758
758
|
minSelfContainedRatio: resolvedRules.modularityMinSelfContainedRatio,
|
|
759
759
|
})));
|
|
760
760
|
}
|
|
761
761
|
if (isEnabled("aeo/summary-bait")) {
|
|
762
|
-
findings
|
|
762
|
+
pushAll(findings, tag(summaryBaitRule(pages, entityPatterns)));
|
|
763
763
|
}
|
|
764
764
|
// Cannibal rules — only url-pattern survives in v0.4 (title-overlap and
|
|
765
765
|
// keyword-collision dropped due to high false-positive rates; see
|
|
766
766
|
// 2026-04-29 v0.4 redesign spec §4.3).
|
|
767
767
|
if (isEnabled("cannibal/url-pattern") && modeOk("cannibal/url-pattern")) {
|
|
768
|
-
findings
|
|
768
|
+
pushAll(findings, tag(urlPatternRule(pages)));
|
|
769
769
|
}
|
|
770
770
|
return findings;
|
|
771
771
|
}
|
|
@@ -1003,6 +1003,20 @@ function withDocsUrls(findings) {
|
|
|
1003
1003
|
}
|
|
1004
1004
|
return findings;
|
|
1005
1005
|
}
|
|
1006
|
+
/**
|
|
1007
|
+
* Append every item of `items` to `target` in place. Use this instead of
|
|
1008
|
+
* `target.push(...items)` whenever `items` can be large. The spread form passes
|
|
1009
|
+
* each element as a separate call argument, and V8 caps argument count
|
|
1010
|
+
* (~131072) — so `push(...bigArray)` throws `RangeError: Maximum call stack size
|
|
1011
|
+
* exceeded` on large inputs. A dense site makes the pairwise rules
|
|
1012
|
+
* (near-duplicate / entity-swap) emit C(N,2) findings, which blew the cap at the
|
|
1013
|
+
* rule-aggregation push *before* enrichment was even reached. The loop has no
|
|
1014
|
+
* such limit. See tests/integration/large-corpus-no-overflow.test.ts.
|
|
1015
|
+
*/
|
|
1016
|
+
function pushAll(target, items) {
|
|
1017
|
+
for (const item of items)
|
|
1018
|
+
target.push(item);
|
|
1019
|
+
}
|
|
1006
1020
|
async function collectHtmlFiles(directory) {
|
|
1007
1021
|
const entries = await readdir(directory, { withFileTypes: true });
|
|
1008
1022
|
const files = await Promise.all(entries.map(async (entry) => {
|
|
@@ -1038,7 +1052,11 @@ function composeSignals(...signals) {
|
|
|
1038
1052
|
}
|
|
1039
1053
|
return ac.signal;
|
|
1040
1054
|
}
|
|
1041
|
-
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop
|
|
1055
|
+
async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop,
|
|
1056
|
+
// Per-sitemap byte cap (sitemaps.org caps an uncompressed sitemap at 50 MB).
|
|
1057
|
+
// Guards against a hostile/misconfigured sitemap eating the whole byte budget
|
|
1058
|
+
// or memory. 0 / undefined = no cap.
|
|
1059
|
+
maxBytes) {
|
|
1042
1060
|
try {
|
|
1043
1061
|
stats.total += 1;
|
|
1044
1062
|
const r = await cachedFetch(url, { timeoutMs, cache, signal, validateHop, onObservation: stats.onObservation });
|
|
@@ -1048,6 +1066,11 @@ async function fetchWithRetry(url, timeoutMs, cache, stats, signal, validateHop)
|
|
|
1048
1066
|
}
|
|
1049
1067
|
if (r.status < 200 || r.status >= 300)
|
|
1050
1068
|
return null;
|
|
1069
|
+
if (maxBytes && maxBytes > 0 && r.body.length > maxBytes) {
|
|
1070
|
+
// eslint-disable-next-line no-console
|
|
1071
|
+
console.error(`pseolint: sitemap ${url} is ${(r.body.length / 1_048_576).toFixed(0)}MB, over the ${(maxBytes / 1_048_576).toFixed(0)}MB cap — skipping it.`);
|
|
1072
|
+
return null;
|
|
1073
|
+
}
|
|
1051
1074
|
return { text: r.body, contentType: (r.headers["content-type"] ?? "").toLowerCase() };
|
|
1052
1075
|
}
|
|
1053
1076
|
catch (err) {
|
|
@@ -1205,7 +1228,16 @@ function fisherYatesSample(items, n, random = Math.random) {
|
|
|
1205
1228
|
}
|
|
1206
1229
|
return arr.slice(arr.length - n);
|
|
1207
1230
|
}
|
|
1208
|
-
|
|
1231
|
+
/** sitemaps.org caps an uncompressed sitemap at 50 MB. */
|
|
1232
|
+
const SITEMAP_MAX_BYTES = 50 * 1024 * 1024;
|
|
1233
|
+
/**
|
|
1234
|
+
* Max `<sitemapindex>` nesting depth we recurse through. The protocol only
|
|
1235
|
+
* defines a single level of nesting, but some sites nest deeper; 5 is generous
|
|
1236
|
+
* while still bounding work (and stack) on a hostile/misconfigured index that a
|
|
1237
|
+
* `visited` set alone wouldn't catch (e.g. a deep non-cyclic chain).
|
|
1238
|
+
*/
|
|
1239
|
+
const SITEMAP_MAX_DEPTH = 5;
|
|
1240
|
+
async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth = 0, maxDepth = SITEMAP_MAX_DEPTH) {
|
|
1209
1241
|
visited.add(sitemapUrl);
|
|
1210
1242
|
const entries = parseSitemapUrlsWithLastmod(sitemapText);
|
|
1211
1243
|
if (!isSitemapIndex(sitemapText)) {
|
|
@@ -1219,6 +1251,13 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1219
1251
|
}
|
|
1220
1252
|
return { urls, lastmodByUrl };
|
|
1221
1253
|
}
|
|
1254
|
+
// It's a sitemap index. Stop recursing past the depth cap (the index itself
|
|
1255
|
+
// carries no page URLs, only child-sitemap refs, so returning empty is safe).
|
|
1256
|
+
if (depth >= maxDepth) {
|
|
1257
|
+
// eslint-disable-next-line no-console
|
|
1258
|
+
console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
|
|
1259
|
+
return { urls: [], lastmodByUrl: new Map() };
|
|
1260
|
+
}
|
|
1222
1261
|
const allUrls = [];
|
|
1223
1262
|
const allLastmodByUrl = new Map();
|
|
1224
1263
|
for (const entry of entries) {
|
|
@@ -1227,14 +1266,14 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1227
1266
|
throw signal.reason ?? new Error("aborted");
|
|
1228
1267
|
if (visited.has(childUrl))
|
|
1229
1268
|
continue;
|
|
1230
|
-
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
1269
|
+
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1231
1270
|
if (!child)
|
|
1232
1271
|
continue;
|
|
1233
1272
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
1234
1273
|
if (!childLike)
|
|
1235
1274
|
continue;
|
|
1236
|
-
const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1237
|
-
allUrls
|
|
1275
|
+
const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
|
|
1276
|
+
pushAll(allUrls, childUrls);
|
|
1238
1277
|
for (const [u, lm] of childLastmodByUrl) {
|
|
1239
1278
|
allLastmodByUrl.set(u, lm);
|
|
1240
1279
|
}
|
|
@@ -1243,7 +1282,7 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1243
1282
|
}
|
|
1244
1283
|
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
1245
1284
|
if (!origin)
|
|
1246
|
-
return { disallow: [], crawlDelaySec: 0 };
|
|
1285
|
+
return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
|
|
1247
1286
|
try {
|
|
1248
1287
|
const robotsUrl = `${origin}/robots.txt`;
|
|
1249
1288
|
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats, signal, validateHop);
|
|
@@ -1253,10 +1292,14 @@ async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validate
|
|
|
1253
1292
|
return {
|
|
1254
1293
|
disallow: parseDisallowPatterns(fetched.text, ["*", "pseolint"]),
|
|
1255
1294
|
crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
|
|
1295
|
+
// `Sitemap:` directives carry absolute URLs (they are not origin-relative) and
|
|
1296
|
+
// there can be several. Surfaced so discovery can read the site's declared
|
|
1297
|
+
// sitemaps instead of guessing.
|
|
1298
|
+
sitemaps: parseSitemapDirectives(fetched.text),
|
|
1256
1299
|
};
|
|
1257
1300
|
}
|
|
1258
1301
|
catch {
|
|
1259
|
-
return { disallow: [], crawlDelaySec: 0 };
|
|
1302
|
+
return { disallow: [], crawlDelaySec: 0, sitemaps: [] };
|
|
1260
1303
|
}
|
|
1261
1304
|
}
|
|
1262
1305
|
function sleep(ms) {
|
|
@@ -1272,7 +1315,12 @@ function isDisallowedByRobots(urlPath, patterns) {
|
|
|
1272
1315
|
function budgetExceeded(b) {
|
|
1273
1316
|
return b.cap > 0 && b.used >= b.cap;
|
|
1274
1317
|
}
|
|
1275
|
-
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null
|
|
1318
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }, signal, guardSsrf = false, respectRobotsTxt = true, skippedByRobots = [], followRedirects = true, maxCrawlDiscovered = 5000, monitoringContext = null,
|
|
1319
|
+
// Backpressure salvage: when provided, every page body that comes back is
|
|
1320
|
+
// pushed into this caller-owned array as it's fetched. If the watchdog aborts
|
|
1321
|
+
// mid-crawl and this function throws, the caller still holds the partial set
|
|
1322
|
+
// (the local `pages` array would otherwise be lost with the stack frame).
|
|
1323
|
+
pageSink) {
|
|
1276
1324
|
// Memoized SSRF validator. When guardSsrf is on, every URL fetched by the
|
|
1277
1325
|
// audit (source, sitemap entries, redirects, discovered links) goes through
|
|
1278
1326
|
// this. DNS is hit once per unique hostname per audit — a 4k-page audit on
|
|
@@ -1359,7 +1407,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1359
1407
|
else {
|
|
1360
1408
|
urlsToFetch = sampledUrls;
|
|
1361
1409
|
}
|
|
1362
|
-
|
|
1410
|
+
// Reuse the caller's salvage sink as the live page accumulator so a
|
|
1411
|
+
// mid-crawl watchdog abort leaves the already-fetched pages visible to
|
|
1412
|
+
// the caller. Falls back to a private array when no sink is passed.
|
|
1413
|
+
const pages = pageSink ?? [];
|
|
1363
1414
|
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
1364
1415
|
const sourceOrigin = (() => { try {
|
|
1365
1416
|
return new URL(source).origin;
|
|
@@ -1469,7 +1520,10 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1469
1520
|
}
|
|
1470
1521
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
1471
1522
|
const initialPage = { url: source, html: text };
|
|
1472
|
-
|
|
1523
|
+
// See note above: reuse the caller's salvage sink so a watchdog abort
|
|
1524
|
+
// during link-discovery crawling preserves the pages fetched so far.
|
|
1525
|
+
const pages = pageSink ?? [];
|
|
1526
|
+
pages.push(initialPage);
|
|
1473
1527
|
if (crawlDiscovery) {
|
|
1474
1528
|
let sourceOrigin;
|
|
1475
1529
|
try {
|
|
@@ -1481,6 +1535,92 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1481
1535
|
const knownCrawled = new Set([source]);
|
|
1482
1536
|
const allDiscoveredUrls = new Set([source]);
|
|
1483
1537
|
const maxDepth = 3;
|
|
1538
|
+
// Sitemap-first discovery (like Google). Before link-crawling, read the
|
|
1539
|
+
// sitemap(s) the site declares — link-crawl only reaches *linked* pages,
|
|
1540
|
+
// but a pSEO site's whole point is thousands of programmatic URLs that
|
|
1541
|
+
// may be sparsely linked (or behind a build-frozen, under-linked nav).
|
|
1542
|
+
// Sources of truth, in order:
|
|
1543
|
+
// 1. `Sitemap:` directives in robots.txt (there can be several)
|
|
1544
|
+
// 2. failing that, probe /sitemap.xml then /sitemap_index.xml
|
|
1545
|
+
// Sitemap-listed URLs are authoritative, so we fetch them FIRST; the
|
|
1546
|
+
// link-crawl below then fills any remaining budget and dedups against
|
|
1547
|
+
// them. When no sitemap exists, this is a no-op and we crawl as before.
|
|
1548
|
+
if (sourceOrigin) {
|
|
1549
|
+
const robotsForDiscovery = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats, signal, validateHop);
|
|
1550
|
+
const probing = robotsForDiscovery.sitemaps.length === 0;
|
|
1551
|
+
const sitemapCandidates = probing
|
|
1552
|
+
? [`${sourceOrigin}/sitemap.xml`, `${sourceOrigin}/sitemap_index.xml`]
|
|
1553
|
+
: robotsForDiscovery.sitemaps;
|
|
1554
|
+
const visitedSitemaps = new Set();
|
|
1555
|
+
const sitemapListedUrls = [];
|
|
1556
|
+
for (const candidate of sitemapCandidates) {
|
|
1557
|
+
if (discoveryBudget > 0 && pages.length + sitemapListedUrls.length >= discoveryBudget)
|
|
1558
|
+
break;
|
|
1559
|
+
if (visitedSitemaps.has(candidate))
|
|
1560
|
+
continue;
|
|
1561
|
+
let smText;
|
|
1562
|
+
let smType;
|
|
1563
|
+
try {
|
|
1564
|
+
if (validateHop)
|
|
1565
|
+
await validateHop(candidate);
|
|
1566
|
+
const fetched = await fetchWithRetry(candidate, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1567
|
+
if (!fetched)
|
|
1568
|
+
continue;
|
|
1569
|
+
smText = fetched.text;
|
|
1570
|
+
smType = fetched.contentType;
|
|
1571
|
+
}
|
|
1572
|
+
catch {
|
|
1573
|
+
continue; // SSRF refusal, network error, etc. — skip this candidate
|
|
1574
|
+
}
|
|
1575
|
+
if (!(smType.includes("xml") || looksLikeSitemap(smText)))
|
|
1576
|
+
continue;
|
|
1577
|
+
const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
|
|
1578
|
+
pushAll(sitemapListedUrls, discoveredSmUrls);
|
|
1579
|
+
// When probing the conventional paths, stop at the first that hits.
|
|
1580
|
+
if (probing && discoveredSmUrls.length > 0)
|
|
1581
|
+
break;
|
|
1582
|
+
}
|
|
1583
|
+
// Same-origin + robots-aware filter, deduped against what we have.
|
|
1584
|
+
const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
|
|
1585
|
+
if (knownCrawled.has(u))
|
|
1586
|
+
return false;
|
|
1587
|
+
try {
|
|
1588
|
+
const parsed = new URL(u);
|
|
1589
|
+
if (parsed.origin !== sourceOrigin)
|
|
1590
|
+
return false;
|
|
1591
|
+
if (respectRobotsTxt && isDisallowedByRobots(parsed.pathname, robotsForDiscovery.disallow)) {
|
|
1592
|
+
skippedByRobots.push(u);
|
|
1593
|
+
return false;
|
|
1594
|
+
}
|
|
1595
|
+
return true;
|
|
1596
|
+
}
|
|
1597
|
+
catch {
|
|
1598
|
+
return false;
|
|
1599
|
+
}
|
|
1600
|
+
});
|
|
1601
|
+
for (const u of seedUrls)
|
|
1602
|
+
allDiscoveredUrls.add(u);
|
|
1603
|
+
// Cap the seed fetch. With a sampling budget, fit under it; without one
|
|
1604
|
+
// (the default "audit everything" path) bound by maxCrawlDiscovered, the
|
|
1605
|
+
// same ceiling the link-crawl honors — otherwise a homepage audit of a
|
|
1606
|
+
// site with a 50k-URL sitemap would try to fetch all of them (the link
|
|
1607
|
+
// crawl never could, so this would be an unbounded-egress regression).
|
|
1608
|
+
const seedToFetch = discoveryBudget > 0
|
|
1609
|
+
? seedUrls.slice(0, Math.max(0, discoveryBudget - pages.length))
|
|
1610
|
+
: seedUrls.slice(0, maxCrawlDiscovered);
|
|
1611
|
+
if (seedToFetch.length > 0) {
|
|
1612
|
+
await runWithConcurrency(seedToFetch, concurrency, async (url) => {
|
|
1613
|
+
if (budgetExceeded(byteBudget))
|
|
1614
|
+
return;
|
|
1615
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats, signal, validateHop, followRedirects);
|
|
1616
|
+
knownCrawled.add(url);
|
|
1617
|
+
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
1618
|
+
byteBudget.used += result.html.length;
|
|
1619
|
+
pages.push(result);
|
|
1620
|
+
}
|
|
1621
|
+
});
|
|
1622
|
+
}
|
|
1623
|
+
}
|
|
1484
1624
|
for (let depth = 0; depth < maxDepth; depth += 1) {
|
|
1485
1625
|
// Stop if we've hit the discovery budget
|
|
1486
1626
|
if (discoveryBudget > 0 && pages.length >= discoveryBudget)
|
|
@@ -1542,7 +1682,7 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
1542
1682
|
knownCrawled.add(url);
|
|
1543
1683
|
}
|
|
1544
1684
|
});
|
|
1545
|
-
pages
|
|
1685
|
+
pushAll(pages, newPages);
|
|
1546
1686
|
if (newPages.length === 0)
|
|
1547
1687
|
break;
|
|
1548
1688
|
}
|
|
@@ -1633,6 +1773,13 @@ export async function auditSource(source, options) {
|
|
|
1633
1773
|
const backpressureEnabled = options?.backpressure !== false;
|
|
1634
1774
|
const backpressureAbort = new AbortController();
|
|
1635
1775
|
let backpressureError = null;
|
|
1776
|
+
// Set once we've decided to salvage a partial report after a watchdog abort.
|
|
1777
|
+
// From that point `throwIfAborted` must NOT re-throw the backpressure error —
|
|
1778
|
+
// the watchdog already did its job (stopped fetching); the rest of the
|
|
1779
|
+
// pipeline runs over the pages collected so far and the truncation is
|
|
1780
|
+
// surfaced on the summary instead.
|
|
1781
|
+
let truncated = false;
|
|
1782
|
+
let truncatedReason;
|
|
1636
1783
|
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
1637
1784
|
const observer = new FetchObserver();
|
|
1638
1785
|
// 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
|
|
@@ -1674,12 +1821,32 @@ export async function auditSource(source, options) {
|
|
|
1674
1821
|
backpressureAbort.abort(backpressureError);
|
|
1675
1822
|
}
|
|
1676
1823
|
};
|
|
1824
|
+
// Flip the run into salvage mode after a watchdog abort: record the reason so
|
|
1825
|
+
// assembly sets summary.truncated, and from here `throwIfAborted` will no
|
|
1826
|
+
// longer re-throw the backpressure error. Idempotent. Returns true when a
|
|
1827
|
+
// backpressure abort was present to salvage.
|
|
1828
|
+
function salvageBackpressure() {
|
|
1829
|
+
if (!backpressureError)
|
|
1830
|
+
return false;
|
|
1831
|
+
truncated = true;
|
|
1832
|
+
truncatedReason = backpressureError.message;
|
|
1833
|
+
return true;
|
|
1834
|
+
}
|
|
1677
1835
|
function throwIfAborted() {
|
|
1678
|
-
|
|
1679
|
-
|
|
1836
|
+
// An EXTERNAL abort (ctrl-C, parent timeout) is always fatal: the caller
|
|
1837
|
+
// asked to stop, not to degrade. Check it first so it wins over salvage.
|
|
1680
1838
|
if (externalSignal?.aborted) {
|
|
1681
1839
|
throw externalSignal.reason ?? new DOMException("Audit aborted", "AbortError");
|
|
1682
1840
|
}
|
|
1841
|
+
// A backpressure abort is salvageable. Once we've committed to a partial
|
|
1842
|
+
// report (`truncated`), swallow it and let the pipeline finish over the
|
|
1843
|
+
// pages collected so far. Before that commit, the loader-boundary catch
|
|
1844
|
+
// handles it; this guard only fires on the rare path where the loader
|
|
1845
|
+
// returned normally (e.g. a fetch mock that ignores the abort signal) yet
|
|
1846
|
+
// the watchdog still voted to abort — salvage rather than crash.
|
|
1847
|
+
if (backpressureError && !truncated) {
|
|
1848
|
+
salvageBackpressure();
|
|
1849
|
+
}
|
|
1683
1850
|
}
|
|
1684
1851
|
const resolvedRules = {
|
|
1685
1852
|
nearDuplicateThreshold: options?.rules?.nearDuplicateThreshold ?? DEFAULTS.nearDuplicateThreshold,
|
|
@@ -1834,13 +2001,26 @@ export async function auditSource(source, options) {
|
|
|
1834
2001
|
}
|
|
1835
2002
|
: undefined;
|
|
1836
2003
|
const pinnedPages = [];
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
2004
|
+
try {
|
|
2005
|
+
await runWithConcurrency(Array.from(pinned), concurrency, async (url) => {
|
|
2006
|
+
const result = await fetchPageWithMeta(url, timeoutMs, cacheConfig, cacheStats, signal, validateHopPinned, followRedirects);
|
|
2007
|
+
if (result) {
|
|
2008
|
+
fetchByteBudget.used += result.html.length;
|
|
2009
|
+
pinnedPages.push(result);
|
|
2010
|
+
}
|
|
2011
|
+
});
|
|
2012
|
+
}
|
|
2013
|
+
catch (err) {
|
|
2014
|
+
// Same salvage contract as the sitemap/crawl path: a watchdog abort
|
|
2015
|
+
// mid-fetch keeps the pages already collected in `pinnedPages`. Any other
|
|
2016
|
+
// error (external abort, SSRF rejection) is fatal — re-throw it.
|
|
2017
|
+
if (err instanceof OriginDegradedError) {
|
|
2018
|
+
salvageBackpressure();
|
|
1842
2019
|
}
|
|
1843
|
-
|
|
2020
|
+
else {
|
|
2021
|
+
throw err;
|
|
2022
|
+
}
|
|
2023
|
+
}
|
|
1844
2024
|
loadedPagesRaw = pinnedPages;
|
|
1845
2025
|
// No sitemap context in pinned mode
|
|
1846
2026
|
sitemapUrlSet = undefined;
|
|
@@ -1849,12 +2029,46 @@ export async function auditSource(source, options) {
|
|
|
1849
2029
|
scrapePlan = undefined;
|
|
1850
2030
|
}
|
|
1851
2031
|
else {
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
2032
|
+
// Salvage sink: loadPagesFromSource fills this incrementally as pages come
|
|
2033
|
+
// back. If the backpressure watchdog aborts mid-crawl the call throws an
|
|
2034
|
+
// OriginDegradedError and the function's own return value is lost — but the
|
|
2035
|
+
// already-fetched pages survive here, so we recover them and continue the
|
|
2036
|
+
// pipeline with a `truncated` flag instead of throwing the whole run away.
|
|
2037
|
+
const pageSink = [];
|
|
2038
|
+
try {
|
|
2039
|
+
const loaded = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget, signal, guardSsrf, respectRobotsTxt, skippedByRobots, followRedirects, maxCrawlDiscovered, monitoringContext, pageSink);
|
|
2040
|
+
loadedPagesRaw = loaded.pages;
|
|
2041
|
+
sitemapUrlSet = loaded.sitemapUrls;
|
|
2042
|
+
sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
|
|
2043
|
+
discoveredUrlCount = loaded.discoveredUrlCount;
|
|
2044
|
+
scrapePlan = loaded.scrapePlan;
|
|
2045
|
+
}
|
|
2046
|
+
catch (err) {
|
|
2047
|
+
// Only the watchdog abort is salvageable. An external abort (ctrl-C /
|
|
2048
|
+
// parent timeout) or any other error is fatal — re-throw it untouched so
|
|
2049
|
+
// --no-backpressure and ctrl-C behaviour are unchanged.
|
|
2050
|
+
if (err instanceof OriginDegradedError) {
|
|
2051
|
+
// Prefer the canonical backpressureError message (same object the
|
|
2052
|
+
// monitor raised); fall back to the caught error if somehow distinct.
|
|
2053
|
+
if (!salvageBackpressure()) {
|
|
2054
|
+
truncated = true;
|
|
2055
|
+
truncatedReason = err.message;
|
|
2056
|
+
}
|
|
2057
|
+
// Recover whatever was fetched before the abort. The sink is the same
|
|
2058
|
+
// array loadPagesFromSource was pushing into, so it holds the partial
|
|
2059
|
+
// page set even though the function never reached its `return`.
|
|
2060
|
+
loadedPagesRaw = pageSink;
|
|
2061
|
+
// No sitemap/discovery context survives a mid-sitemap abort; the
|
|
2062
|
+
// downstream classifier falls back to the loaded page URLs.
|
|
2063
|
+
sitemapUrlSet = undefined;
|
|
2064
|
+
sitemapLastmodByUrl = undefined;
|
|
2065
|
+
discoveredUrlCount = undefined;
|
|
2066
|
+
scrapePlan = undefined;
|
|
2067
|
+
}
|
|
2068
|
+
else {
|
|
2069
|
+
throw err;
|
|
2070
|
+
}
|
|
2071
|
+
}
|
|
1858
2072
|
}
|
|
1859
2073
|
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
1860
2074
|
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
@@ -2076,29 +2290,29 @@ export async function auditSource(source, options) {
|
|
|
2076
2290
|
// Site-wide rules (run once, outside group loop)
|
|
2077
2291
|
if (sitemapUrlSet && sitemapUrlSet.size > 0 && auditMode !== "diff") {
|
|
2078
2292
|
const sitemapFindings = sitemapCompletenessRule(parsedPages, sitemapUrlSet);
|
|
2079
|
-
allFindings
|
|
2293
|
+
pushAll(allFindings, sitemapFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2080
2294
|
if (robotsTxtContent) {
|
|
2081
2295
|
const robotsFindings = robotsComplianceRule(parsedPages, sitemapUrlSet, robotsTxtContent);
|
|
2082
|
-
allFindings
|
|
2296
|
+
pushAll(allFindings, robotsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2083
2297
|
}
|
|
2084
2298
|
}
|
|
2085
2299
|
// AEO site-wide rules. These run unconditionally (consistent with sitemap-completeness
|
|
2086
2300
|
// and robots-compliance); page-group rule lists govern per-page AEO rules only.
|
|
2087
2301
|
const llmsFindings = await llmsTxtRule(source, { timeoutMs });
|
|
2088
|
-
allFindings
|
|
2302
|
+
pushAll(allFindings, llmsFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2089
2303
|
if (robotsTxtContent) {
|
|
2090
2304
|
const crawlerFindings = crawlerAccessRule(robotsTxtContent);
|
|
2091
|
-
allFindings
|
|
2305
|
+
pushAll(allFindings, crawlerFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2092
2306
|
}
|
|
2093
2307
|
// Data source comparison rules
|
|
2094
2308
|
if (options?.dataSource?.records && options.dataSource.records.length > 0) {
|
|
2095
2309
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/missing-binding")) {
|
|
2096
2310
|
const dataBindingFindings = dataBindingRule(parsedPages, options.dataSource.records);
|
|
2097
|
-
allFindings
|
|
2311
|
+
pushAll(allFindings, dataBindingFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2098
2312
|
}
|
|
2099
2313
|
if (auditMode !== "diff" || isRuleAllowedInDiff("data/identical-across-pages")) {
|
|
2100
2314
|
const dataIdenticalFindings = dataIdenticalRule(parsedPages, options.dataSource.records);
|
|
2101
|
-
allFindings
|
|
2315
|
+
pushAll(allFindings, dataIdenticalFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2102
2316
|
}
|
|
2103
2317
|
}
|
|
2104
2318
|
for (const [groupName, groupPages] of classified) {
|
|
@@ -2118,7 +2332,7 @@ export async function auditSource(source, options) {
|
|
|
2118
2332
|
// because the nav paths between locale-specific currency-converter URLs
|
|
2119
2333
|
// were not in the pinned set).
|
|
2120
2334
|
isSampledAudit || hasPinnedUrlsEarly);
|
|
2121
|
-
allFindings
|
|
2335
|
+
pushAll(allFindings, findings);
|
|
2122
2336
|
groupPageCounts[groupName] = groupPages.length;
|
|
2123
2337
|
// v0.4.3: per-group scoring uses the same site-classification profile so
|
|
2124
2338
|
// group-level risk numbers reflect the same severity / confidence remaps
|
|
@@ -2136,7 +2350,7 @@ export async function auditSource(source, options) {
|
|
|
2136
2350
|
(auditMode === "full" || isRuleAllowedInDiff("content/value-add"));
|
|
2137
2351
|
if (isValueAddEnabled) {
|
|
2138
2352
|
const valueAddFindings = valueAddRule(parsedPages, allFindings);
|
|
2139
|
-
allFindings
|
|
2353
|
+
pushAll(allFindings, valueAddFindings.map((f) => ({ ...f, ref: f.ref ?? RULE_REFERENCES[f.ruleId] })));
|
|
2140
2354
|
}
|
|
2141
2355
|
}
|
|
2142
2356
|
// Enrich findings: cluster pairwise, detect templates, assign effort
|
|
@@ -2260,6 +2474,14 @@ export async function auditSource(source, options) {
|
|
|
2260
2474
|
? [...parsedPages.map((p) => p.url)].sort()
|
|
2261
2475
|
: undefined,
|
|
2262
2476
|
};
|
|
2477
|
+
// Partial-report flag: the backpressure watchdog aborted mid-crawl and we
|
|
2478
|
+
// salvaged whatever pages had been fetched. Consumers MUST treat coverage as
|
|
2479
|
+
// a lower bound (counts/verdict are partial). Only set when actually
|
|
2480
|
+
// truncated so complete runs keep `truncated` absent.
|
|
2481
|
+
if (truncated) {
|
|
2482
|
+
summary.truncated = true;
|
|
2483
|
+
summary.truncatedReason = truncatedReason;
|
|
2484
|
+
}
|
|
2263
2485
|
if (cacheConfig) {
|
|
2264
2486
|
summary.cacheStats = cacheStats;
|
|
2265
2487
|
}
|