@pseolint/core 0.6.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +3 -3
  2. package/dist/algorithms/fact-extraction.d.ts +46 -0
  3. package/dist/algorithms/fact-extraction.d.ts.map +1 -0
  4. package/dist/algorithms/fact-extraction.js +217 -0
  5. package/dist/algorithms/fact-extraction.js.map +1 -0
  6. package/dist/auditor.d.ts.map +1 -1
  7. package/dist/auditor.js +145 -15
  8. package/dist/auditor.js.map +1 -1
  9. package/dist/index.d.ts +4 -0
  10. package/dist/index.d.ts.map +1 -1
  11. package/dist/index.js +3 -0
  12. package/dist/index.js.map +1 -1
  13. package/dist/origin-preflight.d.ts +89 -0
  14. package/dist/origin-preflight.d.ts.map +1 -0
  15. package/dist/origin-preflight.js +93 -0
  16. package/dist/origin-preflight.js.map +1 -0
  17. package/dist/rule-references.d.ts.map +1 -1
  18. package/dist/rule-references.js +1 -0
  19. package/dist/rule-references.js.map +1 -1
  20. package/dist/rules/aeo/citable-facts.d.ts.map +1 -1
  21. package/dist/rules/aeo/citable-facts.js +4 -33
  22. package/dist/rules/aeo/citable-facts.js.map +1 -1
  23. package/dist/rules/content/citation-coverage.d.ts +11 -0
  24. package/dist/rules/content/citation-coverage.d.ts.map +1 -0
  25. package/dist/rules/content/citation-coverage.js +43 -0
  26. package/dist/rules/content/citation-coverage.js.map +1 -0
  27. package/dist/rules/content/unique-value.d.ts.map +1 -1
  28. package/dist/rules/content/unique-value.js +29 -4
  29. package/dist/rules/content/unique-value.js.map +1 -1
  30. package/dist/rules/content/value-add.d.ts.map +1 -1
  31. package/dist/rules/content/value-add.js +3 -1
  32. package/dist/rules/content/value-add.js.map +1 -1
  33. package/dist/rules/scope.d.ts.map +1 -1
  34. package/dist/rules/scope.js +1 -0
  35. package/dist/rules/scope.js.map +1 -1
  36. package/dist/site-classifier.d.ts.map +1 -1
  37. package/dist/site-classifier.js +1 -0
  38. package/dist/site-classifier.js.map +1 -1
  39. package/dist/types.d.ts +20 -5
  40. package/dist/types.d.ts.map +1 -1
  41. package/package.json +93 -93
  42. package/schemas/audit-summary.schema.json +6 -1
package/README.md CHANGED
@@ -1,8 +1,8 @@
1
1
  # @pseolint/core
2
2
 
3
- > Programmatic SEO audit engine — 45 rules, surfaced per-template, on every monitored release.
3
+ > Programmatic SEO audit engine — 44 rules, surfaced per-template, on every monitored release.
4
4
 
5
- The core engine behind [pseolint](https://www.npmjs.com/package/pseolint) v0.6.2. Use this package to embed pSEO auditing into your own tools, CI pipelines, or SaaS products.
5
+ The core engine behind [pseolint](https://www.npmjs.com/package/pseolint) v0.7.0. Use this package to embed pSEO auditing into your own tools, CI pipelines, or SaaS products.
6
6
 
7
7
  ## Install
8
8
 
@@ -34,7 +34,7 @@ for (const t of result.templates) {
34
34
 
35
35
  ## What It Checks
36
36
 
37
- 45 rules grouped into 4 scoring super-categories (v0.4): **Integrity** (spam + content + cannibal, weight 0.50), **Discoverability** (links + tech, 0.20), **Citation** (aeo + schema, 0.25), **Data** (0.05). Source-tree namespaces remain `spam/*`, `aeo/*`, etc. for stable rule IDs.
37
+ 44 rules grouped into 4 scoring super-categories (v0.4): **Integrity** (spam + content + cannibal, weight 0.50), **Discoverability** (links + tech, 0.20), **Citation** (aeo + schema, 0.25), **Data** (0.05). Source-tree namespaces remain `spam/*`, `aeo/*`, etc. for stable rule IDs.
38
38
 
39
39
  - **Spam / SpamBrain risk** (8) — near-duplicate (SimHash), entity-swap doorways, thin content, boilerplate ratio, template diversity, template coverage, publication velocity, doorway pattern (cluster-collapsed since v0.5.2)
40
40
  - **Technical SEO** (9) — canonical consistency, canonical/noindex and robots/noindex conflicts, sitemap completeness, robots compliance, redirect chains, soft 404s, hreflang reciprocity, robots-sitemap presence, **og-completeness** (v0.5.2)
@@ -0,0 +1,46 @@
1
+ import type { EntityMaskPattern, ParsedPage } from "../types.js";
2
// Category tag carried by a FactSpan.
// In this release's fact-extraction.js, extractMeasurements only ever emits
// "ratio" and "measurement" spans.
// NOTE(review): the remaining kinds presumably mirror the citable-fact
// patterns (money/percent/timeframe/date/isoDate/form) — confirm before relying on them.
+ export type FactKind = "money" | "percent" | "timeframe" | "date" | "isoDate" | "form" | "ratio" | "measurement";
3
// One extracted numeric fact.
+ export interface FactSpan {
4
// Matched text, whitespace-collapsed, trimmed and lower-cased by extractMeasurements.
+ value: string;
5
// Which pattern family produced the match.
+ kind: FactKind;
6
+ }
7
// A named entity found in page text or structured data.
+ export interface NamedEntity {
8
// Entity name, whitespace-collapsed and lower-cased; also used as the de-duplication key.
+ value: string;
9
// How it was detected: "json-ld" = a JSON-LD node with a recognised @type and a name;
// "cue-word" = a known acronym, or a capitalised phrase containing a cue word (Inc, Act, University, ...);
// "proper-noun" = any other multi-word capitalised phrase.
+ source: "proper-noun" | "cue-word" | "json-ld";
10
// Optional coarse category of the entity.
+ type?: "organization" | "person" | "product" | "law" | "standard" | "place" | "other";
11
+ }
12
// An outbound (off-site) link, classified by classifyCitations.
+ export interface Citation {
13
// The link URL exactly as supplied to classifyCitations (used as the de-duplication key).
+ href: string;
14
// registrableDomain() of the link's host.
+ domain: string;
15
// "authoritative" when the host has a gov/edu/mil/int-style TLD or matches the allowlist; otherwise "general".
+ authority: "authoritative" | "general";
16
// Why the link is authoritative; absent on "general" citations. The TLD check runs before the allowlist check.
+ reason?: "tld" | "allowlist";
17
+ }
18
// A statistic that co-occurs with an outbound link in the same <p>/<li> block.
+ export interface GroundedClaim {
19
// First sentence of the block that contains a fact, trimmed and cut to 240 characters.
+ sentence: string;
20
// Citable facts followed by measurement values found in that sentence.
+ facts: string[];
21
// hrefs of every outbound citation found in the block (not only those inside the sentence).
+ citations: string[];
22
+ }
23
// Everything extractPageFacts derives from one parsed page.
+ export interface PageFacts {
24
+ /** EXACTLY today's extractRawFacts() output (run on entity-masked text). Frozen. */
25
+ citableFacts: string[];
26
// Ratio / unit-measurement spans from the entity-masked content text.
+ measurements: FactSpan[];
27
// Entities from the entity-masked content text plus the page's JSON-LD.
+ namedEntities: NamedEntity[];
28
// Classified outbound links taken from the page's resolved hrefs.
+ citations: Citation[];
29
// Block-level statistic + citation co-occurrences extracted from the page HTML.
+ groundedClaims: GroundedClaim[];
30
+ }
31
// Returns the distinct, trimmed, lower-cased matches of the six frozen citable-fact patterns.
+ export declare function extractCitableFacts(text: string): string[];
32
// Returns ratio and unit-measurement spans, de-duplicated by normalised value.
+ export declare function extractMeasurements(maskedText: string): FactSpan[];
33
// Collects entities from JSON-LD nodes (default: none), known acronyms, and multi-word capitalised phrases.
+ export declare function extractNamedEntities(maskedText: string, jsonLd?: unknown[]): NamedEntity[];
34
// Default list of domains treated as authoritative; an entry matches the host itself or any subdomain.
+ export declare const DEFAULT_CITATION_ALLOWLIST: readonly string[];
35
// Strips a leading "www." and returns the last two host labels, or the last three for a
// small built-in set of multi-part suffixes (co.uk, com.au, ...). Not a full public-suffix lookup.
+ export declare function registrableDomain(host: string): string;
36
// Classifies outbound links: unparseable hrefs and links to the page's own registrable
// domain are skipped, and repeated hrefs are reported once.
+ export declare function classifyCitations(resolvedHrefs: readonly string[], pageUrl: string, allowlist?: readonly string[]): Citation[];
37
// True when classifyCitations yields at least one "authoritative" citation.
+ export declare function hasAuthoritativeCitation(resolvedHrefs: readonly string[], pageUrl: string, allowlist?: readonly string[]): boolean;
38
+ /**
39
+ * Deterministic approximation of "a verifiable claim": a block (<p>/<li>) that
40
+ * contains a statistic AND an outbound citation. Approximated at block level,
41
+ * not exact sentence level — documented limitation. Detects co-occurrence, not
42
+ * semantic truth. Consume at `speculative` confidence.
43
+ */
44
+ export declare function extractGroundedClaims(html: string, pageUrl: string, allowlist?: readonly string[]): GroundedClaim[];
45
// Masks entities in page.contentText, then runs every extractor above and bundles the results.
+ export declare function extractPageFacts(page: Pick<ParsedPage, "url" | "contentText" | "html" | "resolvedHrefs" | "jsonLd">, entityPatterns: EntityMaskPattern[], allowlist?: readonly string[]): PageFacts;
46
+ //# sourceMappingURL=fact-extraction.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fact-extraction.d.ts","sourceRoot":"","sources":["../../src/algorithms/fact-extraction.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEjE,MAAM,MAAM,QAAQ,GAChB,OAAO,GAAG,SAAS,GAAG,WAAW,GAAG,MAAM,GAAG,SAAS,GAAG,MAAM,GAC/D,OAAO,GAAG,aAAa,CAAC;AAE5B,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,QAAQ,CAAC;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,GAAG,UAAU,GAAG,SAAS,CAAC;IAC/C,IAAI,CAAC,EAAE,cAAc,GAAG,QAAQ,GAAG,SAAS,GAAG,KAAK,GAAG,UAAU,GAAG,OAAO,GAAG,OAAO,CAAC;CACvF;AAED,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,eAAe,GAAG,SAAS,CAAC;IACvC,MAAM,CAAC,EAAE,KAAK,GAAG,WAAW,CAAC;CAC9B;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,SAAS,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,MAAM,WAAW,SAAS;IACxB,oFAAoF;IACpF,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,YAAY,EAAE,QAAQ,EAAE,CAAC;IACzB,aAAa,EAAE,WAAW,EAAE,CAAC;IAC7B,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,cAAc,EAAE,aAAa,EAAE,CAAC;CACjC;AAqBD,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAQ1D;AAWD,wBAAgB,mBAAmB,CAAC,UAAU,EAAE,MAAM,GAAG,QAAQ,EAAE,CAclE;AA4BD,wBAAgB,oBAAoB,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,GAAE,OAAO,EAAO,GAAG,WAAW,EAAE,CAe9F;AAED,eAAO,MAAM,0BAA0B,EAAE,SAAS,MAAM,EAIvD,CAAC;AAWF,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAMtD;AAMD,wBAAgB,iBAAiB,CAC/B,aAAa,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,QAAQ,EAAE,CAqBZ;AAED,wBAAgB,wBAAwB,CACtC,aAAa,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,OAAO,CAET;AAYD;;;;;GAKG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,aAAa,EAAE,CA0BjB;AAED,wBAAgB,gBAAgB,CAC9B,IAAI,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,GAAG,aAAa,GAAG,MAAM,GAAG,eAAe,GAAG,QAAQ,CAAC,EACnF,cAAc,EAAE,iBAAiB,EAAE,EACnC,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,SAAS,CASX"}
@@ -0,0 +1,217 @@
1
+ import { load } from "cheerio";
2
+ import { maskEntities } from "./entity-mask.js";
3
+ // --- Numeric "citable" facts: the frozen subset aeo/citable-facts counts. ---
4
+ // These six patterns are lifted verbatim from rules/aeo/citable-facts.ts and
5
+ // MUST stay byte-identical to preserve the calibration corpus.
6
// Frozen pattern table — every regex literal below is reproduced verbatim and
// must not be edited (the calibration corpus depends on the exact matches).
const CITABLE_FACT_PATTERNS = [
    { name: "dollar", regex: /\$[\d,]+(\.\d{2})?/g },
    { name: "percent", regex: /\b\d+(\.\d+)?\s*%/g },
    {
        name: "timeframe",
        regex: /\b\d+(?:-\d+)?\s*(business\s+days?|days?|weeks?|months?|years?|hours?|minutes?)\b/gi,
    },
    {
        name: "date",
        regex: /\b(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}(?:,\s*\d{4})?\b/gi,
    },
    { name: "isoDate", regex: /\b\d{4}-\d{2}-\d{2}\b/g },
    { name: "form", regex: /\bForm\s+[A-Z0-9][A-Z0-9-]*\b/g },
];

/**
 * Collect the distinct citable facts (money, percentages, timeframes, dates,
 * ISO dates, form identifiers) that occur in `text`.
 *
 * @param text Text to scan (callers pass entity-masked page text).
 * @returns Trimmed, lower-cased matches in first-seen order: pattern-table
 *          order first, then position within the text. Duplicates are dropped.
 */
export function extractCitableFacts(text) {
    const unique = new Set();
    CITABLE_FACT_PATTERNS.forEach((pattern) => {
        // Every pattern carries the `g` flag, so matchAll yields the same
        // full-match strings that String.prototype.match would return.
        for (const hit of text.matchAll(pattern.regex)) {
            unique.add(hit[0].trim().toLowerCase());
        }
    });
    return [...unique];
}
31
+ // --- Measurements: NEW numeric kinds, deliberately separate from citableFacts. ---
32
const MEASUREMENT_UNITS = "kg|g|lb|lbs|oz|mi|km|cm|mm|ft|in|MB|GB|TB|KB|ms|fps|mph|kWh";
// Order matters: ratio patterns run first so that they can claim their text
// before the unit pattern gets a chance to re-read part of it.
const MEASUREMENT_PATTERNS = [
    { kind: "ratio", regex: /\b\d+(?:\.\d+)?\s*(?:out of|in)\s*\d+\b/gi },
    { kind: "ratio", regex: /\b\d+\s*:\s*\d+\b/g },
    { kind: "measurement", regex: new RegExp(`\\b\\d+(?:\\.\\d+)?\\s*(?:${MEASUREMENT_UNITS})\\b`, "g") },
];

/**
 * Extract ratio ("3 out of 4", "1 in 5", "16:9") and unit-measurement
 * ("5 kg", "12 in") spans from entity-masked text.
 *
 * A match that lies entirely inside a span already matched by an earlier
 * pattern is discarded. Without this, a ratio such as "1 in 5" was also
 * reported as the bogus inch measurement "1 in", because "in" doubles as a
 * unit abbreviation.
 *
 * @param maskedText Text to scan.
 * @returns Spans de-duplicated by normalised value (whitespace collapsed,
 *          trimmed, lower-cased), ordered by pattern and then by position.
 */
export function extractMeasurements(maskedText) {
    const seen = new Set();
    const out = [];
    // [start, end) offsets of every match produced by the patterns processed so far.
    const claimed = [];
    for (const { kind, regex } of MEASUREMENT_PATTERNS) {
        const spans = [];
        for (const match of maskedText.matchAll(regex)) {
            const start = match.index ?? 0;
            const end = start + match[0].length;
            // Record the span even when the value turns out to be a duplicate,
            // so repeated ratios keep shielding their text from later patterns.
            spans.push([start, end]);
            if (claimed.some(([s, e]) => s <= start && end <= e))
                continue;
            const value = match[0].replace(/\s+/g, " ").trim().toLowerCase();
            if (seen.has(value))
                continue;
            seen.add(value);
            out.push({ value, kind });
        }
        // Only expose this pattern's spans to the patterns that follow it.
        claimed.push(...spans);
    }
    return out;
}
55
const MULTI_WORD_PROPER_NOUN = /\b[A-Z][a-z]+(?:\s+(?:of\s+|de\s+|and\s+|the\s+)?[A-Z][a-z]+)+\b/g;
const ACRONYM = /\b(?:ISO|GDPR|HIPAA|FDA|SEC|FTC|EPA|W3C|IETF|RFC|NIST|OSHA|IRS|EU|UN|WHO|CCPA|PCI)\b/g;
const CUE_WORD = /\b(?:Inc|LLC|Ltd|Corp|GmbH|Act|Regulation|Directive|Agency|Department|Bureau|Commission|Authority|University|Institute|Association|Standard|Protocol)\b/;
// Recognised schema.org @type values, mapped to the NamedEntity category they
// represent. Previously every recognised type was labelled "organization".
const JSON_LD_ENTITY_TYPES = new Map([
    ["Organization", "organization"],
    ["GovernmentOrganization", "organization"],
    ["Corporation", "organization"],
    ["NGO", "organization"],
    ["Person", "person"],
    ["Product", "product"],
    // A Brand names a product line, so it is grouped with products.
    ["Brand", "product"],
]);

/**
 * Walk arbitrary JSON-LD data and collect every node that has a string `name`
 * and a recognised `@type`.
 *
 * `@type` may be a single string or an array of strings (both are valid
 * JSON-LD); the first recognised entry decides the entity category.
 *
 * @param nodes Parsed JSON-LD values (objects, arrays, or primitives).
 * @returns Entities with a trimmed, lower-cased name and a mapped category.
 */
function jsonLdEntities(nodes) {
    const out = [];
    const visit = (node) => {
        if (Array.isArray(node)) {
            node.forEach(visit);
            return;
        }
        if (typeof node !== "object" || node === null)
            return;
        const obj = node;
        const rawType = obj["@type"];
        const name = obj["name"];
        const declaredTypes = Array.isArray(rawType) ? rawType : [rawType];
        const knownType = declaredTypes.find((t) => typeof t === "string" && JSON_LD_ENTITY_TYPES.has(t));
        if (typeof name === "string" && knownType !== undefined) {
            out.push({
                value: name.trim().toLowerCase(),
                source: "json-ld",
                type: JSON_LD_ENTITY_TYPES.get(knownType),
            });
        }
        // Recurse into every property so nested nodes (publisher, author, @graph, ...) are found.
        for (const v of Object.values(obj))
            visit(v);
    };
    nodes.forEach(visit);
    return out;
}

/**
 * Extract named entities from entity-masked text and optional JSON-LD.
 *
 * Sources are consulted in priority order — JSON-LD, then known acronyms,
 * then multi-word capitalised phrases — and the first source to produce a
 * given (lower-cased) name wins.
 *
 * @param maskedText Text to scan.
 * @param jsonLd Parsed JSON-LD values from the page; defaults to none.
 * @returns De-duplicated entities. JSON-LD entities carry a `type`; text-derived
 *          entities have only `value` and `source`.
 */
export function extractNamedEntities(maskedText, jsonLd = []) {
    const seen = new Set();
    const out = [];
    const push = (value, source, type) => {
        const v = value.replace(/\s+/g, " ").trim().toLowerCase();
        if (v.length < 2 || seen.has(v))
            return;
        seen.add(v);
        // Attach `type` only when known so text-derived entities keep their two-key shape.
        out.push(type === undefined ? { value: v, source } : { value: v, source, type });
    };
    for (const m of jsonLdEntities(jsonLd))
        push(m.value, "json-ld", m.type);
    for (const m of maskedText.match(ACRONYM) ?? [])
        push(m, "cue-word");
    for (const m of maskedText.match(MULTI_WORD_PROPER_NOUN) ?? []) {
        // CUE_WORD has no `g` flag, so .test() is stateless across iterations.
        push(m, CUE_WORD.test(m) ? "cue-word" : "proper-noun");
    }
    return out;
}
102
// Domains that classifyCitations treats as authoritative when the caller does
// not supply its own allowlist. An entry matches the host itself or any of its
// subdomains. The TLD rule is evaluated before this list, so entries such as
// "nih.gov" or "who.int" are reported with reason "tld", not "allowlist".
+ export const DEFAULT_CITATION_ALLOWLIST = [
103
+ "wikipedia.org", "w3.org", "iso.org", "ietf.org", "rfc-editor.org",
104
+ "doi.org", "nih.gov", "ncbi.nlm.nih.gov", "who.int", "schema.org",
105
+ "oecd.org", "worldbank.org", "europa.eu",
106
+ ];
107
// Two-label public suffixes under which the registrable domain needs three
// labels. Deliberately small — this is not a full public-suffix list.
const MULTI_PART_SUFFIXES = new Set([
    "co.uk", "ac.uk", "gov.uk", "org.uk", "com.au", "gov.au", "edu.au",
    "co.jp", "co.nz", "co.za", "com.br",
]);

/**
 * Lower-cased hostname of `url`, or null when `url` cannot be parsed as an
 * absolute URL.
 */
function hostOf(url) {
    try {
        return new URL(url).hostname.toLowerCase();
    }
    catch {
        return null;
    }
}

/**
 * Reduce a hostname to its registrable domain ("blog.example.co.uk" →
 * "example.co.uk").
 *
 * The host is normalised first: lower-cased, a trailing root dot removed, and
 * a leading "www." stripped. IP literals have no registrable domain and are
 * returned whole; previously "192.168.1.1" collapsed to "1.1", which made
 * unrelated IP hosts look like the same site.
 *
 * @param host Hostname (no scheme, port, or path).
 * @returns The last two labels, or the last three when the last two form a
 *          known multi-part suffix; hosts with at most two labels are returned
 *          as normalised.
 */
export function registrableDomain(host) {
    const normalized = host.toLowerCase().replace(/\.$/, "").replace(/^www\./, "");
    // IPv6 literals contain ":"; IPv4 literals are four dot-separated numbers.
    if (normalized.includes(":") || /^\d{1,3}(?:\.\d{1,3}){3}$/.test(normalized))
        return normalized;
    const labels = normalized.split(".");
    if (labels.length <= 2)
        return labels.join(".");
    const lastTwo = labels.slice(-2).join(".");
    if (MULTI_PART_SUFFIXES.has(lastTwo))
        return labels.slice(-3).join(".");
    return lastTwo;
}
128
/**
 * Whether `host` ends in a suffix treated as inherently authoritative:
 * a restricted generic TLD (.gov, .edu, .mil, .int) or a gov/edu/ac
 * second-level label under a two-letter country code (e.g. ".ac.uk").
 */
function isAuthoritativeTld(host) {
    if (/\.(?:gov|edu|mil|int)$/.test(host)) {
        return true;
    }
    return /\.(?:gov|edu|ac)\.[a-z]{2}$/.test(host);
}
131
/**
 * Turn a page's resolved link targets into classified outbound citations.
 *
 * Links that cannot be parsed, that point at the page's own registrable
 * domain, or that repeat an href already emitted are left out.
 *
 * @param resolvedHrefs Absolute link URLs found on the page.
 * @param pageUrl URL of the page itself, used to recognise internal links.
 * @param allowlist Domains considered authoritative; each entry matches the
 *                  host itself or any subdomain of it.
 * @returns One Citation per distinct external href, in input order.
 */
export function classifyCitations(resolvedHrefs, pageUrl, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    const ownHost = hostOf(pageUrl);
    const ownDomain = ownHost ? registrableDomain(ownHost) : null;
    const onAllowlist = (host) => allowlist.some((entry) => host === entry || host.endsWith(`.${entry}`));
    const emittedHrefs = new Set();
    const citations = [];
    for (const href of resolvedHrefs) {
        const host = hostOf(href);
        if (!host)
            continue;
        const domain = registrableDomain(host);
        const isInternal = Boolean(ownDomain) && domain === ownDomain;
        if (isInternal || emittedHrefs.has(href))
            continue;
        emittedHrefs.add(href);
        // The TLD rule takes precedence over the allowlist.
        let reason;
        if (isAuthoritativeTld(host)) {
            reason = "tld";
        }
        else if (onAllowlist(host)) {
            reason = "allowlist";
        }
        citations.push(reason === undefined
            ? { href, domain, authority: "general" }
            : { href, domain, authority: "authoritative", reason });
    }
    return citations;
}
158
/**
 * Whether at least one outbound link on the page classifies as authoritative.
 *
 * @param resolvedHrefs Absolute link URLs found on the page.
 * @param pageUrl URL of the page itself.
 * @param allowlist Domains considered authoritative.
 */
export function hasAuthoritativeCitation(resolvedHrefs, pageUrl, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    for (const citation of classifyCitations(resolvedHrefs, pageUrl, allowlist)) {
        if (citation.authority === "authoritative") {
            return true;
        }
    }
    return false;
}
161
// Split after sentence-ending punctuation when the next sentence starts with
// a capital letter, digit, quote, or opening parenthesis.
const SENTENCE_SPLIT = /(?<=[.!?])\s+(?=[A-Z0-9"'(])/;
// Elements treated as one "block" of prose.
const CLAIM_BLOCK_SELECTOR = "p, li";

/** Resolve each href against `base`, silently dropping any that cannot be parsed. */
function resolveHrefs(hrefs, base) {
    const out = [];
    for (const h of hrefs) {
        try {
            out.push(new URL(h, base).href);
        }
        catch { /* skip unparseable */ }
    }
    return out;
}

/**
 * Deterministic approximation of "a verifiable claim": a block (<p>/<li>) that
 * contains a statistic AND an outbound citation. Approximated at block level,
 * not exact sentence level — documented limitation. Detects co-occurrence, not
 * semantic truth. Consume at `speculative` confidence.
 *
 * Each block is judged on its OWN content only: nested <p>/<li> descendants
 * are excluded because they are visited as blocks in their own right. Without
 * this, `<li><p>…</p></li>` (and nested lists) reported the same cited
 * statistic once for the inner block and again for every enclosing <li>.
 *
 * @param html Full page HTML.
 * @param pageUrl Page URL; used to resolve relative links and to drop internal ones.
 * @param allowlist Domains considered authoritative when classifying links.
 * @returns At most one claim per block: the first sentence containing a fact,
 *          its facts, and the hrefs of all outbound links in that block.
 */
export function extractGroundedClaims(html, pageUrl, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    const $ = load(html);
    // Page chrome and non-content elements never count as claims.
    $("nav, header, footer, aside, script, style, noscript").remove();
    const claims = [];
    const scope = $("article").length > 0 ? $("article") : $("main").length > 0 ? $("main") : $("body");
    scope.find(CLAIM_BLOCK_SELECTOR).each((_i, el) => {
        // Work on a detached copy with nested blocks stripped, so text and
        // links belonging to a child <p>/<li> are not attributed to this block too.
        const $own = $(el).clone();
        $own.find(CLAIM_BLOCK_SELECTOR).remove();
        const rawLinks = $own.find("a[href]").map((_j, a) => String($(a).attr("href") ?? "")).get();
        const citations = classifyCitations(resolveHrefs(rawLinks, pageUrl), pageUrl, allowlist);
        if (citations.length === 0)
            return;
        const text = $own.text().replace(/\s+/g, " ").trim();
        for (const sentence of text.split(SENTENCE_SPLIT)) {
            const facts = [
                ...extractCitableFacts(sentence),
                ...extractMeasurements(sentence).map((m) => m.value),
            ];
            if (facts.length === 0)
                continue;
            claims.push({
                sentence: sentence.trim().slice(0, 240),
                facts,
                citations: citations.map((c) => c.href),
            });
            break; // one grounded claim per block is enough; avoids over-counting
        }
    });
    return claims;
}
207
/**
 * Run every fact extractor over a single parsed page.
 *
 * Text-based extractors (citable facts, measurements, named entities) read the
 * page's content text after entity masking; citations come from the page's
 * resolved hrefs and grounded claims from its raw HTML.
 *
 * @param page Parsed page fields: url, contentText, html, resolvedHrefs, jsonLd.
 * @param entityPatterns Patterns handed to maskEntities before text extraction.
 * @param allowlist Domains considered authoritative when classifying links.
 * @returns The combined PageFacts record.
 */
export function extractPageFacts(page, entityPatterns, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    const maskedText = maskEntities(page.contentText, entityPatterns);
    const citableFacts = extractCitableFacts(maskedText);
    const measurements = extractMeasurements(maskedText);
    const namedEntities = extractNamedEntities(maskedText, page.jsonLd);
    const citations = classifyCitations(page.resolvedHrefs, page.url, allowlist);
    const groundedClaims = extractGroundedClaims(page.html, page.url, allowlist);
    return { citableFacts, measurements, namedEntities, citations, groundedClaims };
}
217
+ //# sourceMappingURL=fact-extraction.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fact-extraction.js","sourceRoot":"","sources":["../../src/algorithms/fact-extraction.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAwChD,+EAA+E;AAC/E,6EAA6E;AAC7E,+DAA+D;AAC/D,MAAM,qBAAqB,GAA2C;IACpE,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK,EAAE,qBAAqB,EAAE;IAChD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,oBAAoB,EAAE;IAChD;QACE,IAAI,EAAE,WAAW;QACjB,KAAK,EAAE,qFAAqF;KAC7F;IACD;QACE,IAAI,EAAE,MAAM;QACZ,KAAK,EACH,uHAAuH;KAC1H;IACD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,wBAAwB,EAAE;IACpD,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,gCAAgC,EAAE;CAC1D,CAAC;AAEF,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,KAAK,MAAM,EAAE,KAAK,EAAE,IAAI,qBAAqB,EAAE,CAAC;QAC9C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAClC,IAAI,CAAC,OAAO;YAAE,SAAS;QACvB,KAAK,MAAM,CAAC,IAAI,OAAO;YAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;IAC3D,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,oFAAoF;AACpF,MAAM,iBAAiB,GACrB,6DAA6D,CAAC;AAChE,MAAM,oBAAoB,GAA6C;IACrE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,2CAA2C,EAAE;IACrE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,oBAAoB,EAAE;IAC9C,EAAE,IAAI,EAAE,aAAa,EAAE,KAAK,EAAE,IAAI,MAAM,CAAC,6BAA6B,iBAAiB,MAAM,EAAE,GAAG,CAAC,EAAE;CACtG,CAAC;AAEF,MAAM,UAAU,mBAAmB,CAAC,UAAkB;IACpD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,KAAK,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,oBAAoB,EAAE,CAAC;QACnD,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACxC,IAAI,CAAC,OAAO;YAAE,SAAS;QACvB,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YAC1D,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;gBAAE,SAAS;YAC9B,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAChB,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,sBAAsB,GAAG,mEAAmE,CAAC;AACnG,MAAM,OAAO,GAAG,uFAAuF,CAAC;AACxG,MAAM,QAAQ,GAAG,yJAAyJ,CAAC;AAE3K,MAAM,oBAAoB,GAAG,IA
AI,GAAG,CAAC;IACnC,cAAc,EAAE,wBAAwB,EAAE,aAAa,EAAE,KAAK;IAC9D,QAAQ,EAAE,SAAS,EAAE,OAAO;CAC7B,CAAC,CAAC;AAEH,SAAS,cAAc,CAAC,KAAgB;IACtC,MAAM,GAAG,GAAkB,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,CAAC,IAAa,EAAQ,EAAE;QACpC,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;YAAC,OAAO;QAAC,CAAC;QACzD,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,IAAI;YAAE,OAAO;QACtD,MAAM,GAAG,GAAG,IAA+B,CAAC;QAC5C,MAAM,IAAI,GAAG,GAAG,CAAC,OAAO,CAAC,CAAC;QAC1B,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC;QACzB,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3F,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,cAAc,EAAE,CAAC,CAAC;QAC1F,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC;YAAE,KAAK,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC,CAAC;IACF,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,UAAkB,EAAE,SAAoB,EAAE;IAC7E,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAkB,EAAE,CAAC;IAC9B,MAAM,IAAI,GAAG,CAAC,KAAa,EAAE,MAA6B,EAAQ,EAAE;QAClE,MAAM,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC1D,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,OAAO;QACxC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACZ,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IACjC,CAAC,CAAC;IACF,KAAK,MAAM,CAAC,IAAI,cAAc,CAAC,MAAM,CAAC;QAAE,IAAI,CAAC,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IACjE,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE;QAAE,IAAI,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IACrE,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,sBAAsB,CAAC,IAAI,EAAE,EAAE,CAAC;QAC/D,IAAI,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,CAAC,MAAM,0BAA0B,GAAsB;IAC3D,eAAe,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,gBAAgB;IAClE,SAAS,EAAE,SAAS,EAAE,kBAAkB,EAAE,SAAS,EAAE,YAAY;IACjE,UAAU,EAAE,eAAe,EAAE,WAAW;CACzC,CAAC;
AAEF,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC;IAClC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ;IAClE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ;CACpC,CAAC,CAAC;AAEH,SAAS,MAAM,CAAC,GAAW;IACzB,IAAI,CAAC;QAAC,OAAO,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC;QAAC,OAAO,IAAI,CAAC;IAAC,CAAC;AAC5E,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IACrD,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC3C,IAAI,mBAAmB,CAAC,GAAG,CAAC,OAAO,CAAC;QAAE,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACxE,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY;IACtC,OAAO,wBAAwB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,6BAA6B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACzF,CAAC;AAED,MAAM,UAAU,iBAAiB,CAC/B,aAAgC,EAChC,OAAe,EACf,YAA+B,0BAA0B;IAEzD,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;IACjC,MAAM,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IACjE,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;QACjC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;QAC1B,IAAI,CAAC,IAAI;YAAE,SAAS;QACpB,MAAM,MAAM,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,UAAU,IAAI,MAAM,KAAK,UAAU;YAAE,SAAS,CAAC,gBAAgB;QACnE,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,SAAS;QAC7B,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACf,IAAI,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,CAAC;QACxE,CAAC;aAAM,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;YACvE,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAC9E,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,CAAC;QACnD,CAAC;IACH,CAAC;IACD,OAA
O,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,wBAAwB,CACtC,aAAgC,EAChC,OAAe,EACf,YAA+B,0BAA0B;IAEzD,OAAO,iBAAiB,CAAC,aAAa,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,eAAe,CAAC,CAAC;AAC3G,CAAC;AAED,MAAM,cAAc,GAAG,8BAA8B,CAAC;AAEtD,SAAS,YAAY,CAAC,KAAe,EAAE,IAAY;IACjD,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC;YAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC;IAC3E,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CACnC,IAAY,EACZ,OAAe,EACf,YAA+B,0BAA0B;IAEzD,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,CAAC,CAAC,qDAAqD,CAAC,CAAC,MAAM,EAAE,CAAC;IAClE,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IACpG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAClC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QAC3F,MAAM,SAAS,GAAG,iBAAiB,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QACzF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QACnC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACpD,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,EAAE,CAAC;YAClD,MAAM,KAAK,GAAG;gBACZ,GAAG,mBAAmB,CAAC,QAAQ,CAAC;gBAChC,GAAG,mBAAmB,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;aACrD,CAAC;YACF,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YACjC,MAAM,CAAC,IAAI,CAAC;gBACV,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;gBACvC,KAAK;gBACL,SAAS,EAAE,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;aACxC,
CAAC,CAAC;YACH,MAAM,CAAC,+DAA+D;QACxE,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,IAAmF,EACnF,cAAmC,EACnC,YAA+B,0BAA0B;IAEzD,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAC9D,OAAO;QACL,YAAY,EAAE,mBAAmB,CAAC,MAAM,CAAC;QACzC,YAAY,EAAE,mBAAmB,CAAC,MAAM,CAAC;QACzC,aAAa,EAAE,oBAAoB,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;QACxD,SAAS,EAAE,iBAAiB,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,CAAC,GAAG,EAAE,SAAS,CAAC;QACrE,cAAc,EAAE,qBAAqB,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,EAAE,SAAS,CAAC;KACtE,CAAC;AACJ,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAmEA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAGZ,WAAW,EAUX,UAAU,EAIX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAA8D,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AA+D1I,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS,CAEvE;AA6xBD;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AAoYD,wBAAgB,2BAA2B,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBjG;AAsoBD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAirC/F"}
1
+ {"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAoEA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAGZ,WAAW,EAUX,UAAU,EAIX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAA8D,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AAiE1I,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS,CAEvE;AA2yBD;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AAoYD,wBAAgB,2BAA2B,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBjG;AA+pBD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAgxC/F"}
package/dist/auditor.js CHANGED
@@ -5,6 +5,7 @@ import { parseHtmlPage } from "./parser.js";
5
5
  import { pageSkipReason } from "./page-filter.js";
6
6
  import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
7
7
  import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
8
+ import { citationCoverageRule } from "./rules/content/citation-coverage.js";
8
9
  import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
9
10
  import { missingAuthorRule } from "./rules/content/missing-author.js";
10
11
  import { uniqueValueRule } from "./rules/content/unique-value.js";
@@ -84,6 +85,8 @@ const DEFAULTS = {
84
85
  metaUniquenessMinJaccard: 0.9,
85
86
  linkDepthMaxClicks: 3,
86
87
  templateCoverageMinPages: 5,
88
+ citationCoverageMinClaims: 4,
89
+ citationCoverageMinAuthoritative: 1,
87
90
  answerFirstMaxWords: 100,
88
91
  citableFactsMin: 3,
89
92
  citableFactsTarget: 8,
@@ -414,6 +417,9 @@ const RULE_IMPACTS = {
414
417
  "content/title-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 25 }, // 2026-05-03 round 11: title is high-impact but the original 50-cap was disproportionate to other content rules and tipped Typeform into critical on a 6-finding cluster. Keep the rule at native error severity (duplicate titles ARE real bugs); just don't let one rule dominate the integrity bucket.
415
418
  "content/heading-structure": { baseImpact: 5, perInstance: 1, maxImpact: 20 },
416
419
  "content/image-alt-text": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
420
+ // Citation coverage is low-confidence (block-level grounded-claim heuristic);
421
+ // keep its impact modest so it nudges rather than dominates the score.
422
+ "content/citation-coverage": { baseImpact: 3, perInstance: 1, maxImpact: 15 },
417
423
  "content/translation-no-op": { baseImpact: 30, perInstance: 10, maxImpact: 60 },
418
424
  // v1 warning-severity heuristic; lower than translation-no-op since it's speculative
419
425
  "content/regurgitated-content": { baseImpact: 15, perInstance: 5, maxImpact: 35 },
@@ -653,6 +659,13 @@ sampled = false) {
653
659
  if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
654
660
  pushAll(findings, tag(eeatSignalsRule(pages)));
655
661
  }
662
+ if (isEnabled("content/citation-coverage") && modeOk("content/citation-coverage")) {
663
+ pushAll(findings, tag(citationCoverageRule(pages, entityPatterns, {
664
+ minClaims: resolvedRules.citationCoverageMinClaims,
665
+ minAuthoritative: resolvedRules.citationCoverageMinAuthoritative,
666
+ allowlist: resolvedRules.citationAllowlist,
667
+ })));
668
+ }
656
669
  // 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
657
670
  // structure + image alt-text were tier-1 gaps in the blind-spot audit.
658
671
  if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
@@ -1249,36 +1262,46 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
1249
1262
  lastmodByUrl.set(entry.url, entry.lastmod);
1250
1263
  }
1251
1264
  }
1252
- return { urls, lastmodByUrl };
1265
+ return { urls, lastmodByUrl, childTotal: 0, childFailed: 0 };
1253
1266
  }
1254
- // It's a sitemap index. Stop recursing past the depth cap (the index itself
1255
- // carries no page URLs, only child-sitemap refs, so returning empty is safe).
1267
+ // It's a sitemap index. Past the depth cap we stop recursing — but the
1268
+ // children we DON'T walk are unreached coverage, so report them as failed.
1256
1269
  if (depth >= maxDepth) {
1257
1270
  // eslint-disable-next-line no-console
1258
1271
  console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
1259
- return { urls: [], lastmodByUrl: new Map() };
1272
+ return { urls: [], lastmodByUrl: new Map(), childTotal: entries.length, childFailed: entries.length };
1260
1273
  }
1261
1274
  const allUrls = [];
1262
1275
  const allLastmodByUrl = new Map();
1276
+ let childTotal = 0;
1277
+ let childFailed = 0;
1263
1278
  for (const entry of entries) {
1264
1279
  const childUrl = entry.url;
1265
1280
  if (signal?.aborted)
1266
1281
  throw signal.reason ?? new Error("aborted");
1282
+ childTotal += 1;
1267
1283
  if (visited.has(childUrl))
1268
- continue;
1284
+ continue; // already walked (cyclic index) — not a failure
1269
1285
  const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
1270
- if (!child)
1286
+ if (!child) {
1287
+ childFailed += 1;
1271
1288
  continue;
1289
+ }
1272
1290
  const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
1273
- if (!childLike)
1291
+ if (!childLike) {
1292
+ childFailed += 1;
1274
1293
  continue;
1275
- const { urls: childUrls, lastmodByUrl: childLastmodByUrl } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
1294
+ }
1295
+ const { urls: childUrls, lastmodByUrl: childLastmodByUrl, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
1276
1296
  pushAll(allUrls, childUrls);
1277
1297
  for (const [u, lm] of childLastmodByUrl) {
1278
1298
  allLastmodByUrl.set(u, lm);
1279
1299
  }
1300
+ // Accumulate nested index structure (a child that is itself an index).
1301
+ childTotal += ct;
1302
+ childFailed += cf;
1280
1303
  }
1281
- return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
1304
+ return { urls: allUrls, lastmodByUrl: allLastmodByUrl, childTotal, childFailed };
1282
1305
  }
1283
1306
  async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
1284
1307
  if (!origin)
@@ -1380,7 +1403,7 @@ pageSink) {
1380
1403
  const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
1381
1404
  if (isXml) {
1382
1405
  const visited = new Set();
1383
- const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
1406
+ const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl, childTotal: sitemapChildTotal, childFailed: sitemapChildFailed } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
1384
1407
  // If we have a budget, sample from sitemap URLs before fetching
1385
1408
  const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
1386
1409
  ? fisherYatesSample(allSitemapUrls, discoveryBudget)
@@ -1516,7 +1539,7 @@ pageSink) {
1516
1539
  });
1517
1540
  }
1518
1541
  }
1519
- return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
1542
+ return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, declaredSitemapUrlCount: allSitemapUrls.length, sitemapChildTotal, sitemapChildFailed, scrapePlan };
1520
1543
  }
1521
1544
  if (contentType.includes("html") || looksLikeHtml(text)) {
1522
1545
  const initialPage = { url: source, html: text };
@@ -1535,6 +1558,14 @@ pageSink) {
1535
1558
  const knownCrawled = new Set([source]);
1536
1559
  const allDiscoveredUrls = new Set([source]);
1537
1560
  const maxDepth = 3;
1561
+ // Total URLs the discovered sitemap(s) declare — the basis for the
1562
+ // caller's coverage guardrail. Undefined when no sitemap is found.
1563
+ let declaredSitemapUrlCount;
1564
+ // Child-sitemap reachability for the guardrail: how many child sitemaps
1565
+ // an index referenced vs how many we could not fetch/parse. childFailed>0
1566
+ // means the declared URL list is itself incomplete.
1567
+ let sitemapChildTotal = 0;
1568
+ let sitemapChildFailed = 0;
1538
1569
  // Sitemap-first discovery (like Google). Before link-crawling, read the
1539
1570
  // sitemap(s) the site declares — link-crawl only reaches *linked* pages,
1540
1571
  // but a pSEO site's whole point is thousands of programmatic URLs that
@@ -1574,13 +1605,19 @@ pageSink) {
1574
1605
  }
1575
1606
  if (!(smType.includes("xml") || looksLikeSitemap(smText)))
1576
1607
  continue;
1577
- const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
1608
+ const { urls: discoveredSmUrls, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
1609
+ sitemapChildTotal += ct;
1610
+ sitemapChildFailed += cf;
1578
1611
  pushAll(sitemapListedUrls, discoveredSmUrls);
1579
1612
  // When probing the conventional paths, stop at the first that hits.
1580
1613
  if (probing && discoveredSmUrls.length > 0)
1581
1614
  break;
1582
1615
  }
1583
1616
  // Same-origin + robots-aware filter, deduped against what we have.
1617
+ // Record what the sitemap(s) declared (deduped) before same-origin /
1618
+ // robots filtering — the operator's site has this many URLs.
1619
+ if (sitemapListedUrls.length > 0)
1620
+ declaredSitemapUrlCount = new Set(sitemapListedUrls).size;
1584
1621
  const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
1585
1622
  if (knownCrawled.has(u))
1586
1623
  return false;
@@ -1686,7 +1723,7 @@ pageSink) {
1686
1723
  if (newPages.length === 0)
1687
1724
  break;
1688
1725
  }
1689
- return { pages, discoveredUrlCount: allDiscoveredUrls.size };
1726
+ return { pages, discoveredUrlCount: allDiscoveredUrls.size, declaredSitemapUrlCount, sitemapChildTotal, sitemapChildFailed };
1690
1727
  }
1691
1728
  return { pages };
1692
1729
  }
@@ -1780,6 +1817,7 @@ export async function auditSource(source, options) {
1780
1817
  // surfaced on the summary instead.
1781
1818
  let truncated = false;
1782
1819
  let truncatedReason;
1820
+ let truncatedKind;
1783
1821
  const signal = composeSignals(externalSignal, backpressureAbort.signal);
1784
1822
  const observer = new FetchObserver();
1785
1823
  // 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
@@ -1830,6 +1868,7 @@ export async function auditSource(source, options) {
1830
1868
  return false;
1831
1869
  truncated = true;
1832
1870
  truncatedReason = backpressureError.message;
1871
+ truncatedKind = "backpressure";
1833
1872
  return true;
1834
1873
  }
1835
1874
  function throwIfAborted() {
@@ -1861,6 +1900,9 @@ export async function auditSource(source, options) {
1861
1900
  metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
1862
1901
  linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
1863
1902
  templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
1903
+ citationCoverageMinClaims: options?.rules?.citationCoverageMinClaims ?? DEFAULTS.citationCoverageMinClaims,
1904
+ citationCoverageMinAuthoritative: options?.rules?.citationCoverageMinAuthoritative ?? DEFAULTS.citationCoverageMinAuthoritative,
1905
+ citationAllowlist: options?.rules?.citationAllowlist,
1864
1906
  answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
1865
1907
  citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
1866
1908
  citableFactsTarget: options?.rules?.citableFactsTarget ?? DEFAULTS.citableFactsTarget,
@@ -1952,6 +1994,9 @@ export async function auditSource(source, options) {
1952
1994
  let sitemapUrlSet;
1953
1995
  let sitemapLastmodByUrl;
1954
1996
  let discoveredUrlCount;
1997
+ let declaredSitemapUrlCount;
1998
+ let sitemapChildTotal;
1999
+ let sitemapChildFailed;
1955
2000
  let scrapePlan;
1956
2001
  if (hasPinnedUrlsEarly) {
1957
2002
  const pinned = options.pinnedUrls;
@@ -2041,6 +2086,9 @@ export async function auditSource(source, options) {
2041
2086
  sitemapUrlSet = loaded.sitemapUrls;
2042
2087
  sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
2043
2088
  discoveredUrlCount = loaded.discoveredUrlCount;
2089
+ declaredSitemapUrlCount = loaded.declaredSitemapUrlCount;
2090
+ sitemapChildTotal = loaded.sitemapChildTotal;
2091
+ sitemapChildFailed = loaded.sitemapChildFailed;
2044
2092
  scrapePlan = loaded.scrapePlan;
2045
2093
  }
2046
2094
  catch (err) {
@@ -2053,6 +2101,7 @@ export async function auditSource(source, options) {
2053
2101
  if (!salvageBackpressure()) {
2054
2102
  truncated = true;
2055
2103
  truncatedReason = err.message;
2104
+ truncatedKind = "backpressure";
2056
2105
  }
2057
2106
  // Recover whatever was fetched before the abort. The sink is the same
2058
2107
  // array loadPagesFromSource was pushing into, so it holds the partial
@@ -2063,6 +2112,9 @@ export async function auditSource(source, options) {
2063
2112
  sitemapUrlSet = undefined;
2064
2113
  sitemapLastmodByUrl = undefined;
2065
2114
  discoveredUrlCount = undefined;
2115
+ declaredSitemapUrlCount = undefined;
2116
+ sitemapChildTotal = undefined;
2117
+ sitemapChildFailed = undefined;
2066
2118
  scrapePlan = undefined;
2067
2119
  }
2068
2120
  else {
@@ -2070,6 +2122,12 @@ export async function auditSource(source, options) {
2070
2122
  }
2071
2123
  }
2072
2124
  }
2125
+ // Pages we successfully FETCHED (HTTP 2xx) from discovery — before content-type
2126
+ // and policy filtering, and before sampling. This is the right numerator for
2127
+ // the coverage guardrail: noindex / non-HTML pages were still *reached* (they
2128
+ // count), intentional sampling happens later (doesn't count against us), and
2129
+ // only genuinely-unreachable URLs (4xx/5xx) are missing from it.
2130
+ const fetchedCount = loadedPagesRaw.length;
2073
2131
  // The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
2074
2132
  // mode. Surface them in skippedUrls so they show up under summary.skippedUrls
2075
2133
  // (kept for back-compat with --since consumers); T7 will carry their prior
@@ -2103,7 +2161,10 @@ export async function auditSource(source, options) {
2103
2161
  skippedByContentType.push(p.url);
2104
2162
  }
2105
2163
  }
2106
- loadedPages.splice(0, loadedPages.length, ...htmlOnlyPages);
2164
+ // Replace contents in place without `splice(0, n, ...big)` — that spread hits
2165
+ // the V8 argument-count cap on large corpora (same class as pushAll).
2166
+ loadedPages.length = 0;
2167
+ pushAll(loadedPages, htmlOnlyPages);
2107
2168
  if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
2108
2169
  console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
2109
2170
  }
@@ -2261,9 +2322,21 @@ export async function auditSource(source, options) {
2261
2322
  const guardedClassification = applyDegenerationGuard(computedClassification, corpusStatsFromPages(parsedPages));
2262
2323
  // `--strict` (or AuditOptions.strict) keeps the classification but forces
2263
2324
  // every rule to run regardless of detected site type.
2325
+ //
2326
+ // A backpressure abort BEFORE classification salvages only a fragment of the
2327
+ // crawl (`truncated` is already set here; the coverage guardrail runs later).
2328
+ // Classifying that fragment — e.g. the 1 page left after the watchdog aborts a
2329
+ // cold-start origin — as `small-marketing` and suppressing the pSEO rules off
2330
+ // it is exactly what produced the confident false "READY" on a 5,600-page
2331
+ // site. When the run was truncated pre-classification we genuinely could not
2332
+ // determine the site type: force `unclear` (confidence 0, no suppression,
2333
+ // neutral scoring) so nothing masks the incompleteness.
2334
+ const classificationUnreliable = truncated;
2264
2335
  const siteClassification = options?.strict
2265
2336
  ? { ...guardedClassification, suppressedRules: [] }
2266
- : guardedClassification;
2337
+ : classificationUnreliable
2338
+ ? { ...guardedClassification, type: "unclear", confidence: 0, suppressedRules: [] }
2339
+ : guardedClassification;
2267
2340
  const suppressedRuleSet = new Set(siteClassification.suppressedRules);
2268
2341
  // Classify pages into groups and run only enabled rules per group
2269
2342
  const classified = classifyPages(parsedPages, options?.pageGroups);
@@ -2478,9 +2551,66 @@ export async function auditSource(source, options) {
2478
2551
  // salvaged whatever pages had been fetched. Consumers MUST treat coverage as
2479
2552
  // a lower bound (counts/verdict are partial). Only set when actually
2480
2553
  // truncated so complete runs keep `truncated` absent.
2554
+ // ── Coverage guardrails (#4) ─────────────────────────────────────────────
2555
+ // A sitemap was found at discovery, so we know roughly how large the site is.
2556
+ // Two independent under-coverage signals, each reusing the `truncated`
2557
+ // partial-coverage surface (CLI/Action/MCP/web already flag it) tagged
2558
+ // `truncatedKind: "coverage"` so consumers can tell it apart from a
2559
+ // backpressure abort. Backpressure (set during the crawl) takes precedence.
2560
+ if (!truncated && sitemapChildFailed && sitemapChildFailed > 0) {
2561
+ // (A) Extraction-side: a sitemap INDEX referenced child sitemaps we could
2562
+ // not fetch/parse (404, non-sitemap, or beyond the depth cap). The declared
2563
+ // URL list is itself incomplete — the "unreachable child sitemaps" case a
2564
+ // urls-only count can never see (and the original false-negative class).
2565
+ truncated = true;
2566
+ truncatedKind = "coverage";
2567
+ truncatedReason =
2568
+ `${sitemapChildFailed} of ${sitemapChildTotal} child sitemaps referenced by the sitemap index could not be ` +
2569
+ `fetched or parsed — both the declared URL count and this audit are incomplete, so the verdict is not ` +
2570
+ `representative of the full site. Check that every child sitemap is reachable (HTTP 200, valid XML).`;
2571
+ // eslint-disable-next-line no-console
2572
+ console.error(`pseolint: ${truncatedReason}`);
2573
+ }
2574
+ if (!truncated && declaredSitemapUrlCount && declaredSitemapUrlCount >= 20) {
2575
+ // (B) Audit-side: the sitemap declared N URLs but we FETCHED far fewer than
2576
+ // we intended to. Compare against `fetchedCount` (pages actually fetched,
2577
+ // pre-filter/pre-sample) so legitimately-skipped pages (noindex, non-HTML)
2578
+ // and intentional sampling do NOT register as a shortfall. `intended` is
2579
+ // bounded by every deliberate limit — an explicit sample, the crawl cap, and
2580
+ // the declared total — so none of them false-fire.
2581
+ const sampleCap = sampleSize > 0 ? sampleSize : Number.POSITIVE_INFINITY;
2582
+ const crawlCap = maxCrawlDiscovered > 0 ? maxCrawlDiscovered : Number.POSITIVE_INFINITY;
2583
+ const intended = Math.min(sampleCap, crawlCap, declaredSitemapUrlCount);
2584
+ const floor = Math.max(20, Math.floor(intended * 0.05));
2585
+ // `intended >= 20`: only judge representativeness when we actually meant to
2586
+ // audit a substantial slice. A deliberately tiny sample/crawl cap (intended
2587
+ // < 20) is the operator's choice, not under-discovery — don't flag it (and
2588
+ // it would otherwise trip the absolute floor of 20).
2589
+ if (intended >= 20 && fetchedCount < floor) {
2590
+ const unreached = Math.max(0, declaredSitemapUrlCount - fetchedCount);
2591
+ const ratio = fetchedCount / declaredSitemapUrlCount;
2592
+ const pct = (ratio * 100).toFixed(ratio < 0.01 ? 2 : 1);
2593
+ truncated = true;
2594
+ truncatedKind = "coverage";
2595
+ truncatedReason =
2596
+ `Fetched ${fetchedCount} of ~${declaredSitemapUrlCount} sitemap-declared URLs (~${pct}% coverage); ` +
2597
+ `~${unreached} could not be retrieved (4xx/5xx, redirects, or robots-blocked). The verdict covers only the ` +
2598
+ `pages reached and is not representative — check for a stale sitemap or unreachable pages, or raise crawl limits.`;
2599
+ // eslint-disable-next-line no-console
2600
+ console.error(`pseolint: ${truncatedReason}`);
2601
+ }
2602
+ }
2481
2603
  if (truncated) {
2482
2604
  summary.truncated = true;
2483
2605
  summary.truncatedReason = truncatedReason;
2606
+ if (truncatedKind)
2607
+ summary.truncatedKind = truncatedKind;
2608
+ // A truncated run is incomplete — never present it as a clean green. Floor
2609
+ // the verdict to at least "caution" so the headline matches the partial-
2610
+ // coverage banner instead of the false "READY ✓" over a salvaged fragment.
2611
+ // ("ready" is the only rung below "caution"; every other verdict is already at or above it.)
2612
+ if (summary.verdict === "ready")
2613
+ summary.verdict = "caution";
2484
2614
  }
2485
2615
  if (cacheConfig) {
2486
2616
  summary.cacheStats = cacheStats;