@pseolint/core 0.6.5 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/dist/algorithms/fact-extraction.d.ts +46 -0
- package/dist/algorithms/fact-extraction.d.ts.map +1 -0
- package/dist/algorithms/fact-extraction.js +217 -0
- package/dist/algorithms/fact-extraction.js.map +1 -0
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +145 -15
- package/dist/auditor.js.map +1 -1
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/origin-preflight.d.ts +89 -0
- package/dist/origin-preflight.d.ts.map +1 -0
- package/dist/origin-preflight.js +93 -0
- package/dist/origin-preflight.js.map +1 -0
- package/dist/rule-references.d.ts.map +1 -1
- package/dist/rule-references.js +1 -0
- package/dist/rule-references.js.map +1 -1
- package/dist/rules/aeo/citable-facts.d.ts.map +1 -1
- package/dist/rules/aeo/citable-facts.js +4 -33
- package/dist/rules/aeo/citable-facts.js.map +1 -1
- package/dist/rules/content/citation-coverage.d.ts +11 -0
- package/dist/rules/content/citation-coverage.d.ts.map +1 -0
- package/dist/rules/content/citation-coverage.js +43 -0
- package/dist/rules/content/citation-coverage.js.map +1 -0
- package/dist/rules/content/unique-value.d.ts.map +1 -1
- package/dist/rules/content/unique-value.js +29 -4
- package/dist/rules/content/unique-value.js.map +1 -1
- package/dist/rules/content/value-add.d.ts.map +1 -1
- package/dist/rules/content/value-add.js +3 -1
- package/dist/rules/content/value-add.js.map +1 -1
- package/dist/rules/scope.d.ts.map +1 -1
- package/dist/rules/scope.js +1 -0
- package/dist/rules/scope.js.map +1 -1
- package/dist/site-classifier.d.ts.map +1 -1
- package/dist/site-classifier.js +1 -0
- package/dist/site-classifier.js.map +1 -1
- package/dist/types.d.ts +20 -5
- package/dist/types.d.ts.map +1 -1
- package/package.json +93 -93
- package/schemas/audit-summary.schema.json +6 -1
package/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# @pseolint/core
|
|
2
2
|
|
|
3
|
-
> Programmatic SEO audit engine —
|
|
3
|
+
> Programmatic SEO audit engine — 44 rules, surfaced per-template, on every monitored release.
|
|
4
4
|
|
|
5
|
-
The core engine behind [pseolint](https://www.npmjs.com/package/pseolint) v0.
|
|
5
|
+
The core engine behind [pseolint](https://www.npmjs.com/package/pseolint) v0.7.0. Use this package to embed pSEO auditing into your own tools, CI pipelines, or SaaS products.
|
|
6
6
|
|
|
7
7
|
## Install
|
|
8
8
|
|
|
@@ -34,7 +34,7 @@ for (const t of result.templates) {
|
|
|
34
34
|
|
|
35
35
|
## What It Checks
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
44 rules grouped into 4 scoring super-categories (v0.4): **Integrity** (spam + content + cannibal, weight 0.50), **Discoverability** (links + tech, 0.20), **Citation** (aeo + schema, 0.25), **Data** (0.05). Source-tree namespaces remain `spam/*`, `aeo/*`, etc. for stable rule IDs.
|
|
38
38
|
|
|
39
39
|
- **Spam / SpamBrain risk** (8) — near-duplicate (SimHash), entity-swap doorways, thin content, boilerplate ratio, template diversity, template coverage, publication velocity, doorway pattern (cluster-collapsed since v0.5.2)
|
|
40
40
|
- **Technical SEO** (9) — canonical consistency, canonical/noindex and robots/noindex conflicts, sitemap completeness, robots compliance, redirect chains, soft 404s, hreflang reciprocity, robots-sitemap presence, **og-completeness** (v0.5.2)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import type { EntityMaskPattern, ParsedPage } from "../types.js";
|
|
2
|
+
/**
 * Category label carried by a {@link FactSpan}.
 * NOTE(review): the measurement extractor shipped alongside this declaration
 * only produces "ratio" and "measurement"; the remaining members appear to be
 * reserved for the citable-fact families — confirm before relying on them.
 */
export type FactKind = "money" | "percent" | "timeframe" | "date" | "isoDate" | "form" | "ratio" | "measurement";
|
|
3
|
+
/** One extracted numeric fact together with the pattern family that matched it. */
export interface FactSpan {
|
|
4
|
+
/** Matched text; `extractMeasurements` collapses whitespace, trims and lower-cases it. */
value: string;
|
|
5
|
+
/** Pattern family that produced `value`. */
kind: FactKind;
|
|
6
|
+
}
|
|
7
|
+
/** A named entity detected in page text or in the page's JSON-LD. */
export interface NamedEntity {
|
|
8
|
+
/** Entity name, whitespace-collapsed, trimmed and lower-cased. */
value: string;
|
|
9
|
+
/**
 * How the entity was found: "json-ld" for structured-data nodes, "cue-word"
 * for known acronyms and proper nouns containing a cue word (Inc, Act,
 * University, ...), "proper-noun" for any other multi-word capitalised phrase.
 */
source: "proper-noun" | "cue-word" | "json-ld";
|
|
10
|
+
/** Optional coarse classification of the entity; may be absent. */
type?: "organization" | "person" | "product" | "law" | "standard" | "place" | "other";
|
|
11
|
+
}
|
|
12
|
+
/** An outbound (off-domain) link classified by how authoritative its host is. */
export interface Citation {
|
|
13
|
+
/** The link exactly as it was passed to `classifyCitations`. */
href: string;
|
|
14
|
+
/** Registrable domain of the link's host, as computed by `registrableDomain`. */
domain: string;
|
|
15
|
+
/** "authoritative" when the host matched the TLD rule or the allowlist, otherwise "general". */
authority: "authoritative" | "general";
|
|
16
|
+
/** Which check made the link authoritative; omitted for "general" citations. */
reason?: "tld" | "allowlist";
|
|
17
|
+
}
|
|
18
|
+
/** A statistic-bearing sentence found in a block that also contains an outbound citation. */
export interface GroundedClaim {
|
|
19
|
+
/** First sentence of the block that contained a fact, trimmed and cut to 240 characters. */
sentence: string;
|
|
20
|
+
/** Citable facts followed by measurement values found in `sentence`. */
facts: string[];
|
|
21
|
+
/** Hrefs of every outbound citation found in the enclosing block. */
citations: string[];
|
|
22
|
+
}
|
|
23
|
+
/** Everything `extractPageFacts` derives from a single parsed page. */
export interface PageFacts {
|
|
24
|
+
/** EXACTLY today's extractRawFacts() output (run on entity-masked text). Frozen. */
|
|
25
|
+
citableFacts: string[];
|
|
26
|
+
/** Ratios and unit measurements found in the entity-masked text. */
measurements: FactSpan[];
|
|
27
|
+
/** Entities from JSON-LD plus acronyms / proper nouns in the entity-masked text. */
namedEntities: NamedEntity[];
|
|
28
|
+
/** Off-domain links from the page's resolved hrefs, classified by authority. */
citations: Citation[];
|
|
29
|
+
/** Block-level "statistic + outbound citation" co-occurrences found in the raw HTML. */
groundedClaims: GroundedClaim[];
|
|
30
|
+
}
|
|
31
|
+
/**
 * Collects the distinct matches of the frozen citable-fact patterns (dollar
 * amounts, percentages, timeframes, month dates, ISO dates, form numbers).
 * @param text Text to scan.
 * @returns Trimmed, lower-cased matches without duplicates.
 */
export declare function extractCitableFacts(text: string): string[];
|
|
32
|
+
/**
 * Finds ratios ("3 out of 4", "16:9") and unit measurements ("2.5 kg").
 * @param maskedText Text to scan (callers pass entity-masked page text).
 * @returns One span per distinct normalised value, in first-seen order.
 */
export declare function extractMeasurements(maskedText: string): FactSpan[];
|
|
33
|
+
/**
 * Gathers named entities from JSON-LD nodes, known acronyms and multi-word
 * capitalised phrases, de-duplicated by lower-cased name.
 * @param maskedText Text to scan (callers pass entity-masked page text).
 * @param jsonLd Parsed JSON-LD nodes; defaults to an empty list.
 */
export declare function extractNamedEntities(maskedText: string, jsonLd?: unknown[]): NamedEntity[];
|
|
34
|
+
/** Domains treated as authoritative citation targets when no custom allowlist is supplied. */
export declare const DEFAULT_CITATION_ALLOWLIST: readonly string[];
|
|
35
|
+
/**
 * Reduces a hostname to its registrable domain using a small built-in list of
 * two-label suffixes (co.uk, com.au, ...); a leading "www." is dropped.
 * @param host Hostname without scheme or port.
 */
export declare function registrableDomain(host: string): string;
|
|
36
|
+
/**
 * Classifies each distinct off-domain link as "authoritative" or "general".
 * Links that do not parse as URLs, have no host, or share the page's
 * registrable domain are skipped.
 * @param resolvedHrefs Absolute link URLs found on the page.
 * @param pageUrl URL of the page itself, used to recognise internal links.
 * @param allowlist Authoritative domains; defaults to `DEFAULT_CITATION_ALLOWLIST`.
 */
export declare function classifyCitations(resolvedHrefs: readonly string[], pageUrl: string, allowlist?: readonly string[]): Citation[];
|
|
37
|
+
/**
 * Convenience wrapper around `classifyCitations`.
 * @returns `true` when at least one classified citation is "authoritative".
 */
export declare function hasAuthoritativeCitation(resolvedHrefs: readonly string[], pageUrl: string, allowlist?: readonly string[]): boolean;
|
|
38
|
+
/**
 * Deterministic approximation of "a verifiable claim": a block (<p>/<li>) that
 * contains a statistic AND an outbound citation. Approximated at block level,
 * not exact sentence level — documented limitation. Detects co-occurrence, not
 * semantic truth. Consume at `speculative` confidence.
 *
 * @param html Raw page HTML.
 * @param pageUrl URL of the page; relative links are resolved against it.
 * @param allowlist Authoritative domains; defaults to `DEFAULT_CITATION_ALLOWLIST`.
 * @returns At most one claim per qualifying block.
 */
|
|
44
|
+
export declare function extractGroundedClaims(html: string, pageUrl: string, allowlist?: readonly string[]): GroundedClaim[];
|
|
45
|
+
/**
 * Runs every extractor over one page and bundles the results.
 * @param page The page fields the extractors read (url, text, html, links, JSON-LD).
 * @param entityPatterns Patterns used to mask template entities before text extraction.
 * @param allowlist Authoritative domains; defaults to `DEFAULT_CITATION_ALLOWLIST`.
 */
export declare function extractPageFacts(page: Pick<ParsedPage, "url" | "contentText" | "html" | "resolvedHrefs" | "jsonLd">, entityPatterns: EntityMaskPattern[], allowlist?: readonly string[]): PageFacts;
|
|
46
|
+
//# sourceMappingURL=fact-extraction.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fact-extraction.d.ts","sourceRoot":"","sources":["../../src/algorithms/fact-extraction.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEjE,MAAM,MAAM,QAAQ,GAChB,OAAO,GAAG,SAAS,GAAG,WAAW,GAAG,MAAM,GAAG,SAAS,GAAG,MAAM,GAC/D,OAAO,GAAG,aAAa,CAAC;AAE5B,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,QAAQ,CAAC;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,aAAa,GAAG,UAAU,GAAG,SAAS,CAAC;IAC/C,IAAI,CAAC,EAAE,cAAc,GAAG,QAAQ,GAAG,SAAS,GAAG,KAAK,GAAG,UAAU,GAAG,OAAO,GAAG,OAAO,CAAC;CACvF;AAED,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,eAAe,GAAG,SAAS,CAAC;IACvC,MAAM,CAAC,EAAE,KAAK,GAAG,WAAW,CAAC;CAC9B;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,SAAS,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,MAAM,WAAW,SAAS;IACxB,oFAAoF;IACpF,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,YAAY,EAAE,QAAQ,EAAE,CAAC;IACzB,aAAa,EAAE,WAAW,EAAE,CAAC;IAC7B,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,cAAc,EAAE,aAAa,EAAE,CAAC;CACjC;AAqBD,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAQ1D;AAWD,wBAAgB,mBAAmB,CAAC,UAAU,EAAE,MAAM,GAAG,QAAQ,EAAE,CAclE;AA4BD,wBAAgB,oBAAoB,CAAC,UAAU,EAAE,MAAM,EAAE,MAAM,GAAE,OAAO,EAAO,GAAG,WAAW,EAAE,CAe9F;AAED,eAAO,MAAM,0BAA0B,EAAE,SAAS,MAAM,EAIvD,CAAC;AAWF,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAMtD;AAMD,wBAAgB,iBAAiB,CAC/B,aAAa,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,QAAQ,EAAE,CAqBZ;AAED,wBAAgB,wBAAwB,CACtC,aAAa,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,OAAO,CAET;AAYD;;;;;GAKG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,aAAa,EAAE,CA0BjB;AAED,wBAAgB,gBAAgB,CAC9B,IAAI,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,GAAG,aAAa,GAAG,MAAM,GAAG,eAAe,GAAG,QAAQ,CAAC,EACnF,cAAc,EAAE,iBAAiB,EAAE,EACnC,SAAS,GAAE,SAAS,MAAM,EAA+B,GACxD,SAAS,CASX"}
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import { maskEntities } from "./entity-mask.js";
|
|
3
|
+
// --- Numeric "citable" facts: the frozen subset aeo/citable-facts counts. ---
|
|
4
|
+
// These six patterns are lifted verbatim from rules/aeo/citable-facts.ts and
|
|
5
|
+
// MUST stay byte-identical to preserve the calibration corpus.
|
|
6
|
+
// Frozen pattern table, one entry per citable-fact family. The regexes are
// reproduced unchanged on purpose: scoring calibration depends on them.
const CITABLE_FACT_PATTERNS = [
    { name: "dollar", regex: /\$[\d,]+(\.\d{2})?/g },
    { name: "percent", regex: /\b\d+(\.\d+)?\s*%/g },
    {
        name: "timeframe",
        regex: /\b\d+(?:-\d+)?\s*(business\s+days?|days?|weeks?|months?|years?|hours?|minutes?)\b/gi,
    },
    {
        name: "date",
        regex: /\b(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}(?:,\s*\d{4})?\b/gi,
    },
    { name: "isoDate", regex: /\b\d{4}-\d{2}-\d{2}\b/g },
    { name: "form", regex: /\bForm\s+[A-Z0-9][A-Z0-9-]*\b/g },
];
/**
 * Scan `text` with every citable-fact pattern and return the distinct hits.
 *
 * Each hit is trimmed and lower-cased before de-duplication, so "Form W-9"
 * and "form w-9" count once. Results keep pattern-table order, then order of
 * first appearance within a pattern.
 *
 * @param text Text to scan.
 * @returns Array of normalised fact strings (empty when nothing matches).
 */
export function extractCitableFacts(text) {
    const unique = new Set();
    CITABLE_FACT_PATTERNS.forEach((pattern) => {
        // String#match with a global regex yields every match, or null for none.
        const hits = text.match(pattern.regex) ?? [];
        hits.forEach((hit) => unique.add(hit.trim().toLowerCase()));
    });
    return [...unique];
}
|
|
31
|
+
// --- Measurements: NEW numeric kinds, deliberately separate from citableFacts. ---
|
|
32
|
+
// Unit suffixes recognised after a number (matched case-sensitively).
const MEASUREMENT_UNITS = "kg|g|lb|lbs|oz|mi|km|cm|mm|ft|in|MB|GB|TB|KB|ms|fps|mph|kWh";
// NOTE(review): the first ratio pattern's bare "in" alternative also matches
// phrases such as "5 in 2024"; kept as-is, consumers should treat it as a heuristic.
const MEASUREMENT_PATTERNS = [
    { kind: "ratio", regex: /\b\d+(?:\.\d+)?\s*(?:out of|in)\s*\d+\b/gi },
    { kind: "ratio", regex: /\b\d+\s*:\s*\d+\b/g },
    { kind: "measurement", regex: new RegExp(`\\b\\d+(?:\\.\\d+)?\\s*(?:${MEASUREMENT_UNITS})\\b`, "g") },
];
/**
 * Extract ratio and unit-measurement spans from `maskedText`.
 *
 * Every hit is normalised (runs of whitespace collapsed to one space,
 * trimmed, lower-cased). A normalised value is reported once, with the kind
 * of the first pattern that produced it.
 *
 * @param maskedText Text to scan.
 * @returns `{ value, kind }` spans in first-seen order.
 */
export function extractMeasurements(maskedText) {
    // Map keeps insertion order, which gives both de-duplication and ordering.
    const spansByValue = new Map();
    for (const pattern of MEASUREMENT_PATTERNS) {
        for (const raw of maskedText.match(pattern.regex) ?? []) {
            const value = raw.replace(/\s+/g, " ").trim().toLowerCase();
            if (!spansByValue.has(value)) {
                spansByValue.set(value, { value, kind: pattern.kind });
            }
        }
    }
    return Array.from(spansByValue.values());
}
|
|
55
|
+
const MULTI_WORD_PROPER_NOUN = /\b[A-Z][a-z]+(?:\s+(?:of\s+|de\s+|and\s+|the\s+)?[A-Z][a-z]+)+\b/g;
|
|
56
|
+
const ACRONYM = /\b(?:ISO|GDPR|HIPAA|FDA|SEC|FTC|EPA|W3C|IETF|RFC|NIST|OSHA|IRS|EU|UN|WHO|CCPA|PCI)\b/g;
|
|
57
|
+
const CUE_WORD = /\b(?:Inc|LLC|Ltd|Corp|GmbH|Act|Regulation|Directive|Agency|Department|Bureau|Commission|Authority|University|Institute|Association|Standard|Protocol)\b/;
|
|
58
|
+
const JSON_LD_ENTITY_TYPES = new Set([
|
|
59
|
+
"Organization", "GovernmentOrganization", "Corporation", "NGO",
|
|
60
|
+
"Person", "Product", "Brand",
|
|
61
|
+
]);
|
|
62
|
+
function jsonLdEntities(nodes) {
|
|
63
|
+
const out = [];
|
|
64
|
+
const visit = (node) => {
|
|
65
|
+
if (Array.isArray(node)) {
|
|
66
|
+
node.forEach(visit);
|
|
67
|
+
return;
|
|
68
|
+
}
|
|
69
|
+
if (typeof node !== "object" || node === null)
|
|
70
|
+
return;
|
|
71
|
+
const obj = node;
|
|
72
|
+
const type = obj["@type"];
|
|
73
|
+
const name = obj["name"];
|
|
74
|
+
if (typeof name === "string" && typeof type === "string" && JSON_LD_ENTITY_TYPES.has(type)) {
|
|
75
|
+
out.push({ value: name.trim().toLowerCase(), source: "json-ld", type: "organization" });
|
|
76
|
+
}
|
|
77
|
+
for (const v of Object.values(obj))
|
|
78
|
+
visit(v);
|
|
79
|
+
};
|
|
80
|
+
nodes.forEach(visit);
|
|
81
|
+
return out;
|
|
82
|
+
}
|
|
83
|
+
export function extractNamedEntities(maskedText, jsonLd = []) {
|
|
84
|
+
const seen = new Set();
|
|
85
|
+
const out = [];
|
|
86
|
+
const push = (value, source) => {
|
|
87
|
+
const v = value.replace(/\s+/g, " ").trim().toLowerCase();
|
|
88
|
+
if (v.length < 2 || seen.has(v))
|
|
89
|
+
return;
|
|
90
|
+
seen.add(v);
|
|
91
|
+
out.push({ value: v, source });
|
|
92
|
+
};
|
|
93
|
+
for (const m of jsonLdEntities(jsonLd))
|
|
94
|
+
push(m.value, "json-ld");
|
|
95
|
+
for (const m of maskedText.match(ACRONYM) ?? [])
|
|
96
|
+
push(m, "cue-word");
|
|
97
|
+
for (const m of maskedText.match(MULTI_WORD_PROPER_NOUN) ?? []) {
|
|
98
|
+
push(m, CUE_WORD.test(m) ? "cue-word" : "proper-noun");
|
|
99
|
+
}
|
|
100
|
+
return out;
|
|
101
|
+
}
|
|
102
|
+
// Domains treated as authoritative when the caller supplies no allowlist.
// NOTE(review): classifyCitations tests the TLD rule before the allowlist, so
// the .gov / .int entries here (nih.gov, ncbi.nlm.nih.gov, who.int) are
// already authoritative via isAuthoritativeTld; they are kept for readers and
// for callers that consume this list directly.
export const DEFAULT_CITATION_ALLOWLIST = [
    "wikipedia.org", "w3.org", "iso.org", "ietf.org", "rfc-editor.org",
    "doi.org", "nih.gov", "ncbi.nlm.nih.gov", "who.int", "schema.org",
    "oecd.org", "worldbank.org", "europa.eu",
];
// Two-label public suffixes under which the registrable domain has three labels.
// This is a small hand-kept list, not the full Public Suffix List.
const MULTI_PART_SUFFIXES = new Set([
    "co.uk", "ac.uk", "gov.uk", "org.uk", "com.au", "gov.au", "edu.au",
    "co.jp", "co.nz", "co.za", "com.br",
]);
/**
 * Lower-cased hostname of `url`, without a trailing root dot.
 *
 * @param url Absolute URL string.
 * @returns The hostname; "" for host-less schemes (mailto:, javascript:, ...);
 *          `null` when `url` cannot be parsed.
 */
function hostOf(url) {
    try {
        // URL keeps the root dot of a fully-qualified name ("example.com."),
        // which would defeat every suffix comparison downstream — strip it.
        return new URL(url).hostname.toLowerCase().replace(/\.$/, "");
    }
    catch {
        return null;
    }
}
/**
 * Reduce a hostname to its registrable domain.
 *
 * The host is lower-cased, a trailing root dot and a leading "www." are
 * removed, then the last two labels are returned — or the last three when the
 * last two form a known multi-part suffix (e.g. "co.uk").
 *
 * @param host Hostname without scheme or port.
 * @returns The registrable domain, e.g. "example.co.uk".
 */
export function registrableDomain(host) {
    const labels = host
        .toLowerCase()
        .replace(/\.$/, "")
        .replace(/^www\./, "")
        .split(".");
    if (labels.length <= 2)
        return labels.join(".");
    const lastTwo = labels.slice(-2).join(".");
    if (MULTI_PART_SUFFIXES.has(lastTwo))
        return labels.slice(-3).join(".");
    return lastTwo;
}
/**
 * Whether `host` sits under an institution-only suffix: .gov / .edu / .mil /
 * .int, or a country-code variant such as .gov.uk, .edu.au, .ac.jp.
 *
 * @param host Lower-cased hostname without trailing dot.
 */
function isAuthoritativeTld(host) {
    return /\.(?:gov|edu|mil|int)$/.test(host) || /\.(?:gov|edu|ac)\.[a-z]{2}$/.test(host);
}
|
|
131
|
+
/**
 * Turn a page's resolved links into classified outbound citations.
 *
 * A link is skipped when it has no parseable host, when its registrable
 * domain equals the page's own (internal link), or when the identical href
 * was already classified. Remaining links are "authoritative" if the host
 * passes the institutional-TLD check (`reason: "tld"`) or equals / is a
 * subdomain of an allowlist entry (`reason: "allowlist"`); otherwise "general".
 *
 * @param resolvedHrefs Absolute link URLs.
 * @param pageUrl URL of the page the links came from.
 * @param allowlist Authoritative domains (defaults to DEFAULT_CITATION_ALLOWLIST).
 * @returns Citations in input order.
 */
export function classifyCitations(resolvedHrefs, pageUrl, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    const ownHost = hostOf(pageUrl);
    const ownDomain = ownHost ? registrableDomain(ownHost) : null;
    const classifiedHrefs = new Set();
    const citations = [];
    const isAllowlisted = (host) => allowlist.some((entry) => host === entry || host.endsWith(`.${entry}`));
    for (const href of resolvedHrefs) {
        const host = hostOf(href);
        if (!host)
            continue;
        const domain = registrableDomain(host);
        const isInternal = Boolean(ownDomain) && domain === ownDomain;
        // Internal links are rejected before the duplicate check, as before.
        if (isInternal || classifiedHrefs.has(href))
            continue;
        classifiedHrefs.add(href);
        // TLD rule takes precedence; the allowlist is only consulted when it fails.
        const reason = isAuthoritativeTld(host) ? "tld" : isAllowlisted(host) ? "allowlist" : undefined;
        if (reason === undefined) {
            citations.push({ href, domain, authority: "general" });
        }
        else {
            citations.push({ href, domain, authority: "authoritative", reason });
        }
    }
    return citations;
}
/**
 * Whether any outbound link on the page classifies as "authoritative".
 * Parameters are forwarded unchanged to `classifyCitations`.
 */
export function hasAuthoritativeCitation(resolvedHrefs, pageUrl, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    for (const citation of classifyCitations(resolvedHrefs, pageUrl, allowlist)) {
        if (citation.authority === "authoritative")
            return true;
    }
    return false;
}
|
|
161
|
+
// Split point: whitespace that follows ., ! or ? and precedes a capital
// letter, digit, quote or opening parenthesis.
const SENTENCE_SPLIT = /(?<=[.!?])\s+(?=[A-Z0-9"'(])/;
/**
 * Resolve raw href attribute values against `base`.
 *
 * @param hrefs Raw href strings (absolute or relative).
 * @param base URL used to resolve relative hrefs.
 * @returns Absolute URLs in input order; hrefs that fail to parse are dropped.
 */
function resolveHrefs(hrefs, base) {
    return hrefs.flatMap((raw) => {
        try {
            return [new URL(raw, base).href];
        }
        catch {
            // Unparseable href: contribute nothing.
            return [];
        }
    });
}
|
|
172
|
+
/**
 * Heuristic stand-in for "a verifiable claim": a <p> or <li> block that holds
 * both a statistic and an outbound citation. Matching is done per block, not
 * per exact sentence (known limitation), and only detects co-occurrence — it
 * says nothing about whether the claim is true. Treat results as
 * `speculative` confidence.
 *
 * Page chrome (nav, header, footer, aside, script, style, noscript) is removed
 * first; blocks are then searched inside <article> if present, else <main>,
 * else <body>. Each qualifying block contributes at most one claim: its first
 * sentence that contains a citable fact or measurement, cut to 240 characters.
 *
 * NOTE(review): "p, li" also matches a <p> nested inside an <li>, so such
 * markup can apparently yield one claim for the <li> and another for the <p> —
 * confirm whether that double count is intended.
 *
 * @param html Raw page HTML.
 * @param pageUrl Page URL; relative links are resolved against it.
 * @param allowlist Authoritative domains (defaults to DEFAULT_CITATION_ALLOWLIST).
 * @returns Claims in document order.
 */
export function extractGroundedClaims(html, pageUrl, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    const $ = load(html);
    $("nav, header, footer, aside, script, style, noscript").remove();
    // Narrowest available content container wins.
    let scope = $("body");
    if ($("article").length > 0) {
        scope = $("article");
    }
    else if ($("main").length > 0) {
        scope = $("main");
    }
    const claims = [];
    scope.find("p, li").each((_i, el) => {
        const $block = $(el);
        const rawLinks = $block.find("a[href]").map((_j, a) => String($(a).attr("href") ?? "")).get();
        const citations = classifyCitations(resolveHrefs(rawLinks, pageUrl), pageUrl, allowlist);
        if (citations.length === 0)
            return;
        const text = $block.text().replace(/\s+/g, " ").trim();
        // Find the first sentence carrying at least one fact; `facts` is left
        // holding that sentence's facts when the search stops.
        let facts = [];
        const sentence = text.split(SENTENCE_SPLIT).find((candidate) => {
            facts = [
                ...extractCitableFacts(candidate),
                ...extractMeasurements(candidate).map((m) => m.value),
            ];
            return facts.length > 0;
        });
        if (sentence === undefined)
            return;
        // One grounded claim per block is enough; avoids over-counting.
        claims.push({
            sentence: sentence.trim().slice(0, 240),
            facts,
            citations: citations.map((c) => c.href),
        });
    });
    return claims;
}
|
|
207
|
+
/**
 * Run every fact extractor over a single parsed page.
 *
 * Text-based extractors (citable facts, measurements, named entities) read
 * the page's content text after entity masking; citations come from the
 * page's resolved hrefs; grounded claims are derived from the raw HTML.
 *
 * @param page Parsed page (reads url, contentText, html, resolvedHrefs, jsonLd).
 * @param entityPatterns Patterns handed to `maskEntities`.
 * @param allowlist Authoritative domains (defaults to DEFAULT_CITATION_ALLOWLIST).
 * @returns The combined PageFacts record.
 */
export function extractPageFacts(page, entityPatterns, allowlist = DEFAULT_CITATION_ALLOWLIST) {
    const maskedText = maskEntities(page.contentText, entityPatterns);
    const citableFacts = extractCitableFacts(maskedText);
    const measurements = extractMeasurements(maskedText);
    const namedEntities = extractNamedEntities(maskedText, page.jsonLd);
    const citations = classifyCitations(page.resolvedHrefs, page.url, allowlist);
    const groundedClaims = extractGroundedClaims(page.html, page.url, allowlist);
    return { citableFacts, measurements, namedEntities, citations, groundedClaims };
}
|
|
217
|
+
//# sourceMappingURL=fact-extraction.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fact-extraction.js","sourceRoot":"","sources":["../../src/algorithms/fact-extraction.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAwChD,+EAA+E;AAC/E,6EAA6E;AAC7E,+DAA+D;AAC/D,MAAM,qBAAqB,GAA2C;IACpE,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK,EAAE,qBAAqB,EAAE;IAChD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,oBAAoB,EAAE;IAChD;QACE,IAAI,EAAE,WAAW;QACjB,KAAK,EAAE,qFAAqF;KAC7F;IACD;QACE,IAAI,EAAE,MAAM;QACZ,KAAK,EACH,uHAAuH;KAC1H;IACD,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,wBAAwB,EAAE;IACpD,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,gCAAgC,EAAE;CAC1D,CAAC;AAEF,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,KAAK,MAAM,EAAE,KAAK,EAAE,IAAI,qBAAqB,EAAE,CAAC;QAC9C,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAClC,IAAI,CAAC,OAAO;YAAE,SAAS;QACvB,KAAK,MAAM,CAAC,IAAI,OAAO;YAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;IAC3D,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,oFAAoF;AACpF,MAAM,iBAAiB,GACrB,6DAA6D,CAAC;AAChE,MAAM,oBAAoB,GAA6C;IACrE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,2CAA2C,EAAE;IACrE,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,oBAAoB,EAAE;IAC9C,EAAE,IAAI,EAAE,aAAa,EAAE,KAAK,EAAE,IAAI,MAAM,CAAC,6BAA6B,iBAAiB,MAAM,EAAE,GAAG,CAAC,EAAE;CACtG,CAAC;AAEF,MAAM,UAAU,mBAAmB,CAAC,UAAkB;IACpD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,KAAK,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,oBAAoB,EAAE,CAAC;QACnD,MAAM,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACxC,IAAI,CAAC,OAAO;YAAE,SAAS;QACvB,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YAC1D,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;gBAAE,SAAS;YAC9B,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAChB,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,sBAAsB,GAAG,mEAAmE,CAAC;AACnG,MAAM,OAAO,GAAG,uFAAuF,CAAC;AACxG,MAAM,QAAQ,GAAG,yJAAyJ,CAAC;AAE3K,MAAM,oBAAoB,GAAG,IAAI
,GAAG,CAAC;IACnC,cAAc,EAAE,wBAAwB,EAAE,aAAa,EAAE,KAAK;IAC9D,QAAQ,EAAE,SAAS,EAAE,OAAO;CAC7B,CAAC,CAAC;AAEH,SAAS,cAAc,CAAC,KAAgB;IACtC,MAAM,GAAG,GAAkB,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,CAAC,IAAa,EAAQ,EAAE;QACpC,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;YAAC,OAAO;QAAC,CAAC;QACzD,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,IAAI;YAAE,OAAO;QACtD,MAAM,GAAG,GAAG,IAA+B,CAAC;QAC5C,MAAM,IAAI,GAAG,GAAG,CAAC,OAAO,CAAC,CAAC;QAC1B,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC;QACzB,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YAC3F,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,cAAc,EAAE,CAAC,CAAC;QAC1F,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC;YAAE,KAAK,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC,CAAC;IACF,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,UAAkB,EAAE,SAAoB,EAAE;IAC7E,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAkB,EAAE,CAAC;IAC9B,MAAM,IAAI,GAAG,CAAC,KAAa,EAAE,MAA6B,EAAQ,EAAE;QAClE,MAAM,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC1D,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,OAAO;QACxC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACZ,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;IACjC,CAAC,CAAC;IACF,KAAK,MAAM,CAAC,IAAI,cAAc,CAAC,MAAM,CAAC;QAAE,IAAI,CAAC,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IACjE,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE;QAAE,IAAI,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IACrE,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,KAAK,CAAC,sBAAsB,CAAC,IAAI,EAAE,EAAE,CAAC;QAC/D,IAAI,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,CAAC,MAAM,0BAA0B,GAAsB;IAC3D,eAAe,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,gBAAgB;IAClE,SAAS,EAAE,SAAS,EAAE,kBAAkB,EAAE,SAAS,EAAE,YAAY;IACjE,UAAU,EAAE,eAAe,EAAE,WAAW;CACzC,CAAC;AA
EF,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC;IAClC,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ;IAClE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,QAAQ;CACpC,CAAC,CAAC;AAEH,SAAS,MAAM,CAAC,GAAW;IACzB,IAAI,CAAC;QAAC,OAAO,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC;QAAC,OAAO,IAAI,CAAC;IAAC,CAAC;AAC5E,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,IAAY;IAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IACrD,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC3C,IAAI,mBAAmB,CAAC,GAAG,CAAC,OAAO,CAAC;QAAE,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACxE,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY;IACtC,OAAO,wBAAwB,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,6BAA6B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACzF,CAAC;AAED,MAAM,UAAU,iBAAiB,CAC/B,aAAgC,EAChC,OAAe,EACf,YAA+B,0BAA0B;IAEzD,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;IACjC,MAAM,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IACjE,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;QACjC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC;QAC1B,IAAI,CAAC,IAAI;YAAE,SAAS;QACpB,MAAM,MAAM,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,UAAU,IAAI,MAAM,KAAK,UAAU;YAAE,SAAS,CAAC,gBAAgB;QACnE,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,SAAS;QAC7B,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACf,IAAI,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7B,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,CAAC;QACxE,CAAC;aAAM,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;YACvE,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAC9E,CAAC;aAAM,CAAC;YACN,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,CAAC;QACnD,CAAC;IACH,CAAC;IACD,OAAO,
GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,wBAAwB,CACtC,aAAgC,EAChC,OAAe,EACf,YAA+B,0BAA0B;IAEzD,OAAO,iBAAiB,CAAC,aAAa,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,eAAe,CAAC,CAAC;AAC3G,CAAC;AAED,MAAM,cAAc,GAAG,8BAA8B,CAAC;AAEtD,SAAS,YAAY,CAAC,KAAe,EAAE,IAAY;IACjD,MAAM,GAAG,GAAa,EAAE,CAAC;IACzB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC;YAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC;IAC3E,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CACnC,IAAY,EACZ,OAAe,EACf,YAA+B,0BAA0B;IAEzD,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC;IACrB,CAAC,CAAC,qDAAqD,CAAC,CAAC,MAAM,EAAE,CAAC;IAClE,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IACpG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QAClC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QAC3F,MAAM,SAAS,GAAG,iBAAiB,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QACzF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QACnC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACpD,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC,EAAE,CAAC;YAClD,MAAM,KAAK,GAAG;gBACZ,GAAG,mBAAmB,CAAC,QAAQ,CAAC;gBAChC,GAAG,mBAAmB,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;aACrD,CAAC;YACF,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YACjC,MAAM,CAAC,IAAI,CAAC;gBACV,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;gBACvC,KAAK;gBACL,SAAS,EAAE,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;aACxC,CA
AC,CAAC;YACH,MAAM,CAAC,+DAA+D;QACxE,CAAC;IACH,CAAC,CAAC,CAAC;IACH,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,IAAmF,EACnF,cAAmC,EACnC,YAA+B,0BAA0B;IAEzD,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAC9D,OAAO;QACL,YAAY,EAAE,mBAAmB,CAAC,MAAM,CAAC;QACzC,YAAY,EAAE,mBAAmB,CAAC,MAAM,CAAC;QACzC,aAAa,EAAE,oBAAoB,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;QACxD,SAAS,EAAE,iBAAiB,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,CAAC,GAAG,EAAE,SAAS,CAAC;QACrE,cAAc,EAAE,qBAAqB,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,EAAE,SAAS,CAAC;KACtE,CAAC;AACJ,CAAC"}
|
package/dist/auditor.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAoEA,OAAO,KAAK,EACV,YAAY,EACZ,YAAY,EAGZ,WAAW,EAUX,UAAU,EAIX,MAAM,YAAY,CAAC;AAQpB,OAAO,EAA8D,KAAK,kBAAkB,EAAiB,MAAM,sBAAsB,CAAC;AAiE1I,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS,CAEvE;AA2yBD;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,UAAU,EAAE,EACtB,cAAc,EAAE,kBAAkB,GAAG,SAAS,GAC7C,UAAU,EAAE,CAed;AAoYD,wBAAgB,2BAA2B,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAgBjG;AA+pBD,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAgxC/F"}
|
package/dist/auditor.js
CHANGED
|
@@ -5,6 +5,7 @@ import { parseHtmlPage } from "./parser.js";
|
|
|
5
5
|
import { pageSkipReason } from "./page-filter.js";
|
|
6
6
|
import { mergeNormalizeUrlOptions, normalizeAuditUrl } from "./url-normalize.js";
|
|
7
7
|
import { eeatSignalsRule } from "./rules/content/eeat-signals.js";
|
|
8
|
+
import { citationCoverageRule } from "./rules/content/citation-coverage.js";
|
|
8
9
|
import { metaUniquenessRule } from "./rules/content/meta-uniqueness.js";
|
|
9
10
|
import { missingAuthorRule } from "./rules/content/missing-author.js";
|
|
10
11
|
import { uniqueValueRule } from "./rules/content/unique-value.js";
|
|
@@ -84,6 +85,8 @@ const DEFAULTS = {
|
|
|
84
85
|
metaUniquenessMinJaccard: 0.9,
|
|
85
86
|
linkDepthMaxClicks: 3,
|
|
86
87
|
templateCoverageMinPages: 5,
|
|
88
|
+
citationCoverageMinClaims: 4,
|
|
89
|
+
citationCoverageMinAuthoritative: 1,
|
|
87
90
|
answerFirstMaxWords: 100,
|
|
88
91
|
citableFactsMin: 3,
|
|
89
92
|
citableFactsTarget: 8,
|
|
@@ -414,6 +417,9 @@ const RULE_IMPACTS = {
|
|
|
414
417
|
"content/title-uniqueness": { baseImpact: 8, perInstance: 2, maxImpact: 25 }, // 2026-05-03 round 11: title is high-impact but the original 50-cap was disproportionate to other content rules and tipped Typeform into critical on a 6-finding cluster. Keep the rule at native error severity (duplicate titles ARE real bugs); just don't let one rule dominate the integrity bucket.
|
|
415
418
|
"content/heading-structure": { baseImpact: 5, perInstance: 1, maxImpact: 20 },
|
|
416
419
|
"content/image-alt-text": { baseImpact: 3, perInstance: 1, maxImpact: 20 },
|
|
420
|
+
// Citation coverage is low-confidence (block-level grounded-claim heuristic);
|
|
421
|
+
// keep its impact modest so it nudges rather than dominates the score.
|
|
422
|
+
"content/citation-coverage": { baseImpact: 3, perInstance: 1, maxImpact: 15 },
|
|
417
423
|
"content/translation-no-op": { baseImpact: 30, perInstance: 10, maxImpact: 60 },
|
|
418
424
|
// v1 warning-severity heuristic; lower than translation-no-op since it's speculative
|
|
419
425
|
"content/regurgitated-content": { baseImpact: 15, perInstance: 5, maxImpact: 35 },
|
|
@@ -653,6 +659,13 @@ sampled = false) {
|
|
|
653
659
|
if (isEnabled("content/eeat-signals") && modeOk("content/eeat-signals")) {
|
|
654
660
|
pushAll(findings, tag(eeatSignalsRule(pages)));
|
|
655
661
|
}
|
|
662
|
+
if (isEnabled("content/citation-coverage") && modeOk("content/citation-coverage")) {
|
|
663
|
+
pushAll(findings, tag(citationCoverageRule(pages, entityPatterns, {
|
|
664
|
+
minClaims: resolvedRules.citationCoverageMinClaims,
|
|
665
|
+
minAuthoritative: resolvedRules.citationCoverageMinAuthoritative,
|
|
666
|
+
allowlist: resolvedRules.citationAllowlist,
|
|
667
|
+
})));
|
|
668
|
+
}
|
|
656
669
|
// 2026-05-03 v0.5.2 blind-spot fixes — title uniqueness + heading
|
|
657
670
|
// structure + image alt-text were tier-1 gaps in the blind-spot audit.
|
|
658
671
|
if (isEnabled("content/title-uniqueness") && modeOk("content/title-uniqueness")) {
|
|
@@ -1249,36 +1262,46 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
1249
1262
|
lastmodByUrl.set(entry.url, entry.lastmod);
|
|
1250
1263
|
}
|
|
1251
1264
|
}
|
|
1252
|
-
return { urls, lastmodByUrl };
|
|
1265
|
+
return { urls, lastmodByUrl, childTotal: 0, childFailed: 0 };
|
|
1253
1266
|
}
|
|
1254
|
-
// It's a sitemap index.
|
|
1255
|
-
//
|
|
1267
|
+
// It's a sitemap index. Past the depth cap we stop recursing — but the
|
|
1268
|
+
// children we DON'T walk are unreached coverage, so report them as failed.
|
|
1256
1269
|
if (depth >= maxDepth) {
|
|
1257
1270
|
// eslint-disable-next-line no-console
|
|
1258
1271
|
console.error(`pseolint: sitemap-index nesting exceeded depth ${maxDepth} at ${sitemapUrl}; not recursing further.`);
|
|
1259
|
-
return { urls: [], lastmodByUrl: new Map() };
|
|
1272
|
+
return { urls: [], lastmodByUrl: new Map(), childTotal: entries.length, childFailed: entries.length };
|
|
1260
1273
|
}
|
|
1261
1274
|
const allUrls = [];
|
|
1262
1275
|
const allLastmodByUrl = new Map();
|
|
1276
|
+
let childTotal = 0;
|
|
1277
|
+
let childFailed = 0;
|
|
1263
1278
|
for (const entry of entries) {
|
|
1264
1279
|
const childUrl = entry.url;
|
|
1265
1280
|
if (signal?.aborted)
|
|
1266
1281
|
throw signal.reason ?? new Error("aborted");
|
|
1282
|
+
childTotal += 1;
|
|
1267
1283
|
if (visited.has(childUrl))
|
|
1268
|
-
continue;
|
|
1284
|
+
continue; // already walked (cyclic index) — not a failure
|
|
1269
1285
|
const child = await fetchWithRetry(childUrl, timeoutMs, cache, stats, signal, validateHop, SITEMAP_MAX_BYTES);
|
|
1270
|
-
if (!child)
|
|
1286
|
+
if (!child) {
|
|
1287
|
+
childFailed += 1;
|
|
1271
1288
|
continue;
|
|
1289
|
+
}
|
|
1272
1290
|
const childLike = child.contentType.includes("xml") || looksLikeSitemap(child.text);
|
|
1273
|
-
if (!childLike)
|
|
1291
|
+
if (!childLike) {
|
|
1292
|
+
childFailed += 1;
|
|
1274
1293
|
continue;
|
|
1275
|
-
|
|
1294
|
+
}
|
|
1295
|
+
const { urls: childUrls, lastmodByUrl: childLastmodByUrl, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(child.text, childUrl, visited, timeoutMs, cache, stats, signal, validateHop, depth + 1, maxDepth);
|
|
1276
1296
|
pushAll(allUrls, childUrls);
|
|
1277
1297
|
for (const [u, lm] of childLastmodByUrl) {
|
|
1278
1298
|
allLastmodByUrl.set(u, lm);
|
|
1279
1299
|
}
|
|
1300
|
+
// Accumulate nested index structure (a child that is itself an index).
|
|
1301
|
+
childTotal += ct;
|
|
1302
|
+
childFailed += cf;
|
|
1280
1303
|
}
|
|
1281
|
-
return { urls: allUrls, lastmodByUrl: allLastmodByUrl };
|
|
1304
|
+
return { urls: allUrls, lastmodByUrl: allLastmodByUrl, childTotal, childFailed };
|
|
1282
1305
|
}
|
|
1283
1306
|
async function fetchRobotsMeta(origin, timeoutMs, cache, stats, signal, validateHop) {
|
|
1284
1307
|
if (!origin)
|
|
@@ -1380,7 +1403,7 @@ pageSink) {
|
|
|
1380
1403
|
const isXml = (contentType.includes("xml") || looksLikeSitemap(text)) && sourceStatus !== -1;
|
|
1381
1404
|
if (isXml) {
|
|
1382
1405
|
const visited = new Set();
|
|
1383
|
-
const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1406
|
+
const { urls: allSitemapUrls, lastmodByUrl: sitemapLastmodByUrl, childTotal: sitemapChildTotal, childFailed: sitemapChildFailed } = await collectUrlsFromSitemap(text, source, visited, timeoutMs, cache, stats, signal, validateHop);
|
|
1384
1407
|
// If we have a budget, sample from sitemap URLs before fetching
|
|
1385
1408
|
const sampledUrls = discoveryBudget > 0 && allSitemapUrls.length > discoveryBudget
|
|
1386
1409
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
@@ -1516,7 +1539,7 @@ pageSink) {
|
|
|
1516
1539
|
});
|
|
1517
1540
|
}
|
|
1518
1541
|
}
|
|
1519
|
-
return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, scrapePlan };
|
|
1542
|
+
return { pages, sitemapUrls: new Set(allSitemapUrls), sitemapLastmodByUrl, discoveredUrlCount: allSitemapUrls.length, declaredSitemapUrlCount: allSitemapUrls.length, sitemapChildTotal, sitemapChildFailed, scrapePlan };
|
|
1520
1543
|
}
|
|
1521
1544
|
if (contentType.includes("html") || looksLikeHtml(text)) {
|
|
1522
1545
|
const initialPage = { url: source, html: text };
|
|
@@ -1535,6 +1558,14 @@ pageSink) {
|
|
|
1535
1558
|
const knownCrawled = new Set([source]);
|
|
1536
1559
|
const allDiscoveredUrls = new Set([source]);
|
|
1537
1560
|
const maxDepth = 3;
|
|
1561
|
+
// Total URLs the discovered sitemap(s) declare — the basis for the
|
|
1562
|
+
// caller's coverage guardrail. Undefined when no sitemap is found.
|
|
1563
|
+
let declaredSitemapUrlCount;
|
|
1564
|
+
// Child-sitemap reachability for the guardrail: how many child sitemaps
|
|
1565
|
+
// an index referenced vs how many we could not fetch/parse. childFailed>0
|
|
1566
|
+
// means the declared URL list is itself incomplete.
|
|
1567
|
+
let sitemapChildTotal = 0;
|
|
1568
|
+
let sitemapChildFailed = 0;
|
|
1538
1569
|
// Sitemap-first discovery (like Google). Before link-crawling, read the
|
|
1539
1570
|
// sitemap(s) the site declares — link-crawl only reaches *linked* pages,
|
|
1540
1571
|
// but a pSEO site's whole point is thousands of programmatic URLs that
|
|
@@ -1574,13 +1605,19 @@ pageSink) {
|
|
|
1574
1605
|
}
|
|
1575
1606
|
if (!(smType.includes("xml") || looksLikeSitemap(smText)))
|
|
1576
1607
|
continue;
|
|
1577
|
-
const { urls: discoveredSmUrls } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
|
|
1608
|
+
const { urls: discoveredSmUrls, childTotal: ct, childFailed: cf } = await collectUrlsFromSitemap(smText, candidate, visitedSitemaps, timeoutMs, cache, stats, signal, validateHop);
|
|
1609
|
+
sitemapChildTotal += ct;
|
|
1610
|
+
sitemapChildFailed += cf;
|
|
1578
1611
|
pushAll(sitemapListedUrls, discoveredSmUrls);
|
|
1579
1612
|
// When probing the conventional paths, stop at the first that hits.
|
|
1580
1613
|
if (probing && discoveredSmUrls.length > 0)
|
|
1581
1614
|
break;
|
|
1582
1615
|
}
|
|
1583
1616
|
// Same-origin + robots-aware filter, deduped against what we have.
|
|
1617
|
+
// Record what the sitemap(s) declared (deduped) before same-origin /
|
|
1618
|
+
// robots filtering — the operator's site has this many URLs.
|
|
1619
|
+
if (sitemapListedUrls.length > 0)
|
|
1620
|
+
declaredSitemapUrlCount = new Set(sitemapListedUrls).size;
|
|
1584
1621
|
const seedUrls = Array.from(new Set(sitemapListedUrls)).filter((u) => {
|
|
1585
1622
|
if (knownCrawled.has(u))
|
|
1586
1623
|
return false;
|
|
@@ -1686,7 +1723,7 @@ pageSink) {
|
|
|
1686
1723
|
if (newPages.length === 0)
|
|
1687
1724
|
break;
|
|
1688
1725
|
}
|
|
1689
|
-
return { pages, discoveredUrlCount: allDiscoveredUrls.size };
|
|
1726
|
+
return { pages, discoveredUrlCount: allDiscoveredUrls.size, declaredSitemapUrlCount, sitemapChildTotal, sitemapChildFailed };
|
|
1690
1727
|
}
|
|
1691
1728
|
return { pages };
|
|
1692
1729
|
}
|
|
@@ -1780,6 +1817,7 @@ export async function auditSource(source, options) {
|
|
|
1780
1817
|
// surfaced on the summary instead.
|
|
1781
1818
|
let truncated = false;
|
|
1782
1819
|
let truncatedReason;
|
|
1820
|
+
let truncatedKind;
|
|
1783
1821
|
const signal = composeSignals(externalSignal, backpressureAbort.signal);
|
|
1784
1822
|
const observer = new FetchObserver();
|
|
1785
1823
|
// 2026-05-03 calibration: the prior (3s p95 cap, 2× baseline multiplier)
|
|
@@ -1830,6 +1868,7 @@ export async function auditSource(source, options) {
|
|
|
1830
1868
|
return false;
|
|
1831
1869
|
truncated = true;
|
|
1832
1870
|
truncatedReason = backpressureError.message;
|
|
1871
|
+
truncatedKind = "backpressure";
|
|
1833
1872
|
return true;
|
|
1834
1873
|
}
|
|
1835
1874
|
function throwIfAborted() {
|
|
@@ -1861,6 +1900,9 @@ export async function auditSource(source, options) {
|
|
|
1861
1900
|
metaUniquenessMinJaccard: options?.rules?.metaUniquenessMinJaccard ?? DEFAULTS.metaUniquenessMinJaccard,
|
|
1862
1901
|
linkDepthMaxClicks: options?.rules?.linkDepthMaxClicks ?? DEFAULTS.linkDepthMaxClicks,
|
|
1863
1902
|
templateCoverageMinPages: options?.rules?.templateCoverageMinPages ?? DEFAULTS.templateCoverageMinPages,
|
|
1903
|
+
citationCoverageMinClaims: options?.rules?.citationCoverageMinClaims ?? DEFAULTS.citationCoverageMinClaims,
|
|
1904
|
+
citationCoverageMinAuthoritative: options?.rules?.citationCoverageMinAuthoritative ?? DEFAULTS.citationCoverageMinAuthoritative,
|
|
1905
|
+
citationAllowlist: options?.rules?.citationAllowlist,
|
|
1864
1906
|
answerFirstMaxWords: options?.rules?.answerFirstMaxWords ?? DEFAULTS.answerFirstMaxWords,
|
|
1865
1907
|
citableFactsMin: options?.rules?.citableFactsMin ?? DEFAULTS.citableFactsMin,
|
|
1866
1908
|
citableFactsTarget: options?.rules?.citableFactsTarget ?? DEFAULTS.citableFactsTarget,
|
|
@@ -1952,6 +1994,9 @@ export async function auditSource(source, options) {
|
|
|
1952
1994
|
let sitemapUrlSet;
|
|
1953
1995
|
let sitemapLastmodByUrl;
|
|
1954
1996
|
let discoveredUrlCount;
|
|
1997
|
+
let declaredSitemapUrlCount;
|
|
1998
|
+
let sitemapChildTotal;
|
|
1999
|
+
let sitemapChildFailed;
|
|
1955
2000
|
let scrapePlan;
|
|
1956
2001
|
if (hasPinnedUrlsEarly) {
|
|
1957
2002
|
const pinned = options.pinnedUrls;
|
|
@@ -2041,6 +2086,9 @@ export async function auditSource(source, options) {
|
|
|
2041
2086
|
sitemapUrlSet = loaded.sitemapUrls;
|
|
2042
2087
|
sitemapLastmodByUrl = loaded.sitemapLastmodByUrl;
|
|
2043
2088
|
discoveredUrlCount = loaded.discoveredUrlCount;
|
|
2089
|
+
declaredSitemapUrlCount = loaded.declaredSitemapUrlCount;
|
|
2090
|
+
sitemapChildTotal = loaded.sitemapChildTotal;
|
|
2091
|
+
sitemapChildFailed = loaded.sitemapChildFailed;
|
|
2044
2092
|
scrapePlan = loaded.scrapePlan;
|
|
2045
2093
|
}
|
|
2046
2094
|
catch (err) {
|
|
@@ -2053,6 +2101,7 @@ export async function auditSource(source, options) {
|
|
|
2053
2101
|
if (!salvageBackpressure()) {
|
|
2054
2102
|
truncated = true;
|
|
2055
2103
|
truncatedReason = err.message;
|
|
2104
|
+
truncatedKind = "backpressure";
|
|
2056
2105
|
}
|
|
2057
2106
|
// Recover whatever was fetched before the abort. The sink is the same
|
|
2058
2107
|
// array loadPagesFromSource was pushing into, so it holds the partial
|
|
@@ -2063,6 +2112,9 @@ export async function auditSource(source, options) {
|
|
|
2063
2112
|
sitemapUrlSet = undefined;
|
|
2064
2113
|
sitemapLastmodByUrl = undefined;
|
|
2065
2114
|
discoveredUrlCount = undefined;
|
|
2115
|
+
declaredSitemapUrlCount = undefined;
|
|
2116
|
+
sitemapChildTotal = undefined;
|
|
2117
|
+
sitemapChildFailed = undefined;
|
|
2066
2118
|
scrapePlan = undefined;
|
|
2067
2119
|
}
|
|
2068
2120
|
else {
|
|
@@ -2070,6 +2122,12 @@ export async function auditSource(source, options) {
|
|
|
2070
2122
|
}
|
|
2071
2123
|
}
|
|
2072
2124
|
}
|
|
2125
|
+
// Pages we successfully FETCHED (HTTP 2xx) from discovery — before content-type
|
|
2126
|
+
// and policy filtering, and before sampling. This is the right denominator for
|
|
2127
|
+
// the coverage guardrail: noindex / non-HTML pages were still *reached* (they
|
|
2128
|
+
// count), intentional sampling happens later (doesn't count against us), and
|
|
2129
|
+
// only genuinely-unreachable URLs (4xx/5xx) are missing from it.
|
|
2130
|
+
const fetchedCount = loadedPagesRaw.length;
|
|
2073
2131
|
// The scrapePlan tells us which URLs were skipped pre-fetch under monitoring
|
|
2074
2132
|
// mode. Surface them in skippedUrls so they show up under summary.skippedUrls
|
|
2075
2133
|
// (kept for back-compat with --since consumers); T7 will carry their prior
|
|
@@ -2103,7 +2161,10 @@ export async function auditSource(source, options) {
|
|
|
2103
2161
|
skippedByContentType.push(p.url);
|
|
2104
2162
|
}
|
|
2105
2163
|
}
|
|
2106
|
-
|
|
2164
|
+
// Replace contents in place without `splice(0, n, ...big)` — that spread hits
|
|
2165
|
+
// the V8 argument-count cap on large corpora (same class as pushAll).
|
|
2166
|
+
loadedPages.length = 0;
|
|
2167
|
+
pushAll(loadedPages, htmlOnlyPages);
|
|
2107
2168
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
2108
2169
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
2109
2170
|
}
|
|
@@ -2261,9 +2322,21 @@ export async function auditSource(source, options) {
|
|
|
2261
2322
|
const guardedClassification = applyDegenerationGuard(computedClassification, corpusStatsFromPages(parsedPages));
|
|
2262
2323
|
// `--strict` (or AuditOptions.strict) keeps the classification but forces
|
|
2263
2324
|
// every rule to run regardless of detected site type.
|
|
2325
|
+
//
|
|
2326
|
+
// A backpressure abort BEFORE classification salvages only a fragment of the
|
|
2327
|
+
// crawl (`truncated` is already set here; the coverage guardrail runs later).
|
|
2328
|
+
// Classifying that fragment — e.g. the 1 page left after the watchdog aborts a
|
|
2329
|
+
// cold-start origin — as `small-marketing` and suppressing the pSEO rules off
|
|
2330
|
+
// it is exactly what produced the confident false "READY" on a 5,600-page
|
|
2331
|
+
// site. When the run was truncated pre-classification we genuinely could not
|
|
2332
|
+
// determine the site type: force `unclear` (confidence 0, no suppression,
|
|
2333
|
+
// neutral scoring) so nothing masks the incompleteness.
|
|
2334
|
+
const classificationUnreliable = truncated;
|
|
2264
2335
|
const siteClassification = options?.strict
|
|
2265
2336
|
? { ...guardedClassification, suppressedRules: [] }
|
|
2266
|
-
:
|
|
2337
|
+
: classificationUnreliable
|
|
2338
|
+
? { ...guardedClassification, type: "unclear", confidence: 0, suppressedRules: [] }
|
|
2339
|
+
: guardedClassification;
|
|
2267
2340
|
const suppressedRuleSet = new Set(siteClassification.suppressedRules);
|
|
2268
2341
|
// Classify pages into groups and run only enabled rules per group
|
|
2269
2342
|
const classified = classifyPages(parsedPages, options?.pageGroups);
|
|
@@ -2478,9 +2551,66 @@ export async function auditSource(source, options) {
|
|
|
2478
2551
|
// salvaged whatever pages had been fetched. Consumers MUST treat coverage as
|
|
2479
2552
|
// a lower bound (counts/verdict are partial). Only set when actually
|
|
2480
2553
|
// truncated so complete runs keep `truncated` absent.
|
|
2554
|
+
// ── Coverage guardrails (#4) ─────────────────────────────────────────────
|
|
2555
|
+
// A sitemap was found at discovery, so we know roughly how large the site is.
|
|
2556
|
+
// Two independent under-coverage signals, each reusing the `truncated`
|
|
2557
|
+
// partial-coverage surface (CLI/Action/MCP/web already flag it) tagged
|
|
2558
|
+
// `truncatedKind: "coverage"` so consumers can tell it apart from a
|
|
2559
|
+
// backpressure abort. Backpressure (set during the crawl) takes precedence.
|
|
2560
|
+
if (!truncated && sitemapChildFailed && sitemapChildFailed > 0) {
|
|
2561
|
+
// (A) Extraction-side: a sitemap INDEX referenced child sitemaps we could
|
|
2562
|
+
// not fetch/parse (404, non-sitemap, or beyond the depth cap). The declared
|
|
2563
|
+
// URL list is itself incomplete — the "unreachable child sitemaps" case a
|
|
2564
|
+
// urls-only count can never see (and the original false-negative class).
|
|
2565
|
+
truncated = true;
|
|
2566
|
+
truncatedKind = "coverage";
|
|
2567
|
+
truncatedReason =
|
|
2568
|
+
`${sitemapChildFailed} of ${sitemapChildTotal} child sitemaps referenced by the sitemap index could not be ` +
|
|
2569
|
+
`fetched or parsed — both the declared URL count and this audit are incomplete, so the verdict is not ` +
|
|
2570
|
+
`representative of the full site. Check that every child sitemap is reachable (HTTP 200, valid XML).`;
|
|
2571
|
+
// eslint-disable-next-line no-console
|
|
2572
|
+
console.error(`pseolint: ${truncatedReason}`);
|
|
2573
|
+
}
|
|
2574
|
+
if (!truncated && declaredSitemapUrlCount && declaredSitemapUrlCount >= 20) {
|
|
2575
|
+
// (B) Audit-side: the sitemap declared N URLs but we FETCHED far fewer than
|
|
2576
|
+
// we intended to. Compare against `fetchedCount` (pages actually fetched,
|
|
2577
|
+
// pre-filter/pre-sample) so legitimately-skipped pages (noindex, non-HTML)
|
|
2578
|
+
// and intentional sampling do NOT register as a shortfall. `intended` is
|
|
2579
|
+
// bounded by every deliberate limit — an explicit sample, the crawl cap, and
|
|
2580
|
+
// the declared total — so none of them false-fire.
|
|
2581
|
+
const sampleCap = sampleSize > 0 ? sampleSize : Number.POSITIVE_INFINITY;
|
|
2582
|
+
const crawlCap = maxCrawlDiscovered > 0 ? maxCrawlDiscovered : Number.POSITIVE_INFINITY;
|
|
2583
|
+
const intended = Math.min(sampleCap, crawlCap, declaredSitemapUrlCount);
|
|
2584
|
+
const floor = Math.max(20, Math.floor(intended * 0.05));
|
|
2585
|
+
// `intended >= 20`: only judge representativeness when we actually meant to
|
|
2586
|
+
// audit a substantial slice. A deliberately tiny sample/crawl cap (intended
|
|
2587
|
+
// < 20) is the operator's choice, not under-discovery — don't flag it (and
|
|
2588
|
+
// it would otherwise trip the absolute floor of 20).
|
|
2589
|
+
if (intended >= 20 && fetchedCount < floor) {
|
|
2590
|
+
const unreached = Math.max(0, declaredSitemapUrlCount - fetchedCount);
|
|
2591
|
+
const ratio = fetchedCount / declaredSitemapUrlCount;
|
|
2592
|
+
const pct = (ratio * 100).toFixed(ratio < 0.01 ? 2 : 1);
|
|
2593
|
+
truncated = true;
|
|
2594
|
+
truncatedKind = "coverage";
|
|
2595
|
+
truncatedReason =
|
|
2596
|
+
`Fetched ${fetchedCount} of ~${declaredSitemapUrlCount} sitemap-declared URLs (~${pct}% coverage); ` +
|
|
2597
|
+
`~${unreached} could not be retrieved (4xx/5xx, redirects, or robots-blocked). The verdict covers only the ` +
|
|
2598
|
+
`pages reached and is not representative — check for a stale sitemap or unreachable pages, or raise crawl limits.`;
|
|
2599
|
+
// eslint-disable-next-line no-console
|
|
2600
|
+
console.error(`pseolint: ${truncatedReason}`);
|
|
2601
|
+
}
|
|
2602
|
+
}
|
|
2481
2603
|
if (truncated) {
|
|
2482
2604
|
summary.truncated = true;
|
|
2483
2605
|
summary.truncatedReason = truncatedReason;
|
|
2606
|
+
if (truncatedKind)
|
|
2607
|
+
summary.truncatedKind = truncatedKind;
|
|
2608
|
+
// A truncated run is incomplete — never present it as a clean green. Floor
|
|
2609
|
+
// the verdict to at least "caution" so the headline matches the partial-
|
|
2610
|
+
// coverage banner instead of the false "READY ✓" over a salvaged fragment.
|
|
2611
|
+
// ("ready" is the only rung below "caution"; everything else already is.)
|
|
2612
|
+
if (summary.verdict === "ready")
|
|
2613
|
+
summary.verdict = "caution";
|
|
2484
2614
|
}
|
|
2485
2615
|
if (cacheConfig) {
|
|
2486
2616
|
summary.cacheStats = cacheStats;
|