rankforge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/robots.mjs ADDED
@@ -0,0 +1,60 @@
1
/**
 * Creates a fresh robots.txt group record.
 *
 * @returns {{ agents: Array, rules: Array }} A new object holding two new, empty arrays.
 */
const emptyGroup = () => {
  const agents = [];
  const rules = [];
  return { agents, rules };
};
2
+
3
/**
 * Parses robots.txt text into user-agent groups.
 *
 * Lines are split on LF/CRLF, `#` comments are stripped, and each remaining
 * line is divided at its first `:` into a lower-cased key and a trimmed value.
 * Only `user-agent`, `allow` and `disallow` keys are used; other keys are
 * ignored. Allow/disallow lines seen before any user-agent line are dropped.
 *
 * @param {*} text - Raw robots.txt content; falsy values are treated as empty.
 * @returns {{ groups: Array<{ agents: string[], rules: Array<{ type: string, path: string }> }> }}
 *   Groups in file order; agent tokens are lower-cased.
 */
export const parseRobotsTxt = (text) => {
  const groups = [];
  let agents = [];
  let rules = [];

  // Store the group under construction (when it has any content) and start a new one.
  const flush = () => {
    if (agents.length > 0 || rules.length > 0) groups.push({ agents, rules });
    agents = [];
    rules = [];
  };

  const lines = String(text || "").split(/\r?\n/);
  for (const rawLine of lines) {
    const line = rawLine.replace(/#.*/, "").trim();
    if (line === "") continue;

    // Split on the first colon only, so values such as URLs keep their own colons.
    const separator = line.indexOf(":");
    const hasSeparator = separator !== -1;
    const key = (hasSeparator ? line.slice(0, separator) : line).trim().toLowerCase();
    const value = hasSeparator ? line.slice(separator + 1).trim() : "";

    if (key === "user-agent") {
      // A user-agent line after rules starts a new group; consecutive
      // user-agent lines accumulate into the same group.
      if (rules.length > 0) flush();
      agents.push(value.toLowerCase());
    } else if ((key === "allow" || key === "disallow") && agents.length > 0) {
      rules.push({ type: key, path: value });
    }
  }

  flush();
  return { groups };
};
34
+
35
/**
 * Extracts the part of a URL that robots.txt rules are compared against.
 *
 * For an absolute URL this is the path followed by the query string (the
 * fragment is dropped). Keeping the query string makes absolute URLs behave
 * like relative inputs, which are returned unchanged and therefore already
 * keep theirs; it also lets query-based rules match.
 *
 * @param {*} pathOrUrl - Absolute URL or an already-relative path.
 * @returns {string} Path plus query for parseable URLs; otherwise the input as
 *   a string, or "/" when the input is falsy.
 */
const pathFor = (pathOrUrl) => {
  try {
    const { pathname, search } = new URL(pathOrUrl);
    return `${pathname || "/"}${search}`;
  } catch {
    // Not an absolute URL: treat the input as a path already.
    return String(pathOrUrl || "/");
  }
};
43
+
44
/**
 * Selects the robots.txt groups that apply to a user agent.
 *
 * Groups naming a token contained in the (lower-cased) user agent string win;
 * when none do, the groups listing `*` are returned instead.
 *
 * @param {{ groups?: Array<{ agents: string[] }> }} parsed - Parsed robots.txt data.
 * @param {*} userAgent - User agent string; falsy values are treated as empty.
 * @returns {Array} Matching group objects, in their original order.
 */
const matchingGroups = (parsed, userAgent) => {
  const agent = String(userAgent || "").toLowerCase();
  const groups = parsed?.groups || [];

  // An empty token (from a bare "User-agent:" line) is a substring of every
  // string, so it must not count as a specific match and shadow the "*" groups.
  const isSpecificMatch = (token) => token !== "" && token !== "*" && agent.includes(token);

  const specific = groups.filter((group) => group.agents.some((token) => isSpecificMatch(token)));
  if (specific.length > 0) return specific;
  return groups.filter((group) => group.agents.includes("*"));
};
50
+
51
/**
 * Tests a single robots.txt rule path against a request path.
 *
 * `*` matches any run of characters and a trailing `$` anchors the rule to the
 * end of the path; a rule using neither is a plain prefix match. Matching is
 * done with string searches rather than a generated regular expression so a
 * hostile robots.txt cannot trigger catastrophic backtracking.
 *
 * @param {string} rulePath - Non-empty rule path.
 * @param {string} pathname - Path being checked.
 * @returns {boolean} Whether the rule applies to the path.
 */
const robotsRuleMatches = (rulePath, pathname) => {
  const anchored = rulePath.endsWith("$");
  const segments = (anchored ? rulePath.slice(0, -1) : rulePath).split("*");
  const [prefix, ...rest] = segments;

  if (!pathname.startsWith(prefix)) return false;
  if (rest.length === 0) return !anchored || pathname.length === prefix.length;

  // With an anchor, the final literal segment must sit at the very end.
  const tail = anchored ? rest.pop() : null;
  let cursor = prefix.length;
  for (const segment of rest) {
    // Taking the leftmost occurrence leaves the most room for later segments.
    const found = pathname.indexOf(segment, cursor);
    if (found === -1) return false;
    cursor = found + segment.length;
  }
  if (tail === null) return true;
  return pathname.length - tail.length >= cursor && pathname.endsWith(tail);
};

/**
 * Decides whether a path or URL may be fetched according to parsed robots.txt data.
 *
 * Among all rules (from the groups matching the user agent) that apply to the
 * path, the one with the longest rule path decides; on equal length an `allow`
 * rule wins. Rules with an empty path are ignored. With no applicable rule the
 * path is allowed.
 *
 * @param {object} parsed - Parsed robots.txt data with a `groups` array.
 * @param {*} pathOrUrl - Absolute URL or path to check.
 * @param {string} [userAgent="RankForgeBot"] - User agent used for group selection.
 * @returns {boolean} True when fetching is allowed.
 */
export const isAllowedByRobots = (parsed, pathOrUrl, userAgent = "RankForgeBot") => {
  const pathname = pathFor(pathOrUrl);
  let best = null;

  for (const group of matchingGroups(parsed, userAgent)) {
    for (const rule of group.rules) {
      if (!rule.path || !robotsRuleMatches(rule.path, pathname)) continue;

      const isLonger = best === null || rule.path.length > best.path.length;
      const winsTie = best !== null && rule.path.length === best.path.length && rule.type === "allow";
      if (isLonger || winsTie) best = rule;
    }
  }

  return best === null || best.type === "allow";
};
@@ -0,0 +1,190 @@
1
+ import { cleanText } from "./html-extract.mjs";
2
+ import { structuredDataNodes, structuredDataTypeNames } from "./structured-data.mjs";
3
+ import { normalizeUrl } from "./url-utils.mjs";
4
+
5
// Matches a `noindex` token that is delimited by the start/end of the string,
// a comma, or whitespace (case-insensitive).
const noindexPattern = /(?:^|,|\s)noindex(?:,|\s|$)/i;

// Schema type names that entitySignalCount treats as an entity signal.
const entitySchemaTypes = new Set(["Organization", "LocalBusiness", "Person", "Product", "Article"]);

// For each supported structured-data type, the property whose value is
// compared against the page's visible content.
const supportedStructuredDataFields = { Organization: "name", LocalBusiness: "name", Product: "name", Article: "headline" };

// Words that tokensFor removes before comparing token sets.
const stopwords = new Set("a an and are as at by for from in of on or the to with".split(" "));
31
+
32
/**
 * Builds the `$.pages[i].evidence.<path>` pointer string for a page's evidence field.
 *
 * @param {*} pageIndex - Index interpolated between the brackets.
 * @param {*} path - Field path appended after `evidence.`.
 * @returns {string} The pointer string.
 */
const evidencePath = (pageIndex, path) => {
  const pagePointer = `$.pages[${pageIndex}]`;
  return `${pagePointer}.evidence.${path}`;
};
33
+
34
/**
 * Normalizes text for comparison: runs it through cleanText, lower-cases it,
 * replaces every character that is not a Unicode letter, number or whitespace
 * with a space, collapses whitespace runs to one space, and trims.
 *
 * @param {*} value - Input handed to cleanText.
 * @returns {string} Normalized text.
 */
const normalizeText = (value) => {
  const lowered = cleanText(value).toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s]/gu, " ");
  const collapsed = withoutPunctuation.replace(/\s+/g, " ");
  return collapsed.trim();
};
40
+
41
/**
 * Splits normalized text into tokens, keeping only tokens longer than two
 * characters that are not in the stopword set.
 *
 * @param {*} value - Input passed to normalizeText.
 * @returns {string[]} Kept tokens in their original order (duplicates included).
 */
const tokensFor = (value) => {
  const kept = [];
  for (const token of normalizeText(value).split(/\s+/)) {
    const isTooShort = token.length <= 2;
    if (isTooShort || stopwords.has(token)) continue;
    kept.push(token);
  }
  return kept;
};
45
+
46
/**
 * Reports whether the evidence's `robots` value matches noindexPattern.
 *
 * @param {{ robots?: string }} [evidence={}] - Page evidence.
 * @returns {boolean} True when a noindex token is present.
 */
const isNoindexed = (evidence = {}) => {
  const robotsDirective = evidence.robots || "";
  return noindexPattern.test(robotsDirective);
};
47
+
48
/**
 * Treats a page as successful unless its status is an integer of 400 or more.
 * A missing or non-integer status counts as successful.
 *
 * @param {{ status?: * }} page - Page snapshot.
 * @returns {boolean} False only for integer statuses >= 400.
 */
const isSuccessfulPage = (page) => {
  const { status } = page;
  const isErrorStatus = Number.isInteger(status) && status >= 400;
  return !isErrorStatus;
};
49
+
50
/**
 * Checks whether a page's canonical URL refers to the page itself.
 *
 * A page without a canonical value counts as self-canonical. URLs are compared
 * after normalizeUrl; if normalization throws, the raw strings are compared.
 *
 * @param {{ finalUrl?: string, evidence?: { canonical?: string } }} page - Page snapshot.
 * @returns {boolean} True when the canonical is absent or equals the final URL.
 */
const sameCanonical = (page) => {
  const canonical = page.evidence?.canonical;
  if (!canonical) return true;

  let matches;
  try {
    matches = normalizeUrl(canonical) === normalizeUrl(page.finalUrl);
  } catch {
    // Fall back to a literal comparison when either URL cannot be normalized.
    matches = canonical === page.finalUrl;
  }
  return matches;
};
59
+
60
/**
 * Builds one normalized string from the evidence fields used as the page's
 * visible content: title, description, site name, h1 entries, heading texts
 * and the visible text preview. Falsy parts are skipped; `h1` and `headings`
 * are used only when they are arrays.
 *
 * @param {object} [evidence={}] - Page evidence.
 * @returns {string} The parts joined with spaces and passed through normalizeText.
 */
const visibleContentSurface = (evidence = {}) => {
  const h1Texts = Array.isArray(evidence.h1) ? evidence.h1 : [];
  const headingTexts = Array.isArray(evidence.headings)
    ? evidence.headings.map((heading) => heading.text)
    : [];

  const parts = [
    evidence.title,
    evidence.description,
    evidence.siteName,
    ...h1Texts,
    ...headingTexts,
    evidence.visibleTextPreview,
  ];
  const joined = parts.filter(Boolean).join(" ");
  return normalizeText(joined);
};
69
+
70
/**
 * Decides whether a value is considered present in the visible-content surface.
 *
 * Returns true when the normalized value is shorter than four characters, when
 * it occurs in the surface as a whole space-delimited phrase, or when it has no
 * tokens. Otherwise tokens are compared: a single-token value needs that token
 * in the surface; a multi-token value needs at least max(2, half of its
 * tokens, rounded up) to be found.
 *
 * @param {*} value - Value to look for.
 * @param {string} surface - Normalized visible-content string.
 * @returns {boolean} True when the value counts as visible.
 */
const valueAppearsInSurface = (value, surface) => {
  const needle = normalizeText(value);
  if (needle.length < 4) return true;

  // Padding both sides lets a phrase at either edge match on word boundaries.
  const phraseFound = ` ${surface} `.includes(` ${needle} `);
  if (phraseFound) return true;

  const needleTokens = tokensFor(needle);
  if (needleTokens.length === 0) return true;

  const surfaceTokens = new Set(tokensFor(surface));
  let hits = 0;
  for (const token of needleTokens) {
    if (surfaceTokens.has(token)) hits += 1;
  }

  const required = needleTokens.length === 1 ? 1 : Math.max(2, Math.ceil(needleTokens.length * 0.5));
  return hits >= required;
};
83
+
84
/**
 * Produces a fingerprint for duplicate detection: the normalized text itself
 * when it is at least 500 characters long, otherwise an empty string.
 *
 * @param {*} value - Text to fingerprint.
 * @returns {string} Normalized text, or "" when it is too short.
 */
export const contentFingerprint = (value) => {
  const normalized = normalizeText(value);
  if (normalized.length < 500) return "";
  return normalized;
};
88
+
89
/**
 * Finds structured-data values that do not appear in the page's visible content.
 *
 * For every parsed structured-data block, each node and each of its type names
 * is checked against supportedStructuredDataFields. When the mapped property
 * holds a non-object value whose cleaned text is at least four characters long
 * and is not found in the visible-content surface, a fact is emitted.
 *
 * @param {{ evidence?: object }} page - Page snapshot.
 * @param {number} [pageIndex=0] - Index used in the emitted evidence pointers.
 * @returns {Array<{ ruleId: string, evidence: string[], impact: string }>} Mismatch facts.
 */
export const structuredDataVisibleContentFacts = (page, pageIndex = 0) => {
  const evidence = page.evidence || {};
  const surface = visibleContentSurface(evidence);
  const blocks = evidence.structuredData || [];
  const facts = [];

  for (const [blockIndex, block] of blocks.entries()) {
    // Skip blocks with no parsed data or with a recorded parse error.
    const isUsable = Boolean(block?.data) && !block.parseError;
    if (!isUsable) continue;

    for (const node of structuredDataNodes(block.data)) {
      for (const type of structuredDataTypeNames(node["@type"])) {
        const property = supportedStructuredDataFields[type];
        if (!property) continue;

        // Only scalar values are compared; arrays and objects are skipped.
        const value = node[property];
        const isComposite = Array.isArray(value) || typeof value === "object";
        if (isComposite) continue;

        const cleaned = cleanText(value);
        const isTooShort = cleaned.length < 4;
        if (isTooShort || valueAppearsInSurface(cleaned, surface)) continue;

        const pointers = [
          `structuredData[${blockIndex}]`,
          "visibleTextPreview",
          "title",
          "h1",
          "headings",
        ].map((path) => evidencePath(pageIndex, path));

        facts.push({
          ruleId: "structured_data.visible_content_mismatch",
          evidence: pointers,
          impact: `${type} structured data names "${cleaned}", but that value is not visible in page evidence.`,
        });
      }
    }
  }

  return facts;
};
124
+
125
/**
 * Reports whether any link's href or text contains "about" or "contact"
 * (case-insensitive substring match).
 *
 * @param {{ links?: Array<{ href?: string, text?: string }> }} [evidence={}] - Page evidence.
 * @returns {boolean} True when such a link exists.
 */
const hasAboutOrContactLink = (evidence = {}) => {
  const keywords = ["about", "contact"];
  const mentionsKeyword = (link) => {
    const haystack = `${link.href || ""} ${link.text || ""}`.toLowerCase();
    return keywords.some((keyword) => haystack.includes(keyword));
  };
  return (evidence.links || []).some((link) => mentionsKeyword(link));
};
130
+
131
/**
 * Counts how many of five entity signals the evidence shows: a non-empty site
 * name, a schema type listed in entitySchemaTypes, an about/contact link, at
 * least one author, and at least one date.
 *
 * @param {object} [evidence={}] - Page evidence.
 * @returns {number} Number of signals present (0 to 5).
 */
const entitySignalCount = (evidence = {}) => {
  const signals = [
    Boolean(cleanText(evidence.siteName)),
    (evidence.schemaTypes || []).some((type) => entitySchemaTypes.has(type)),
    Boolean(hasAboutOrContactLink(evidence)),
    (evidence.entitySignals?.authors || []).length > 0,
    (evidence.entitySignals?.dates || []).length > 0,
  ];
  return signals.filter(Boolean).length;
};
140
+
141
/**
 * Flags substantial pages that expose at most one entity signal.
 *
 * A page is considered only when it is successful, not noindexed, reports at
 * least 800 visible text characters, and has a non-empty title or h1 entry.
 *
 * @param {{ evidence?: object }} page - Page snapshot.
 * @param {number} [pageIndex=0] - Index used in the emitted evidence pointers.
 * @returns {Array<{ ruleId: string, evidence: string[], impact: string }>}
 *   Empty array, or a single `geo.entity_clarity_gap` fact.
 */
export const entityClarityFacts = (page, pageIndex = 0) => {
  const evidence = page.evidence || {};
  const visibleTextCharacters = evidence.counts?.visibleTextCharacters || 0;
  const hasPurposeSignal = cleanText(evidence.title) || (evidence.h1 || []).some((item) => cleanText(item));

  // Eligibility guards, checked in the same order as one combined condition would be.
  if (!isSuccessfulPage(page)) return [];
  if (isNoindexed(evidence)) return [];
  if (visibleTextCharacters < 800) return [];
  if (!hasPurposeSignal) return [];

  const signalCount = entitySignalCount(evidence);
  if (signalCount > 1) return [];

  const pointers = ["siteName", "schemaTypes", "links", "entitySignals", "counts.visibleTextCharacters"].map(
    (path) => evidencePath(pageIndex, path),
  );

  return [
    {
      ruleId: "geo.entity_clarity_gap",
      evidence: pointers,
      impact: `Substantial page content has only ${signalCount} deterministic entity signal(s).`,
    },
  ];
};
162
+
163
/**
 * Groups pages whose visible text preview normalizes to the same fingerprint
 * and reports every group of three or more pages.
 *
 * Pages are skipped when they are unsuccessful, noindexed, canonicalized to a
 * different URL, report fewer than 600 visible text characters, or yield an
 * empty fingerprint.
 *
 * @param {Array<object>} [pages=[]] - Page snapshots.
 * @returns {Array<{ ruleId: string, affectedUrls: Array, evidence: string[], impact: string }>}
 *   One fact per duplicate cluster, in order of each cluster's first page.
 */
export const duplicateContentClusterFacts = (pages = []) => {
  const clusters = new Map();

  for (const [index, page] of pages.entries()) {
    const evidence = page.evidence || {};
    const isEligible = isSuccessfulPage(page) && !isNoindexed(evidence) && sameCanonical(page);
    if (!isEligible) continue;
    if ((evidence.counts?.visibleTextCharacters || 0) < 600) continue;

    const fingerprint = contentFingerprint(evidence.visibleTextPreview || "");
    if (!fingerprint) continue;

    if (!clusters.has(fingerprint)) clusters.set(fingerprint, []);
    clusters.get(fingerprint).push({ page, index });
  }

  const facts = [];
  for (const members of clusters.values()) {
    if (members.length < 3) continue;

    const pointers = [];
    for (const { index } of members) {
      pointers.push(evidencePath(index, "visibleTextPreview"));
      pointers.push(evidencePath(index, "counts.visibleTextCharacters"));
    }

    facts.push({
      ruleId: "policy.duplicate_content_cluster",
      affectedUrls: members.map(({ page }) => page.finalUrl),
      evidence: pointers,
      impact: `${members.length} pages share the same substantial normalized visible text preview.`,
    });
  }
  return facts;
};
@@ -0,0 +1,360 @@
1
+ import { implementationTaskFor } from "./finding-task.mjs";
2
+ import { renderParityFacts } from "./render-parity.mjs";
3
+ import { entityClarityFacts, structuredDataVisibleContentFacts } from "./rule-depth.mjs";
4
+ import { getRule } from "./rules.mjs";
5
+ import { validateStructuredData } from "./structured-data.mjs";
6
+
7
// Points that scoreFindings subtracts from a dimension's score for each
// finding of the given severity.
const severityImpact = { P0: 60, P1: 40, P2: 20, P3: 10 };
13
+
14
/**
 * Maps a rule dimension to the team label that owns its findings.
 *
 * @param {string} dimension - Rule dimension identifier.
 * @returns {"Engineering"|"Content"|"SEO"} Owner label; "SEO" for any other dimension.
 */
const ownerFor = (dimension) => {
  switch (dimension) {
    case "structured_data":
    case "technical":
    case "crawl_index":
    case "performance":
      return "Engineering";
    case "helpful_content":
    case "geo_readiness":
    case "entity_clarity":
      return "Content";
    default:
      return "SEO";
  }
};
28
+
29
/**
 * Maps a severity to an effort label: "M" for P0 and P1, "S" for anything else.
 *
 * @param {string} severity - Severity identifier.
 * @returns {"M"|"S"} Effort label.
 */
const effortFor = (severity) => {
  if (severity === "P0" || severity === "P1") return "M";
  return "S";
};
30
+
31
// Matches a `noindex` token that is delimited by the start/end of the string,
// a comma, or whitespace (case-insensitive). It has no global flag, so
// repeated .test() calls carry no lastIndex state.
const noindexPattern = /(?:^|,|\s)noindex(?:,|\s|$)/i;
32
+
33
/**
 * Looks up a header by name, ignoring case, and returns its value as a string.
 *
 * @param {object|null|undefined} headers - Header name/value map.
 * @param {string} name - Header name to find.
 * @returns {string} The first matching value converted with String(), or "" when absent.
 */
const headerValue = (headers, name) => {
  const wanted = name.toLowerCase();
  const match = Object.entries(headers || {}).find(([key]) => key.toLowerCase() === wanted);
  return match ? String(match[1]) : "";
};
40
+
41
/**
 * Reports whether a snapshot's final URL looks like a homepage: its path is
 * "/" or ends in `/index.htm` or `/index.html` (case-insensitive). When the URL
 * cannot be parsed, the raw string is tested for a trailing `index.htm(l)`
 * segment instead.
 *
 * @param {{ finalUrl?: string }} snapshot - Page snapshot.
 * @returns {boolean} True for homepage-like URLs.
 */
const isHomepageLike = (snapshot) => {
  let pathname;
  try {
    pathname = new URL(snapshot.finalUrl).pathname;
  } catch {
    // Not an absolute URL (e.g. a local file path): inspect the raw string.
    return /(?:^|\/)index\.html?$/i.test(snapshot.finalUrl || "");
  }
  if (pathname === "/") return true;
  return /\/index\.html?$/i.test(pathname);
};
49
+
50
/**
 * Collects `@type` names from a structured-data value.
 *
 * Arrays are flattened element by element. For an object, its own `@type`
 * (string or array, each entry converted with String) comes first, followed by
 * the types found under `@graph`. Other nested properties are not visited.
 *
 * @param {*} value - Parsed structured data.
 * @returns {string[]} Type names in encounter order.
 */
const structuredDataTypes = (value) => {
  if (Array.isArray(value)) return value.flatMap((item) => structuredDataTypes(item));
  if (!value || typeof value !== "object") return [];

  const declared = value["@type"];
  let ownTypes = [];
  if (declared) {
    ownTypes = Array.isArray(declared) ? declared.map(String) : [String(declared)];
  }

  const graph = value["@graph"];
  const graphTypes = graph ? structuredDataTypes(graph) : [];
  return [...ownTypes, ...graphTypes];
};
63
+
64
/**
 * Reports whether any structured-data block in the evidence declares the given type.
 *
 * @param {{ structuredData?: Array<{ data?: * }> }} evidence - Page evidence.
 * @param {string} type - Type name to look for.
 * @returns {boolean} True when some block's collected types include `type`.
 */
const hasStructuredDataType = (evidence, type) => {
  const blocks = evidence.structuredData || [];
  return blocks.some((item) => {
    const typesInBlock = structuredDataTypes(item.data);
    return typesInBlock.includes(type);
  });
};
66
+
67
/**
 * Reports whether any link's href or text contains "about" or "contact"
 * (case-insensitive substring match).
 *
 * @param {{ links?: Array<{ href?: string, text?: string }> }} evidence - Page evidence.
 * @returns {boolean} True when such a link exists.
 */
const hasAboutOrContactLink = (evidence) => {
  const links = evidence.links || [];
  return links.some(({ href, text }) => {
    const haystack = `${href || ""} ${text || ""}`.toLowerCase();
    return /about|contact/.test(haystack);
  });
};
72
+
73
/**
 * Builds a finding record for a rule and a page snapshot.
 *
 * Title, severity, dimension, recommendation and sources come from the rule
 * returned by getRule; owner and effort are derived from the rule's dimension
 * and default severity; confidence is always "high".
 *
 * @param {string} ruleId - Identifier passed to getRule.
 * @param {{ finalUrl?: string }} snapshot - Page snapshot; its final URL is the only affected URL.
 * @param {Array} evidence - Evidence pointers stored on the finding as given.
 * @param {number} pageIndex - Page index stored on the finding.
 * @param {string|null} [impact=null] - Impact text; the rule title is used when falsy.
 * @returns {object} The finding record.
 * @throws {Error} When getRule returns nothing for `ruleId`.
 */
const createFinding = (ruleId, snapshot, evidence, pageIndex, impact = null) => {
  const rule = getRule(ruleId);
  if (!rule) {
    throw new Error(`Unknown rule: ${ruleId}`);
  }

  const { defaultSeverity: severity, dimension } = rule;
  const owner = ownerFor(dimension);
  const effort = effortFor(severity);

  return {
    ruleId: rule.id,
    title: rule.title,
    severity,
    dimension,
    affectedUrls: [snapshot.finalUrl],
    evidence,
    impact: impact || rule.title,
    recommendation: rule.recommendation,
    implementationTask: implementationTaskFor(rule, owner, effort),
    owner,
    effort,
    confidence: "high",
    sources: rule.sources,
    pageIndex,
  };
};
96
+
97
/**
 * Runs every page-level rule against one snapshot and returns the findings.
 *
 * Checks run in a fixed order: redirects and HTTP status, HTTPS, canonical and
 * indexability signals (including render-parity facts), appearance fields,
 * structured data, depth facts, content length, and homepage-only entity checks.
 *
 * @param {object} snapshot - Page snapshot (status, finalUrl, headers, evidence, ...).
 * @param {number} [pageIndex=0] - Index used in evidence pointers and stored on findings.
 * @returns {Array<object>} Findings in the order the checks fired.
 */
export const evaluatePage = (snapshot, pageIndex = 0) => {
  const findings = [];
  const evidence = snapshot.evidence || {};
  const visibleTextCharacters = evidence.counts?.visibleTextCharacters || 0;

  // Pointer into this page's entry, e.g. "$.pages[0].evidence.title".
  const at = (suffix) => `$.pages[${pageIndex}].${suffix}`;
  // Appends one finding for this snapshot.
  const report = (ruleId, pointers, impact) => {
    findings.push(createFinding(ruleId, snapshot, pointers, pageIndex, impact));
  };
  const isUrlSource = () => snapshot.sourceType === "url";
  const hasMetaNoindex = () => noindexPattern.test(evidence.robots || "");

  // --- Technical ---
  if ((snapshot.redirectChain || []).length > 1) {
    report(
      "technical.redirect_chain",
      [at("redirectChain")],
      "Long redirect chains slow crawling and make canonical URL resolution less predictable.",
    );
  }

  if (Number.isInteger(snapshot.status) && snapshot.status >= 400) {
    report(
      "technical.http_error",
      [at("status")],
      "Important pages with HTTP errors may be ineligible for indexing or unusable for visitors.",
    );
  }

  if (isUrlSource() && /^http:\/\//i.test(snapshot.finalUrl)) {
    report("technical.https_missing", [at("finalUrl")], "Important pages should be available over HTTPS.");
  }

  // --- Indexability ---
  if (isUrlSource() && !evidence.canonical && !hasMetaNoindex()) {
    report(
      "indexability.canonical_missing",
      [at("evidence.canonical")],
      "Canonical signals help consolidate duplicate or alternate URL signals.",
    );
  }

  for (const parityFact of renderParityFacts(snapshot, pageIndex)) {
    report(parityFact.ruleId, parityFact.evidence, parityFact.impact);
  }

  if (hasMetaNoindex()) {
    report(
      "indexability.noindex",
      [at("evidence.robots")],
      "Pages with noindex are not eligible to appear in Google Search results.",
    );
  }

  if (hasMetaNoindex() && evidence.canonical) {
    report(
      "indexability.noindex_canonical_conflict",
      [at("evidence.robots"), at("evidence.canonical")],
      "Combining noindex with a canonical signal creates an ambiguous indexing strategy.",
    );
  }

  if (noindexPattern.test(headerValue(snapshot.headers, "x-robots-tag"))) {
    report(
      "indexability.x_robots_noindex",
      [at("headers.x-robots-tag")],
      "X-Robots-Tag noindex prevents indexing even when page HTML appears indexable.",
    );
  }

  // --- Appearance ---
  if (!evidence.title) {
    report(
      "appearance.title_missing",
      [at("evidence.title")],
      "Missing titles limit control over search result title links.",
    );
  }

  if (!evidence.description) {
    report(
      "appearance.meta_description_missing",
      [at("evidence.description")],
      "Missing descriptions reduce the page's ability to provide useful snippet text.",
    );
  }

  if (!Array.isArray(evidence.h1) || evidence.h1.length === 0) {
    report(
      "appearance.h1_missing",
      [at("evidence.h1")],
      "A missing primary heading makes page purpose less explicit for users and crawlers.",
    );
  }

  if ((evidence.images || []).some((image) => image.alt === null || image.alt === "")) {
    report(
      "appearance.image_alt_missing",
      [at("evidence.images")],
      "Images without alt text provide weaker context for accessibility and image search.",
    );
  }

  if (isUrlSource() && isHomepageLike(snapshot) && !evidence.favicon) {
    report(
      "appearance.favicon_missing",
      [at("evidence.favicon")],
      "A missing favicon weakens brand presentation in search appearance surfaces.",
    );
  }

  // --- Structured data ---
  if ((evidence.structuredData || []).some((item) => item.parseError)) {
    report(
      "structured_data.invalid_jsonld",
      [at("evidence.structuredData")],
      "Invalid JSON-LD cannot be parsed reliably for structured data eligibility.",
    );
  }

  for (const issue of validateStructuredData(evidence.structuredData || [])) {
    report(
      "structured_data.required_property_missing",
      [at(`evidence.structuredData[${issue.blockIndex}]`)],
      `${issue.type} structured data is missing required properties: ${issue.missing.join(", ")}.`,
    );
  }

  // --- Depth facts (both fact lists are computed before any is reported) ---
  const depthFacts = [
    ...structuredDataVisibleContentFacts(snapshot, pageIndex),
    ...entityClarityFacts(snapshot, pageIndex),
  ];
  for (const depthFact of depthFacts) {
    report(depthFact.ruleId, depthFact.evidence, depthFact.impact);
  }

  // --- Content ---
  if (visibleTextCharacters > 0 && visibleTextCharacters < 300) {
    report(
      "content.thin_content",
      [at("evidence.counts.visibleTextCharacters")],
      "Thin pages are less likely to satisfy visitor tasks or support AI/search answerability.",
    );
  }

  if (visibleTextCharacters >= 500 && (evidence.headings || []).length < 2) {
    report(
      "content.answerability_gap",
      [at("evidence.headings"), at("evidence.counts.visibleTextCharacters")],
      "Substantial pages need clear answerable sections to support users and generative search features.",
    );
  }

  // --- Homepage-only entity checks ---
  if (
    isHomepageLike(snapshot) &&
    !hasStructuredDataType(evidence, "Organization") &&
    !hasStructuredDataType(evidence, "LocalBusiness")
  ) {
    report(
      "structured_data.organization_missing",
      [at("evidence.structuredData")],
      "Homepage-like pages should make organization identity clear where visible content supports it.",
    );
  }

  if (isHomepageLike(snapshot) && !hasAboutOrContactLink(evidence)) {
    report(
      "entity.about_contact_missing",
      [at("evidence.links")],
      "Homepage-like pages should expose crawlable about or contact paths for entity trust signals.",
    );
  }

  return findings;
};
339
+
340
/**
 * Aggregates findings into a per-dimension score summary.
 *
 * Each dimension starts at 100 and loses the severityImpact penalty of every
 * finding, never dropping below 0. The summary also lists the rule ids seen and
 * counts findings per severity (`p0` to `p3`).
 *
 * A finding whose severity has no numeric entry in severityImpact is still
 * listed under `findings` but changes neither the score nor the counters;
 * otherwise one malformed finding would turn the dimension's score into NaN.
 *
 * @param {Iterable<{ dimension: string, severity: string, ruleId: string }>} findings - Findings to score.
 * @returns {Object<string, { score: number, findings: string[], p0: number, p1: number, p2: number, p3: number }>}
 *   Score summaries keyed by dimension; dimensions without findings are absent.
 */
export const scoreFindings = (findings) => {
  const scores = {};

  for (const finding of findings) {
    const score = scores[finding.dimension] || {
      score: 100,
      findings: [],
      p0: 0,
      p1: 0,
      p2: 0,
      p3: 0,
    };

    score.findings.push(finding.ruleId);

    // Only recognised severities carry a numeric penalty; the typeof check
    // also rejects inherited keys such as "constructor".
    const penalty = severityImpact[finding.severity];
    if (typeof penalty === "number") {
      score[finding.severity.toLowerCase()] += 1;
      score.score = Math.max(0, score.score - penalty);
    }

    scores[finding.dimension] = score;
  }

  return scores;
};