rankforge 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +30 -0
- package/package.json +49 -0
- package/src/audit-output-schema.mjs +88 -0
- package/src/audit.mjs +202 -0
- package/src/cli.mjs +508 -0
- package/src/config-schema.mjs +292 -0
- package/src/crawl.mjs +188 -0
- package/src/finding-task.mjs +9 -0
- package/src/html-extract.mjs +226 -0
- package/src/index.mjs +9 -0
- package/src/integrations.mjs +78 -0
- package/src/io-guards.mjs +196 -0
- package/src/performance.mjs +112 -0
- package/src/regex-guards.mjs +52 -0
- package/src/render-parity.mjs +149 -0
- package/src/render.mjs +45 -0
- package/src/repo-audit.mjs +429 -0
- package/src/repo-detect.mjs +87 -0
- package/src/repo-findings.mjs +9 -0
- package/src/repo-manifests.mjs +169 -0
- package/src/repo-process.mjs +298 -0
- package/src/repo-routes.mjs +46 -0
- package/src/report.mjs +898 -0
- package/src/robots.mjs +60 -0
- package/src/rule-depth.mjs +190 -0
- package/src/rule-engine.mjs +360 -0
- package/src/rules.mjs +350 -0
- package/src/site-rule-engine.mjs +177 -0
- package/src/sitemap.mjs +30 -0
- package/src/snapshot.mjs +119 -0
- package/src/source-map.json +28 -0
- package/src/structured-data.mjs +59 -0
- package/src/url-utils.mjs +25 -0
package/src/rules.mjs
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
/**
 * Google Search Central documentation URLs that rule definitions cite.
 *
 * Frozen because the same string table is shared by every rule in this
 * module; a stray assignment would silently change the citation of all
 * rules that reference the key.
 */
const sources = Object.freeze({
  searchEssentials: "https://developers.google.com/search/docs/essentials",
  technicalRequirements: "https://developers.google.com/search/docs/essentials/technical",
  howSearchWorks: "https://developers.google.com/search/docs/fundamentals/how-search-works",
  crawlableLinks: "https://developers.google.com/search/docs/crawling-indexing/links-crawlable",
  robotsMeta: "https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag",
  robotsTxt: "https://developers.google.com/search/docs/crawling-indexing/robots/intro",
  canonicalization: "https://developers.google.com/search/docs/crawling-indexing/canonicalization",
  consolidateDuplicates: "https://developers.google.com/search/docs/crawling-indexing/consolidate-duplicate-urls",
  sitemaps: "https://developers.google.com/search/docs/crawling-indexing/sitemaps/overview",
  javascriptSeo: "https://developers.google.com/search/docs/crawling-indexing/javascript/javascript-seo-basics",
  helpfulContent: "https://developers.google.com/search/docs/fundamentals/creating-helpful-content",
  aiOptimization: "https://developers.google.com/search/docs/fundamentals/ai-optimization-guide",
  structuredDataIntro: "https://developers.google.com/search/docs/appearance/structured-data/intro-structured-data",
  structuredDataPolicies: "https://developers.google.com/search/docs/appearance/structured-data/sd-policies",
  organizationSchema: "https://developers.google.com/search/docs/appearance/structured-data/organization",
  pageExperience: "https://developers.google.com/search/docs/appearance/page-experience",
  coreWebVitals: "https://developers.google.com/search/docs/appearance/core-web-vitals",
  titleLinks: "https://developers.google.com/search/docs/appearance/title-link",
  snippets: "https://developers.google.com/search/docs/appearance/snippet",
  googleImages: "https://developers.google.com/search/docs/appearance/google-images",
  favicon: "https://developers.google.com/search/docs/appearance/favicon-in-search",
  spamPolicies: "https://developers.google.com/search/docs/essentials/spam-policies",
});
|
|
25
|
+
|
|
26
|
+
/**
 * Builds one rule definition record from positional fields.
 *
 * @param {string} id - Rule identifier, e.g. "technical.http_error".
 * @param {string} dimension - Audit dimension the rule belongs to.
 * @param {string} defaultSeverity - Severity label assigned by default (the table uses "P0".."P3").
 * @param {string} title - Short human-readable finding title.
 * @param {string} recommendation - Remediation guidance text.
 * @param {string[]} ruleSources - Reference URLs backing the rule; stored under `sources` as given (not copied).
 * @returns {{id: string, dimension: string, defaultSeverity: string, title: string, recommendation: string, sources: string[]}}
 */
const rule = (id, dimension, defaultSeverity, title, recommendation, ruleSources) => ({
  id,
  dimension,
  defaultSeverity,
  title,
  recommendation,
  sources: ruleSources,
});
|
|
34
|
+
|
|
35
|
+
/**
 * Catalogue of every audit rule, in declaration order.
 *
 * Each entry is produced by `rule(id, dimension, defaultSeverity, title,
 * recommendation, sources)`. `getRule` resolves ids against this array, so
 * ids are expected to be unique.
 */
export const rules = [
  // --- technical -----------------------------------------------------------
  rule(
    "technical.http_error", "technical", "P0",
    "Important page returns an unsuccessful HTTP status",
    "Fix server, routing, or deployment errors so important pages return a successful status.",
    [sources.technicalRequirements, sources.howSearchWorks],
  ),
  rule(
    "technical.redirect_chain", "technical", "P2",
    "Page uses a long or unstable redirect chain",
    "Reduce redirects so canonical URLs resolve directly and predictably.",
    [sources.technicalRequirements],
  ),
  rule(
    "technical.https_missing", "technical", "P1",
    "Page is not available over HTTPS",
    "Serve important pages over HTTPS and redirect HTTP variants to the HTTPS canonical URL.",
    [sources.searchEssentials],
  ),
  rule(
    "technical.rendered_content_missing", "technical", "P1",
    "Rendered page is missing primary content",
    "Ensure primary content is available in rendered HTML and does not depend on fragile client behavior.",
    [sources.javascriptSeo, sources.technicalRequirements],
  ),
  rule(
    "technical.raw_rendered_mismatch", "technical", "P2",
    "Raw and rendered page content differ materially",
    "Review JavaScript rendering and hydration so crawlers and users receive consistent primary content.",
    [sources.javascriptSeo],
  ),
  rule(
    "technical.rendered_title_changed", "technical", "P2",
    "Rendered HTML changes the page title",
    "Keep rendered title output aligned with raw HTML so crawlers and users receive a stable title signal.",
    [sources.javascriptSeo, sources.titleLinks],
  ),
  rule(
    "technical.rendered_description_changed", "technical", "P3",
    "Rendered HTML changes the meta description",
    "Keep rendered meta description output aligned with raw HTML when the page relies on it for snippet context.",
    [sources.javascriptSeo, sources.snippets],
  ),
  rule(
    "technical.rendered_canonical_changed", "technical", "P1",
    "Rendered HTML changes the canonical URL",
    "Keep canonical URL output stable between raw and rendered HTML so indexing signals remain consistent.",
    [sources.javascriptSeo, sources.canonicalization],
  ),
  rule(
    "technical.rendered_primary_heading_missing", "technical", "P1",
    "Rendered HTML removes the primary heading",
    "Ensure the rendered page preserves the primary heading that communicates page purpose.",
    [sources.javascriptSeo, sources.helpfulContent],
  ),
  rule(
    "technical.rendered_structured_data_lost", "technical", "P2",
    "Rendered HTML removes structured data",
    "Preserve structured data through rendering and hydration so eligible markup remains available.",
    [sources.javascriptSeo, sources.structuredDataIntro, sources.structuredDataPolicies],
  ),

  // --- crawl / index -------------------------------------------------------
  rule(
    "crawl.robots_blocked", "crawl_index", "P1",
    "robots.txt blocks an important page or resource",
    "Update robots.txt rules so important pages and resources needed for rendering can be crawled.",
    [sources.robotsTxt, sources.technicalRequirements],
  ),
  rule(
    "crawl.sitemap_missing", "crawl_index", "P2",
    "No sitemap was discovered",
    "Publish a sitemap that lists canonical, indexable URLs for important pages.",
    [sources.sitemaps],
  ),
  rule(
    "crawl.broken_internal_link", "crawl_index", "P2",
    "Internal link points to an unavailable URL",
    "Fix or remove broken internal links so crawlers and users can move through the site reliably.",
    [sources.crawlableLinks, sources.technicalRequirements],
  ),
  rule(
    "indexability.noindex", "crawl_index", "P1",
    "Important page has a noindex directive",
    "Remove noindex from pages intended to appear in Google Search.",
    [sources.robotsMeta],
  ),
  rule(
    "indexability.x_robots_noindex", "crawl_index", "P1",
    "Important page has an X-Robots-Tag noindex directive",
    "Remove X-Robots-Tag noindex from pages intended to appear in Google Search.",
    [sources.robotsMeta],
  ),
  rule(
    "indexability.canonical_missing", "crawl_index", "P2",
    "Canonical URL is missing",
    "Add a canonical signal to important indexable pages when duplicate or alternate URLs may exist.",
    [sources.canonicalization, sources.consolidateDuplicates],
  ),
  rule(
    "indexability.canonical_target_error", "crawl_index", "P1",
    "Canonical target is unavailable or not indexable",
    "Point canonical tags to successful, indexable preferred URLs.",
    [sources.canonicalization],
  ),
  rule(
    "indexability.noindex_canonical_conflict", "crawl_index", "P1",
    "Page combines noindex with a canonical signal",
    "Choose a single indexing strategy: remove noindex for indexable pages or align links and canonical signals to the preferred URL.",
    [sources.robotsMeta, sources.canonicalization],
  ),
  rule(
    "indexability.noncanonical_in_sitemap", "crawl_index", "P2",
    "Sitemap includes non-canonical URLs",
    "List only canonical, indexable URLs in sitemaps.",
    [sources.sitemaps, sources.canonicalization],
  ),

  // --- search appearance ---------------------------------------------------
  rule(
    "appearance.title_missing", "search_appearance", "P1",
    "Page is missing a title element",
    "Add a unique, descriptive title that reflects the page's main content.",
    [sources.titleLinks],
  ),
  rule(
    "appearance.title_duplicate", "search_appearance", "P2",
    "Multiple pages use the same title",
    "Write unique titles that distinguish page purpose and topic.",
    [sources.titleLinks],
  ),
  rule(
    "appearance.meta_description_missing", "search_appearance", "P3",
    "Page is missing a meta description",
    "Add concise, useful page descriptions that can support better snippets.",
    [sources.snippets],
  ),
  rule(
    "appearance.meta_description_duplicate", "search_appearance", "P3",
    "Multiple pages use the same meta description",
    "Write unique descriptions that summarize each page's specific purpose.",
    [sources.snippets],
  ),
  rule(
    "appearance.h1_missing", "search_appearance", "P2",
    "Page is missing a clear H1",
    "Add one visible primary heading that describes the page topic.",
    [sources.titleLinks, sources.helpfulContent],
  ),
  rule(
    "appearance.image_alt_missing", "search_appearance", "P3",
    "Important image is missing alt text",
    "Add descriptive alt text for meaningful images.",
    [sources.googleImages],
  ),
  rule(
    "appearance.favicon_missing", "search_appearance", "P3",
    "Favicon signal was not discovered",
    "Publish a valid favicon that Google can access.",
    [sources.favicon],
  ),

  // --- structured data -----------------------------------------------------
  rule(
    "structured_data.invalid_jsonld", "structured_data", "P1",
    "JSON-LD structured data is invalid",
    "Fix JSON-LD syntax so structured data can be parsed reliably.",
    [sources.structuredDataIntro],
  ),
  rule(
    "structured_data.required_property_missing", "structured_data", "P2",
    "Structured data is missing required properties",
    "Add the required properties for supported schema types when the page's visible content supports the markup.",
    [sources.structuredDataIntro, sources.structuredDataPolicies],
  ),
  rule(
    "structured_data.visible_content_mismatch", "structured_data", "P1",
    "Structured data does not match visible page content",
    "Keep structured data aligned with user-visible content and Google structured data policies.",
    [sources.structuredDataPolicies],
  ),
  rule(
    "structured_data.organization_missing", "structured_data", "P2",
    "Organization structured data is missing or unclear",
    "Add Organization structured data where visible business identity supports it.",
    [sources.organizationSchema, sources.structuredDataIntro],
  ),

  // --- performance ---------------------------------------------------------
  rule(
    "performance.lighthouse_poor", "performance", "P2",
    "Imported Lighthouse performance score is poor",
    "Investigate page weight, render-blocking work, server response time, and client-side execution before treating performance as search-ready.",
    [sources.pageExperience, sources.coreWebVitals],
  ),
  rule(
    "performance.lcp_poor", "performance", "P2",
    "Largest Contentful Paint is poor",
    "Improve loading of the largest above-the-fold content element by optimizing server response, critical resources, images, and rendering work.",
    [sources.pageExperience, sources.coreWebVitals],
  ),
  rule(
    "performance.cls_poor", "performance", "P2",
    "Cumulative Layout Shift is poor",
    "Reserve stable layout space for images, embeds, ads, and late-loading UI so visible content does not shift unexpectedly.",
    [sources.pageExperience, sources.coreWebVitals],
  ),

  // --- content, entity, GEO ------------------------------------------------
  rule(
    "content.thin_content", "helpful_content", "P2",
    "Page has limited useful main content",
    "Expand visible content to satisfy the visitor's task with original, helpful information.",
    [sources.helpfulContent],
  ),
  rule(
    "content.answerability_gap", "geo_readiness", "P2",
    "Page lacks directly answerable sections",
    "Add concise answer sections, definitions, steps, comparisons, FAQs, or evidence where useful.",
    [sources.aiOptimization, sources.helpfulContent],
  ),
  rule(
    "entity.about_contact_missing", "entity_clarity", "P2",
    "Organization identity or contact details are missing",
    "Make organization identity, contact, and trust details easy to find on crawlable pages.",
    [sources.searchEssentials, sources.organizationSchema],
  ),
  rule(
    "geo.entity_clarity_gap", "geo_readiness", "P2",
    "Important entities and relationships are unclear",
    "Name products, services, people, locations, and relationships plainly in visible content.",
    [sources.aiOptimization, sources.helpfulContent],
  ),

  // --- policy risk ---------------------------------------------------------
  rule(
    "policy.hidden_text_risk", "policy_risk", "P1",
    "Hidden text risk detected",
    "Remove or justify hidden text patterns that could mislead users or search systems.",
    [sources.spamPolicies],
  ),
  rule(
    "policy.duplicate_content_cluster", "policy_risk", "P2",
    "Large duplicate content cluster detected",
    "Consolidate duplicative pages into stronger canonical resources or clarify unique page value.",
    [sources.consolidateDuplicates, sources.helpfulContent, sources.spamPolicies],
  ),
];
|
|
349
|
+
|
|
350
|
+
/**
 * Looks up a rule definition by id.
 *
 * @param {string} id - Rule identifier to resolve.
 * @returns {object | undefined} The first entry of `rules` whose `id` is strictly equal to `id`, or `undefined`.
 */
export const getRule = (id) => {
  for (const candidate of rules) {
    if (candidate.id === id) return candidate;
  }
  return undefined;
};
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import { implementationTaskFor } from "./finding-task.mjs";
|
|
2
|
+
import { duplicateContentClusterFacts } from "./rule-depth.mjs";
|
|
3
|
+
import { getRule } from "./rules.mjs";
|
|
4
|
+
import { isHttpUrl, normalizeUrl, sameOrigin } from "./url-utils.mjs";
|
|
5
|
+
|
|
6
|
+
/**
 * Picks the team that owns findings in a dimension.
 *
 * @param {string} dimension - Rule dimension.
 * @returns {"Engineering" | "SEO"} "Engineering" for crawl/index, technical and
 *   structured-data dimensions; "SEO" for everything else.
 */
const ownerFor = (dimension) => {
  const engineeringDimensions = ["crawl_index", "technical", "structured_data"];
  return engineeringDimensions.includes(dimension) ? "Engineering" : "SEO";
};
|
|
10
|
+
|
|
11
|
+
// Effort label for a severity: "M" for "P1", "S" for every other value.
const effortFor = (severity) => {
  if (severity === "P1") return "M";
  return "S";
};
|
|
12
|
+
|
|
13
|
+
/**
 * Assembles a finding record for a rule.
 *
 * @param {string} ruleId - Id of a rule known to `getRule`.
 * @param {string[]} affectedUrls - URLs the finding applies to.
 * @param {string[]} evidence - Evidence pointers (callers in this module pass JSONPath-like strings).
 * @param {string} impact - Explanation of why the finding matters.
 * @returns {object} Finding with rule metadata, owner, effort and a fixed "high" confidence.
 * @throws {TypeError} When `ruleId` does not resolve to a rule.
 */
const finding = (ruleId, affectedUrls, evidence, impact) => {
  const rule = getRule(ruleId);
  if (!rule) {
    // Without this guard the failure surfaces as "cannot read properties of
    // undefined", which hides which id was wrong (ids can come from other modules).
    throw new TypeError(`Unknown rule id: ${String(ruleId)}`);
  }

  const owner = ownerFor(rule.dimension);
  const effort = effortFor(rule.defaultSeverity);

  return {
    ruleId: rule.id,
    title: rule.title,
    severity: rule.defaultSeverity,
    dimension: rule.dimension,
    affectedUrls,
    evidence,
    impact,
    recommendation: rule.recommendation,
    implementationTask: implementationTaskFor(rule, owner, effort),
    owner,
    effort,
    confidence: "high",
    sources: rule.sources,
  };
};
|
|
33
|
+
|
|
34
|
+
/**
 * Groups pages whose selected value matches after trimming and lower-casing.
 *
 * @param {object[]} pages - Pages to group.
 * @param {(page: object) => unknown} selector - Extracts the value to compare.
 * @returns {Array<Array<{page: object, index: number}>>} Only groups with more
 *   than one member, in first-seen order; `index` is the page's position in `pages`.
 */
const groupBy = (pages, selector) => {
  const groups = new Map();

  for (const [index, page] of pages.entries()) {
    const key = selector(page);
    if (!key) continue; // pages without a value never count as duplicates

    const normalized = String(key).trim().toLowerCase();
    if (!normalized) continue; // whitespace-only values are treated as missing

    if (!groups.has(normalized)) groups.set(normalized, []);
    groups.get(normalized).push({ page, index });
  }

  return [...groups.values()].filter((group) => group.length > 1);
};
|
|
47
|
+
|
|
48
|
+
/**
 * Indexes pages by their normalized final URL.
 *
 * @param {Array<{finalUrl: string}>} pages - Pages to index.
 * @returns {Map<string, object>} Map from URL key to page. When several pages
 *   share a key the last one wins.
 */
const pageMap = (pages) => {
  const map = new Map();

  for (const page of pages) {
    let key;
    try {
      key = normalizeUrl(page.finalUrl);
    } catch {
      // Deliberate best effort: a value normalizeUrl rejects is still indexed
      // under its raw form.
      key = page.finalUrl;
    }
    map.set(key, page);
  }

  return map;
};
|
|
59
|
+
|
|
60
|
+
/**
 * Normalizes a URL without throwing.
 *
 * @param {string} value - URL-like value.
 * @returns {string} `normalizeUrl(value)`, or `value` unchanged when
 *   `normalizeUrl` throws.
 */
const safeNormalize = (value) => {
  try {
    return normalizeUrl(value);
  } catch {
    // Best effort by design: callers use the result only as a lookup/compare key.
    return value;
  }
};
|
|
67
|
+
|
|
68
|
+
/**
 * Runs the site-level (cross-page) rules and returns their findings.
 *
 * Checks, in order: missing sitemap (crawled audits only), robots-blocked
 * skipped URLs, duplicate titles, duplicate meta descriptions, and per page:
 * same-origin links to pages with status >= 400, canonical targets with
 * status >= 400, sitemap URLs whose canonical points elsewhere. Finally adds
 * the findings described by `duplicateContentClusterFacts(pages)`.
 *
 * @param {object[]} pages - Collected pages (`finalUrl`, `status`, `evidence`).
 * @param {object} [context] - Optional site context: `crawled`, `sitemaps`,
 *   `sitemapUrls`, `skipped`, `origin`.
 * @returns {object[]} Findings in the order the checks run.
 */
export const evaluateSite = (pages, context = {}) => {
  const findings = [];
  const byUrl = pageMap(pages);

  // First position of every page object, so evidence paths can be built
  // without an O(n) pages.indexOf() inside the per-link loop.
  const positionOf = new Map();
  for (const [index, page] of pages.entries()) {
    if (!positionOf.has(page)) positionOf.set(page, index);
  }

  if (context.crawled && (context.sitemaps || []).length === 0) {
    findings.push(
      finding(
        "crawl.sitemap_missing",
        [pages[0]?.finalUrl || context.origin || "site"],
        ["$.site.sitemaps"],
        "No sitemap evidence was discovered or supplied for this crawled audit.",
      ),
    );
  }

  for (const skipped of context.skipped || []) {
    if (skipped.reason !== "robots_blocked") continue;
    findings.push(
      finding(
        "crawl.robots_blocked",
        [skipped.url],
        ["$.site.skipped"],
        "robots.txt blocks a URL discovered during crawl evidence collection.",
      ),
    );
  }

  for (const group of groupBy(pages, (page) => page.evidence?.title)) {
    findings.push(
      finding(
        "appearance.title_duplicate",
        group.map(({ page }) => page.finalUrl),
        group.map(({ index }) => `$.pages[${index}].evidence.title`),
        "Duplicate titles make it harder to distinguish page purpose in search results.",
      ),
    );
  }

  for (const group of groupBy(pages, (page) => page.evidence?.description)) {
    findings.push(
      finding(
        "appearance.meta_description_duplicate",
        group.map(({ page }) => page.finalUrl),
        group.map(({ index }) => `$.pages[${index}].evidence.description`),
        "Duplicate meta descriptions reduce page-specific snippet clarity.",
      ),
    );
  }

  for (const [index, page] of pages.entries()) {
    // Link and canonical checks only apply to pages fetched over HTTP(S).
    if (!isHttpUrl(page.finalUrl)) continue;

    for (const link of page.evidence?.links || []) {
      if (!link.href || !isHttpUrl(link.href) || !sameOrigin(page.finalUrl, link.href)) continue;
      const linked = byUrl.get(safeNormalize(link.href));
      if (linked && Number.isInteger(linked.status) && linked.status >= 400) {
        findings.push(
          finding(
            "crawl.broken_internal_link",
            [page.finalUrl, linked.finalUrl],
            [`$.pages[${index}].evidence.links`, `$.pages[${positionOf.get(linked)}].status`],
            "Internal links to unavailable pages waste crawl paths and create a poor user experience.",
          ),
        );
      }
    }

    const canonical = page.evidence?.canonical;
    if (!canonical) continue;

    // safeNormalize, not normalizeUrl: a malformed canonical on one page must
    // not abort the evaluation of every other page.
    const canonicalPage = byUrl.get(safeNormalize(canonical));
    if (canonicalPage && Number.isInteger(canonicalPage.status) && canonicalPage.status >= 400) {
      findings.push(
        finding(
          "indexability.canonical_target_error",
          [page.finalUrl, canonicalPage.finalUrl],
          [`$.pages[${index}].evidence.canonical`, `$.pages[${positionOf.get(canonicalPage)}].status`],
          "Canonical signals should point to successful, indexable preferred URLs.",
        ),
      );
    }

    if (
      context.sitemapUrls?.includes(safeNormalize(page.finalUrl)) &&
      safeNormalize(canonical) !== safeNormalize(page.finalUrl)
    ) {
      findings.push(
        finding(
          "indexability.noncanonical_in_sitemap",
          [page.finalUrl],
          [`$.pages[${index}].evidence.canonical`],
          "Sitemaps should list canonical URLs rather than alternates or duplicates.",
        ),
      );
    }
  }

  for (const clusterFact of duplicateContentClusterFacts(pages)) {
    findings.push(
      finding(
        clusterFact.ruleId,
        clusterFact.affectedUrls,
        clusterFact.evidence,
        clusterFact.impact,
      ),
    );
  }

  return findings;
};
|
package/src/sitemap.mjs
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Decodes XML entity references in sitemap text.
 *
 * Handles the five predefined XML entities (amp, lt, gt, quot, apos) plus
 * decimal and hexadecimal character references. Decoding happens in a single
 * pass so that text which is escaped twice is only unescaped once.
 *
 * @param {unknown} value - Raw text; falsy values are treated as "".
 * @returns {string} Text with entity references replaced. References outside
 *   the Unicode range, and unknown named entities, are left untouched.
 */
const decodeEntities = (value) => {
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const reference = /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g;

  return String(value || "").replace(reference, (match, name, decimal, hex) => {
    if (name) return named[name];
    const codePoint = decimal ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
    // String.fromCodePoint throws a RangeError above U+10FFFF; keep such input as-is.
    return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
};
|
|
8
|
+
|
|
9
|
+
/**
 * Extracts the text of every `<loc>` element from sitemap XML.
 *
 * @param {string} xml - Sitemap document text.
 * @returns {string[]} Trimmed values in document order. Plain values are
 *   entity-decoded via `decodeEntities`; a value wrapped in a CDATA section is
 *   unwrapped and returned literally, since CDATA content is not entity-escaped.
 */
const locs = (xml) => {
  const pattern = /<loc\b[^>]*>([\s\S]*?)<\/loc>/gi;
  const cdataSection = /^<!\[CDATA\[([\s\S]*?)\]\]>$/;
  const results = [];

  // matchAll iterates a private copy of the regex, so no lastIndex state leaks.
  for (const match of String(xml).matchAll(pattern)) {
    const inner = match[1].trim();
    const literal = cdataSection.exec(inner);
    results.push(literal ? literal[1].trim() : decodeEntities(inner));
  }

  return results;
};
|
|
16
|
+
|
|
17
|
+
/**
 * Classifies sitemap XML and returns the locations it lists.
 *
 * @param {unknown} xml - Sitemap document text; falsy values are treated as "".
 * @returns {{type: "sitemapindex" | "urlset" | "unknown", urls: string[], sitemaps: string[]}}
 *   For a sitemap index the `<loc>` values are returned as `sitemaps`; for a
 *   URL set they are returned as `urls`; anything else yields empty lists.
 */
export const parseSitemap = (xml) => {
  const body = String(xml || "");
  const locations = locs(body);

  // The index check runs first, so a document containing both root tags is
  // treated as an index.
  if (/<sitemapindex\b/i.test(body)) {
    return { type: "sitemapindex", urls: [], sitemaps: locations };
  }

  if (/<urlset\b/i.test(body)) {
    return { type: "urlset", urls: locations, sitemaps: [] };
  }

  return { type: "unknown", urls: [], sitemaps: [] };
};
|
package/src/snapshot.mjs
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import crypto from "node:crypto";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { extractHtmlEvidence } from "./html-extract.mjs";
|
|
4
|
+
import { fetchWithGuards, readResponseTextLimited, readTextFileLimited, resolveLimits } from "./io-guards.mjs";
|
|
5
|
+
import { renderHtml } from "./render.mjs";
|
|
6
|
+
import { isHttpUrl } from "./url-utils.mjs";
|
|
7
|
+
|
|
8
|
+
// Value sent as the "user-agent" request header by fetchRaw.
const userAgent = "RankForge GEO SEO audit snapshot";
|
|
9
|
+
|
|
10
|
+
// Hex-encoded SHA-256 digest of `value`.
const hash = (value) => {
  const hasher = crypto.createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
};
|
|
11
|
+
|
|
12
|
+
/**
 * Copies header name/value pairs into a plain object.
 *
 * @param {{entries(): Iterable<[string, string]>}} headers - Any object whose
 *   `entries()` yields `[name, value]` pairs.
 * @returns {Record<string, string>} Plain object; a later pair with the same
 *   name overwrites an earlier one.
 */
const headersToObject = (headers) =>
  // fromEntries defines own properties, so a "__proto__" name is stored as
  // data instead of going through the prototype setter.
  Object.fromEntries(headers.entries());
|
|
17
|
+
|
|
18
|
+
/**
 * Fetches a URL, following redirects manually so each hop is recorded.
 *
 * @param {string} target - Absolute URL to fetch.
 * @param {object} [options]
 * @param {number} [options.maxRedirects=5] - Maximum number of redirects to follow.
 * @param {object} [options.limits] - Passed through `resolveLimits`.
 * @param {object} [options.security] - Forwarded to `fetchWithGuards`.
 * @returns {Promise<object>} Raw snapshot: `sourceType: "url"`, `finalUrl`,
 *   `status`, `ok`, `headers`, `redirectChain` and `html` (read with the
 *   `maxHtmlBytes` limit).
 * @throws {Error} When more than `maxRedirects` redirects are encountered.
 */
const fetchRaw = async (target, options = {}) => {
  const redirectStatuses = [301, 302, 303, 307, 308];
  const maxRedirects = options.maxRedirects ?? 5;
  const limits = resolveLimits(options.limits);
  const redirectChain = [];
  let current = target;

  for (let attempt = 0; attempt <= maxRedirects; attempt++) {
    const response = await fetchWithGuards(current, {
      security: options.security,
      limits,
      fetchOptions: {
        headers: { "user-agent": userAgent },
        redirect: "manual",
      },
    });

    const location = response.headers.get("location");
    if (redirectStatuses.includes(response.status) && location) {
      // The redirect body is never read; cancel it so the underlying
      // connection is released instead of lingering until garbage collection.
      try {
        await response.body?.cancel?.();
      } catch {
        // Best effort only: failing to cancel must not fail the fetch.
      }

      // Location may be relative; resolve it against the URL just requested.
      const nextUrl = new URL(location, current).href;
      redirectChain.push({ url: current, status: response.status, location: nextUrl });
      current = nextUrl;
      continue;
    }

    return {
      sourceType: "url",
      finalUrl: response.url || current,
      status: response.status,
      ok: response.ok,
      headers: headersToObject(response.headers),
      redirectChain,
      html: await readResponseTextLimited(response, {
        limits,
        maxBytes: limits.maxHtmlBytes,
        label: current,
      }),
    };
  }

  throw new Error(`Too many redirects while fetching ${target}`);
};
|
|
60
|
+
|
|
61
|
+
/**
 * Reads a local HTML file and shapes it like a fetched snapshot.
 *
 * @param {string} target - File path; resolved to an absolute path.
 * @param {object} [options]
 * @param {object} [options.limits] - Passed through `resolveLimits`.
 * @param {object} [options.security] - Forwarded to `readTextFileLimited`.
 * @returns {object} Raw snapshot with `sourceType: "file"`, the absolute path
 *   as `finalUrl`, `status: null`, `ok: true`, empty `headers` and
 *   `redirectChain`, and `html` read with the `maxHtmlBytes` limit.
 */
const readFile = (target, options = {}) => {
  const fullPath = path.resolve(target);
  const limits = resolveLimits(options.limits);
  const html = readTextFileLimited(fullPath, {
    security: options.security,
    limits,
    maxBytes: limits.maxHtmlBytes,
  });

  return {
    sourceType: "file",
    finalUrl: fullPath,
    status: null, // there is no HTTP status for a file read
    ok: true,
    headers: {},
    redirectChain: [],
    html,
  };
};
|
|
78
|
+
|
|
79
|
+
/**
 * Collects a snapshot of one target: raw HTML, extracted evidence and,
 * when requested, rendered evidence.
 *
 * @param {string} target - HTTP(S) URL (fetched) or anything else (read as a file path).
 * @param {object} [options] - Forwarded to `fetchRaw` / `readFile` and `renderHtml`.
 *   Rendering is attempted when `options.render` is "always" or "auto", or when
 *   `options.renderer` is truthy.
 * @returns {Promise<object>} Snapshot with `target`, `sourceType`, `finalUrl`,
 *   `status`, `ok`, `headers`, `redirectChain`, `rawHash`, `evidence`, `render`.
 *   `render` is `{status: "not_requested"}` when rendering was not attempted,
 *   the `renderHtml` result as-is when its status is not "rendered", and
 *   otherwise includes `renderedHash`, rendered `evidence` and
 *   `textDeltaCharacters`.
 */
export const collectSnapshot = async (target, options = {}) => {
  const raw = isHttpUrl(target) ? await fetchRaw(target, options) : readFile(target, options);

  // Files have no URL to resolve relative references against.
  const baseUrl = raw.sourceType === "url" ? raw.finalUrl : null;
  const evidence = extractHtmlEvidence(raw.html, baseUrl);
  const shouldRender = options.render === "always" || options.render === "auto" || options.renderer;

  let render = { status: "not_requested" };
  if (shouldRender) {
    const rendered = await renderHtml(raw.finalUrl, {
      ...options,
      html: raw.html,
      finalUrl: raw.finalUrl,
    });

    if (rendered.status === "rendered") {
      const renderedEvidence = extractHtmlEvidence(rendered.html, baseUrl);
      const renderedChars = renderedEvidence.counts?.visibleTextCharacters || 0;
      const rawChars = evidence.counts?.visibleTextCharacters || 0;
      render = {
        status: "rendered",
        renderedHash: hash(rendered.html),
        evidence: renderedEvidence,
        // Absolute difference in visible text between rendered and raw HTML.
        textDeltaCharacters: Math.abs(renderedChars - rawChars),
      };
    } else {
      render = rendered;
    }
  }

  return {
    target,
    sourceType: raw.sourceType,
    finalUrl: raw.finalUrl,
    status: raw.status,
    ok: raw.ok,
    headers: raw.headers,
    redirectChain: raw.redirectChain,
    rawHash: hash(raw.html),
    evidence,
    render,
  };
};
|