@djangocfg/seo 2.1.50
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +192 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.mjs +3780 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/crawler/index.d.ts +88 -0
- package/dist/crawler/index.mjs +610 -0
- package/dist/crawler/index.mjs.map +1 -0
- package/dist/google-console/index.d.ts +95 -0
- package/dist/google-console/index.mjs +539 -0
- package/dist/google-console/index.mjs.map +1 -0
- package/dist/index.d.ts +285 -0
- package/dist/index.mjs +3236 -0
- package/dist/index.mjs.map +1 -0
- package/dist/link-checker/index.d.ts +76 -0
- package/dist/link-checker/index.mjs +326 -0
- package/dist/link-checker/index.mjs.map +1 -0
- package/dist/markdown-report-B3QdDzxE.d.ts +193 -0
- package/dist/reports/index.d.ts +24 -0
- package/dist/reports/index.mjs +836 -0
- package/dist/reports/index.mjs.map +1 -0
- package/dist/routes/index.d.ts +69 -0
- package/dist/routes/index.mjs +372 -0
- package/dist/routes/index.mjs.map +1 -0
- package/dist/scanner-Cz4Th2Pt.d.ts +60 -0
- package/dist/types/index.d.ts +144 -0
- package/dist/types/index.mjs +3 -0
- package/dist/types/index.mjs.map +1 -0
- package/package.json +114 -0
- package/src/analyzer.ts +256 -0
- package/src/cli/commands/audit.ts +260 -0
- package/src/cli/commands/content.ts +180 -0
- package/src/cli/commands/crawl.ts +32 -0
- package/src/cli/commands/index.ts +12 -0
- package/src/cli/commands/inspect.ts +60 -0
- package/src/cli/commands/links.ts +41 -0
- package/src/cli/commands/robots.ts +36 -0
- package/src/cli/commands/routes.ts +126 -0
- package/src/cli/commands/sitemap.ts +48 -0
- package/src/cli/index.ts +149 -0
- package/src/cli/types.ts +40 -0
- package/src/config.ts +207 -0
- package/src/content/index.ts +51 -0
- package/src/content/link-checker.ts +182 -0
- package/src/content/link-fixer.ts +188 -0
- package/src/content/scanner.ts +200 -0
- package/src/content/sitemap-generator.ts +321 -0
- package/src/content/types.ts +140 -0
- package/src/crawler/crawler.ts +425 -0
- package/src/crawler/index.ts +10 -0
- package/src/crawler/robots-parser.ts +171 -0
- package/src/crawler/sitemap-validator.ts +204 -0
- package/src/google-console/analyzer.ts +317 -0
- package/src/google-console/auth.ts +100 -0
- package/src/google-console/client.ts +281 -0
- package/src/google-console/index.ts +9 -0
- package/src/index.ts +144 -0
- package/src/link-checker/index.ts +461 -0
- package/src/reports/claude-context.ts +149 -0
- package/src/reports/generator.ts +244 -0
- package/src/reports/index.ts +27 -0
- package/src/reports/json-report.ts +320 -0
- package/src/reports/markdown-report.ts +246 -0
- package/src/reports/split-report.ts +252 -0
- package/src/routes/analyzer.ts +324 -0
- package/src/routes/index.ts +25 -0
- package/src/routes/scanner.ts +298 -0
- package/src/types/index.ts +222 -0
- package/src/utils/index.ts +154 -0
package/src/crawler/crawler.ts

@@ -0,0 +1,425 @@

/**
 * @djangocfg/seo - Site Crawler
 * Internal site crawler for SEO analysis
 */

import { load } from 'cheerio';
import pLimit from 'p-limit';
import consola from 'consola';
import type { CrawlResult, CrawlerConfig, SeoIssue } from '../types/index.js';

const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  maxPages: 100,
  maxDepth: 3,
  concurrency: 5,
  timeout: 30000,
  userAgent: 'DjangoCFG-SEO-Crawler/1.0 (+https://djangocfg.com/bot)',
  respectRobotsTxt: true,
  includePatterns: [],
  excludePatterns: [
    '/api/',
    '/admin/',
    '/_next/',
    '/static/',
    '.pdf',
    '.jpg',
    '.png',
    '.gif',
    '.svg',
    '.css',
    '.js',
  ],
};

export class SiteCrawler {
  private config: Required<CrawlerConfig>;
  private baseUrl: URL;
  private visited = new Set<string>();
  private queue: Array<{ url: string; depth: number }> = [];
  private results: CrawlResult[] = [];
  private limit: ReturnType<typeof pLimit>;

  constructor(siteUrl: string, config?: CrawlerConfig) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.baseUrl = new URL(siteUrl);
    this.limit = pLimit(this.config.concurrency);
  }

  /**
   * Start crawling the site
   */
  async crawl(): Promise<CrawlResult[]> {
    consola.info(`Starting crawl of ${this.baseUrl.origin}`);
    consola.info(`Config: maxPages=${this.config.maxPages}, maxDepth=${this.config.maxDepth}`);

    this.queue.push({ url: this.baseUrl.href, depth: 0 });

    while (this.queue.length > 0 && this.results.length < this.config.maxPages) {
      const batch = this.queue.splice(0, this.config.concurrency);

      const promises = batch.map(({ url, depth }) =>
        this.limit(() => this.crawlPage(url, depth))
      );

      await Promise.all(promises);
    }

    consola.success(`Crawl complete. Crawled ${this.results.length} pages.`);
    return this.results;
  }

  /**
   * Crawl a single page
   */
  private async crawlPage(url: string, depth: number): Promise<void> {
    const normalizedUrl = this.normalizeUrl(url);

    if (this.visited.has(normalizedUrl)) return;
    if (this.shouldExclude(normalizedUrl)) return;

    this.visited.add(normalizedUrl);

    const startTime = Date.now();
    const result: CrawlResult = {
      url: normalizedUrl,
      statusCode: 0,
      links: { internal: [], external: [] },
      images: [],
      loadTime: 0,
      errors: [],
      warnings: [],
      crawledAt: new Date().toISOString(),
    };

    try {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), this.config.timeout);

      const response = await fetch(normalizedUrl, {
        headers: {
          'User-Agent': this.config.userAgent,
          Accept: 'text/html,application/xhtml+xml',
        },
        signal: controller.signal,
        redirect: 'follow',
      });

      // TTFB approximation: fetch resolves once the first response headers arrive
      result.ttfb = Date.now() - startTime;

      clearTimeout(timeoutId);

      result.statusCode = response.status;
      result.contentType = response.headers.get('content-type') || undefined;
      result.contentLength = Number(response.headers.get('content-length')) || undefined;

      if (response.ok && result.contentType?.includes('text/html')) {
        const html = await response.text();
        this.parseHtml(html, result, normalizedUrl, depth);
      } else if (!response.ok) {
        result.errors.push(`HTTP ${response.status}: ${response.statusText}`);
      }
    } catch (error) {
      if (error instanceof Error) {
        if (error.name === 'AbortError') {
          result.errors.push('Request timeout');
        } else {
          result.errors.push(error.message);
        }
      }
    }

    result.loadTime = Date.now() - startTime;
    this.results.push(result);

    consola.debug(`Crawled: ${normalizedUrl} (${result.statusCode}) - ${result.loadTime}ms`);
  }

  /**
   * Parse HTML and extract SEO-relevant data
   */
  private parseHtml(html: string, result: CrawlResult, pageUrl: string, depth: number): void {
    const $ = load(html);

    // Title
    result.title = $('title').first().text().trim() || undefined;
    if (!result.title) {
      result.warnings.push('Missing title tag');
    } else if (result.title.length > 60) {
      result.warnings.push(`Title too long (${result.title.length} chars, recommended: <60)`);
    }

    // Meta description
    result.metaDescription =
      $('meta[name="description"]').attr('content')?.trim() || undefined;
    if (!result.metaDescription) {
      result.warnings.push('Missing meta description');
    } else if (result.metaDescription.length > 160) {
      result.warnings.push(
        `Meta description too long (${result.metaDescription.length} chars, recommended: <160)`
      );
    }

    // Meta robots
    result.metaRobots = $('meta[name="robots"]').attr('content')?.trim() || undefined;
    const xRobots = $('meta[http-equiv="X-Robots-Tag"]').attr('content')?.trim();
    if (xRobots) {
      result.metaRobots = result.metaRobots ? `${result.metaRobots}, ${xRobots}` : xRobots;
    }

    // Canonical
    result.canonicalUrl = $('link[rel="canonical"]').attr('href')?.trim() || undefined;
    if (!result.canonicalUrl) {
      result.warnings.push('Missing canonical tag');
    }

    // Headings
    result.h1 = $('h1')
      .map((_, el) => $(el).text().trim())
      .get();
    result.h2 = $('h2')
      .map((_, el) => $(el).text().trim())
      .get();

    if (result.h1.length === 0) {
      result.warnings.push('Missing H1 tag');
    } else if (result.h1.length > 1) {
      result.warnings.push(`Multiple H1 tags (${result.h1.length})`);
    }

    // Links
    $('a[href]').each((_, el) => {
      const href = $(el).attr('href');
      if (!href) return;

      try {
        const linkUrl = new URL(href, pageUrl);

        if (linkUrl.hostname === this.baseUrl.hostname) {
          const internalUrl = this.normalizeUrl(linkUrl.href);
          result.links.internal.push(internalUrl);

          // Add to crawl queue
          if (depth < this.config.maxDepth && !this.visited.has(internalUrl)) {
            this.queue.push({ url: internalUrl, depth: depth + 1 });
          }
        } else {
          result.links.external.push(linkUrl.href);
        }
      } catch {
        // Invalid URL, skip
      }
    });

    // Images
    $('img').each((_, el) => {
      const src = $(el).attr('src');
      const alt = $(el).attr('alt');

      if (src) {
        result.images.push({
          src,
          alt,
          hasAlt: alt !== undefined && alt.trim().length > 0,
        });
      }
    });

    const imagesWithoutAlt = result.images.filter((img) => !img.hasAlt);
    if (imagesWithoutAlt.length > 0) {
      result.warnings.push(`${imagesWithoutAlt.length} images without alt text`);
    }
  }

  /**
   * Normalize URL for deduplication
   */
  private normalizeUrl(url: string): string {
    try {
      const parsed = new URL(url, this.baseUrl.href);
      // Drop the hash and any trailing slash (query params are kept as-is)
      parsed.hash = '';
      let pathname = parsed.pathname;
      if (pathname.endsWith('/') && pathname !== '/') {
        pathname = pathname.slice(0, -1);
      }
      parsed.pathname = pathname;
      return parsed.href;
    } catch {
      return url;
    }
  }

  /**
   * Check if URL should be excluded
   */
  private shouldExclude(url: string): boolean {
    // Check include patterns first
    if (this.config.includePatterns.length > 0) {
      const included = this.config.includePatterns.some((pattern) =>
        url.includes(pattern)
      );
      if (!included) return true;
    }

    // Check exclude patterns
    return this.config.excludePatterns.some((pattern) => url.includes(pattern));
  }
}

/**
 * Analyze crawl results for SEO issues
 */
export function analyzeCrawlResults(results: CrawlResult[]): SeoIssue[] {
  const issues: SeoIssue[] = [];

  for (const result of results) {
    // HTTP errors
    if (result.statusCode >= 400) {
      issues.push({
        id: `http-error-${hash(result.url)}`,
        url: result.url,
        category: 'technical',
        severity: result.statusCode >= 500 ? 'critical' : 'error',
        title: `HTTP ${result.statusCode} error`,
        description: `Page returns ${result.statusCode} status code.`,
        recommendation:
          result.statusCode === 404
            ? 'Either restore the content or set up a redirect.'
            : 'Fix the server error and ensure the page is accessible.',
        detectedAt: result.crawledAt,
        metadata: { statusCode: result.statusCode },
      });
    }

    // Missing title
    if (!result.title && result.statusCode === 200) {
      issues.push({
        id: `missing-title-${hash(result.url)}`,
        url: result.url,
        category: 'content',
        severity: 'error',
        title: 'Missing title tag',
        description: 'This page does not have a title tag.',
        recommendation: 'Add a unique, descriptive title tag (50-60 characters).',
        detectedAt: result.crawledAt,
      });
    }

    // Missing meta description
    if (!result.metaDescription && result.statusCode === 200) {
      issues.push({
        id: `missing-meta-desc-${hash(result.url)}`,
        url: result.url,
        category: 'content',
        severity: 'warning',
        title: 'Missing meta description',
        description: 'This page does not have a meta description.',
        recommendation: 'Add a unique meta description (120-160 characters).',
        detectedAt: result.crawledAt,
      });
    }

    // Missing H1
    if (result.h1 && result.h1.length === 0 && result.statusCode === 200) {
      issues.push({
        id: `missing-h1-${hash(result.url)}`,
        url: result.url,
        category: 'content',
        severity: 'warning',
        title: 'Missing H1 heading',
        description: 'This page does not have an H1 heading.',
        recommendation: 'Add a single H1 heading that describes the page content.',
        detectedAt: result.crawledAt,
      });
    }

    // Multiple H1s
    if (result.h1 && result.h1.length > 1) {
      issues.push({
        id: `multiple-h1-${hash(result.url)}`,
        url: result.url,
        category: 'content',
        severity: 'warning',
        title: 'Multiple H1 headings',
        description: `This page has ${result.h1.length} H1 headings.`,
        recommendation: 'Use only one H1 heading per page.',
        detectedAt: result.crawledAt,
        metadata: { h1Count: result.h1.length },
      });
    }

    // Images without alt
    const imagesWithoutAlt = result.images.filter((img) => !img.hasAlt);
    if (imagesWithoutAlt.length > 0) {
      issues.push({
        id: `images-no-alt-${hash(result.url)}`,
        url: result.url,
        category: 'content',
        severity: 'info',
        title: 'Images without alt text',
        description: `${imagesWithoutAlt.length} images are missing alt text.`,
        recommendation: 'Add descriptive alt text to all images for accessibility and SEO.',
        detectedAt: result.crawledAt,
        metadata: { count: imagesWithoutAlt.length },
      });
    }

    // Slow load time (> 3s)
    if (result.loadTime > 3000) {
      issues.push({
        id: `slow-page-${hash(result.url)}`,
        url: result.url,
        category: 'performance',
        severity: result.loadTime > 5000 ? 'error' : 'warning',
        title: 'Slow page load time',
        description: `Page took ${result.loadTime}ms to load.`,
        recommendation: 'Optimize page load time. Target under 3 seconds.',
        detectedAt: result.crawledAt,
        metadata: { loadTime: result.loadTime },
      });
    }

    // Slow TTFB (> 800ms)
    if (result.ttfb && result.ttfb > 800) {
      issues.push({
        id: `slow-ttfb-${hash(result.url)}`,
        url: result.url,
        category: 'performance',
        severity: result.ttfb > 1500 ? 'error' : 'warning',
        title: 'Slow Time to First Byte',
        description: `TTFB is ${result.ttfb}ms. Server responded slowly.`,
        recommendation:
          'Optimize server response. Target TTFB under 800ms. Consider CDN, caching, or server upgrades.',
        detectedAt: result.crawledAt,
        metadata: { ttfb: result.ttfb },
      });
    }

    // Noindex check
    if (result.metaRobots?.includes('noindex')) {
      issues.push({
        id: `noindex-${hash(result.url)}`,
        url: result.url,
        category: 'indexing',
        severity: 'info',
        title: 'Page marked as noindex',
        description: 'This page has a noindex directive.',
        recommendation: 'Verify this is intentional. Remove noindex if the page should be indexed.',
        detectedAt: result.crawledAt,
        metadata: { metaRobots: result.metaRobots },
      });
    }
  }

  return issues;
}

// Deterministic, non-cryptographic string hash used to build stable issue IDs.
function hash(str: string): string {
  let h = 0;
  for (let i = 0; i < str.length; i++) {
    const char = str.charCodeAt(i);
    h = (h << 5) - h + char;
    h = h & h; // force 32-bit integer semantics
  }
  return Math.abs(h).toString(36);
}
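A minimal usage sketch of the two exports above, assuming an ESM module alongside this file (the target URL and config overrides are placeholders):

import { SiteCrawler, analyzeCrawlResults } from './crawler.js';

// Placeholder site; override a couple of the defaults shown above.
const crawler = new SiteCrawler('https://example.com', { maxPages: 50, maxDepth: 2 });
const results = await crawler.crawl();

const issues = analyzeCrawlResults(results);
for (const issue of issues) {
  console.log(`[${issue.severity}] ${issue.title} - ${issue.url}`);
}

Note that crawl() resolves with the raw per-page results, while analyzeCrawlResults() turns them into SeoIssue records, presumably the input to the reports modules listed above.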
package/src/crawler/index.ts

@@ -0,0 +1,10 @@

/**
 * @djangocfg/seo - Crawler Module
 * Site crawler and analysis tools
 */

export { SiteCrawler, analyzeCrawlResults } from './crawler.js';
export { analyzeRobotsTxt, isUrlAllowed } from './robots-parser.js';
export { analyzeSitemap, analyzeAllSitemaps } from './sitemap-validator.js';
export type { RobotsAnalysis } from './robots-parser.js';
export type { SitemapAnalysis } from './sitemap-validator.js';
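Given the dist/crawler build output in the file list, this barrel is likely exposed as a subpath export; if so, consumers would import the same API via the package specifier. The specifier below is an assumption, since the package.json exports map is not shown in this hunk:

// Assumed subpath export; verify against the "exports" field in package.json.
import { SiteCrawler, analyzeRobotsTxt, analyzeSitemap } from '@djangocfg/seo/crawler';
import type { RobotsAnalysis, SitemapAnalysis } from '@djangocfg/seo/crawler';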
package/src/crawler/robots-parser.ts

@@ -0,0 +1,171 @@

/**
 * @djangocfg/seo - Robots.txt Parser
 * Parse and analyze robots.txt files
 */

import robotsParser from 'robots-parser';
import consola from 'consola';
import type { SeoIssue } from '../types/index.js';

export interface RobotsAnalysis {
  exists: boolean;
  content?: string;
  sitemaps: string[];
  allowedPaths: string[];
  disallowedPaths: string[];
  crawlDelay?: number;
  issues: SeoIssue[];
}

/**
 * Fetch and parse robots.txt for a site
 */
export async function analyzeRobotsTxt(siteUrl: string): Promise<RobotsAnalysis> {
  const robotsUrl = new URL('/robots.txt', siteUrl).href;

  const analysis: RobotsAnalysis = {
    exists: false,
    sitemaps: [],
    allowedPaths: [],
    disallowedPaths: [],
    issues: [],
  };

  try {
    const response = await fetch(robotsUrl);

    if (!response.ok) {
      analysis.issues.push({
        id: 'missing-robots-txt',
        url: robotsUrl,
        category: 'technical',
        severity: 'warning',
        title: 'Missing robots.txt',
        description: `No robots.txt file found (HTTP ${response.status}).`,
        recommendation: 'Create a robots.txt file to control crawler access.',
        detectedAt: new Date().toISOString(),
      });
      return analysis;
    }

    analysis.exists = true;
    analysis.content = await response.text();

    // Parse robots.txt
    const robots = robotsParser(robotsUrl, analysis.content);

    // Extract sitemaps
    analysis.sitemaps = robots.getSitemaps();

    if (analysis.sitemaps.length === 0) {
      analysis.issues.push({
        id: 'no-sitemap-in-robots',
        url: robotsUrl,
        category: 'technical',
        severity: 'info',
        title: 'No sitemap in robots.txt',
        description: 'No sitemap URL is declared in robots.txt.',
        recommendation: 'Add a Sitemap directive pointing to your XML sitemap.',
        detectedAt: new Date().toISOString(),
      });
    }

    // Parse rules (simplified extraction: paths are collected across all user-agent groups)
    const lines = analysis.content.split('\n');
    let currentUserAgent = '*';

    for (const line of lines) {
      const trimmed = line.trim().toLowerCase();

      if (trimmed.startsWith('user-agent:')) {
        currentUserAgent = trimmed.replace('user-agent:', '').trim();
      } else if (trimmed.startsWith('disallow:')) {
        const path = line.trim().replace(/disallow:/i, '').trim();
        if (path) {
          analysis.disallowedPaths.push(path);
        }
      } else if (trimmed.startsWith('allow:')) {
        const path = line.trim().replace(/allow:/i, '').trim();
        if (path) {
          analysis.allowedPaths.push(path);
        }
      } else if (trimmed.startsWith('crawl-delay:')) {
        const delay = parseInt(trimmed.replace('crawl-delay:', '').trim(), 10);
        if (!isNaN(delay)) {
          analysis.crawlDelay = delay;
        }
      }
    }

    // Check for blocking important paths
    const importantPaths = ['/', '/sitemap.xml'];
    for (const path of importantPaths) {
      if (!robots.isAllowed(new URL(path, siteUrl).href, 'Googlebot')) {
        analysis.issues.push({
          id: `blocked-important-path-${path.replace(/\//g, '-')}`,
          url: siteUrl,
          category: 'crawling',
          severity: 'error',
          title: `Important path blocked: ${path}`,
          description: `The path ${path} is blocked in robots.txt.`,
          recommendation: `Ensure ${path} is accessible to search engines.`,
          detectedAt: new Date().toISOString(),
          metadata: { path },
        });
      }
    }

    // Check for excessively restrictive rules
    if (analysis.disallowedPaths.includes('/')) {
      analysis.issues.push({
        id: 'all-blocked',
        url: robotsUrl,
        category: 'crawling',
        severity: 'critical',
        title: 'Entire site blocked',
        description: 'robots.txt blocks access to the entire site (Disallow: /).',
        recommendation: 'Remove or modify this rule if you want your site to be indexed.',
        detectedAt: new Date().toISOString(),
      });
    }

    consola.debug(`Analyzed robots.txt: ${analysis.disallowedPaths.length} disallow rules`);
  } catch (error) {
    consola.error('Failed to fetch robots.txt:', error);
    analysis.issues.push({
      id: 'robots-txt-error',
      url: robotsUrl,
      category: 'technical',
      severity: 'warning',
      title: 'Failed to fetch robots.txt',
      description: `Error fetching robots.txt: ${error instanceof Error ? error.message : 'Unknown error'}`,
      recommendation: 'Ensure robots.txt is accessible.',
      detectedAt: new Date().toISOString(),
    });
  }

  return analysis;
}

/**
 * Check if a URL is allowed by robots.txt
 */
export async function isUrlAllowed(
  siteUrl: string,
  url: string,
  userAgent = 'Googlebot'
): Promise<boolean> {
  const robotsUrl = new URL('/robots.txt', siteUrl).href;

  try {
    const response = await fetch(robotsUrl);
    if (!response.ok) return true; // No robots.txt = allow all

    const content = await response.text();
    const robots = robotsParser(robotsUrl, content);

    return robots.isAllowed(url, userAgent) ?? true;
  } catch {
    return true; // Error fetching = allow
  }
}
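A short usage sketch of both functions above (the site and page URLs are placeholders; signatures are as defined in this file, and an ESM context is assumed):

import { analyzeRobotsTxt, isUrlAllowed } from './robots-parser.js';

const analysis = await analyzeRobotsTxt('https://example.com');
console.log(`robots.txt exists: ${analysis.exists}`);
console.log(`sitemaps declared: ${analysis.sitemaps.length}`);
for (const issue of analysis.issues) {
  console.log(`[${issue.severity}] ${issue.title}`);
}

// Defaults to the Googlebot user agent; fails open (true) when robots.txt is missing or unreachable.
const allowed = await isUrlAllowed('https://example.com', 'https://example.com/blog/post');
console.log(`/blog/post allowed: ${allowed}`);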